nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import json
  15 import logging
  16 from pathlib import Path
  17 from textwrap import dedent
  18
  19 from nominatim.db.connection import connect, Connection, Cursor
  20 from nominatim.config import Configuration
  21 from nominatim.db.utils import CopyBuffer
  22 from nominatim.db.sql_preprocessor import SQLPreprocessor
  23 from nominatim.data.place_info import PlaceInfo
  24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  26 from nominatim.data.place_name import PlaceName
  27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  29
  30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  31
  32 LOG = logging.getLogger()
  33
  34 WORD_TYPES =(('country_names', 'C'),
  35              ('postcodes', 'P'),
  36              ('full_word', 'W'),
  37              ('housenumbers', 'H'))
  38
  39 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  40     """ Create a new instance of the tokenizer provided by this module.
  41     """
  42     return ICUTokenizer(dsn, data_dir)
  43
  44
  45 class ICUTokenizer(AbstractTokenizer):
  46     """ This tokenizer uses libICU to convert names and queries to ASCII.
  47         Otherwise it uses the same algorithms and data structures as the
  48         normalization routines in Nominatim 3.
  49     """
  50
  51     def __init__(self, dsn: str, data_dir: Path) -> None:
  52         self.dsn = dsn
  53         self.data_dir = data_dir
  54         self.loader: Optional[ICURuleLoader] = None
  55
  56
  57     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  58         """ Set up a new tokenizer for the database.
  59
  60             This copies all necessary data in the project directory to make
  61             sure the tokenizer remains stable even over updates.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         self._install_php(config.lib_dir.php, overwrite=True)
  66         self._save_config()
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._setup_db_tables(config)
  71             self._create_base_indices(config, 'word')
  72
  73
  74     def init_from_project(self, config: Configuration) -> None:
  75         """ Initialise the tokenizer from the project directory.
  76         """
  77         self.loader = ICURuleLoader(config)
  78
  79         with connect(self.dsn) as conn:
  80             self.loader.load_config_from_db(conn)
  81
  82         self._install_php(config.lib_dir.php, overwrite=False)
  83
  84
  85     def finalize_import(self, config: Configuration) -> None:
  86         """ Do any required postprocessing to make the tokenizer data ready
  87             for use.
  88         """
  89         self._create_lookup_indices(config, 'word')
  90
  91
  92     def update_sql_functions(self, config: Configuration) -> None:
  93         """ Reimport the SQL functions for this tokenizer.
  94         """
  95         with connect(self.dsn) as conn:
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  98
  99
 100     def check_database(self, config: Configuration) -> None:
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         # Will throw an error if there is an issue.
 104         self.init_from_project(config)
 105
 106
 107     def update_statistics(self, config: Configuration) -> None:
 108         """ Recompute frequencies for all name words.
 109         """
 110         with connect(self.dsn) as conn:
 111             if not conn.table_exists('search_name'):
 112                 return
 113
 114             with conn.cursor() as cur:
 115                 LOG.info('Computing word frequencies')
 116                 cur.drop_table('word_frequencies')
 117                 cur.execute("""CREATE TEMP TABLE word_frequencies AS
 118                                  SELECT unnest(name_vector) as id, count(*)
 119                                  FROM search_name GROUP BY id""")
 120                 cur.execute('CREATE INDEX ON word_frequencies(id)')
 121                 LOG.info('Update word table with recomputed frequencies')
 122                 cur.drop_table('tmp_word')
 123                 cur.execute("""CREATE TABLE tmp_word AS
 124                                 SELECT word_id, word_token, type, word,
 125                                        (CASE WHEN wf.count is null THEN info
 126                                           ELSE info || jsonb_build_object('count', wf.count)
 127                                         END) as info
 128                                 FROM word LEFT JOIN word_frequencies wf
 129                                   ON word.word_id = wf.id""")
 130                 cur.drop_table('word_frequencies')
 131
 132             sqlp = SQLPreprocessor(conn, config)
 133             sqlp.run_string(conn,
 134                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
 135             conn.commit()
 136         self._create_base_indices(config, 'tmp_word')
 137         self._create_lookup_indices(config, 'tmp_word')
 138         self._move_temporary_word_table('tmp_word')
 139
 140
 141
 142     def _cleanup_housenumbers(self) -> None:
 143         """ Remove unused house numbers.
 144         """
 145         with connect(self.dsn) as conn:
 146             if not conn.table_exists('search_name'):
 147                 return
 148             with conn.cursor(name="hnr_counter") as cur:
 149                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 150                                FROM word
 151                                WHERE type = 'H'
 152                                  AND NOT EXISTS(SELECT * FROM search_name
 153                                                 WHERE ARRAY[word.word_id] && name_vector)
 154                                  AND (char_length(coalesce(word, word_token)) > 6
 155                                       OR coalesce(word, word_token) not similar to '\\d+')
 156                             """)
 157                 candidates = {token: wid for wid, token in cur}
 158             with conn.cursor(name="hnr_counter") as cur:
 159                 cur.execute("""SELECT housenumber FROM placex
 160                                WHERE housenumber is not null
 161                                      AND (char_length(housenumber) > 6
 162                                           OR housenumber not similar to '\\d+')
 163                             """)
 164                 for row in cur:
 165                     for hnr in row[0].split(';'):
 166                         candidates.pop(hnr, None)
 167             LOG.info("There are %s outdated housenumbers.", len(candidates))
 168             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 169             if candidates:
 170                 with conn.cursor() as cur:
 171                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 172                                 (list(candidates.values()), ))
 173                 conn.commit()
 174
 175
 176
 177     def update_word_tokens(self) -> None:
 178         """ Remove unused tokens.
 179         """
 180         LOG.warning("Cleaning up housenumber tokens.")
 181         self._cleanup_housenumbers()
 182         LOG.warning("Tokenizer house-keeping done.")
 183
 184
 185     def name_analyzer(self) -> 'ICUNameAnalyzer':
 186         """ Create a new analyzer for tokenizing names and queries
 187             using this tokinzer. Analyzers are context managers and should
 188             be used accordingly:
 189
 190             ```
 191             with tokenizer.name_analyzer() as analyzer:
 192                 analyser.tokenize()
 193             ```
 194
 195             When used outside the with construct, the caller must ensure to
 196             call the close() function before destructing the analyzer.
 197
 198             Analyzers are not thread-safe. You need to instantiate one per thread.
 199         """
 200         assert self.loader is not None
 201         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 202                                self.loader.make_token_analysis())
 203
 204
 205     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 206         """ Return a list of the `num` most frequent full words
 207             in the database.
 208         """
 209         with conn.cursor() as cur:
 210             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 211                              FROM word WHERE type = 'W'
 212                              GROUP BY word
 213                              ORDER BY count DESC LIMIT %s""", (num,))
 214             return list(s[0].split('@')[0] for s in cur)
 215
 216
 217     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
 218         """ Install the php script for the tokenizer.
 219         """
 220         assert self.loader is not None
 221         php_file = self.data_dir / "tokenizer.php"
 222
 223         if not php_file.exists() or overwrite:
 224             php_file.write_text(dedent(f"""\
 225                 <?php
 226                 @define('CONST_Max_Word_Frequency', 10000000);
 227                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 228                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 229                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 230
 231
 232     def _save_config(self) -> None:
 233         """ Save the configuration that needs to remain stable for the given
 234             database as database properties.
 235         """
 236         assert self.loader is not None
 237         with connect(self.dsn) as conn:
 238             self.loader.save_config_to_db(conn)
 239
 240
 241     def _setup_db_tables(self, config: Configuration) -> None:
 242         """ Set up the word table and fill it with pre-computed word
 243             frequencies.
 244         """
 245         with connect(self.dsn) as conn:
 246             with conn.cursor() as cur:
 247                 cur.drop_table('word')
 248             sqlp = SQLPreprocessor(conn, config)
 249             sqlp.run_string(conn, """
 250                 CREATE TABLE word (
 251                       word_id INTEGER,
 252                       word_token text NOT NULL,
 253                       type text NOT NULL,
 254                       word text,
 255                       info jsonb
 256                     ) {{db.tablespace.search_data}};
 257                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 258
 259                 DROP SEQUENCE IF EXISTS seq_word;
 260                 CREATE SEQUENCE seq_word start 1;
 261                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
 262             """)
 263             conn.commit()
 264
 265
 266     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 267         """ Set up the word table and fill it with pre-computed word
 268             frequencies.
 269         """
 270         with connect(self.dsn) as conn:
 271             sqlp = SQLPreprocessor(conn, config)
 272             sqlp.run_string(conn,
 273                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
 274                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
 275                             table_name=table_name)
 276             for name, ctype in WORD_TYPES:
 277                 sqlp.run_string(conn,
 278                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
 279                                    USING BTREE (word) {{db.tablespace.address_index}}
 280                                    WHERE type = '{{column_type}}'
 281                                 """,
 282                                 table_name=table_name, idx_name=name,
 283                                 column_type=ctype)
 284             conn.commit()
 285
 286
 287     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 288         """ Create addtional indexes used when running the API.
 289         """
 290         with connect(self.dsn) as conn:
 291             sqlp = SQLPreprocessor(conn, config)
 292             # Index required for details lookup.
 293             sqlp.run_string(conn, """
 294                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
 295                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
 296             """,
 297             table_name=table_name)
 298             conn.commit()
 299
 300
 301     def _move_temporary_word_table(self, old: str) -> None:
 302         """ Rename all tables and indexes used by the tokenizer.
 303         """
 304         with connect(self.dsn) as conn:
 305             with conn.cursor() as cur:
 306                 cur.drop_table('word')
 307                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
 308                 for idx in ('word_token', 'word_id'):
 309                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
 310                                       RENAME TO idx_word_{idx}""")
 311                 for name, _ in WORD_TYPES:
 312                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
 313                                     RENAME TO idx_word_{name}""")
 314             conn.commit()
 315
 316
 317
 318
 319 class ICUNameAnalyzer(AbstractAnalyzer):
 320     """ The ICU analyzer uses the ICU library for splitting names.
 321
 322         Each instance opens a connection to the database to request the
 323         normalization.
 324     """
 325
 326     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 327                  token_analysis: ICUTokenAnalysis) -> None:
 328         self.conn: Optional[Connection] = connect(dsn).connection
 329         self.conn.autocommit = True
 330         self.sanitizer = sanitizer
 331         self.token_analysis = token_analysis
 332
 333         self._cache = _TokenCache()
 334
 335
 336     def close(self) -> None:
 337         """ Free all resources used by the analyzer.
 338         """
 339         if self.conn:
 340             self.conn.close()
 341             self.conn = None
 342
 343
 344     def _search_normalized(self, name: str) -> str:
 345         """ Return the search token transliteration of the given name.
 346         """
 347         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 348
 349
 350     def _normalized(self, name: str) -> str:
 351         """ Return the normalized version of the given name with all
 352             non-relevant information removed.
 353         """
 354         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 355
 356
 357     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 358         """ Return token information for the given list of words.
 359             If a word starts with # it is assumed to be a full name
 360             otherwise is a partial name.
 361
 362             The function returns a list of tuples with
 363             (original word, word token, word id).
 364
 365             The function is used for testing and debugging only
 366             and not necessarily efficient.
 367         """
 368         assert self.conn is not None
 369         full_tokens = {}
 370         partial_tokens = {}
 371         for word in words:
 372             if word.startswith('#'):
 373                 full_tokens[word] = self._search_normalized(word[1:])
 374             else:
 375                 partial_tokens[word] = self._search_normalized(word)
 376
 377         with self.conn.cursor() as cur:
 378             cur.execute("""SELECT word_token, word_id
 379                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 380                         """, (list(full_tokens.values()),))
 381             full_ids = {r[0]: r[1] for r in cur}
 382             cur.execute("""SELECT word_token, word_id
 383                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 384                         (list(partial_tokens.values()),))
 385             part_ids = {r[0]: r[1] for r in cur}
 386
 387         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 388                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 389
 390
 391     def normalize_postcode(self, postcode: str) -> str:
 392         """ Convert the postcode to a standardized form.
 393
 394             This function must yield exactly the same result as the SQL function
 395             'token_normalized_postcode()'.
 396         """
 397         return postcode.strip().upper()
 398
 399
 400     def update_postcodes_from_db(self) -> None:
 401         """ Update postcode tokens in the word table from the location_postcode
 402             table.
 403         """
 404         assert self.conn is not None
 405         analyzer = self.token_analysis.analysis.get('@postcode')
 406
 407         with self.conn.cursor() as cur:
 408             # First get all postcode names currently in the word table.
 409             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 410             word_entries = set((entry[0] for entry in cur))
 411
 412             # Then compute the required postcode names from the postcode table.
 413             needed_entries = set()
 414             cur.execute("SELECT country_code, postcode FROM location_postcode")
 415             for cc, postcode in cur:
 416                 info = PlaceInfo({'country_code': cc,
 417                                   'class': 'place', 'type': 'postcode',
 418                                   'address': {'postcode': postcode}})
 419                 address = self.sanitizer.process_names(info)[1]
 420                 for place in address:
 421                     if place.kind == 'postcode':
 422                         if analyzer is None:
 423                             postcode_name = place.name.strip().upper()
 424                             variant_base = None
 425                         else:
 426                             postcode_name = analyzer.get_canonical_id(place)
 427                             variant_base = place.get_attr("variant")
 428
 429                         if variant_base:
 430                             needed_entries.add(f'{postcode_name}@{variant_base}')
 431                         else:
 432                             needed_entries.add(postcode_name)
 433                         break
 434
 435         # Now update the word table.
 436         self._delete_unused_postcode_words(word_entries - needed_entries)
 437         self._add_missing_postcode_words(needed_entries - word_entries)
 438
 439     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
 440         assert self.conn is not None
 441         if tokens:
 442             with self.conn.cursor() as cur:
 443                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 444                             (list(tokens), ))
 445
 446     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
 447         assert self.conn is not None
 448         if not tokens:
 449             return
 450
 451         analyzer = self.token_analysis.analysis.get('@postcode')
 452         terms = []
 453
 454         for postcode_name in tokens:
 455             if '@' in postcode_name:
 456                 term, variant = postcode_name.split('@', 2)
 457                 term = self._search_normalized(term)
 458                 if analyzer is None:
 459                     variants = [term]
 460                 else:
 461                     variants = analyzer.compute_variants(variant)
 462                     if term not in variants:
 463                         variants.append(term)
 464             else:
 465                 variants = [self._search_normalized(postcode_name)]
 466             terms.append((postcode_name, variants))
 467
 468         if terms:
 469             with self.conn.cursor() as cur:
 470                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 471                                       FROM (VALUES %s) AS v(pc, var)""",
 472                                    terms)
 473
 474
 475
 476
 477     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 478                                should_replace: bool) -> None:
 479         """ Replace the search index for special phrases with the new phrases.
 480             If `should_replace` is True, then the previous set of will be
 481             completely replaced. Otherwise the phrases are added to the
 482             already existing ones.
 483         """
 484         assert self.conn is not None
 485         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 486                             for p in phrases))
 487
 488         with self.conn.cursor() as cur:
 489             # Get the old phrases.
 490             existing_phrases = set()
 491             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 492             for word, info in cur:
 493                 existing_phrases.add((word, info['class'], info['type'],
 494                                       info.get('op') or '-'))
 495
 496             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 497             if should_replace:
 498                 deleted = self._remove_special_phrases(cur, norm_phrases,
 499                                                        existing_phrases)
 500             else:
 501                 deleted = 0
 502
 503         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 504                  len(norm_phrases), added, deleted)
 505
 506
 507     def _add_special_phrases(self, cursor: Cursor,
 508                              new_phrases: Set[Tuple[str, str, str, str]],
 509                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 510         """ Add all phrases to the database that are not yet there.
 511         """
 512         to_add = new_phrases - existing_phrases
 513
 514         added = 0
 515         with CopyBuffer() as copystr:
 516             for word, cls, typ, oper in to_add:
 517                 term = self._search_normalized(word)
 518                 if term:
 519                     copystr.add(term, 'S', word,
 520                                 json.dumps({'class': cls, 'type': typ,
 521                                             'op': oper if oper in ('in', 'near') else None}))
 522                     added += 1
 523
 524             copystr.copy_out(cursor, 'word',
 525                              columns=['word_token', 'type', 'word', 'info'])
 526
 527         return added
 528
 529
 530     def _remove_special_phrases(self, cursor: Cursor,
 531                              new_phrases: Set[Tuple[str, str, str, str]],
 532                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 533         """ Remove all phrases from the database that are no longer in the
 534             new phrase list.
 535         """
 536         to_delete = existing_phrases - new_phrases
 537
 538         if to_delete:
 539             cursor.execute_values(
 540                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 541                     WHERE type = 'S' and word = name
 542                           and info->>'class' = in_class and info->>'type' = in_type
 543                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 544                 """, to_delete)
 545
 546         return len(to_delete)
 547
 548
 549     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 550         """ Add default names for the given country to the search index.
 551         """
 552         # Make sure any name preprocessing for country names applies.
 553         info = PlaceInfo({'name': names, 'country_code': country_code,
 554                           'rank_address': 4, 'class': 'boundary',
 555                           'type': 'administrative'})
 556         self._add_country_full_names(country_code,
 557                                      self.sanitizer.process_names(info)[0],
 558                                      internal=True)
 559
 560
 561     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
 562                                 internal: bool = False) -> None:
 563         """ Add names for the given country from an already sanitized
 564             name list.
 565         """
 566         assert self.conn is not None
 567         word_tokens = set()
 568         for name in names:
 569             norm_name = self._search_normalized(name.name)
 570             if norm_name:
 571                 word_tokens.add(norm_name)
 572
 573         with self.conn.cursor() as cur:
 574             # Get existing names
 575             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 576                              FROM word
 577                              WHERE type = 'C' and word = %s""",
 578                         (country_code, ))
 579             # internal/external names
 580             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
 581             for word in cur:
 582                 existing_tokens[word[1]].add(word[0])
 583
 584             # Delete names that no longer exist.
 585             gone_tokens = existing_tokens[internal] - word_tokens
 586             if internal:
 587                 gone_tokens.update(existing_tokens[False] & word_tokens)
 588             if gone_tokens:
 589                 cur.execute("""DELETE FROM word
 590                                USING unnest(%s) as token
 591                                WHERE type = 'C' and word = %s
 592                                      and word_token = token""",
 593                             (list(gone_tokens), country_code))
 594
 595             # Only add those names that are not yet in the list.
 596             new_tokens = word_tokens - existing_tokens[True]
 597             if not internal:
 598                 new_tokens -= existing_tokens[False]
 599             if new_tokens:
 600                 if internal:
 601                     sql = """INSERT INTO word (word_token, type, word, info)
 602                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 603                                   FROM unnest(%s) as token)
 604                            """
 605                 else:
 606                     sql = """INSERT INTO word (word_token, type, word)
 607                                    (SELECT token, 'C', %s
 608                                     FROM unnest(%s) as token)
 609                           """
 610                 cur.execute(sql, (country_code, list(new_tokens)))
 611
 612
 613     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 614         """ Determine tokenizer information about the given place.
 615
 616             Returns a JSON-serializable structure that will be handed into
 617             the database via the token_info field.
 618         """
 619         token_info = _TokenInfo()
 620
 621         names, address = self.sanitizer.process_names(place)
 622
 623         if names:
 624             token_info.set_names(*self._compute_name_tokens(names))
 625
 626             if place.is_country():
 627                 assert place.country_code is not None
 628                 self._add_country_full_names(place.country_code, names)
 629
 630         if address:
 631             self._process_place_address(token_info, address)
 632
 633         return token_info.to_dict()
 634
 635
 636     def _process_place_address(self, token_info: '_TokenInfo',
 637                                address: Sequence[PlaceName]) -> None:
 638         for item in address:
 639             if item.kind == 'postcode':
 640                 token_info.set_postcode(self._add_postcode(item))
 641             elif item.kind == 'housenumber':
 642                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 643             elif item.kind == 'street':
 644                 token_info.add_street(self._retrieve_full_tokens(item.name))
 645             elif item.kind == 'place':
 646                 if not item.suffix:
 647                     token_info.add_place(self._compute_partial_tokens(item.name))
 648             elif not item.kind.startswith('_') and not item.suffix and \
 649                  item.kind not in ('country', 'full', 'inclusion'):
 650                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 651
 652
 653     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
 654         """ Normalize the housenumber and return the word token and the
 655             canonical form.
 656         """
 657         assert self.conn is not None
 658         analyzer = self.token_analysis.analysis.get('@housenumber')
 659         result: Tuple[Optional[int], Optional[str]] = (None, None)
 660
 661         if analyzer is None:
 662             # When no custom analyzer is set, simply normalize and transliterate
 663             norm_name = self._search_normalized(hnr.name)
 664             if norm_name:
 665                 result = self._cache.housenumbers.get(norm_name, result)
 666                 if result[0] is None:
 667                     with self.conn.cursor() as cur:
 668                         hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 669
 670                         result = hid, norm_name
 671                         self._cache.housenumbers[norm_name] = result
 672         else:
 673             # Otherwise use the analyzer to determine the canonical name.
 674             # Per convention we use the first variant as the 'lookup name', the
 675             # name that gets saved in the housenumber field of the place.
 676             word_id = analyzer.get_canonical_id(hnr)
 677             if word_id:
 678                 result = self._cache.housenumbers.get(word_id, result)
 679                 if result[0] is None:
 680                     variants = analyzer.compute_variants(word_id)
 681                     if variants:
 682                         with self.conn.cursor() as cur:
 683                             hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
 684                                              (word_id, list(variants)))
 685                             result = hid, variants[0]
 686                             self._cache.housenumbers[word_id] = result
 687
 688         return result
 689
 690
 691     def _compute_partial_tokens(self, name: str) -> List[int]:
 692         """ Normalize the given term, split it into partial words and return
 693             then token list for them.
 694         """
 695         assert self.conn is not None
 696         norm_name = self._search_normalized(name)
 697
 698         tokens = []
 699         need_lookup = []
 700         for partial in norm_name.split():
 701             token = self._cache.partials.get(partial)
 702             if token:
 703                 tokens.append(token)
 704             else:
 705                 need_lookup.append(partial)
 706
 707         if need_lookup:
 708             with self.conn.cursor() as cur:
 709                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 710                                FROM unnest(%s) word""",
 711                             (need_lookup, ))
 712
 713                 for partial, token in cur:
 714                     assert token is not None
 715                     tokens.append(token)
 716                     self._cache.partials[partial] = token
 717
 718         return tokens
 719
 720
 721     def _retrieve_full_tokens(self, name: str) -> List[int]:
 722         """ Get the full name token for the given name, if it exists.
 723             The name is only retrieved for the standard analyser.
 724         """
 725         assert self.conn is not None
 726         norm_name = self._search_normalized(name)
 727
 728         # return cached if possible
 729         if norm_name in self._cache.fulls:
 730             return self._cache.fulls[norm_name]
 731
 732         with self.conn.cursor() as cur:
 733             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 734                         (norm_name, ))
 735             full = [row[0] for row in cur]
 736
 737         self._cache.fulls[norm_name] = full
 738
 739         return full
 740
 741
 742     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 743         """ Computes the full name and partial name tokens for the given
 744             dictionary of names.
 745         """
 746         assert self.conn is not None
 747         full_tokens: Set[int] = set()
 748         partial_tokens: Set[int] = set()
 749
 750         for name in names:
 751             analyzer_id = name.get_attr('analyzer')
 752             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 753             word_id = analyzer.get_canonical_id(name)
 754             if analyzer_id is None:
 755                 token_id = word_id
 756             else:
 757                 token_id = f'{word_id}@{analyzer_id}'
 758
 759             full, part = self._cache.names.get(token_id, (None, None))
 760             if full is None:
 761                 variants = analyzer.compute_variants(word_id)
 762                 if not variants:
 763                     continue
 764
 765                 with self.conn.cursor() as cur:
 766                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 767                                 (token_id, variants))
 768                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 769
 770                 self._cache.names[token_id] = (full, part)
 771
 772             assert part is not None
 773
 774             full_tokens.add(full)
 775             partial_tokens.update(part)
 776
 777         return full_tokens, partial_tokens
 778
 779
 780     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 781         """ Make sure the normalized postcode is present in the word table.
 782         """
 783         assert self.conn is not None
 784         analyzer = self.token_analysis.analysis.get('@postcode')
 785
 786         if analyzer is None:
 787             postcode_name = item.name.strip().upper()
 788             variant_base = None
 789         else:
 790             postcode_name = analyzer.get_canonical_id(item)
 791             variant_base = item.get_attr("variant")
 792
 793         if variant_base:
 794             postcode = f'{postcode_name}@{variant_base}'
 795         else:
 796             postcode = postcode_name
 797
 798         if postcode not in self._cache.postcodes:
 799             term = self._search_normalized(postcode_name)
 800             if not term:
 801                 return None
 802
 803             variants = {term}
 804             if analyzer is not None and variant_base:
 805                 variants.update(analyzer.compute_variants(variant_base))
 806
 807             with self.conn.cursor() as cur:
 808                 cur.execute("SELECT create_postcode_word(%s, %s)",
 809                             (postcode, list(variants)))
 810             self._cache.postcodes.add(postcode)
 811
 812         return postcode_name
 813
 814
 815 class _TokenInfo:
 816     """ Collect token information to be sent back to the database.
 817     """
 818     def __init__(self) -> None:
 819         self.names: Optional[str] = None
 820         self.housenumbers: Set[str] = set()
 821         self.housenumber_tokens: Set[int] = set()
 822         self.street_tokens: Optional[Set[int]] = None
 823         self.place_tokens: Set[int] = set()
 824         self.address_tokens: Dict[str, str] = {}
 825         self.postcode: Optional[str] = None
 826
 827
 828     def _mk_array(self, tokens: Iterable[Any]) -> str:
 829         return f"{{{','.join((str(s) for s in tokens))}}}"
 830
 831
 832     def to_dict(self) -> Dict[str, Any]:
 833         """ Return the token information in database importable format.
 834         """
 835         out: Dict[str, Any] = {}
 836
 837         if self.names:
 838             out['names'] = self.names
 839
 840         if self.housenumbers:
 841             out['hnr'] = ';'.join(self.housenumbers)
 842             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 843
 844         if self.street_tokens is not None:
 845             out['street'] = self._mk_array(self.street_tokens)
 846
 847         if self.place_tokens:
 848             out['place'] = self._mk_array(self.place_tokens)
 849
 850         if self.address_tokens:
 851             out['addr'] = self.address_tokens
 852
 853         if self.postcode:
 854             out['postcode'] = self.postcode
 855
 856         return out
 857
 858
 859     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 860         """ Adds token information for the normalised names.
 861         """
 862         self.names = self._mk_array(itertools.chain(fulls, partials))
 863
 864
 865     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 866         """ Extract housenumber information from a list of normalised
 867             housenumbers.
 868         """
 869         if token:
 870             assert hnr is not None
 871             self.housenumbers.add(hnr)
 872             self.housenumber_tokens.add(token)
 873
 874
 875     def add_street(self, tokens: Iterable[int]) -> None:
 876         """ Add addr:street match terms.
 877         """
 878         if self.street_tokens is None:
 879             self.street_tokens = set()
 880         self.street_tokens.update(tokens)
 881
 882
 883     def add_place(self, tokens: Iterable[int]) -> None:
 884         """ Add addr:place search and match terms.
 885         """
 886         self.place_tokens.update(tokens)
 887
 888
 889     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 890         """ Add additional address terms.
 891         """
 892         if partials:
 893             self.address_tokens[key] = self._mk_array(partials)
 894
 895     def set_postcode(self, postcode: Optional[str]) -> None:
 896         """ Set the postcode to the given one.
 897         """
 898         self.postcode = postcode
 899
 900
 901 class _TokenCache:
 902     """ Cache for token information to avoid repeated database queries.
 903
 904         This cache is not thread-safe and needs to be instantiated per
 905         analyzer.
 906     """
 907     def __init__(self) -> None:
 908         self.names: Dict[str, Tuple[int, List[int]]] = {}
 909         self.partials: Dict[str, int] = {}
 910         self.fulls: Dict[str, List[int]] = {}
 911         self.postcodes: Set[str] = set()
 912         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}