git.openstreetmap.org Git - nominatim.git/commitdiff
add japanese sanitizer
author miku0 <miku.f.810129@gmail.com>
Wed, 26 Jul 2023 07:54:58 +0000 (07:54 +0000)
committer miku0 <miku.f.810129@gmail.com>
Wed, 26 Jul 2023 07:54:58 +0000 (07:54 +0000)
docs/customize/Tokenizers.md
nominatim/tokenizer/sanitizers/tag_japanese.py [new file with mode: 0644]
settings/icu_tokenizer.yaml
test/bdd/db/query/japanese.feature [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_tag_japanese.py [new file with mode: 0644]

index 11c27e38b903ae0683ace099f417ec16b1077bc8..6199ea4252469537a0c3953415cff05795735cdd 100644 (file)
@@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
     rendering:
         heading_level: 6
 
+#### tag-japanese
+
+::: nominatim.tokenizer.sanitizers.tag_japanese
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
 #### Token Analysis
 
 Token analyzers take a full name and transform it into one or more normalized
diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py
new file mode 100644 (file)
index 0000000..81d3d5b
--- /dev/null
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+This sanitizer maps OSM data to Japanese block addresses.
+It merges block_number and housenumber into a single housenumber,
+and quarter and neighbourhood into a single place.
+Kanji digits in names and address parts are converted to Arabic digits.
+"""
+
+
+from typing import Callable
+from typing import List
+from typing import Optional
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.data.place_name import PlaceName
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+    """Set up the sanitizer.
+
+    The configuration argument is not used; the returned handler is
+    always `tag_japanese`.
+    """
+    return tag_japanese
+
+# Translation table from the Kanji digits 零-九 to Arabic digits.
+# Built once at import time instead of on every call.
+_KANJI_DIGITS = str.maketrans({
+    '零': '0',
+    '一': '1',
+    '二': '2',
+    '三': '3',
+    '四': '4',
+    '五': '5',
+    '六': '6',
+    '七': '7',
+    '八': '8',
+    '九': '9',
+})
+
+
+def convert_kanji_sequence_to_number(sequence: str) -> str:
+    """Convert Kanji digits in a string to Arabic digits.
+
+    Each of the characters 零, 一, 二, ..., 九 is replaced by the
+    corresponding digit 0-9. All other characters are kept unchanged,
+    so the conversion is purely digit-by-digit: multiplier characters
+    such as 十 are not interpreted ('二十三' becomes '2十3').
+
+    Arguments:
+        sequence: The text to convert.
+
+    Returns:
+        The text with every Kanji digit replaced by its Arabic digit.
+    """
+    return sequence.translate(_KANJI_DIGITS)
+
+def reconbine_housenumber(
+    new_address: List[PlaceName],
+    tmp_housenumber: Optional[str],
+    tmp_blocknumber: Optional[str]
+) -> List[PlaceName]:
+    """ Recombine the tag of housenumber by using housenumber and blocknumber
+
+        When both values are given, a single 'housenumber' entry named
+        '<blocknumber>-<housenumber>' is appended to `new_address`. When
+        only one is given, that value is used on its own. When neither is
+        given (None or empty), nothing is appended.
+
+        Arguments:
+            new_address: List to append to; it is modified in place.
+            tmp_housenumber: House number, if any.
+            tmp_blocknumber: Block number, if any.
+
+        Returns:
+            The same list object that was passed in as `new_address`.
+    """
+    # Block number goes first; missing or empty parts are left out.
+    parts = [part for part in (tmp_blocknumber, tmp_housenumber) if part]
+    if parts:
+        new_address.append(
+            PlaceName(kind='housenumber', name='-'.join(parts), suffix='')
+        )
+    return new_address
+
+def reconbine_place(
+    new_address: List[PlaceName],
+    tmp_neighbourhood: Optional[str],
+    tmp_quarter: Optional[str]
+) -> List[PlaceName]:
+    """ Recombine the tag of place by using neighbourhood and quarter
+
+        When both values are given, a single 'place' entry named
+        '<quarter><neighbourhood>' (no separator) is appended to
+        `new_address`. When only one is given, that value is used on its
+        own. When neither is given (None or empty), nothing is appended.
+
+        Arguments:
+            new_address: List to append to; it is modified in place.
+            tmp_neighbourhood: Neighbourhood name, if any.
+            tmp_quarter: Quarter name, if any.
+
+        Returns:
+            The same list object that was passed in as `new_address`.
+    """
+    # Quarter goes first; missing or empty parts are left out.
+    parts = [part for part in (tmp_quarter, tmp_neighbourhood) if part]
+    if parts:
+        new_address.append(
+            PlaceName(kind='place', name=''.join(parts), suffix='')
+        )
+    return new_address
+def tag_japanese(obj: ProcessInfo) -> None:
+    """Rewrite the address of a Japanese place into block-address form.
+
+    Places whose country code is not 'jp' are left untouched. Otherwise
+    Kanji digits in all names and address parts are converted to Arabic
+    digits, the 'housenumber'/'block_number' parts are merged into one
+    'housenumber' entry and the 'neighbourhood'/'quarter' parts into one
+    'place' entry. The merged entries are placed after the remaining
+    address parts. If a kind occurs more than once, the last one wins.
+    """
+    if obj.place.country_code != 'jp':
+        return
+
+    for entry in obj.names:
+        entry.name = convert_kanji_sequence_to_number(entry.name)
+
+    # Address kinds that get merged; everything else is passed through.
+    collected = dict.fromkeys(
+        ('housenumber', 'block_number', 'neighbourhood', 'quarter')
+    )
+    kept = []
+    for entry in obj.address:
+        entry.name = convert_kanji_sequence_to_number(entry.name)
+        if entry.kind in collected:
+            collected[entry.kind] = entry.name
+        else:
+            kept.append(entry)
+
+    kept = reconbine_housenumber(
+        kept, collected['housenumber'], collected['block_number']
+    )
+    kept = reconbine_place(
+        kept, collected['neighbourhood'], collected['quarter']
+    )
+
+    obj.address = [entry for entry in kept if entry.name is not None]
index 1fa467befebfa4c5977103d6731064dd760791ff..c5a809c68319f3095f2d9b4bf06c6456ff4b2b05 100644 (file)
@@ -45,6 +45,7 @@ sanitizers:
       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
       use-defaults: all
       mode: append
+    - step: tag-japanese
 token-analysis:
     - analyzer: generic
     - id: "@housenumber"
diff --git a/test/bdd/db/query/japanese.feature b/test/bdd/db/query/japanese.feature
new file mode 100644 (file)
index 0000000..f21e0f5
--- /dev/null
@@ -0,0 +1,29 @@
+@DB
+Feature: Searches in Japan
+    Test specifically for searches of Japanese addresses and in Japanese language.
+    Scenario: A block house-number is parented to the neighbourhood
+        Given the grid with origin JP
+          | 1 |   |   |   | 2 |
+          |   | 3 |   |   |   |
+          |   |   | 9 |   |   |
+          |   |   |   | 6 |   |
+        And the places
+          | osm | class   | type        | name       | geometry |
+          | W1  | highway | residential | 雉子橋通り | 1,2      |
+        And the places
+          | osm | class   | type       | housenr | addr+block_number | addr+neighbourhood | geometry |
+          | N3  | amenity | restaurant | 2       | 6                 | 2丁目              | 3        |
+        And the places
+          | osm | class | type          | name  | geometry |
+          | N9  | place | neighbourhood | 2丁目 | 9        |
+        And the places
+          | osm | class | type    | name | geometry |
+          | N6  | place | quarter | 加瀬 | 6        |
+        When importing
+        Then placex contains
+          | object | parent_place_id |
+          | N3     | N9              |
+        When sending search query "2丁目 6-2"
+        Then results contain
+          | osm |
+          | N3  |
diff --git a/test/python/tokenizer/sanitizers/test_tag_japanese.py b/test/python/tokenizer/sanitizers/test_tag_japanese.py
new file mode 100644 (file)
index 0000000..c82c426
--- /dev/null
@@ -0,0 +1,65 @@
+from nominatim.data.place_info import PlaceInfo
+from nominatim.data.place_name import PlaceName
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from typing import Mapping, Optional, List
+import pytest
+
+class TestTagJapanese:
+    """Tests for the tag-japanese sanitizer."""
+
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+    def run_sanitizer_on(self, type, **kwargs):
+        """Run the sanitizer on a place in Japan whose address tags are
+        given by `kwargs` and return the resulting address as a sorted
+        list of (name, kind) tuples.
+
+        `type` is not used; it is kept so existing calls keep working.
+        """
+        place = PlaceInfo({
+            'address': kwargs,
+            'country_code': 'jp'
+        })
+        sanitizer_args = {'step': 'tag-japanese'}
+        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
+        return sorted((p.name, p.kind) for p in address)
+
+    def test_on_address(self):
+        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
+        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]
+
+    def test_housenumber(self):
+        res = self.run_sanitizer_on('address', housenumber='2')
+        assert res == [('2', 'housenumber')]
+
+    def test_blocknumber(self):
+        res = self.run_sanitizer_on('address', block_number='6')
+        assert res == [('6', 'housenumber')]
+
+    def test_neighbourhood(self):
+        res = self.run_sanitizer_on('address', neighbourhood='8')
+        assert res == [('8', 'place')]
+
+    def test_quarter(self):
+        res = self.run_sanitizer_on('address', quarter='kase')
+        assert res == [('kase', 'place')]
+
+    # The next three tests used to share one name, so only the last of
+    # them was ever collected. Each now has its own name.
+    def test_housenumber_blocknumber(self):
+        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
+        assert res == [('6-2', 'housenumber')]
+
+    def test_housenumber_neighbourhood(self):
+        res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8')
+        assert res == [('2', 'housenumber'), ('8', 'place')]
+
+    def test_blocknumber_neighbourhood(self):
+        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
+        assert res == [('6', 'housenumber'), ('8', 'place')]
+
+    def test_housenumber_blocknumber_neighbourhood(self):
+        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
+                                    neighbourhood='8')
+        assert res == [('6-2', 'housenumber'), ('8', 'place')]
+
+    def test_housenumber_blocknumber_neighbourhood_quarter(self):
+        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
+                                    neighbourhood='8', quarter='kase')
+        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]
+
+    def test_neighbourhood_quarter(self):
+        res = self.run_sanitizer_on('address', neighbourhood='8', quarter='kase')
+        assert res == [('kase8', 'place')]