drop category tokens when they make up a full phrase

author Sarah Hoffmann <lonvia@denofr.de>
Sun, 26 Nov 2023 19:58:50 +0000 (20:58 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 26 Nov 2023 19:58:50 +0000 (20:58 +0100)
diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py

index 5d75eb0fbe98c492638bbb174b5db930490f6788..4bf009a53a7add87b44cee2ac2508b72e1846f2b 100644 (file)
--- a/nominatim/api/search/query.py
+++ b/nominatim/api/search/query.py
@@ -70,14 +70,16 @@ class PhraseType(enum.Enum):
      COUNTRY = enum.auto()
      """ Contains the country name or code. """
  
-    def compatible_with(self, ttype: TokenType) -> bool:
+    def compatible_with(self, ttype: TokenType,
+                        is_full_phrase: bool) -> bool:
          """ Check if the given token type can be used with the phrase type.
          """
          if self == PhraseType.NONE:
-            return True
+            return not is_full_phrase or ttype != TokenType.QUALIFIER
          if self == PhraseType.AMENITY:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL,
-                             TokenType.QUALIFIER, TokenType.CATEGORY)
+            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
+                   or (is_full_phrase and ttype == TokenType.CATEGORY)\
+                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
          if self == PhraseType.STREET:
              return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
          if self == PhraseType.POSTCODE:
@@ -244,7 +246,9 @@ class QueryStruct:
              be added to, then the token is silently dropped.
          """
          snode = self.nodes[trange.start]
-        if snode.ptype.compatible_with(ttype):
+        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
+                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        if snode.ptype.compatible_with(ttype, full_phrase):
              tlist = snode.get_tokens(trange.end, ttype)
              if tlist is None:
                  snode.starting.append(TokenList(trange.end, ttype, [token]))
diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py

index f8c9c2dc865ba9f8ca527014c1d292dfbba14313..69a17412cf14170cd0a2c6a9209dab73676dc5b1 100644 (file)
--- a/test/python/api/search/test_api_search_query.py
+++ b/test/python/api/search/test_api_search_query.py
@@ -28,12 +28,12 @@ def mktoken(tid: int):
                                           ('COUNTRY', 'COUNTRY'),
                                           ('POSTCODE', 'POSTCODE')])
  def test_phrase_compatible(ptype, ttype):
-    assert query.PhraseType[ptype].compatible_with(query.TokenType[ttype])
+    assert query.PhraseType[ptype].compatible_with(query.TokenType[ttype], False)
  
  
  @pytest.mark.parametrize('ptype', ['COUNTRY', 'POSTCODE'])
  def test_phrase_incompatible(ptype):
-    assert not query.PhraseType[ptype].compatible_with(query.TokenType.PARTIAL)
+    assert not query.PhraseType[ptype].compatible_with(query.TokenType.PARTIAL, True)
  
  
  def test_query_node_empty():
@@ -99,3 +99,36 @@ def test_query_struct_incompatible_token():
  
      assert q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL) == []
      assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.COUNTRY)) == 1
+
+
+def test_query_struct_amenity_single_word():
+    q = query.QueryStruct([query.Phrase(query.PhraseType.AMENITY, 'bar')])
+    q.add_node(query.BreakType.END, query.PhraseType.NONE)
+
+    q.add_token(query.TokenRange(0, 1), query.TokenType.PARTIAL, mktoken(1))
+    q.add_token(query.TokenRange(0, 1), query.TokenType.CATEGORY, mktoken(2))
+    q.add_token(query.TokenRange(0, 1), query.TokenType.QUALIFIER, mktoken(3))
+
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.CATEGORY)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.QUALIFIER)) == 0
+
+
+def test_query_struct_amenity_two_words():
+    q = query.QueryStruct([query.Phrase(query.PhraseType.AMENITY, 'foo bar')])
+    q.add_node(query.BreakType.WORD, query.PhraseType.AMENITY)
+    q.add_node(query.BreakType.END, query.PhraseType.NONE)
+
+    for trange in [(0, 1), (1, 2)]:
+        q.add_token(query.TokenRange(*trange), query.TokenType.PARTIAL, mktoken(1))
+        q.add_token(query.TokenRange(*trange), query.TokenType.CATEGORY, mktoken(2))
+        q.add_token(query.TokenRange(*trange), query.TokenType.QUALIFIER, mktoken(3))
+
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.CATEGORY)) == 0
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.QUALIFIER)) == 1
+
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.CATEGORY)) == 0
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.QUALIFIER)) == 1
+
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py

index c93b8ead3c2fda0a49320726d72bda6c4282bbb1..c10a6c77f2917828b1ca4007f36789db5881b1e6 100644 (file)
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -21,21 +21,18 @@ class MyToken(Token):
  
  
  def make_query(*args):
-    q = None
+    q = QueryStruct([Phrase(PhraseType.NONE, '')])
  
-    for tlist in args:
-        if q is None:
-            q = QueryStruct([Phrase(PhraseType.NONE, '')])
-        else:
-            q.add_node(BreakType.WORD, PhraseType.NONE)
+    for _ in range(max(inner[0] for tlist in args for inner in tlist)):
+        q.add_node(BreakType.WORD, PhraseType.NONE)
+    q.add_node(BreakType.END, PhraseType.NONE)
  
-        start = len(q.nodes) - 1
+    for start, tlist in enumerate(args):
          for end, ttype, tinfo in tlist:
              for tid, word in tinfo:
                  q.add_token(TokenRange(start, end), ttype,
                              MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
  
-    q.add_node(BreakType.END, PhraseType.NONE)
  
      return q
  
diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py

index dc123403ab24185aa78e59d842cecb0bce48e296..6dc25b1e7507159dc2163d03ac13451d817559fe 100644 (file)
--- a/test/python/api/search/test_token_assignment.py
+++ b/test/python/api/search/test_token_assignment.py
@@ -18,21 +18,17 @@ class MyToken(Token):
  
  
  def make_query(*args):
-    q = None
+    q = QueryStruct([Phrase(args[0][1], '')])
      dummy = MyToken(3.0, 45, 1, 'foo', True)
  
-    for btype, ptype, tlist in args:
-        if q is None:
-            q = QueryStruct([Phrase(ptype, '')])
-        else:
-            q.add_node(btype, ptype)
+    for btype, ptype, _ in args[1:]:
+        q.add_node(btype, ptype)
+    q.add_node(BreakType.END, PhraseType.NONE)
  
-        start = len(q.nodes) - 1
-        for end, ttype in tlist:
+    for start, t in enumerate(args):
+        for end, ttype in t[2]:
              q.add_token(TokenRange(start, end), ttype, dummy)
  
-    q.add_node(BreakType.END, PhraseType.NONE)
-
      return q
author	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 26 Nov 2023 19:58:50 +0000 (20:58 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 26 Nov 2023 19:58:50 +0000 (20:58 +0100)
nominatim/api/search/query.py		patch | blob | history
test/python/api/search/test_api_search_query.py		patch | blob | history
test/python/api/search/test_db_search_builder.py		patch | blob | history
test/python/api/search/test_token_assignment.py		patch | blob | history