git.openstreetmap.org Git - nominatim.git/commitdiff
rerank results by query
author    Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Sep 2023 14:18:09 +0000 (16:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 4 Oct 2023 12:58:14 +0000 (14:58 +0200)
The algorithm is similar to the PHP reranking and checks the terms
from the display name against the query terms. However, instead of
exact matching it uses a per-word edit distance, so that it is less
strict about mismatched accents and other single-letter differences.

Country names get a higher penalty because they don't receive a
penalty during token matching right now.

This will work badly with the legacy tokenizer. Given that it is
marked for removal, it is simply not worth optimising for it.
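
Illustration (not part of the commit): the penalty computation described
above can be sketched as a standalone function. The difflib-based matching
mirrors the code added to geocoder.py below; the function name and the
sample inputs are made up for the example.

    import re
    import difflib

    def rerank_penalty(query: str, display_name: str) -> float:
        """ Penalty for how badly the display name covers the query words
            (illustrative sketch; larger values mean a worse match).
        """
        qwords = [w for w in re.split('[, ]+', query.lower()) if w]
        words = {w for w in re.split('[, ]+', display_name.lower()) if w}
        if not qwords or not words:
            return 0.0

        distance = 0.0
        for qword in qwords:
            # Best fuzzy match of the query word against any display-name word.
            wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio()
                        for w in words)
            # Below 0.5 similarity the word counts as completely unmatched.
            distance += len(qword) if wdist < 0.5 else (1.0 - wdist) * len(qword)

        # Normalise by total query length, scaled down by half.
        return distance * 0.5 / sum(len(w) for w in qwords)

    # A one-letter accent difference costs far less than an unmatched word:
    print(rerank_penalty('bergstrasse', 'Bergstraße, Heidelberg'))  # ~0.07
    print(rerank_penalty('bergstrasse', 'Hauptplatz, Wien'))        # 0.5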

nominatim/api/search/geocoder.py
nominatim/api/search/legacy_tokenizer.py
nominatim/api/search/query_analyzer_factory.py

nominatim/api/search/geocoder.py
index f88bffbd367bb3b78042375565dc22e820816e6f..5dbc09487528e943587a043d6c44fa8f00f8c538 100644 (file)
@@ -9,7 +9,9 @@ Public interface to the search code.
 """
 from typing import List, Any, Optional, Iterator, Tuple
 import itertools
+import re
 import datetime as dt
+import difflib
 
 from nominatim.api.connection import SearchConnection
 from nominatim.api.types import SearchDetails
@@ -92,23 +94,56 @@ class ForwardGeocoder:
             if dt.datetime.now() >= end_time:
                 break
 
+        return results
+
+
+    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
+        """ Remove badly matching results, sort by ranking and
+            limit to the configured number of results.
+        """
         if results:
             min_ranking = min(r.ranking for r in results)
             results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
+            results.sort(key=lambda r: r.ranking)
 
         if results:
-            min_rank = min(r.rank_search for r in results)
-
+            min_rank = results[0].rank_search
             results = SearchResults(r for r in results
                                     if r.ranking + 0.05 * (r.rank_search - min_rank)
                                        < min_ranking + 0.5)
 
-            results.sort(key=lambda r: r.accuracy - r.calculated_importance())
             results = SearchResults(results[:self.limit])
 
         return results
 
 
+    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
+        """ Adjust the accuracy of the localized result according to how well
+            they match the original query.
+        """
+        assert self.query_analyzer is not None
+        qwords = [word for phrase in query.source
+                       for word in re.split('[, ]+', phrase.text) if word]
+        if not qwords:
+            return
+
+        for result in results:
+            if not result.display_name:
+                continue
+            distance = 0.0
+            norm = self.query_analyzer.normalize_text(result.display_name)
+            words = set((w for w in norm.split(' ') if w))
+            if not words:
+                continue
+            for qword in qwords:
+                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
+                if wdist < 0.5:
+                    distance += len(qword)
+                else:
+                    distance += (1.0 - wdist) * len(qword)
+            result.accuracy += distance * 0.5 / sum(len(w) for w in qwords)
+
+
     async def lookup_pois(self, categories: List[Tuple[str, str]],
                           phrases: List[Phrase]) -> SearchResults:
         """ Look up places by category. If phrase is given, a place search
@@ -123,13 +158,16 @@ class ForwardGeocoder:
             if query:
                 searches = [wrap_near_search(categories, s) for s in searches[:50]]
                 results = await self.execute_searches(query, searches)
+                await add_result_details(self.conn, results, self.params)
+                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+                results = self.sort_and_cut_results(results)
             else:
                 results = SearchResults()
         else:
             search = build_poi_search(categories, self.params.countries)
             results = await search.lookup(self.conn, self.params)
+            await add_result_details(self.conn, results, self.params)
 
-        await add_result_details(self.conn, results, self.params)
         log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
@@ -150,6 +188,10 @@ class ForwardGeocoder:
             # Execute SQL until an appropriate result is found.
             results = await self.execute_searches(query, searches[:50])
             await add_result_details(self.conn, results, self.params)
+            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+            self.rerank_by_query(query, results)
+            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
+            results = self.sort_and_cut_results(results)
             log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
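
Reading aid (not part of the commit): sort_and_cut_results applies a
two-stage cut, first on the plain ranking and then with a weight on
rank_search, so that fine-grained features need a clearly better ranking
to beat coarser ones. A standalone sketch with a simplified, hypothetical
stand-in for the result type:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Result:            # simplified stand-in for nominatim's result type
        ranking: float       # lower is better
        rank_search: int     # address rank; lower means coarser feature

    def sort_and_cut(results: List[Result], limit: int = 10) -> List[Result]:
        if not results:
            return results
        # Stage 1: drop everything more than 0.5 worse than the best ranking.
        min_ranking = min(r.ranking for r in results)
        results = [r for r in results if r.ranking < min_ranking + 0.5]
        results.sort(key=lambda r: r.ranking)
        # Stage 2: additionally penalise results that are more fine-grained
        # (higher rank_search) than the best remaining result.
        min_rank = results[0].rank_search
        results = [r for r in results
                   if r.ranking + 0.05 * (r.rank_search - min_rank)
                      < min_ranking + 0.5]
        return results[:limit]

    # The street (rank 26) is cut: its weighted ranking 0.30 + 0.05 * 10
    # exceeds the 0.75 threshold set by the city (rank 16):
    print(sort_and_cut([Result(0.30, 26), Result(0.25, 16), Result(0.9, 30)]))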
nominatim/api/search/legacy_tokenizer.py
index 3346584ccd1b35b4e74e4725ee079cb54e45a905..26e4c126b626038f35b8fc97d447d98468d91372 100644 (file)
@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
         return query
 
 
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form.
+
+            This only removes case, so some differences from the
+            normalization applied to the query phrases remain.
+        """
+        return text.lower()
+
+
     def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                             Dict[str, List[qmod.TokenRange]]]:
         """ Transliterate the phrases and split them into tokens.
nominatim/api/search/query_analyzer_factory.py
index 35649d0ffe4cb544daf5a07a0df17ebbfe159d81..bbc1eb6b1d787c483fc8086912279afda0a53b1a 100644 (file)
@@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC):
         """
 
 
+    @abstractmethod
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form. That is the
+            standardized form search will work with. All information removed
+            at this stage is inevitably lost.
+        """
+
+
+
 async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
     """ Create a query analyzer for the tokenizer used by the database.
     """