git.openstreetmap.org Git - nominatim.git/commitdiff
rerank results by query
author    Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Sep 2023 14:18:09 +0000 (16:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 4 Oct 2023 12:58:14 +0000 (14:58 +0200)
The algorithm is similar to the PHP reranking and checks the terms
from the display name against the query terms. However, instead of
exact matching it uses a per-word edit distance, so that it is less
strict about mismatched accents and other single-letter differences.

Country names get a higher penalty because they don't receive a
penalty during token matching right now.

This will work badly with the legacy tokenizer. Given that it is
marked for removal, it is simply not worth optimising for it.
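
Illustration (not part of the commit): the penalty computation described
above can be sketched as a standalone function. The difflib-based matching
mirrors the code added to geocoder.py below; the function name and the
sample inputs are made up for the example.

    import re
    import difflib

    def rerank_penalty(query: str, display_name: str) -> float:
        """ Penalty for how badly the display name covers the query words
            (illustrative sketch; larger values mean a worse match).
        """
        qwords = [w for w in re.split('[, ]+', query.lower()) if w]
        words = {w for w in re.split('[, ]+', display_name.lower()) if w}
        if not qwords or not words:
            return 0.0

        distance = 0.0
        for qword in qwords:
            # Best fuzzy match of the query word against any display-name word.
            wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio()
                        for w in words)
            # Below 0.5 similarity the word counts as completely unmatched.
            distance += len(qword) if wdist < 0.5 else (1.0 - wdist) * len(qword)

        # Normalise by total query length, scaled down by half.
        return distance * 0.5 / sum(len(w) for w in qwords)

    # A one-letter accent difference costs far less than an unmatched word:
    print(rerank_penalty('bergstrasse', 'Bergstraße, Heidelberg'))  # ~0.07
    print(rerank_penalty('bergstrasse', 'Hauptplatz, Wien'))        # 0.5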

nominatim/api/search/geocoder.py
nominatim/api/search/legacy_tokenizer.py
nominatim/api/search/query_analyzer_factory.py

nominatim/api/search/geocoder.py
index f88bffbd367bb3b78042375565dc22e820816e6f..5dbc09487528e943587a043d6c44fa8f00f8c538 100644 (file)
@@ -9,7 +9,9 @@ Public interface to the search code.
 """
 from typing import List, Any, Optional, Iterator, Tuple
 import itertools
+import re
 import datetime as dt
+import difflib
 
 from nominatim.api.connection import SearchConnection
 from nominatim.api.types import SearchDetails
@@ -92,23 +94,56 @@ class ForwardGeocoder:
             if dt.datetime.now() >= end_time:
                 break
 
+        return results
+
+
+    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
+        """ Remove badly matching results, sort by ranking and
+            limit to the configured number of results.
+        """
         if results:
             min_ranking = min(r.ranking for r in results)
             results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
+            results.sort(key=lambda r: r.ranking)
 
         if results:
-            min_rank = min(r.rank_search for r in results)
-
+            min_rank = results[0].rank_search
             results = SearchResults(r for r in results
                                     if r.ranking + 0.05 * (r.rank_search - min_rank)
                                        < min_ranking + 0.5)
 
-            results.sort(key=lambda r: r.accuracy - r.calculated_importance())
             results = SearchResults(results[:self.limit])
 
         return results
 
 
+    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
+        """ Adjust the accuracy of the localized result according to how well
+            they match the original query.
+        """
+        assert self.query_analyzer is not None
+        qwords = [word for phrase in query.source
+                       for word in re.split('[, ]+', phrase.text) if word]
+        if not qwords:
+            return
+
+        for result in results:
+            if not result.display_name:
+                continue
+            distance = 0.0
+            norm = self.query_analyzer.normalize_text(result.display_name)
+            words = set((w for w in norm.split(' ') if w))
+            if not words:
+                continue
+            for qword in qwords:
+                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
+                if wdist < 0.5:
+                    distance += len(qword)
+                else:
+                    distance += (1.0 - wdist) * len(qword)
+            result.accuracy += distance * 0.5 / sum(len(w) for w in qwords)
+
+
     async def lookup_pois(self, categories: List[Tuple[str, str]],
                           phrases: List[Phrase]) -> SearchResults:
         """ Look up places by category. If phrase is given, a place search
@@ -123,13 +158,16 @@ class ForwardGeocoder:
             if query:
                 searches = [wrap_near_search(categories, s) for s in searches[:50]]
                 results = await self.execute_searches(query, searches)
+                await add_result_details(self.conn, results, self.params)
+                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+                results = self.sort_and_cut_results(results)
             else:
                 results = SearchResults()
         else:
             search = build_poi_search(categories, self.params.countries)
             results = await search.lookup(self.conn, self.params)
+            await add_result_details(self.conn, results, self.params)
 
-        await add_result_details(self.conn, results, self.params)
         log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
@@ -150,6 +188,10 @@ class ForwardGeocoder:
             # Execute SQL until an appropriate result is found.
             results = await self.execute_searches(query, searches[:50])
             await add_result_details(self.conn, results, self.params)
+            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+            self.rerank_by_query(query, results)
+            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
+            results = self.sort_and_cut_results(results)
             log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
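
Reading aid (not part of the commit): sort_and_cut_results applies a
two-stage cut, first on the plain ranking and then with a weight on
rank_search, so that fine-grained features need a clearly better ranking
to beat coarser ones. A standalone sketch with a simplified, hypothetical
stand-in for the result type:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Result:            # simplified stand-in for nominatim's result type
        ranking: float       # lower is better
        rank_search: int     # address rank; lower means coarser feature

    def sort_and_cut(results: List[Result], limit: int = 10) -> List[Result]:
        if not results:
            return results
        # Stage 1: drop everything more than 0.5 worse than the best ranking.
        min_ranking = min(r.ranking for r in results)
        results = [r for r in results if r.ranking < min_ranking + 0.5]
        results.sort(key=lambda r: r.ranking)
        # Stage 2: additionally penalise results that are more fine-grained
        # (higher rank_search) than the best remaining result.
        min_rank = results[0].rank_search
        results = [r for r in results
                   if r.ranking + 0.05 * (r.rank_search - min_rank)
                      < min_ranking + 0.5]
        return results[:limit]

    # The street (rank 26) is cut: its weighted ranking 0.30 + 0.05 * 10
    # exceeds the 0.75 threshold set by the city (rank 16):
    print(sort_and_cut([Result(0.30, 26), Result(0.25, 16), Result(0.9, 30)]))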
nominatim/api/search/legacy_tokenizer.py
index 3346584ccd1b35b4e74e4725ee079cb54e45a905..26e4c126b626038f35b8fc97d447d98468d91372 100644 (file)
@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
         return query
 
 
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form.
+
+            This only removes case, so some differences from the
+            normalization applied to the query phrases remain.
+        """
+        return text.lower()
+
+
     def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                             Dict[str, List[qmod.TokenRange]]]:
         """ Transliterate the phrases and split them into tokens.
nominatim/api/search/query_analyzer_factory.py
index 35649d0ffe4cb544daf5a07a0df17ebbfe159d81..bbc1eb6b1d787c483fc8086912279afda0a53b1a 100644 (file)
@@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC):
         """
 
 
+    @abstractmethod
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form. That is the
+            standardized form search will work with. All information removed
+            at this stage is inevitably lost.
+        """
+
+
+
 async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
     """ Create a query analyzer for the tokenizer used by the database.
     """