Spaces:

AdhyaSuman
/

DTECT

Build error

AdhyaSuman committed on Jun 30, 2025

Commit

3ff89ee

verified ·

1 Parent(s): 010c288

Update backend/inference/doc_retriever.py

Files changed (1) hide show

backend/inference/doc_retriever.py CHANGED Viewed

@@ -147,17 +147,19 @@ def highlight_words(text, query_words, highlight_color="#24F31D", lemma_to_forms
     # Expand query words to include all surface forms
     expanded_forms = set()
     for lemma in query_words:
         if lemma_to_forms and lemma in lemma_to_forms:
             expanded_forms.update(lemma_to_forms[lemma])
-        else:
-            expanded_forms.add(lemma)  # Fallback if map is missing
     # Sort longest first to avoid partial overlaps (e.g., "running" before "run")
-    sorted_queries = sorted(expanded_forms, key=lambda w: -len(w))
     for word in sorted_queries:
-        # Match full word, case insensitive
-        pattern = re.compile(rf'\b({re.escape(word)})\b', flags=re.IGNORECASE)
         def replacer(match):
             matched_text = match.group(1)

     # Expand query words to include all surface forms
     expanded_forms = set()
     for lemma in query_words:
+        # Also handle bigrams passed directly
+        expanded_forms.add(lemma)
         if lemma_to_forms and lemma in lemma_to_forms:
             expanded_forms.update(lemma_to_forms[lemma])
     # Sort longest first to avoid partial overlaps (e.g., "running" before "run")
+    sorted_queries = sorted(list(expanded_forms), key=lambda w: -len(w))
     for word in sorted_queries:
+        # Prepare the word for regex: replace underscores with spaces for bigrams
+        search_term = word.replace('_', ' ')
+        # Match full word/phrase, case insensitive
+        pattern = re.compile(rf'\b({re.escape(search_term)})\b', flags=re.IGNORECASE)
         def replacer(match):
             matched_text = match.group(1)