Spaces:
Running
Running
Update backend/inference/doc_retriever.py
Browse files
backend/inference/doc_retriever.py
CHANGED
|
@@ -147,17 +147,19 @@ def highlight_words(text, query_words, highlight_color="#24F31D", lemma_to_forms
|
|
| 147 |
# Expand query words to include all surface forms
|
| 148 |
expanded_forms = set()
|
| 149 |
for lemma in query_words:
|
|
|
|
|
|
|
| 150 |
if lemma_to_forms and lemma in lemma_to_forms:
|
| 151 |
expanded_forms.update(lemma_to_forms[lemma])
|
| 152 |
-
else:
|
| 153 |
-
expanded_forms.add(lemma) # Fallback if map is missing
|
| 154 |
|
| 155 |
# Sort longest-first so longer forms are matched before their substrings (e.g., "running" before "run")
|
| 156 |
-
sorted_queries = sorted(expanded_forms, key=lambda w: -len(w))
|
| 157 |
|
| 158 |
for word in sorted_queries:
|
| 159 |
-
#
|
| 160 |
-
|
|
|
|
|
|
|
| 161 |
|
| 162 |
def replacer(match):
|
| 163 |
matched_text = match.group(1)
|
|
|
|
| 147 |
# Expand query words to include all surface forms
|
| 148 |
expanded_forms = set()
|
| 149 |
for lemma in query_words:
|
| 150 |
+
# Also handle bigrams passed directly
|
| 151 |
+
expanded_forms.add(lemma)
|
| 152 |
if lemma_to_forms and lemma in lemma_to_forms:
|
| 153 |
expanded_forms.update(lemma_to_forms[lemma])
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Sort longest-first so longer forms are matched before their substrings (e.g., "running" before "run")
|
| 156 |
+
sorted_queries = sorted(list(expanded_forms), key=lambda w: -len(w))
|
| 157 |
|
| 158 |
for word in sorted_queries:
|
| 159 |
+
# Prepare the word for regex: replace underscores with spaces for bigrams
|
| 160 |
+
search_term = word.replace('_', ' ')
|
| 161 |
+
# Match full word/phrase, case insensitive
|
| 162 |
+
pattern = re.compile(rf'\b({re.escape(search_term)})\b', flags=re.IGNORECASE)
|
| 163 |
|
| 164 |
def replacer(match):
|
| 165 |
matched_text = match.group(1)
|