AdhyaSuman committed on
Commit
3ff89ee
·
verified ·
1 Parent(s): 010c288

Update backend/inference/doc_retriever.py

Browse files
Files changed (1) hide show
  1. backend/inference/doc_retriever.py +7 -5
backend/inference/doc_retriever.py CHANGED
@@ -147,17 +147,19 @@ def highlight_words(text, query_words, highlight_color="#24F31D", lemma_to_forms
147
  # Expand query words to include all surface forms
148
  expanded_forms = set()
149
  for lemma in query_words:
 
 
150
  if lemma_to_forms and lemma in lemma_to_forms:
151
  expanded_forms.update(lemma_to_forms[lemma])
152
- else:
153
- expanded_forms.add(lemma) # Fallback if map is missing
154
 
155
  # Sort longest first to avoid partial overlaps (e.g., "running" before "run")
156
- sorted_queries = sorted(expanded_forms, key=lambda w: -len(w))
157
 
158
  for word in sorted_queries:
159
- # Match full word, case insensitive
160
- pattern = re.compile(rf'\b({re.escape(word)})\b', flags=re.IGNORECASE)
 
 
161
 
162
  def replacer(match):
163
  matched_text = match.group(1)
 
147
  # Expand query words to include all surface forms
148
  expanded_forms = set()
149
  for lemma in query_words:
150
+ # Also handle bigrams passed directly
151
+ expanded_forms.add(lemma)
152
  if lemma_to_forms and lemma in lemma_to_forms:
153
  expanded_forms.update(lemma_to_forms[lemma])
 
 
154
 
155
  # Sort longest first to avoid partial overlaps (e.g., "running" before "run")
156
+ sorted_queries = sorted(list(expanded_forms), key=lambda w: -len(w))
157
 
158
  for word in sorted_queries:
159
+ # Prepare the word for regex: replace underscores with spaces for bigrams
160
+ search_term = word.replace('_', ' ')
161
+ # Match full word/phrase, case insensitive
162
+ pattern = re.compile(rf'\b({re.escape(search_term)})\b', flags=re.IGNORECASE)
163
 
164
  def replacer(match):
165
  matched_text = match.group(1)