Spaces:
Running
Running
| # searcher/highlighter.py | |
class Highlighter:
    """Build short, query-highlighted previews of chunks for search results.

    Why highlight?
      - The full chunk may be 500 words; the user needs a short preview.
      - Highlighting query terms helps users quickly judge relevance.
      - This is purely display logic; it does not affect ranking.

    Matching is case-insensitive and ignores punctuation at the edges of a
    word, and the same rule is used both for choosing the preview window and
    for highlighting, so every highlighted word also counted towards the
    window score.
    """

    # Punctuation ignored at word edges when comparing against query terms.
    _EDGE_PUNCTUATION = ".,;:!?"

    def __init__(self, preview_words: int = 30):
        """Create a highlighter.

        Args:
            preview_words: Number of whitespace-separated words in a preview.

        Raises:
            ValueError: If ``preview_words`` is smaller than 1.
        """
        if preview_words < 1:
            raise ValueError("preview_words must be at least 1")
        self.preview_words = preview_words

    @classmethod
    def _normalize(cls, word: str) -> str:
        """Return ``word`` lower-cased with edge punctuation removed."""
        return word.lower().strip(cls._EDGE_PUNCTUATION)

    @classmethod
    def _query_terms(cls, query: str) -> set[str]:
        """Return the set of normalized, non-empty terms in ``query``."""
        terms = {cls._normalize(term) for term in query.split()}
        # A token made only of punctuation normalizes to "" and must not
        # match punctuation-only words in the text.
        terms.discard("")
        return terms

    def extract_preview(self, chunk_text: str, query: str) -> str:
        """Return the ``preview_words``-word window richest in query terms.

        The chunk is split on whitespace and every window of
        ``preview_words`` consecutive words is scored by how many of its
        words match a query term. The earliest window with the highest score
        wins. The result is re-joined with single spaces and gets a leading
        and/or trailing ``"..."`` where text was cut off.

        Args:
            chunk_text: Full text of the chunk.
            query: Original user query.

        Returns:
            ``chunk_text`` unchanged when it has at most ``preview_words``
            words, otherwise the best preview snippet.
        """
        size = self.preview_words
        words = chunk_text.split()
        if len(words) <= size:
            return chunk_text

        terms = self._query_terms(query)
        hits = [self._normalize(word) in terms for word in words]

        # Slide the window one word at a time, updating the score with the
        # word that enters and the word that leaves instead of re-counting.
        score = sum(hits[:size])
        best_score = score
        best_start = 0
        for start in range(1, len(words) - size + 1):
            score += hits[start + size - 1] - hits[start - 1]
            if score > best_score:  # strict: keep the earliest best window
                best_score = score
                best_start = start

        best_end = best_start + size
        snippet = " ".join(words[best_start:best_end])

        # Add ellipsis on each side that was truncated.
        if best_start > 0:
            snippet = "..." + snippet
        if best_end < len(words):
            snippet = snippet + "..."
        return snippet

    def highlight_html(self, text: str, query: str) -> str:
        """Return ``text`` as HTML with query-matching words in ``<mark>``.

        Every word is HTML-escaped before being emitted, so markup contained
        in the text is displayed literally instead of being interpreted by
        the browser; the only tags in the result are the ``<mark>`` wrappers
        added here. Words are re-joined with single spaces.

        Args:
            text: Preview snippet (plain text).
            query: Original user query.

        Returns:
            HTML string with ``<mark>`` tags around matching words.
        """
        import html  # local import: the file's import block is not visible here

        terms = self._query_terms(query)
        rendered = []
        for word in text.split():
            safe_word = html.escape(word)
            if self._normalize(word) in terms:
                rendered.append(f"<mark>{safe_word}</mark>")
            else:
                rendered.append(safe_word)
        return " ".join(rendered)

    def annotate(self, results: list[dict], query: str) -> list[dict]:
        """Add ``preview`` and ``preview_html`` to each result dict.

        The dicts are modified in place and the same list object is returned.

        Args:
            results: List of chunk dicts; each must have a ``"chunk_text"``
                key.
            query: Original user query.

        Returns:
            The same ``results`` list with ``'preview'`` and
            ``'preview_html'`` added to every dict.

        Raises:
            KeyError: If a result has no ``"chunk_text"`` key.
        """
        for result in results:
            preview = self.extract_preview(result["chunk_text"], query)
            result["preview"] = preview
            result["preview_html"] = self.highlight_html(preview, query)
        return results