Spaces:
Sleeping
Sleeping
"""
GCAS Search Engine — Query Processor
======================================
The single entry-point that orchestrates the full pre-search pipeline:

    raw query (English / Hindi / Gujarati, possibly with ASR errors)
        │
        ▼  1. normalizer.process_query()
        │       • detect language
        │       • transliterate Indic script → Latin
        │       • resolve city/university/ASR aliases
        │       • expand abbreviations
        │
        ▼  2. fuzzy_matcher.analyze_entities()
        │       • find entity spans (college, district, program …)
        │       • multi-layer matching: exact → fuzzy → phonetic
        │
        ▼  3. fuzzy_matcher.rewrite_query_with_entities()
        │       • replace near-miss spans with canonical entity names
        │       • produce final embedding-ready query string
        │
        ▼  4. (after search) fuzzy_matcher.assess_confidence()
                • combine FAISS score + entity match quality
                • produce confidence_level + did_you_mean suggestions
"""
| from __future__ import annotations | |
| import logging | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, List, Optional | |
| from normalizer import process_query as normalize_query | |
| from fuzzy_matcher import ( | |
| EntityMatch, | |
| analyze_entities, | |
| assess_confidence, | |
| generate_did_you_mean, | |
| resolve_college_in_query, | |
| rewrite_query_with_entities, | |
| ) | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Pre-search result | |
| # --------------------------------------------------------------------------- | |
@dataclass
class ProcessedQuery:
    """Everything the search engine needs, derived from the raw query.

    Declared as a dataclass so that it can be constructed with keyword
    arguments and so that the ``field(default_factory=...)`` defaults below
    produce a fresh container per instance.
    """

    # The cleaned, entity-corrected query string to pass to the embedder.
    final_query: str

    # Human-readable metadata about what was transformed.
    original_query: str
    detected_language: str  # "en" | "hi" | "gu" | "unknown"
    corrected_query: str  # after alias/abbreviation resolution

    # Entity matches found in the corrected query (empty when none matched).
    detected_entities: List[EntityMatch] = field(default_factory=list)

    # Metadata reported by the normaliser; per the original author this holds
    # per-step intermediate strings (useful for debugging).
    normalization_metadata: Dict[str, Any] = field(default_factory=dict)
| # --------------------------------------------------------------------------- | |
| # Post-search result enrichment | |
| # --------------------------------------------------------------------------- | |
@dataclass
class SearchAnalysis:
    """Confidence + "did you mean" analysis produced after FAISS returns.

    Declared as a dataclass so that it can be constructed with keyword
    arguments; every field is required.
    """

    confidence_level: str  # "high" | "medium" | "low"
    adjusted_score: float  # 0-1 score after entity boost
    did_you_mean: List[str]  # human-readable suggestions

    # One dict per corrected entity, with the keys built by analyze_results():
    # original_span, corrected_to, entity_type, match_score, method.
    # Values are typed as Any because match_score is copied from
    # EntityMatch.score and is not guaranteed to be a string.
    entity_corrections: List[Dict[str, Any]]
| # --------------------------------------------------------------------------- | |
| # Pre-search pipeline | |
| # --------------------------------------------------------------------------- | |
def process(raw_query: str) -> ProcessedQuery:
    """
    Normalise a raw user query and correct the entities it mentions.

    Parameters
    ----------
    raw_query : str
        Raw input from the user (voice transcript or chat text).

    Returns
    -------
    ProcessedQuery
        Carries ``final_query`` (the string intended for the embedding
        model) together with the original query, the detected language,
        the corrected query, the entity matches and the normaliser metadata.
    """
    # Stage 1: language normalisation. The normaliser hands back the
    # corrected text, the detected language and its own metadata.
    normalized, language, normalization_info = normalize_query(raw_query)

    # Stage 1b: college keyword resolution, e.g. "MN" -> "M. N. COLLEGE,
    # VISNAGAR". This runs before geographic matching so that the college
    # name is already in place when districts/talukas are analysed.
    normalized, college_matches = resolve_college_in_query(normalized)

    # Stage 2: geographic entity detection on the normalised text; college
    # matches are listed first, followed by the geographic ones.
    matches = college_matches + analyze_entities(normalized)
    if matches:
        summary = [(m.query_span, m.matched, m.score) for m in matches]
        logger.debug("Detected %d entity matches: %s", len(matches), summary)

    # Stage 3: substitute canonical entity names. A blank rewrite is not
    # expected, but if it happens the normalised text is used instead.
    rewritten = rewrite_query_with_entities(normalized, matches)
    embedding_query = rewritten if rewritten.strip() else normalized

    logger.info(
        "Query processed | lang=%s | raw=%r | final=%r | entities=%d",
        language,
        raw_query,
        embedding_query,
        len(matches),
    )

    return ProcessedQuery(
        final_query=embedding_query,
        original_query=raw_query,
        detected_language=language,
        corrected_query=normalized,
        detected_entities=matches,
        normalization_metadata=normalization_info,
    )
| # --------------------------------------------------------------------------- | |
| # Post-search analysis | |
| # --------------------------------------------------------------------------- | |
def analyze_results(
    processed: ProcessedQuery,
    top_results: List[Dict[str, Any]],
) -> SearchAnalysis:
    """
    Grade the quality of a finished search and summarise entity corrections.

    "Did you mean?" suggestions are generated only when the assessed
    confidence level is "low" or "medium".

    Parameters
    ----------
    processed : ProcessedQuery
        Output of the pre-search step.
    top_results : list of dict
        Result dicts from the search engine; each result's ``"score"`` entry
        is read, and a result without one counts as 0.0.

    Returns
    -------
    SearchAnalysis
    """
    entities = processed.detected_entities

    # Best raw score across the results; 0.0 when there are no results.
    best_score = max(
        (hit.get("score", 0.0) for hit in top_results),
        default=0.0,
    )

    confidence_level, adjusted_score = assess_confidence(
        top_faiss_score=best_score,
        entities=entities,
    )

    suggestions: List[str]
    if confidence_level in ("low", "medium"):
        suggestions = generate_did_you_mean(
            query=processed.final_query,
            entities=entities,
            top_faiss_score=best_score,
            top_results=top_results,
        )
    else:
        suggestions = []

    # Report only entities whose matched name differs from the text the user
    # typed (case-insensitively); identical spans are not real corrections.
    corrections = []
    for match in entities:
        if match.query_span.lower() == match.matched.lower():
            continue
        corrections.append(
            {
                "original_span": match.query_span,
                "corrected_to": match.matched,
                "entity_type": match.entity_type,
                "match_score": match.score,
                "method": match.method,
            }
        )

    return SearchAnalysis(
        confidence_level=confidence_level,
        adjusted_score=adjusted_score,
        did_you_mean=suggestions,
        entity_corrections=corrections,
    )