""" GCAS Search Engine – Query Processor ====================================== The single entry-point that orchestrates the full pre-search pipeline: raw query (English / Hindi / Gujarati, possibly with ASR errors) │ ▼ 1. normalizer.process_query() │ • detect language │ • transliterate Indic script → Latin │ • resolve city/university/ASR aliases │ • expand abbreviations │ ▼ 2. fuzzy_matcher.analyze_entities() │ • find entity spans (college, district, program …) │ • multi-layer matching: exact → fuzzy → phonetic │ ▼ 3. fuzzy_matcher.rewrite_query_with_entities() │ • replace near-miss spans with canonical entity names │ • produce final embedding-ready query string │ ▼ 4. (after search) fuzzy_matcher.assess_confidence() • combine FAISS score + entity match quality • produce confidence_level + did_you_mean suggestions """ from __future__ import annotations import logging from dataclasses import dataclass, field from typing import Any, Dict, List, Optional from normalizer import process_query as normalize_query from fuzzy_matcher import ( EntityMatch, analyze_entities, assess_confidence, generate_did_you_mean, resolve_college_in_query, rewrite_query_with_entities, ) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Pre-search result # --------------------------------------------------------------------------- @dataclass class ProcessedQuery: """Everything the search engine needs, derived from the raw query.""" # The cleaned, entity-corrected query string to pass to the embedder final_query: str # Human-readable metadata about what was transformed original_query: str detected_language: str # "en" | "hi" | "gu" | "unknown" corrected_query: str # after alias/abbreviation resolution detected_entities: List[EntityMatch] = field(default_factory=list) # Per-step intermediate strings (useful for debugging) normalization_metadata: Dict[str, Any] = field(default_factory=dict) # 
# ---------------------------------------------------------------------------
# Post-search result enrichment
# ---------------------------------------------------------------------------

@dataclass
class SearchAnalysis:
    """Confidence + "did you mean" analysis produced after FAISS returns."""

    confidence_level: str        # "high" | "medium" | "low"
    adjusted_score: float        # 0-1 score after entity boost
    did_you_mean: List[str]      # human-readable suggestions
    # One dict per entity that was actually corrected, with the keys
    # original_span, corrected_to, entity_type, match_score and method.
    # Values are copied straight from EntityMatch attributes and need not be
    # strings (e.g. match_score), hence Dict[str, Any].
    entity_corrections: List[Dict[str, Any]]


# ---------------------------------------------------------------------------
# Pre-search pipeline
# ---------------------------------------------------------------------------

def process(raw_query: str) -> ProcessedQuery:
    """
    Run the full pre-search normalisation and entity correction pipeline.

    Parameters
    ----------
    raw_query : str
        Raw input from the user (voice transcript or chat text).

    Returns
    -------
    ProcessedQuery
        Contains `final_query`, ready for the embedding model, together with
        the original query, detected language, corrected query, detected
        entities and the normaliser's metadata.
    """
    # Step 1 – Language normalisation
    corrected, lang, norm_meta = normalize_query(raw_query)

    # Step 1b – College keyword resolution
    # Checks CollegeNameSearchKeyword1-4 data: "MN" → "M. N. COLLEGE, VISNAGAR"
    # Must run before geographic entity matching so college name is in place.
    corrected, college_entities = resolve_college_in_query(corrected)

    # Step 2 – Entity detection on the normalised query (districts/talukas only)
    geo_entities = analyze_entities(corrected)
    entities = college_entities + geo_entities

    # Only build the per-entity summary when DEBUG output is really emitted.
    if entities and logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "Detected %d entity matches: %s",
            len(entities),
            [(e.query_span, e.matched, e.score) for e in entities],
        )

    # Step 3 – Rewrite query with canonical entity names
    final = rewrite_query_with_entities(corrected, entities)

    # If rewriting produced nothing usable (None, empty or whitespace-only;
    # shouldn't happen), fall back to the corrected query.
    if not final or not final.strip():
        final = corrected

    logger.info(
        "Query processed | lang=%s | raw=%r | final=%r | entities=%d",
        lang,
        raw_query,
        final,
        len(entities),
    )

    return ProcessedQuery(
        final_query=final,
        original_query=raw_query,
        detected_language=lang,
        corrected_query=corrected,
        detected_entities=entities,
        normalization_metadata=norm_meta,
    )


# ---------------------------------------------------------------------------
# Post-search analysis
# ---------------------------------------------------------------------------

def analyze_results(
    processed: ProcessedQuery,
    top_results: List[Dict[str, Any]],
) -> SearchAnalysis:
    """
    After FAISS + LLM reranking, assess result quality and generate
    "did you mean?" suggestions if confidence is low.

    Parameters
    ----------
    processed : ProcessedQuery
        Result of the pre-search step.
    top_results : list of dict
        Result dicts from search_engine.search().  Each dict's "score" entry
        is read; a missing or None score counts as 0.0.

    Returns
    -------
    SearchAnalysis
        Confidence level, adjusted score, suggestions (only populated for
        "low" / "medium" confidence) and the list of entity corrections.
    """
    # Best score across the results.  A result without a usable score (key
    # absent or explicitly None) contributes 0.0 so max() never compares
    # None with a number; an empty result list also yields 0.0.
    top_score = max(
        (
            0.0 if score is None else score
            for score in (result.get("score") for result in top_results)
        ),
        default=0.0,
    )

    confidence_level, adjusted_score = assess_confidence(
        top_faiss_score=top_score,
        entities=processed.detected_entities,
    )

    did_you_mean: List[str] = []
    if confidence_level in ("low", "medium"):
        did_you_mean = generate_did_you_mean(
            query=processed.final_query,
            entities=processed.detected_entities,
            top_faiss_score=top_score,
            top_results=top_results,
        )

    # Summarise entity corrections for the API response
    entity_corrections = [
        {
            "original_span": e.query_span,
            "corrected_to": e.matched,
            "entity_type": e.entity_type,
            "match_score": e.score,
            "method": e.method,
        }
        for e in processed.detected_entities
        if e.query_span.lower() != e.matched.lower()  # only show actual corrections
    ]

    return SearchAnalysis(
        confidence_level=confidence_level,
        adjusted_score=adjusted_score,
        did_you_mean=did_you_mean,
        entity_corrections=entity_corrections,
    )