gsearch-api / query_processor.py
tanmay-bm's picture
feat: college keyword lookup, smart result count, field filtering, context window optimisation
b26d5a9
Raw
History Blame Contribute Delete
6.56 kB
"""
GCAS Search Engine – Query Processor
======================================
The single entry-point that orchestrates the full pre-search pipeline:

    raw query  (English / Hindi / Gujarati, possibly with ASR errors)
        │
        ▼  1. normalizer.process_query()
        │       • detect language
        │       • transliterate Indic script → Latin
        │       • resolve city/university/ASR aliases
        │       • expand abbreviations
        │
        ▼  1b. fuzzy_matcher.resolve_college_in_query()
        │       • resolve college keywords to canonical college names
        │
        ▼  2. fuzzy_matcher.analyze_entities()
        │       • find entity spans (college, district, program …)
        │       • multi-layer matching: exact → fuzzy → phonetic
        │
        ▼  3. fuzzy_matcher.rewrite_query_with_entities()
        │       • replace near-miss spans with canonical entity names
        │       • produce final embedding-ready query string
        │
        ▼  4. (after search) fuzzy_matcher.assess_confidence()
                • combine FAISS score + entity match quality
                • produce confidence_level + did_you_mean suggestions
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from normalizer import process_query as normalize_query
from fuzzy_matcher import (
EntityMatch,
analyze_entities,
assess_confidence,
generate_did_you_mean,
resolve_college_in_query,
rewrite_query_with_entities,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Pre-search result
# ---------------------------------------------------------------------------
@dataclass
class ProcessedQuery:
    """
    Output of the pre-search pipeline for a single raw query.

    Attributes
    ----------
    final_query : cleaned, entity-corrected string handed to the embedder
    original_query : the raw query exactly as it was received
    detected_language : language code – "en" | "hi" | "gu" | "unknown"
    corrected_query : query text after alias/abbreviation resolution
    detected_entities : entity matches found in the query (empty by default)
    normalization_metadata : per-step intermediate values kept for debugging
        (empty by default)
    """

    # Required fields – positional order is part of the constructor interface.
    final_query: str
    original_query: str
    detected_language: str
    corrected_query: str

    # Optional fields – each instance receives its own fresh container.
    detected_entities: List[EntityMatch] = field(default_factory=list)
    normalization_metadata: Dict[str, Any] = field(default_factory=dict)
# ---------------------------------------------------------------------------
# Post-search result enrichment
# ---------------------------------------------------------------------------
@dataclass
class SearchAnalysis:
    """
    Confidence + "did you mean" analysis produced after the search returns.

    Attributes
    ----------
    confidence_level : "high" | "medium" | "low"
    adjusted_score : 0-1 score after entity boost
    did_you_mean : human-readable suggestions (may be empty)
    entity_corrections : one dict per entity whose matched name differs from
        the span typed in the query. As built by `analyze_results()` the keys
        are: original_span, corrected_to, entity_type, match_score, method.
    """

    confidence_level: str
    adjusted_score: float
    did_you_mean: List[str]
    # Values are copied straight from EntityMatch attributes, so they are not
    # all strings (match_score is presumably numeric – confirm against
    # fuzzy_matcher.EntityMatch); hence Dict[str, Any] rather than Dict[str, str].
    entity_corrections: List[Dict[str, Any]]
# ---------------------------------------------------------------------------
# Pre-search pipeline
# ---------------------------------------------------------------------------
def process(raw_query: str) -> ProcessedQuery:
    """
    Normalise a raw user query and correct its entities ahead of search.

    Stages run in a fixed order: language normalisation, college keyword
    resolution, entity detection on the resolved text, and finally a rewrite
    of the query using every detected entity.

    Parameters
    ----------
    raw_query : str – raw input from the user (voice transcript or chat text)

    Returns
    -------
    ProcessedQuery – `final_query` is the string meant for the embedding
        model; the other fields record how it was derived.
    """
    # Stage 1 – language normalisation
    normalized, language, metadata = normalize_query(raw_query)

    # Stage 1b – college keyword resolution
    # Checks CollegeNameSearchKeyword1-4 data: "MN" → "M. N. COLLEGE, VISNAGAR"
    # Runs before geographic matching so the college name is already in place.
    resolved, college_hits = resolve_college_in_query(normalized)

    # Stage 2 – entity detection on the resolved text (districts/talukas only)
    geo_hits = analyze_entities(resolved)
    all_hits = college_hits + geo_hits
    if all_hits:
        summary = [(hit.query_span, hit.matched, hit.score) for hit in all_hits]
        logger.debug("Detected %d entity matches: %s", len(all_hits), summary)

    # Stage 3 – rewrite the query with canonical entity names
    rewritten = rewrite_query_with_entities(resolved, all_hits)
    # A blank rewrite is not expected; if it happens keep the resolved text.
    embed_ready = rewritten if rewritten.strip() else resolved

    logger.info(
        "Query processed | lang=%s | raw=%r | final=%r | entities=%d",
        language,
        raw_query,
        embed_ready,
        len(all_hits),
    )

    return ProcessedQuery(
        original_query=raw_query,
        detected_language=language,
        corrected_query=resolved,
        final_query=embed_ready,
        detected_entities=all_hits,
        normalization_metadata=metadata,
    )
# ---------------------------------------------------------------------------
# Post-search analysis
# ---------------------------------------------------------------------------
def analyze_results(
    processed: ProcessedQuery,
    top_results: List[Dict[str, Any]],
) -> SearchAnalysis:
    """
    Grade the quality of a finished search and list the entity corrections.

    Meant to be called once FAISS + LLM reranking has produced results.
    "Did you mean?" suggestions are generated only when the confidence level
    comes back as "low" or "medium".

    Parameters
    ----------
    processed : ProcessedQuery from the pre-search step
    top_results : list of result dicts from search_engine.search(); each
        dict's "score" entry is read, with 0.0 used when it is absent

    Returns
    -------
    SearchAnalysis
    """
    # Best score across the results; an empty result list yields 0.0.
    best_score = max(
        (hit.get("score", 0.0) for hit in top_results),
        default=0.0,
    )
    matches = processed.detected_entities

    level, boosted = assess_confidence(
        top_faiss_score=best_score,
        entities=matches,
    )

    suggestions: List[str]
    if level in ("low", "medium"):
        suggestions = generate_did_you_mean(
            query=processed.final_query,
            entities=matches,
            top_faiss_score=best_score,
            top_results=top_results,
        )
    else:
        suggestions = []

    # Summarise entity corrections for the API response.
    corrections = []
    for match in matches:
        # A case-only difference is not an actual correction; skip it.
        if match.query_span.lower() == match.matched.lower():
            continue
        corrections.append(
            {
                "original_span": match.query_span,
                "corrected_to": match.matched,
                "entity_type": match.entity_type,
                "match_score": match.score,
                "method": match.method,
            }
        )

    return SearchAnalysis(
        confidence_level=level,
        adjusted_score=boosted,
        did_you_mean=suggestions,
        entity_corrections=corrections,
    )