Spaces:

tanmay-bm
/

gsearch-api

Sleeping

App Files Files Community

gsearch-api / query_processor.py

tanmay-bm

feat: college keyword lookup, smart result count, field filtering, context window optimisation

b26d5a9 3 months ago

Raw

History Blame Contribute Delete

6.56 kB

	"""
	GCAS Search Engine – Query Processor
	======================================

	The single entry-point that orchestrates the full pre-search pipeline:

	raw query (English / Hindi / Gujarati, possibly with ASR errors)
	│
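	▼ 1. normalizer.process_query()
	│ • detect language
	│ • transliterate Indic script → Latin
	│ • resolve city/university/ASR aliases
	│ • expand abbreviations
	│
	▼ 1b. fuzzy_matcher.resolve_college_in_query()
	│ • resolve college keywords to canonical college names
	│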
	▼ 2. fuzzy_matcher.analyze_entities()
	│ • find entity spans (college, district, program …)
	│ • multi-layer matching: exact → fuzzy → phonetic
	│
	▼ 3. fuzzy_matcher.rewrite_query_with_entities()
	│ • replace near-miss spans with canonical entity names
	│ • produce final embedding-ready query string
	│
	▼ 4. (after search) fuzzy_matcher.assess_confidence()
	• combine FAISS score + entity match quality
	• produce confidence_level + did_you_mean suggestions
	"""
	from __future__ import annotations

	import logging
	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Optional

	from normalizer import process_query as normalize_query
	from fuzzy_matcher import (
	EntityMatch,
	analyze_entities,
	assess_confidence,
	generate_did_you_mean,
	resolve_college_in_query,
	rewrite_query_with_entities,
	)

	logger = logging.getLogger(__name__)


	# ---------------------------------------------------------------------------
	# Pre-search result
	# ---------------------------------------------------------------------------

	@dataclass
	class ProcessedQuery:
	    """Output of the pre-search pipeline, consumed by the search engine.

	    Bundles the embedding-ready query with the intermediate values that
	    produced it, so callers can inspect what was transformed.
	    """

	    # Cleaned, entity-corrected query string handed to the embedder.
	    final_query: str

	    # --- Human-readable metadata about the transformation ---------------
	    # Query text as it was received.
	    original_query: str
	    # Detected language code: "en" | "hi" | "gu" | "unknown".
	    detected_language: str
	    # Intermediate query after alias/abbreviation resolution.
	    corrected_query: str
	    # Entity matches found in the query; empty list when none were found.
	    detected_entities: List[EntityMatch] = field(default_factory=list)

	    # Per-step intermediate data from normalisation (useful for debugging).
	    normalization_metadata: Dict[str, Any] = field(default_factory=dict)


	# ---------------------------------------------------------------------------
	# Post-search result enrichment
	# ---------------------------------------------------------------------------

	@dataclass
	class SearchAnalysis:
	    """Confidence + "did you mean" analysis produced after FAISS returns."""

	    # Confidence bucket: "high" | "medium" | "low".
	    confidence_level: str
	    # 0-1 score after the entity boost.
	    adjusted_score: float
	    # Human-readable alternative suggestions.
	    did_you_mean: List[str]
	    # One dict per corrected entity, with the keys original_span,
	    # corrected_to, entity_type, match_score and method. match_score holds
	    # the entity's score rather than a string, so the values are typed Any.
	    entity_corrections: List[Dict[str, Any]]


	# ---------------------------------------------------------------------------
	# Pre-search pipeline
	# ---------------------------------------------------------------------------

	def process(raw_query: str) -> ProcessedQuery:
	    """
	    Run the full pre-search normalisation and entity correction pipeline.

	    Parameters
	    ----------
	    raw_query : str – raw input from user (voice transcript or chat text)

	    Returns
	    -------
	    ProcessedQuery – contains `final_query` ready for the embedding model
	    """
	    # Step 1 – Language normalisation
	    corrected, lang, norm_meta = normalize_query(raw_query)

	    # Step 1b – College keyword resolution
	    # Checks CollegeNameSearchKeyword1-4 data: "MN" → "M. N. COLLEGE, VISNAGAR"
	    # Must run before geographic entity matching so college name is in place.
	    corrected, college_entities = resolve_college_in_query(corrected)

	    # Step 2 – Entity detection on the normalised query (districts/talukas only)
	    geo_entities = analyze_entities(corrected)
	    entities = college_entities + geo_entities

	    if entities:
	        logger.debug(
	            "Detected %d entity matches: %s",
	            len(entities),
	            [(e.query_span, e.matched, e.score) for e in entities],
	        )

	    # Step 3 – Rewrite query with canonical entity names
	    final = rewrite_query_with_entities(corrected, entities)

	    # If rewriting produced an empty string (shouldn't happen), fall back
	    # to the normalised query so the embedder never receives blank input.
	    if not final.strip():
	        final = corrected

	    # Plain "|" separators: "\|" is an invalid escape sequence in a normal
	    # string literal and would also put literal backslashes in the log line.
	    logger.info(
	        "Query processed | lang=%s | raw=%r | final=%r | entities=%d",
	        lang, raw_query, final, len(entities),
	    )

	    return ProcessedQuery(
	        final_query=final,
	        original_query=raw_query,
	        detected_language=lang,
	        corrected_query=corrected,
	        detected_entities=entities,
	        normalization_metadata=norm_meta,
	    )


	# ---------------------------------------------------------------------------
	# Post-search analysis
	# ---------------------------------------------------------------------------

	def analyze_results(
	    processed: ProcessedQuery,
	    top_results: List[Dict[str, Any]],
	) -> SearchAnalysis:
	    """
	    Assess the quality of the search results and build the post-search
	    analysis, including "did you mean?" suggestions when confidence is
	    "low" or "medium".

	    Parameters
	    ----------
	    processed : ProcessedQuery from the pre-search step
	    top_results : list of result dicts from search_engine.search();
	        each dict's "score" entry is read (0.0 when absent)

	    Returns
	    -------
	    SearchAnalysis
	    """
	    entities = processed.detected_entities

	    # Best score across all hits; 0.0 when there are no hits at all.
	    hit_scores = [hit.get("score", 0.0) for hit in top_results]
	    best_score = max(hit_scores) if hit_scores else 0.0

	    level, boosted_score = assess_confidence(
	        top_faiss_score=best_score,
	        entities=entities,
	    )

	    # Suggestions are only generated when confidence is not "high".
	    if level in ("low", "medium"):
	        suggestions: List[str] = generate_did_you_mean(
	            query=processed.final_query,
	            entities=entities,
	            top_faiss_score=best_score,
	            top_results=top_results,
	        )
	    else:
	        suggestions = []

	    # Summarise entity corrections for the API response, skipping entities
	    # whose matched name equals the query span ignoring case (no correction).
	    corrections = []
	    for ent in entities:
	        if ent.query_span.lower() == ent.matched.lower():
	            continue
	        corrections.append(
	            {
	                "original_span": ent.query_span,
	                "corrected_to": ent.matched,
	                "entity_type": ent.entity_type,
	                "match_score": ent.score,
	                "method": ent.method,
	            }
	        )

	    return SearchAnalysis(
	        confidence_level=level,
	        adjusted_score=boosted_score,
	        did_you_mean=suggestions,
	        entity_corrections=corrections,
	    )