Spaces:

vampokala
/

doc-ingestion

Running

doc-ingestion / src /core /query_processor.py

Vamshi Pokala

Merge pull request #2 from vampokala/phase-2-hybrid-retrieval

35e041a unverified about 1 month ago

3.67 kB

	import re
	from dataclasses import dataclass
	from enum import Enum
	from typing import List


	class QueryIntent(Enum):
	    """Coarse category assigned to a query by ``QueryProcessor``.

	    The string values are the serialized form of each member.
	    """

	    # Fallback category: returned when no other signal is found.
	    FACTUAL = "factual"
	    # Open-ended "why / how / explain"-style questions.
	    EXPLORATORY = "exploratory"
	    # Questions that weigh one thing against another ("vs", "between", ...).
	    COMPARATIVE = "comparative"


	@dataclass
	class ProcessedQuery:
	    """Result of running a raw query string through ``QueryProcessor``.

	    Attributes:
	        original: The query exactly as the caller supplied it.
	        normalized: Lower-cased query with non-alphanumeric characters
	            replaced by spaces and whitespace runs collapsed.
	        tokens: Words of ``normalized`` that survived stop-word and
	            single-character filtering.
	        expanded_terms: Synonyms looked up for ``tokens``.
	        intent: Category detected for the query.
	        is_complex: Heuristic flag for long or compound queries.
	    """

	    original: str
	    normalized: str
	    tokens: List[str]
	    expanded_terms: List[str]
	    intent: QueryIntent
	    is_complex: bool

	    @property
	    def all_terms(self) -> List[str]:
	        """Return tokens followed by expansions, without duplicates.

	        ``dict.fromkeys`` keeps the first occurrence of each term, so the
	        original ordering (tokens first) is preserved.
	        """
	        combined = self.tokens + self.expanded_terms
	        return list(dict.fromkeys(combined))


	_STOP_WORDS = {
	"a", "an", "the", "is", "it", "in", "on", "at", "to", "for",
	"of", "and", "or", "but", "with", "this", "that", "are", "was",
	"be", "has", "have", "do", "does", "did", "will", "would", "can",
	"could", "should", "may", "might", "shall",
	}

	_FACTUAL_SIGNALS = {"what", "who", "when", "where", "which", "how many", "define", "list"}
	_EXPLORATORY_SIGNALS = {"why", "how", "explain", "describe", "discuss", "compare", "analyze"}
	_COMPARATIVE_SIGNALS = {"vs", "versus", "compare", "difference", "between", "better"}

	_SYNONYMS: dict[str, List[str]] = {
	"use": ["utilize", "apply"],
	"build": ["construct", "create", "develop"],
	"fast": ["quick", "rapid", "efficient"],
	"error": ["bug", "issue", "fault", "exception"],
	"document": ["file", "record", "text"],
	"search": ["find", "retrieve", "query", "lookup"],
	"large": ["big", "huge", "extensive"],
	"small": ["tiny", "minimal", "compact"],
	"data": ["information", "records"],
	"model": ["system", "approach"],
	"index": ["catalog", "registry"],
	}


	class QueryProcessor:
	    """Normalize, tokenize, expand and classify free-text queries.

	    The processor holds no state; every method derives its result purely
	    from its arguments and the module-level word tables.
	    """

	    def process(self, query: str) -> ProcessedQuery:
	        """Run the full pipeline on *query*.

	        Args:
	            query: Raw query text.

	        Returns:
	            A ``ProcessedQuery`` carrying the normalized text, the filtered
	            tokens, their synonym expansions, the detected intent and the
	            complexity flag.
	        """
	        normalized = self.normalize(query)
	        tokens = self._tokenize(normalized)
	        # Whole-word test: a plain substring check would treat "for",
	        # "word" or "standard" as containing a conjunction.
	        has_conjunction = re.search(r"\b(?:and|or)\b", query.lower()) is not None

	        return ProcessedQuery(
	            original=query,
	            normalized=normalized,
	            tokens=tokens,
	            expanded_terms=self._expand(tokens),
	            intent=self._detect_intent(query),
	            is_complex=len(tokens) > 8 or has_conjunction,
	        )

	    def process_query(self, query: str) -> ProcessedQuery:
	        """Alias for :meth:`process` (Phase 2 spec naming)."""
	        return self.process(query)

	    def normalize_text(self, text: str) -> str:
	        """Alias for :meth:`normalize` (Phase 2 spec naming)."""
	        return self.normalize(text)

	    def expand_query(self, query: str) -> List[str]:
	        """Return synonym expansions for tokenized query (excludes original tokens)."""
	        return self._expand(self._tokenize(self.normalize(query)))

	    def detect_intent(self, query: str) -> QueryIntent:
	        """Return the intent category detected for *query*."""
	        return self._detect_intent(query)

	    def normalize(self, text: str) -> str:
	        """Lower-case *text* and reduce it to space-separated ASCII words.

	        Every character other than ``a-z``, ``0-9`` and whitespace becomes a
	        space, whitespace runs collapse to a single space, and the result is
	        stripped.
	        """
	        text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
	        # Strip last: punctuation at either end has just been turned into
	        # spaces, so stripping before the substitution leaves a stray blank.
	        return re.sub(r"\s+", " ", text).strip()

	    def _tokenize(self, text: str) -> List[str]:
	        """Split normalized *text*, dropping stop words and 1-char words."""
	        return [
	            word
	            for word in text.split()
	            if len(word) > 1 and word not in _STOP_WORDS
	        ]

	    def _expand(self, tokens: List[str]) -> List[str]:
	        """Collect the synonyms of every token, in token order."""
	        return [
	            synonym
	            for token in tokens
	            for synonym in _SYNONYMS.get(token, [])
	        ]

	    def _detect_intent(self, query: str) -> QueryIntent:
	        """Classify *query*; comparative cues win over exploratory ones.

	        Falls back to ``QueryIntent.FACTUAL`` when no cue is present.
	        """
	        lower = query.lower()
	        if self._mentions_signal(lower, _COMPARATIVE_SIGNALS):
	            return QueryIntent.COMPARATIVE
	        if self._mentions_signal(lower, _EXPLORATORY_SIGNALS):
	            return QueryIntent.EXPLORATORY
	        return QueryIntent.FACTUAL

	    @staticmethod
	    def _mentions_signal(text: str, signals: "set[str]") -> bool:
	        """Return True if *text* contains any of *signals* as a word.

	        Matching is anchored on word boundaries so that "vs" is not found
	        inside "devs" nor "how" inside "however", while a cue at the very
	        end of the text or next to punctuation ("why?", "a vs. b") still
	        counts. A short inflection suffix is tolerated so that
	        "differences" or "compared" keep matching.
	        """
	        if not signals:
	            # An empty alternation would match at every word boundary.
	            return False
	        # Sorted so the same pattern string is produced on every call.
	        alternatives = "|".join(re.escape(signal) for signal in sorted(signals))
	        pattern = rf"\b(?:{alternatives})(?:s|es|d|ed|ing)?\b"
	        return re.search(pattern, text) is not None