Spaces:
Running
Running
| import re | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from typing import List | |
class QueryIntent(Enum):
    """Coarse category assigned to a user query."""

    # Lookup-style question; also the fallback when no other signal matches.
    FACTUAL = "factual"
    # Open-ended "why / how / explain" style question.
    EXPLORATORY = "exploratory"
    # Question that weighs two or more things against each other.
    COMPARATIVE = "comparative"
# The @dataclass decorator is required: QueryProcessor.process() builds this
# object with keyword arguments, which needs the generated __init__.
@dataclass
class ProcessedQuery:
    """Everything derived from one raw query string.

    Attributes:
        original: The query exactly as it was passed in.
        normalized: The query after ``QueryProcessor.normalize``.
        tokens: Words of ``normalized`` kept by the tokenizer.
        expanded_terms: Synonyms looked up for ``tokens``.
        intent: Detected :class:`QueryIntent`.
        is_complex: Flag set by ``QueryProcessor.process``.
    """

    original: str
    normalized: str
    tokens: List[str]
    expanded_terms: List[str]
    intent: QueryIntent
    is_complex: bool

    # NOTE(review): kept as a plain method, as shown in the listing; confirm
    # against callers whether this was meant to be a property.
    def all_terms(self) -> List[str]:
        """Return tokens then expanded terms, de-duplicated in first-seen order."""
        # dict.fromkeys keeps insertion order, so it de-duplicates stably.
        return list(dict.fromkeys(self.tokens + self.expanded_terms))
# Words the tokenizer discards because they carry little search value.
_STOP_WORDS = set(
    "a an the is it in on at to for "
    "of and or but with this that are was "
    "be has have do does did will would can "
    "could should may might shall".split()
)

# Cue words/phrases for lookup-style questions. "how many" is a two-word
# phrase, so it is added separately from the whitespace-split single words.
_FACTUAL_SIGNALS = set("what who when where which define list".split()) | {"how many"}

# Cue words for open-ended questions.
_EXPLORATORY_SIGNALS = set("why how explain describe discuss compare analyze".split())

# Cue words for comparison questions. "compare" also appears in the
# exploratory set; intent detection checks the comparative set first.
_COMPARATIVE_SIGNALS = set("vs versus compare difference between better".split())

# Query-expansion table: token -> alternative terms, in the order they are
# appended to the expansion list.
_SYNONYMS: dict[str, List[str]] = dict(
    use=["utilize", "apply"],
    build=["construct", "create", "develop"],
    fast=["quick", "rapid", "efficient"],
    error=["bug", "issue", "fault", "exception"],
    document=["file", "record", "text"],
    search=["find", "retrieve", "query", "lookup"],
    large=["big", "huge", "extensive"],
    small=["tiny", "minimal", "compact"],
    data=["information", "records"],
    model=["system", "approach"],
    index=["catalog", "registry"],
)
class QueryProcessor:
    """Normalize, tokenize, expand and classify free-text queries.

    The processor keeps no state; every method depends only on its arguments
    and on the module-level word tables.
    """

    def process(self, query: str) -> ProcessedQuery:
        """Run the full pipeline on ``query`` and bundle the results.

        A query is flagged complex when it has more than eight tokens or
        contains "and"/"or" as a standalone word.
        """
        normalized = self.normalize(query)
        tokens = self._tokenize(normalized)
        # "and"/"or" are stop words, so look for them in the unfiltered word
        # list. Whole-word test: a substring test would also fire on "for",
        # "word", "error", "candy", ...
        words = normalized.split()
        has_conjunction = "and" in words or "or" in words
        return ProcessedQuery(
            original=query,
            normalized=normalized,
            tokens=tokens,
            expanded_terms=self._expand(tokens),
            intent=self._detect_intent(query),
            is_complex=len(tokens) > 8 or has_conjunction,
        )

    def process_query(self, query: str) -> ProcessedQuery:
        """Alias for :meth:`process` (Phase 2 spec naming)."""
        return self.process(query)

    def normalize_text(self, text: str) -> str:
        """Alias for :meth:`normalize` (Phase 2 spec naming)."""
        return self.normalize(text)

    def expand_query(self, query: str) -> List[str]:
        """Return synonym expansions for tokenized query (excludes original tokens)."""
        return self._expand(self._tokenize(self.normalize(query)))

    def detect_intent(self, query: str) -> QueryIntent:
        """Public wrapper around :meth:`_detect_intent`."""
        return self._detect_intent(query)

    def normalize(self, text: str) -> str:
        """Lowercase ``text``, replace non ``[a-z0-9]`` characters with spaces
        and collapse whitespace.

        The result never has leading or trailing whitespace.
        """
        text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
        # Strip last: punctuation at either end has just become a space.
        return re.sub(r"\s+", " ", text).strip()

    def _tokenize(self, text: str) -> List[str]:
        """Split on whitespace, dropping stop words and one-character words."""
        return [w for w in text.split() if w not in _STOP_WORDS and len(w) > 1]

    def _expand(self, tokens: List[str]) -> List[str]:
        """Return synonyms of ``tokens`` in lookup order.

        Each synonym appears once, and terms already present in ``tokens``
        are left out.
        """
        seen = set(tokens)
        extra: List[str] = []
        for token in tokens:
            for synonym in _SYNONYMS.get(token, ()):
                if synonym not in seen:
                    seen.add(synonym)
                    extra.append(synonym)
        return extra

    @staticmethod
    def _mentions(padded: str, signals: set) -> bool:
        """Return True if any signal occurs as a whole word/phrase in ``padded``.

        ``padded`` must be normalized text with one space added at each end,
        so that a signal at the very start or end still has a space on both
        sides.
        """
        return any(f" {signal} " in padded for signal in signals)

    def _detect_intent(self, query: str) -> QueryIntent:
        """Classify ``query`` by the cue words it contains.

        Precedence: comparative cues, then multi-word factual phrases (so
        "how many ..." is not captured by the exploratory cue "how"), then
        exploratory cues; anything else is factual.

        Cues are matched as whole words on the normalized query. That keeps
        "vs" from matching inside "canvas" and still finds a cue next to
        punctuation or at the end of the query. Inflected forms ("compared",
        "differences") match only if they are listed in the signal sets.
        """
        padded = f" {self.normalize(query)} "
        if self._mentions(padded, _COMPARATIVE_SIGNALS):
            return QueryIntent.COMPARATIVE
        factual_phrases = {sig for sig in _FACTUAL_SIGNALS if " " in sig}
        if self._mentions(padded, factual_phrases):
            return QueryIntent.FACTUAL
        if self._mentions(padded, _EXPLORATORY_SIGNALS):
            return QueryIntent.EXPLORATORY
        return QueryIntent.FACTUAL