File size: 3,673 Bytes
0315b16
10ea2c4
0315b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f11cb0
 
 
0315b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f11cb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0315b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
from dataclasses import dataclass
from enum import Enum
from typing import List


class QueryIntent(Enum):
    """Coarse category describing what kind of answer a query is looking for.

    Each member's value is the lowercase string form of its name.
    """

    # Also the category returned when no other signal is detected.
    FACTUAL = "factual"
    # Open-ended "why / how / explain" style questions.
    EXPLORATORY = "exploratory"
    # Questions that weigh two or more things against each other.
    COMPARATIVE = "comparative"


@dataclass
class ProcessedQuery:
    """Structured result of analysing a single query string."""

    # The query text exactly as it was supplied.
    original: str
    # Cleaned-up form of ``original`` used as the basis for tokenization.
    normalized: str
    # Terms extracted from ``normalized``.
    tokens: List[str]
    # Additional related terms derived from ``tokens``.
    expanded_terms: List[str]
    # Detected category of the query.
    intent: QueryIntent
    # Whether the query was judged to be multi-part or long.
    is_complex: bool

    @property
    def all_terms(self) -> List[str]:
        """Return tokens followed by expanded terms, without repeats.

        Order is first-seen: every distinct token comes first, then each
        expanded term that has not already appeared.
        """
        combined = self.tokens + self.expanded_terms
        already_seen = set()
        ordered: List[str] = []
        for term in combined:
            if term in already_seen:
                continue
            already_seen.add(term)
            ordered.append(term)
        return ordered


_STOP_WORDS = {
    "a", "an", "the", "is", "it", "in", "on", "at", "to", "for",
    "of", "and", "or", "but", "with", "this", "that", "are", "was",
    "be", "has", "have", "do", "does", "did", "will", "would", "can",
    "could", "should", "may", "might", "shall",
}

_FACTUAL_SIGNALS = {"what", "who", "when", "where", "which", "how many", "define", "list"}
_EXPLORATORY_SIGNALS = {"why", "how", "explain", "describe", "discuss", "compare", "analyze"}
_COMPARATIVE_SIGNALS = {"vs", "versus", "compare", "difference", "between", "better"}

# Query-expansion table: maps a query token to related terms that are added
# to the search alongside it. Lookups are by exact token, so only the key
# spellings listed here are expanded. The mapping is one-directional
# (e.g. "use" expands to "utilize", but "utilize" has no entry of its own).
_SYNONYMS: dict[str, List[str]] = {
    "use": ["utilize", "apply"],
    "build": ["construct", "create", "develop"],
    "fast": ["quick", "rapid", "efficient"],
    "error": ["bug", "issue", "fault", "exception"],
    "document": ["file", "record", "text"],
    # "query" can itself be a token of the same query, e.g. "search query".
    "search": ["find", "retrieve", "query", "lookup"],
    "large": ["big", "huge", "extensive"],
    "small": ["tiny", "minimal", "compact"],
    "data": ["information", "records"],
    "model": ["system", "approach"],
    "index": ["catalog", "registry"],
}


class QueryProcessor:
    """Normalize, tokenize, expand and classify free-text search queries.

    The processor is stateless; every method depends only on its arguments
    and on the module-level word tables.
    """

    # Compiled once at class creation; both run on every query.
    _NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
    _WHITESPACE_RE = re.compile(r"\s+")

    # Connectives whose presence marks a query as multi-part.
    _CONJUNCTIONS = frozenset({"and", "or"})
    # Queries with more content tokens than this count as complex.
    _COMPLEX_TOKEN_COUNT = 8
    # Signals at least this long also match as the start of a word, so that
    # inflected forms ("differences", "compared", "explains") still count.
    # Shorter signals ("vs", "why", "how") must match a whole word, which
    # keeps "vscode" or "however" from triggering them.
    _PREFIX_MATCH_MIN_LEN = 4

    def process(self, query: str) -> ProcessedQuery:
        """Run the full pipeline on *query*.

        Args:
            query: Raw user query text.

        Returns:
            A :class:`ProcessedQuery` holding the original text, its
            normalized form, the content tokens, synonym expansions, the
            detected intent and a complexity flag. A query is complex when
            it has more than eight content tokens or contains "and"/"or"
            as a standalone word.
        """
        normalized = self.normalize(query)
        tokens = self._tokenize(normalized)

        # "and"/"or" are stop words and never reach ``tokens``, so look for
        # them among the words of the normalized text. Comparing whole words
        # keeps "for", "error" or "standard" from flagging a query.
        has_conjunction = not self._CONJUNCTIONS.isdisjoint(normalized.split())
        is_complex = len(tokens) > self._COMPLEX_TOKEN_COUNT or has_conjunction

        return ProcessedQuery(
            original=query,
            normalized=normalized,
            tokens=tokens,
            expanded_terms=self._expand(tokens),
            intent=self._detect_intent(query),
            is_complex=is_complex,
        )

    def process_query(self, query: str) -> ProcessedQuery:
        """Alias for :meth:`process` (Phase 2 spec naming)."""
        return self.process(query)

    def normalize_text(self, text: str) -> str:
        """Alias for :meth:`normalize` (Phase 2 spec naming)."""
        return self.normalize(text)

    def expand_query(self, query: str) -> List[str]:
        """Return synonym expansions for tokenized query (excludes original tokens)."""
        return self._expand(self._tokenize(self.normalize(query)))

    def detect_intent(self, query: str) -> QueryIntent:
        """Classify *query* as comparative, exploratory or factual."""
        return self._detect_intent(query)

    def normalize(self, text: str) -> str:
        """Lower-case *text* and reduce it to space-separated ASCII words.

        Every character other than ``a-z``, ``0-9`` and whitespace is
        replaced by a space, runs of whitespace collapse to one space, and
        the result carries no leading or trailing whitespace.
        """
        lowered = text.lower()
        spaced = self._NON_ALNUM_RE.sub(" ", lowered)
        # Strip last: replacing punctuation at either end of the text
        # introduces new edge whitespace (e.g. "hello!" -> "hello ").
        return self._WHITESPACE_RE.sub(" ", spaced).strip()

    def _tokenize(self, text: str) -> List[str]:
        """Split normalized *text* into content tokens.

        Stop words and single-character words are dropped.
        """
        return [w for w in text.split() if w not in _STOP_WORDS and len(w) > 1]

    def _expand(self, tokens: List[str]) -> List[str]:
        """Return synonyms for *tokens*, in first-seen order.

        The result contains no duplicates and none of the input tokens, so
        it holds only terms that are new relative to the query itself.
        """
        seen = set(tokens)
        extra: List[str] = []
        for token in tokens:
            for synonym in _SYNONYMS.get(token, ()):
                if synonym not in seen:
                    seen.add(synonym)
                    extra.append(synonym)
        return extra

    def _detect_intent(self, query: str) -> QueryIntent:
        """Return the intent of *query* based on cue words.

        Comparative cues take priority over exploratory ones; a query with
        neither is treated as factual. Matching runs on the normalized text,
        so case and punctuation ("A vs. B", "Why?") do not matter.
        """
        # Pad with spaces so the first and last words are delimited the same
        # way as interior words.
        padded = f" {self.normalize(query)} "
        if self._has_signal(padded, _COMPARATIVE_SIGNALS):
            return QueryIntent.COMPARATIVE
        if self._has_signal(padded, _EXPLORATORY_SIGNALS):
            return QueryIntent.EXPLORATORY
        return QueryIntent.FACTUAL

    def _has_signal(self, padded: str, signals) -> bool:
        """Tell whether any of *signals* occurs in *padded* on a word boundary.

        *padded* must be normalized text wrapped in single spaces. Short
        signals must match a whole word; longer ones may also be the start
        of a word (see ``_PREFIX_MATCH_MIN_LEN``).
        """
        for signal in signals:
            if len(signal) >= self._PREFIX_MATCH_MIN_LEN:
                needle = f" {signal}"
            else:
                needle = f" {signal} "
            if needle in padded:
                return True
        return False