File size: 7,132 Bytes
85f900d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
VoiceVault — Query Preprocessor
==================================
Cleans raw Whisper transcripts and classifies query intent.

Steps:
  1. Lowercase normalization
  2. Filler word removal (um, uh, like, you know, etc.)
  3. Punctuation repair (double spaces, leading/trailing cleanup)
  4. Language detection (langdetect)
  5. Query type classification: factual | summary | compare

The query type drives retrieval strategy in Phase 4:
  - factual  → standard hybrid retrieval, concise answer (< 100 words)
  - summary  → higher final_top_k, longer answer (up to 300 words)
  - compare  → 2-step retrieval targeting multiple entities

Usage:
    preprocessor = QueryPreprocessor()
    result = preprocessor.process("Um, what is, like, machine learning?")
    # result.processed_query == "what is, machine learning?"
    # result.query_type == "factual"
    # result.language == "en"
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# ------------------------------------------------------------------ #
# Query Type Patterns                                                   #
# ------------------------------------------------------------------ #

# The three pattern lists below are consumed by
# QueryPreprocessor._classify_intent, which runs re.search over the
# lowercased query and checks them in the order compare -> summary -> factual.
# A query that matches nothing is classified as "factual".

# Fact-seeking questions. Every pattern is anchored with ^, so it only
# matches when the query *starts* with the given wording.
_FACTUAL_PATTERNS = [
    r"^what (is|are|was|were)\b",
    r"^who (is|are|was|were)\b",
    r"^when (did|does|is|was)\b",
    r"^where (is|are|was|were|did)\b",
    r"^which\b",
    r"^how (many|much|long|old|far|often)\b",
    r"^define\b",
    r"^what does .+ mean\b",
]

# Requests for an overview or explanation. Also anchored at the start of the
# query. Because summary is checked before factual, "what is the main ..."
# resolves to "summary" even though it also matches a factual pattern.
_SUMMARY_PATTERNS = [
    r"^summar(ise|ize)\b",  # British and American spelling
    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
    r"^(describe|explain|tell me about)\b",
    r"^what (is the )?(main|key|primary|overall)\b",
    r"^overview of\b",
]

# Comparison cues. These are NOT anchored: they may match anywhere in the
# query, and they are checked first, so any query containing one of these
# cues is classified as "compare" regardless of how it begins.
_COMPARE_PATTERNS = [
    r"\bcompar(e|ing)\b",
    r"\bdifferen(ce|t)\b",  # "difference" or "different"
    r"\bversus\b",
    r"\bvs\.?\b",  # "vs" with an optional trailing dot
    r"\bpros and cons\b",
    r"what (is|are) the difference",
    r"how does .+ differ from\b",
]

# Filler words and phrases stripped by QueryPreprocessor._remove_fillers
# (whole-word matches, case-insensitive). Entries containing a space are
# treated as multi-word fillers and removed before the single-word ones.
# NOTE(review): "like", "right", "so", "well" and "actually" are also ordinary
# content words; whole-word removal drops meaningful uses too (e.g. the
# "like" in "what does it look like") — confirm this trade-off is intended.
_FILLER_WORDS = {
    "um", "uh", "er", "ah", "eh", "like", "you know",
    "i mean", "basically", "literally", "actually", "right",
    "so", "well", "okay", "ok",
}


@dataclass
class PreprocessedQuery:
    """Outcome of running one raw transcript through the preprocessor.

    Attributes:
        raw_query: The transcript exactly as it was supplied by the caller.
        processed_query: The cleaned text produced from ``raw_query``.
        query_type: Intent label, one of "factual", "summary" or "compare".
        language: Language code of the query (ISO 639-1, e.g. "en").
    """

    raw_query: str
    processed_query: str
    query_type: str
    language: str


class QueryPreprocessor:
    """
    Cleans and classifies a raw Whisper transcript for retrieval.

    Everything here is plain string/regex processing. The only optional
    dependency is `langdetect`, imported lazily inside `_detect_language`;
    when it is missing or fails, the language falls back to "en".
    """

    def process(self, raw_query: str) -> PreprocessedQuery:
        """
        Apply the full preprocessing pipeline to a raw transcript.

        Pipeline: normalize -> remove fillers -> repair punctuation, then
        detect the language and classify the intent of the cleaned text.

        Args:
            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

        Returns:
            PreprocessedQuery with cleaned text, classified type, and language.
            Empty or whitespace-only input short-circuits to an empty
            processed query with the defaults "factual" / "en".
        """
        if not raw_query or not raw_query.strip():
            return PreprocessedQuery(
                raw_query=raw_query,
                processed_query="",
                query_type="factual",
                language="en",
            )

        text = self._normalize(raw_query.strip())
        text = self._remove_fillers(text)
        text = self._repair_punctuation(text)

        language = self._detect_language(text)
        query_type = self._classify_intent(text)

        return PreprocessedQuery(
            raw_query=raw_query,
            processed_query=text,
            query_type=query_type,
            language=language,
        )

    # ------------------------------------------------------------------ #
    # Normalization                                                         #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase the text and collapse every whitespace run to one space."""
        return re.sub(r"\s+", " ", text.lower()).strip()

    @staticmethod
    def _remove_fillers(text: str) -> str:
        """
        Remove spoken filler words and phrases as whole words.

        Multi-word fillers (e.g., "you know") are removed first, longest
        first, so a phrase is never broken up by a shorter match. Each
        filler is replaced by a space and the whitespace is collapsed at
        the end, so surrounding punctuation is left in place for
        `_repair_punctuation` to tidy up.
        """
        phrases = sorted(
            (f for f in _FILLER_WORDS if " " in f),
            key=len,
            reverse=True,
        )
        words = [f for f in _FILLER_WORDS if " " not in f]

        for filler in phrases:
            pattern = r"(?<!\w)" + re.escape(filler) + r"(?!\w)"
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        for filler in words:
            pattern = r"\b" + re.escape(filler) + r"\b"
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _repair_punctuation(text: str) -> str:
        """
        Clean up punctuation artifacts left behind by filler removal.

        - Collapse any run of commas (with optional whitespace between or
          before them) into a single comma: "a , , , b" -> "a, b"
        - Drop commas/whitespace dangling right before "?" or "!":
          "is this correct, ?" -> "is this correct?"
        - Strip leading and trailing commas, dots and whitespace

        No punctuation is ever added; a trailing "?" or "!" is preserved.
        """
        # A single `,\s*,` substitution only merges non-overlapping pairs, so
        # three or more commas would survive as ", ,". Match the whole run.
        text = re.sub(r"\s*,(?:\s*,)*", ",", text)
        # A filler removed just before the final mark leaves ", ?" / " ?".
        text = re.sub(r"[,\s]+(?=[?!])", "", text)
        text = re.sub(r"^[,.\s]+", "", text)
        text = re.sub(r"[,.\s]+$", "", text)
        return text.strip()

    # ------------------------------------------------------------------ #
    # Language Detection                                                    #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _detect_language(text: str) -> str:
        """
        Return the language code reported by `langdetect` for the text.

        Returns "en" without calling the detector when the text is empty or
        has fewer than three whitespace-separated tokens, and also when
        `langdetect` is unavailable or detection raises. Failures are logged
        at debug level rather than silently discarded.

        NOTE(review): langdetect documents its results as non-deterministic
        unless DetectorFactory.seed is fixed — confirm whether that matters
        for callers.
        """
        if not text or len(text.split()) < 3:
            return "en"
        try:
            from langdetect import detect
            return detect(text)
        except ImportError:
            logging.getLogger(__name__).debug(
                "langdetect is not installed; defaulting language to 'en'"
            )
        except Exception:
            # Best-effort by design: detection must never break the pipeline.
            logging.getLogger(__name__).debug(
                "Language detection failed; defaulting to 'en'", exc_info=True
            )
        return "en"

    # ------------------------------------------------------------------ #
    # Intent Classification                                                 #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _classify_intent(text: str) -> str:
        """
        Classify query into: factual | summary | compare.

        Priority: compare > summary > factual
        (compare and summary are more specific; factual is the default and
        is also returned when no pattern matches at all)
        """
        lowered = text.lower().strip()

        by_priority = (
            ("compare", _COMPARE_PATTERNS),
            ("summary", _SUMMARY_PATTERNS),
            ("factual", _FACTUAL_PATTERNS),
        )
        for label, patterns in by_priority:
            if any(re.search(pattern, lowered) for pattern in patterns):
                return label

        # Default: factual
        return "factual"