""" VoiceVault — Query Preprocessor ================================== Cleans raw Whisper transcripts and classifies query intent. Steps: 1. Lowercase normalization 2. Filler word removal (um, uh, like, you know, etc.) 3. Punctuation repair (double spaces, leading/trailing cleanup) 4. Language detection (langdetect) 5. Query type classification: factual | summary | compare The query type drives retrieval strategy in Phase 4: - factual → standard hybrid retrieval, concise answer (< 100 words) - summary → higher final_top_k, longer answer (up to 300 words) - compare → 2-step retrieval targeting multiple entities Usage: preprocessor = QueryPreprocessor() result = preprocessor.process("Um, what is, like, machine learning?") # result.processed_query == "what is machine learning" # result.query_type == "factual" # result.language == "en" """ from __future__ import annotations import logging import re from dataclasses import dataclass logger = logging.getLogger(__name__) # ------------------------------------------------------------------ # # Query Type Patterns # # ------------------------------------------------------------------ # _FACTUAL_PATTERNS = [ r"^what (is|are|was|were)\b", r"^who (is|are|was|were)\b", r"^when (did|does|is|was)\b", r"^where (is|are|was|were|did)\b", r"^which\b", r"^how (many|much|long|old|far|often)\b", r"^define\b", r"^what does .+ mean\b", ] _SUMMARY_PATTERNS = [ r"^summar(ise|ize)\b", r"^give (me )?(an? )?(overview|summary|brief|outline)\b", r"^(describe|explain|tell me about)\b", r"^what (is the )?(main|key|primary|overall)\b", r"^overview of\b", ] _COMPARE_PATTERNS = [ r"\bcompar(e|ing)\b", r"\bdifferen(ce|t)\b", r"\bversus\b", r"\bvs\.?\b", r"\bpros and cons\b", r"what (is|are) the difference", r"how does .+ differ from\b", ] # Filler words to strip (as whole words, case-insensitive) _FILLER_WORDS = { "um", "uh", "er", "ah", "eh", "like", "you know", "i mean", "basically", "literally", "actually", "right", "so", "well", "okay", "ok", } @dataclass class PreprocessedQuery: """Result of preprocessing a raw transcript.""" raw_query: str processed_query: str query_type: str # factual | summary | compare language: str # ISO 639-1 class QueryPreprocessor: """ Cleans and classifies a raw Whisper transcript for optimal retrieval. All operations are pure Python — no ML models required. Language detection uses `langdetect` (lightweight, offline). """ def process(self, raw_query: str) -> PreprocessedQuery: """ Apply the full preprocessing pipeline to a raw transcript. Args: raw_query: Raw text from Whisper (may contain fillers, casing, noise). Returns: PreprocessedQuery with cleaned text, classified type, and language. """ if not raw_query or not raw_query.strip(): return PreprocessedQuery( raw_query=raw_query, processed_query="", query_type="factual", language="en", ) text = raw_query.strip() text = self._normalize(text) text = self._remove_fillers(text) text = self._repair_punctuation(text) language = self._detect_language(text) query_type = self._classify_intent(text) return PreprocessedQuery( raw_query=raw_query, processed_query=text, query_type=query_type, language=language, ) # ------------------------------------------------------------------ # # Normalization # # ------------------------------------------------------------------ # @staticmethod def _normalize(text: str) -> str: """Lowercase and normalize whitespace.""" text = text.lower() text = re.sub(r"\s+", " ", text) return text.strip() @staticmethod def _remove_fillers(text: str) -> str: """ Remove spoken filler words and phrases as whole words. Multi-word fillers (e.g., "you know") are removed first to avoid partial matches. """ # Multi-word fillers first (longer matches take priority) multi_word = sorted( [f for f in _FILLER_WORDS if " " in f], key=len, reverse=True ) for filler in multi_word: pattern = r"(? str: """ Clean up punctuation artifacts from filler removal. - Remove leading commas, dots - Normalize repeated punctuation - Ensure sentence ends with proper punctuation if it's a question """ text = re.sub(r"^[,.\s]+", "", text) text = re.sub(r"[,.\s]+$", "", text) text = re.sub(r",\s*,", ",", text) text = text.strip() return text # ------------------------------------------------------------------ # # Language Detection # # ------------------------------------------------------------------ # @staticmethod def _detect_language(text: str) -> str: """ Detect ISO 639-1 language code. Returns "en" if langdetect is unavailable or detection fails. """ if not text or len(text.split()) < 3: return "en" try: from langdetect import detect return detect(text) except Exception: return "en" # ------------------------------------------------------------------ # # Intent Classification # # ------------------------------------------------------------------ # @staticmethod def _classify_intent(text: str) -> str: """ Classify query into: factual | summary | compare. Priority: compare > summary > factual (compare and summary are more specific; factual is the default) """ text_lower = text.lower().strip() for pattern in _COMPARE_PATTERNS: if re.search(pattern, text_lower): return "compare" for pattern in _SUMMARY_PATTERNS: if re.search(pattern, text_lower): return "summary" for pattern in _FACTUAL_PATTERNS: if re.search(pattern, text_lower): return "factual" # Default: factual return "factual"