Spaces:
Running
Running
"""
VoiceVault — Query Preprocessor
==================================
Cleans raw Whisper transcripts and classifies query intent.

Steps:
    1. Lowercase normalization
    2. Filler word removal (um, uh, like, you know, etc.)
    3. Punctuation repair (double spaces, leading/trailing cleanup)
    4. Language detection (langdetect)
    5. Query type classification: factual | summary | compare

The query type drives retrieval strategy in Phase 4:
    - factual → standard hybrid retrieval, concise answer (< 100 words)
    - summary → higher final_top_k, longer answer (up to 300 words)
    - compare → 2-step retrieval targeting multiple entities

Usage:
    preprocessor = QueryPreprocessor()
    result = preprocessor.process("Um, what is, like, machine learning?")
    # result.processed_query == "what is, machine learning?"
    # (fillers are stripped; inner commas and the trailing "?" are kept)
    # result.query_type == "factual"
    # result.language == "en"
"""
| from __future__ import annotations | |
| import logging | |
| import re | |
| from dataclasses import dataclass | |
| logger = logging.getLogger(__name__) | |
# ------------------------------------------------------------------ #
# Query Type Patterns                                                 #
# ------------------------------------------------------------------ #
# Each list holds regular-expression source strings that are applied
# with ``re.search`` to the lowercased, cleaned query. Patterns that
# begin with ``^`` only match at the start of the query; the others may
# match anywhere in it.

# Direct questions that expect a short, specific answer.
_FACTUAL_PATTERNS = [
    r"^what (is|are|was|were)\b",
    r"^who (is|are|was|were)\b",
    r"^when (did|does|is|was)\b",
    r"^where (is|are|was|were|did)\b",
    r"^which\b",
    r"^how (many|much|long|old|far|often)\b",
    r"^define\b",
    r"^what does .+ mean\b",
]

# Requests for an overview or an explanation of a topic.
_SUMMARY_PATTERNS = [
    r"^summar(ise|ize)\b",
    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
    r"^(describe|explain|tell me about)\b",
    r"^what (is the )?(main|key|primary|overall)\b",
    r"^overview of\b",
]

# Queries that contrast two or more things; matched anywhere in the text.
_COMPARE_PATTERNS = [
    r"\bcompar(e|ing)\b",
    r"\bdifferen(ce|t)\b",
    r"\bversus\b",
    r"\bvs\.?\b",
    r"\bpros and cons\b",
    r"what (is|are) the difference",
    r"how does .+ differ from\b",
]
# Filler words to strip (as whole words, case-insensitive).
# Entries containing a space are multi-word phrases.
# NOTE(review): "like", "right", "so", "well" and "actually" are also
# ordinary content words ("what does it look like"); every whole-word
# occurrence is removed regardless of context.
_FILLER_WORDS = {
    "um", "uh", "er", "ah", "eh", "like", "you know",
    "i mean", "basically", "literally", "actually", "right",
    "so", "well", "okay", "ok",
}
@dataclass
class PreprocessedQuery:
    """Result of preprocessing a raw transcript.

    The ``@dataclass`` decorator generates the keyword-accepting
    ``__init__`` (plus ``__repr__`` and ``__eq__``) used to build results.
    """

    raw_query: str        # transcript exactly as received
    processed_query: str  # cleaned text used for retrieval
    query_type: str       # factual | summary | compare
    language: str         # ISO 639-1
class QueryPreprocessor:
    """
    Cleans and classifies a raw Whisper transcript for optimal retrieval.

    All operations are pure Python — no ML models required.
    Language detection uses `langdetect` (lightweight, offline) when it can
    be imported and falls back to "en" otherwise.

    The individual pipeline steps are static methods: none of them reads
    instance state, so they can be called on the class or on an instance.
    """

    def process(self, raw_query: str) -> "PreprocessedQuery":
        """
        Apply the full preprocessing pipeline to a raw transcript.

        Steps run in this order: strip, lowercase/whitespace normalization,
        filler removal, punctuation repair, language detection, intent
        classification.

        Args:
            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

        Returns:
            PreprocessedQuery with cleaned text, classified type, and language.
            Empty or whitespace-only input short-circuits to an empty
            processed query with type "factual" and language "en".
        """
        if not raw_query or not raw_query.strip():
            return PreprocessedQuery(
                raw_query=raw_query,
                processed_query="",
                query_type="factual",
                language="en",
            )

        text = raw_query.strip()
        text = self._normalize(text)
        text = self._remove_fillers(text)
        text = self._repair_punctuation(text)

        # Both detectors see the cleaned text, not the raw transcript.
        language = self._detect_language(text)
        query_type = self._classify_intent(text)

        return PreprocessedQuery(
            raw_query=raw_query,
            processed_query=text,
            query_type=query_type,
            language=language,
        )

    # ------------------------------------------------------------------ #
    # Normalization                                                       #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase the text and collapse every whitespace run to one space."""
        text = text.lower()
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    @staticmethod
    def _remove_fillers(text: str) -> str:
        """
        Remove spoken filler words and phrases as whole words.

        Multi-word fillers (e.g., "you know") are removed first to avoid
        partial matches. Each removed filler is replaced by a space and
        whitespace is collapsed at the end; punctuation that surrounded a
        filler is left in place.

        Args:
            text: Text to clean; matching is case-insensitive.

        Returns:
            The text without fillers, stripped of outer whitespace.
        """
        # Multi-word fillers first (longer matches take priority)
        multi_word = sorted(
            (f for f in _FILLER_WORDS if " " in f),
            key=len,
            reverse=True,
        )
        for filler in multi_word:
            pattern = r"(?<!\w)" + re.escape(filler) + r"(?!\w)"
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        # Single-word fillers as whole-word matches
        single_word = [f for f in _FILLER_WORDS if " " not in f]
        for filler in single_word:
            pattern = r"\b" + re.escape(filler) + r"\b"
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _repair_punctuation(text: str) -> str:
        """
        Clean up punctuation artifacts from filler removal.

        - Remove leading commas, dots and whitespace
        - Remove trailing commas, dots and whitespace
        - Collapse any run of commas (optionally separated by whitespace)
          into a single comma

        Other punctuation, including a trailing "?", is left untouched.
        """
        text = re.sub(r"^[,.\s]+", "", text)
        text = re.sub(r"[,.\s]+$", "", text)
        # One pass must absorb the whole run: a pairwise ",\s*," pattern
        # leaves ", ," behind for three or more consecutive commas.
        text = re.sub(r",(?:\s*,)+", ",", text)
        return text.strip()

    # ------------------------------------------------------------------ #
    # Language Detection                                                  #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _detect_language(text: str) -> str:
        """
        Detect the language code of the text.

        Returns "en" without attempting detection when the text is empty or
        has fewer than three whitespace-separated words, and also when
        langdetect is unavailable or detection fails. Otherwise returns
        whatever code ``langdetect.detect`` reports.
        """
        if not text or len(text.split()) < 3:
            return "en"

        # Imported lazily so the module still works without langdetect.
        try:
            from langdetect import detect
        except ImportError:
            logger.debug("langdetect is not installed; defaulting language to 'en'")
            return "en"

        # NOTE(review): langdetect's documentation describes detection as
        # non-deterministic unless DetectorFactory.seed is set — confirm
        # whether the application seeds it at startup.
        try:
            return detect(text)
        except Exception:
            # Best-effort by design: any detection failure falls back to "en".
            logger.debug(
                "Language detection failed; defaulting to 'en'", exc_info=True
            )
            return "en"

    # ------------------------------------------------------------------ #
    # Intent Classification                                               #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _classify_intent(text: str) -> str:
        """
        Classify query into: factual | summary | compare.

        Priority: compare > summary > factual
        (compare and summary are more specific; factual is the default)
        """
        text_lower = text.lower().strip()

        # Families are tried in priority order; the first one with any
        # matching pattern decides the label.
        prioritized = (
            ("compare", _COMPARE_PATTERNS),
            ("summary", _SUMMARY_PATTERNS),
            ("factual", _FACTUAL_PATTERNS),
        )
        for label, patterns in prioritized:
            if any(re.search(pattern, text_lower) for pattern in patterns):
                return label

        # Default: factual
        return "factual"