"""
VoiceVault — Query Preprocessor
==================================
Cleans raw Whisper transcripts and classifies query intent.
Steps:
1. Lowercase normalization
2. Filler word removal (um, uh, like, you know, etc.)
3. Punctuation repair (double spaces, leading/trailing cleanup)
4. Language detection (langdetect)
5. Query type classification: factual | summary | compare
The query type drives retrieval strategy in Phase 4:
- factual → standard hybrid retrieval, concise answer (< 100 words)
- summary → higher final_top_k, longer answer (up to 300 words)
- compare → 2-step retrieval targeting multiple entities
Usage:
preprocessor = QueryPreprocessor()
result = preprocessor.process("Um, what is, like, machine learning?")
# result.processed_query == "what is, machine learning?"
# result.query_type == "factual"
# result.language == "en"
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# ------------------------------------------------------------------ #
# Query Type Patterns #
# ------------------------------------------------------------------ #
# Regexes that mark a query as "factual". Every pattern is anchored at the
# start of the text; QueryPreprocessor._classify_intent applies them with
# re.search to the lower-cased query, after the compare and summary patterns
# have been tried. "factual" is also that method's fallback label, so a miss
# here does not change the classification.
_FACTUAL_PATTERNS = [
    r"^what (is|are|was|were)\b",
    r"^who (is|are|was|were)\b",
    r"^when (did|does|is|was)\b",
    r"^where (is|are|was|were|did)\b",
    r"^which\b",
    r"^how (many|much|long|old|far|often)\b",
    r"^define\b",
    r"^what does .+ mean\b",
]
# Regexes that mark a query as "summary". Every pattern is anchored at the
# start of the text; QueryPreprocessor._classify_intent applies them with
# re.search to the lower-cased query, before the factual patterns.
_SUMMARY_PATTERNS = [
    r"^summar(ise|ize)\b",
    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
    r"^(describe|explain|tell me about)\b",
    # Accept the plural phrasing too ("what are the main points"), not only
    # "what is the main ..."; otherwise such queries fall through to factual.
    r"^what ((is|are) the )?(main|key|primary|overall)\b",
    r"^overview of\b",
]
# Regexes that mark a query as "compare". Unlike the factual and summary
# patterns most of these are unanchored, so a comparison keyword anywhere in
# the query counts. QueryPreprocessor._classify_intent tries this list first.
_COMPARE_PATTERNS = [
    # Covers compare / compares / compared / comparing / comparison(s).
    r"\bcompar(e[sd]?|ing|isons?)\b",
    # Covers difference / differences / different.
    r"\bdifferen(ces?|t)\b",
    r"\bversus\b",
    # The trailing \b still matches "vs." because the optional dot backtracks.
    r"\bvs\.?\b",
    r"\bpros and cons\b",
    r"what (is|are) the difference",
    r"how does .+ differ from\b",
]
# Filler words and phrases to strip (as whole words, case-insensitive).
# Entries that contain a space are multi-word fillers;
# QueryPreprocessor._remove_fillers gives them priority over the single-word
# entries.
# NOTE(review): some entries ("like", "right", "so", "well") are also ordinary
# content words and are removed wherever they appear — confirm this is intended.
_FILLER_WORDS = {
    "um", "uh", "er", "ah", "eh", "like", "you know",
    "i mean", "basically", "literally", "actually", "right",
    "so", "well", "okay", "ok",
}
@dataclass
class PreprocessedQuery:
    """
    Outcome of running one transcript through the preprocessing pipeline.

    Attributes:
        raw_query: The transcript exactly as it was handed to the preprocessor.
        processed_query: The cleaned-up text.
        query_type: Intent label, one of "factual", "summary" or "compare".
        language: Language code (ISO 639-1).
    """

    raw_query: str
    processed_query: str
    query_type: str
    language: str
class QueryPreprocessor:
    """
    Cleans and classifies a raw Whisper transcript for optimal retrieval.

    All operations are pure Python — no ML models required.
    Language detection uses `langdetect` (lightweight, offline) when it is
    installed and falls back to "en" otherwise.
    """

    def process(self, raw_query: str) -> PreprocessedQuery:
        """
        Apply the full preprocessing pipeline to a raw transcript.

        The text is lower-cased, stripped of filler words, and has the
        punctuation left behind by filler removal tidied up. Inner commas and
        a trailing "?" or "!" are kept, so
        "Um, what is, like, machine learning?" becomes
        "what is, machine learning?".

        Args:
            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

        Returns:
            PreprocessedQuery with cleaned text, classified type, and language.
            Empty or whitespace-only input yields an empty processed query
            with the defaults "factual" / "en".
        """
        if not raw_query or not raw_query.strip():
            return PreprocessedQuery(
                raw_query=raw_query,
                processed_query="",
                query_type="factual",
                language="en",
            )

        cleaned = self._normalize(raw_query)
        cleaned = self._remove_fillers(cleaned)
        cleaned = self._repair_punctuation(cleaned)

        return PreprocessedQuery(
            raw_query=raw_query,
            processed_query=cleaned,
            query_type=self._classify_intent(cleaned),
            language=self._detect_language(cleaned),
        )

    # ------------------------------------------------------------------ #
    # Normalization                                                        #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase the text and collapse every whitespace run to one space."""
        return re.sub(r"\s+", " ", text.lower()).strip()

    @staticmethod
    def _remove_fillers(text: str) -> str:
        """
        Remove spoken filler words and phrases as whole words.

        Each filler is replaced by a space and whitespace is collapsed
        afterwards. Punctuation that surrounded a filler is left in place;
        `_repair_punctuation` cleans that up.

        Args:
            text: Text to strip fillers from.

        Returns:
            The text without fillers, single-spaced and trimmed.
        """
        # One alternation, longest filler first, so multi-word fillers such as
        # "you know" take priority over shorter entries.
        longest_first = sorted(_FILLER_WORDS, key=len, reverse=True)
        alternation = "|".join(re.escape(filler) for filler in longest_first)
        # The look-arounds enforce whole-word matches ("ok" must not hit "okay").
        pattern = r"(?<!\w)(?:" + alternation + r")(?!\w)"
        text = re.sub(pattern, " ", text, flags=re.IGNORECASE)
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _repair_punctuation(text: str) -> str:
        """
        Clean up punctuation artifacts from filler removal.

        - Remove whitespace left in front of a comma ("is , x" -> "is, x")
        - Drop a comma stranded in front of "?" or "!" ("x, ?" -> "x?")
        - Collapse any run of commas into a single comma
        - Remove leading and trailing commas, dots and whitespace

        A trailing "?" or "!" is preserved; no punctuation is ever added.

        Args:
            text: Text as returned by `_remove_fillers`.

        Returns:
            The text with the artifacts above removed.
        """
        text = re.sub(r"\s+,", ",", text)
        text = re.sub(r",\s*(?=[?!])", "", text)
        # `+` handles runs of three or more commas, which a pairwise
        # replacement would leave as ", ,".
        text = re.sub(r",(?:\s*,)+", ",", text)
        text = re.sub(r"^[,.\s]+", "", text)
        text = re.sub(r"[,.\s]+$", "", text)
        return text

    # ------------------------------------------------------------------ #
    # Language Detection                                                   #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _detect_language(text: str) -> str:
        """
        Detect the language code of the text.

        Args:
            text: Cleaned query text.

        Returns:
            The code reported by `langdetect.detect`, or "en" when the text
            has fewer than three words, langdetect is not installed, or
            detection fails.
        """
        # Very short inputs are not sent to the detector at all.
        if not text or len(text.split()) < 3:
            return "en"

        log = logging.getLogger(__name__)
        try:
            from langdetect import detect

            # NOTE(review): per the langdetect README, results may vary between
            # runs unless DetectorFactory.seed is set, and Chinese is reported
            # as "zh-cn"/"zh-tw" — confirm callers tolerate both.
            return detect(text)
        except ImportError:
            log.debug("langdetect is not installed; defaulting language to 'en'")
        except Exception:
            # Deliberately best-effort: a detection failure must not break
            # query handling, but it should be visible when debugging.
            log.debug("Language detection failed; defaulting to 'en'", exc_info=True)
        return "en"

    # ------------------------------------------------------------------ #
    # Intent Classification                                                #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _classify_intent(text: str) -> str:
        """
        Classify query into: factual | summary | compare.

        Priority: compare > summary > factual
        (compare and summary are more specific; factual is the default)

        Args:
            text: Query text; it is lower-cased and trimmed before matching.

        Returns:
            The label of the first pattern group with a match, or "factual"
            when nothing matches.
        """
        candidate = text.lower().strip()
        ranked_groups = (
            ("compare", _COMPARE_PATTERNS),
            ("summary", _SUMMARY_PATTERNS),
            ("factual", _FACTUAL_PATTERNS),
        )
        for label, patterns in ranked_groups:
            if any(re.search(pattern, candidate) for pattern in patterns):
                return label
        # Default: factual
        return "factual"
|