# VoiceVault / voicevault /asr /query_preprocessor.py
# NinjainPJs's picture
# Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent
# 85f900d
"""
VoiceVault — Query Preprocessor
==================================
Cleans raw Whisper transcripts and classifies query intent.
Steps:
1. Lowercase normalization
2. Filler word removal (um, uh, like, you know, etc.)
3. Punctuation repair (double spaces, leading/trailing cleanup)
4. Language detection (langdetect)
5. Query type classification: factual | summary | compare
The query type drives retrieval strategy in Phase 4:
- factual → standard hybrid retrieval, concise answer (< 100 words)
- summary → higher final_top_k, longer answer (up to 300 words)
- compare → 2-step retrieval targeting multiple entities
Usage:
preprocessor = QueryPreprocessor()
result = preprocessor.process("Um, what is machine learning?")
# result.processed_query == "what is machine learning?"
# result.query_type == "factual"
# result.language == "en"
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# ------------------------------------------------------------------ #
# Query Type Patterns #
# ------------------------------------------------------------------ #
# Regexes that mark a query as "factual". Every entry is anchored to the
# start of the query with ``^``.
_FACTUAL_PATTERNS = [
    # Wh-openers followed by a copula or auxiliary verb
    r"^what (is|are|was|were)\b",
    r"^who (is|are|was|were)\b",
    r"^when (did|does|is|was)\b",
    r"^where (is|are|was|were|did)\b",
    # Selection questions
    r"^which\b",
    # Quantity / measure questions
    r"^how (many|much|long|old|far|often)\b",
    # Definition requests
    r"^define\b",
    r"^what does .+ mean\b",
]
# Regexes that mark a query as a request for a "summary". Every entry is
# anchored to the start of the query with ``^``.
_SUMMARY_PATTERNS = [
    # Explicit summarisation verbs (British and American spelling)
    r"^summar(ise|ize)\b",
    # "give (me) (a/an) overview|summary|brief|outline"
    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
    # Open-ended exposition requests
    r"^(describe|explain|tell me about)\b",
    # "what (is the) main|key|primary|overall ..."
    r"^what (is the )?(main|key|primary|overall)\b",
    r"^overview of\b",
]
# Regexes that mark a query as a "compare" request. Unlike the factual and
# summary lists, none of these is anchored to the start of the query, so
# they can match at any position.
_COMPARE_PATTERNS = [
    # Comparison keywords
    r"\bcompar(e|ing)\b",
    r"\bdifferen(ce|t)\b",
    r"\bversus\b",
    r"\bvs\.?\b",
    r"\bpros and cons\b",
    # Longer comparison phrasings
    r"what (is|are) the difference",
    r"how does .+ differ from\b",
]
# Spoken fillers stripped from transcripts. Entries are matched as whole
# words/phrases, ignoring case; multi-word entries are separated by a
# single space.
_FILLER_WORDS = {
    # Hesitation sounds
    "um", "uh", "er", "ah", "eh",
    # Discourse markers and hedges
    "like", "you know", "i mean",
    "basically", "literally", "actually", "right",
    # Sentence starters / acknowledgements
    "so", "well", "okay", "ok",
}
@dataclass
class PreprocessedQuery:
    """Outcome of running one raw transcript through the preprocessor."""

    # Transcript as it was handed to the preprocessor.
    raw_query: str
    # Transcript after preprocessing.
    processed_query: str
    # One of: factual | summary | compare
    query_type: str
    # ISO 639-1
    language: str
class QueryPreprocessor:
    """
    Clean and classify a raw Whisper transcript ahead of retrieval.

    The pipeline is regex-based and needs no ML model. Language detection
    is delegated to the optional ``langdetect`` package; when it cannot be
    imported, or detection fails, the language falls back to ``"en"``.

    Example:
        >>> QueryPreprocessor._remove_fillers(
        ...     "um, what is, like, machine learning?", {"um", "like"}
        ... )
        'what is machine learning?'
    """

    def process(self, raw_query: str) -> PreprocessedQuery:
        """
        Apply the full preprocessing pipeline to a raw transcript.

        Args:
            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

        Returns:
            PreprocessedQuery holding the untouched input, the cleaned text,
            the classified query type, and the detected language. Empty or
            whitespace-only input yields an empty ``processed_query`` with
            the defaults ``"factual"`` and ``"en"``.
        """
        if not raw_query or not raw_query.strip():
            return PreprocessedQuery(
                raw_query=raw_query,
                processed_query="",
                query_type="factual",
                language="en",
            )

        # _normalize() already trims, so no separate strip() is needed.
        text = self._normalize(raw_query)
        text = self._remove_fillers(text)
        text = self._repair_punctuation(text)

        return PreprocessedQuery(
            raw_query=raw_query,
            processed_query=text,
            query_type=self._classify_intent(text),
            language=self._detect_language(text),
        )

    # ------------------------------------------------------------------ #
    # Normalization                                                      #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase ``text`` and collapse every whitespace run to one space."""
        return re.sub(r"\s+", " ", text.lower()).strip()

    @staticmethod
    def _remove_fillers(text: str, fillers: set[str] | None = None) -> str:
        """
        Remove spoken filler words and phrases as whole words.

        A comma directly before and/or after a removed filler is removed with
        it, so ``"what is, like, machine learning"`` becomes
        ``"what is machine learning"`` rather than keeping an orphaned comma.
        Fillers that are part of a hyphenated or apostrophised word
        (``well-known``, ``so-called``) are left alone.

        NOTE: matching is purely lexical. Words such as "like", "right",
        "so" or "well" are removed even where they carry meaning
        (e.g. "what does it look like").

        Args:
            text: Transcript text; matching ignores case.
            fillers: Filler phrases to strip. ``None`` (the default) uses the
                module-level ``_FILLER_WORDS``; an empty collection removes
                nothing.

        Returns:
            ``text`` without fillers, whitespace collapsed and trimmed.
        """
        vocabulary = _FILLER_WORDS if fillers is None else fillers

        # Longest phrase first so a multi-word filler wins over a shorter one
        # starting at the same position. The alphabetical tiebreak keeps the
        # generated pattern stable, since set iteration order is not.
        phrases = sorted(
            {" ".join(filler.split()) for filler in vocabulary} - {""},
            key=lambda phrase: (-len(phrase), phrase),
        )
        if phrases:
            alternation = "|".join(
                r"\s+".join(re.escape(word) for word in phrase.split())
                for phrase in phrases
            )
            pattern = (
                r"(?:,\s*)?"  # comma that introduced the filler
                r"(?<![\w'-])(?:" + alternation + r")(?![\w'-])"
                r"(?:\s*,)?"  # comma that closed the filler
            )
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _repair_punctuation(text: str) -> str:
        """
        Clean up punctuation artifacts left behind by filler removal.

        - Drop whitespace stranded in front of ``,``, ``?`` or ``!``
        - Collapse any run of commas into a single comma
        - Strip leading and trailing commas, dots and whitespace

        A terminal ``?`` or ``!`` is preserved.
        """
        text = re.sub(r"\s+([,?!])", r"\1", text)
        text = re.sub(r",(?:\s*,)+", ",", text)
        text = re.sub(r"^[,.\s]+", "", text)
        text = re.sub(r"[,.\s]+$", "", text)
        return text

    # ------------------------------------------------------------------ #
    # Language Detection                                                 #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _detect_language(text: str) -> str:
        """
        Detect the language code of ``text``.

        Returns ``"en"`` for texts shorter than three words, when
        ``langdetect`` is unavailable, or when detection fails. The code
        reported by ``langdetect`` is returned unchanged.
        NOTE(review): langdetect reports Chinese as ``zh-cn`` / ``zh-tw``,
        which is not plain ISO 639-1 — confirm downstream expectations.
        """
        if not text or len(text.split()) < 3:
            return "en"

        log = logging.getLogger(__name__)
        try:
            # Imported lazily so the dependency stays optional.
            from langdetect import DetectorFactory, detect

            # langdetect is randomised by default; pin a seed (unless the
            # application already chose one) so repeated queries agree.
            if DetectorFactory.seed is None:
                DetectorFactory.seed = 0
            return detect(text)
        except ImportError:
            log.debug("langdetect is not installed; defaulting language to 'en'")
        except Exception:
            # Best-effort by design: detection must never fail the pipeline.
            log.debug("Language detection failed; defaulting to 'en'", exc_info=True)
        return "en"

    # ------------------------------------------------------------------ #
    # Intent Classification                                              #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _classify_intent(text: str) -> str:
        """
        Classify query into: factual | summary | compare.

        Priority: compare > summary > factual. The first group with a
        matching pattern wins; a query matching nothing is "factual".
        """
        candidate = text.lower().strip()
        ordered_groups = (
            ("compare", _COMPARE_PATTERNS),
            ("summary", _SUMMARY_PATTERNS),
            ("factual", _FACTUAL_PATTERNS),
        )
        for label, patterns in ordered_groups:
            if any(re.search(pattern, candidate) for pattern in patterns):
                return label
        # Default: factual
        return "factual"