Spaces:

NinjainPJs
/

VoiceVault

Running

App Files Files Community

VoiceVault / voicevault /asr /query_preprocessor.py

NinjainPJs

Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent

85f900d 3 months ago

raw

history blame contribute delete

7.13 kB

	"""
	VoiceVault — Query Preprocessor
	==================================
	Cleans raw Whisper transcripts and classifies query intent.

	Steps:
	1. Lowercase normalization
	2. Filler word removal (um, uh, like, you know, etc.)
	3. Punctuation repair (double spaces, leading/trailing cleanup)
	4. Language detection (langdetect)
	5. Query type classification: factual | summary | compare

	The query type drives retrieval strategy in Phase 4:
	- factual → standard hybrid retrieval, concise answer (< 100 words)
	- summary → higher final_top_k, longer answer (up to 300 words)
	- compare → 2-step retrieval targeting multiple entities

	Usage:
	preprocessor = QueryPreprocessor()
	result = preprocessor.process("Um, what is, like, machine learning?")
	# result.processed_query == "what is, machine learning?"
	# result.query_type == "factual"
	# result.language == "en"
	"""

	from __future__ import annotations

	import logging
	import re
	from dataclasses import dataclass

	logger = logging.getLogger(__name__)

	# ------------------------------------------------------------------ #
	# Query Type Patterns #
	# ------------------------------------------------------------------ #

	# Regexes for short, specific-answer questions. They are searched against
	# the lower-cased query, and most are anchored to its start.
	# Alternatives use a real regex "|": an escaped "\|" would only ever match
	# a literal pipe character in the query.
	_FACTUAL_PATTERNS = [
	    r"^what (is|are|was|were)\b",
	    r"^who (is|are|was|were)\b",
	    r"^when (did|does|is|was)\b",
	    r"^where (is|are|was|were|did)\b",
	    r"^which\b",
	    r"^how (many|much|long|old|far|often)\b",
	    r"^define\b",
	    r"^what does .+ mean\b",
	]

	# Regexes for requests that want an overview rather than a single fact.
	# All are anchored to the start of the lower-cased query.
	# Alternatives use a real regex "|": an escaped "\|" would only ever match
	# a literal pipe character in the query.
	_SUMMARY_PATTERNS = [
	    r"^summar(ise|ize)\b",
	    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
	    r"^(describe|explain|tell me about)\b",
	    r"^what (is the )?(main|key|primary|overall)\b",
	    r"^overview of\b",
	]

	# Regexes for queries that contrast two or more things. Unlike the other
	# pattern lists these are not anchored, so they match anywhere in the
	# lower-cased query.
	# Alternatives use a real regex "|": an escaped "\|" would only ever match
	# a literal pipe character in the query.
	_COMPARE_PATTERNS = [
	    r"\bcompar(e|ing)\b",
	    r"\bdifferen(ce|t)\b",
	    r"\bversus\b",
	    r"\bvs\.?\b",
	    r"\bpros and cons\b",
	    r"what (is|are) the difference",
	    r"how does .+ differ from\b",
	]

	# Spoken fillers stripped from transcripts. Entries containing a space are
	# phrases; the rest are single words. Matching is whole-word and
	# case-insensitive (see QueryPreprocessor._remove_fillers).
	_FILLER_WORDS = {
	    # hesitation sounds
	    "um",
	    "uh",
	    "er",
	    "ah",
	    "eh",
	    # multi-word phrases
	    "you know",
	    "i mean",
	    # discourse markers
	    "like",
	    "basically",
	    "literally",
	    "actually",
	    "right",
	    "so",
	    "well",
	    "okay",
	    "ok",
	}


	@dataclass
	class PreprocessedQuery:
	    """
	    Outcome of running one raw transcript through the preprocessor.

	    Attributes:
	        raw_query: The transcript as it was passed in, untouched.
	        processed_query: The cleaned-up text.
	        query_type: Intent label: "factual", "summary" or "compare".
	        language: Language code of the query (ISO 639-1).
	    """

	    raw_query: str
	    processed_query: str
	    query_type: str
	    language: str


	class QueryPreprocessor:
	    """
	    Cleans and classifies a raw Whisper transcript for optimal retrieval.

	    Cleaning and intent classification are plain string/regex operations.
	    Language detection is delegated to the optional `langdetect` package
	    and falls back to "en" when that package is missing or fails.
	    """

	    def process(self, raw_query: str) -> PreprocessedQuery:
	        """
	        Apply the full preprocessing pipeline to a raw transcript.

	        The text is normalized, stripped of fillers and repaired; language
	        and intent are then derived from the cleaned text.

	        Args:
	            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

	        Returns:
	            PreprocessedQuery with cleaned text, classified type, and language.
	            Empty or whitespace-only input yields an empty processed query
	            with the defaults "factual" / "en".
	        """
	        if not raw_query or not raw_query.strip():
	            return PreprocessedQuery(
	                raw_query=raw_query,
	                processed_query="",
	                query_type="factual",
	                language="en",
	            )

	        text = raw_query.strip()
	        text = self._normalize(text)
	        text = self._remove_fillers(text)
	        text = self._repair_punctuation(text)

	        language = self._detect_language(text)
	        query_type = self._classify_intent(text)

	        return PreprocessedQuery(
	            raw_query=raw_query,
	            processed_query=text,
	            query_type=query_type,
	            language=language,
	        )

	    # ------------------------------------------------------------------ #
	    # Normalization #
	    # ------------------------------------------------------------------ #

	    @staticmethod
	    def _normalize(text: str) -> str:
	        """Lowercase the text and collapse every whitespace run to one space."""
	        text = text.lower()
	        text = re.sub(r"\s+", " ", text)
	        return text.strip()

	    @staticmethod
	    def _remove_fillers(text: str) -> str:
	        """
	        Remove spoken filler words and phrases as whole words.

	        Multi-word fillers (e.g., "you know") are removed first, longest
	        first, so a phrase is never broken up by a single-word removal.
	        Each filler is replaced by a space; whitespace is collapsed at the
	        end. Punctuation that surrounded a filler is left in place for
	        `_repair_punctuation` to tidy up.
	        """
	        # NOTE(review): entries such as "like", "right", "so" and "well" are
	        # removed everywhere, including where they carry meaning
	        # ("what does it look like") -- confirm this is acceptable.
	        multi_word = sorted(
	            (f for f in _FILLER_WORDS if " " in f),
	            key=len,
	            reverse=True,
	        )
	        for filler in multi_word:
	            pattern = r"(?<!\w)" + re.escape(filler) + r"(?!\w)"
	            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

	        # Single-word fillers as whole-word matches
	        single_word = [f for f in _FILLER_WORDS if " " not in f]
	        for filler in single_word:
	            pattern = r"\b" + re.escape(filler) + r"\b"
	            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

	        return re.sub(r"\s+", " ", text).strip()

	    @staticmethod
	    def _repair_punctuation(text: str) -> str:
	        """
	        Clean up punctuation artifacts left behind by filler removal.

	        - Strip leading and trailing commas, dots and whitespace
	        - Collapse any run of commas (", , ,") into a single comma
	        - Drop whitespace stranded in front of a comma ("is , ml")

	        Other punctuation, including a trailing "?", is left untouched.
	        """
	        text = re.sub(r"^[,.\s]+", "", text)
	        text = re.sub(r"[,.\s]+$", "", text)
	        # Match the whole run at once: a pairwise ",\s*," substitution leaves
	        # a stray comma behind whenever three or more are adjacent.
	        text = re.sub(r"\s*,(?:\s*,)*", ",", text)
	        return text.strip()

	    # ------------------------------------------------------------------ #
	    # Language Detection #
	    # ------------------------------------------------------------------ #

	    @staticmethod
	    def _detect_language(text: str) -> str:
	        """
	        Detect the language code of the text.

	        Returns "en" when the text has fewer than three words, when
	        `langdetect` is not installed, or when detection raises. Failures
	        are logged at debug level instead of being dropped silently.
	        """
	        # NOTE(review): langdetect reports some languages with a region
	        # suffix (e.g. "zh-cn"), which is not a bare ISO 639-1 code --
	        # confirm downstream consumers cope with that.
	        if not text or len(text.split()) < 3:
	            return "en"
	        try:
	            from langdetect import detect
	            return detect(text)
	        except ImportError:
	            logger.debug("langdetect is not installed; defaulting language to 'en'")
	        except Exception:
	            # Best-effort: detection problems must never break the pipeline.
	            logger.debug(
	                "Language detection failed; defaulting to 'en'", exc_info=True
	            )
	        return "en"

	    # ------------------------------------------------------------------ #
	    # Intent Classification #
	    # ------------------------------------------------------------------ #

	    @staticmethod
	    def _classify_intent(text: str) -> str:
	        """
	        Classify query into: factual | summary | compare.

	        Priority: compare > summary > factual
	        (compare and summary are more specific; factual is the default
	        when nothing matches)
	        """
	        text_lower = text.lower().strip()

	        # Ordered from most to least specific; the first list with a hit wins.
	        ordered = (
	            ("compare", _COMPARE_PATTERNS),
	            ("summary", _SUMMARY_PATTERNS),
	            ("factual", _FACTUAL_PATTERNS),
	        )
	        for label, patterns in ordered:
	            if any(re.search(pattern, text_lower) for pattern in patterns):
	                return label

	        # Default: factual
	        return "factual"