Spaces:

NinjainPJs
/

VoiceVault

Running

File size: 7,132 Bytes

85f900d

"""
VoiceVault — Query Preprocessor
==================================
Cleans raw Whisper transcripts and classifies query intent.

Steps:
  1. Lowercase normalization
  2. Filler word removal (um, uh, like, you know, etc.)
  3. Punctuation repair (double spaces, leading/trailing cleanup)
  4. Language detection (langdetect)
  5. Query type classification: factual | summary | compare

The query type drives retrieval strategy in Phase 4:
  - factual  → standard hybrid retrieval, concise answer (< 100 words)
  - summary  → higher final_top_k, longer answer (up to 300 words)
  - compare  → 2-step retrieval targeting multiple entities

Usage:
    preprocessor = QueryPreprocessor()
    result = preprocessor.process("Um, what is, like, machine learning?")
    # result.processed_query == "what is, machine learning?"
    # result.query_type == "factual"
    # result.language == "en"
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# ------------------------------------------------------------------ #
# Query Type Patterns                                                   #
# ------------------------------------------------------------------ #

# Regexes for fact-lookup questions. Every pattern is anchored with ``^``, so
# it only matches at the very start of the text handed to the classifier.
# "factual" is also the classifier's fallback label.
_FACTUAL_PATTERNS = [
    r"^what (is|are|was|were)\b",
    r"^who (is|are|was|were)\b",
    r"^when (did|does|is|was)\b",
    r"^where (is|are|was|were|did)\b",
    r"^which\b",
    r"^how (many|much|long|old|far|often)\b",
    r"^define\b",
    r"^what does .+ mean\b",
]

# Regexes for requests that want an overview rather than a single fact.
# Also anchored at the start of the text. The classifier tests these before
# the factual patterns, so "what is the main ..." is labelled "summary".
_SUMMARY_PATTERNS = [
    r"^summar(ise|ize)\b",  # "summarise" / "summarize"
    r"^give (me )?(an? )?(overview|summary|brief|outline)\b",
    r"^(describe|explain|tell me about)\b",
    r"^what (is the )?(main|key|primary|overall)\b",
    r"^overview of\b",
]

# Regexes for comparison requests. None of these is anchored with ``^``, so a
# match anywhere in the text counts. The classifier tests this list first.
_COMPARE_PATTERNS = [
    r"\bcompar(e|ing)\b",
    r"\bdifferen(ce|t)\b",  # "difference" or "different"
    r"\bversus\b",
    r"\bvs\.?\b",  # "vs", optionally followed by a period
    r"\bpros and cons\b",
    r"what (is|are) the difference",
    r"how does .+ differ from\b",
]

# Spoken fillers that QueryPreprocessor._remove_fillers strips from a
# transcript (matched as whole words/phrases, case-insensitively).
# NOTE(review): several entries ("like", "right", "so", "well", "actually")
# are also ordinary content words; they are removed regardless of context,
# e.g. in "what does it look like".
_FILLER_WORDS = {
    "um", "uh", "er", "ah", "eh", "like", "you know",
    "i mean", "basically", "literally", "actually", "right",
    "so", "well", "okay", "ok",
}


@dataclass
class PreprocessedQuery:
    """
    Outcome of running one raw transcript through the preprocessing pipeline.

    Attributes:
        raw_query: The transcript exactly as it was handed to the preprocessor.
        processed_query: The cleaned text (empty string for blank input).
        query_type: One of "factual", "summary" or "compare".
        language: Detected language code; "en" is the fallback value.
    """

    raw_query: str
    processed_query: str
    query_type: str
    language: str


class QueryPreprocessor:
    """
    Cleans and classifies a raw Whisper transcript for optimal retrieval.

    Cleaning and intent classification are rule-based (regular expressions
    only) — no ML models required. Language detection uses `langdetect`,
    imported lazily; when it is missing or fails the language falls back
    to "en".
    """

    def process(self, raw_query: str) -> PreprocessedQuery:
        """
        Apply the full preprocessing pipeline to a raw transcript.

        Pipeline: lowercase/whitespace normalization → filler removal →
        punctuation repair → language detection → intent classification.

        Args:
            raw_query: Raw text from Whisper (may contain fillers, casing, noise).

        Returns:
            PreprocessedQuery with cleaned text, classified type, and language.
            Empty or whitespace-only input yields an empty ``processed_query``
            with the defaults ``query_type="factual"`` and ``language="en"``.
        """
        if not raw_query or not raw_query.strip():
            return PreprocessedQuery(
                raw_query=raw_query,
                processed_query="",
                query_type="factual",
                language="en",
            )

        text = self._normalize(raw_query.strip())
        text = self._remove_fillers(text)
        text = self._repair_punctuation(text)

        # Both steps look at the cleaned text, not at the raw transcript.
        language = self._detect_language(text)
        query_type = self._classify_intent(text)

        return PreprocessedQuery(
            raw_query=raw_query,
            processed_query=text,
            query_type=query_type,
            language=language,
        )

    # ------------------------------------------------------------------ #
    # Normalization                                                         #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase *text* and collapse every whitespace run to one space."""
        return re.sub(r"\s+", " ", text.lower()).strip()

    @staticmethod
    def _remove_fillers(text: str) -> str:
        """
        Remove spoken filler words and phrases (``_FILLER_WORDS``).

        Fillers are matched case-insensitively and only as stand-alone
        words: a filler that is glued to a neighbouring word character or
        hyphen is left alone, so compounds such as "well-known" or
        "human-like" survive intact. Punctuation left behind (e.g. orphaned
        commas) is not touched here; ``_repair_punctuation`` handles it.

        Args:
            text: Text to clean.

        Returns:
            The text without fillers, with whitespace collapsed and trimmed.
        """
        # Longest alternative first so a multi-word filler ("you know") is
        # preferred over any shorter filler starting at the same position;
        # the secondary sort key just keeps the pattern deterministic.
        fillers = sorted(_FILLER_WORDS, key=lambda f: (-len(f), f))
        if fillers:
            # An empty alternation would match everywhere, hence the guard.
            alternation = "|".join(re.escape(f) for f in fillers)
            # \b would treat "-" as a word boundary and mangle hyphenated
            # compounds, so the guards also reject an adjacent hyphen.
            pattern = r"(?<![\w-])(?:" + alternation + r")(?![\w-])"
            text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _repair_punctuation(text: str) -> str:
        """
        Clean up punctuation artifacts from filler removal.

        - Collapse any run of commas (with optional whitespace between or
          before them) into a single comma
        - Drop a comma left dangling directly before "?" or "!"
        - Remove leading and trailing commas, dots and whitespace

        A trailing "?" or "!" is preserved; no punctuation is ever added.
        """
        # "a, , , b" -> "a, b" and "a , b" -> "a, b". One pass handles runs
        # of any length (a plain ",\s*," substitution misses every other
        # comma because its matches cannot overlap).
        text = re.sub(r"(?:\s*,)+", ",", text)
        # "what is ml, ?" -> "what is ml?"
        text = re.sub(r",\s*(?=[?!])", "", text)
        text = re.sub(r"^[,.\s]+", "", text)
        text = re.sub(r"[,.\s]+$", "", text)
        return text

    # ------------------------------------------------------------------ #
    # Language Detection                                                    #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _detect_language(text: str) -> str:
        """
        Detect the language code of *text*.

        Returns whatever `langdetect.detect` reports — ISO 639-1 codes for
        most languages (NOTE(review): langdetect reports Chinese as
        "zh-cn"/"zh-tw"; confirm downstream consumers accept that).

        Returns "en" without attempting detection when the text has fewer
        than three whitespace-separated words, and also when langdetect is
        unavailable or detection fails.
        """
        if not text or len(text.split()) < 3:
            return "en"
        try:
            from langdetect import DetectorFactory, detect

            # langdetect is randomized by default, so the same short query
            # can be labelled differently from call to call; a fixed seed
            # makes the result reproducible.
            DetectorFactory.seed = 0
            return detect(text)
        except ImportError:
            logger.debug("langdetect is not available; defaulting language to 'en'")
        except Exception:
            # Deliberately best-effort: detection must never break the
            # pipeline, but the failure should at least be traceable.
            logger.debug(
                "Language detection failed; defaulting to 'en'", exc_info=True
            )
        return "en"

    # ------------------------------------------------------------------ #
    # Intent Classification                                                 #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _classify_intent(text: str) -> str:
        """
        Classify query into: factual | summary | compare.

        Priority: compare > summary > factual
        (compare and summary are more specific; factual is the default,
        so text matching no pattern at all is also labelled "factual")
        """
        lowered = text.lower().strip()

        # Ordered from most to least specific; the first family with any
        # matching pattern wins.
        families = (
            ("compare", _COMPARE_PATTERNS),
            ("summary", _SUMMARY_PATTERNS),
            ("factual", _FACTUAL_PATTERNS),
        )
        for label, patterns in families:
            if any(re.search(pattern, lowered) for pattern in patterns):
                return label

        # Default: factual
        return "factual"