""" BarVox Audio Processing API - Pydantic Data Models """ from pydantic import BaseModel from typing import Optional, List, Dict, Any class DictionaryRecording(BaseModel): features: Dict[str, Any] class DictionaryEntry(BaseModel): id: str label: str recordings: List[DictionaryRecording] z_floor: Optional[float] = None # Per-word minimum z-score (computed at bank load time) dtw_z_floor: Optional[float] = None # Per-word minimum z-score in DTW space (computed at bank load time) cosine_floor: Optional[float] = None # Per-word whitened cosine floor (computed at bank load time) class ExtractBankRequest(BaseModel): bank_name: str silero_params: Optional[Dict[str, Any]] = None class SimilarityRequest(BaseModel): test_features: Dict[str, Any] dictionary_entries: List[DictionaryEntry] dtw_params: Optional[Dict[str, Any]] = None similarity_mode: Optional[str] = "mean" # "mean", "dtw", or "hybrid" (mean filter → DTW re-rank) hybrid_top_n: Optional[int] = 8 # Number of top candidates to re-rank with DTW in hybrid mode (increased from 5) # Unknown rejection: better to say "unknown" than give a wrong prediction unknown_threshold: Optional[float] = None # Cosine mean-score floor (from bank cosine self-similarity) dtw_calibration_threshold: Optional[float] = None # DTW score floor (from bank DTW self-similarity) unknown_min_gap: Optional[float] = None # Min DTW gap between 1st and 2nd (0 = disabled) unknown_z_threshold: Optional[float] = 2.0 # Z-score threshold: top word must be this many std devs above mean # Embedding whitening: breaks HuBERT anisotropy so cosine scores have real range global_mean_embedding: Optional[List[float]] = None # Global mean of all bank HuBERT embeddings (from /extract_bank) # CTC entropy: detects English words that HuBERT recognizes confidently ctc_entropy_threshold: Optional[float] = None # Reject if entropy < this AND DTW below floor (start: 1.0)