# barvox-backend / data_models.py
# RonenShilchikov
# Restructure: move Python backend into backend/ directory
# 423bed8
"""
BarVox Audio Processing API - Pydantic Data Models
"""
from pydantic import BaseModel
from typing import Optional, List, Dict, Any
class DictionaryRecording(BaseModel):
    # A single recording attached to a dictionary entry.

    # Free-form feature payload keyed by string; the value schema is not
    # validated by this model (Dict[str, Any]).
    features: Dict[str, Any]
class DictionaryEntry(BaseModel):
    # One dictionary word: its identifier, label, recordings, and optional
    # per-word score floors.

    id: str
    label: str
    recordings: List[DictionaryRecording]

    # Per-word floors. All three are optional and default to None; they are
    # computed at bank load time.

    # Per-word minimum z-score.
    z_floor: Optional[float] = None
    # Per-word minimum z-score in DTW space.
    dtw_z_floor: Optional[float] = None
    # Per-word whitened cosine floor.
    cosine_floor: Optional[float] = None
class ExtractBankRequest(BaseModel):
    # Request body identifying the bank to extract.
    # NOTE(review): presumably consumed by the /extract_bank endpoint — confirm
    # against the route handler.

    # Name of the bank to process (required).
    bank_name: str
    # Optional free-form parameter mapping; None when the caller sends none.
    # NOTE(review): the name suggests Silero settings — the accepted keys are
    # not defined in this model, verify against the consumer.
    silero_params: Optional[Dict[str, Any]] = None
class SimilarityRequest(BaseModel):
    # Request body for scoring test features against dictionary entries.

    # --- Inputs -----------------------------------------------------------
    # Free-form feature payload of the sample under test (schema not
    # validated by this model).
    test_features: Dict[str, Any]
    # Candidate words to compare against.
    dictionary_entries: List[DictionaryEntry]

    # --- Scoring mode -----------------------------------------------------
    # Optional free-form DTW parameter mapping.
    dtw_params: Optional[Dict[str, Any]] = None
    # "mean", "dtw", or "hybrid" (mean filter -> DTW re-rank).
    # The value is a plain string and is not restricted to these by the model.
    similarity_mode: Optional[str] = "mean"
    # Number of top candidates to re-rank with DTW in hybrid mode
    # (increased from 5).
    hybrid_top_n: Optional[int] = 8

    # --- Unknown rejection ------------------------------------------------
    # Better to say "unknown" than give a wrong prediction.
    # Cosine mean-score floor (from bank cosine self-similarity).
    unknown_threshold: Optional[float] = None
    # DTW score floor (from bank DTW self-similarity).
    dtw_calibration_threshold: Optional[float] = None
    # Min DTW gap between 1st and 2nd (0 = disabled).
    unknown_min_gap: Optional[float] = None
    # Z-score threshold: top word must be this many std devs above mean.
    unknown_z_threshold: Optional[float] = 2.0

    # --- Embedding whitening ----------------------------------------------
    # Breaks HuBERT anisotropy so cosine scores have real range.
    # Global mean of all bank HuBERT embeddings (from /extract_bank).
    global_mean_embedding: Optional[List[float]] = None

    # --- CTC entropy ------------------------------------------------------
    # Detects English words that HuBERT recognizes confidently.
    # Reject if entropy < this AND DTW below floor (start: 1.0).
    ctc_entropy_threshold: Optional[float] = None