File size: 2,031 Bytes
423bed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
BarVox Audio Processing API - Pydantic Data Models
"""

from pydantic import BaseModel
from typing import Optional, List, Dict, Any

class DictionaryRecording(BaseModel):
    """One recording of a dictionary entry, represented by its feature payload."""

    # String-keyed feature mapping; values are typed as Any, so their contents
    # are not validated by this model.
    # NOTE(review): the expected keys/shapes are not visible in this file —
    # confirm against the code that produces the features.
    features: Dict[str, Any]
    
class DictionaryEntry(BaseModel):
    """A labelled dictionary word together with its reference recordings.

    ``id``, ``label`` and ``recordings`` are required. The three ``*_floor``
    fields are optional and default to ``None``.
    """

    id: str
    label: str
    recordings: List[DictionaryRecording]

    # Per-word floors (all computed at bank load time). None means no floor
    # was supplied for this entry.
    # Per-word minimum z-score.
    z_floor: Optional[float] = None
    # Per-word minimum z-score in DTW space.
    dtw_z_floor: Optional[float] = None
    # Per-word whitened cosine floor.
    cosine_floor: Optional[float] = None

class ExtractBankRequest(BaseModel):
    """Request model naming a bank, with optional ``silero_params`` overrides."""

    # Name of the bank to operate on (required).
    # NOTE(review): presumably the payload of the /extract_bank endpoint —
    # the route itself is not visible in this file; confirm.
    bank_name: str
    # Optional free-form parameter mapping; None when the caller sends none.
    # NOTE(review): presumably settings for Silero VAD — accepted keys are not
    # shown here; verify against the consumer.
    silero_params: Optional[Dict[str, Any]] = None


class SimilarityRequest(BaseModel):
    """Request model for comparing test features against dictionary entries.

    Only ``test_features`` and ``dictionary_entries`` are required; every other
    field is an optional tuning knob with the default shown below.
    """

    # Feature payload of the utterance under test (string-keyed, values untyped).
    test_features: Dict[str, Any]
    # Candidate entries the test features are compared against.
    dictionary_entries: List[DictionaryEntry]
    # Optional free-form DTW parameters.
    # NOTE(review): accepted keys are not visible in this file — confirm.
    dtw_params: Optional[Dict[str, Any]] = None
    # "mean", "dtw", or "hybrid" (mean filter → DTW re-rank).
    # The value is a plain str, so this model does not restrict it to those three.
    similarity_mode: Optional[str] = "mean"
    # Number of top candidates to re-rank with DTW in hybrid mode
    # (increased from 5).
    hybrid_top_n: Optional[int] = 8

    # Unknown rejection: better to say "unknown" than give a wrong prediction.
    # Cosine mean-score floor (from bank cosine self-similarity).
    unknown_threshold: Optional[float] = None
    # DTW score floor (from bank DTW self-similarity).
    dtw_calibration_threshold: Optional[float] = None
    # Min DTW gap between 1st and 2nd (0 = disabled).
    unknown_min_gap: Optional[float] = None
    # Z-score threshold: top word must be this many std devs above mean.
    unknown_z_threshold: Optional[float] = 2.0

    # Embedding whitening: breaks HuBERT anisotropy so cosine scores have real range.
    # Global mean of all bank HuBERT embeddings (from /extract_bank).
    global_mean_embedding: Optional[List[float]] = None

    # CTC entropy: detects English words that HuBERT recognizes confidently.
    # Reject if entropy < this AND DTW below floor (start: 1.0).
    # NOTE(review): "start: 1.0" reads as a suggested starting value; the actual
    # default here is None.
    ctc_entropy_threshold: Optional[float] = None