Spaces:
Running
Running
File size: 3,327 Bytes
"""
Application configuration using Pydantic Settings.
"""
import os
from pathlib import Path
from functools import lru_cache
from typing import Literal, Dict
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Values are also read from a UTF-8 ``.env`` file; keys that match no
    field are ignored (``extra="ignore"``). Constructing an instance has a
    filesystem side effect: ``upload_dir`` and ``processed_dir`` are created
    if they do not exist yet.
    """

    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )

    # HuggingFace
    hf_token: str = ""

    # Noise reduction
    enable_noise_reduction: bool = True

    # Denoising (Speech Enhancement)
    enable_denoiser: bool = True

    # Display name -> model repository id
    available_whisper_models: Dict[str, str] = {
        "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2",
        "PhoWhisper Lora Finetuned": "vyluong/pho-whisper-vi-ct2",
    }

    # S2T model
    default_whisper_model: str = "vyluong/pho-whisper-vi-ct2"
    wav2vec_model: str = "vyluong/w2vbert_final"

    # Voice emotion detection model
    default_dual_emotion_model: str = "vyluong/emo_dual_classi"

    # Diarization model. Alternatives noted for pyannote_model:
    #   pyannote/speaker-diarization-3.1
    #   pyannote/speaker-diarization-community-1
    pyannote_model: str = "pyannote/speaker-diarization-community-1"
    sortformer_model: str = "nvidia/diar_sortformer_4spk-v1"
    diarization_backend: str = "sortformer"

    # Device settings
    device: Literal["cuda", "cpu", "auto"] = "auto"
    # float16 for GPU, int8 for CPU
    # NOTE(review): resolved_compute_type below does not read this field --
    # confirm whether consumers are meant to use it directly.
    compute_type: str = "float16"

    # Upload settings
    max_upload_size_mb: int = 100
    allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]

    # Audio processing settings
    sample_rate: int = 16000
    channels: int = 1  # Mono
    enable_loudnorm: bool = True

    # VAD parameters
    vad_threshold: float = 0.55
    vad_min_speech_duration_ms: int = 200
    vad_min_silence_duration_ms: int = 450
    vad_speech_pad_ms: int = 250

    # Post-processing
    merge_threshold_s: float = 0.35  # Merge segments from same speaker if gap < this
    min_segment_duration_s: float = 0.85  # Remove segments shorter than this

    # Server settings
    host: str = "0.0.0.0"
    port: int = 7860

    # Paths
    # These defaults are evaluated once, while the class body executes, so
    # they are all rooted at the *default* base_dir. __init__ re-derives the
    # dependent ones when a parent path is overridden.
    base_dir: Path = Path(__file__).parent.parent.parent
    data_dir: Path = base_dir / "data"
    upload_dir: Path = data_dir / "uploads"
    processed_dir: Path = data_dir / "processed"

    def __init__(self, **kwargs):
        """Load settings, reconcile derived paths and create storage dirs.

        Any of ``data_dir``, ``upload_dir`` and ``processed_dir`` that was
        not supplied explicitly (keyword argument, environment variable or
        ``.env`` entry) is recomputed from its parent path. Without this, an
        overridden ``base_dir`` or ``data_dir`` would leave the child paths
        pointing into the default tree. Explicitly supplied values are never
        touched.

        Args:
            **kwargs: Field overrides forwarded to ``BaseSettings``.
        """
        super().__init__(**kwargs)

        # Snapshot first: assigning to a field adds it to model_fields_set.
        supplied = set(self.model_fields_set)
        if "data_dir" not in supplied:
            self.data_dir = self.base_dir / "data"
        if "upload_dir" not in supplied:
            self.upload_dir = self.data_dir / "uploads"
        if "processed_dir" not in supplied:
            self.processed_dir = self.data_dir / "processed"

        # Ensure directories exist
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

    @property
    def max_upload_size_bytes(self) -> int:
        """Return ``max_upload_size_mb`` converted to bytes (1 MB = 1024 * 1024)."""
        return self.max_upload_size_mb * 1024 * 1024

    @property
    def resolved_device(self) -> str:
        """Resolve 'auto' to actual device.

        Returns:
            ``self.device`` unchanged unless it is ``"auto"``. For ``"auto"``
            the result is ``"cuda"`` when torch imports and reports CUDA as
            available, otherwise ``"cpu"`` (including when torch is not
            installed).
        """
        if self.device == "auto":
            try:
                # Imported lazily so the settings module works without torch.
                import torch

                return "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                return "cpu"
        return self.device

    @property
    def resolved_compute_type(self) -> str:
        """Get appropriate compute type for device.

        Returns:
            ``"float16"`` when ``resolved_device`` is ``"cuda"``, otherwise
            ``"int8"``.
        """
        if self.resolved_device == "cuda":
            return "float16"
        return "int8"
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call builds ``Settings()``; ``lru_cache`` memoizes that result,
    so every later call returns the same object without rebuilding it.
    """
    instance = Settings()
    return instance
|