File size: 3,327 Bytes
857b1b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2197627
857b1b2
 
 
 
 
 
 
5155dc9
857b1b2
 
 
 
 
 
 
435bead
f7543f2
1fe53e1
 
57d5a31
 
 
857b1b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Application configuration using Pydantic Settings.
"""

import os
from pathlib import Path
from functools import lru_cache
from typing import Literal, Dict

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Values come from the process environment and from a UTF-8 ``.env`` file;
    keys that do not match a declared field are ignored (``extra="ignore"``).
    After validation, ``model_post_init`` re-derives storage paths that were
    not explicitly provided and creates the upload/processed directories.
    """

    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )

    # HuggingFace
    hf_token: str = ""

    # Noise reduction toggle.
    # NOTE(review): how this differs from `enable_denoiser` below is not
    # visible in this file - confirm against the consumers of both flags.
    enable_noise_reduction: bool = True

    # Denoising (Speech Enhancement)
    enable_denoiser: bool = True

    # Display name -> model repository id for the selectable Whisper models.
    available_whisper_models: dict[str, str] = {
        "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2",
        "PhoWhisper Lora Finetuned": "vyluong/pho-whisper-vi-ct2",
    }

    # S2T model
    default_whisper_model: str = "vyluong/pho-whisper-vi-ct2"

    wav2vec_model: str = "vyluong/w2vbert_final"

    # Voice emotion detection model
    default_dual_emotion_model: str = "vyluong/emo_dual_classi"

    # Diarization model. Known pyannote alternatives:
    #   pyannote/speaker-diarization-3.1
    #   pyannote/speaker-diarization-community-1
    pyannote_model: str = "pyannote/speaker-diarization-community-1"

    sortformer_model: str = "nvidia/diar_sortformer_4spk-v1"

    # NOTE(review): presumably selects between the sortformer and pyannote
    # models above; the accepted values are not enforced here.
    diarization_backend: str = "sortformer"

    # Device settings
    device: Literal["cuda", "cpu", "auto"] = "auto"
    compute_type: str = "float16"  # float16 for GPU, int8 for CPU

    # Upload settings
    max_upload_size_mb: int = 100
    allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]

    # Audio processing settings
    sample_rate: int = 16000
    channels: int = 1  # Mono

    enable_loudnorm: bool = True

    # VAD parameters
    vad_threshold: float = 0.55
    vad_min_speech_duration_ms: int = 200
    vad_min_silence_duration_ms: int = 450
    vad_speech_pad_ms: int = 250

    # Post-processing
    merge_threshold_s: float = 0.35  # Merge segments from same speaker if gap < this
    min_segment_duration_s: float = 0.85  # Remove segments shorter than this

    # Server settings
    host: str = "0.0.0.0"
    port: int = 7860

    # Paths
    # These defaults are evaluated once, while the class body runs, so the
    # three child paths are built from the *default* `base_dir`.
    # `model_post_init` re-derives whichever ones were not explicitly
    # provided, so overriding a parent directory moves its children too.
    base_dir: Path = Path(__file__).parent.parent.parent
    data_dir: Path = base_dir / "data"
    upload_dir: Path = data_dir / "uploads"
    processed_dir: Path = data_dir / "processed"

    def model_post_init(self, context: object, /) -> None:
        """Finalize paths and create storage directories after validation.

        Any of ``data_dir``, ``upload_dir`` and ``processed_dir`` that was not
        supplied explicitly is recomputed from its (possibly overridden)
        parent. ``upload_dir`` and ``processed_dir`` are then created,
        including missing parents; existing directories are left alone.

        Args:
            context: Validation context passed in by pydantic; only forwarded.
        """
        super().model_post_init(context)

        # Snapshot first: assigning a field below marks it as "set".
        explicit = set(self.model_fields_set)
        # Order matters: data_dir must be settled before its children.
        for name, parent, leaf in (
            ("data_dir", "base_dir", "data"),
            ("upload_dir", "data_dir", "uploads"),
            ("processed_dir", "data_dir", "processed"),
        ):
            if name in explicit:
                continue
            derived = getattr(self, parent) / leaf
            # Only assign on a real change so an all-defaults instance is
            # left exactly as validation produced it.
            if getattr(self, name) != derived:
                setattr(self, name, derived)

        # Ensure directories exist
        for directory in (self.upload_dir, self.processed_dir):
            directory.mkdir(parents=True, exist_ok=True)

    @property
    def max_upload_size_bytes(self) -> int:
        """Return ``max_upload_size_mb`` converted to bytes (1 MB = 1024 * 1024)."""
        return self.max_upload_size_mb * 1024 * 1024

    @property
    def resolved_device(self) -> str:
        """Resolve 'auto' to actual device.

        Returns:
            ``device`` unchanged when it is ``"cuda"`` or ``"cpu"``. For
            ``"auto"``: ``"cuda"`` if torch imports and reports CUDA as
            available, otherwise ``"cpu"`` (also when torch is not installed).
        """
        if self.device == "auto":
            try:
                # Imported lazily so this module does not require torch.
                import torch

                return "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                return "cpu"
        return self.device

    @property
    def resolved_compute_type(self) -> str:
        """Get appropriate compute type for device.

        Returns:
            ``"float16"`` when ``resolved_device`` is ``"cuda"``, else ``"int8"``.
        """
        # NOTE(review): the `compute_type` field is not consulted here, so
        # setting it has no effect on this property - confirm that is intended.
        if self.resolved_device == "cuda":
            return "float16"
        return "int8"


@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call constructs ``Settings()``; ``lru_cache`` then hands the
    same object back on every later call.
    """
    settings = Settings()
    return settings