# voice-detection-api / config.py
# fix(backend): Convert PyTorch thread execution to bounded async pool
# to prevent OOM on HF Spaces (commit 4eae08d, shivam0897-i)
"""
Configuration management using Pydantic Settings.
"""
import json
from typing import List

from pydantic import Field
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application configuration.

    Every field can be overridden by an environment variable of the same
    (case-sensitive) name, or via a local ``.env`` file. ``API_KEY`` is the
    only required setting; all others carry production-safe defaults.
    """

    # Core API Settings
    API_KEY: str = Field(..., description="API Key for authentication")
    PORT: int = Field(7860, description="Server port")
    WEBSITE_URL: str = Field(
        default="https://voice-detection-nu.vercel.app/",
        description="Project or Portfolio URL"
    )

    # Security: Swagger/OpenAPI docs are disabled by default in production.
    # Set DOCS_ENABLED=true in .env for local development.
    DOCS_ENABLED: bool = Field(
        default=False,
        description="Enable /docs, /redoc, and /openapi.json endpoints (disable in production)"
    )

    # CORS Settings
    # Use str field with alias to read env var safely (avoids Pydantic trying to parse as JSON)
    ALLOWED_ORIGINS_RAW: str = Field(default="*", alias="ALLOWED_ORIGINS")

    @property
    def ALLOWED_ORIGINS(self) -> List[str]:
        """Parse the raw CORS origins string into a list of origin strings.

        Accepts either a JSON array (e.g. ``'["https://a", "https://b"]'``)
        or a comma-separated string (e.g. ``"https://a, https://b"``).
        Falls back to comma-splitting when the JSON is malformed or does not
        decode to a list. Entries are whitespace-stripped and blanks dropped
        on both paths, so the declared List[str] contract always holds.
        """
        raw_value: str = self.ALLOWED_ORIGINS_RAW
        if raw_value.strip().startswith("["):
            try:
                parsed = json.loads(raw_value)
            except json.JSONDecodeError:
                # Malformed JSON: fall through to comma-separated parsing.
                parsed = None
            if isinstance(parsed, list):
                # Coerce entries to str and normalize, mirroring the CSV path;
                # guards against JSON like '[1, 2]' or entries of pure spaces.
                cleaned = [str(origin).strip() for origin in parsed]
                return [origin for origin in cleaned if origin]
        return [origin.strip() for origin in raw_value.split(",") if origin.strip()]

    # Audio Constraints
    MAX_AUDIO_SIZE_MB: int = 10
    SUPPORTED_LANGUAGES: List[str] = [
        "Auto", "English", "Hindi", "Hinglish", "Mixed",
        "Tamil", "Malayalam", "Telugu"
    ]
    SUPPORTED_FORMATS: List[str] = [
        "mp3", "wav", "flac", "ogg", "m4a", "mp4", "webm"
    ]

    # ASR settings
    ASR_ENABLED: bool = Field(default=True, description="Enable speech-to-text analysis for realtime sessions")
    ASR_MODEL_SIZE: str = Field(default="tiny", description="faster-whisper model size")
    ASR_COMPUTE_TYPE: str = Field(default="int8", description="faster-whisper compute type")
    ASR_BEAM_SIZE: int = Field(default=3, description="Beam size for ASR decoding")
    ASR_TIMEOUT_MS: int = Field(
        default=1200,
        ge=200,
        le=15000,
        description="Max realtime ASR duration per chunk before timeout fallback"
    )
    # Bounded concurrency caps: these exist to keep PyTorch/faster-whisper
    # thread fan-out from exhausting memory on small hosts (e.g. HF Spaces).
    ASR_MAX_INFLIGHT_TASKS: int = Field(
        default=3,
        ge=1,
        le=8,
        description="Maximum concurrent ASR background tasks allowed to prevent thread pileups"
    )
    VOICE_MAX_INFLIGHT_TASKS: int = Field(
        default=2,
        ge=1,
        le=8,
        description="Maximum concurrent Voice Analysis PyTorch tasks allowed to prevent OOM thread pileups"
    )
    ASR_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm faster-whisper model during startup to avoid first-chunk latency spike"
    )
    AUDIO_PIPELINE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm audio decoding/resampling pipeline during startup"
    )
    VOICE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Run one startup inference through voice analyzer to avoid first-chunk latency spikes"
    )

    # Voice classification model settings
    VOICE_MODEL_ID: str = Field(
        default="shivam-2211/voice-detection-model",
        description="Primary Hugging Face model id for AI voice detection"
    )
    VOICE_MODEL_BACKUP_ID: str = Field(
        default="mo-thecreator/Deepfake-audio-detection",
        description="Backup model id if primary model load fails"
    )
    VOICE_MODEL_LOCAL_PATH: str = Field(
        default="./fine_tuned_model",
        description="Optional local model path that takes priority when present"
    )
    MODEL_LOGIT_TEMPERATURE: float = Field(
        default=1.5,
        ge=1.0,
        le=10.0,
        description="Temperature scaling for model logits before softmax. Higher values reduce overconfidence. 1.0 = no scaling."
    )
    REALTIME_LIGHTWEIGHT_AUDIO: bool = Field(
        default=False,
        description="Use lightweight audio analysis path for realtime chunk processing (set true for throughput-first mode)"
    )
    LEGACY_FALLBACK_RETURNS_UNCERTAIN: bool = Field(
        default=True,
        description="Return UNCERTAIN classification on legacy endpoint when ML fallback occurs"
    )

    # Risk policy (versioned + configurable weights)
    # NOTE: the four weights below default to a 1.0 total; consumers of this
    # config presumably normalize or assume that — confirm before retuning.
    RISK_POLICY_VERSION: str = Field(default="v1.2", description="Version tag for realtime risk policy")
    RISK_WEIGHT_AUDIO: float = Field(default=0.45, ge=0.0, le=1.0)
    RISK_WEIGHT_KEYWORD: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_WEIGHT_SEMANTIC: float = Field(default=0.15, ge=0.0, le=1.0)
    RISK_WEIGHT_BEHAVIOUR: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_DELTA_BOOST_FACTOR: float = Field(
        default=0.30,
        ge=0.0,
        le=1.0,
        description="How strongly risk increases when per-chunk delta is positive"
    )

    # Optional LLM semantic verifier (second-layer, not primary classifier)
    LLM_SEMANTIC_ENABLED: bool = Field(default=False)
    LLM_PROVIDER: str = Field(default="gemini", description="LLM provider: openai or gemini")
    LLM_SEMANTIC_MODEL: str = Field(default="gemini-2.5-flash", description="Model name for selected LLM provider (optional)")
    LLM_SEMANTIC_TIMEOUT_MS: int = Field(default=900, ge=100, le=5000)
    LLM_SEMANTIC_MIN_ASR_CONFIDENCE: float = Field(default=0.35, ge=0.0, le=1.0)
    LLM_SEMANTIC_CHUNK_INTERVAL: int = Field(default=2, ge=1, le=20)
    LLM_SEMANTIC_BLEND_WEIGHT: float = Field(
        default=0.20,
        ge=0.0,
        le=1.0,
        description="Weight assigned to LLM semantic score in fused semantic score"
    )
    OPENAI_API_KEY: str | None = Field(default=None, description="Optional OpenAI API key for LLM semantic verifier")
    GEMINI_API_KEY: str | None = Field(default=None, description="Optional Gemini API key for LLM semantic verifier")

    # Session store backend
    SESSION_STORE_BACKEND: str = Field(
        default="redis",
        description="Session store backend: memory or redis"
    )
    REDIS_URL: str | None = Field(
        default=None,
        description="Redis URL for session state and queue (required when SESSION_STORE_BACKEND=redis)"
    )
    REDIS_PREFIX: str = Field(
        default="ai_call_shield",
        description="Redis key prefix namespace"
    )
    REDIS_CONNECT_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)
    REDIS_IO_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)

    # Deep-lane async verification controls
    DEEP_LANE_ENABLED: bool = Field(
        default=False,
        description="Enable asynchronous deep-lane verification after fast-lane decision"
    )
    DEEP_LANE_QUEUE_BACKEND: str = Field(
        default="memory",
        description="Queue backend: memory or redis"
    )
    DEEP_LANE_MAX_WORKERS: int = Field(default=2, ge=1, le=16)
    DEEP_LANE_MAX_RETRIES: int = Field(default=1, ge=0, le=10)
    DEEP_LANE_RETRY_BACKOFF_MS: int = Field(default=500, ge=0, le=60000)
    DEEP_LANE_TARGET_LATENCY_MS: int = Field(default=3000, ge=200, le=10000)

    # Performance targets (for harness/reporting and CI gates)
    PERF_CHUNK_P95_TARGET_MS: int = Field(default=1200, ge=100, le=10000)
    PERF_ALERT_P95_TARGET_MS: int = Field(default=2500, ge=100, le=10000)

    # Session retention and privacy controls
    SESSION_ACTIVE_RETENTION_SECONDS: int = Field(
        default=1800,
        description="Retention TTL for active sessions with no updates"
    )
    SESSION_ENDED_RETENTION_SECONDS: int = Field(
        default=300,
        description="Retention TTL for ended sessions before purge"
    )
    MASK_TRANSCRIPT_OUTPUT: bool = Field(
        default=True,
        description="Mask sensitive entities from transcript before returning response"
    )

    # WebSocket limits
    WS_MAX_DURATION_SECONDS: int = Field(
        default=1800,
        description="Maximum WebSocket connection duration in seconds (30 min)"
    )
    WS_IDLE_TIMEOUT_SECONDS: int = Field(
        default=120,
        description="Close WebSocket if no message received within this many seconds"
    )

    # Environment Specific
    SPACE_ID: str | None = Field(default=None, description="Hugging Face Space ID if running in Spaces")

    # Pydantic v2 settings: read .env, match env var names exactly,
    # and silently ignore unknown variables.
    model_config = {
        "env_file": ".env",
        "case_sensitive": True,
        "extra": "ignore"
    }
# Global settings instance shared by the whole application.
# NOTE(review): instantiated at import time — pydantic will raise a
# ValidationError on import if the required API_KEY is not present in the
# environment or the .env file.
settings = Settings()