# PoC_PrecisionVoice_test / .env.example
# vyluong's picture
# PoC deployment
# 832e106 verified
# Environment configuration for PrecisionVoice.
# This is a template: copy it to `.env` and replace the placeholder values.
# HuggingFace access token (required for pyannote.audio).
# 1. Create a token at: https://huggingface.co/settings/tokens
# 2. Accept the model terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
# Keep the real token out of version control.
HF_TOKEN=your_huggingface_token_here
# Model settings
# Speech-to-text model identifier.
# NOTE(review): looks like a HuggingFace repo id in CTranslate2 ("-ct2") format — confirm against the loader.
WHISPER_MODEL=kiendt/PhoWhisper-large-ct2
# Speaker-diarization model identifier (the pyannote model whose terms must be accepted for HF_TOKEN).
DIARIZATION_MODEL=pyannote/speaker-diarization-3.1
# Compute device: cuda, cpu, or auto.
# NOTE(review): auto presumably picks cuda when available and falls back to cpu — confirm in the app.
DEVICE=auto
# --- Denoising (Speech Enhancement) ---
# Set to True to enable speech enhancement (removes background noise, hum, etc.).
ENABLE_DENOISER=True
# Denoiser model to use. One of: dns64 (standard), dns48, master64.
DENOISER_MODEL=dns64
# --- MDX-Net Vocal Separation ---
# Set to True to run vocal separation before transcription (isolates the voice from music/noise).
# According to the original author, this is more effective than the basic Demucs implementation.
ENABLE_VOCAL_SEPARATION=True
# MDX-Net model file name. Kim_Vocal_2.onnx is the recommended model for vocals.
MDX_MODEL=Kim_Vocal_2.onnx
# Upload settings
# Maximum upload size, in megabytes.
MAX_UPLOAD_SIZE_MB=100
# --- Optimization Settings ---
# Set to True to enable a subtle high-pass filter (removes low-frequency rumble below 80 Hz).
# NOTE(review): the key says "noise reduction" but only a high-pass filter is described — confirm nothing broader is applied.
ENABLE_NOISE_REDUCTION=True
# Set to True to enable loudness normalization (EBU R128); False disables it.
ENABLE_LOUDNORM=True
# --- VAD (Voice Activity Detection) Settings ---
# Speech-detection threshold, from 0.0 to 1.0. Higher values are stricter.
VAD_THRESHOLD=0.5
# Speech segments shorter than this are ignored (milliseconds).
VAD_MIN_SPEECH_DURATION_MS=250
# Minimum silence duration required to split segments (milliseconds).
VAD_MIN_SILENCE_DURATION_MS=500
# --- Post-processing (Clustering) Settings ---
# Merge segments from the same speaker when the gap between them is less than this (seconds).
MERGE_THRESHOLD_S=0.5
# Drop segments shorter than this (seconds) to remove blips/noise.
MIN_SEGMENT_DURATION_S=0.3
# Server settings
# Address for the server (presumably the bind address — confirm in the app's startup code).
# 0.0.0.0 means all network interfaces; use 127.0.0.1 to accept local connections only.
HOST=0.0.0.0
# Port for the server.
PORT=8000