# PoC_ASR_v5 / docker-compose.yml
# vyluong's picture
# PoC deployment
# 4d6b6c4 verified
services:
  # The one application service, built from the Dockerfile in this directory.
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Build-time PORT; 7860 is used when PORT is unset or empty.
        PORT: "${PORT:-7860}"
    container_name: precisionvoice
    ports:
      # The same port number is published on the host and inside the container.
      - "${PORT:-7860}:${PORT:-7860}"
    volumes:
      # Host ./data is bind-mounted at /app/data so uploaded/processed files
      # persist.
      - ./data:/app/data
      # Named volumes for model caches, so models are not re-downloaded.
      - model_cache_hf:/root/.cache/huggingface
      - model_cache_torch:/root/.cache/torch
      - model_cache_mdx:/root/.audio-separator-models
    environment:
      # Each value below is interpolated by Compose; the text after `:-` is the
      # default applied when the variable is unset or empty. Values are quoted
      # so they always reach the container as strings.

      # HuggingFace token (required for pyannote.audio); defaults to empty.
      HF_TOKEN: "${HF_TOKEN:-}"

      # Models used for transcription and speaker diarization.
      WHISPER_MODEL: "${WHISPER_MODEL:-erax-ai/EraX-WoW-Turbo-V1.1-CT2}"
      DIARIZATION_MODEL: "${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}"

      # Compute device: auto, cuda, or cpu.
      DEVICE: "${DEVICE:-auto}"

      # Speech enhancement (SpeechBrain SepFormer).
      ENABLE_SPEECH_ENHANCEMENT: "${ENABLE_SPEECH_ENHANCEMENT:-True}"
      ENHANCEMENT_MODEL: "${ENHANCEMENT_MODEL:-speechbrain/sepformer-dns4-16k-enhancement}"

      # Vocal separation (MDX-Net).
      ENABLE_VOCAL_SEPARATION: "${ENABLE_VOCAL_SEPARATION:-True}"
      MDX_MODEL: "${MDX_MODEL:-UVR-MDX-NET-Voc_FT}"

      # Upload size limit, in megabytes (per the variable name).
      MAX_UPLOAD_SIZE_MB: "${MAX_UPLOAD_SIZE_MB:-100}"

      # Optimization toggles.
      ENABLE_LOUDNORM: "${ENABLE_LOUDNORM:-True}"
      ENABLE_NOISE_REDUCTION: "${ENABLE_NOISE_REDUCTION:-True}"

      # Voice-activity-detection settings (durations in ms, per the names).
      VAD_THRESHOLD: "${VAD_THRESHOLD:-0.5}"
      VAD_MIN_SPEECH_DURATION_MS: "${VAD_MIN_SPEECH_DURATION_MS:-250}"
      VAD_MIN_SILENCE_DURATION_MS: "${VAD_MIN_SILENCE_DURATION_MS:-500}"

      # Clustering settings (values in seconds, per the `_S` suffix).
      MERGE_THRESHOLD_S: "${MERGE_THRESHOLD_S:-0.5}"
      MIN_SEGMENT_DURATION_S: "${MIN_SEGMENT_DURATION_S:-0.3}"
    restart: unless-stopped
    # GPU support (uncomment for NVIDIA GPU)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]
volumes:
  # Named volumes for the model caches mounted by the `app` service. Each one
  # is given an explicit `name` instead of a Compose-generated one.
  model_cache_hf:
    name: precisionvoice_hf_cache
  model_cache_torch:
    name: precisionvoice_torch_cache
  model_cache_mdx:
    name: precisionvoice_mdx_cache