# PoC_ASR_v1 / docker-compose.yml
# vyluong's picture
# Upload folder using huggingface_hub
# 5ab6c6e verified
# Single-service stack: builds the image from the local Dockerfile and runs it
# as the "precisionvoice" container. Every ${VAR:-default} below can be
# overridden from the shell environment or an .env file.
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Same variable and default as the published port below, so the
        # image and the port mapping stay in agreement.
        - PORT=${PORT:-7860}
    container_name: precisionvoice
    ports:
      # Host port and container port both follow PORT (default 7860).
      - "${PORT:-7860}:${PORT:-7860}"
    volumes:
      # Persist uploaded/processed files
      - ./data:/app/data
      # Cache models to avoid re-downloading
      - model_cache_hf:/root/.cache/huggingface
      - model_cache_torch:/root/.cache/torch
      - model_cache_mdx:/root/.audio-separator-models
    environment:
      # HuggingFace token (required for pyannote.audio)
      - HF_TOKEN=${HF_TOKEN:-}
      # Model settings
      - WHISPER_MODEL=${WHISPER_MODEL:-kiendt/PhoWhisper-large-ct2}
      - DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}
      # Device (auto, cuda, cpu)
      - DEVICE=${DEVICE:-auto}
      # Denoising (Speech Enhancement)
      - ENABLE_DENOISER=${ENABLE_DENOISER:-True}
      - DENOISER_MODEL=${DENOISER_MODEL:-dns64}
      # MDX-Net Vocal Separation
      - ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
      - MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}
      # Upload settings
      - MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}
      # Optimization settings
      - ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
      - ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}
      # VAD settings
      - VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
      - VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
      - VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}
      # Clustering settings
      - MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
      - MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
    restart: unless-stopped
    # GPU support (uncomment for NVIDIA GPU)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]
# Named volumes for the model caches mounted by the app service.
# Each `name` sets the volume's actual name explicitly.
volumes:
  model_cache_hf:
    name: precisionvoice_hf_cache
  model_cache_torch:
    name: precisionvoice_torch_cache
  model_cache_mdx:
    name: precisionvoice_mdx_cache