---
# Docker Compose definition for the single `app` service (container
# `precisionvoice`) and the named volumes it uses for model caches.
#
# Every `${VAR:-default}` below is resolved by Compose from the shell
# environment or an `.env` file; the default after `:-` is used when VAR is
# unset or empty.

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # The same PORT value is used for the port mapping below.
        - PORT=${PORT:-7860}
    container_name: precisionvoice
    ports:
      # Host port and container port are intentionally the same value.
      - "${PORT:-7860}:${PORT:-7860}"
    volumes:
      # Persist uploaded/processed files
      - ./data:/app/data
      # Cache models to avoid re-downloading
      - model_cache_hf:/root/.cache/huggingface
      - model_cache_torch:/root/.cache/torch
      - model_cache_mdx:/root/.audio-separator-models
    environment:
      # HuggingFace token (required for pyannote.audio); defaults to empty
      - HF_TOKEN=${HF_TOKEN:-}

      # Model settings
      - WHISPER_MODEL=${WHISPER_MODEL:-erax-ai/EraX-WoW-Turbo-V1.1-CT2}
      - DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}

      # Device (auto, cuda, cpu)
      - DEVICE=${DEVICE:-auto}

      # Speech Enhancement (SpeechBrain SepFormer)
      - ENABLE_SPEECH_ENHANCEMENT=${ENABLE_SPEECH_ENHANCEMENT:-True}
      - ENHANCEMENT_MODEL=${ENHANCEMENT_MODEL:-speechbrain/sepformer-dns4-16k-enhancement}

      # MDX-Net Vocal Separation
      - ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
      - MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}

      # Upload settings
      - MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}

      # Optimization settings
      - ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
      - ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}

      # VAD settings
      - VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
      - VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
      - VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}

      # Clustering settings
      - MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
      - MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
    restart: unless-stopped

    # GPU support (uncomment for NVIDIA GPU)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

# Named volumes backing the model caches mounted above. Each one sets an
# explicit `name`.
volumes:
  model_cache_hf:
    name: precisionvoice_hf_cache
  model_cache_torch:
    name: precisionvoice_torch_cache
  model_cache_mdx:
    name: precisionvoice_mdx_cache