jcudit's picture
jcudit HF Staff
feat: implement cross-mode robustness fixes (phases 1-8)
95e1515
"""
Gradio web interface for Voice Tools.
Provides a user-friendly web UI for uploading audio files, configuring
extraction parameters, and downloading results.
"""
import logging
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional, Tuple
# Configure SSL context BEFORE any imports that might trigger model downloads
from src.config.ssl_config import configure_ssl_context
configure_ssl_context()
import gradio as gr
from src.models.processing_job import ExtractionMode, ProcessingJob
from src.services.batch_processor import BatchProcessor
from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler
from src.web.tabs.speaker_extraction import create_speaker_extraction_tab
from src.web.tabs.speaker_separation import create_speaker_separation_tab
from src.web.tabs.voice_denoising import create_voice_denoising_tab
logger = logging.getLogger(__name__)
# Custom CSS for better styling.
# Declares three classes: .container (max width, centered), .header and
# .footer (centered text with padding; the footer also gets a grey color).
# NOTE(review): nothing in the visible part of this module passes
# `custom_css` to Gradio, so these rules appear to be unused here -- confirm
# whether they are applied elsewhere or still need to be wired in.
custom_css = """
.container {
max-width: 1200px;
margin: auto;
}
.header {
text-align: center;
padding: 20px;
}
.footer {
text-align: center;
padding: 10px;
color: #666;
}
"""
# Markdown/HTML bodies used by the UI. They are kept as module-level
# constants so the literal text stays flush-left regardless of how deeply the
# layout code that consumes them is nested.
_HEADER_MARKDOWN = """
# 🎤 Voice Tools
Extract and profile specific voices from audio files using AI-powered
speaker diarization and voice matching.
Choose a workflow below to get started.
"""

_VOICE_EXTRACTION_INTRO = """
Extract specific voices from audio files using a reference clip.
Upload a reference voice clip and one or more audio files to extract
matching voice segments.
"""

_QUICK_START_GUIDE = """
**Quick Start Guide:**
1. **Upload Reference Voice**: A short, clear clip (5-30 seconds) of the voice you want to extract
2. **Upload Audio Files**: One or more files to process (can be long recordings)
3. **Select Mode**: Choose what to extract:
- **Speech**: Only spoken words and sentences
- **Nonverbal**: Sighs, laughs, moans, humming, etc.
- **Both**: Everything from the matched voice
4. **Start Extraction**: Click the button and wait for results
5. **Download**: Get individual segments or download everything as a ZIP
**Tips for Best Results:**
- Use a high-quality reference clip with minimal background noise
- Reference should contain only the target voice (no other speakers)
- Enable VAD optimization for faster processing of sparse audio
- Adjust voice threshold if you're getting too many/few matches
"""

_FOOTER_HTML = """
---
<div class="footer">
Voice Tools v0.1.0 | Powered by Gradio, PyAnnote, and Transformers
</div>
"""


def _threshold_slider(label: str, value: float) -> gr.Slider:
    """Create a 0.0-1.0 slider with 0.05 steps, as used by every threshold setting."""
    return gr.Slider(minimum=0.0, maximum=1.0, value=value, step=0.05, label=label)


def _create_voice_extraction_tab() -> None:
    """
    Build the "Voice Extraction" tab: inputs, settings, results and events.

    Intended to be called inside an open ``gr.Tabs()`` context, the same way
    the other ``create_*_tab`` builders are invoked from ``create_app``.
    """
    with gr.Tab("Voice Extraction"):
        gr.Markdown(_VOICE_EXTRACTION_INTRO)

        # NOTE(review): the two columns below are not wrapped in a gr.Row --
        # confirm whether a side-by-side layout was intended.
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("### 📤 Input Files")
            reference_audio = gr.Audio(
                label="Reference Voice",
                type="filepath",
                sources=["upload"],
            )
            input_files = gr.File(
                label="Audio Files to Process",
                file_count="multiple",
                file_types=[".m4a", ".wav", ".mp3", ".flac"],
            )

            # Configuration Section
            gr.Markdown("### ⚙️ Configuration")
            with gr.Row():
                extraction_mode = gr.Radio(
                    choices=["Speech", "Nonverbal", "Both"],
                    value="Speech",
                    label="Extraction Mode",
                )

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    vad_threshold = _threshold_slider("VAD Threshold", 0.5)
                    voice_threshold = _threshold_slider("Voice Match Threshold", 0.7)
                with gr.Row():
                    speech_threshold = _threshold_slider(
                        "Speech Classification Threshold", 0.6
                    )
                    enable_vad = gr.Checkbox(
                        value=True,
                        label="Enable VAD Optimization",
                    )

            # Action Buttons
            with gr.Row():
                estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary")
                process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg")
                # Clears only the two upload inputs; results are left in place.
                gr.ClearButton(
                    components=[reference_audio, input_files], value="🗑️ Clear"
                )

        with gr.Column(scale=1):
            # Output Section
            gr.Markdown("### 📊 Results")

            # Status
            # No gr.Progress() is instantiated here: it is not a layout
            # component, and an instance created during layout is never
            # attached to an event.
            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to process...",
                interactive=False,
                lines=2,
            )

            # Estimation results and statistics start hidden.
            estimate_output = gr.JSON(label="Processing Time Estimate", visible=False)
            stats_output = gr.JSON(label="Extraction Statistics", visible=False)

            # Download Section (all three file outputs start hidden)
            gr.Markdown("### 💾 Downloads")
            output_files = gr.File(
                label="Extracted Segments",
                file_count="multiple",
                interactive=False,
                visible=False,
            )
            download_zip = gr.File(
                label="Download All (ZIP)", interactive=False, visible=False
            )
            report_file = gr.File(
                label="Extraction Report", interactive=False, visible=False
            )

        # Examples Section
        gr.Markdown("### 📚 Examples")
        gr.Markdown(_QUICK_START_GUIDE)

        # Event Handlers
        estimate_btn.click(
            fn=estimate_time_handler,
            inputs=[reference_audio, input_files, vad_threshold, enable_vad],
            outputs=[estimate_output, status_output],
            api_name="estimate",
        )
        process_btn.click(
            fn=process_batch_handler,
            inputs=[
                reference_audio,
                input_files,
                extraction_mode,
                vad_threshold,
                voice_threshold,
                speech_threshold,
                enable_vad,
            ],
            outputs=[status_output, stats_output, output_files, download_zip, report_file],
            api_name="process",
        )


def create_app() -> gr.Blocks:
    """
    Create and configure the Gradio web interface.

    The app consists of a header, one tab per workflow (speaker separation,
    speaker extraction, voice denoising and voice extraction) and a footer.

    Returns:
        Configured Gradio Blocks app
    """
    with gr.Blocks(title="Voice Tools") as app:
        # Header
        gr.Markdown(_HEADER_MARKDOWN)

        # Create tabs for different workflows; each builder adds its own tab.
        with gr.Tabs():
            # Tab 1: Speaker Separation
            create_speaker_separation_tab()
            # Tab 2: Speaker Extraction
            create_speaker_extraction_tab()
            # Tab 3: Voice Denoising
            create_voice_denoising_tab()
            # Tab 4: Voice Extraction
            _create_voice_extraction_tab()

        # Footer
        gr.Markdown(_FOOTER_HTML, elem_classes=["footer"])

    return app
def launch(
    server_name: str = "0.0.0.0",
    server_port: int = 7860,
    share: bool = False,
    debug: bool = False,
):
    """
    Start the Voice Tools web interface.

    Configures logging, builds the UI with ``create_app()`` and then calls
    its ``launch()`` with error display enabled. ``debug`` only selects the
    log level here; it is not forwarded to Gradio's ``launch()``.

    Args:
        server_name: Server hostname (default: 0.0.0.0)
        server_port: Server port (default: 7860)
        share: Create public share link (default: False)
        debug: Log at DEBUG instead of INFO level (default: False)
    """
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    interface = create_app()
    logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}")

    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "share": share,
        "show_error": True,
    }
    interface.launch(**launch_options)
# Script entry point: running this module directly starts the web UI with
# debug=True, which makes launch() configure DEBUG-level logging.
if __name__ == "__main__":
    launch(debug=True)