""" Gradio web interface for Voice Tools. Provides a user-friendly web UI for uploading audio files, configuring extraction parameters, and downloading results. """ import logging import shutil import tempfile import zipfile from pathlib import Path from typing import List, Optional, Tuple # Configure SSL context BEFORE any imports that might trigger model downloads from src.config.ssl_config import configure_ssl_context configure_ssl_context() import gradio as gr from src.models.processing_job import ExtractionMode, ProcessingJob from src.services.batch_processor import BatchProcessor from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler from src.web.tabs.speaker_extraction import create_speaker_extraction_tab from src.web.tabs.speaker_separation import create_speaker_separation_tab from src.web.tabs.voice_denoising import create_voice_denoising_tab logger = logging.getLogger(__name__) # Custom CSS for better styling custom_css = """ .container { max-width: 1200px; margin: auto; } .header { text-align: center; padding: 20px; } .footer { text-align: center; padding: 10px; color: #666; } """ def create_app() -> gr.Blocks: """ Create and configure the Gradio web interface. Returns: Configured Gradio Blocks app """ with gr.Blocks(title="Voice Tools") as app: # Header gr.Markdown( """ # 🎤 Voice Tools Extract and profile specific voices from audio files using AI-powered speaker diarization and voice matching. Choose a workflow below to get started. """ ) # Create tabs for different workflows with gr.Tabs(): # Tab 1: Speaker Separation create_speaker_separation_tab() # Tab 2: Speaker Extraction create_speaker_extraction_tab() # Tab 3: Voice Denoising create_voice_denoising_tab() # Tab 4: Voice Extraction (EXISTING) with gr.Tab("Voice Extraction"): gr.Markdown( """ Extract specific voices from audio files using a reference clip. Upload a reference voice clip and one or more audio files to extract matching voice segments. """ ) with gr.Column(scale=1): # Input Section gr.Markdown("### 📤 Input Files") reference_audio = gr.Audio( label="Reference Voice", type="filepath", sources=["upload"], ) input_files = gr.File( label="Audio Files to Process", file_count="multiple", file_types=[".m4a", ".wav", ".mp3", ".flac"], ) # Configuration Section gr.Markdown("### ⚙️ Configuration") with gr.Row(): extraction_mode = gr.Radio( choices=["Speech", "Nonverbal", "Both"], value="Speech", label="Extraction Mode", ) with gr.Accordion("Advanced Settings", open=False): with gr.Row(): vad_threshold = gr.Slider( minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="VAD Threshold", ) voice_threshold = gr.Slider( minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Voice Match Threshold", ) with gr.Row(): speech_threshold = gr.Slider( minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Speech Classification Threshold", ) enable_vad = gr.Checkbox( value=True, label="Enable VAD Optimization", ) # Action Buttons with gr.Row(): estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary") process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg") clear_btn = gr.ClearButton( components=[reference_audio, input_files], value="🗑️ Clear" ) with gr.Column(scale=1): # Output Section gr.Markdown("### 📊 Results") # Status and Progress status_output = gr.Textbox( label="Status", placeholder="Ready to process...", interactive=False, lines=2, ) progress_output = gr.Progress() # Estimation results estimate_output = gr.JSON(label="Processing Time Estimate", visible=False) # Statistics stats_output = gr.JSON(label="Extraction Statistics", visible=False) # Download Section gr.Markdown("### 💾 Downloads") output_files = gr.File( label="Extracted Segments", file_count="multiple", interactive=False, visible=False, ) download_zip = gr.File( label="Download All (ZIP)", interactive=False, visible=False ) report_file = gr.File( label="Extraction Report", interactive=False, visible=False ) # Examples Section gr.Markdown("### 📚 Examples") gr.Markdown( """ **Quick Start Guide:** 1. **Upload Reference Voice**: A short, clear clip (5-30 seconds) of the voice you want to extract 2. **Upload Audio Files**: One or more files to process (can be long recordings) 3. **Select Mode**: Choose what to extract: - **Speech**: Only spoken words and sentences - **Nonverbal**: Sighs, laughs, moans, humming, etc. - **Both**: Everything from the matched voice 4. **Start Extraction**: Click the button and wait for results 5. **Download**: Get individual segments or download everything as a ZIP **Tips for Best Results:** - Use a high-quality reference clip with minimal background noise - Reference should contain only the target voice (no other speakers) - Enable VAD optimization for faster processing of sparse audio - Adjust voice threshold if you're getting too many/few matches """ ) # Event Handlers estimate_btn.click( fn=estimate_time_handler, inputs=[reference_audio, input_files, vad_threshold, enable_vad], outputs=[estimate_output, status_output], api_name="estimate", ) process_btn.click( fn=process_batch_handler, inputs=[ reference_audio, input_files, extraction_mode, vad_threshold, voice_threshold, speech_threshold, enable_vad, ], outputs=[status_output, stats_output, output_files, download_zip, report_file], api_name="process", ) # Footer gr.Markdown( """ --- """, elem_classes=["footer"], ) return app def launch( server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False, debug: bool = False ): """ Launch the Gradio web interface. Args: server_name: Server hostname (default: 0.0.0.0) server_port: Server port (default: 7860) share: Create public share link (default: False) debug: Enable debug mode (default: False) """ if debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) app = create_app() logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}") app.launch( server_name=server_name, server_port=server_port, share=share, show_error=True, ) if __name__ == "__main__": launch(debug=True)