Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Gradio web interface for Voice Tools. | |
| Provides a user-friendly web UI for uploading audio files, configuring | |
| extraction parameters, and downloading results. | |
| """ | |
| import logging | |
| import shutil | |
| import tempfile | |
| import zipfile | |
| from pathlib import Path | |
| from typing import List, Optional, Tuple | |
| # Configure SSL context BEFORE any imports that might trigger model downloads | |
| from src.config.ssl_config import configure_ssl_context | |
| configure_ssl_context() | |
| import gradio as gr | |
| from src.models.processing_job import ExtractionMode, ProcessingJob | |
| from src.services.batch_processor import BatchProcessor | |
| from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler | |
| from src.web.tabs.speaker_extraction import create_speaker_extraction_tab | |
| from src.web.tabs.speaker_separation import create_speaker_separation_tab | |
| from src.web.tabs.voice_denoising import create_voice_denoising_tab | |
logger = logging.getLogger(__name__)

# Custom CSS for better styling.
# Defines three classes: a centered, width-limited ".container", a centered
# ".header", and a centered, grey ".footer".
# NOTE(review): nothing in this module references `custom_css` (create_app
# does not pass it to gr.Blocks) — confirm whether it is meant to be applied.
custom_css = """
.container {
    max-width: 1200px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
}
.footer {
    text-align: center;
    padding: 10px;
    color: #666;
}
"""
def create_app() -> gr.Blocks:
    """
    Create and configure the Gradio web interface.

    The app consists of a Markdown header, a tab container holding the four
    workflows (speaker separation, speaker extraction, voice denoising and
    reference-based voice extraction) and a Markdown footer.

    Returns:
        Configured Gradio Blocks app
    """
    # NOTE(review): the module-level `custom_css` string is not passed to
    # gr.Blocks, so the "footer" class used below has no custom rule attached.
    # Confirm whether that is intentional before wiring it in.
    with gr.Blocks(title="Voice Tools") as app:
        # Header
        gr.Markdown(
            """
            # 🎤 Voice Tools

            Extract and profile specific voices from audio files using AI-powered
            speaker diarization and voice matching.

            Choose a workflow below to get started.
            """
        )

        # Create tabs for different workflows
        with gr.Tabs():
            # Tab 1: Speaker Separation
            create_speaker_separation_tab()
            # Tab 2: Speaker Extraction
            create_speaker_extraction_tab()
            # Tab 3: Voice Denoising
            create_voice_denoising_tab()
            # Tab 4: Voice Extraction (EXISTING)
            _create_voice_extraction_tab()

        # Footer
        gr.Markdown(
            """
            ---
            <div class="footer">
            Voice Tools v0.1.0 | Powered by Gradio, PyAnnote, and Transformers
            </div>
            """,
            elem_classes=["footer"],
        )

    return app


def _create_voice_extraction_tab() -> None:
    """
    Build the "Voice Extraction" tab and wire its event handlers.

    Must be called while a ``gr.Tabs`` context is open, the same way the
    other ``create_*_tab`` functions are used by ``create_app``.
    """
    with gr.Tab("Voice Extraction"):
        gr.Markdown(
            """
            Extract specific voices from audio files using a reference clip.
            Upload a reference voice clip and one or more audio files to extract
            matching voice segments.
            """
        )

        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("### 📤 Input Files")
            reference_audio = gr.Audio(
                label="Reference Voice",
                type="filepath",
                sources=["upload"],
            )
            input_files = gr.File(
                label="Audio Files to Process",
                file_count="multiple",
                file_types=[".m4a", ".wav", ".mp3", ".flac"],
            )

            # Configuration Section
            gr.Markdown("### ⚙️ Configuration")
            with gr.Row():
                extraction_mode = gr.Radio(
                    choices=["Speech", "Nonverbal", "Both"],
                    value="Speech",
                    label="Extraction Mode",
                )

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    vad_threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        label="VAD Threshold",
                    )
                    voice_threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.7,
                        step=0.05,
                        label="Voice Match Threshold",
                    )
                with gr.Row():
                    speech_threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.6,
                        step=0.05,
                        label="Speech Classification Threshold",
                    )
                    enable_vad = gr.Checkbox(
                        value=True,
                        label="Enable VAD Optimization",
                    )

            # Action Buttons
            with gr.Row():
                estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary")
                process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg")
                # The clear button only resets the two upload inputs; it needs
                # no handler wiring, so the component is not bound to a name.
                gr.ClearButton(
                    components=[reference_audio, input_files], value="🗑️ Clear"
                )

        with gr.Column(scale=1):
            # Output Section
            gr.Markdown("### 📊 Results")

            # Status
            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to process...",
                interactive=False,
                lines=2,
            )
            # A gr.Progress() instance used to be created here. It is not a
            # layout component (it is meant to be a default argument of an
            # event handler) and the local was never used, so it was dropped.

            # Estimation results
            estimate_output = gr.JSON(label="Processing Time Estimate", visible=False)

            # Statistics
            stats_output = gr.JSON(label="Extraction Statistics", visible=False)

            # Download Section
            gr.Markdown("### 💾 Downloads")
            output_files = gr.File(
                label="Extracted Segments",
                file_count="multiple",
                interactive=False,
                visible=False,
            )
            download_zip = gr.File(
                label="Download All (ZIP)", interactive=False, visible=False
            )
            report_file = gr.File(
                label="Extraction Report", interactive=False, visible=False
            )

        # Examples Section
        gr.Markdown("### 📚 Examples")
        gr.Markdown(
            """
            **Quick Start Guide:**

            1. **Upload Reference Voice**: A short, clear clip (5-30 seconds) of the voice you want to extract
            2. **Upload Audio Files**: One or more files to process (can be long recordings)
            3. **Select Mode**: Choose what to extract:
               - **Speech**: Only spoken words and sentences
               - **Nonverbal**: Sighs, laughs, moans, humming, etc.
               - **Both**: Everything from the matched voice
            4. **Start Extraction**: Click the button and wait for results
            5. **Download**: Get individual segments or download everything as a ZIP

            **Tips for Best Results:**

            - Use a high-quality reference clip with minimal background noise
            - Reference should contain only the target voice (no other speakers)
            - Enable VAD optimization for faster processing of sparse audio
            - Adjust voice threshold if you're getting too many/few matches
            """
        )

        # Event Handlers
        estimate_btn.click(
            fn=estimate_time_handler,
            inputs=[reference_audio, input_files, vad_threshold, enable_vad],
            outputs=[estimate_output, status_output],
            api_name="estimate",
        )
        process_btn.click(
            fn=process_batch_handler,
            inputs=[
                reference_audio,
                input_files,
                extraction_mode,
                vad_threshold,
                voice_threshold,
                speech_threshold,
                enable_vad,
            ],
            outputs=[status_output, stats_output, output_files, download_zip, report_file],
            api_name="process",
        )
def launch(
    server_name: str = "0.0.0.0",
    server_port: int = 7860,
    share: bool = False,
    debug: bool = False,
):
    """
    Configure logging, build the app and start the Gradio server.

    Args:
        server_name: Server hostname (default: 0.0.0.0)
        server_port: Server port (default: 7860)
        share: Create public share link (default: False)
        debug: Enable debug mode (default: False)
    """
    # Debug mode only affects the root logging level.
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    ui = create_app()
    logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}")

    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "share": share,
        "show_error": True,
    }
    ui.launch(**launch_options)
# Running this module as a script starts the web interface with debug logging.
if __name__ == "__main__":
    launch(debug=True)