# Hugging Face Space: jcudit/voice-tools (Running on Zero)
# File size: 9,806 Bytes

"""
Gradio web interface for Voice Tools.

Provides a user-friendly web UI for uploading audio files, configuring
extraction parameters, and downloading results.
"""

import logging
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional, Tuple

# Configure SSL context BEFORE any imports that might trigger model downloads
from src.config.ssl_config import configure_ssl_context

configure_ssl_context()

import gradio as gr

from src.models.processing_job import ExtractionMode, ProcessingJob
from src.services.batch_processor import BatchProcessor
from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler
from src.web.tabs.speaker_extraction import create_speaker_extraction_tab
from src.web.tabs.speaker_separation import create_speaker_separation_tab
from src.web.tabs.voice_denoising import create_voice_denoising_tab

logger = logging.getLogger(__name__)

# Custom CSS for better styling.
# Defines three classes: `.container` (centered, max 1200px wide), `.header`
# (centered, padded) and `.footer` (centered, padded, grey text). The rules
# only take effect if this string is handed to Gradio as the app stylesheet.
custom_css = """
.container {
    max-width: 1200px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
}
.footer {
    text-align: center;
    padding: 10px;
    color: #666;
}
"""


def create_app() -> gr.Blocks:
    """
    Create and configure the Gradio web interface.

    The app consists of a Markdown header, four workflow tabs and a footer:

    1. Speaker Separation  (built by ``create_speaker_separation_tab``)
    2. Speaker Extraction  (built by ``create_speaker_extraction_tab``)
    3. Voice Denoising     (built by ``create_voice_denoising_tab``)
    4. Voice Extraction    (built inline below)

    The Voice Extraction tab wires two buttons to handlers:

    * "Estimate Processing Time" -> ``estimate_time_handler`` (API name ``estimate``)
    * "Start Extraction"         -> ``process_batch_handler`` (API name ``process``)

    The module-level ``custom_css`` stylesheet is attached to the Blocks app so
    that the ``footer`` class used by the footer Markdown is actually styled.

    Returns:
        Configured Gradio Blocks app (not yet launched)
    """

    # Attach the stylesheet here; previously `custom_css` was defined but never
    # used, so the `.footer` rules referenced below had no effect.
    # NOTE(review): recent Gradio major versions prefer passing `css` to
    # `launch()` instead - confirm against the pinned Gradio version.
    with gr.Blocks(title="Voice Tools", css=custom_css) as app:
        # Header
        gr.Markdown(
            """
            # 🎤 Voice Tools

            Extract and profile specific voices from audio files using AI-powered
            speaker diarization and voice matching.

            Choose a workflow below to get started.
            """
        )

        # Create tabs for different workflows
        with gr.Tabs():
            # Tab 1: Speaker Separation
            create_speaker_separation_tab()

            # Tab 2: Speaker Extraction
            create_speaker_extraction_tab()

            # Tab 3: Voice Denoising
            create_voice_denoising_tab()

            # Tab 4: Voice Extraction (EXISTING)
            with gr.Tab("Voice Extraction"):
                gr.Markdown(
                    """
                    Extract specific voices from audio files using a reference clip.
                    Upload a reference voice clip and one or more audio files to extract
                    matching voice segments.
                    """
                )

                # NOTE(review): the two columns below are not wrapped in a
                # gr.Row, so they stack vertically and `scale` has nothing to
                # be relative to - confirm whether side-by-side was intended.
                with gr.Column(scale=1):
                    # Input Section
                    gr.Markdown("### 📤 Input Files")

                    # type="filepath" hands the handler a path on disk rather
                    # than decoded audio samples.
                    reference_audio = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload"],
                    )

                    input_files = gr.File(
                        label="Audio Files to Process",
                        file_count="multiple",
                        file_types=[".m4a", ".wav", ".mp3", ".flac"],
                    )

                    # Configuration Section
                    gr.Markdown("### ⚙️ Configuration")

                    with gr.Row():
                        extraction_mode = gr.Radio(
                            choices=["Speech", "Nonverbal", "Both"],
                            value="Speech",
                            label="Extraction Mode",
                        )

                    # Thresholds are collapsed by default; all are 0.0-1.0
                    # sliders with a 0.05 step.
                    with gr.Accordion("Advanced Settings", open=False):
                        with gr.Row():
                            vad_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.5,
                                step=0.05,
                                label="VAD Threshold",
                            )

                            voice_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.7,
                                step=0.05,
                                label="Voice Match Threshold",
                            )

                        with gr.Row():
                            speech_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.6,
                                step=0.05,
                                label="Speech Classification Threshold",
                            )

                            enable_vad = gr.Checkbox(
                                value=True,
                                label="Enable VAD Optimization",
                            )

                    # Action Buttons
                    with gr.Row():
                        estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary")
                        process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg")
                        # Clears only the two upload inputs; settings and
                        # results are left untouched.
                        gr.ClearButton(
                            components=[reference_audio, input_files], value="🗑️ Clear"
                        )

                with gr.Column(scale=1):
                    # Output Section
                    gr.Markdown("### 📊 Results")

                    # Status text. Progress reporting is not declared here:
                    # gr.Progress is not a layout component and only works as
                    # a default argument of an event handler, so the former
                    # standalone `gr.Progress()` instance was a no-op.
                    status_output = gr.Textbox(
                        label="Status",
                        placeholder="Ready to process...",
                        interactive=False,
                        lines=2,
                    )

                    # The result components below start hidden.
                    # Estimation results
                    estimate_output = gr.JSON(label="Processing Time Estimate", visible=False)

                    # Statistics
                    stats_output = gr.JSON(label="Extraction Statistics", visible=False)

                    # Download Section
                    gr.Markdown("### 💾 Downloads")

                    output_files = gr.File(
                        label="Extracted Segments",
                        file_count="multiple",
                        interactive=False,
                        visible=False,
                    )

                    download_zip = gr.File(
                        label="Download All (ZIP)", interactive=False, visible=False
                    )

                    report_file = gr.File(
                        label="Extraction Report", interactive=False, visible=False
                    )

                # Examples Section
                gr.Markdown("### 📚 Examples")
                gr.Markdown(
                    """
                    **Quick Start Guide:**

                    1. **Upload Reference Voice**: A short, clear clip (5-30 seconds) of the voice you want to extract
                    2. **Upload Audio Files**: One or more files to process (can be long recordings)
                    3. **Select Mode**: Choose what to extract:
                       - **Speech**: Only spoken words and sentences
                       - **Nonverbal**: Sighs, laughs, moans, humming, etc.
                       - **Both**: Everything from the matched voice
                    4. **Start Extraction**: Click the button and wait for results
                    5. **Download**: Get individual segments or download everything as a ZIP

                    **Tips for Best Results:**
                    - Use a high-quality reference clip with minimal background noise
                    - Reference should contain only the target voice (no other speakers)
                    - Enable VAD optimization for faster processing of sparse audio
                    - Adjust voice threshold if you're getting too many/few matches
                    """
                )

                # Event Handlers
                # The order of `inputs` / `outputs` must match the handler's
                # positional parameters and return values respectively.
                estimate_btn.click(
                    fn=estimate_time_handler,
                    inputs=[reference_audio, input_files, vad_threshold, enable_vad],
                    outputs=[estimate_output, status_output],
                    api_name="estimate",
                )

                process_btn.click(
                    fn=process_batch_handler,
                    inputs=[
                        reference_audio,
                        input_files,
                        extraction_mode,
                        vad_threshold,
                        voice_threshold,
                        speech_threshold,
                        enable_vad,
                    ],
                    outputs=[status_output, stats_output, output_files, download_zip, report_file],
                    api_name="process",
                )

        # Footer (styled by the `.footer` rule in `custom_css`)
        gr.Markdown(
            """
            ---
            <div class="footer">
            Voice Tools v0.1.0 | Powered by Gradio, PyAnnote, and Transformers
            </div>
            """,
            elem_classes=["footer"],
        )

    return app


def launch(
    server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False, debug: bool = False
):
    """
    Configure logging, build the app and start the Gradio server.

    Args:
        server_name: Host/interface the server binds to (default: 0.0.0.0)
        server_port: Port the server listens on (default: 7860)
        share: Whether to request a public share link (default: False)
        debug: Log at DEBUG level instead of INFO (default: False)
    """
    # `debug` only selects the root logging level.
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Server options are gathered up front; show_error is always enabled.
    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "share": share,
        "show_error": True,
    }

    app = create_app()

    logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}")

    app.launch(**launch_options)


# Script entry point: running this module directly starts the web interface
# with default host/port and DEBUG-level logging.
if __name__ == "__main__":
    launch(debug=True)