File size: 9,806 Bytes
cb39c05
03cad88
cb39c05
 
 
 
 
 
 
 
 
 
 
 
95e1515
 
 
 
 
cb39c05
 
ba92724
 
 
 
 
 
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cad88
cb39c05
 
 
03cad88
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb2a10f
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb2a10f
 
 
 
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb2a10f
 
 
cb39c05
bb2a10f
 
 
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cad88
cb39c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cad88
cb39c05
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
"""
Gradio web interface for Voice Tools.

Provides a user-friendly web UI for uploading audio files, configuring
extraction parameters, and downloading results.
"""

import logging
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional, Tuple

# Configure SSL context BEFORE any imports that might trigger model downloads
from src.config.ssl_config import configure_ssl_context

configure_ssl_context()

import gradio as gr

from src.models.processing_job import ExtractionMode, ProcessingJob
from src.services.batch_processor import BatchProcessor
from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler
from src.web.tabs.speaker_extraction import create_speaker_extraction_tab
from src.web.tabs.speaker_separation import create_speaker_separation_tab
from src.web.tabs.voice_denoising import create_voice_denoising_tab

logger = logging.getLogger(__name__)

# Custom CSS for better styling.
# Rules defined below:
#   .container - caps the width at 1200px and applies auto margins
#   .header    - centered text with 20px padding
#   .footer    - centered text, 10px padding, grey (#666) text color
# The `.footer` class name is the one the footer Markdown in create_app()
# tags itself with (via elem_classes and an inline <div class="footer">).
custom_css = """
.container {
    max-width: 1200px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
}
.footer {
    text-align: center;
    padding: 10px;
    color: #666;
}
"""


def create_app() -> gr.Blocks:
    """
    Create and configure the Gradio web interface.

    The app consists of a Markdown header, a tab group with four workflows
    (speaker separation, speaker extraction and voice denoising are built by
    their ``create_*_tab`` helpers; "Voice Extraction" is laid out inline
    here) and a Markdown footer. Inside the Voice Extraction tab the
    "estimate" button is wired to ``estimate_time_handler`` and the "process"
    button to ``process_batch_handler``; both are also exposed under the API
    names ``estimate`` and ``process``.

    Returns:
        Configured Gradio Blocks app
    """

    # Pass the module-level stylesheet so the ``.footer`` rule actually applies
    # to the footer Markdown below (it was previously defined but never used).
    # NOTE(review): newer Gradio releases may expect ``css`` on launch()
    # instead of the Blocks constructor - confirm against the pinned version.
    with gr.Blocks(title="Voice Tools", css=custom_css) as app:
        # Header
        gr.Markdown(
            """
            # 🎤 Voice Tools

            Extract and profile specific voices from audio files using AI-powered
            speaker diarization and voice matching.

            Choose a workflow below to get started.
            """
        )

        # Create tabs for different workflows
        with gr.Tabs():
            # Tab 1: Speaker Separation
            create_speaker_separation_tab()

            # Tab 2: Speaker Extraction
            create_speaker_extraction_tab()

            # Tab 3: Voice Denoising
            create_voice_denoising_tab()

            # Tab 4: Voice Extraction (EXISTING)
            with gr.Tab("Voice Extraction"):
                gr.Markdown(
                    """
                    Extract specific voices from audio files using a reference clip.
                    Upload a reference voice clip and one or more audio files to extract
                    matching voice segments.
                    """
                )

                # NOTE(review): the two Columns below are not wrapped in a
                # gr.Row, so ``scale`` presumably has no effect and they stack
                # vertically - confirm this is the intended layout.
                with gr.Column(scale=1):
                    # Input Section
                    gr.Markdown("### 📤 Input Files")

                    reference_audio = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload"],
                    )

                    input_files = gr.File(
                        label="Audio Files to Process",
                        file_count="multiple",
                        file_types=[".m4a", ".wav", ".mp3", ".flac"],
                    )

                    # Configuration Section
                    gr.Markdown("### ⚙️ Configuration")

                    with gr.Row():
                        extraction_mode = gr.Radio(
                            choices=["Speech", "Nonverbal", "Both"],
                            value="Speech",
                            label="Extraction Mode",
                        )

                    with gr.Accordion("Advanced Settings", open=False):
                        with gr.Row():
                            vad_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.5,
                                step=0.05,
                                label="VAD Threshold",
                            )

                            voice_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.7,
                                step=0.05,
                                label="Voice Match Threshold",
                            )

                        with gr.Row():
                            speech_threshold = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.6,
                                step=0.05,
                                label="Speech Classification Threshold",
                            )

                            enable_vad = gr.Checkbox(
                                value=True,
                                label="Enable VAD Optimization",
                            )

                    # Action Buttons
                    with gr.Row():
                        estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary")
                        process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg")
                        # Clears only the two upload inputs, not the settings.
                        clear_btn = gr.ClearButton(
                            components=[reference_audio, input_files], value="🗑️ Clear"
                        )

                with gr.Column(scale=1):
                    # Output Section
                    gr.Markdown("### 📊 Results")

                    # Status
                    # (No gr.Progress() is created here: it is a tracker that
                    # event handlers accept as a parameter, not a layout
                    # component, so instantiating one in the layout was dead
                    # code.)
                    status_output = gr.Textbox(
                        label="Status",
                        placeholder="Ready to process...",
                        interactive=False,
                        lines=2,
                    )

                    # Estimation results
                    estimate_output = gr.JSON(label="Processing Time Estimate", visible=False)

                    # Statistics
                    stats_output = gr.JSON(label="Extraction Statistics", visible=False)

                    # Download Section
                    gr.Markdown("### 💾 Downloads")

                    output_files = gr.File(
                        label="Extracted Segments",
                        file_count="multiple",
                        interactive=False,
                        visible=False,
                    )

                    download_zip = gr.File(
                        label="Download All (ZIP)", interactive=False, visible=False
                    )

                    report_file = gr.File(
                        label="Extraction Report", interactive=False, visible=False
                    )

                # Examples Section
                gr.Markdown("### 📚 Examples")
                gr.Markdown(
                    """
                    **Quick Start Guide:**

                    1. **Upload Reference Voice**: A short, clear clip (5-30 seconds) of the voice you want to extract
                    2. **Upload Audio Files**: One or more files to process (can be long recordings)
                    3. **Select Mode**: Choose what to extract:
                       - **Speech**: Only spoken words and sentences
                       - **Nonverbal**: Sighs, laughs, moans, humming, etc.
                       - **Both**: Everything from the matched voice
                    4. **Start Extraction**: Click the button and wait for results
                    5. **Download**: Get individual segments or download everything as a ZIP

                    **Tips for Best Results:**
                    - Use a high-quality reference clip with minimal background noise
                    - Reference should contain only the target voice (no other speakers)
                    - Enable VAD optimization for faster processing of sparse audio
                    - Adjust voice threshold if you're getting too many/few matches
                    """
                )

                # Event Handlers
                # The order of `inputs` / `outputs` must match the positional
                # parameters and return values of the handler functions.
                estimate_btn.click(
                    fn=estimate_time_handler,
                    inputs=[reference_audio, input_files, vad_threshold, enable_vad],
                    outputs=[estimate_output, status_output],
                    api_name="estimate",
                )

                process_btn.click(
                    fn=process_batch_handler,
                    inputs=[
                        reference_audio,
                        input_files,
                        extraction_mode,
                        vad_threshold,
                        voice_threshold,
                        speech_threshold,
                        enable_vad,
                    ],
                    outputs=[status_output, stats_output, output_files, download_zip, report_file],
                    api_name="process",
                )

        # Footer
        gr.Markdown(
            """
            ---
            <div class="footer">
            Voice Tools v0.1.0 | Powered by Gradio, PyAnnote, and Transformers
            </div>
            """,
            elem_classes=["footer"],
        )

    return app


def launch(
    server_name: str = "0.0.0.0",
    server_port: int = 7860,
    share: bool = False,
    debug: bool = False,
):
    """
    Set up logging, build the Voice Tools app, and start serving it.

    Args:
        server_name: Hostname/interface to serve on (default: 0.0.0.0)
        server_port: Port to serve on (default: 7860)
        share: Create a public share link (default: False)
        debug: Log at DEBUG level instead of INFO (default: False)
    """
    # The debug flag only selects the root logging level here.
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    app = create_app()

    logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}")

    # show_error is always on so handler exceptions surface in the UI.
    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "share": share,
        "show_error": True,
    }
    app.launch(**launch_options)


# Script entry point: running this module directly starts the web UI with
# DEBUG-level logging and the remaining launch() defaults.
if __name__ == "__main__":
    launch(debug=True)