Spaces:

jcudit
/

voice-tools

Running on Zero

App Files Files Community

voice-tools / src /web /app.py

jcudit HF Staff

feat: implement cross-mode robustness fixes (phases 1-8)

95e1515 about 1 month ago

raw

history blame contribute delete

9.81 kB

	"""
	Gradio web interface for Voice Tools.

	Provides a user-friendly web UI for uploading audio files, configuring
	extraction parameters, and downloading results.
	"""

	import logging
	import shutil
	import tempfile
	import zipfile
	from pathlib import Path
	from typing import List, Optional, Tuple

	# Configure SSL context BEFORE any imports that might trigger model downloads
	from src.config.ssl_config import configure_ssl_context

	configure_ssl_context()

	import gradio as gr

	from src.models.processing_job import ExtractionMode, ProcessingJob
	from src.services.batch_processor import BatchProcessor
	from src.web.handlers import estimate_time_handler, process_batch_handler, validate_files_handler
	from src.web.tabs.speaker_extraction import create_speaker_extraction_tab
	from src.web.tabs.speaker_separation import create_speaker_separation_tab
	from src.web.tabs.voice_denoising import create_voice_denoising_tab

	logger = logging.getLogger(__name__)

	# Custom CSS for the web UI: a centered `.container` capped at 1200px wide,
	# plus centered `.header` and `.footer` blocks (footer text in grey #666).
	# NOTE(review): `custom_css` is not referenced anywhere else in the visible
	# part of this file -- confirm that it is actually handed to Gradio.
	custom_css = """
	.container {
	max-width: 1200px;
	margin: auto;
	}
	.header {
	text-align: center;
	padding: 20px;
	}
	.footer {
	text-align: center;
	padding: 10px;
	color: #666;
	}
	"""


	def create_app() -> gr.Blocks:
	    """
	    Create and configure the Gradio web interface.

	    The app consists of a Markdown header, a tab strip with four workflows
	    (speaker separation, speaker extraction, voice denoising and the
	    reference-based voice extraction tab) and a Markdown footer.

	    Returns:
	        Configured Gradio Blocks app
	    """
	    # NOTE(review): the module-level `custom_css` is not passed to Gradio
	    # here, so the `.footer` class used below relies on it being applied
	    # elsewhere -- confirm.
	    with gr.Blocks(title="Voice Tools") as app:
	        # Header
	        gr.Markdown(
	            """
	            # 🎤 Voice Tools

	            Extract and profile specific voices from audio files using AI-powered
	            speaker diarization and voice matching.

	            Choose a workflow below to get started.
	            """
	        )

	        # One tab per workflow; each builder adds its components to the
	        # enclosing `gr.Tabs` context.
	        with gr.Tabs():
	            # Tab 1: Speaker Separation
	            create_speaker_separation_tab()

	            # Tab 2: Speaker Extraction
	            create_speaker_extraction_tab()

	            # Tab 3: Voice Denoising
	            create_voice_denoising_tab()

	            # Tab 4: Voice Extraction (EXISTING)
	            _create_voice_extraction_tab()

	        # Footer
	        # A plain "|" is used as the separator: "\|" in a non-raw string is
	        # an invalid escape sequence and renders identically in Markdown.
	        gr.Markdown(
	            """
	            ---
	            <div class="footer">
	            Voice Tools v0.1.0 | Powered by Gradio, PyAnnote, and Transformers
	            </div>
	            """,
	            elem_classes=["footer"],
	        )

	    return app


	def _create_voice_extraction_tab() -> None:
	    """
	    Build the "Voice Extraction" tab and wire up its event handlers.

	    Must be called while a Gradio layout context (here: `gr.Tabs` inside
	    `gr.Blocks`) is open, in the same way as the other tab builders used by
	    `create_app`.
	    """
	    with gr.Tab("Voice Extraction"):
	        gr.Markdown(
	            """
	            Extract specific voices from audio files using a reference clip.
	            Upload a reference voice clip and one or more audio files to extract
	            matching voice segments.
	            """
	        )

	        with gr.Column(scale=1):
	            # Input Section
	            gr.Markdown("### 📤 Input Files")

	            reference_audio = gr.Audio(
	                label="Reference Voice",
	                type="filepath",
	                sources=["upload"],
	            )

	            input_files = gr.File(
	                label="Audio Files to Process",
	                file_count="multiple",
	                file_types=[".m4a", ".wav", ".mp3", ".flac"],
	            )

	            # Configuration Section
	            gr.Markdown("### ⚙️ Configuration")

	            with gr.Row():
	                extraction_mode = gr.Radio(
	                    choices=["Speech", "Nonverbal", "Both"],
	                    value="Speech",
	                    label="Extraction Mode",
	                )

	            with gr.Accordion("Advanced Settings", open=False):
	                with gr.Row():
	                    vad_threshold = gr.Slider(
	                        minimum=0.0,
	                        maximum=1.0,
	                        value=0.5,
	                        step=0.05,
	                        label="VAD Threshold",
	                    )

	                    voice_threshold = gr.Slider(
	                        minimum=0.0,
	                        maximum=1.0,
	                        value=0.7,
	                        step=0.05,
	                        label="Voice Match Threshold",
	                    )

	                with gr.Row():
	                    speech_threshold = gr.Slider(
	                        minimum=0.0,
	                        maximum=1.0,
	                        value=0.6,
	                        step=0.05,
	                        label="Speech Classification Threshold",
	                    )

	                    enable_vad = gr.Checkbox(
	                        value=True,
	                        label="Enable VAD Optimization",
	                    )

	            # Action Buttons
	            with gr.Row():
	                estimate_btn = gr.Button("📊 Estimate Processing Time", variant="secondary")
	                process_btn = gr.Button("🚀 Start Extraction", variant="primary", size="lg")
	                # The clear button only resets the two upload widgets; it
	                # needs no handler, so it is not bound to a name.
	                gr.ClearButton(
	                    components=[reference_audio, input_files], value="🗑️ Clear"
	                )

	        with gr.Column(scale=1):
	            # Output Section
	            gr.Markdown("### 📊 Results")

	            # Status
	            status_output = gr.Textbox(
	                label="Status",
	                placeholder="Ready to process...",
	                interactive=False,
	                lines=2,
	            )

	            # No `gr.Progress()` is created here: it is not a layout
	            # component. Gradio injects a progress tracker when a handler
	            # declares one as a default argument, so an instance built in
	            # the layout was never used.

	            # Result widgets start hidden (visible=False).
	            # NOTE(review): presumably the handlers reveal them through
	            # their return values -- confirm in src.web.handlers.

	            # Estimation results
	            estimate_output = gr.JSON(label="Processing Time Estimate", visible=False)

	            # Statistics
	            stats_output = gr.JSON(label="Extraction Statistics", visible=False)

	            # Download Section
	            gr.Markdown("### 💾 Downloads")

	            output_files = gr.File(
	                label="Extracted Segments",
	                file_count="multiple",
	                interactive=False,
	                visible=False,
	            )

	            download_zip = gr.File(
	                label="Download All (ZIP)", interactive=False, visible=False
	            )

	            report_file = gr.File(
	                label="Extraction Report", interactive=False, visible=False
	            )

	        # Examples Section
	        gr.Markdown("### 📚 Examples")
	        gr.Markdown(
	            """
	            Quick Start Guide:

	            1. Upload Reference Voice: A short, clear clip (5-30 seconds) of the voice you want to extract
	            2. Upload Audio Files: One or more files to process (can be long recordings)
	            3. Select Mode: Choose what to extract:
	            - Speech: Only spoken words and sentences
	            - Nonverbal: Sighs, laughs, moans, humming, etc.
	            - Both: Everything from the matched voice
	            4. Start Extraction: Click the button and wait for results
	            5. Download: Get individual segments or download everything as a ZIP

	            Tips for Best Results:
	            - Use a high-quality reference clip with minimal background noise
	            - Reference should contain only the target voice (no other speakers)
	            - Enable VAD optimization for faster processing of sparse audio
	            - Adjust voice threshold if you're getting too many/few matches
	            """
	        )

	        # Event Handlers
	        # The order of `inputs` / `outputs` is the positional contract with
	        # the handler functions, so it must not be rearranged.
	        estimate_btn.click(
	            fn=estimate_time_handler,
	            inputs=[reference_audio, input_files, vad_threshold, enable_vad],
	            outputs=[estimate_output, status_output],
	            api_name="estimate",
	        )

	        process_btn.click(
	            fn=process_batch_handler,
	            inputs=[
	                reference_audio,
	                input_files,
	                extraction_mode,
	                vad_threshold,
	                voice_threshold,
	                speech_threshold,
	                enable_vad,
	            ],
	            outputs=[status_output, stats_output, output_files, download_zip, report_file],
	            api_name="process",
	        )


	def launch(
	    server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False, debug: bool = False
	):
	    """
	    Build the Voice Tools app and start serving it.

	    Args:
	        server_name: Hostname / interface to bind to (default: 0.0.0.0)
	        server_port: Port to listen on (default: 7860)
	        share: Whether to request a public share link (default: False)
	        debug: Use DEBUG instead of INFO as the logging level (default: False).
	            The flag only affects logging; it is not passed on to Gradio.
	    """
	    # Logging verbosity is the only thing the debug flag controls.
	    log_level = logging.DEBUG if debug else logging.INFO
	    logging.basicConfig(level=log_level)

	    demo = create_app()

	    logger.info(f"Launching Voice Tools web interface on {server_name}:{server_port}")

	    # Collect the launch options first so the call itself stays short.
	    launch_options = {
	        "server_name": server_name,
	        "server_port": server_port,
	        "share": share,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)


	# Script entry point: running this module directly starts the web UI via
	# `launch` with debug=True, i.e. with DEBUG-level logging.
	if __name__ == "__main__":
	launch(debug=True)