Spaces: Running on Zero
"""
Main CLI entry point for Voice Tools.

Provides command-line interface for voice extraction and profiling tasks.
"""
import logging
from pathlib import Path
from typing import List, Optional

import click
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configure SSL context BEFORE any model-related imports — the SSL setup must
# be in effect before pyannote/torch model downloads are triggered by the
# imports below, which is why this import/call pair sits mid-file.
# NOTE(review): this uses an absolute `src.` path while the rest of the file
# uses relative imports — confirm both resolve under the packaging layout.
from src.config.ssl_config import configure_ssl_context

configure_ssl_context()

from ..models.processing_job import ExtractionMode, ProcessingJob
from ..services.batch_processor import BatchProcessor
from .progress import (
    ExtractionProgress,
    display_config,
    display_error,
    display_failures,
    display_header,
    display_info,
    display_statistics,
    display_success,
    display_vad_stats,
    display_warning,
)
from .utils import discover_audio_files, validate_audio_files

# Configure logging for the whole CLI process (module-level side effect).
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def cli():
    """
    Voice Tools - Extract and profile voices from audio files.

    This tool helps you extract specific voices from audio files using
    speaker diarization and voice matching. It can separate speech from
    nonverbal sounds and apply quality filtering.
    """
    # NOTE(review): `cli.add_command(...)` is called on this function below,
    # which requires `cli` to be a click Group — a `@click.group()` decorator
    # appears to have been lost from this definition; confirm and restore.
    pass
# Import and register commands. Imported here (after `cli` is defined) rather
# than at the top of the file so the subcommand modules can themselves import
# from this package without a circular-import problem at load time.
from .denoise import denoise
from .extract_speaker import extract_speaker
from .separate import separate

cli.add_command(separate)
cli.add_command(extract_speaker)
cli.add_command(denoise)
def extract(
    reference_file: Path,
    input_paths: tuple,
    output_dir: Path,
    mode: str,
    vad_threshold: float,
    voice_threshold: float,
    speech_threshold: float,
    no_vad: bool,
    no_quality_filter: bool,
    verbose: bool,
    pattern: str,
):
    """
    Extract voice segments from audio files.

    REFERENCE_FILE: Audio file containing the reference voice to extract

    INPUT_PATHS: One or more files, directories, or glob patterns

    Examples:

    \b
    # Extract speech from single file
    voice-tools extract reference.m4a input.m4a

    \b
    # Extract from multiple files
    voice-tools extract reference.m4a file1.m4a file2.m4a file3.m4a

    \b
    # Process entire directory
    voice-tools extract reference.m4a ./audio_files/

    \b
    # Process directory with custom pattern
    voice-tools extract reference.m4a ./audio_files/ --pattern "*.wav"

    \b
    # Extract nonverbal sounds only
    voice-tools extract reference.m4a input.m4a --mode nonverbal

    \b
    # Extract both speech and nonverbal
    voice-tools extract reference.m4a input.m4a --mode both

    \b
    # Custom output directory
    voice-tools extract reference.m4a input.m4a -o ./my_output

    \b
    # Adjust voice matching sensitivity
    voice-tools extract reference.m4a input.m4a --voice-threshold 0.8
    """
    # Verbose mode raises the root logger to DEBUG for the whole process.
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    display_header("Voice Tools - Extract Voice Segments")

    # Validate reference file before doing any expensive discovery work.
    if not reference_file.exists():
        display_error(f"Reference file not found: {reference_file}")
        raise click.Abort()

    # Discover audio files from input paths (files, directories, or patterns).
    display_info(f"Discovering audio files from {len(input_paths)} input path(s)...")
    input_files_list = discover_audio_files(list(input_paths), pattern=pattern)

    if not input_files_list:
        display_error("No audio files found in the specified paths")
        raise click.Abort()

    display_success(f"Found {len(input_files_list)} audio file(s) to process")

    # Validate discovered files; validation problems are warnings, not fatal,
    # as long as at least one file survives.
    valid_files, errors = validate_audio_files(input_files_list)
    if errors:
        # Fixed: was a pointless f-string with no placeholders (ruff F541).
        display_warning("Validation issues found:")
        for error in errors:
            display_warning(f"  {error}")

    if not valid_files:
        display_error("No valid audio files to process")
        raise click.Abort()

    if len(valid_files) < len(input_files_list):
        display_info(
            f"Processing {len(valid_files)} valid files (skipped {len(input_files_list) - len(valid_files)})"
        )
    input_files_list = valid_files

    # Display configuration so the user can sanity-check thresholds up front.
    config = {
        "Reference voice": str(reference_file),
        "Input files": len(input_files_list),
        "Output directory": str(output_dir),
        "Extraction mode": mode,
        "VAD enabled": not no_vad,
        "Quality filter": not no_quality_filter,
        "VAD threshold": vad_threshold,
        "Voice threshold": voice_threshold,
        "Speech threshold": speech_threshold,
    }
    display_config(config)

    # Convert mode string to ExtractionMode enum.
    # NOTE(review): an unknown mode raises KeyError here — presumably click's
    # option declaration restricts choices; confirm, or add a friendly error.
    mode_map = {
        "speech": ExtractionMode.SPEECH,
        "nonverbal": ExtractionMode.NONVERBAL,
        "both": ExtractionMode.BOTH,
    }
    extraction_mode = mode_map[mode.lower()]

    # Create processing job describing the whole batch.
    job = ProcessingJob(
        reference_file=str(reference_file),
        input_files=[str(f) for f in input_files_list],
        output_dir=str(output_dir),
        extraction_mode=extraction_mode,
        vad_threshold=vad_threshold,
        voice_similarity_threshold=voice_threshold,
        speech_confidence_threshold=speech_threshold,
        apply_denoising=False,  # Not implemented yet
    )

    # Initialize processor with the same thresholds as the job.
    processor = BatchProcessor(
        vad_threshold=vad_threshold,
        voice_similarity_threshold=voice_threshold,
        speech_confidence_threshold=speech_threshold,
        enable_vad=not no_vad,
    )

    # Process batch.
    try:
        display_info("Starting extraction...")

        with ExtractionProgress() as progress:
            progress.start(len(input_files_list))
            job = processor.process_batch(job)

        # Display results.
        display_header("Extraction Complete")
        summary = job.get_summary()
        display_statistics(summary)

        if summary["files_failed"] > 0:
            display_failures(job.failed_files)

        display_success(f"Output saved to: {output_dir}")

        # Generate detailed report. Ensure the directory exists — the
        # processor is expected to create it, but don't rely on that here.
        output_dir.mkdir(parents=True, exist_ok=True)
        report_path = output_dir / "extraction_report.txt"
        report_content = job.generate_report()
        # Explicit UTF-8 instead of the platform-dependent locale default.
        report_path.write_text(report_content, encoding="utf-8")
        display_success(f"Detailed report saved to: {report_path}")

    except KeyboardInterrupt:
        click.echo("\nExtraction cancelled by user", err=True)
        raise click.Abort()
    except Exception as e:
        click.echo(f"\nError during extraction: {e}", err=True)
        logger.exception("Extraction failed")
        raise click.Abort()
def scan(audio_file: Path, vad_threshold: float):
    """
    Scan an audio file for voice activity.

    Performs a quick VAD scan to estimate processing time and voice activity.
    Useful for determining if a file is worth processing.

    AUDIO_FILE: Audio file to scan

    Example:

    \b
    voice-tools scan input.m4a
    """
    # Minimum seconds of detected voice for a file to be "worth processing".
    # Hoisted so the stats flag and the warning below can't drift apart.
    min_voice_seconds = 30

    display_header("Voice Tools - Voice Activity Scan")

    processor = BatchProcessor(vad_threshold=vad_threshold)

    try:
        display_info(f"Scanning: {audio_file}")
        estimates = processor.estimate_processing_time(audio_file, enable_vad=True)

        total_duration = estimates["total_duration"]
        voice_duration = estimates["voice_duration"]

        # Fixed: guard against zero-length audio, which previously raised
        # ZeroDivisionError when computing the percentage.
        voice_percentage = (voice_duration / total_duration) * 100 if total_duration else 0.0

        # Create VAD stats for display.
        vad_stats = {
            "total_duration": total_duration,
            "voice_duration": voice_duration,
            "voice_percentage": voice_percentage,
            "worth_processing": voice_duration >= min_voice_seconds,
        }
        display_vad_stats(vad_stats)

        stats = {
            "estimated_processing_time": estimates["estimated_processing_time"],
            "estimated_minutes": estimates["estimated_minutes"],
        }

        from rich.table import Table

        from .progress import console

        table = Table(title="Processing Estimate", show_header=False)
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="white")
        table.add_row(
            "Estimated processing time",
            f"{stats['estimated_processing_time']:.2f}s ({stats['estimated_minutes']:.2f} min)",
        )
        console.print(table)
        console.print()

        if voice_duration < min_voice_seconds:
            display_warning(
                "Very little voice activity detected. File may not be worth processing."
            )
        elif vad_stats["voice_percentage"] < 10:
            display_info("Low voice activity. VAD optimization will provide significant speedup.")

    except Exception as e:
        display_error(f"Scan failed: {e}")
        logger.exception("Scan failed")
        raise click.Abort()
def web(host: str, port: int, share: bool):
    """
    Launch the web interface.

    Opens a browser-based UI for voice extraction with file upload,
    configuration, and result download.

    Example:

    \b
    voice-tools web
    voice-tools web --port 8080 --share
    """
    # Imported lazily so the (heavy) web stack only loads for this command.
    from ..web.app import launch

    display_header("Voice Tools - Web Interface")
    display_info(f"Starting web server on http://{host}:{port}")

    if share:
        display_info("Creating public share link...")

    display_success("Server starting... Open the URL in your browser")

    try:
        launch(server_name=host, server_port=port, share=share, debug=False)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to shut the server down — not an error.
        display_info("Server stopped")
    except Exception as exc:
        display_error(f"Failed to start server: {exc}")
        raise click.Abort()
def info():
    """
    Display information about Voice Tools.

    Shows configuration, model information, and system details.
    """
    import torch
    from rich.table import Table

    from ..services.model_manager import ModelManager
    from .progress import console

    def _print_table(title, columns, rows):
        # Render a headerless two-column table followed by a blank line.
        table = Table(title=title, show_header=False)
        left, right = columns
        table.add_column(left, style="cyan")
        table.add_column(right, style="white")
        for row in rows:
            table.add_row(*row)
        console.print(table)
        console.print()

    display_header("Voice Tools - System Information")

    # Version info
    _print_table("Version", ("Key", "Value"), [("Version", "0.1.0")])

    # Models info
    _print_table(
        "Models",
        ("Component", "Model"),
        [
            ("Speaker Diarization", "pyannote/speaker-diarization-3.1"),
            ("Voice Embedding", "pyannote/embedding"),
            ("Speech Classifier", "MIT/ast-finetuned-audioset-10-10-0.4593"),
            ("VAD", "Silero VAD v4.0"),
        ],
    )

    # Environment info — the CUDA device row only appears when CUDA is usable.
    env_rows = [
        ("PyTorch version", torch.__version__),
        ("CUDA available", "Yes" if torch.cuda.is_available() else "No"),
    ]
    if torch.cuda.is_available():
        env_rows.append(("CUDA device", torch.cuda.get_device_name(0)))
    _print_table("Environment", ("Key", "Value"), env_rows)

    # Check for HuggingFace token
    token = ModelManager().get_hf_token()
    if token:
        display_success("HuggingFace token: Configured")
    else:
        display_warning("HuggingFace token: Not configured")
        display_info("Some models require authentication. Set HF_TOKEN environment variable.")
def main():
    """Entry point for the CLI.

    Thin wrapper around the click group so packaging can point a console
    script at a plain function.
    """
    cli()


if __name__ == "__main__":
    main()