#!/usr/bin/env python3
"""
Voice Activity Detection + Speaker Diarization.

Simple demo script using the modular pipeline:
  1. Generates a synthetic test clip.
  2. Runs Silero VAD on it (no credentials needed).
  3. Runs the full VAD + pyannote diarization pipeline when an HF_TOKEN is set,
     falling back to VAD-only otherwise.
"""

import os
import sys
from pathlib import Path

# NOTE(review): torch, librosa, numpy, Path and sys are never referenced in
# this script — presumably kept for interactive use; confirm before removing.
import torch
import librosa
import numpy as np

# Import from modular components
from src.vad import SileroVAD
from src.diarization import SpeakerDiarization
from src.pipeline import VADDiarizationPipeline
from src.utils import create_test_audio


def setup_vad():
    """Setup Silero VAD using modular wrapper.

    Returns:
        SileroVAD: a ready-to-use VAD instance with a 0.5 speech-probability
        threshold.
    """
    print("Setting up Voice Activity Detection...")
    vad = SileroVAD(threshold=0.5)
    print("✓ Silero VAD loaded (40 MB)")
    return vad


def setup_diarization():
    """Setup Speaker Diarization using modular wrapper.

    Reads the Hugging Face token from the ``HF_TOKEN`` environment variable.

    Returns:
        SpeakerDiarization | None: the loaded pipeline, or ``None`` when model
        download/authentication fails (instructions are printed in that case).
    """
    print("Setting up Speaker Diarization...")
    print("⚠️ First download requires 1GB+ bandwidth (one-time)")

    # Get token from environment or use provided one.
    # NOTE(review): the 'your_token_here' fallback is a placeholder and will
    # fail authentication — the except branch below handles that path.
    token = os.environ.get('HF_TOKEN', 'your_token_here')

    try:
        diarization = SpeakerDiarization(
            model_name="pyannote/speaker-diarization-3.1",
            use_auth_token=token
        )
        print("✓ Diarization pipeline loaded")
        return diarization
    except Exception as e:
        print(f"❌ Error: {e}")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None


def demo_vad(audio_path, vad_model):
    """Demo VAD on an audio file using modular wrapper.

    Args:
        audio_path: path to the audio file to analyze.
        vad_model: a ``SileroVAD`` instance (see :func:`setup_vad`).

    Returns:
        list[dict]: speech segments, each with ``'start'``/``'end'`` seconds.
    """
    print(f"\nVAD Analysis: {audio_path}")

    timestamps, processing_time = vad_model.process_file(audio_path)

    print(f"Found {len(timestamps)} speech segments:")
    print(f"Processing time: {processing_time:.2f}ms")
    for i, ts in enumerate(timestamps, 1):
        start_s = ts['start']
        end_s = ts['end']
        duration_s = end_s - start_s
        print(f"  Segment {i}: {start_s:6.2f}s - {end_s:6.2f}s ({duration_s:6.2f}s)")

    return timestamps


def demo_diarization(audio_path, diar_pipeline):
    """Demo Diarization on an audio file using modular wrapper.

    Args:
        audio_path: path to the audio file to analyze.
        diar_pipeline: a ``SpeakerDiarization`` instance
            (see :func:`setup_diarization`).
    """
    print(f"\nDiarization Analysis: {audio_path}")

    segments, processing_time, metadata = diar_pipeline.process_file(audio_path)

    # BUG FIX: this statement was split mid-string in the original file,
    # producing an unterminated f-string; reassembled into one print call.
    print(f"Found {metadata['num_speakers']} speakers")
    print(f"Processing time: {processing_time:.2f}ms")
    print("\nSpeaker timeline:")
    for seg in segments:
        print(f"  {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}")


def demo_full_pipeline(audio_path):
    """Demo the full integrated pipeline.

    Runs VAD + diarization when ``HF_TOKEN`` is set; otherwise (or on any
    pipeline error) falls back to a VAD-only analysis.

    Args:
        audio_path: path to the audio file to analyze.
    """
    print(f"\n{'='*60}")
    print("FULL PIPELINE DEMO")
    print(f"{'='*60}")

    token = os.environ.get('HF_TOKEN')
    if not token:
        print("\n⚠️ No HF_TOKEN found. Running VAD only...")
        vad = SileroVAD()
        demo_vad(audio_path, vad)
        return

    try:
        # Initialize full pipeline
        pipeline = VADDiarizationPipeline(
            use_auth_token=token,
            vad_threshold=0.5
        )

        # Process file
        result = pipeline.process_file(audio_path)

        # Display formatted output
        print("\n" + pipeline.format_output(result, format='text'))
    except Exception as e:
        # Model download/auth can fail at runtime; degrade gracefully to VAD.
        print(f"\n❌ Error: {e}")
        print("Falling back to VAD only...")
        vad = SileroVAD()
        demo_vad(audio_path, vad)


def main():
    """Run the demo: create test audio, then VAD-only and full-pipeline passes."""
    print("\n" + "=" * 60)
    print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
    print("=" * 60)

    # Create test audio
    print("\nCreating test audio...")
    audio_path = create_test_audio("test_audio.wav", duration=10.0)
    print(f"✓ Created {audio_path}")

    # Option 1: Quick VAD demo
    print("\n" + "=" * 60)
    print("OPTION 1: VAD ONLY (No HF token needed)")
    print("=" * 60)
    vad_model = setup_vad()
    demo_vad(audio_path, vad_model)

    # Option 2: Full pipeline (requires HF token)
    print("\n" + "=" * 60)
    print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
    print("=" * 60)
    demo_full_pipeline(audio_path)

    print("\n" + "=" * 60)
    print("✅ Demo complete!")
    print("\nNext steps:")
    print("1. Set HF_TOKEN: export HF_TOKEN='your_token_here'")
    print("2. Run Gradio demo: python app.py")
    print("3. Test on real audio files")
    print("4. Deploy with Docker: docker build -t vad-diarization .")
    print("5. Check notebooks/demo.ipynb for detailed examples")
    print("=" * 60 + "\n")


if __name__ == "__main__":
    main()