# VAD-speakerDiarization / vad_diarization.py
# Author: saadmannan — initial commit (b77cba7)
#!/usr/bin/env python3
"""
Voice Activity Detection + Speaker Diarization
Simple demo script using the modular pipeline
"""
import torch
import librosa
import numpy as np
from pathlib import Path
import os
import sys
# Import from modular components
from src.vad import SileroVAD
from src.diarization import SpeakerDiarization
from src.pipeline import VADDiarizationPipeline
from src.utils import create_test_audio
def setup_vad(threshold=0.5):
    """Create and return a Silero VAD instance.

    Args:
        threshold: Speech-probability cutoff forwarded to SileroVAD.
            Defaults to 0.5, matching the previously hard-coded value,
            so existing callers are unaffected.

    Returns:
        A configured SileroVAD instance.
    """
    print("Setting up Voice Activity Detection...")
    vad = SileroVAD(threshold=threshold)
    print("✓ Silero VAD loaded (40 MB)")
    return vad
def setup_diarization(model_name="pyannote/speaker-diarization-3.1"):
    """Create the speaker-diarization pipeline, or return None on failure.

    The Hugging Face token is read from the HF_TOKEN environment variable.
    If it is unset, a placeholder string is passed instead; authentication
    then fails inside the try block and the error is reported with
    instructions for obtaining a token.

    Args:
        model_name: pyannote model identifier. Defaults to the previously
            hard-coded "pyannote/speaker-diarization-3.1", so existing
            callers are unaffected.

    Returns:
        A SpeakerDiarization instance, or None if loading failed.
    """
    print("Setting up Speaker Diarization...")
    print("⚠️ First download requires 1GB+ bandwidth (one-time)")
    # Get token from environment or use provided one
    token = os.environ.get('HF_TOKEN', 'your_token_here')
    try:
        diarization = SpeakerDiarization(
            model_name=model_name,
            use_auth_token=token
        )
        print("✓ Diarization pipeline loaded")
        return diarization
    except Exception as e:
        print(f"❌ Error: {e}")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None
def demo_vad(audio_path, vad_model):
    """Run VAD on one file and print the detected speech segments.

    Args:
        audio_path: Path of the audio file to analyze.
        vad_model: Object exposing process_file(path) ->
            (timestamps, processing_time_ms).

    Returns:
        The list of timestamp dicts produced by the VAD model.
    """
    print(f"\nVAD Analysis: {audio_path}")
    segments, elapsed_ms = vad_model.process_file(audio_path)
    print(f"Found {len(segments)} speech segments:")
    print(f"Processing time: {elapsed_ms:.2f}ms")
    for idx, seg in enumerate(segments, 1):
        begin, finish = seg['start'], seg['end']
        span = finish - begin
        print(f" Segment {idx}: {begin:6.2f}s - {finish:6.2f}s ({span:6.2f}s)")
    return segments
def demo_diarization(audio_path, diar_pipeline):
    """Run diarization on one file and print the speaker timeline.

    Args:
        audio_path: Path of the audio file to analyze.
        diar_pipeline: Object exposing process_file(path) ->
            (segments, processing_time_ms, metadata).
    """
    print(f"\nDiarization Analysis: {audio_path}")
    speaker_segments, elapsed_ms, info = diar_pipeline.process_file(audio_path)
    print(f"Found {info['num_speakers']} speakers")
    print(f"Processing time: {elapsed_ms:.2f}ms")
    print("\nSpeaker timeline:")
    for entry in speaker_segments:
        print(f" {entry['start']:6.2f}s - {entry['end']:6.2f}s: {entry['speaker']}")
def demo_full_pipeline(audio_path):
    """Run the integrated VAD + diarization pipeline on one file.

    Requires the HF_TOKEN environment variable; when it is missing, or
    when the pipeline raises for any reason, the demo degrades gracefully
    to a VAD-only analysis of the same file.

    Args:
        audio_path: Path of the audio file to analyze.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print("FULL PIPELINE DEMO")
    print(f"{banner}")
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        # Guard clause: no credentials, so skip diarization entirely.
        print("\n⚠️ No HF_TOKEN found. Running VAD only...")
        demo_vad(audio_path, SileroVAD())
        return
    try:
        # Build the integrated pipeline and render its text report.
        full_pipeline = VADDiarizationPipeline(
            use_auth_token=hf_token,
            vad_threshold=0.5
        )
        report = full_pipeline.format_output(
            full_pipeline.process_file(audio_path), format='text'
        )
        print("\n" + report)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        print("Falling back to VAD only...")
        demo_vad(audio_path, SileroVAD())
def main():
    """Entry point: synthesize test audio, then run the VAD-only demo
    followed by the full-pipeline demo, printing next-step hints at the end."""
    divider = "=" * 60
    print("\n" + divider)
    print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
    print(divider)
    # Generate a short clip so the demos have something to analyze.
    print("\nCreating test audio...")
    audio_path = create_test_audio("test_audio.wav", duration=10.0)
    print(f"✓ Created {audio_path}")
    # Option 1: VAD alone — works without any credentials.
    print("\n" + divider)
    print("OPTION 1: VAD ONLY (No HF token needed)")
    print(divider)
    demo_vad(audio_path, setup_vad())
    # Option 2: full pipeline — needs a Hugging Face token.
    print("\n" + divider)
    print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
    print(divider)
    demo_full_pipeline(audio_path)
    print("\n" + divider)
    print("✅ Demo complete!")
    print("\nNext steps:")
    print("1. Set HF_TOKEN: export HF_TOKEN='your_token_here'")
    print("2. Run Gradio demo: python app.py")
    print("3. Test on real audio files")
    print("4. Deploy with Docker: docker build -t vad-diarization .")
    print("5. Check notebooks/demo.ipynb for detailed examples")
    print(divider + "\n")
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()