Spaces:
Sleeping
Sleeping
File size: 4,530 Bytes
b77cba7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
#!/usr/bin/env python3
"""
Voice Activity Detection + Speaker Diarization
Simple demo script using the modular pipeline
"""
import torch
import librosa
import numpy as np
from pathlib import Path
import os
import sys
# Import from modular components
from src.vad import SileroVAD
from src.diarization import SpeakerDiarization
from src.pipeline import VADDiarizationPipeline
from src.utils import create_test_audio
def setup_vad():
    """Build the Silero VAD wrapper used by the VAD-only part of the demo.

    Prints a short progress message before and after construction.

    Returns:
        A ``SileroVAD`` instance created with ``threshold=0.5``.
    """
    print("Setting up Voice Activity Detection...")
    detector = SileroVAD(threshold=0.5)
    print("✓ Silero VAD loaded (40 MB)")
    return detector
def setup_diarization():
    """Build the speaker-diarization wrapper, or explain why it cannot be built.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable. When it is missing or empty the function reports that and
    returns ``None`` without attempting to load the model, instead of
    sending a placeholder string as if it were a real credential.

    Returns:
        A ``SpeakerDiarization`` instance for
        ``pyannote/speaker-diarization-3.1``, or ``None`` when no token is
        configured or the wrapper raised while loading.
    """
    print("Setting up Speaker Diarization...")
    print("⚠️ First download requires 1GB+ bandwidth (one-time)")

    token = os.environ.get('HF_TOKEN')
    if not token:
        # Same "no token" test that demo_full_pipeline uses; a dummy token
        # would only produce a confusing authentication failure later.
        print("❌ Error: HF_TOKEN is not set")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None

    try:
        diarization = SpeakerDiarization(
            model_name="pyannote/speaker-diarization-3.1",
            use_auth_token=token,
        )
    except Exception as e:
        # Demo boundary: report the failure and let the caller continue
        # without diarization rather than crash the script.
        print(f"❌ Error: {e}")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None
    else:
        print("✓ Diarization pipeline loaded")
        return diarization
def demo_vad(audio_path, vad_model):
    """Run voice-activity detection on one file and print a segment report.

    Args:
        audio_path: Path of the audio file; passed straight to
            ``vad_model.process_file`` and echoed in the report header.
        vad_model: Object whose ``process_file(audio_path)`` returns a
            ``(timestamps, processing_time)`` pair, where each timestamp is
            a mapping with ``'start'`` and ``'end'`` entries.

    Returns:
        The timestamps sequence exactly as returned by ``process_file``.
    """
    print(f"\nVAD Analysis: {audio_path}")
    speech_segments, elapsed = vad_model.process_file(audio_path)

    print(f"Found {len(speech_segments)} speech segments:")
    # The "ms" suffix is part of the report text; the actual unit is whatever
    # process_file reports (presumably milliseconds -- confirm in src.vad).
    print(f"Processing time: {elapsed:.2f}ms")

    for index, segment in enumerate(speech_segments, start=1):
        begin = segment['start']
        finish = segment['end']
        print(f" Segment {index}: {begin:6.2f}s - {finish:6.2f}s ({finish - begin:6.2f}s)")

    return speech_segments
def demo_diarization(audio_path, diar_pipeline):
    """Run speaker diarization on one file and print the speaker timeline.

    Args:
        audio_path: Path of the audio file; passed straight to
            ``diar_pipeline.process_file`` and echoed in the report header.
        diar_pipeline: Object whose ``process_file(audio_path)`` returns a
            ``(segments, processing_time, metadata)`` triple. ``metadata``
            must provide ``'num_speakers'``; each segment must provide
            ``'start'``, ``'end'`` and ``'speaker'``.

    Returns:
        None. The function only prints.
    """
    print(f"\nDiarization Analysis: {audio_path}")
    turns, elapsed, info = diar_pipeline.process_file(audio_path)

    print(f"Found {info['num_speakers']} speakers")
    print(f"Processing time: {elapsed:.2f}ms")

    print("\nSpeaker timeline:")
    for turn in turns:
        begin, finish, label = turn['start'], turn['end'], turn['speaker']
        print(f" {begin:6.2f}s - {finish:6.2f}s: {label}")
def demo_full_pipeline(audio_path):
    """Run the combined VAD + diarization pipeline, falling back to VAD only.

    The full pipeline is attempted only when the ``HF_TOKEN`` environment
    variable is set to a non-empty value. If the token is absent, or the
    pipeline raises at any point (construction, processing or formatting),
    the function runs a plain ``SileroVAD`` demo on the same file instead.

    Args:
        audio_path: Path of the audio file to analyse.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print("FULL PIPELINE DEMO")
    print(banner)

    token = os.environ.get('HF_TOKEN')
    if token:
        try:
            pipeline = VADDiarizationPipeline(
                use_auth_token=token,
                vad_threshold=0.5,
            )
            result = pipeline.process_file(audio_path)
            print("\n" + pipeline.format_output(result, format='text'))
        except Exception as e:
            # Demo boundary: report the problem, then continue to the
            # VAD-only fallback below.
            print(f"\n❌ Error: {e}")
            print("Falling back to VAD only...")
        else:
            # Full pipeline succeeded; no fallback needed.
            return
    else:
        print("\n⚠️ No HF_TOKEN found. Running VAD only...")

    # Shared fallback for both "no token" and "pipeline failed".
    fallback_vad = SileroVAD()
    demo_vad(audio_path, fallback_vad)
def main():
    """Drive the whole demo: make a test clip, run VAD, then the full pipeline.

    Steps, each announced with a banner on stdout:
      1. Ask ``create_test_audio`` for a clip named ``test_audio.wav``
         (``duration=10.0``).
      2. Run the VAD-only demo on it.
      3. Run the full pipeline demo on it (which itself falls back to VAD
         only when it cannot run diarization).
      4. Print a list of suggested next steps.
    """
    divider = "=" * 60

    print(f"\n{divider}")
    print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
    print(divider)

    # Generate the audio both demo options will analyse.
    print("\nCreating test audio...")
    audio_path = create_test_audio("test_audio.wav", duration=10.0)
    print(f"✓ Created {audio_path}")

    # Option 1: VAD on its own.
    print(f"\n{divider}")
    print("OPTION 1: VAD ONLY (No HF token needed)")
    print(divider)
    demo_vad(audio_path, setup_vad())

    # Option 2: VAD + diarization.
    print(f"\n{divider}")
    print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
    print(divider)
    demo_full_pipeline(audio_path)

    print(f"\n{divider}")
    print("✅ Demo complete!")
    print("\nNext steps:")
    next_steps = (
        "1. Set HF_TOKEN: export HF_TOKEN='your_token_here'",
        "2. Run Gradio demo: python app.py",
        "3. Test on real audio files",
        "4. Deploy with Docker: docker build -t vad-diarization .",
        "5. Check notebooks/demo.ipynb for detailed examples",
    )
    for step in next_steps:
        print(step)
    print(f"{divider}\n")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|