File size: 4,530 Bytes
b77cba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""
Voice Activity Detection + Speaker Diarization
Simple demo script using the modular pipeline
"""

import torch
import librosa
import numpy as np
from pathlib import Path
import os
import sys

# Import from modular components
from src.vad import SileroVAD
from src.diarization import SpeakerDiarization
from src.pipeline import VADDiarizationPipeline
from src.utils import create_test_audio

def setup_vad():
    """Create the voice-activity detector used by the demo.

    Announces progress on stdout, builds a ``SileroVAD`` wrapper with a
    threshold of 0.5, and hands it back to the caller.

    Returns:
        The freshly constructed ``SileroVAD`` instance.
    """
    print("Setting up Voice Activity Detection...")

    detector = SileroVAD(threshold=0.5)

    # Only reached when construction above did not raise.
    print("✓ Silero VAD loaded (40 MB)")
    return detector

def setup_diarization():
    """Build the speaker-diarization wrapper, or return None on failure.

    The Hugging Face token is read from the ``HF_TOKEN`` environment
    variable. When it is missing or empty, the problem is reported and
    ``None`` is returned immediately, rather than passing a placeholder
    string to the model loader as if it were a credential.

    Returns:
        A ``SpeakerDiarization`` instance configured for
        ``pyannote/speaker-diarization-3.1``, or ``None`` when no token is
        configured or constructing the wrapper raised.
    """
    print("Setting up Speaker Diarization...")

    token = os.environ.get('HF_TOKEN')
    if not token:
        # Same guard as demo_full_pipeline: no token means no diarization.
        print("❌ Error: HF_TOKEN environment variable is not set")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None

    print("⚠️  First download requires 1GB+ bandwidth (one-time)")

    try:
        diarization = SpeakerDiarization(
            model_name="pyannote/speaker-diarization-3.1",
            use_auth_token=token
        )
    except Exception as e:
        # Deliberately broad: this is the demo's reporting boundary, and any
        # construction failure is turned into a message plus a None result.
        print(f"❌ Error: {e}")
        print("Get your HF token: https://huggingface.co/settings/tokens")
        print("Or set it: export HF_TOKEN='your_token_here'")
        return None

    print("✓ Diarization pipeline loaded")
    return diarization

def demo_vad(audio_path, vad_model):
    """Run voice-activity detection on one file and print a report.

    Args:
        audio_path: Path of the audio file, forwarded to the model.
        vad_model: Object exposing ``process_file(audio_path)`` that returns
            a ``(timestamps, processing_time)`` pair. Each timestamp is
            indexed with ``'start'`` and ``'end'``.

    Returns:
        The timestamps exactly as returned by ``vad_model.process_file``.
    """
    print(f"\nVAD Analysis: {audio_path}")

    speech_segments, elapsed = vad_model.process_file(audio_path)

    print(f"Found {len(speech_segments)} speech segments:")
    print(f"Processing time: {elapsed:.2f}ms")

    # Number the segments from 1 for display.
    for index, segment in enumerate(speech_segments, start=1):
        begin, finish = segment['start'], segment['end']
        print(f"  Segment {index}: {begin:6.2f}s - {finish:6.2f}s ({finish - begin:6.2f}s)")

    return speech_segments

def demo_diarization(audio_path, diar_pipeline):
    """Run speaker diarization on one file and print a speaker timeline.

    Args:
        audio_path: Path of the audio file, forwarded to the pipeline.
        diar_pipeline: Object exposing ``process_file(audio_path)`` that
            returns ``(segments, processing_time, metadata)``. Each segment
            is indexed with ``'start'``, ``'end'`` and ``'speaker'``;
            ``metadata`` is indexed with ``'num_speakers'``.

    Returns:
        The segments exactly as returned by ``diar_pipeline.process_file``,
        mirroring ``demo_vad`` so callers can reuse the result instead of
        re-running the pipeline.
    """
    print(f"\nDiarization Analysis: {audio_path}")

    segments, processing_time, metadata = diar_pipeline.process_file(audio_path)

    print(f"Found {metadata['num_speakers']} speakers")
    print(f"Processing time: {processing_time:.2f}ms")
    print("\nSpeaker timeline:")
    for seg in segments:
        print(f"  {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}")

    return segments

def demo_full_pipeline(audio_path):
    """Run the integrated VAD + diarization pipeline on one audio file.

    ``HF_TOKEN`` is read from the environment. Without a token, or when
    building the pipeline, processing the file or formatting the result
    raises, the demo degrades to a VAD-only run through ``demo_vad``.

    Args:
        audio_path: Path of the audio file to analyse.
    """
    def run_vad_only():
        # Shared fallback: a SileroVAD built with its default arguments.
        fallback_vad = SileroVAD()
        demo_vad(audio_path, fallback_vad)

    banner = '=' * 60
    print(f"\n{banner}")
    print("FULL PIPELINE DEMO")
    print(f"{banner}")

    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        print("\n⚠️  No HF_TOKEN found. Running VAD only...")
        run_vad_only()
        return

    try:
        # Build the combined pipeline, run it, and print its text rendering.
        full_pipeline = VADDiarizationPipeline(
            use_auth_token=hf_token,
            vad_threshold=0.5,
        )
        outcome = full_pipeline.process_file(audio_path)
        report = full_pipeline.format_output(outcome, format='text')
        print("\n" + report)
    except Exception as err:
        print(f"\n❌ Error: {err}")
        print("Falling back to VAD only...")
        run_vad_only()

def main():
    """Drive the whole demo from the command line.

    Creates a test audio file via ``create_test_audio``, runs the VAD-only
    demo on it, then the full pipeline demo, and finally prints a list of
    suggested next steps.
    """
    rule = "=" * 60

    print(f"\n{rule}")
    print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
    print(rule)

    # Generate the input clip that both demo parts operate on.
    print("\nCreating test audio...")
    audio_path = create_test_audio("test_audio.wav", duration=10.0)
    print(f"✓ Created {audio_path}")

    # Part 1: voice-activity detection on its own.
    print(f"\n{rule}")
    print("OPTION 1: VAD ONLY (No HF token needed)")
    print(rule)
    detector = setup_vad()
    demo_vad(audio_path, detector)

    # Part 2: VAD combined with diarization (reads HF_TOKEN internally).
    print(f"\n{rule}")
    print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
    print(rule)
    demo_full_pipeline(audio_path)

    next_steps = (
        "1. Set HF_TOKEN: export HF_TOKEN='your_token_here'",
        "2. Run Gradio demo: python app.py",
        "3. Test on real audio files",
        "4. Deploy with Docker: docker build -t vad-diarization .",
        "5. Check notebooks/demo.ipynb for detailed examples",
    )

    print(f"\n{rule}")
    print("✅ Demo complete!")
    print("\nNext steps:")
    for step in next_steps:
        print(step)
    print(f"{rule}\n")

# Run the demo only when this file is executed as a script; importing it
# as a module defines the functions without running anything.
if __name__ == "__main__":
    main()