saadmannan committed
Commit b77cba7 · 1 Parent(s): 89128dd

initial commit
.dockerignore ADDED
@@ -0,0 +1,51 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # IDEs
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Project specific
+ data/
+ outputs/
+ *.wav
+ *.mp3
+ *.flac
+ test_audio.*
+ benchmarks/
+ notebooks/
+ .git/
+ .gitignore
+
+ # Documentation
+ *.md
+ !README.md
.env.example ADDED
@@ -0,0 +1,16 @@
+ # Hugging Face Authentication Token
+ # Get yours at: https://huggingface.co/settings/tokens
+ HF_TOKEN="YOUR HF TOKEN HERE"
+
+ # Gradio Server Settings
+ GRADIO_SERVER_NAME=0.0.0.0
+ GRADIO_SERVER_PORT=7860
+
+ # Model Settings
+ VAD_THRESHOLD=0.5
+ USE_ONNX_VAD=false
+
+ # Optional: Specify number of speakers
+ # NUM_SPEAKERS=
+ # MIN_SPEAKERS=
+ # MAX_SPEAKERS=
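One way to get these values into `os.environ` for local runs is the optional `python-dotenv` package — a minimal sketch, assuming that package is installed (it is not listed in requirements.txt; `run_app.sh` below instead exports `HF_TOKEN` with shell tools):

```python
# Hypothetical helper: load .env into the environment before the app starts.
# Assumes python-dotenv is installed (pip install python-dotenv).
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value lines from .env into os.environ

hf_token = os.environ.get("HF_TOKEN")
vad_threshold = float(os.environ.get("VAD_THRESHOLD", "0.5"))
print(f"HF_TOKEN set: {hf_token is not None}, VAD threshold: {vad_threshold}")
```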
.gitignore ADDED
@@ -0,0 +1,88 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ venv/
+ ENV/
+ env/
+ .venv
+
+ # IDEs
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+ .DS_Store
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+ *.ipynb_checkpoints/
+
+ # Environment variables
+ .env
+
+ # Audio files
+ *.wav
+ *.mp3
+ *.flac
+ *.ogg
+ *.m4a
+ test_audio.*
+
+ # Output files
+ outputs/
+ data/
+ *.json
+ *.rttm
+ *.txt
+ !requirements.txt
+ !README.txt
+
+ # Model cache
+ .cache/
+ models/
+
+ # Logs
+ *.log
+ logs/
+
+ # Benchmarks
+ benchmarks/*.json
+ benchmarks/*.csv
+
+ # Temporary files
+ tmp/
+ temp/
+ *.tmp
+
+ # Data folder
+ data/
+ data/*
+
+ # Backup files
+ *.backup
+ *.bak
Dockerfile ADDED
@@ -0,0 +1,42 @@
+ # Dockerfile for VAD + Speaker Diarization System
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies (curl is required by the HEALTHCHECK below)
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsndfile1 \
+     git \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install PyTorch with CUDA support (optional, comment out for CPU-only)
+ # RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+ # Copy application code
+ COPY src/ ./src/
+ COPY app.py .
+
+ # Create directories for data
+ RUN mkdir -p /app/data /app/outputs
+
+ # Expose Gradio port
+ EXPOSE 7860
+
+ # Set environment variables
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+ ENV GRADIO_SERVER_PORT=7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:7860/ || exit 1
+
+ # Run the application
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 VAD+SD Contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,494 @@
- ---
- title: VAD SpeakerDiarization
- emoji: 🦀
- colorFrom: red
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- short_description: Production-ready system for Voice Activity Detection (VAD) a
+ # 🎙️ Real-Time VAD + Speaker Diarization System
+
+ Production-ready system for **Voice Activity Detection (VAD)** and **Speaker Diarization** with real-time performance and state-of-the-art accuracy.
+
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+ [![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)](https://pytorch.org/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## ✨ Features
+
+ - **Real-Time VAD**: <100ms latency using Silero VAD (40MB model)
+ - **Speaker Diarization**: State-of-the-art accuracy with Pyannote.audio 3.1/4.0+
+ - **Interactive Demo**: Gradio web interface with visualizations
+ - **Production Ready**: Fully containerized with Docker
+ - **GPU Accelerated**: CUDA 12.1+ support for faster processing
+ - **Multiple Formats**: Export results as JSON, RTTM, or text
+ - **Modular Architecture**: Clean, maintainable, and extensible code
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+
+ - Python 3.10+
+ - CUDA 12.1+ (optional, for GPU acceleration)
+ - FFmpeg
+ - Hugging Face account with access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
+
+ ### Installation
+
+ #### Option 1: Conda (Recommended)
+
+ ```bash
+ # Create and activate conda environment
+ conda create -n vad_diarization python=3.10 -y
+ conda activate vad_diarization
+
+ # Install PyTorch with CUDA
+ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
+
+ # Install dependencies
+ pip install -r requirements.txt
+ ```
+
+ #### Option 2: Virtual Environment
+
+ ```bash
+ # Create virtual environment
+ python -m venv venv
+ source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+ # Install PyTorch with CUDA support (for GPU)
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+ # Install other dependencies
+ pip install -r requirements.txt
+ ```
+
+ #### Option 3: Automated Setup
+
+ ```bash
+ # For conda users (activate environment first)
+ conda activate vad_diarization
+ ./setup.sh
+
+ # For venv users
+ ./setup.sh
+ ```
+
+ ### Hugging Face Token Setup
+
+ 1. **Get your token**: Visit https://huggingface.co/settings/tokens
+ 2. **Accept model conditions**: Visit https://huggingface.co/pyannote/speaker-diarization-3.1 and click "Agree and access repository"
+ 3. **Set environment variable**:
+    ```bash
+    export HF_TOKEN='your_token_here'
+    ```
+
+ ### Running the Demo
+
+ **Launch Gradio Web Interface:**
+ ```bash
+ export HF_TOKEN='your_token_here'
+ python app.py
+ ```
+
+ Then open http://localhost:7860 in your browser.
+
+ **Or use the helper script:**
+ ```bash
+ ./run_app.sh
+ ```
+
+ **Python API Usage:**
+ ```python
+ from src.pipeline import VADDiarizationPipeline
+
+ # Initialize pipeline
+ pipeline = VADDiarizationPipeline(
+     token='your_hf_token',
+     vad_threshold=0.5
+ )
+
+ # Process audio file
+ result = pipeline.process_file('audio.wav')
+
+ # Print results
+ print(pipeline.format_output(result))
+ ```
+
+ ## 📁 Project Structure
+
+ ```
+ VAD+SD/
+ ├── src/
+ │   ├── __init__.py            # Package initialization
+ │   ├── vad.py                 # Silero VAD wrapper
+ │   ├── diarization.py         # Pyannote diarization wrapper
+ │   ├── pipeline.py            # Integrated pipeline
+ │   └── utils.py               # Utility functions
+ ├── tests/                     # Unit tests
+ │   ├── test_vad.py
+ │   ├── test_pipeline.py
+ │   └── __init__.py
+ ├── notebooks/                 # Jupyter notebooks
+ │   └── demo.ipynb
+ ├── benchmarks/                # Benchmark scripts
+ │   └── run_benchmarks.py
+ ├── app.py                     # Gradio web interface
+ ├── vad_diarization.py         # CLI demo script
+ ├── requirements.txt           # Python dependencies
+ ├── environment.yml            # Conda environment file
+ ├── Dockerfile                 # Container configuration
+ ├── docker-compose.yml         # Docker Compose config
+ ├── .dockerignore              # Docker ignore patterns
+ ├── .gitignore                 # Git ignore patterns
+ ├── setup.sh                   # Automated setup script
+ ├── run_app.sh                 # App launcher script
+ ├── verify_installation.py     # Installation verification
+ └── README.md                  # This file
+ ```
+
+ ## 🐳 Docker Deployment
+
+ ### Build and Run
+
+ ```bash
+ # Build image
+ docker build -t vad-diarization:latest .
+
+ # Run container
+ docker run -p 7860:7860 \
+     -e HF_TOKEN='your_token_here' \
+     --gpus all \
+     vad-diarization:latest
+ ```
+
+ ### Docker Compose
+
+ ```bash
+ # Set your token in .env file
+ echo "HF_TOKEN=your_token_here" > .env
+
+ # Start services
+ docker-compose up
+ ```
+
+ ## 📊 Performance Benchmarks
+
+ ### VAD Performance
+ - **Latency**: ~9.73ms per second of audio ✅
+ - **Model Size**: 40MB
+ - **Real-time Factor**: ~0.01x (100x faster than real-time)
+ - **Accuracy**: High precision on speech detection
+
+ ### Diarization Performance
+ - **DER on FEARLESS STEPS**: ~19-20%
+ - **Processing Speed**: Depends on audio length and hardware
+ - **GPU Memory**: ~2-4GB for typical audio
+ - **Supports**: 2-10 speakers (configurable)
+
+ ### System Requirements
+ - **Minimum**: 4GB RAM, CPU-only
+ - **Recommended**: 8GB+ RAM, NVIDIA GPU with 4GB+ VRAM
+ - **Optimal**: 16GB+ RAM, RTX 3060 or better
+
+ ## 🔧 Configuration
+
+ ### VAD Parameters
+
+ ```python
+ from src.vad import SileroVAD
+
+ vad = SileroVAD(
+     threshold=0.5,                # Speech probability threshold (0.0-1.0)
+     sampling_rate=16000,          # Audio sample rate
+     min_speech_duration_ms=250,   # Minimum speech segment duration
+     min_silence_duration_ms=100,  # Minimum silence between segments
+     use_onnx=False                # Use ONNX runtime for speed
+ )
+ ```
+
+ ### Diarization Parameters
+
+ ```python
+ from src.diarization import SpeakerDiarization
+
+ diarization = SpeakerDiarization(
+     model_name="pyannote/speaker-diarization-3.1",
+     token='your_token',
+     num_speakers=None,   # Fixed number (if known)
+     min_speakers=None,   # Minimum speakers
+     max_speakers=None    # Maximum speakers
+ )
+ ```
+
+ ### Pipeline Configuration
+
+ ```python
+ from src.pipeline import VADDiarizationPipeline
+
+ pipeline = VADDiarizationPipeline(
+     vad_threshold=0.5,    # VAD sensitivity
+     token='your_token',   # HF token
+     num_speakers=None,    # Auto-detect speakers
+     use_onnx_vad=False    # Use ONNX for VAD
+ )
+ ```
+
+ ## 📈 Usage Examples
+
+ ### Basic Processing
+
+ ```python
+ from src.pipeline import VADDiarizationPipeline
+
+ # Initialize
+ pipeline = VADDiarizationPipeline(token='your_token')
+
+ # Process file
+ result = pipeline.process_file('meeting.wav')
+
+ # Access results
+ print(f"Speakers: {result['metadata']['num_speakers']}")
+ print(f"Segments: {result['metadata']['num_segments']}")
+
+ # Print timeline
+ for seg in result['speaker_segments']:
+     print(f"{seg['start']:.2f}s - {seg['end']:.2f}s: {seg['speaker']}")
+ ```
+
+ ### Batch Processing
+
+ ```python
+ # Process multiple files
+ audio_files = ['audio1.wav', 'audio2.wav', 'audio3.wav']
+ results = pipeline.process_batch(audio_files)
+
+ # Export results
+ for result in results:
+     pipeline.save_results(result, 'outputs/', format='json')
+ ```
+
+ ### Custom Configuration
+
+ ```python
+ # Initialize with custom settings
+ pipeline = VADDiarizationPipeline(
+     vad_threshold=0.3,   # More sensitive VAD
+     num_speakers=3,      # Fixed 3 speakers
+     use_onnx_vad=True    # Faster VAD inference
+ )
+
+ # Process with overrides
+ result = pipeline.process_file(
+     'audio.wav',
+     num_speakers=2  # Override to 2 speakers for this file
+ )
+ ```
+
+ ### VAD Only
+
+ ```python
+ from src.vad import SileroVAD
+
+ vad = SileroVAD(threshold=0.5)
+
+ # Process audio (returns timestamps and processing time in ms)
+ timestamps, processing_time = vad.process_file('audio.wav')
+
+ # Print speech segments
+ for ts in timestamps:
+     print(f"Speech: {ts['start']:.2f}s - {ts['end']:.2f}s")
+ ```
+
+ ### Diarization Only
+
+ ```python
+ from src.diarization import SpeakerDiarization
+
+ diarizer = SpeakerDiarization(token='your_token')
+
+ # Process audio
+ segments, time_ms, metadata = diarizer.process_file('audio.wav')
+
+ # Print speaker segments
+ for seg in segments:
+     print(f"{seg['speaker']}: {seg['start']:.2f}s - {seg['end']:.2f}s")
+ ```
+
+ ## 🧪 Testing
+
+ ```bash
+ # Run all tests
+ python -m pytest tests/ -v
+
+ # Run with coverage
+ python -m pytest tests/ --cov=src --cov-report=html
+
+ # Test specific module
+ python -m pytest tests/test_vad.py -v
+
+ # Verify installation
+ python verify_installation.py
+
+ # Run benchmarks
+ python benchmarks/run_benchmarks.py
+ ```
+
+ ## 📝 Output Formats
+
+ ### JSON Format
+ ```json
+ {
+   "audio_path": "audio.wav",
+   "speaker_segments": [
+     {
+       "start": 0.5,
+       "end": 3.2,
+       "speaker": "SPEAKER_00",
+       "duration": 2.7
+     }
+   ],
+   "vad_segments": [
+     {
+       "start": 0.5,
+       "end": 3.2
+     }
+   ],
+   "metadata": {
+     "num_speakers": 2,
+     "num_segments": 15,
+     "total_speech_time": 45.3
+   },
+   "processing_time": {
+     "vad_ms": 150.2,
+     "diarization_ms": 3200.5,
+     "total_ms": 3350.7
+   }
+ }
+ ```
+
+ ### RTTM Format
+ ```
+ SPEAKER audio 1 0.500 2.700 <NA> <NA> SPEAKER_00 <NA> <NA>
+ SPEAKER audio 1 3.500 4.200 <NA> <NA> SPEAKER_01 <NA> <NA>
+ ```
+
+ ### Text Timeline
+ ```
+ [0.50s - 3.20s] SPEAKER_00
+ [3.50s - 7.70s] SPEAKER_01
+ [8.00s - 10.50s] SPEAKER_00
+ ```
+
+ ## 🎯 Use Cases
+
+ - **Meeting Transcription**: Identify who spoke when in recordings
+ - **Podcast Analysis**: Track speaker segments and statistics
+ - **Call Center Analytics**: Analyze customer-agent interactions
+ - **Video Production**: Generate speaker labels for editing
+ - **Research**: Speaker diarization for linguistic studies
+ - **Interview Processing**: Separate interviewer and interviewee
+ - **Broadcast Media**: Analyze news programs and talk shows
+
+ ## 🐛 Troubleshooting
+
+ ### Common Issues
+
+ #### 1. HF Token Error
+ ```
+ Error: Invalid token or model access denied
+ ```
+ **Solution**:
+ - Get a token from https://huggingface.co/settings/tokens
+ - Accept model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1
+ - Set the environment variable: `export HF_TOKEN='your_token'`
+
+ #### 2. CUDA Out of Memory
+ ```
+ RuntimeError: CUDA out of memory
+ ```
+ **Solution**:
+ - Process shorter audio segments
+ - Use CPU mode: `device='cpu'`
+ - Reduce batch size
+
+ #### 3. Audio Format Not Supported
+ ```
+ Error loading audio
+ ```
+ **Solution**: Convert to WAV format using FFmpeg:
+ ```bash
+ ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav
+ ```
+
+ #### 4. DiarizeOutput Error
+ ```
+ 'DiarizeOutput' object has no attribute 'itertracks'
+ ```
+ **Solution**: This is fixed in the current version. Make sure you have the latest code.
+
+ #### 5. Import Errors
+ ```
+ ModuleNotFoundError: No module named 'torch'
+ ```
+ **Solution**:
+ - Activate your environment: `conda activate vad_diarization`
+ - Reinstall dependencies: `pip install -r requirements.txt`
+
+ ## 🔄 API Compatibility
+
+ This project supports both:
+ - **Pyannote.audio 3.x**: Returns `Annotation` objects
+ - **Pyannote.audio 4.0+**: Returns `DiarizeOutput` objects
+
+ The code automatically detects and handles both formats.
+
+ ## 🚀 Deployment Options
+
+ ### Local Development
+ ```bash
+ python app.py
+ ```
+
+ ### Docker
+ ```bash
+ docker-compose up
+ ```
+
+ ### Cloud Platforms
+
+ **Hugging Face Spaces:**
+ - Fork this repository
+ - Create a new Space
+ - Connect the repository
+ - Set the `HF_TOKEN` secret
+ - Deploy!
+
+ **AWS/GCP/Azure:**
+ - Use the provided Dockerfile
+ - Deploy as a container service
+ - Configure GPU instances for best performance
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ 1. Fork the repository
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+
+ ## 📄 License
+
+ This project is licensed under the MIT License.
+
+ ## 🙏 Acknowledgments
+
+ - [Silero VAD](https://github.com/snakers4/silero-vad) - Fast and accurate VAD
+ - [Pyannote.audio](https://github.com/pyannote/pyannote-audio) - Speaker diarization toolkit
+ - [Gradio](https://gradio.app/) - Web interface framework
+ - [PyTorch](https://pytorch.org/) - Deep learning framework
+
+ ## 📧 Support
+
+ For questions or issues:
+ - Open an issue on GitHub
+ - Check existing issues for solutions
+ - Review the troubleshooting section
+
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ **Built with ❤️ for the speech processing community**
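The README's "API Compatibility" section says the code detects pyannote.audio 3.x `Annotation` and 4.0+ `DiarizeOutput` results automatically. A minimal sketch of one way to do that with duck typing — the `speaker_diarization` attribute name is an assumption here, not something this commit confirms:

```python
# Hypothetical compatibility shim, not part of this commit.
def to_annotation(output):
    """Normalize a pyannote pipeline result to an Annotation.

    pyannote.audio 3.x returns an Annotation directly; 4.0+ wraps it.
    Duck-type on itertracks() instead of checking package versions.
    """
    if hasattr(output, "itertracks"):
        return output  # 3.x: already an Annotation
    # 4.0+: assume the wrapped annotation is exposed as an attribute
    # (the attribute name below is an assumption)
    return getattr(output, "speaker_diarization", output)


def iter_segments(output):
    """Yield {'start', 'end', 'speaker'} dicts from either API version."""
    annotation = to_annotation(output)
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        yield {"start": turn.start, "end": turn.end, "speaker": speaker}
```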
app.py ADDED
@@ -0,0 +1,295 @@
+ #!/usr/bin/env python3
+ """
+ Gradio Web Interface for Real-Time VAD + Speaker Diarization
+ Interactive demo with visualizations
+ """
+
+ import gradio as gr
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as mpatches
+ from pathlib import Path
+ import json
+ import os
+ import tempfile
+ from typing import Optional, Tuple, List, Dict
+
+ from src.pipeline import VADDiarizationPipeline
+ from src.utils import visualize_timeline, segment_to_rttm
+
+
+ # Initialize pipeline
+ print("Initializing pipeline...")
+ HF_TOKEN = os.environ.get('HF_TOKEN', None)
+
+ if not HF_TOKEN:
+     print("⚠️ No HF_TOKEN found. Set it with: export HF_TOKEN='your_token_here'")
+     print("Pipeline will work with VAD only until token is provided.")
+
+ try:
+     pipeline = VADDiarizationPipeline(
+         use_auth_token=HF_TOKEN,
+         vad_threshold=0.5
+     )
+     PIPELINE_READY = True
+ except Exception as e:
+     print(f"⚠️ Could not initialize full pipeline: {e}")
+     print("Will use VAD-only mode")
+     PIPELINE_READY = False
+
+
+ def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
+     """Create a visual timeline plot of speaker segments."""
+     fig, ax = plt.subplots(figsize=(12, 4))
+
+     # Get unique speakers and assign colors
+     speakers = sorted(set(seg['speaker'] for seg in segments))
+     colors = plt.cm.Set3(np.linspace(0, 1, len(speakers)))
+     speaker_colors = {speaker: colors[i] for i, speaker in enumerate(speakers)}
+
+     # Plot segments
+     for seg in segments:
+         color = speaker_colors[seg['speaker']]
+         ax.barh(
+             0,
+             seg['duration'],
+             left=seg['start'],
+             height=0.8,
+             color=color,
+             edgecolor='black',
+             linewidth=0.5
+         )
+
+         # Add speaker label in the middle of long segments
+         if seg['duration'] > 1.0:
+             mid = seg['start'] + seg['duration'] / 2
+             ax.text(
+                 mid, 0, seg['speaker'],
+                 ha='center', va='center',
+                 fontsize=8, fontweight='bold'
+             )
+
+     # Formatting
+     ax.set_xlim(0, duration)
+     ax.set_ylim(-0.5, 0.5)
+     ax.set_xlabel('Time (seconds)', fontsize=12)
+     ax.set_yticks([])
+     ax.set_title('Speaker Timeline', fontsize=14, fontweight='bold')
+     ax.grid(True, axis='x', alpha=0.3)
+
+     # Legend
+     legend_patches = [
+         mpatches.Patch(color=speaker_colors[speaker], label=speaker)
+         for speaker in speakers
+     ]
+     ax.legend(handles=legend_patches, loc='upper right')
+
+     plt.tight_layout()
+     return fig
+
+
+ def process_audio(
+     audio_file,
+     num_speakers: Optional[int] = None,
+     vad_threshold: float = 0.5,
+     progress=gr.Progress()
+ ) -> Tuple[str, str, str, plt.Figure]:
+     """
+     Process audio file through the pipeline.
+
+     Returns:
+         Tuple of (summary_text, timeline_text, json_output, plot)
+     """
+     if audio_file is None:
+         return "Please upload an audio file", "", "", None
+
+     if not PIPELINE_READY:
+         return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None
+
+     try:
+         progress(0.1, desc="Loading audio...")
+
+         # Update VAD threshold if changed
+         pipeline.vad.threshold = vad_threshold
+
+         progress(0.3, desc="Running VAD...")
+
+         # Process file
+         num_speakers_param = int(num_speakers) if num_speakers and num_speakers > 0 else None
+
+         progress(0.5, desc="Running speaker diarization...")
+
+         result = pipeline.process_file(
+             audio_file,
+             num_speakers=num_speakers_param,
+             return_vad=True,
+             return_stats=True
+         )
+
+         progress(0.8, desc="Generating visualizations...")
+
+         # Create summary
+         summary_lines = []
+         summary_lines.append("# Processing Results\n")
+         summary_lines.append(f"**File:** {Path(audio_file).name}\n")
+         summary_lines.append(f"**Speakers Detected:** {result['metadata']['num_speakers']}")
+         summary_lines.append(f"**Speaker Segments:** {result['metadata']['num_segments']}")
+         summary_lines.append(f"**Total Speech Time:** {result['metadata']['total_speech_time']:.2f}s\n")
+
+         summary_lines.append("## Processing Time")
+         summary_lines.append(f"- VAD: {result['processing_time']['vad_ms']:.2f}ms")
+         summary_lines.append(f"- Diarization: {result['processing_time']['diarization_ms']:.2f}ms")
+         summary_lines.append(f"- **Total: {result['processing_time']['total_ms']:.2f}ms**\n")
+
+         # Speaker statistics
+         if 'speaker_statistics' in result:
+             summary_lines.append("## Speaker Statistics\n")
+             for speaker, stats in result['speaker_statistics'].items():
+                 summary_lines.append(f"### {speaker}")
+                 summary_lines.append(f"- Total speaking time: {stats['total_time']:.2f}s")
+                 summary_lines.append(f"- Number of segments: {stats['num_segments']}")
+                 summary_lines.append(f"- Average segment duration: {stats['avg_segment_duration']:.2f}s\n")
+
+         summary_text = "\n".join(summary_lines)
+
+         # Create timeline text
+         timeline_lines = ["# Speaker Timeline\n"]
+         timeline_lines.append("```")
+         for seg in result['speaker_segments']:
+             timeline_lines.append(
+                 f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']} ({seg['duration']:.2f}s)"
+             )
+         timeline_lines.append("```")
+         timeline_text = "\n".join(timeline_lines)
+
+         # JSON output
+         json_output = json.dumps(result, indent=2, default=str)
+
+         # Create plot
+         duration = max(seg['end'] for seg in result['speaker_segments'])
+         plot = create_timeline_plot(result['speaker_segments'], duration)
+
+         progress(1.0, desc="Complete!")
+
+         return summary_text, timeline_text, json_output, plot
+
+     except Exception as e:
+         error_msg = f"Error processing audio: {str(e)}\n\n"
+         error_msg += "Make sure you have:\n"
+         error_msg += "1. Valid HF_TOKEN environment variable\n"
+         error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
+         return error_msg, "", "", None
+
+
+ def create_demo():
+     """Create Gradio interface."""
+
+     with gr.Blocks(title="VAD + Speaker Diarization", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+         # 🎙️ Real-Time Voice Activity Detection + Speaker Diarization
+
+         Upload an audio file to detect speech segments and identify different speakers.
+
+         **Features:**
+         - Voice Activity Detection (VAD) with <100ms latency
+         - Speaker Diarization with state-of-the-art accuracy
+         - Visual timeline of speaker segments
+         - Detailed statistics and JSON export
+
+         **Supported formats:** WAV, MP3, FLAC, OGG, M4A
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("## Input")
+
+                 audio_input = gr.Audio(
+                     label="Upload Audio File",
+                     type="filepath",
+                     sources=["upload"]
+                 )
+
+                 with gr.Accordion("Advanced Settings", open=False):
+                     num_speakers = gr.Number(
+                         label="Number of Speakers (0 for auto-detection)",
+                         value=0,
+                         precision=0,
+                         minimum=0,
+                         maximum=10,
+                         info="Set to 0 for automatic speaker detection"
+                     )
+
+                     vad_threshold = gr.Slider(
+                         label="VAD Sensitivity Threshold",
+                         minimum=0.0,
+                         maximum=1.0,
+                         value=0.5,
+                         step=0.05,
+                         info="Lower = more sensitive to speech"
+                     )
+
+                 process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
+
+                 gr.Markdown("""
+                 ### Tips:
+                 - For best results, use clear audio with minimal background noise
+                 - Specify number of speakers if known for better accuracy
+                 - Adjust VAD threshold if speech is not detected properly
+                 """)
+
+             with gr.Column(scale=2):
+                 gr.Markdown("## Results")
+
+                 with gr.Tab("Summary"):
+                     summary_output = gr.Markdown(label="Summary")
+
+                 with gr.Tab("Timeline"):
+                     timeline_plot = gr.Plot(label="Visual Timeline")
+                     timeline_output = gr.Markdown(label="Timeline Details")
+
+                 with gr.Tab("JSON Export"):
+                     json_output = gr.Code(
+                         label="Full Results (JSON)",
+                         language="json",
+                         lines=20
+                     )
+
+         # Examples
+         gr.Markdown("## 📝 Examples")
+         gr.Markdown("""
+         Try the demo with your own audio files or use sample data from the FEARLESS STEPS dataset.
+
+         **Expected Performance:**
+         - VAD Latency: <100ms per second of audio
+         - Diarization Error Rate (DER): ~19-20% on benchmark datasets
+         - Processing Time: Depends on audio length and hardware
+         """)
+
+         # Event handlers
+         process_btn.click(
+             fn=process_audio,
+             inputs=[audio_input, num_speakers, vad_threshold],
+             outputs=[summary_output, timeline_output, json_output, timeline_plot]
+         )
+
+         # Footer
+         gr.Markdown("""
+         ---
+         **Tech Stack:** Silero VAD + Pyannote.audio 3.1 | **GPU:** CUDA 12.1+ supported
+
+         **Note:** First run may take longer due to model downloads (~1GB)
+         """)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = create_demo()
+
+     # Launch settings
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
benchmarks/run_benchmarks.py ADDED
@@ -0,0 +1,271 @@
+ #!/usr/bin/env python3
+ """
+ Benchmark script for VAD + Speaker Diarization
+ Tests performance on various audio conditions
+ """
+
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+
+ import os  # needed by main() to read HF_TOKEN from the environment
+ import time
+ import json
+ import numpy as np
+ from typing import Dict, List
+ import argparse
+
+ from src.vad import SileroVAD
+ from src.pipeline import VADDiarizationPipeline
+ from src.utils import create_test_audio
+
+
+ class Benchmark:
+     """Benchmark suite for VAD + Diarization."""
+
+     def __init__(self, use_auth_token: str = None):
+         """Initialize benchmark."""
+         self.use_auth_token = use_auth_token
+         self.results = {}
+
+     def benchmark_vad_latency(self, durations: List[float] = [1, 5, 10, 30, 60]):
+         """Benchmark VAD latency across different audio durations."""
+         print("\n" + "="*60)
+         print("VAD LATENCY BENCHMARK")
+         print("="*60)
+
+         vad = SileroVAD(threshold=0.5)
+         results = []
+
+         for duration in durations:
+             print(f"\nTesting {duration}s audio...")
+             metrics = vad.benchmark_latency(duration_seconds=duration)
+
+             result = {
+                 'duration_s': duration,
+                 'processing_time_ms': metrics['total_processing_time_ms'],
+                 'latency_per_second_ms': metrics['latency_per_second_ms'],
+                 'real_time_factor': metrics['real_time_factor']
+             }
+             results.append(result)
+
+             print(f"  Processing time: {result['processing_time_ms']:.2f}ms")
+             print(f"  Latency/second: {result['latency_per_second_ms']:.2f}ms")
+             print(f"  Real-time factor: {result['real_time_factor']:.4f}x")
+
+             # Check target
+             if result['latency_per_second_ms'] < 100:
+                 print("  ✅ Target achieved (<100ms)")
+             else:
+                 print("  ⚠️ Above target (>100ms)")
+
+         self.results['vad_latency'] = results
+
+         # Summary
+         avg_latency = np.mean([r['latency_per_second_ms'] for r in results])
+         print(f"\n📊 Average latency: {avg_latency:.2f}ms per second")
+
+         return results
+
+     def benchmark_vad_thresholds(self, thresholds: List[float] = [0.3, 0.5, 0.7]):
+         """Benchmark VAD with different sensitivity thresholds."""
+         print("\n" + "="*60)
+         print("VAD THRESHOLD BENCHMARK")
+         print("="*60)
+
+         # Create test audio
+         test_audio = create_test_audio("test_threshold.wav", duration=10.0)
+         results = []
+
+         for threshold in thresholds:
+             print(f"\nTesting threshold {threshold}...")
+             vad = SileroVAD(threshold=threshold)
+
+             timestamps, processing_time = vad.process_file(test_audio)
+
+             result = {
+                 'threshold': threshold,
+                 'num_segments': len(timestamps),
+                 'processing_time_ms': processing_time,
+                 'total_speech_time_s': sum(ts['end'] - ts['start'] for ts in timestamps)
+             }
+             results.append(result)
+
+             print(f"  Segments detected: {result['num_segments']}")
+             print(f"  Total speech time: {result['total_speech_time_s']:.2f}s")
+             print(f"  Processing time: {result['processing_time_ms']:.2f}ms")
+
+         self.results['vad_thresholds'] = results
+
+         # Cleanup
+         Path(test_audio).unlink(missing_ok=True)
+
+         return results
+
+     def benchmark_full_pipeline(self):
+         """Benchmark full VAD + Diarization pipeline."""
+         print("\n" + "="*60)
+         print("FULL PIPELINE BENCHMARK")
+         print("="*60)
+
+         if not self.use_auth_token:
+             print("⚠️ No HF_TOKEN provided, skipping full pipeline benchmark")
+             return None
+
+         try:
+             # Initialize pipeline
+             print("\nInitializing pipeline...")
+             pipeline = VADDiarizationPipeline(
+                 use_auth_token=self.use_auth_token,
+                 vad_threshold=0.5
+             )
+
+             # Create test audio
+             test_audio = create_test_audio("test_pipeline.wav", duration=30.0)
+
+             # Process
+             print(f"\nProcessing {test_audio}...")
+             result = pipeline.process_file(test_audio)
+
+             benchmark_result = {
+                 'audio_duration_s': 30.0,
+                 'vad_time_ms': result['processing_time']['vad_ms'],
+                 'diarization_time_ms': result['processing_time']['diarization_ms'],
+                 'total_time_ms': result['processing_time']['total_ms'],
+                 'num_speakers': result['metadata']['num_speakers'],
+                 'num_segments': result['metadata']['num_segments']
+             }
+
+             print(f"\n📊 Results:")
+             print(f"  VAD time: {benchmark_result['vad_time_ms']:.2f}ms")
+             print(f"  Diarization time: {benchmark_result['diarization_time_ms']:.2f}ms")
+             print(f"  Total time: {benchmark_result['total_time_ms']:.2f}ms")
+             print(f"  Speakers: {benchmark_result['num_speakers']}")
+             print(f"  Segments: {benchmark_result['num_segments']}")
+
+             self.results['full_pipeline'] = benchmark_result
+
+             # Cleanup
+             Path(test_audio).unlink(missing_ok=True)
+
+             return benchmark_result
+
+         except Exception as e:
+             print(f"❌ Error: {e}")
+             return None
+
+     def benchmark_memory_usage(self):
+         """Benchmark memory usage."""
+         print("\n" + "="*60)
+         print("MEMORY USAGE BENCHMARK")
+         print("="*60)
+
+         import psutil
+         import torch
+
+         process = psutil.Process()
+
+         # Initial memory
+         initial_mem = process.memory_info().rss / 1024 / 1024  # MB
+         print(f"\nInitial memory: {initial_mem:.2f} MB")
+
+         # Load VAD
+         print("\nLoading VAD...")
+         vad = SileroVAD()
+         vad_mem = process.memory_info().rss / 1024 / 1024
+         print(f"After VAD: {vad_mem:.2f} MB (+{vad_mem - initial_mem:.2f} MB)")
+
+         # GPU memory (if available)
+         if torch.cuda.is_available():
+             gpu_mem = torch.cuda.memory_allocated() / 1024 / 1024
+             print(f"GPU memory: {gpu_mem:.2f} MB")
+
+         result = {
+             'initial_memory_mb': initial_mem,
+             'vad_memory_mb': vad_mem,
+             'vad_increase_mb': vad_mem - initial_mem
+         }
+
+         if torch.cuda.is_available():
+             result['gpu_memory_mb'] = gpu_mem
+
+         self.results['memory_usage'] = result
+
+         return result
+
+     def save_results(self, output_path: str = "benchmark_results.json"):
+         """Save benchmark results to file."""
+         output_file = Path(__file__).parent / output_path
+
+         with open(output_file, 'w') as f:
+             json.dump(self.results, f, indent=2)
+
+         print(f"\n✓ Results saved to: {output_file}")
+
+     def run_all(self):
+         """Run all benchmarks."""
+         print("\n" + "="*60)
+         print("RUNNING ALL BENCHMARKS")
+         print("="*60)
+
+         # VAD latency
+         self.benchmark_vad_latency()
+
+         # VAD thresholds
+         self.benchmark_vad_thresholds()
+
+         # Memory usage
+         self.benchmark_memory_usage()
+
+         # Full pipeline (if token available)
+         if self.use_auth_token:
+             self.benchmark_full_pipeline()
+
+         # Save results
+         self.save_results()
+
+         print("\n" + "="*60)
+         print("✅ ALL BENCHMARKS COMPLETE")
+         print("="*60)
+
+
+ def main():
+     """Main benchmark runner."""
+     parser = argparse.ArgumentParser(description="Run VAD + Diarization benchmarks")
+     parser.add_argument(
+         '--token',
+         type=str,
+         default=None,
+         help='Hugging Face token for full pipeline benchmark'
+     )
+     parser.add_argument(
+         '--output',
+         type=str,
+         default='benchmark_results.json',
+         help='Output file for results'
+     )
+     parser.add_argument(
+         '--quick',
+         action='store_true',
+         help='Run quick benchmark (VAD only)'
+     )
+
+     args = parser.parse_args()
+
+     # Get token from args or environment
+     token = args.token or os.environ.get('HF_TOKEN')
+
+     # Initialize benchmark
+     benchmark = Benchmark(use_auth_token=token)
+
+     if args.quick:
+         # Quick benchmark (VAD only)
+         benchmark.benchmark_vad_latency(durations=[1, 5, 10])
+         benchmark.save_results(args.output)
+     else:
+         # Full benchmark suite
+         benchmark.run_all()
+
+
+ if __name__ == "__main__":
+     main()
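Besides the CLI in `main()`, the `Benchmark` class can be driven programmatically, for example to run only the quick VAD sweeps. A sketch, run from the repository root (`benchmarks/` has no `__init__.py`, so it is imported by path rather than as a package):

```python
import sys
from pathlib import Path

# benchmarks/ is not a package, so put it on sys.path and import the module
sys.path.insert(0, str(Path("benchmarks")))
from run_benchmarks import Benchmark

bench = Benchmark(use_auth_token=None)  # a token is only needed for the full pipeline
bench.benchmark_vad_latency(durations=[1, 5])
bench.benchmark_vad_thresholds(thresholds=[0.4, 0.6])
bench.save_results("partial_results.json")  # written next to run_benchmarks.py
```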
docker-compose.yml ADDED
@@ -0,0 +1,29 @@
+ version: '3.8'
+
+ services:
+   vad-diarization:
+     build: .
+     container_name: vad-diarization
+     ports:
+       - "7860:7860"
+     environment:
+       - HF_TOKEN=${HF_TOKEN}
+       - GRADIO_SERVER_NAME=0.0.0.0
+       - GRADIO_SERVER_PORT=7860
+     volumes:
+       - ./data:/app/data
+       - ./outputs:/app/outputs
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
+     restart: unless-stopped
+     healthcheck:
+       test: ["CMD", "curl", "-f", "http://localhost:7860/"]
+       interval: 30s
+       timeout: 10s
+       retries: 3
+       start_period: 40s
environment.yml ADDED
@@ -0,0 +1,44 @@
+ name: vad_diarization
+ channels:
+   - pytorch
+   - nvidia
+   - conda-forge
+   - defaults
+ dependencies:
+   - python=3.10
+   - pytorch>=2.0.0
+   - torchvision>=0.15.0
+   - torchaudio>=2.0.0
+   - pytorch-cuda=12.1
+   - ffmpeg
+   - pip
+   - pip:
+       # VAD
+       - silero-vad>=5.0.0
+
+       # Speaker Diarization
+       - pyannote.audio>=3.1.0
+       - pyannote.core>=5.0.0
+       - pyannote.metrics>=3.2.0
+
+       # Audio processing
+       - librosa>=0.10.0
+       - soundfile>=0.12.0
+       - numpy>=1.24.0
+
+       # Web interface
+       - gradio>=4.0.0
+
+       # Visualization
+       - matplotlib>=3.7.0
+
+       # Utilities
+       - tqdm>=4.65.0
+       - pyyaml>=6.0
+
+       # Testing
+       - pytest>=7.0.0
+       - pytest-cov>=4.0.0
+
+       # System utilities
+       - psutil>=5.9.0
notebooks/demo.ipynb ADDED
@@ -0,0 +1,312 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Real-Time VAD + Speaker Diarization Demo\n",
+     "\n",
+     "This notebook demonstrates the complete pipeline for voice activity detection and speaker diarization."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Setup"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "sys.path.insert(0, '..')\n",
+     "\n",
+     "import numpy as np\n",
+     "import matplotlib.pyplot as plt\n",
+     "from pathlib import Path\n",
+     "import os\n",
+     "\n",
+     "from src.vad import SileroVAD\n",
+     "from src.diarization import SpeakerDiarization\n",
+     "from src.pipeline import VADDiarizationPipeline\n",
+     "from src.utils import create_test_audio, visualize_timeline\n",
+     "\n",
+     "print(\"✅ Imports successful\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 1. Voice Activity Detection (VAD)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Initialize VAD\n",
+     "vad = SileroVAD(threshold=0.5)\n",
+     "\n",
+     "# Benchmark latency\n",
+     "print(\"Benchmarking VAD latency...\")\n",
+     "metrics = vad.benchmark_latency(duration_seconds=10.0)\n",
+     "\n",
+     "print(f\"\\nVAD Performance:\")\n",
+     "print(f\"  Total processing time: {metrics['total_processing_time_ms']:.2f}ms\")\n",
+     "print(f\"  Audio duration: {metrics['audio_duration_s']:.1f}s\")\n",
+     "print(f\"  Latency per second: {metrics['latency_per_second_ms']:.2f}ms\")\n",
+     "print(f\"  Real-time factor: {metrics['real_time_factor']:.4f}x\")\n",
+     "\n",
+     "if metrics['latency_per_second_ms'] < 100:\n",
+     "    print(\"\\n✅ Target latency achieved (<100ms)\")\n",
+     "else:\n",
+     "    print(\"\\n⚠️ Latency above target\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 2. Create Test Audio"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Create synthetic test audio\n",
+     "test_audio_path = create_test_audio(\"test_audio.wav\", duration=10.0)\n",
+     "print(f\"✅ Created test audio: {test_audio_path}\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 3. Process with VAD"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Process test audio\n",
+     "timestamps, processing_time = vad.process_file(test_audio_path)\n",
+     "\n",
+     "print(f\"\\nVAD Results:\")\n",
+     "print(f\"  Found {len(timestamps)} speech segments\")\n",
+     "print(f\"  Processing time: {processing_time:.2f}ms\")\n",
+     "print(f\"\\nSegments:\")\n",
+     "for i, ts in enumerate(timestamps, 1):\n",
+     "    print(f\"  {i}. {ts['start']:.2f}s - {ts['end']:.2f}s ({ts['end']-ts['start']:.2f}s)\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 4. Full Pipeline (VAD + Diarization)\n",
+     "\n",
+     "**Note:** This requires a Hugging Face token. Set it with:\n",
+     "```python\n",
+     "os.environ['HF_TOKEN'] = 'your_token_here'\n",
+     "```"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Check for HF token\n",
+     "HF_TOKEN = os.environ.get('HF_TOKEN')\n",
+     "\n",
+     "if not HF_TOKEN:\n",
+     "    print(\"⚠️ No HF_TOKEN found. Set it to run full pipeline:\")\n",
+     "    print(\"  os.environ['HF_TOKEN'] = 'your_token_here'\")\n",
+     "else:\n",
+     "    print(\"✅ HF_TOKEN found, initializing full pipeline...\")\n",
+     "\n",
+     "    try:\n",
+     "        # Initialize pipeline\n",
+     "        pipeline = VADDiarizationPipeline(\n",
+     "            use_auth_token=HF_TOKEN,\n",
+     "            vad_threshold=0.5\n",
+     "        )\n",
+     "\n",
+     "        print(\"\\n✅ Pipeline initialized successfully\")\n",
+     "\n",
+     "    except Exception as e:\n",
+     "        print(f\"\\n❌ Error initializing pipeline: {e}\")\n",
+     "        print(\"\\nMake sure you have:\")\n",
+     "        print(\"1. Valid HF token\")\n",
+     "        print(\"2. Accepted model conditions at:\")\n",
+     "        print(\"   https://huggingface.co/pyannote/speaker-diarization-3.1\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 5. Process Audio with Full Pipeline"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Only run if pipeline is initialized\n",
+     "if 'pipeline' in locals():\n",
+     "    # Process audio file\n",
+     "    result = pipeline.process_file(test_audio_path)\n",
+     "\n",
+     "    # Display results\n",
+     "    print(\"\\n\" + \"=\"*60)\n",
+     "    print(\"RESULTS\")\n",
+     "    print(\"=\"*60)\n",
+     "    print(f\"\\nSpeakers detected: {result['metadata']['num_speakers']}\")\n",
+     "    print(f\"Speaker segments: {result['metadata']['num_segments']}\")\n",
+     "    print(f\"Total speech time: {result['metadata']['total_speech_time']:.2f}s\")\n",
+     "\n",
+     "    print(f\"\\nProcessing time:\")\n",
+     "    print(f\"  VAD: {result['processing_time']['vad_ms']:.2f}ms\")\n",
+     "    print(f\"  Diarization: {result['processing_time']['diarization_ms']:.2f}ms\")\n",
+     "    print(f\"  Total: {result['processing_time']['total_ms']:.2f}ms\")\n",
+     "\n",
+     "    print(f\"\\nSpeaker Timeline:\")\n",
+     "    for seg in result['speaker_segments']:\n",
+     "        print(f\"  {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}\")\n",
+     "else:\n",
+     "    print(\"⚠️ Pipeline not initialized. Set HF_TOKEN to run full pipeline.\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 6. Visualize Results"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "if 'result' in locals():\n",
+     "    # ASCII timeline\n",
+     "    timeline = visualize_timeline(result['speaker_segments'])\n",
+     "    print(timeline)\n",
+     "\n",
+     "    # Plot timeline\n",
+     "    fig, ax = plt.subplots(figsize=(12, 4))\n",
+     "\n",
+     "    speakers = sorted(set(seg['speaker'] for seg in result['speaker_segments']))\n",
+     "    colors = plt.cm.Set3(np.linspace(0, 1, len(speakers)))\n",
+     "    speaker_colors = {speaker: colors[i] for i, speaker in enumerate(speakers)}\n",
+     "\n",
+     "    for seg in result['speaker_segments']:\n",
+     "        color = speaker_colors[seg['speaker']]\n",
+     "        ax.barh(0, seg['duration'], left=seg['start'], height=0.8,\n",
+     "                color=color, edgecolor='black', linewidth=0.5)\n",
+     "\n",
+     "    ax.set_xlabel('Time (seconds)', fontsize=12)\n",
+     "    ax.set_yticks([])\n",
+     "    ax.set_title('Speaker Timeline', fontsize=14, fontweight='bold')\n",
+     "    ax.grid(True, axis='x', alpha=0.3)\n",
+     "\n",
+     "    # Legend\n",
+     "    from matplotlib.patches import Patch\n",
+     "    legend_patches = [Patch(color=speaker_colors[s], label=s) for s in speakers]\n",
+     "    ax.legend(handles=legend_patches, loc='upper right')\n",
+     "\n",
+     "    plt.tight_layout()\n",
+     "    plt.show()\n",
+     "else:\n",
+     "    print(\"⚠️ No results to visualize\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 7. Export Results"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "if 'result' in locals():\n",
+     "    # Export as JSON\n",
+     "    pipeline.save_results(result, 'output.json', format='json')\n",
+     "\n",
+     "    # Export as RTTM\n",
+     "    pipeline.save_results(result, 'output.rttm', format='rttm')\n",
+     "\n",
+     "    # Export as text\n",
+     "    pipeline.save_results(result, 'output.txt', format='text')\n",
+     "\n",
+     "    print(\"✅ Results exported in multiple formats\")\n",
+     "else:\n",
+     "    print(\"⚠️ No results to export\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Summary\n",
+     "\n",
+     "This notebook demonstrated:\n",
+     "1. ✅ VAD with <100ms latency\n",
+     "2. ✅ Speaker diarization with state-of-the-art accuracy\n",
+     "3. ✅ Integrated pipeline processing\n",
+     "4. ✅ Visualization and export\n",
+     "\n",
+     "Next steps:\n",
+     "- Test on real audio files\n",
+     "- Benchmark on FEARLESS STEPS dataset\n",
+     "- Deploy with Gradio interface\n",
+     "- Containerize with Docker"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.0"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+ }
requirements.txt ADDED
@@ -0,0 +1,33 @@
+ # Core dependencies
+ torch>=2.0.0
+ torchaudio>=2.0.0
+
+ # VAD
+ silero-vad>=5.0.0
+
+ # Speaker Diarization
+ pyannote.audio>=3.1.0
+ pyannote.core>=5.0.0
+ pyannote.metrics>=3.2.0
+
+ # Audio processing
+ librosa>=0.10.0
+ soundfile>=0.12.0
+ numpy>=1.24.0
+
+ # Web interface
+ gradio>=4.0.0
+
+ # Visualization
+ matplotlib>=3.7.0
+
+ # Utilities
+ tqdm>=4.65.0
+ pyyaml>=6.0
+
+ # Testing
+ pytest>=7.0.0
+ pytest-cov>=4.0.0
+
+ # System utilities
+ psutil>=5.9.0
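After `pip install -r requirements.txt`, a quick import check catches most installation problems early (a lighter alternative to the repository's `verify_installation.py`):

```python
# Minimal sanity check that the core dependencies import and CUDA is visible.
import torch
import torchaudio
import pyannote.audio
import gradio

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("torchaudio", torchaudio.__version__, "| gradio", gradio.__version__)
```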
run_app.sh ADDED
@@ -0,0 +1,42 @@
+ #!/bin/bash
+ # Quick launcher for Gradio app with HF token
+
+ echo "=========================================="
+ echo "VAD + Speaker Diarization - Gradio App"
+ echo "=========================================="
+
+ # Check if .env file exists
+ if [ -f ".env" ]; then
+     echo "✓ Found .env file"
+     # Load token from .env
+     export HF_TOKEN=$(grep HF_TOKEN .env | cut -d '=' -f2)
+ fi
+
+ # Check if token is set
+ if [ -z "$HF_TOKEN" ]; then
+     echo ""
+     echo "❌ HF_TOKEN not set!"
+     echo ""
+     echo "Please set your Hugging Face token:"
+     echo "  export HF_TOKEN='your_token_here'"
+     echo ""
+     echo "Or create a .env file with:"
+     echo "  HF_TOKEN=your_token_here"
+     echo ""
+     echo "Get your token at: https://huggingface.co/settings/tokens"
+     echo "Accept model at: https://huggingface.co/pyannote/speaker-diarization-3.1"
+     echo ""
+     exit 1
+ fi
+
+ echo "✓ HF_TOKEN is set"
+ echo ""
+ echo "Starting Gradio app..."
+ echo "Open browser to: http://localhost:7860"
+ echo ""
+ echo "Press Ctrl+C to stop"
+ echo "=========================================="
+ echo ""
+
+ # Run the app
+ python app.py
setup.sh ADDED
@@ -0,0 +1,106 @@
+ #!/bin/bash
+ # Quick setup script for VAD + Speaker Diarization
+
+ set -e
+
+ echo "=========================================="
+ echo "VAD + Speaker Diarization Setup"
+ echo "=========================================="
+
+ # Check if conda environment is active
+ if [[ -n "$CONDA_DEFAULT_ENV" ]]; then
+     echo -e "\n✓ Conda environment detected: $CONDA_DEFAULT_ENV"
+     USE_CONDA=true
+ else
+     echo -e "\n⚠️ No conda environment detected"
+     USE_CONDA=false
+ fi
+
+ # Check Python version
+ echo -e "\n[1/6] Checking Python version..."
+ python_version=$(python --version 2>&1 | awk '{print $2}')
+ echo "Found Python $python_version"
+
+ if ! python -c "import sys; assert sys.version_info >= (3, 10)" 2>/dev/null; then
+     echo "❌ Error: Python 3.10+ required"
+     exit 1
+ fi
+ echo "✓ Python version OK"
+
+ # Check CUDA (optional)
+ echo -e "\n[2/6] Checking CUDA..."
+ if command -v nvidia-smi &> /dev/null; then
+     cuda_version=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}')
+     echo "✓ CUDA $cuda_version detected"
+     USE_CUDA=true
+ else
+     echo "⚠️ No CUDA detected, will use CPU"
+     USE_CUDA=false
+ fi
+
+ # Check FFmpeg
+ echo -e "\n[3/6] Checking FFmpeg..."
+ if command -v ffmpeg &> /dev/null; then
+     echo "✓ FFmpeg installed"
+ else
+     echo "⚠️ FFmpeg not found"
+     echo "Install with: sudo apt-get install ffmpeg"
+ fi
+
+ # Setup environment
+ echo -e "\n[4/6] Setting up Python environment..."
+ if [ "$USE_CONDA" = true ]; then
+     echo "✓ Using conda environment: $CONDA_DEFAULT_ENV"
+ else
+     # Create virtual environment
+     if [ ! -d "venv" ]; then
+         python -m venv venv
+         echo "✓ Virtual environment created"
+     else
+         echo "✓ Virtual environment already exists"
+     fi
+     # Activate virtual environment
+     source venv/bin/activate
+ fi
+
+ # Install PyTorch
+ echo -e "\n[5/6] Installing PyTorch..."
+ if [ "$USE_CUDA" = true ]; then
+     echo "Installing PyTorch with CUDA support..."
+     pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+ else
+     echo "Installing PyTorch (CPU only)..."
+     pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ fi
+
+ # Install dependencies
+ echo -e "\n[6/6] Installing dependencies..."
+ pip install -r requirements.txt
+
+ # Create directories
+ mkdir -p data outputs benchmarks
+
+ # Setup environment file
+ if [ ! -f ".env" ]; then
+     cp .env.example .env
+     echo "✓ Created .env file"
+     echo "⚠️ Please edit .env and add your HF_TOKEN"
+ fi
+
+ echo -e "\n=========================================="
+ echo "✅ Setup complete!"
+ echo "=========================================="
+ echo -e "\nNext steps:"
+ if [ "$USE_CONDA" = true ]; then
+     echo "1. Environment already active: $CONDA_DEFAULT_ENV ✓"
+ else
+     echo "1. Activate environment: source venv/bin/activate"
+ fi
+ echo "2. Set HF token: export HF_TOKEN='your_token_here'"
+ echo "   Get token at: https://huggingface.co/settings/tokens"
+ echo "3. Accept model conditions at:"
+ echo "   https://huggingface.co/pyannote/speaker-diarization-3.1"
+ echo "4. Run demo: python vad_diarization.py"
+ echo "5. Run Gradio app: python app.py"
+ echo -e "\nFor more info, see README.md"
+ echo "=========================================="
src/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ Real-Time VAD + Speaker Diarization System
3
+ Production-ready pipeline for voice activity detection and speaker identification
4
+ """
5
+
6
+ from .vad import SileroVAD
7
+ from .diarization import SpeakerDiarization
8
+ from .pipeline import VADDiarizationPipeline
9
+ from . import utils
10
+
11
+ __version__ = "1.0.0"
12
+ __all__ = [
13
+ 'SileroVAD',
14
+ 'SpeakerDiarization',
15
+ 'VADDiarizationPipeline',
16
+ 'utils'
17
+ ]
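For context, a minimal sketch of how these package-level exports are meant to be consumed. This assumes the repository root is on `sys.path`, `HF_TOKEN` is exported, and `my_meeting.wav` is a placeholder input file:

```python
import os

from src import SileroVAD, VADDiarizationPipeline

# VAD alone needs no authentication
vad = SileroVAD(threshold=0.5)

# The full pipeline also needs a Hugging Face token for the gated pyannote model
pipeline = VADDiarizationPipeline(token=os.environ["HF_TOKEN"])
result = pipeline.process_file("my_meeting.wav")  # placeholder audio path
print(result["metadata"]["num_speakers"])
```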
src/diarization.py ADDED
@@ -0,0 +1,322 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pyannote Speaker Diarization Wrapper
4
+ Optimized for accuracy and performance
5
+ """
6
+
7
+ import torch
8
+ import numpy as np
9
+ from typing import List, Dict, Optional, Tuple
10
+ import time
11
+ from pathlib import Path
12
+
13
+
14
+ class SpeakerDiarization:
15
+ """
16
+ Production-ready Pyannote speaker diarization wrapper.
17
+
18
+ Features:
19
+ - State-of-the-art speaker diarization
20
+ - GPU acceleration support
21
+ - Configurable parameters for accuracy/speed tradeoff
22
+ - Overlap detection
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ model_name: str = "pyannote/speaker-diarization-3.1",
28
+ use_auth_token: Optional[str] = None,
29
+ token: Optional[str] = None,
30
+ device: Optional[str] = None,
31
+ num_speakers: Optional[int] = None,
32
+ min_speakers: Optional[int] = None,
33
+ max_speakers: Optional[int] = None
34
+ ):
35
+ """
36
+ Initialize speaker diarization pipeline.
37
+
38
+ Args:
39
+ model_name: Hugging Face model name
40
+ use_auth_token: (Deprecated) Hugging Face authentication token
41
+ token: Hugging Face authentication token (new parameter name)
42
+ device: Device to use ('cuda' or 'cpu')
43
+ num_speakers: Fixed number of speakers (if known)
44
+ min_speakers: Minimum number of speakers
45
+ max_speakers: Maximum number of speakers
46
+ """
47
+ self.model_name = model_name
48
+ self.num_speakers = num_speakers
49
+ self.min_speakers = min_speakers
50
+ self.max_speakers = max_speakers
51
+
52
+ # Handle both old and new parameter names
53
+ auth_token = token or use_auth_token
54
+
55
+ # Set device
56
+ if device is None:
57
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
58
+ else:
59
+ self.device = torch.device(device)
60
+
61
+ # Load pipeline
62
+ self.pipeline = self._load_pipeline(auth_token)
63
+
64
+ print(f"βœ“ Speaker diarization initialized on {self.device}")
65
+
66
+ def _load_pipeline(self, auth_token: Optional[str]):
67
+ """Load Pyannote diarization pipeline."""
68
+ from pyannote.audio import Pipeline
69
+
70
+ try:
71
+ # Use 'token' parameter for pyannote.audio 4.0+
72
+ pipeline = Pipeline.from_pretrained(
73
+ self.model_name,
74
+ token=auth_token
75
+ )
76
+
77
+ # Move to device
78
+ pipeline.to(self.device)
79
+
80
+ return pipeline
81
+ except Exception as e:
82
+ print(f"❌ Error loading pipeline: {e}")
83
+ print("Make sure you have:")
84
+ print("1. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1")
85
+ print("2. Valid HF token from https://huggingface.co/settings/tokens")
86
+ raise
87
+
88
+ def process_file(
89
+ self,
90
+ audio_path: str,
91
+ num_speakers: Optional[int] = None,
92
+ min_speakers: Optional[int] = None,
93
+ max_speakers: Optional[int] = None
94
+ ) -> Tuple[List[Dict], float, Dict]:
95
+ """
96
+ Process an audio file and return speaker segments.
97
+
98
+ Args:
99
+ audio_path: Path to audio file
100
+ num_speakers: Override number of speakers
101
+ min_speakers: Override minimum speakers
102
+ max_speakers: Override maximum speakers
103
+
104
+ Returns:
105
+ Tuple of (segments, processing_time_ms, metadata)
106
+ """
107
+ # Use instance defaults if not provided
108
+ num_speakers = num_speakers or self.num_speakers
109
+ min_speakers = min_speakers or self.min_speakers
110
+ max_speakers = max_speakers or self.max_speakers
111
+
112
+ # Prepare parameters
113
+ params = {}
114
+ if num_speakers is not None:
115
+ params['num_speakers'] = num_speakers
116
+ if min_speakers is not None:
117
+ params['min_speakers'] = min_speakers
118
+ if max_speakers is not None:
119
+ params['max_speakers'] = max_speakers
120
+
121
+ # Process
122
+ start_time = time.time()
123
+ diarization = self.pipeline(audio_path, **params)
124
+ processing_time = (time.time() - start_time) * 1000 # Convert to ms
125
+
126
+ # Extract segments
127
+ segments = []
128
+ speakers = set()
129
+
130
+ # Handle different output formats from pyannote.audio
131
+ # Version 4.0+ returns DiarizeOutput, earlier versions return Annotation
132
+ if hasattr(diarization, 'speaker_diarization'):
133
+ # pyannote.audio 4.0+ format - DiarizeOutput object
134
+ annotation = diarization.speaker_diarization
135
+ elif hasattr(diarization, 'itertracks'):
136
+ # pyannote.audio 3.x format - Annotation object
137
+ annotation = diarization
138
+ else:
139
+ raise ValueError(f"Unknown diarization output format: {type(diarization)}")
140
+
141
+ # Extract segments from annotation
142
+ for turn, _, speaker in annotation.itertracks(yield_label=True):
143
+ segments.append({
144
+ 'start': turn.start,
145
+ 'end': turn.end,
146
+ 'speaker': speaker,
147
+ 'duration': turn.end - turn.start
148
+ })
149
+ speakers.add(speaker)
150
+
151
+ # Metadata
152
+ metadata = {
153
+ 'num_speakers': len(speakers),
154
+ 'total_speech_time': sum(seg['duration'] for seg in segments),
155
+ 'num_segments': len(segments)
156
+ }
157
+
158
+ return segments, processing_time, metadata
159
+
160
+ def process_with_vad_segments(
161
+ self,
162
+ audio_path: str,
163
+ vad_segments: List[Dict],
164
+ **kwargs
165
+ ) -> List[Dict]:
166
+ """
167
+ Process audio using VAD segments to optimize diarization.
168
+
169
+ Args:
170
+ audio_path: Path to audio file
171
+ vad_segments: List of VAD segments with 'start' and 'end'
172
+ **kwargs: Additional parameters for diarization
173
+
174
+ Returns:
175
+ List of speaker segments
176
+ """
177
+ # For now, process full file
178
+ # TODO: Implement segment-wise processing for optimization
179
+ segments, _, _ = self.process_file(audio_path, **kwargs)
180
+
181
+ # Filter segments to only include VAD regions
182
+ filtered_segments = []
183
+ for seg in segments:
184
+ # Check if segment overlaps with any VAD segment
185
+ for vad_seg in vad_segments:
186
+ vad_start = vad_seg['start']
187
+ vad_end = vad_seg['end']
188
+
189
+ # Check overlap
190
+ if seg['start'] < vad_end and seg['end'] > vad_start:
191
+ filtered_segments.append(seg)
192
+ break
193
+
194
+ return filtered_segments
195
+
196
+ def get_speaker_statistics(self, segments: List[Dict]) -> Dict:
197
+ """
198
+ Calculate speaker statistics from segments.
199
+
200
+ Args:
201
+ segments: List of speaker segments
202
+
203
+ Returns:
204
+ Dict with per-speaker statistics
205
+ """
206
+ stats = {}
207
+
208
+ for seg in segments:
209
+ speaker = seg['speaker']
210
+ if speaker not in stats:
211
+ stats[speaker] = {
212
+ 'total_time': 0.0,
213
+ 'num_segments': 0,
214
+ 'avg_segment_duration': 0.0
215
+ }
216
+
217
+ stats[speaker]['total_time'] += seg['duration']
218
+ stats[speaker]['num_segments'] += 1
219
+
220
+ # Calculate averages
221
+ for speaker in stats:
222
+ stats[speaker]['avg_segment_duration'] = (
223
+ stats[speaker]['total_time'] / stats[speaker]['num_segments']
224
+ )
225
+
226
+ return stats
227
+
228
+ def format_timeline(self, segments: List[Dict]) -> str:
229
+ """
230
+ Format segments as a readable timeline.
231
+
232
+ Args:
233
+ segments: List of speaker segments
234
+
235
+ Returns:
236
+ Formatted timeline string
237
+ """
238
+ lines = ["Speaker Timeline:", "=" * 50]
239
+
240
+ for seg in segments:
241
+ line = f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']} ({seg['duration']:.2f}s)"
242
+ lines.append(line)
243
+
244
+ return "\n".join(lines)
245
+
246
+ def calculate_der(
247
+ self,
248
+ predicted_segments: List[Dict],
249
+ reference_segments: List[Dict],
250
+ collar: float = 0.25
251
+ ) -> float:
252
+ """
253
+ Calculate Diarization Error Rate (DER).
254
+
255
+ Args:
256
+ predicted_segments: Predicted speaker segments
257
+ reference_segments: Ground truth segments
258
+ collar: Collar size in seconds for forgiveness
259
+
260
+ Returns:
261
+ DER value (0.0-1.0)
262
+ """
263
+ # This is a simplified DER calculation
264
+ # For production, use pyannote.metrics
265
+ try:
266
+ from pyannote.metrics.diarization import DiarizationErrorRate
267
+ from pyannote.core import Annotation, Segment
268
+
269
+ # Convert to pyannote format
270
+ reference = Annotation()
271
+ for seg in reference_segments:
272
+ reference[Segment(seg['start'], seg['end'])] = seg['speaker']
273
+
274
+ hypothesis = Annotation()
275
+ for seg in predicted_segments:
276
+ hypothesis[Segment(seg['start'], seg['end'])] = seg['speaker']
277
+
278
+ # Calculate DER
279
+ metric = DiarizationErrorRate(collar=collar)
280
+ der = metric(reference, hypothesis)
281
+
282
+ return der
283
+ except ImportError:
284
+ print("⚠️ pyannote.metrics not available, skipping DER calculation")
285
+ return -1.0
286
+
287
+
288
+ def demo():
289
+ """Demo diarization functionality."""
290
+ print("\n" + "="*60)
291
+ print("SPEAKER DIARIZATION DEMO")
292
+ print("="*60)
293
+
294
+ print("\n⚠️ This demo requires:")
295
+ print("1. Hugging Face account")
296
+ print("2. Accepted model conditions at:")
297
+ print(" https://huggingface.co/pyannote/speaker-diarization-3.1")
298
+ print("3. Valid HF token from:")
299
+ print(" https://huggingface.co/settings/tokens")
300
+
301
+ # Check for token
302
+ import os
303
+ token = os.environ.get('HF_TOKEN')
304
+
305
+ if not token:
306
+ print("\n❌ No HF_TOKEN found in environment")
307
+ print("Set it with: export HF_TOKEN='your_token_here'")
308
+ return
309
+
310
+ try:
311
+ # Initialize
312
+ diarization = SpeakerDiarization(token=token)
313
+ print("\nβœ… Diarization pipeline loaded successfully")
314
+
315
+ except Exception as e:
316
+ print(f"\n❌ Failed to load pipeline: {e}")
317
+
318
+ print("\n" + "="*60)
319
+
320
+
321
+ if __name__ == "__main__":
322
+ demo()
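A minimal usage sketch for the wrapper above, assuming `HF_TOKEN` is set, the model conditions were accepted on the Hub, and `interview.wav` is a placeholder recording:

```python
import os

from src.diarization import SpeakerDiarization

diar = SpeakerDiarization(token=os.environ["HF_TOKEN"], max_speakers=4)

# process_file returns (segments, processing_time_ms, metadata)
segments, elapsed_ms, meta = diar.process_file("interview.wav")  # placeholder path
print(f"{meta['num_speakers']} speakers in {elapsed_ms:.0f}ms")
print(diar.format_timeline(segments))
```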
src/pipeline.py ADDED
@@ -0,0 +1,353 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Integrated VAD + Speaker Diarization Pipeline
4
+ Real-time processing with optimized performance
5
+ """
6
+
7
+ import torch
8
+ import numpy as np
9
+ from typing import List, Dict, Optional, Tuple, Union
10
+ import time
11
+ from pathlib import Path
12
+ import json
13
+
14
+ from .vad import SileroVAD
15
+ from .diarization import SpeakerDiarization
16
+
17
+
18
+ class VADDiarizationPipeline:
19
+ """
20
+ Integrated pipeline combining VAD and speaker diarization.
21
+
22
+ Features:
23
+ - Two-stage processing: VAD first, then diarization
24
+ - Optimized for real-time performance
25
+ - Configurable parameters
26
+ - Comprehensive output format
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ vad_threshold: float = 0.5,
32
+ use_auth_token: Optional[str] = None,
33
+ token: Optional[str] = None,
34
+ device: Optional[str] = None,
35
+ num_speakers: Optional[int] = None,
36
+ min_speakers: Optional[int] = None,
37
+ max_speakers: Optional[int] = None,
38
+ use_onnx_vad: bool = False
39
+ ):
40
+ """
41
+ Initialize the integrated pipeline.
42
+
43
+ Args:
44
+ vad_threshold: VAD sensitivity threshold
45
+ use_auth_token: (Deprecated) Hugging Face token for diarization
46
+ token: Hugging Face token for diarization (new parameter name)
47
+ device: Device to use ('cuda' or 'cpu')
48
+ num_speakers: Fixed number of speakers
49
+ min_speakers: Minimum number of speakers
50
+ max_speakers: Maximum number of speakers
51
+ use_onnx_vad: Use ONNX for VAD (faster)
52
+ """
53
+ print("\n" + "="*60)
54
+ print("INITIALIZING VAD + DIARIZATION PIPELINE")
55
+ print("="*60)
56
+
57
+ # Handle both old and new parameter names
58
+ auth_token = token or use_auth_token
59
+
60
+ # Initialize VAD
61
+ print("\n[1/2] Loading Voice Activity Detection...")
62
+ self.vad = SileroVAD(
63
+ threshold=vad_threshold,
64
+ use_onnx=use_onnx_vad
65
+ )
66
+
67
+ # Initialize Diarization
68
+ print("\n[2/2] Loading Speaker Diarization...")
69
+ self.diarization = SpeakerDiarization(
70
+ token=auth_token,
71
+ device=device,
72
+ num_speakers=num_speakers,
73
+ min_speakers=min_speakers,
74
+ max_speakers=max_speakers
75
+ )
76
+
77
+ print("\n" + "="*60)
78
+ print("βœ… PIPELINE READY")
79
+ print("="*60 + "\n")
80
+
81
+ def process_file(
82
+ self,
83
+ audio_path: str,
84
+ num_speakers: Optional[int] = None,
85
+ return_vad: bool = True,
86
+ return_stats: bool = True
87
+ ) -> Dict:
88
+ """
89
+ Process an audio file through the complete pipeline.
90
+
91
+ Args:
92
+ audio_path: Path to audio file
93
+ num_speakers: Number of speakers (if known)
94
+ return_vad: Include VAD segments in output
95
+ return_stats: Include statistics in output
96
+
97
+ Returns:
98
+ Dict with results and metadata
99
+ """
100
+ print(f"\nπŸ“ Processing: {audio_path}")
101
+ print("-" * 60)
102
+
103
+ total_start = time.time()
104
+
105
+ # Stage 1: VAD
106
+ print("Stage 1: Voice Activity Detection...")
107
+ vad_start = time.time()
108
+ vad_segments, vad_time = self.vad.process_file(audio_path)
109
+ vad_duration = (time.time() - vad_start) * 1000
110
+
111
+ print(f" βœ“ Found {len(vad_segments)} speech segments")
112
+ print(f" βœ“ Processing time: {vad_duration:.2f}ms")
113
+
114
+ # Stage 2: Diarization
115
+ print("\nStage 2: Speaker Diarization...")
116
+ diar_start = time.time()
117
+ speaker_segments, diar_time, diar_metadata = self.diarization.process_file(
118
+ audio_path,
119
+ num_speakers=num_speakers
120
+ )
121
+ diar_duration = (time.time() - diar_start) * 1000
122
+
123
+ print(f" βœ“ Identified {diar_metadata['num_speakers']} speakers")
124
+ print(f" βœ“ Found {diar_metadata['num_segments']} speaker segments")
125
+ print(f" βœ“ Processing time: {diar_duration:.2f}ms")
126
+
127
+ # Calculate total time
128
+ total_duration = (time.time() - total_start) * 1000
129
+
130
+ print(f"\n⏱️ Total processing time: {total_duration:.2f}ms")
131
+ print("-" * 60)
132
+
133
+ # Build result
134
+ result = {
135
+ 'audio_path': audio_path,
136
+ 'speaker_segments': speaker_segments,
137
+ 'processing_time': {
138
+ 'vad_ms': vad_duration,
139
+ 'diarization_ms': diar_duration,
140
+ 'total_ms': total_duration
141
+ },
142
+ 'metadata': diar_metadata
143
+ }
144
+
145
+ if return_vad:
146
+ result['vad_segments'] = vad_segments
147
+
148
+ if return_stats:
149
+ result['speaker_statistics'] = self.diarization.get_speaker_statistics(
150
+ speaker_segments
151
+ )
152
+
153
+ return result
154
+
155
+ def process_batch(
156
+ self,
157
+ audio_paths: List[str],
158
+ **kwargs
159
+ ) -> List[Dict]:
160
+ """
161
+ Process multiple audio files.
162
+
163
+ Args:
164
+ audio_paths: List of audio file paths
165
+ **kwargs: Additional arguments for process_file
166
+
167
+ Returns:
168
+ List of results
169
+ """
170
+ results = []
171
+
172
+ print(f"\nπŸ“¦ Batch processing {len(audio_paths)} files...")
173
+ print("="*60)
174
+
175
+ for i, path in enumerate(audio_paths, 1):
176
+ print(f"\n[{i}/{len(audio_paths)}]")
177
+ result = self.process_file(path, **kwargs)
178
+ results.append(result)
179
+
180
+ print("\n" + "="*60)
181
+ print(f"βœ… Batch processing complete ({len(results)} files)")
182
+ print("="*60 + "\n")
183
+
184
+ return results
185
+
186
+ def format_output(self, result: Dict, format: str = 'text') -> str:
187
+ """
188
+ Format pipeline output.
189
+
190
+ Args:
191
+ result: Result from process_file
192
+ format: Output format ('text', 'json', 'rttm')
193
+
194
+ Returns:
195
+ Formatted string
196
+ """
197
+ if format == 'json':
198
+ return json.dumps(result, indent=2)
199
+
200
+ elif format == 'rttm':
201
+ # RTTM format for NIST evaluation
202
+ lines = []
203
+ for seg in result['speaker_segments']:
204
+ # RTTM format: SPEAKER file 1 start duration <NA> <NA> speaker <NA> <NA>
205
+ line = f"SPEAKER {Path(result['audio_path']).stem} 1 {seg['start']:.3f} {seg['duration']:.3f} <NA> <NA> {seg['speaker']} <NA> <NA>"
206
+ lines.append(line)
207
+ return "\n".join(lines)
208
+
209
+ else: # text
210
+ lines = []
211
+ lines.append("="*60)
212
+ lines.append("VAD + SPEAKER DIARIZATION RESULTS")
213
+ lines.append("="*60)
214
+ lines.append(f"\nFile: {result['audio_path']}")
215
+
216
+ # Metadata
217
+ lines.append(f"\nMetadata:")
218
+ lines.append(f" Speakers: {result['metadata']['num_speakers']}")
219
+ lines.append(f" Segments: {result['metadata']['num_segments']}")
220
+ lines.append(f" Total speech: {result['metadata']['total_speech_time']:.2f}s")
221
+
222
+ # Processing time
223
+ lines.append(f"\nProcessing Time:")
224
+ lines.append(f" VAD: {result['processing_time']['vad_ms']:.2f}ms")
225
+ lines.append(f" Diarization: {result['processing_time']['diarization_ms']:.2f}ms")
226
+ lines.append(f" Total: {result['processing_time']['total_ms']:.2f}ms")
227
+
228
+ # Speaker statistics
229
+ if 'speaker_statistics' in result:
230
+ lines.append(f"\nSpeaker Statistics:")
231
+ for speaker, stats in result['speaker_statistics'].items():
232
+ lines.append(f" {speaker}:")
233
+ lines.append(f" Total time: {stats['total_time']:.2f}s")
234
+ lines.append(f" Segments: {stats['num_segments']}")
235
+ lines.append(f" Avg duration: {stats['avg_segment_duration']:.2f}s")
236
+
237
+ # Timeline
238
+ lines.append(f"\nSpeaker Timeline:")
239
+ lines.append("-"*60)
240
+ for seg in result['speaker_segments']:
241
+ lines.append(f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']}")
242
+
243
+ lines.append("="*60)
244
+
245
+ return "\n".join(lines)
246
+
247
+ def save_results(
248
+ self,
249
+ result: Dict,
250
+ output_path: str,
251
+ format: str = 'json'
252
+ ):
253
+ """
254
+ Save results to file.
255
+
256
+ Args:
257
+ result: Result from process_file
258
+ output_path: Output file path
259
+ format: Output format ('json', 'rttm', 'text')
260
+ """
261
+ output = self.format_output(result, format=format)
262
+
263
+ with open(output_path, 'w') as f:
264
+ f.write(output)
265
+
266
+ print(f"βœ“ Results saved to: {output_path}")
267
+
268
+ def benchmark(
269
+ self,
270
+ test_audio_path: Optional[str] = None,
271
+ duration_seconds: float = 10.0
272
+ ) -> Dict:
273
+ """
274
+ Benchmark pipeline performance.
275
+
276
+ Args:
277
+ test_audio_path: Path to test audio (optional)
278
+ duration_seconds: Duration for synthetic test
279
+
280
+ Returns:
281
+ Benchmark metrics
282
+ """
283
+ print("\n" + "="*60)
284
+ print("PIPELINE BENCHMARK")
285
+ print("="*60)
286
+
287
+ # VAD benchmark
288
+ print("\n[1/2] Benchmarking VAD...")
289
+ vad_metrics = self.vad.benchmark_latency(duration_seconds)
290
+ print(f" Latency: {vad_metrics['latency_per_second_ms']:.2f}ms per second")
291
+ print(f" Real-time factor: {vad_metrics['real_time_factor']:.4f}x")
292
+
293
+ if vad_metrics['latency_per_second_ms'] < 100:
294
+ print(" βœ… VAD latency target achieved (<100ms)")
295
+ else:
296
+ print(" ⚠️ VAD latency above target")
297
+
298
+ # Full pipeline benchmark (if test audio provided)
299
+ if test_audio_path:
300
+ print("\n[2/2] Benchmarking full pipeline...")
301
+ result = self.process_file(test_audio_path, return_stats=False)
302
+
303
+ print(f" Total time: {result['processing_time']['total_ms']:.2f}ms")
304
+
305
+ print("\n" + "="*60)
306
+
307
+ return {
308
+ 'vad_metrics': vad_metrics,
309
+ 'pipeline_metrics': result['processing_time'] if test_audio_path else None
310
+ }
311
+
312
+
313
+ def demo():
314
+ """Demo the integrated pipeline."""
315
+ print("\n" + "="*60)
316
+ print("INTEGRATED PIPELINE DEMO")
317
+ print("="*60)
318
+
319
+ import os
320
+
321
+ # Check for HF token
322
+ token = os.environ.get('HF_TOKEN')
323
+ if not token:
324
+ print("\n⚠️ No HF_TOKEN found in environment")
325
+ print("Set it with: export HF_TOKEN='your_token_here'")
326
+ print("\nFor now, will demo VAD only...")
327
+
328
+ # VAD-only demo
329
+ vad = SileroVAD()
330
+ metrics = vad.benchmark_latency()
331
+ print(f"\nβœ… VAD latency: {metrics['latency_per_second_ms']:.2f}ms per second")
332
+ return
333
+
334
+ try:
335
+ # Initialize pipeline
336
+ pipeline = VADDiarizationPipeline(
337
+ token=token,
338
+ vad_threshold=0.5
339
+ )
340
+
341
+ # Benchmark
342
+ pipeline.benchmark()
343
+
344
+ print("\nβœ… Pipeline demo complete!")
345
+
346
+ except Exception as e:
347
+ print(f"\n❌ Error: {e}")
348
+
349
+ print("\n" + "="*60)
350
+
351
+
352
+ if __name__ == "__main__":
353
+ demo()
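A short sketch of driving the pipeline end to end and persisting the result in each supported format; `call.wav` and the `outputs/` paths are placeholders, and `HF_TOKEN` is assumed to be set:

```python
import os
from pathlib import Path

from src.pipeline import VADDiarizationPipeline

pipeline = VADDiarizationPipeline(token=os.environ["HF_TOKEN"], vad_threshold=0.5)

result = pipeline.process_file("call.wav", num_speakers=2)  # placeholder recording

Path("outputs").mkdir(exist_ok=True)  # save_results expects an existing directory
pipeline.save_results(result, "outputs/call.json", format="json")
pipeline.save_results(result, "outputs/call.rttm", format="rttm")
print(pipeline.format_output(result, format="text"))
```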
src/utils.py ADDED
@@ -0,0 +1,389 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility functions for VAD + Diarization pipeline
4
+ """
5
+
6
+ import numpy as np
7
+ import torch
8
+ from typing import List, Dict, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+
12
+
13
+ def load_audio(
14
+ path: str,
15
+ sampling_rate: int = 16000,
16
+ mono: bool = True
17
+ ) -> Tuple[np.ndarray, int]:
18
+ """
19
+ Load audio file with automatic format detection.
20
+
21
+ Args:
22
+ path: Path to audio file
23
+ sampling_rate: Target sample rate
24
+ mono: Convert to mono
25
+
26
+ Returns:
27
+ Tuple of (audio_data, sample_rate)
28
+ """
29
+ try:
30
+ import librosa
31
+ audio, sr = librosa.load(path, sr=sampling_rate, mono=mono)
32
+ return audio, sr
33
+ except Exception as e:
34
+ print(f"Error loading audio with librosa: {e}")
35
+
36
+ # Fallback to soundfile
37
+ try:
38
+ import soundfile as sf
39
+ audio, sr = sf.read(path)
40
+
41
+ # Resample if needed
42
+ if sr != sampling_rate:
43
+ import librosa
44
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
45
+ sr = sampling_rate
46
+
47
+ # Convert to mono if needed
48
+ if mono and len(audio.shape) > 1:
49
+ audio = audio.mean(axis=1)
50
+
51
+ return audio, sr
52
+ except Exception as e:
53
+ print(f"Error loading audio with soundfile: {e}")
54
+ raise
55
+
56
+
57
+ def save_audio(
58
+ audio: np.ndarray,
59
+ path: str,
60
+ sampling_rate: int = 16000
61
+ ):
62
+ """
63
+ Save audio to file.
64
+
65
+ Args:
66
+ audio: Audio data
67
+ path: Output path
68
+ sampling_rate: Sample rate
69
+ """
70
+ import soundfile as sf
71
+ sf.write(path, audio, sampling_rate)
72
+
73
+
74
+ def merge_segments(
75
+ segments: List[Dict],
76
+ gap_threshold: float = 0.5
77
+ ) -> List[Dict]:
78
+ """
79
+ Merge nearby segments from the same speaker.
80
+
81
+ Args:
82
+ segments: List of segments with 'start', 'end', 'speaker'
83
+ gap_threshold: Maximum gap to merge (seconds)
84
+
85
+ Returns:
86
+ Merged segments
87
+ """
88
+ if not segments:
89
+ return []
90
+
91
+ # Sort by start time
92
+ sorted_segments = sorted(segments, key=lambda x: x['start'])
93
+
94
+ merged = [sorted_segments[0].copy()]
95
+
96
+ for seg in sorted_segments[1:]:
97
+ last = merged[-1]
98
+
99
+ # Check if same speaker and close enough
100
+ if (seg['speaker'] == last['speaker'] and
101
+ seg['start'] - last['end'] <= gap_threshold):
102
+ # Merge
103
+ last['end'] = seg['end']
104
+ last['duration'] = last['end'] - last['start']
105
+ else:
106
+ # Add new segment
107
+ merged.append(seg.copy())
108
+
109
+ return merged
110
+
111
+
112
+ def filter_short_segments(
113
+ segments: List[Dict],
114
+ min_duration: float = 0.5
115
+ ) -> List[Dict]:
116
+ """
117
+ Filter out segments shorter than threshold.
118
+
119
+ Args:
120
+ segments: List of segments
121
+ min_duration: Minimum duration (seconds)
122
+
123
+ Returns:
124
+ Filtered segments
125
+ """
126
+ return [seg for seg in segments if seg['duration'] >= min_duration]
127
+
128
+
129
+ def calculate_overlap(
130
+ seg1: Dict,
131
+ seg2: Dict
132
+ ) -> float:
133
+ """
134
+ Calculate overlap between two segments.
135
+
136
+ Args:
137
+ seg1: First segment with 'start' and 'end'
138
+ seg2: Second segment with 'start' and 'end'
139
+
140
+ Returns:
141
+ Overlap duration in seconds
142
+ """
143
+ start = max(seg1['start'], seg2['start'])
144
+ end = min(seg1['end'], seg2['end'])
145
+
146
+ return max(0, end - start)
147
+
148
+
149
+ def segment_to_rttm(
150
+ segments: List[Dict],
151
+ file_id: str = "audio"
152
+ ) -> str:
153
+ """
154
+ Convert segments to RTTM format.
155
+
156
+ Args:
157
+ segments: List of segments
158
+ file_id: File identifier
159
+
160
+ Returns:
161
+ RTTM formatted string
162
+ """
163
+ lines = []
164
+ for seg in segments:
165
+ # RTTM format: SPEAKER file 1 start duration <NA> <NA> speaker <NA> <NA>
166
+ line = f"SPEAKER {file_id} 1 {seg['start']:.3f} {seg['duration']:.3f} <NA> <NA> {seg['speaker']} <NA> <NA>"
167
+ lines.append(line)
168
+
169
+ return "\n".join(lines)
170
+
171
+
172
+ def rttm_to_segments(rttm_text: str) -> List[Dict]:
173
+ """
174
+ Parse RTTM format to segments.
175
+
176
+ Args:
177
+ rttm_text: RTTM formatted text
178
+
179
+ Returns:
180
+ List of segments
181
+ """
182
+ segments = []
183
+
184
+ for line in rttm_text.strip().split('\n'):
185
+ if not line.strip():
186
+ continue
187
+
188
+ parts = line.split()
189
+ if parts[0] != 'SPEAKER':
190
+ continue
191
+
192
+ start = float(parts[3])
193
+ duration = float(parts[4])
194
+ speaker = parts[7]
195
+
196
+ segments.append({
197
+ 'start': start,
198
+ 'end': start + duration,
199
+ 'duration': duration,
200
+ 'speaker': speaker
201
+ })
202
+
203
+ return segments
204
+
205
+
206
+ def visualize_timeline(
207
+ segments: List[Dict],
208
+ duration: Optional[float] = None,
209
+ width: int = 80
210
+ ) -> str:
211
+ """
212
+ Create ASCII visualization of speaker timeline.
213
+
214
+ Args:
215
+ segments: List of segments
216
+ duration: Total duration (auto-detect if None)
217
+ width: Width of visualization
218
+
219
+ Returns:
220
+ ASCII timeline string
221
+ """
222
+ if not segments:
223
+ return "No segments to visualize"
224
+
225
+ # Determine duration
226
+ if duration is None:
227
+ duration = max(seg['end'] for seg in segments)
228
+
229
+ # Get unique speakers
230
+ speakers = sorted(set(seg['speaker'] for seg in segments))
231
+ speaker_chars = {}
232
+ chars = ['β–ˆ', 'β–“', 'β–’', 'β–‘', '●', 'β—‹', 'β– ', 'β—†', 'β–ͺ', 'β–«']  # one distinct glyph per speaker
233
+ for i, speaker in enumerate(speakers):
234
+ speaker_chars[speaker] = chars[i % len(chars)]
235
+
236
+ # Create timeline
237
+ lines = []
238
+ lines.append(f"\nTimeline (0.00s - {duration:.2f}s):")
239
+ lines.append("─" * width)
240
+
241
+ # Time markers
242
+ time_line = ""
243
+ for i in range(width):
244
+ t = (i / width) * duration
245
+ if i % 10 == 0:
246
+ time_line += f"{t:.0f}s"
247
+ time_line += " " * (10 - len(f"{t:.0f}s"))
248
+ else:
249
+ time_line += " "
250
+ lines.append(time_line[:width])
251
+
252
+ # Speaker rows
253
+ for speaker in speakers:
254
+ row = [' '] * width
255
+
256
+ for seg in segments:
257
+ if seg['speaker'] == speaker:
258
+ start_pos = int((seg['start'] / duration) * width)
259
+ end_pos = int((seg['end'] / duration) * width)
260
+
261
+ for i in range(start_pos, min(end_pos, width)):
262
+ row[i] = speaker_chars[speaker]
263
+
264
+ lines.append(f"{speaker}: {''.join(row)}")
265
+
266
+ lines.append("─" * width)
267
+
268
+ return "\n".join(lines)
269
+
270
+
271
+ def export_results(
272
+ result: Dict,
273
+ output_dir: str,
274
+ formats: List[str] = ['json', 'rttm', 'txt']
275
+ ):
276
+ """
277
+ Export results in multiple formats.
278
+
279
+ Args:
280
+ result: Pipeline result
281
+ output_dir: Output directory
282
+ formats: List of formats to export
283
+ """
284
+ output_path = Path(output_dir)
285
+ output_path.mkdir(parents=True, exist_ok=True)
286
+
287
+ base_name = Path(result['audio_path']).stem
288
+
289
+ for fmt in formats:
290
+ if fmt == 'json':
291
+ # JSON format
292
+ json_path = output_path / f"{base_name}.json"
293
+ with open(json_path, 'w') as f:
294
+ json.dump(result, f, indent=2)
295
+ print(f"βœ“ Saved JSON: {json_path}")
296
+
297
+ elif fmt == 'rttm':
298
+ # RTTM format
299
+ rttm_path = output_path / f"{base_name}.rttm"
300
+ rttm_text = segment_to_rttm(result['speaker_segments'], base_name)
301
+ with open(rttm_path, 'w') as f:
302
+ f.write(rttm_text)
303
+ print(f"βœ“ Saved RTTM: {rttm_path}")
304
+
305
+ elif fmt == 'txt':
306
+ # Text format
307
+ txt_path = output_path / f"{base_name}.txt"
308
+
309
+ lines = []
310
+ lines.append("="*60)
311
+ lines.append("SPEAKER DIARIZATION RESULTS")
312
+ lines.append("="*60)
313
+ lines.append(f"\nFile: {result['audio_path']}")
314
+ lines.append(f"Speakers: {result['metadata']['num_speakers']}")
315
+ lines.append(f"Segments: {result['metadata']['num_segments']}")
316
+ lines.append(f"\nTimeline:")
317
+ lines.append("-"*60)
318
+
319
+ for seg in result['speaker_segments']:
320
+ lines.append(f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']}")
321
+
322
+ with open(txt_path, 'w') as f:
323
+ f.write("\n".join(lines))
324
+ print(f"βœ“ Saved TXT: {txt_path}")
325
+
326
+
327
+ def create_test_audio(
328
+ output_path: str = "test_audio.wav",
329
+ duration: float = 10.0,
330
+ sampling_rate: int = 16000
331
+ ) -> str:
332
+ """
333
+ Create synthetic test audio with speech-like patterns.
334
+
335
+ Args:
336
+ output_path: Output file path
337
+ duration: Duration in seconds
338
+ sampling_rate: Sample rate
339
+
340
+ Returns:
341
+ Path to created file
342
+ """
343
+ import soundfile as sf
344
+
345
+ # Generate audio
346
+ t = np.linspace(0, duration, int(sampling_rate * duration))
347
+
348
+ # Create speech-like patterns with silence
349
+ signal = np.zeros_like(t)
350
+
351
+ # Calculate segment lengths
352
+ seg1_len = min(int(sampling_rate*3), len(signal))
353
+ seg2_start = int(sampling_rate*4)
354
+ seg2_end = min(int(sampling_rate*7), len(signal))
355
+ seg3_start = min(int(sampling_rate*8), len(signal))
356
+
357
+ # Speaker 1: 0-3s (or until end)
358
+ if seg1_len > 0:
359
+ signal[0:seg1_len] = 0.3 * np.sin(2 * np.pi * 440 * t[0:seg1_len])
360
+
361
+ # Silence: 3-4s
362
+
363
+ # Speaker 2: 4-7s (or until end)
364
+ if seg2_start < len(signal) and seg2_end > seg2_start:
365
+ seg2_len = seg2_end - seg2_start
366
+ signal[seg2_start:seg2_end] = 0.3 * np.sin(2 * np.pi * 880 * t[seg2_start:seg2_end])
367
+
368
+ # Silence: 7-8s
369
+
370
+ # Speaker 1: 8-10s (or until end)
371
+ if seg3_start < len(signal):
372
+ signal[seg3_start:] = 0.3 * np.sin(2 * np.pi * 440 * t[seg3_start:])
373
+
374
+ # Add some noise
375
+ signal += 0.01 * np.random.randn(len(signal))
376
+
377
+ # Save
378
+ sf.write(output_path, signal, sampling_rate)
379
+
380
+ return output_path
381
+
382
+
383
+ if __name__ == "__main__":
384
+ # Demo utilities
385
+ print("Utility functions loaded")
386
+
387
+ # Create test audio
388
+ test_path = create_test_audio()
389
+ print(f"βœ“ Created test audio: {test_path}")
src/vad.py ADDED
@@ -0,0 +1,320 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Silero VAD Wrapper for Real-Time Voice Activity Detection
4
+ Optimized for <100ms latency with streaming support
5
+ """
6
+
7
+ import torch
8
+ import numpy as np
9
+ from typing import List, Dict, Optional, Tuple
10
+ import time
11
+ from pathlib import Path
12
+
13
+
14
+ class SileroVAD:
15
+ """
16
+ Production-ready Silero VAD wrapper with streaming support.
17
+
18
+ Features:
19
+ - Real-time processing with <100ms latency
20
+ - Configurable sensitivity thresholds
21
+ - Streaming audio buffer management
22
+ - ONNX runtime support for optimization
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ threshold: float = 0.5,
28
+ sampling_rate: int = 16000,
29
+ min_speech_duration_ms: int = 250,
30
+ min_silence_duration_ms: int = 100,
31
+ window_size_samples: int = 1536,
32
+ use_onnx: bool = False
33
+ ):
34
+ """
35
+ Initialize Silero VAD.
36
+
37
+ Args:
38
+ threshold: Speech probability threshold (0.0-1.0)
39
+ sampling_rate: Audio sample rate (8000 or 16000)
40
+ min_speech_duration_ms: Minimum speech segment duration
41
+ min_silence_duration_ms: Minimum silence duration between segments
42
+ window_size_samples: VAD window size (512, 1024, or 1536)
43
+ use_onnx: Use ONNX runtime for faster inference
44
+ """
45
+ self.threshold = threshold
46
+ self.sampling_rate = sampling_rate
47
+ self.min_speech_duration_ms = min_speech_duration_ms
48
+ self.min_silence_duration_ms = min_silence_duration_ms
49
+ self.window_size_samples = window_size_samples
50
+ self.use_onnx = use_onnx
51
+
52
+ # Load model
53
+ self.model = self._load_model()
54
+
55
+ # State for streaming
56
+ self.reset_states()
57
+
58
+ print(f"βœ“ Silero VAD initialized (threshold={threshold}, sr={sampling_rate}Hz)")
59
+
60
+ def _load_model(self):
61
+ """Load Silero VAD model."""
62
+ try:
63
+ # Try importing from silero_vad package
64
+ from silero_vad import load_silero_vad
65
+ model = load_silero_vad(onnx=self.use_onnx)
66
+ return model
67
+ except ImportError:
68
+ # Fallback: load from torch hub
69
+ model, utils = torch.hub.load(
70
+ repo_or_dir='snakers4/silero-vad',
71
+ model='silero_vad',
72
+ force_reload=False,
73
+ onnx=self.use_onnx
74
+ )
75
+ return model
76
+
77
+ def reset_states(self):
78
+ """Reset internal states for streaming."""
79
+ self.model.reset_states()
80
+
81
+ def process_chunk(self, audio_chunk: np.ndarray) -> float:
82
+ """
83
+ Process a single audio chunk and return speech probability.
84
+
85
+ Args:
86
+ audio_chunk: Audio data (numpy array, float32, mono)
87
+
88
+ Returns:
89
+ Speech probability (0.0-1.0)
90
+ """
91
+ # Convert to torch tensor
92
+ if isinstance(audio_chunk, np.ndarray):
93
+ audio_tensor = torch.from_numpy(audio_chunk).float()
94
+ else:
95
+ audio_tensor = audio_chunk
96
+
97
+ # Get speech probability
98
+ with torch.no_grad():
99
+ speech_prob = self.model(audio_tensor, self.sampling_rate).item()
100
+
101
+ return speech_prob
102
+
103
+ def get_speech_timestamps(
104
+ self,
105
+ audio: np.ndarray,
106
+ return_seconds: bool = False
107
+ ) -> List[Dict[str, float]]:
108
+ """
109
+ Get speech timestamps from audio.
110
+
111
+ Args:
112
+ audio: Audio data (numpy array, float32, mono)
113
+ return_seconds: Return timestamps in seconds instead of samples
114
+
115
+ Returns:
116
+ List of dicts with 'start' and 'end' keys
117
+ """
118
+ try:
119
+ from silero_vad import get_speech_timestamps
120
+
121
+ # Convert to torch tensor
122
+ if isinstance(audio, np.ndarray):
123
+ audio_tensor = torch.from_numpy(audio).float()
124
+ else:
125
+ audio_tensor = audio
126
+
127
+ # Get timestamps
128
+ timestamps = get_speech_timestamps(
129
+ audio_tensor,
130
+ self.model,
131
+ threshold=self.threshold,
132
+ sampling_rate=self.sampling_rate,
133
+ min_speech_duration_ms=self.min_speech_duration_ms,
134
+ min_silence_duration_ms=self.min_silence_duration_ms,
135
+ window_size_samples=self.window_size_samples,
136
+ return_seconds=return_seconds
137
+ )
138
+
139
+ return timestamps
140
+ except ImportError:
141
+ # Fallback: manual implementation
142
+ return self._get_speech_timestamps_manual(audio, return_seconds)
143
+
144
+ def _get_speech_timestamps_manual(
145
+ self,
146
+ audio: np.ndarray,
147
+ return_seconds: bool = False
148
+ ) -> List[Dict[str, float]]:
149
+ """Manual implementation of speech timestamp detection."""
150
+ if isinstance(audio, np.ndarray):
151
+ audio_tensor = torch.from_numpy(audio).float()
152
+ else:
153
+ audio_tensor = audio
154
+
155
+ # Process in windows
156
+ window_size = self.window_size_samples
157
+ speech_probs = []
158
+
159
+ self.reset_states()
160
+
161
+ for i in range(0, len(audio_tensor), window_size):
162
+ chunk = audio_tensor[i:i + window_size]
163
+ if len(chunk) < window_size:
164
+ # Pad last chunk
165
+ chunk = torch.nn.functional.pad(chunk, (0, window_size - len(chunk)))
166
+
167
+ prob = self.process_chunk(chunk)
168
+ speech_probs.append(prob)
169
+
170
+ # Find speech segments
171
+ timestamps = []
172
+ in_speech = False
173
+ speech_start = 0
174
+
175
+ for i, prob in enumerate(speech_probs):
176
+ sample_idx = i * window_size
177
+
178
+ if prob >= self.threshold and not in_speech:
179
+ # Speech start
180
+ in_speech = True
181
+ speech_start = sample_idx
182
+ elif prob < self.threshold and in_speech:
183
+ # Speech end
184
+ in_speech = False
185
+ speech_end = sample_idx
186
+
187
+ # Check minimum duration
188
+ duration_ms = (speech_end - speech_start) / self.sampling_rate * 1000
189
+ if duration_ms >= self.min_speech_duration_ms:
190
+ if return_seconds:
191
+ timestamps.append({
192
+ 'start': speech_start / self.sampling_rate,
193
+ 'end': speech_end / self.sampling_rate
194
+ })
195
+ else:
196
+ timestamps.append({
197
+ 'start': speech_start,
198
+ 'end': speech_end
199
+ })
200
+
201
+ # Handle case where speech continues to end
202
+ if in_speech:
203
+ speech_end = len(audio_tensor)
204
+ if return_seconds:
205
+ timestamps.append({
206
+ 'start': speech_start / self.sampling_rate,
207
+ 'end': speech_end / self.sampling_rate
208
+ })
209
+ else:
210
+ timestamps.append({
211
+ 'start': speech_start,
212
+ 'end': speech_end
213
+ })
214
+
215
+ return timestamps
216
+
217
+ def process_file(self, audio_path: str) -> Tuple[List[Dict], float]:
218
+ """
219
+ Process an audio file and return speech segments with latency.
220
+
221
+ Args:
222
+ audio_path: Path to audio file
223
+
224
+ Returns:
225
+ Tuple of (timestamps, processing_time_ms)
226
+ """
227
+ # Load audio
228
+ audio = self.read_audio(audio_path)
229
+
230
+ # Measure processing time
231
+ start_time = time.time()
232
+ timestamps = self.get_speech_timestamps(audio, return_seconds=True)
233
+ processing_time = (time.time() - start_time) * 1000 # Convert to ms
234
+
235
+ return timestamps, processing_time
236
+
237
+ @staticmethod
238
+ def read_audio(path: str, sampling_rate: int = 16000) -> torch.Tensor:
239
+ """
240
+ Read audio file and convert to required format.
241
+
242
+ Args:
243
+ path: Path to audio file
244
+ sampling_rate: Target sample rate
245
+
246
+ Returns:
247
+ Audio tensor (mono, float32)
248
+ """
249
+ try:
250
+ from silero_vad import read_audio
251
+ return read_audio(path, sampling_rate=sampling_rate)
252
+ except ImportError:
253
+ # Fallback: use librosa
254
+ import librosa
255
+ audio, sr = librosa.load(path, sr=sampling_rate, mono=True)
256
+ return torch.from_numpy(audio).float()
257
+
258
+ def benchmark_latency(self, duration_seconds: float = 10.0) -> Dict[str, float]:
259
+ """
260
+ Benchmark VAD latency on synthetic audio.
261
+
262
+ Args:
263
+ duration_seconds: Duration of test audio
264
+
265
+ Returns:
266
+ Dict with latency metrics
267
+ """
268
+ # Generate test audio
269
+ num_samples = int(duration_seconds * self.sampling_rate)
270
+ test_audio = torch.randn(num_samples)
271
+
272
+ # Warm-up
273
+ self.reset_states()
274
+ _ = self.get_speech_timestamps(test_audio.numpy())
275
+
276
+ # Benchmark
277
+ self.reset_states()
278
+ start_time = time.time()
279
+ timestamps = self.get_speech_timestamps(test_audio.numpy())
280
+ end_time = time.time()
281
+
282
+ processing_time_ms = (end_time - start_time) * 1000
283
+ latency_per_second = processing_time_ms / duration_seconds
284
+
285
+ return {
286
+ 'total_processing_time_ms': processing_time_ms,
287
+ 'audio_duration_s': duration_seconds,
288
+ 'latency_per_second_ms': latency_per_second,
289
+ 'real_time_factor': processing_time_ms / (duration_seconds * 1000),
290
+ 'num_segments': len(timestamps)
291
+ }
292
+
293
+
294
+ def demo():
295
+ """Demo VAD functionality."""
296
+ print("\n" + "="*60)
297
+ print("SILERO VAD DEMO")
298
+ print("="*60)
299
+
300
+ # Initialize VAD
301
+ vad = SileroVAD(threshold=0.5)
302
+
303
+ # Benchmark latency
304
+ print("\nπŸ“Š Benchmarking latency...")
305
+ metrics = vad.benchmark_latency(duration_seconds=10.0)
306
+ print(f" Total processing time: {metrics['total_processing_time_ms']:.2f}ms")
307
+ print(f" Audio duration: {metrics['audio_duration_s']:.1f}s")
308
+ print(f" Latency per second: {metrics['latency_per_second_ms']:.2f}ms")
309
+ print(f" Real-time factor: {metrics['real_time_factor']:.4f}x")
310
+
311
+ if metrics['latency_per_second_ms'] < 100:
312
+ print(" βœ… Target latency achieved (<100ms)")
313
+ else:
314
+ print(" ⚠️ Latency above target (>100ms)")
315
+
316
+ print("\n" + "="*60)
317
+
318
+
319
+ if __name__ == "__main__":
320
+ demo()
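A streaming-style sketch using the wrapper above: feed fixed-size windows to `process_chunk` and threshold each probability (the audio here is synthetic noise, so few or no windows may fire):

```python
import numpy as np

from src.vad import SileroVAD

vad = SileroVAD(threshold=0.5, window_size_samples=1536)
vad.reset_states()  # start a fresh stream

stream = np.random.randn(16000 * 2).astype(np.float32)  # 2s stand-in for live audio
window = vad.window_size_samples

for i in range(0, len(stream) - window + 1, window):
    prob = vad.process_chunk(stream[i:i + window])
    if prob >= vad.threshold:
        print(f"speech at sample {i} (p={prob:.2f})")
```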
tests/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """
2
+ Test suite for VAD + Speaker Diarization system
3
+ """
tests/test_pipeline.py ADDED
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unit tests for integrated pipeline
4
+ """
5
+
6
+ import pytest
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import sys
10
+ import tempfile
11
+ import soundfile as sf
12
+
13
+ # Add src to path
14
+ sys.path.insert(0, str(Path(__file__).parent.parent))
15
+
16
+ from src.pipeline import VADDiarizationPipeline
17
+ from src.vad import SileroVAD
18
+
19
+
20
+ class TestPipeline:
21
+ """Test cases for integrated pipeline."""
22
+
23
+ @pytest.fixture
24
+ def test_audio_file(self):
25
+ """Create a temporary test audio file."""
26
+ # Generate test audio
27
+ sr = 16000
28
+ duration = 5
29
+ audio = 0.1 * np.random.randn(sr * duration).astype(np.float32)
30
+
31
+ # Save to temp file
32
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
33
+ sf.write(f.name, audio, sr)
34
+ yield f.name
35
+
36
+ # Cleanup
37
+ Path(f.name).unlink(missing_ok=True)
38
+
39
+ def test_vad_only(self, test_audio_file):
40
+ """Test VAD-only processing (no HF token needed)."""
41
+ vad = SileroVAD()
42
+
43
+ # Process file
44
+ timestamps, processing_time = vad.process_file(test_audio_file)
45
+
46
+ # Verify
47
+ assert isinstance(timestamps, list)
48
+ assert isinstance(processing_time, float)
49
+ assert processing_time > 0
50
+
51
+ def test_format_output_text(self):
52
+ """Test text output formatting."""
53
+ # Mock result
54
+ result = {
55
+ 'audio_path': 'test.wav',
56
+ 'speaker_segments': [
57
+ {'start': 0.0, 'end': 2.0, 'speaker': 'SPEAKER_00', 'duration': 2.0},
58
+ {'start': 3.0, 'end': 5.0, 'speaker': 'SPEAKER_01', 'duration': 2.0}
59
+ ],
60
+ 'metadata': {
61
+ 'num_speakers': 2,
62
+ 'num_segments': 2,
63
+ 'total_speech_time': 4.0
64
+ },
65
+ 'processing_time': {
66
+ 'vad_ms': 50.0,
67
+ 'diarization_ms': 1000.0,
68
+ 'total_ms': 1050.0
69
+ }
70
+ }
71
+
72
+ # Test with VAD only (no full pipeline needed)
73
+ from src.pipeline import VADDiarizationPipeline
74
+
75
+ # Format output (doesn't require initialized pipeline)
76
+ output = format_result_text(result)
77
+
78
+ assert 'test.wav' in output
79
+ assert 'SPEAKER_00' in output
80
+ assert 'SPEAKER_01' in output
81
+
82
+ def test_vad_latency_target(self):
83
+ """Test that VAD meets latency target."""
84
+ vad = SileroVAD()
85
+
86
+ # Benchmark
87
+ metrics = vad.benchmark_latency(duration_seconds=10.0)
88
+
89
+ # Check latency target (<100ms per second)
90
+ assert metrics['latency_per_second_ms'] < 100, \
91
+ f"VAD latency {metrics['latency_per_second_ms']:.2f}ms exceeds 100ms target"
92
+
93
+
94
+ def format_result_text(result):
95
+ """Helper function to format results as text."""
96
+ lines = []
97
+ lines.append(f"File: {result['audio_path']}")
98
+ lines.append(f"Speakers: {result['metadata']['num_speakers']}")
99
+ lines.append(f"Segments: {result['metadata']['num_segments']}")
100
+
101
+ for seg in result['speaker_segments']:
102
+ lines.append(f"{seg['start']:.2f}s - {seg['end']:.2f}s: {seg['speaker']}")
103
+
104
+ return "\n".join(lines)
105
+
106
+
107
+ if __name__ == "__main__":
108
+ pytest.main([__file__, "-v"])
tests/test_vad.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unit tests for VAD module
4
+ """
5
+
6
+ import pytest
7
+ import torch
8
+ import numpy as np
9
+ from pathlib import Path
10
+ import sys
11
+
12
+ # Add src to path
13
+ sys.path.insert(0, str(Path(__file__).parent.parent))
14
+
15
+ from src.vad import SileroVAD
16
+
17
+
18
+ class TestSileroVAD:
19
+ """Test cases for Silero VAD."""
20
+
21
+ @pytest.fixture
22
+ def vad(self):
23
+ """Create VAD instance for testing."""
24
+ return SileroVAD(threshold=0.5)
25
+
26
+ def test_initialization(self, vad):
27
+ """Test VAD initialization."""
28
+ assert vad is not None
29
+ assert vad.threshold == 0.5
30
+ assert vad.sampling_rate == 16000
31
+ assert vad.model is not None
32
+
33
+ def test_process_chunk(self, vad):
34
+ """Test processing a single audio chunk."""
35
+ # Create test audio
36
+ chunk = np.random.randn(1536).astype(np.float32)
37
+
38
+ # Process
39
+ prob = vad.process_chunk(chunk)
40
+
41
+ # Verify
42
+ assert isinstance(prob, float)
43
+ assert 0.0 <= prob <= 1.0
44
+
45
+ def test_get_speech_timestamps(self, vad):
46
+ """Test getting speech timestamps."""
47
+ # Create test audio with speech-like pattern
48
+ sr = 16000
49
+ duration = 5
50
+ audio = np.zeros(sr * duration, dtype=np.float32)
51
+
52
+ # Add "speech" in middle (higher energy)
53
+ audio[sr:sr*3] = 0.5 * np.random.randn(sr * 2)
54
+
55
+ # Get timestamps
56
+ timestamps = vad.get_speech_timestamps(audio, return_seconds=True)
57
+
58
+ # Verify
59
+ assert isinstance(timestamps, list)
60
+ for ts in timestamps:
61
+ assert 'start' in ts
62
+ assert 'end' in ts
63
+ assert ts['end'] > ts['start']
64
+
65
+ def test_reset_states(self, vad):
66
+ """Test state reset."""
67
+ # Process some audio
68
+ chunk = np.random.randn(1536).astype(np.float32)
69
+ vad.process_chunk(chunk)
70
+
71
+ # Reset
72
+ vad.reset_states()
73
+
74
+ # Should work without error
75
+ prob = vad.process_chunk(chunk)
76
+ assert isinstance(prob, float)
77
+
78
+ def test_benchmark_latency(self, vad):
79
+ """Test latency benchmarking."""
80
+ metrics = vad.benchmark_latency(duration_seconds=1.0)
81
+
82
+ # Verify metrics
83
+ assert 'total_processing_time_ms' in metrics
84
+ assert 'audio_duration_s' in metrics
85
+ assert 'latency_per_second_ms' in metrics
86
+ assert 'real_time_factor' in metrics
87
+
88
+ # Check latency target
89
+ assert metrics['latency_per_second_ms'] < 1000 # Should be much faster
90
+
91
+ def test_different_thresholds(self):
92
+ """Test VAD with different thresholds."""
93
+ thresholds = [0.3, 0.5, 0.7]
94
+
95
+ for threshold in thresholds:
96
+ vad = SileroVAD(threshold=threshold)
97
+ assert vad.threshold == threshold
98
+
99
+ # Test processing
100
+ audio = np.random.randn(16000).astype(np.float32)
101
+ timestamps = vad.get_speech_timestamps(audio)
102
+ assert isinstance(timestamps, list)
103
+
104
+
105
+ def test_vad_import():
106
+ """Test that VAD can be imported."""
107
+ from src.vad import SileroVAD
108
+ assert SileroVAD is not None
109
+
110
+
111
+ if __name__ == "__main__":
112
+ pytest.main([__file__, "-v"])
vad_diarization.py ADDED
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Voice Activity Detection + Speaker Diarization
4
+ Simple demo script using the modular pipeline
5
+ """
6
+
7
+ import torch
8
+ import librosa
9
+ import numpy as np
10
+ from pathlib import Path
11
+ import os
12
+ import sys
13
+
14
+ # Import from modular components
15
+ from src.vad import SileroVAD
16
+ from src.diarization import SpeakerDiarization
17
+ from src.pipeline import VADDiarizationPipeline
18
+ from src.utils import create_test_audio
19
+
20
+ def setup_vad():
21
+ """Setup Silero VAD using modular wrapper"""
22
+ print("Setting up Voice Activity Detection...")
23
+
24
+ vad = SileroVAD(threshold=0.5)
25
+ print("βœ“ Silero VAD loaded (40 MB)")
26
+
27
+ return vad
28
+
29
+ def setup_diarization():
30
+ """Setup Speaker Diarization using modular wrapper"""
31
+ print("Setting up Speaker Diarization...")
32
+ print("⚠️ First download requires 1GB+ bandwidth (one-time)")
33
+
34
+ # Get token from environment (None if unset; the except branch below reports how to fix it)
35
+ token = os.environ.get('HF_TOKEN')
36
+
37
+ try:
38
+ diarization = SpeakerDiarization(
39
+ model_name="pyannote/speaker-diarization-3.1",
40
+ token=token
41
+ )
42
+ print("βœ“ Diarization pipeline loaded")
43
+ return diarization
44
+ except Exception as e:
45
+ print(f"❌ Error: {e}")
46
+ print("Get your HF token: https://huggingface.co/settings/tokens")
47
+ print("Or set it: export HF_TOKEN='your_token_here'")
48
+ return None
49
+
50
+ def demo_vad(audio_path, vad_model):
51
+ """Demo VAD on an audio file using modular wrapper"""
52
+ print(f"\nVAD Analysis: {audio_path}")
53
+
54
+ timestamps, processing_time = vad_model.process_file(audio_path)
55
+
56
+ print(f"Found {len(timestamps)} speech segments:")
57
+ print(f"Processing time: {processing_time:.2f}ms")
58
+
59
+ for i, ts in enumerate(timestamps, 1):
60
+ start_s = ts['start']
61
+ end_s = ts['end']
62
+ duration_s = end_s - start_s
63
+ print(f" Segment {i}: {start_s:6.2f}s - {end_s:6.2f}s ({duration_s:6.2f}s)")
64
+
65
+ return timestamps
66
+
67
+ def demo_diarization(audio_path, diar_pipeline):
68
+ """Demo Diarization on an audio file using modular wrapper"""
69
+ print(f"\nDiarization Analysis: {audio_path}")
70
+
71
+ segments, processing_time, metadata = diar_pipeline.process_file(audio_path)
72
+
73
+ print(f"Found {metadata['num_speakers']} speakers")
74
+ print(f"Processing time: {processing_time:.2f}ms")
75
+ print("\nSpeaker timeline:")
76
+ for seg in segments:
77
+ print(f" {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}")
78
+
79
+ def demo_full_pipeline(audio_path):
80
+ """Demo the full integrated pipeline"""
81
+ print(f"\n{'='*60}")
82
+ print("FULL PIPELINE DEMO")
83
+ print(f"{'='*60}")
84
+
85
+ token = os.environ.get('HF_TOKEN')
86
+ if not token:
87
+ print("\n⚠️ No HF_TOKEN found. Running VAD only...")
88
+ vad = SileroVAD()
89
+ demo_vad(audio_path, vad)
90
+ return
91
+
92
+ try:
93
+ # Initialize full pipeline
94
+ pipeline = VADDiarizationPipeline(
95
+ token=token,
96
+ vad_threshold=0.5
97
+ )
98
+
99
+ # Process file
100
+ result = pipeline.process_file(audio_path)
101
+
102
+ # Display formatted output
103
+ print("\n" + pipeline.format_output(result, format='text'))
104
+
105
+ except Exception as e:
106
+ print(f"\n❌ Error: {e}")
107
+ print("Falling back to VAD only...")
108
+ vad = SileroVAD()
109
+ demo_vad(audio_path, vad)
110
+
111
+ def main():
112
+ print("\n" + "=" * 60)
113
+ print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
114
+ print("=" * 60)
115
+
116
+ # Create test audio
117
+ print("\nCreating test audio...")
118
+ audio_path = create_test_audio("test_audio.wav", duration=10.0)
119
+ print(f"βœ“ Created {audio_path}")
120
+
121
+ # Option 1: Quick VAD demo
122
+ print("\n" + "=" * 60)
123
+ print("OPTION 1: VAD ONLY (No HF token needed)")
124
+ print("=" * 60)
125
+ vad_model = setup_vad()
126
+ demo_vad(audio_path, vad_model)
127
+
128
+ # Option 2: Full pipeline (requires HF token)
129
+ print("\n" + "=" * 60)
130
+ print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
131
+ print("=" * 60)
132
+ demo_full_pipeline(audio_path)
133
+
134
+ print("\n" + "=" * 60)
135
+ print("βœ… Demo complete!")
136
+ print("\nNext steps:")
137
+ print("1. Set HF_TOKEN: export HF_TOKEN='your_token_here'")
138
+ print("2. Run Gradio demo: python app.py")
139
+ print("3. Test on real audio files")
140
+ print("4. Deploy with Docker: docker build -t vad-diarization .")
141
+ print("5. Check notebooks/demo.ipynb for detailed examples")
142
+ print("=" * 60 + "\n")
143
+
144
+ if __name__ == "__main__":
145
+ main()
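For a user-supplied recording, a minimal variant of the demo above that exports all three formats via `src.utils.export_results`; `recording.wav` is a placeholder and `HF_TOKEN` is assumed to be set:

```python
import os

from src.pipeline import VADDiarizationPipeline
from src.utils import export_results

pipeline = VADDiarizationPipeline(token=os.environ["HF_TOKEN"])
result = pipeline.process_file("recording.wav")  # placeholder path

# export_results creates the output directory if needed
export_results(result, "outputs", formats=["json", "rttm", "txt"])
```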
verify_installation.py ADDED
@@ -0,0 +1,197 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Installation verification script
4
+ Checks that all components are properly installed and configured
5
+ """
6
+
7
+ import sys
8
+ from pathlib import Path
9
+ import importlib
10
+
11
+ def check_python_version():
12
+ """Check Python version."""
13
+ print("Checking Python version...")
14
+ version = sys.version_info
15
+ if (version.major, version.minor) >= (3, 10):
16
+ print(f" βœ… Python {version.major}.{version.minor}.{version.micro}")
17
+ return True
18
+ else:
19
+ print(f" ❌ Python {version.major}.{version.minor}.{version.micro} (requires 3.10+)")
20
+ return False
21
+
22
+ def check_package(package_name, import_name=None):
23
+ """Check if a package is installed."""
24
+ if import_name is None:
25
+ import_name = package_name
26
+
27
+ try:
28
+ mod = importlib.import_module(import_name)
29
+ version = getattr(mod, '__version__', 'unknown')
30
+ print(f" βœ… {package_name} ({version})")
31
+ return True
32
+ except ImportError:
33
+ print(f" ❌ {package_name} not found")
34
+ return False
35
+
36
+ def check_cuda():
37
+ """Check CUDA availability."""
38
+ print("Checking CUDA...")
39
+ try:
40
+ import torch
41
+ if torch.cuda.is_available():
42
+ print(f" βœ… CUDA available (version {torch.version.cuda})")
43
+ print(f" GPU: {torch.cuda.get_device_name(0)}")
44
+ return True
45
+ else:
46
+ print(" ⚠️ CUDA not available (CPU mode)")
47
+ return False
48
+ except ImportError:
49
+ print(" ❌ PyTorch not installed")
50
+ return False
51
+
52
+ def check_files():
53
+ """Check that all required files exist."""
54
+ print("Checking project files...")
55
+
56
+ required_files = [
57
+ 'src/__init__.py',
58
+ 'src/vad.py',
59
+ 'src/diarization.py',
60
+ 'src/pipeline.py',
61
+ 'src/utils.py',
62
+ 'app.py',
63
+ 'vad_diarization.py',
64
+ 'requirements.txt',
65
+ 'Dockerfile',
66
+ 'README.md'
67
+ ]
68
+
69
+ all_exist = True
70
+ for file in required_files:
71
+ path = Path(file)
72
+ if path.exists():
73
+ print(f" βœ… {file}")
74
+ else:
75
+ print(f" ❌ {file} missing")
76
+ all_exist = False
77
+
78
+ return all_exist
79
+
80
+ def check_hf_token():
81
+ """Check for Hugging Face token."""
82
+ print("Checking Hugging Face token...")
83
+ import os
84
+ token = os.environ.get('HF_TOKEN')
85
+ if token:
86
+ print(f" βœ… HF_TOKEN found (length: {len(token)})")
87
+ return True
88
+ else:
89
+ print(" ⚠️ HF_TOKEN not set (required for full pipeline)")
90
+ print(" Set with: export HF_TOKEN='your_token_here'")
91
+ return False
92
+
93
+ def test_vad():
94
+ """Test VAD functionality."""
95
+ print("Testing VAD...")
96
+ try:
97
+ from src.vad import SileroVAD
98
+ vad = SileroVAD(threshold=0.5)
99
+ print(" βœ… VAD initialized successfully")
100
+
101
+ # Quick benchmark
102
+ metrics = vad.benchmark_latency(duration_seconds=1.0)
103
+ latency = metrics['latency_per_second_ms']
104
+ print(f" βœ… VAD latency: {latency:.2f}ms per second")
105
+
106
+ if latency < 100:
107
+ print(" βœ… Latency target achieved (<100ms)")
108
+ else:
109
+ print(" ⚠️ Latency above target")
110
+
111
+ return True
112
+ except Exception as e:
113
+ print(f" ❌ VAD test failed: {e}")
114
+ return False
115
+
116
+ def main():
117
+ """Run all verification checks."""
118
+ print("\n" + "="*60)
119
+ print("INSTALLATION VERIFICATION")
120
+ print("="*60 + "\n")
121
+
122
+ results = {}
123
+
124
+ # Python version
125
+ results['python'] = check_python_version()
126
+ print()
127
+
128
+ # Required packages
129
+ print("Checking required packages...")
130
+ packages = [
131
+ ('torch', 'torch'),
132
+ ('numpy', 'numpy'),
133
+ ('librosa', 'librosa'),
134
+ ('soundfile', 'soundfile'),
135
+ ('gradio', 'gradio'),
136
+ ('matplotlib', 'matplotlib'),
137
+ ('silero-vad', 'silero_vad'),
138
+ ('pyannote.audio', 'pyannote.audio')
139
+ ]
140
+
141
+ results['packages'] = all(check_package(name, imp) for name, imp in packages)
142
+ print()
143
+
144
+ # CUDA
145
+ results['cuda'] = check_cuda()
146
+ print()
147
+
148
+ # Files
149
+ results['files'] = check_files()
150
+ print()
151
+
152
+ # HF Token
153
+ results['token'] = check_hf_token()
154
+ print()
155
+
156
+ # VAD test
157
+ results['vad'] = test_vad()
158
+ print()
159
+
160
+ # Summary
161
+ print("="*60)
162
+ print("VERIFICATION SUMMARY")
163
+ print("="*60)
164
+
165
+ total = len(results)
166
+ passed = sum(1 for v in results.values() if v)
167
+
168
+ for check, result in results.items():
169
+ status = "βœ… PASS" if result else "❌ FAIL"
170
+ print(f"{check.upper():20s}: {status}")
171
+
172
+ print()
173
+ print(f"Results: {passed}/{total} checks passed")
174
+
175
+ if passed == total:
176
+ print("\nπŸŽ‰ All checks passed! System is ready to use.")
177
+ print("\nNext steps:")
178
+ print("1. Run demo: python vad_diarization.py")
179
+ print("2. Launch Gradio: python app.py")
180
+ print("3. Run benchmarks: python benchmarks/run_benchmarks.py --quick")
181
+ elif results['python'] and results['packages'] and results['files']:
182
+ print("\nβœ… Core system is functional.")
183
+ if not results['token']:
184
+ print("⚠️ Set HF_TOKEN for full pipeline functionality")
185
+ if not results['cuda']:
186
+ print("⚠️ CUDA not available, will use CPU (slower)")
187
+ else:
188
+ print("\n❌ Installation incomplete. Please fix the issues above.")
189
+ print("\nTry running: ./setup.sh")
190
+
191
+ print("="*60 + "\n")
192
+
193
+ return passed == total
194
+
195
+ if __name__ == "__main__":
196
+ success = main()
197
+ sys.exit(0 if success else 1)
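
Since main() reports failure through the process exit code, the script can serve as a pre-flight gate before launching the app or building images. A minimal sketch of wrapping it from another Python entry point (assuming verify_installation.py sits in the current working directory):

    import subprocess
    import sys

    # A non-zero exit code means at least one verification check failed
    result = subprocess.run([sys.executable, "verify_installation.py"])
    if result.returncode != 0:
        raise SystemExit("Environment not ready; fix the reported issues first.")
    print("Environment verified; safe to launch app.py")
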