Spaces:
Sleeping
Sleeping
Commit
Β·
b77cba7
1
Parent(s):
89128dd
initial commit
Browse files- .dockerignore +51 -0
- .env.example +16 -0
- .gitignore +88 -0
- Dockerfile +42 -0
- LICENSE +21 -0
- README.md +492 -11
- app.py +295 -0
- benchmarks/run_benchmarks.py +271 -0
- docker-compose.yml +29 -0
- environment.yml +44 -0
- notebooks/demo.ipynb +312 -0
- requirements.txt +33 -0
- run_app.sh +42 -0
- setup.sh +106 -0
- src/__init__.py +17 -0
- src/diarization.py +322 -0
- src/pipeline.py +353 -0
- src/utils.py +389 -0
- src/vad.py +320 -0
- tests/__init__.py +3 -0
- tests/test_pipeline.py +108 -0
- tests/test_vad.py +112 -0
- vad_diarization.py +145 -0
- verify_installation.py +197 -0
.dockerignore
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# IDEs
|
| 27 |
+
.vscode/
|
| 28 |
+
.idea/
|
| 29 |
+
*.swp
|
| 30 |
+
*.swo
|
| 31 |
+
*~
|
| 32 |
+
|
| 33 |
+
# OS
|
| 34 |
+
.DS_Store
|
| 35 |
+
Thumbs.db
|
| 36 |
+
|
| 37 |
+
# Project specific
|
| 38 |
+
data/
|
| 39 |
+
outputs/
|
| 40 |
+
*.wav
|
| 41 |
+
*.mp3
|
| 42 |
+
*.flac
|
| 43 |
+
test_audio.*
|
| 44 |
+
benchmarks/
|
| 45 |
+
notebooks/
|
| 46 |
+
.git/
|
| 47 |
+
.gitignore
|
| 48 |
+
|
| 49 |
+
# Documentation
|
| 50 |
+
*.md
|
| 51 |
+
!README.md
|
.env.example
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Authentication Token
|
| 2 |
+
# Get yours at: https://huggingface.co/settings/tokens
|
| 3 |
+
HF_TOKEN="YOUR HF TOKEN HERE"
|
| 4 |
+
|
| 5 |
+
# Gradio Server Settings
|
| 6 |
+
GRADIO_SERVER_NAME=0.0.0.0
|
| 7 |
+
GRADIO_SERVER_PORT=7860
|
| 8 |
+
|
| 9 |
+
# Model Settings
|
| 10 |
+
VAD_THRESHOLD=0.5
|
| 11 |
+
USE_ONNX_VAD=false
|
| 12 |
+
|
| 13 |
+
# Optional: Specify number of speakers
|
| 14 |
+
# NUM_SPEAKERS=
|
| 15 |
+
# MIN_SPEAKERS=
|
| 16 |
+
# MAX_SPEAKERS=
|
.gitignore
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
MANIFEST
|
| 26 |
+
|
| 27 |
+
# Virtual environments
|
| 28 |
+
venv/
|
| 29 |
+
ENV/
|
| 30 |
+
env/
|
| 31 |
+
.venv
|
| 32 |
+
|
| 33 |
+
# IDEs
|
| 34 |
+
.vscode/
|
| 35 |
+
.idea/
|
| 36 |
+
*.swp
|
| 37 |
+
*.swo
|
| 38 |
+
*~
|
| 39 |
+
.DS_Store
|
| 40 |
+
|
| 41 |
+
# Jupyter Notebook
|
| 42 |
+
.ipynb_checkpoints
|
| 43 |
+
*.ipynb_checkpoints/
|
| 44 |
+
|
| 45 |
+
# Environment variables
|
| 46 |
+
.env
|
| 47 |
+
|
| 48 |
+
# Audio files
|
| 49 |
+
*.wav
|
| 50 |
+
*.mp3
|
| 51 |
+
*.flac
|
| 52 |
+
*.ogg
|
| 53 |
+
*.m4a
|
| 54 |
+
test_audio.*
|
| 55 |
+
|
| 56 |
+
# Output files
|
| 57 |
+
outputs/
|
| 58 |
+
data/
|
| 59 |
+
*.json
|
| 60 |
+
*.rttm
|
| 61 |
+
*.txt
|
| 62 |
+
!requirements.txt
|
| 63 |
+
!README.txt
|
| 64 |
+
|
| 65 |
+
# Model cache
|
| 66 |
+
.cache/
|
| 67 |
+
models/
|
| 68 |
+
|
| 69 |
+
# Logs
|
| 70 |
+
*.log
|
| 71 |
+
logs/
|
| 72 |
+
|
| 73 |
+
# Benchmarks
|
| 74 |
+
benchmarks/*.json
|
| 75 |
+
benchmarks/*.csv
|
| 76 |
+
|
| 77 |
+
# Temporary files
|
| 78 |
+
tmp/
|
| 79 |
+
temp/
|
| 80 |
+
*.tmp
|
| 81 |
+
|
| 82 |
+
# Data folder
|
| 83 |
+
data/
|
| 84 |
+
data/*
|
| 85 |
+
|
| 86 |
+
# Backup files
|
| 87 |
+
*.backup
|
| 88 |
+
*.bak
|
Dockerfile
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dockerfile for VAD + Speaker Diarization System
|
| 2 |
+
FROM python:3.10-slim
|
| 3 |
+
|
| 4 |
+
# Set working directory
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
ffmpeg \
|
| 10 |
+
libsndfile1 \
|
| 11 |
+
git \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Copy requirements first for better caching
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
|
| 17 |
+
# Install Python dependencies
|
| 18 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Install PyTorch with CUDA support (optional, comment out for CPU-only)
|
| 21 |
+
# RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 22 |
+
|
| 23 |
+
# Copy application code
|
| 24 |
+
COPY src/ ./src/
|
| 25 |
+
COPY app.py .
|
| 26 |
+
|
| 27 |
+
# Create directories for data
|
| 28 |
+
RUN mkdir -p /app/data /app/outputs
|
| 29 |
+
|
| 30 |
+
# Expose Gradio port
|
| 31 |
+
EXPOSE 7860
|
| 32 |
+
|
| 33 |
+
# Set environment variables
|
| 34 |
+
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
| 35 |
+
ENV GRADIO_SERVER_PORT=7860
|
| 36 |
+
|
| 37 |
+
# Health check
|
| 38 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 39 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
| 40 |
+
|
| 41 |
+
# Run the application
|
| 42 |
+
CMD ["python", "app.py"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 VAD+SD Contributors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,13 +1,494 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
| 1 |
+
# ποΈ Real-Time VAD + Speaker Diarization System
|
| 2 |
+
|
| 3 |
+
Production-ready system for **Voice Activity Detection (VAD)** and **Speaker Diarization** with real-time performance and state-of-the-art accuracy.
|
| 4 |
+
|
| 5 |
+
[](https://www.python.org/downloads/)
|
| 6 |
+
[](https://pytorch.org/)
|
| 7 |
+
[](https://opensource.org/licenses/MIT)
|
| 8 |
+
|
| 9 |
+
## β¨ Features
|
| 10 |
+
|
| 11 |
+
- **Real-Time VAD**: <100ms latency using Silero VAD (40MB model)
|
| 12 |
+
- **Speaker Diarization**: State-of-the-art accuracy with Pyannote.audio 3.1/4.0+
|
| 13 |
+
- **Interactive Demo**: Beautiful Gradio web interface with visualizations
|
| 14 |
+
- **Production Ready**: Fully containerized with Docker
|
| 15 |
+
- **GPU Accelerated**: CUDA 12.1+ support for faster processing
|
| 16 |
+
- **Multiple Formats**: Export results as JSON, RTTM, or text
|
| 17 |
+
- **Modular Architecture**: Clean, maintainable, and extensible code
|
| 18 |
+
|
| 19 |
+
## π Quick Start
|
| 20 |
+
|
| 21 |
+
### Prerequisites
|
| 22 |
+
|
| 23 |
+
- Python 3.10+
|
| 24 |
+
- CUDA 12.1+ (optional, for GPU acceleration)
|
| 25 |
+
- FFmpeg
|
| 26 |
+
- Hugging Face account with access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
|
| 27 |
+
|
| 28 |
+
### Installation
|
| 29 |
+
|
| 30 |
+
#### Option 1: Conda (Recommended)
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
# Create and activate conda environment
|
| 34 |
+
conda create -n vad_diarization python=3.10 -y
|
| 35 |
+
conda activate vad_diarization
|
| 36 |
+
|
| 37 |
+
# Install PyTorch with CUDA
|
| 38 |
+
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
|
| 39 |
+
|
| 40 |
+
# Install dependencies
|
| 41 |
+
pip install -r requirements.txt
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
#### Option 2: Virtual Environment
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
# Create virtual environment
|
| 48 |
+
python -m venv venv
|
| 49 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 50 |
+
|
| 51 |
+
# Install PyTorch with CUDA support (for GPU)
|
| 52 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 53 |
+
|
| 54 |
+
# Install other dependencies
|
| 55 |
+
pip install -r requirements.txt
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
#### Option 3: Automated Setup
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
# For conda users (activate environment first)
|
| 62 |
+
conda activate vad_diarization
|
| 63 |
+
./setup.sh
|
| 64 |
+
|
| 65 |
+
# For venv users
|
| 66 |
+
./setup.sh
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Hugging Face Token Setup
|
| 70 |
+
|
| 71 |
+
1. **Get your token**: Visit https://huggingface.co/settings/tokens
|
| 72 |
+
2. **Accept model conditions**: Visit https://huggingface.co/pyannote/speaker-diarization-3.1 and click "Agree and access repository"
|
| 73 |
+
3. **Set environment variable**:
|
| 74 |
+
```bash
|
| 75 |
+
export HF_TOKEN='your_token_here'
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### Running the Demo
|
| 79 |
+
|
| 80 |
+
**Launch Gradio Web Interface:**
|
| 81 |
+
```bash
|
| 82 |
+
export HF_TOKEN='your_token_here'
|
| 83 |
+
python app.py
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
Then open http://localhost:7860 in your browser.
|
| 87 |
+
|
| 88 |
+
**Or use the helper script:**
|
| 89 |
+
```bash
|
| 90 |
+
./run_app.sh
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
**Command Line Usage:**
|
| 94 |
+
```python
|
| 95 |
+
from src.pipeline import VADDiarizationPipeline
|
| 96 |
+
|
| 97 |
+
# Initialize pipeline
|
| 98 |
+
pipeline = VADDiarizationPipeline(
|
| 99 |
+
token='your_hf_token',
|
| 100 |
+
vad_threshold=0.5
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Process audio file
|
| 104 |
+
result = pipeline.process_file('audio.wav')
|
| 105 |
+
|
| 106 |
+
# Print results
|
| 107 |
+
print(pipeline.format_output(result))
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
## π Project Structure
|
| 111 |
+
|
| 112 |
+
```
|
| 113 |
+
VAD+SD/
|
| 114 |
+
βββ src/
|
| 115 |
+
β βββ __init__.py # Package initialization
|
| 116 |
+
β βββ vad.py # Silero VAD wrapper
|
| 117 |
+
β βββ diarization.py # Pyannote diarization wrapper
|
| 118 |
+
β βββ pipeline.py # Integrated pipeline
|
| 119 |
+
β βββ utils.py # Utility functions
|
| 120 |
+
βββ tests/ # Unit tests
|
| 121 |
+
β βββ test_vad.py
|
| 122 |
+
β βββ test_pipeline.py
|
| 123 |
+
β βββ __init__.py
|
| 124 |
+
βββ notebooks/ # Jupyter notebooks
|
| 125 |
+
β βββ demo.ipynb
|
| 126 |
+
βββ benchmarks/ # Benchmark scripts
|
| 127 |
+
β βββ run_benchmarks.py
|
| 128 |
+
βββ app.py # Gradio web interface
|
| 129 |
+
βββ vad_diarization.py # CLI demo script
|
| 130 |
+
βββ requirements.txt # Python dependencies
|
| 131 |
+
βββ environment.yml # Conda environment file
|
| 132 |
+
βββ Dockerfile # Container configuration
|
| 133 |
+
βββ docker-compose.yml # Docker Compose config
|
| 134 |
+
βββ .dockerignore # Docker ignore patterns
|
| 135 |
+
βββ .gitignore # Git ignore patterns
|
| 136 |
+
βββ setup.sh # Automated setup script
|
| 137 |
+
βββ run_app.sh # App launcher script
|
| 138 |
+
βββ verify_installation.py # Installation verification
|
| 139 |
+
βββ README.md # This file
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
## π³ Docker Deployment
|
| 143 |
+
|
| 144 |
+
### Build and Run
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
# Build image
|
| 148 |
+
docker build -t vad-diarization:latest .
|
| 149 |
+
|
| 150 |
+
# Run container
|
| 151 |
+
docker run -p 7860:7860 \
|
| 152 |
+
-e HF_TOKEN='your_token_here' \
|
| 153 |
+
--gpus all \
|
| 154 |
+
vad-diarization:latest
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
### Docker Compose
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
# Set your token in .env file
|
| 161 |
+
echo "HF_TOKEN=your_token_here" > .env
|
| 162 |
+
|
| 163 |
+
# Start services
|
| 164 |
+
docker-compose up
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
## π Performance Benchmarks
|
| 168 |
+
|
| 169 |
+
### VAD Performance
|
| 170 |
+
- **Latency**: ~9.73ms per second of audio β
|
| 171 |
+
- **Model Size**: 40MB
|
| 172 |
+
- **Real-time Factor**: ~0.01x (100x faster than real-time)
|
| 173 |
+
- **Accuracy**: High precision on speech detection
|
| 174 |
+
|
| 175 |
+
### Diarization Performance
|
| 176 |
+
- **DER on FEARLESS STEPS**: ~19-20%
|
| 177 |
+
- **Processing Speed**: Depends on audio length and hardware
|
| 178 |
+
- **GPU Memory**: ~2-4GB for typical audio
|
| 179 |
+
- **Supports**: 2-10 speakers (configurable)
|
| 180 |
+
|
| 181 |
+
### System Requirements
|
| 182 |
+
- **Minimum**: 4GB RAM, CPU-only
|
| 183 |
+
- **Recommended**: 8GB+ RAM, NVIDIA GPU with 4GB+ VRAM
|
| 184 |
+
- **Optimal**: 16GB+ RAM, RTX 3060+ or better
|
| 185 |
+
|
| 186 |
+
## π§ Configuration
|
| 187 |
+
|
| 188 |
+
### VAD Parameters
|
| 189 |
+
|
| 190 |
+
```python
|
| 191 |
+
from src.vad import SileroVAD
|
| 192 |
+
|
| 193 |
+
vad = SileroVAD(
|
| 194 |
+
threshold=0.5, # Speech probability threshold (0.0-1.0)
|
| 195 |
+
sampling_rate=16000, # Audio sample rate
|
| 196 |
+
min_speech_duration_ms=250, # Minimum speech segment duration
|
| 197 |
+
min_silence_duration_ms=100,# Minimum silence between segments
|
| 198 |
+
use_onnx=False # Use ONNX runtime for speed
|
| 199 |
+
)
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
### Diarization Parameters
|
| 203 |
+
|
| 204 |
+
```python
|
| 205 |
+
from src.diarization import SpeakerDiarization
|
| 206 |
+
|
| 207 |
+
diarization = SpeakerDiarization(
|
| 208 |
+
model_name="pyannote/speaker-diarization-3.1",
|
| 209 |
+
token='your_token',
|
| 210 |
+
num_speakers=None, # Fixed number (if known)
|
| 211 |
+
min_speakers=None, # Minimum speakers
|
| 212 |
+
max_speakers=None # Maximum speakers
|
| 213 |
+
)
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### Pipeline Configuration
|
| 217 |
+
|
| 218 |
+
```python
|
| 219 |
+
from src.pipeline import VADDiarizationPipeline
|
| 220 |
+
|
| 221 |
+
pipeline = VADDiarizationPipeline(
|
| 222 |
+
vad_threshold=0.5, # VAD sensitivity
|
| 223 |
+
token='your_token', # HF token
|
| 224 |
+
num_speakers=None, # Auto-detect speakers
|
| 225 |
+
use_onnx_vad=False # Use ONNX for VAD
|
| 226 |
+
)
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
## π Usage Examples
|
| 230 |
+
|
| 231 |
+
### Basic Processing
|
| 232 |
+
|
| 233 |
+
```python
|
| 234 |
+
from src.pipeline import VADDiarizationPipeline
|
| 235 |
+
|
| 236 |
+
# Initialize
|
| 237 |
+
pipeline = VADDiarizationPipeline(token='your_token')
|
| 238 |
+
|
| 239 |
+
# Process file
|
| 240 |
+
result = pipeline.process_file('meeting.wav')
|
| 241 |
+
|
| 242 |
+
# Access results
|
| 243 |
+
print(f"Speakers: {result['metadata']['num_speakers']}")
|
| 244 |
+
print(f"Segments: {result['metadata']['num_segments']}")
|
| 245 |
+
|
| 246 |
+
# Print timeline
|
| 247 |
+
for seg in result['speaker_segments']:
|
| 248 |
+
print(f"{seg['start']:.2f}s - {seg['end']:.2f}s: {seg['speaker']}")
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
### Batch Processing
|
| 252 |
+
|
| 253 |
+
```python
|
| 254 |
+
# Process multiple files
|
| 255 |
+
audio_files = ['audio1.wav', 'audio2.wav', 'audio3.wav']
|
| 256 |
+
results = pipeline.process_batch(audio_files)
|
| 257 |
+
|
| 258 |
+
# Export results
|
| 259 |
+
for result in results:
|
| 260 |
+
pipeline.save_results(result, 'outputs/', format='json')
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
### Custom Configuration
|
| 264 |
+
|
| 265 |
+
```python
|
| 266 |
+
# Initialize with custom settings
|
| 267 |
+
pipeline = VADDiarizationPipeline(
|
| 268 |
+
vad_threshold=0.3, # More sensitive VAD
|
| 269 |
+
num_speakers=3, # Fixed 3 speakers
|
| 270 |
+
use_onnx_vad=True # Faster VAD inference
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# Process with overrides
|
| 274 |
+
result = pipeline.process_file(
|
| 275 |
+
'audio.wav',
|
| 276 |
+
num_speakers=2 # Override to 2 speakers for this file
|
| 277 |
+
)
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
### VAD Only
|
| 281 |
+
|
| 282 |
+
```python
|
| 283 |
+
from src.vad import SileroVAD
|
| 284 |
+
|
| 285 |
+
vad = SileroVAD(threshold=0.5)
|
| 286 |
+
|
| 287 |
+
# Process audio
|
| 288 |
+
timestamps = vad.process_file('audio.wav')
|
| 289 |
+
|
| 290 |
+
# Print speech segments
|
| 291 |
+
for ts in timestamps:
|
| 292 |
+
print(f"Speech: {ts['start']:.2f}s - {ts['end']:.2f}s")
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
### Diarization Only
|
| 296 |
+
|
| 297 |
+
```python
|
| 298 |
+
from src.diarization import SpeakerDiarization
|
| 299 |
+
|
| 300 |
+
diarizer = SpeakerDiarization(token='your_token')
|
| 301 |
+
|
| 302 |
+
# Process audio
|
| 303 |
+
segments, time_ms, metadata = diarizer.process_file('audio.wav')
|
| 304 |
+
|
| 305 |
+
# Print speaker segments
|
| 306 |
+
for seg in segments:
|
| 307 |
+
print(f"{seg['speaker']}: {seg['start']:.2f}s - {seg['end']:.2f}s")
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
## π§ͺ Testing
|
| 311 |
+
|
| 312 |
+
```bash
|
| 313 |
+
# Run all tests
|
| 314 |
+
python -m pytest tests/ -v
|
| 315 |
+
|
| 316 |
+
# Run with coverage
|
| 317 |
+
python -m pytest tests/ --cov=src --cov-report=html
|
| 318 |
+
|
| 319 |
+
# Test specific module
|
| 320 |
+
python -m pytest tests/test_vad.py -v
|
| 321 |
+
|
| 322 |
+
# Verify installation
|
| 323 |
+
python verify_installation.py
|
| 324 |
+
|
| 325 |
+
# Run benchmarks
|
| 326 |
+
python benchmarks/run_benchmarks.py
|
| 327 |
+
```
|
| 328 |
+
|
| 329 |
+
## π Output Formats
|
| 330 |
+
|
| 331 |
+
### JSON Format
|
| 332 |
+
```json
|
| 333 |
+
{
|
| 334 |
+
"audio_path": "audio.wav",
|
| 335 |
+
"speaker_segments": [
|
| 336 |
+
{
|
| 337 |
+
"start": 0.5,
|
| 338 |
+
"end": 3.2,
|
| 339 |
+
"speaker": "SPEAKER_00",
|
| 340 |
+
"duration": 2.7
|
| 341 |
+
}
|
| 342 |
+
],
|
| 343 |
+
"vad_segments": [
|
| 344 |
+
{
|
| 345 |
+
"start": 0.5,
|
| 346 |
+
"end": 3.2
|
| 347 |
+
}
|
| 348 |
+
],
|
| 349 |
+
"metadata": {
|
| 350 |
+
"num_speakers": 2,
|
| 351 |
+
"num_segments": 15,
|
| 352 |
+
"total_speech_time": 45.3
|
| 353 |
+
},
|
| 354 |
+
"processing_time": {
|
| 355 |
+
"vad_ms": 150.2,
|
| 356 |
+
"diarization_ms": 3200.5,
|
| 357 |
+
"total_ms": 3350.7
|
| 358 |
+
}
|
| 359 |
+
}
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
### RTTM Format
|
| 363 |
+
```
|
| 364 |
+
SPEAKER audio 1 0.500 2.700 <NA> <NA> SPEAKER_00 <NA> <NA>
|
| 365 |
+
SPEAKER audio 1 3.500 4.200 <NA> <NA> SPEAKER_01 <NA> <NA>
|
| 366 |
+
```
|
| 367 |
+
|
| 368 |
+
### Text Timeline
|
| 369 |
+
```
|
| 370 |
+
[0.50s - 3.20s] SPEAKER_00
|
| 371 |
+
[3.50s - 7.70s] SPEAKER_01
|
| 372 |
+
[8.00s - 10.50s] SPEAKER_00
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
## π― Use Cases
|
| 376 |
+
|
| 377 |
+
- **Meeting Transcription**: Identify who spoke when in recordings
|
| 378 |
+
- **Podcast Analysis**: Track speaker segments and statistics
|
| 379 |
+
- **Call Center Analytics**: Analyze customer-agent interactions
|
| 380 |
+
- **Video Production**: Generate speaker labels for editing
|
| 381 |
+
- **Research**: Speaker diarization for linguistic studies
|
| 382 |
+
- **Interview Processing**: Separate interviewer and interviewee
|
| 383 |
+
- **Broadcast Media**: Analyze news programs and talk shows
|
| 384 |
+
|
| 385 |
+
## π Troubleshooting
|
| 386 |
+
|
| 387 |
+
### Common Issues
|
| 388 |
+
|
| 389 |
+
#### 1. HF Token Error
|
| 390 |
+
```
|
| 391 |
+
Error: Invalid token or model access denied
|
| 392 |
+
```
|
| 393 |
+
**Solution**:
|
| 394 |
+
- Get token from https://huggingface.co/settings/tokens
|
| 395 |
+
- Accept model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1
|
| 396 |
+
- Set environment variable: `export HF_TOKEN='your_token'`
|
| 397 |
+
|
| 398 |
+
#### 2. CUDA Out of Memory
|
| 399 |
+
```
|
| 400 |
+
RuntimeError: CUDA out of memory
|
| 401 |
+
```
|
| 402 |
+
**Solution**:
|
| 403 |
+
- Process shorter audio segments
|
| 404 |
+
- Use CPU mode: `device='cpu'`
|
| 405 |
+
- Reduce batch size
|
| 406 |
+
|
| 407 |
+
#### 3. Audio Format Not Supported
|
| 408 |
+
```
|
| 409 |
+
Error loading audio
|
| 410 |
+
```
|
| 411 |
+
**Solution**: Convert to WAV format using FFmpeg:
|
| 412 |
+
```bash
|
| 413 |
+
ffmpeg -i input.mp3 -ar 16000 -ac 1 output.wav
|
| 414 |
+
```
|
| 415 |
+
|
| 416 |
+
#### 4. DiarizeOutput Error
|
| 417 |
+
```
|
| 418 |
+
'DiarizeOutput' object has no attribute 'itertracks'
|
| 419 |
+
```
|
| 420 |
+
**Solution**: This is fixed in the current version. Make sure you have the latest code.
|
| 421 |
+
|
| 422 |
+
#### 5. Import Errors
|
| 423 |
+
```
|
| 424 |
+
ModuleNotFoundError: No module named 'torch'
|
| 425 |
+
```
|
| 426 |
+
**Solution**:
|
| 427 |
+
- Activate your environment: `conda activate vad_diarization`
|
| 428 |
+
- Reinstall dependencies: `pip install -r requirements.txt`
|
| 429 |
+
|
| 430 |
+
## π API Compatibility
|
| 431 |
+
|
| 432 |
+
This project supports both:
|
| 433 |
+
- **Pyannote.audio 3.x**: Returns `Annotation` objects
|
| 434 |
+
- **Pyannote.audio 4.0+**: Returns `DiarizeOutput` objects
|
| 435 |
+
|
| 436 |
+
The code automatically detects and handles both formats.
|
| 437 |
+
|
| 438 |
+
## π Deployment Options
|
| 439 |
+
|
| 440 |
+
### Local Development
|
| 441 |
+
```bash
|
| 442 |
+
python app.py
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
### Docker
|
| 446 |
+
```bash
|
| 447 |
+
docker-compose up
|
| 448 |
+
```
|
| 449 |
+
|
| 450 |
+
### Cloud Platforms
|
| 451 |
+
|
| 452 |
+
**Hugging Face Spaces:**
|
| 453 |
+
- Fork this repository
|
| 454 |
+
- Create new Space
|
| 455 |
+
- Connect repository
|
| 456 |
+
- Set `HF_TOKEN` secret
|
| 457 |
+
- Deploy!
|
| 458 |
+
|
| 459 |
+
**AWS/GCP/Azure:**
|
| 460 |
+
- Use provided Dockerfile
|
| 461 |
+
- Deploy as container service
|
| 462 |
+
- Configure GPU instances for best performance
|
| 463 |
+
|
| 464 |
+
## π€ Contributing
|
| 465 |
+
|
| 466 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
| 467 |
+
|
| 468 |
+
1. Fork the repository
|
| 469 |
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
| 470 |
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
| 471 |
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
| 472 |
+
5. Open a Pull Request
|
| 473 |
+
|
| 474 |
+
## π License
|
| 475 |
+
|
| 476 |
+
This project is licensed under the MIT License.
|
| 477 |
+
|
| 478 |
+
## π Acknowledgments
|
| 479 |
+
|
| 480 |
+
- [Silero VAD](https://github.com/snakers4/silero-vad) - Fast and accurate VAD
|
| 481 |
+
- [Pyannote.audio](https://github.com/pyannote/pyannote-audio) - Speaker diarization toolkit
|
| 482 |
+
- [Gradio](https://gradio.app/) - Web interface framework
|
| 483 |
+
- [PyTorch](https://pytorch.org/) - Deep learning framework
|
| 484 |
+
|
| 485 |
+
## π§ Support
|
| 486 |
+
|
| 487 |
+
For questions or issues:
|
| 488 |
+
- Open an issue on GitHub
|
| 489 |
+
- Check existing issues for solutions
|
| 490 |
+
- Review the troubleshooting section
|
| 491 |
+
|
| 492 |
---
|
| 493 |
|
| 494 |
+
**Built with β€οΈ for the speech processing community**
|
app.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Gradio Web Interface for Real-Time VAD + Speaker Diarization
|
| 4 |
+
Interactive demo with visualizations
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import numpy as np
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
import matplotlib.patches as mpatches
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import tempfile
|
| 15 |
+
from typing import Optional, Tuple, List, Dict
|
| 16 |
+
|
| 17 |
+
from src.pipeline import VADDiarizationPipeline
|
| 18 |
+
from src.utils import visualize_timeline, segment_to_rttm
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Initialize pipeline
|
| 22 |
+
print("Initializing pipeline...")
|
| 23 |
+
HF_TOKEN = os.environ.get('HF_TOKEN', None)
|
| 24 |
+
|
| 25 |
+
if not HF_TOKEN:
|
| 26 |
+
print("β οΈ No HF_TOKEN found. Set it with: export HF_TOKEN='your_token_here'")
|
| 27 |
+
print("Pipeline will work with VAD only until token is provided.")
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
pipeline = VADDiarizationPipeline(
|
| 31 |
+
use_auth_token=HF_TOKEN,
|
| 32 |
+
vad_threshold=0.5
|
| 33 |
+
)
|
| 34 |
+
PIPELINE_READY = True
|
| 35 |
+
except Exception as e:
|
| 36 |
+
print(f"β οΈ Could not initialize full pipeline: {e}")
|
| 37 |
+
print("Will use VAD-only mode")
|
| 38 |
+
PIPELINE_READY = False
|
| 39 |
+
|
def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
    """Create a visual timeline plot of speaker segments.

    Args:
        segments: Dicts with 'speaker', 'start' and 'duration' keys
            (times in seconds).
        duration: Total extent of the x-axis in seconds.

    Returns:
        A matplotlib Figure with one horizontal bar per segment,
        color-coded and labelled by speaker.
    """
    fig, ax = plt.subplots(figsize=(12, 4))

    # BUGFIX: handle the empty case up front. The original code ran
    # np.linspace(0, 1, 0) and set_xlim(0, 0) when no speech was detected,
    # producing a broken/blank axis instead of a clear message.
    if not segments:
        ax.text(0.5, 0.5, 'No speech segments detected',
                ha='center', va='center', transform=ax.transAxes)
        ax.set_xlabel('Time (seconds)', fontsize=12)
        ax.set_yticks([])
        ax.set_title('Speaker Timeline', fontsize=14, fontweight='bold')
        plt.tight_layout()
        return fig

    # Assign each speaker a stable, distinct color from a qualitative map.
    speakers = sorted(set(seg['speaker'] for seg in segments))
    colors = plt.cm.Set3(np.linspace(0, 1, len(speakers)))
    speaker_colors = {speaker: colors[i] for i, speaker in enumerate(speakers)}

    for seg in segments:
        color = speaker_colors[seg['speaker']]
        ax.barh(
            0,
            seg['duration'],
            left=seg['start'],
            height=0.8,
            color=color,
            edgecolor='black',
            linewidth=0.5
        )

        # Label only segments long enough for the text to fit.
        if seg['duration'] > 1.0:
            mid = seg['start'] + seg['duration'] / 2
            ax.text(
                mid, 0, seg['speaker'],
                ha='center', va='center',
                fontsize=8, fontweight='bold'
            )

    # Axis formatting; guard against a zero-length axis.
    ax.set_xlim(0, max(duration, 1e-6))
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel('Time (seconds)', fontsize=12)
    ax.set_yticks([])
    ax.set_title('Speaker Timeline', fontsize=14, fontweight='bold')
    ax.grid(True, axis='x', alpha=0.3)

    # One legend entry per speaker, matching the bar colors.
    legend_patches = [
        mpatches.Patch(color=speaker_colors[speaker], label=speaker)
        for speaker in speakers
    ]
    ax.legend(handles=legend_patches, loc='upper right')

    plt.tight_layout()
    return fig
def process_audio(
    audio_file,
    num_speakers: Optional[int] = None,
    vad_threshold: float = 0.5,
    progress=gr.Progress()
) -> Tuple[str, str, str, Optional[plt.Figure]]:
    """
    Process an uploaded audio file through the VAD + diarization pipeline.

    Args:
        audio_file: Filesystem path to the uploaded audio (Gradio filepath).
        num_speakers: Fixed speaker count; None or 0 enables auto-detection.
        vad_threshold: Silero VAD sensitivity in [0, 1]; lower is more
            sensitive to speech.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of (summary_markdown, timeline_markdown, json_string, figure).
        The figure is None on error or when no speech was detected.
    """
    if audio_file is None:
        return "Please upload an audio file", "", "", None

    if not PIPELINE_READY:
        return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None

    try:
        progress(0.1, desc="Loading audio...")

        # Apply the UI-selected sensitivity before running VAD.
        pipeline.vad.threshold = vad_threshold

        progress(0.3, desc="Running VAD...")

        # 0 (or empty) means "let the diarizer estimate the speaker count".
        num_speakers_param = int(num_speakers) if num_speakers and num_speakers > 0 else None

        progress(0.5, desc="Running speaker diarization...")

        result = pipeline.process_file(
            audio_file,
            num_speakers=num_speakers_param,
            return_vad=True,
            return_stats=True
        )

        progress(0.8, desc="Generating visualizations...")

        # --- Markdown summary ------------------------------------------------
        summary_lines = []
        summary_lines.append("# Processing Results\n")
        summary_lines.append(f"**File:** {Path(audio_file).name}\n")
        summary_lines.append(f"**Speakers Detected:** {result['metadata']['num_speakers']}")
        summary_lines.append(f"**Speaker Segments:** {result['metadata']['num_segments']}")
        summary_lines.append(f"**Total Speech Time:** {result['metadata']['total_speech_time']:.2f}s\n")

        summary_lines.append("## Processing Time")
        summary_lines.append(f"- VAD: {result['processing_time']['vad_ms']:.2f}ms")
        summary_lines.append(f"- Diarization: {result['processing_time']['diarization_ms']:.2f}ms")
        summary_lines.append(f"- **Total: {result['processing_time']['total_ms']:.2f}ms**\n")

        # Per-speaker stats are only present when return_stats=True succeeded.
        if 'speaker_statistics' in result:
            summary_lines.append("## Speaker Statistics\n")
            for speaker, stats in result['speaker_statistics'].items():
                summary_lines.append(f"### {speaker}")
                summary_lines.append(f"- Total speaking time: {stats['total_time']:.2f}s")
                summary_lines.append(f"- Number of segments: {stats['num_segments']}")
                summary_lines.append(f"- Average segment duration: {stats['avg_segment_duration']:.2f}s\n")

        summary_text = "\n".join(summary_lines)

        # --- Plain-text timeline ---------------------------------------------
        timeline_lines = ["# Speaker Timeline\n"]
        timeline_lines.append("```")
        for seg in result['speaker_segments']:
            timeline_lines.append(
                f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']} ({seg['duration']:.2f}s)"
            )
        timeline_lines.append("```")
        timeline_text = "\n".join(timeline_lines)

        # --- JSON export -----------------------------------------------------
        # default=str covers non-serializable values (e.g. numpy scalars).
        json_output = json.dumps(result, indent=2, default=str)

        # --- Timeline figure -------------------------------------------------
        # BUGFIX: max() over an empty sequence raises ValueError when no
        # speech was detected; skip the plot instead of crashing the request.
        segments = result['speaker_segments']
        if segments:
            duration = max(seg['end'] for seg in segments)
            plot = create_timeline_plot(segments, duration)
        else:
            plot = None

        progress(1.0, desc="Complete!")

        return summary_text, timeline_text, json_output, plot

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}\n\n"
        error_msg += "Make sure you have:\n"
        error_msg += "1. Valid HF_TOKEN environment variable\n"
        error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
        return error_msg, "", "", None
def create_demo():
    """Build and return the Gradio Blocks interface.

    Layout: a two-column row (inputs on the left, tabbed results on the
    right), an examples section, and a footer. Event wiring connects the
    process button to process_audio().

    NOTE(review): emoji in the UI strings below were reconstructed from
    mojibake in the rendered diff — verify against the original file.
    """

    with gr.Blocks(title="VAD + Speaker Diarization", theme=gr.themes.Soft()) as demo:
        # --- Header / feature overview -----------------------------------
        gr.Markdown("""
        # 🎙️ Real-Time Voice Activity Detection + Speaker Diarization

        Upload an audio file to detect speech segments and identify different speakers.

        **Features:**
        - Voice Activity Detection (VAD) with <100ms latency
        - Speaker Diarization with state-of-the-art accuracy
        - Visual timeline of speaker segments
        - Detailed statistics and JSON export

        **Supported formats:** WAV, MP3, FLAC, OGG, M4A
        """)

        with gr.Row():
            # --- Left column: inputs and settings ------------------------
            with gr.Column(scale=1):
                gr.Markdown("## Input")

                # type="filepath" hands process_audio() a path, not raw audio.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload"]
                )

                with gr.Accordion("Advanced Settings", open=False):
                    # 0 is the sentinel for "auto-detect speaker count".
                    num_speakers = gr.Number(
                        label="Number of Speakers (0 for auto-detection)",
                        value=0,
                        precision=0,
                        minimum=0,
                        maximum=10,
                        info="Set to 0 for automatic speaker detection"
                    )

                    vad_threshold = gr.Slider(
                        label="VAD Sensitivity Threshold",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        info="Lower = more sensitive to speech"
                    )

                process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")

                gr.Markdown("""
                ### Tips:
                - For best results, use clear audio with minimal background noise
                - Specify number of speakers if known for better accuracy
                - Adjust VAD threshold if speech is not detected properly
                """)

            # --- Right column: tabbed results ----------------------------
            with gr.Column(scale=2):
                gr.Markdown("## Results")

                with gr.Tab("Summary"):
                    summary_output = gr.Markdown(label="Summary")

                with gr.Tab("Timeline"):
                    timeline_plot = gr.Plot(label="Visual Timeline")
                    timeline_output = gr.Markdown(label="Timeline Details")

                with gr.Tab("JSON Export"):
                    json_output = gr.Code(
                        label="Full Results (JSON)",
                        language="json",
                        lines=20
                    )

        # --- Examples / expectations -------------------------------------
        gr.Markdown("## 📁 Examples")
        gr.Markdown("""
        Try the demo with your own audio files or use sample data from the FEARLESS STEPS dataset.

        **Expected Performance:**
        - VAD Latency: <100ms per second of audio
        - Diarization Error Rate (DER): ~19-20% on benchmark datasets
        - Processing Time: Depends on audio length and hardware
        """)

        # --- Event wiring -------------------------------------------------
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, num_speakers, vad_threshold],
            outputs=[summary_output, timeline_output, json_output, timeline_plot]
        )

        # --- Footer -------------------------------------------------------
        gr.Markdown("""
        ---
        **Tech Stack:** Silero VAD + Pyannote.audio 3.1 | **GPU:** CUDA 12.5+ supported

        **Note:** First run may take longer due to model downloads (~1GB)
        """)

    return demo
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at Gradio's default port.
    demo = create_demo()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)
benchmarks/run_benchmarks.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Benchmark script for VAD + Speaker Diarization
|
| 4 |
+
Tests performance on various audio conditions
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 10 |
+
|
| 11 |
+
import time
|
| 12 |
+
import json
|
| 13 |
+
import numpy as np
|
| 14 |
+
from typing import Dict, List
|
| 15 |
+
import argparse
|
| 16 |
+
|
| 17 |
+
from src.vad import SileroVAD
|
| 18 |
+
from src.pipeline import VADDiarizationPipeline
|
| 19 |
+
from src.utils import create_test_audio
|
| 20 |
+
|
| 21 |
+
|
class Benchmark:
    """Benchmark suite for VAD + Diarization.

    Each ``benchmark_*`` method measures one aspect (latency, threshold
    sensitivity, memory, end-to-end pipeline), prints a human-readable
    report, and records its numbers in ``self.results`` so they can be
    serialized later via :meth:`save_results`.

    NOTE(review): emoji in the printed messages were reconstructed from
    mojibake in the rendered diff — verify against the original file.
    """

    def __init__(self, use_auth_token: str = None):
        """Initialize the benchmark suite.

        Args:
            use_auth_token: Hugging Face token; required only for the full
                pipeline benchmark (pyannote gated models). May be None.
        """
        self.use_auth_token = use_auth_token
        self.results = {}

    def benchmark_vad_latency(self, durations: tuple = (1, 5, 10, 30, 60)):
        """Benchmark VAD latency across different audio durations.

        Args:
            durations: Audio lengths in seconds to test.
                BUGFIX: tuple default instead of a mutable list (B006);
                callers passing a list are unaffected.

        Returns:
            List of per-duration dicts with timing metrics.
        """
        print("\n" + "="*60)
        print("VAD LATENCY BENCHMARK")
        print("="*60)

        vad = SileroVAD(threshold=0.5)
        results = []

        for duration in durations:
            print(f"\nTesting {duration}s audio...")
            metrics = vad.benchmark_latency(duration_seconds=duration)

            result = {
                'duration_s': duration,
                'processing_time_ms': metrics['total_processing_time_ms'],
                'latency_per_second_ms': metrics['latency_per_second_ms'],
                'real_time_factor': metrics['real_time_factor']
            }
            results.append(result)

            print(f"  Processing time: {result['processing_time_ms']:.2f}ms")
            print(f"  Latency/second: {result['latency_per_second_ms']:.2f}ms")
            print(f"  Real-time factor: {result['real_time_factor']:.4f}x")

            # Project target: under 100 ms of processing per second of audio.
            if result['latency_per_second_ms'] < 100:
                print("  ✅ Target achieved (<100ms)")
            else:
                print("  ⚠️ Above target (>100ms)")

        self.results['vad_latency'] = results

        avg_latency = np.mean([r['latency_per_second_ms'] for r in results])
        print(f"\n📊 Average latency: {avg_latency:.2f}ms per second")

        return results

    def benchmark_vad_thresholds(self, thresholds: tuple = (0.3, 0.5, 0.7)):
        """Benchmark VAD with different sensitivity thresholds.

        Args:
            thresholds: VAD sensitivity values in [0, 1] to compare.
                BUGFIX: tuple default instead of a mutable list (B006).

        Returns:
            List of per-threshold dicts with segment counts and timings.
        """
        print("\n" + "="*60)
        print("VAD THRESHOLD BENCHMARK")
        print("="*60)

        # Synthetic fixture; deleted again at the end of the run.
        test_audio = create_test_audio("test_threshold.wav", duration=10.0)
        results = []

        for threshold in thresholds:
            print(f"\nTesting threshold {threshold}...")
            vad = SileroVAD(threshold=threshold)

            timestamps, processing_time = vad.process_file(test_audio)

            result = {
                'threshold': threshold,
                'num_segments': len(timestamps),
                'processing_time_ms': processing_time,
                'total_speech_time_s': sum(ts['end'] - ts['start'] for ts in timestamps)
            }
            results.append(result)

            print(f"  Segments detected: {result['num_segments']}")
            print(f"  Total speech time: {result['total_speech_time_s']:.2f}s")
            print(f"  Processing time: {result['processing_time_ms']:.2f}ms")

        self.results['vad_thresholds'] = results

        # Cleanup the synthetic fixture.
        Path(test_audio).unlink(missing_ok=True)

        return results

    def benchmark_full_pipeline(self):
        """Benchmark the full VAD + Diarization pipeline on synthetic audio.

        Returns:
            Dict of timing/speaker metrics, or None when no token is
            available or the pipeline fails to initialize.
        """
        print("\n" + "="*60)
        print("FULL PIPELINE BENCHMARK")
        print("="*60)

        # Diarization models are gated; skip rather than crash without a token.
        if not self.use_auth_token:
            print("⚠️ No HF_TOKEN provided, skipping full pipeline benchmark")
            return None

        try:
            print("\nInitializing pipeline...")
            pipeline = VADDiarizationPipeline(
                use_auth_token=self.use_auth_token,
                vad_threshold=0.5
            )

            test_audio = create_test_audio("test_pipeline.wav", duration=30.0)

            print(f"\nProcessing {test_audio}...")
            result = pipeline.process_file(test_audio)

            benchmark_result = {
                'audio_duration_s': 30.0,
                'vad_time_ms': result['processing_time']['vad_ms'],
                'diarization_time_ms': result['processing_time']['diarization_ms'],
                'total_time_ms': result['processing_time']['total_ms'],
                'num_speakers': result['metadata']['num_speakers'],
                'num_segments': result['metadata']['num_segments']
            }

            print(f"\n📊 Results:")
            print(f"  VAD time: {benchmark_result['vad_time_ms']:.2f}ms")
            print(f"  Diarization time: {benchmark_result['diarization_time_ms']:.2f}ms")
            print(f"  Total time: {benchmark_result['total_time_ms']:.2f}ms")
            print(f"  Speakers: {benchmark_result['num_speakers']}")
            print(f"  Segments: {benchmark_result['num_segments']}")

            self.results['full_pipeline'] = benchmark_result

            # Cleanup the synthetic fixture.
            Path(test_audio).unlink(missing_ok=True)

            return benchmark_result

        except Exception as e:
            # Best-effort benchmark: report and continue with the suite.
            print(f"❌ Error: {e}")
            return None

    def benchmark_memory_usage(self):
        """Measure resident (and, if available, GPU) memory cost of the VAD.

        Returns:
            Dict with before/after RSS in MB and optional GPU allocation.
        """
        print("\n" + "="*60)
        print("MEMORY USAGE BENCHMARK")
        print("="*60)

        # Local imports keep psutil/torch optional for the other benchmarks.
        import psutil
        import torch

        process = psutil.Process()

        initial_mem = process.memory_info().rss / 1024 / 1024  # MB
        print(f"\nInitial memory: {initial_mem:.2f} MB")

        print("\nLoading VAD...")
        vad = SileroVAD()
        vad_mem = process.memory_info().rss / 1024 / 1024
        print(f"After VAD: {vad_mem:.2f} MB (+{vad_mem - initial_mem:.2f} MB)")

        # gpu_mem is defined and used only inside the CUDA branches.
        if torch.cuda.is_available():
            gpu_mem = torch.cuda.memory_allocated() / 1024 / 1024
            print(f"GPU memory: {gpu_mem:.2f} MB")

        result = {
            'initial_memory_mb': initial_mem,
            'vad_memory_mb': vad_mem,
            'vad_increase_mb': vad_mem - initial_mem
        }

        if torch.cuda.is_available():
            result['gpu_memory_mb'] = gpu_mem

        self.results['memory_usage'] = result

        return result

    def save_results(self, output_path: str = "benchmark_results.json"):
        """Serialize all accumulated results as JSON next to this script.

        Args:
            output_path: Filename, resolved relative to this file's directory.
        """
        output_file = Path(__file__).parent / output_path

        with open(output_file, 'w') as f:
            json.dump(self.results, f, indent=2)

        print(f"\n✅ Results saved to: {output_file}")

    def run_all(self):
        """Run every benchmark (pipeline only with a token) and save results."""
        print("\n" + "="*60)
        print("RUNNING ALL BENCHMARKS")
        print("="*60)

        self.benchmark_vad_latency()
        self.benchmark_vad_thresholds()
        self.benchmark_memory_usage()

        # Full pipeline requires gated-model access.
        if self.use_auth_token:
            self.benchmark_full_pipeline()

        self.save_results()

        print("\n" + "="*60)
        print("✅ ALL BENCHMARKS COMPLETE")
        print("="*60)
def main():
    """Command-line entry point for the benchmark suite.

    Resolves the Hugging Face token from ``--token`` or the ``HF_TOKEN``
    environment variable, then runs either the quick (VAD-only) or the
    full benchmark suite.
    """
    # BUGFIX: `os` was previously imported only inside the __main__ guard,
    # so importing this module and calling main() raised NameError.
    import os

    parser = argparse.ArgumentParser(description="Run VAD + Diarization benchmarks")
    parser.add_argument(
        '--token',
        type=str,
        default=None,
        help='Hugging Face token for full pipeline benchmark'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='benchmark_results.json',
        help='Output file for results'
    )
    parser.add_argument(
        '--quick',
        action='store_true',
        help='Run quick benchmark (VAD only)'
    )

    args = parser.parse_args()

    # CLI flag wins; fall back to the environment.
    token = args.token or os.environ.get('HF_TOKEN')

    benchmark = Benchmark(use_auth_token=token)

    if args.quick:
        # Quick benchmark: shorter durations, VAD only, honors --output.
        benchmark.benchmark_vad_latency(durations=[1, 5, 10])
        benchmark.save_results(args.output)
    else:
        # Full suite (note: run_all() saves to the default output path).
        benchmark.run_all()
if __name__ == "__main__":
    # NOTE(review): this import belongs at the top of the module with the
    # other imports — main() reads os.environ and would raise NameError if
    # imported and called without going through this guard.
    import os
    main()
docker-compose.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Compose definition for the VAD + speaker-diarization Gradio app.
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose v2 — safe to remove once all environments use Compose v2.
version: '3.8'

services:
  vad-diarization:
    build: .
    container_name: vad-diarization
    ports:
      - "7860:7860"  # Gradio's default port, exposed to the host
    environment:
      - HF_TOKEN=${HF_TOKEN}  # Hugging Face token for pyannote gated models
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
    volumes:
      - ./data:/app/data        # input audio
      - ./outputs:/app/outputs  # generated results
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU (requires nvidia-container-toolkit).
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7860/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s  # allow time for model downloads on first boot
environment.yml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Conda environment for the VAD + diarization project.
# PyTorch and the CUDA runtime come from conda channels; everything
# audio/ML-specific is installed via pip inside the environment.
name: vad_diarization
channels:
  - pytorch
  - nvidia
  - conda-forge
  - defaults
dependencies:
  - python=3.10
  - pytorch>=2.0.0
  - torchvision>=0.15.0
  - torchaudio>=2.0.0
  - pytorch-cuda=12.1
  - ffmpeg  # audio decoding backend used by librosa/torchaudio
  - pip
  - pip:
      # VAD
      - silero-vad>=5.0.0

      # Speaker Diarization
      - pyannote.audio>=3.1.0
      - pyannote.core>=5.0.0
      - pyannote.metrics>=3.2.0

      # Audio processing
      - librosa>=0.10.0
      - soundfile>=0.12.0
      - numpy>=1.24.0

      # Web interface
      - gradio>=4.0.0

      # Visualization
      - matplotlib>=3.7.0

      # Utilities
      - tqdm>=4.65.0
      - pyyaml>=6.0

      # Testing
      - pytest>=7.0.0
      - pytest-cov>=4.0.0

      # System utilities (memory benchmarks)
      - psutil>=5.9.0
notebooks/demo.ipynb
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Real-Time VAD + Speaker Diarization Demo\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"This notebook demonstrates the complete pipeline for voice activity detection and speaker diarization."
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "markdown",
|
| 14 |
+
"metadata": {},
|
| 15 |
+
"source": [
|
| 16 |
+
"## Setup"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"execution_count": null,
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"outputs": [],
|
| 24 |
+
"source": [
|
| 25 |
+
"import sys\n",
|
| 26 |
+
"sys.path.insert(0, '..')\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"import numpy as np\n",
|
| 29 |
+
"import matplotlib.pyplot as plt\n",
|
| 30 |
+
"from pathlib import Path\n",
|
| 31 |
+
"import os\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"from src.vad import SileroVAD\n",
|
| 34 |
+
"from src.diarization import SpeakerDiarization\n",
|
| 35 |
+
"from src.pipeline import VADDiarizationPipeline\n",
|
| 36 |
+
"from src.utils import create_test_audio, visualize_timeline\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"print(\"β
Imports successful\")"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "markdown",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"source": [
|
| 45 |
+
"## 1. Voice Activity Detection (VAD)"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"execution_count": null,
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [],
|
| 53 |
+
"source": [
|
| 54 |
+
"# Initialize VAD\n",
|
| 55 |
+
"vad = SileroVAD(threshold=0.5)\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"# Benchmark latency\n",
|
| 58 |
+
"print(\"Benchmarking VAD latency...\")\n",
|
| 59 |
+
"metrics = vad.benchmark_latency(duration_seconds=10.0)\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"print(f\"\\nVAD Performance:\")\n",
|
| 62 |
+
"print(f\" Total processing time: {metrics['total_processing_time_ms']:.2f}ms\")\n",
|
| 63 |
+
"print(f\" Audio duration: {metrics['audio_duration_s']:.1f}s\")\n",
|
| 64 |
+
"print(f\" Latency per second: {metrics['latency_per_second_ms']:.2f}ms\")\n",
|
| 65 |
+
"print(f\" Real-time factor: {metrics['real_time_factor']:.4f}x\")\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"if metrics['latency_per_second_ms'] < 100:\n",
|
| 68 |
+
" print(\"\\nβ
Target latency achieved (<100ms)\")\n",
|
| 69 |
+
"else:\n",
|
| 70 |
+
" print(\"\\nβ οΈ Latency above target\")"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "markdown",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"source": [
|
| 77 |
+
"## 2. Create Test Audio"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "code",
|
| 82 |
+
"execution_count": null,
|
| 83 |
+
"metadata": {},
|
| 84 |
+
"outputs": [],
|
| 85 |
+
"source": [
|
| 86 |
+
"# Create synthetic test audio\n",
|
| 87 |
+
"test_audio_path = create_test_audio(\"test_audio.wav\", duration=10.0)\n",
|
| 88 |
+
"print(f\"β
Created test audio: {test_audio_path}\")"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "markdown",
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"source": [
|
| 95 |
+
"## 3. Process with VAD"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": null,
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"# Process test audio\n",
|
| 105 |
+
"timestamps, processing_time = vad.process_file(test_audio_path)\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"print(f\"\\nVAD Results:\")\n",
|
| 108 |
+
"print(f\" Found {len(timestamps)} speech segments\")\n",
|
| 109 |
+
"print(f\" Processing time: {processing_time:.2f}ms\")\n",
|
| 110 |
+
"print(f\"\\nSegments:\")\n",
|
| 111 |
+
"for i, ts in enumerate(timestamps, 1):\n",
|
| 112 |
+
" print(f\" {i}. {ts['start']:.2f}s - {ts['end']:.2f}s ({ts['end']-ts['start']:.2f}s)\")"
|
| 113 |
+
]
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"cell_type": "markdown",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"source": [
|
| 119 |
+
"## 4. Full Pipeline (VAD + Diarization)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"**Note:** This requires a Hugging Face token. Set it with:\n",
|
| 122 |
+
"```python\n",
|
| 123 |
+
"os.environ['HF_TOKEN'] = 'your_token_here'\n",
|
| 124 |
+
"```"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"cell_type": "code",
|
| 129 |
+
"execution_count": null,
|
| 130 |
+
"metadata": {},
|
| 131 |
+
"outputs": [],
|
| 132 |
+
"source": [
|
| 133 |
+
"# Check for HF token\n",
|
| 134 |
+
"HF_TOKEN = os.environ.get('HF_TOKEN')\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"if not HF_TOKEN:\n",
|
| 137 |
+
" print(\"β οΈ No HF_TOKEN found. Set it to run full pipeline:\")\n",
|
| 138 |
+
" print(\" os.environ['HF_TOKEN'] = 'your_token_here'\")\n",
|
| 139 |
+
"else:\n",
|
| 140 |
+
" print(\"β
HF_TOKEN found, initializing full pipeline...\")\n",
|
| 141 |
+
" \n",
|
| 142 |
+
" try:\n",
|
| 143 |
+
" # Initialize pipeline\n",
|
| 144 |
+
" pipeline = VADDiarizationPipeline(\n",
|
| 145 |
+
" use_auth_token=HF_TOKEN,\n",
|
| 146 |
+
" vad_threshold=0.5\n",
|
| 147 |
+
" )\n",
|
| 148 |
+
" \n",
|
| 149 |
+
" print(\"\\nβ
Pipeline initialized successfully\")\n",
|
| 150 |
+
" \n",
|
| 151 |
+
" except Exception as e:\n",
|
| 152 |
+
" print(f\"\\nβ Error initializing pipeline: {e}\")\n",
|
| 153 |
+
" print(\"\\nMake sure you have:\")\n",
|
| 154 |
+
" print(\"1. Valid HF token\")\n",
|
| 155 |
+
" print(\"2. Accepted model conditions at:\")\n",
|
| 156 |
+
" print(\" https://huggingface.co/pyannote/speaker-diarization-3.1\")"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"cell_type": "markdown",
|
| 161 |
+
"metadata": {},
|
| 162 |
+
"source": [
|
| 163 |
+
"## 5. Process Audio with Full Pipeline"
|
| 164 |
+
]
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"cell_type": "code",
|
| 168 |
+
"execution_count": null,
|
| 169 |
+
"metadata": {},
|
| 170 |
+
"outputs": [],
|
| 171 |
+
"source": [
|
| 172 |
+
"# Only run if pipeline is initialized\n",
|
| 173 |
+
"if 'pipeline' in locals():\n",
|
| 174 |
+
" # Process audio file\n",
|
| 175 |
+
" result = pipeline.process_file(test_audio_path)\n",
|
| 176 |
+
" \n",
|
| 177 |
+
" # Display results\n",
|
| 178 |
+
" print(\"\\n\" + \"=\"*60)\n",
|
| 179 |
+
" print(\"RESULTS\")\n",
|
| 180 |
+
" print(\"=\"*60)\n",
|
| 181 |
+
" print(f\"\\nSpeakers detected: {result['metadata']['num_speakers']}\")\n",
|
| 182 |
+
" print(f\"Speaker segments: {result['metadata']['num_segments']}\")\n",
|
| 183 |
+
" print(f\"Total speech time: {result['metadata']['total_speech_time']:.2f}s\")\n",
|
| 184 |
+
" \n",
|
| 185 |
+
" print(f\"\\nProcessing time:\")\n",
|
| 186 |
+
" print(f\" VAD: {result['processing_time']['vad_ms']:.2f}ms\")\n",
|
| 187 |
+
" print(f\" Diarization: {result['processing_time']['diarization_ms']:.2f}ms\")\n",
|
| 188 |
+
" print(f\" Total: {result['processing_time']['total_ms']:.2f}ms\")\n",
|
| 189 |
+
" \n",
|
| 190 |
+
" print(f\"\\nSpeaker Timeline:\")\n",
|
| 191 |
+
" for seg in result['speaker_segments']:\n",
|
| 192 |
+
" print(f\" {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}\")\n",
|
| 193 |
+
"else:\n",
|
| 194 |
+
" print(\"β οΈ Pipeline not initialized. Set HF_TOKEN to run full pipeline.\")"
|
| 195 |
+
]
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"cell_type": "markdown",
|
| 199 |
+
"metadata": {},
|
| 200 |
+
"source": [
|
| 201 |
+
"## 6. Visualize Results"
|
| 202 |
+
]
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"cell_type": "code",
|
| 206 |
+
"execution_count": null,
|
| 207 |
+
"metadata": {},
|
| 208 |
+
"outputs": [],
|
| 209 |
+
"source": [
|
| 210 |
+
"if 'result' in locals():\n",
|
| 211 |
+
" # ASCII timeline\n",
|
| 212 |
+
" timeline = visualize_timeline(result['speaker_segments'])\n",
|
| 213 |
+
" print(timeline)\n",
|
| 214 |
+
" \n",
|
| 215 |
+
" # Plot timeline\n",
|
| 216 |
+
" fig, ax = plt.subplots(figsize=(12, 4))\n",
|
| 217 |
+
" \n",
|
| 218 |
+
" speakers = sorted(set(seg['speaker'] for seg in result['speaker_segments']))\n",
|
| 219 |
+
" colors = plt.cm.Set3(np.linspace(0, 1, len(speakers)))\n",
|
| 220 |
+
" speaker_colors = {speaker: colors[i] for i, speaker in enumerate(speakers)}\n",
|
| 221 |
+
" \n",
|
| 222 |
+
" for seg in result['speaker_segments']:\n",
|
| 223 |
+
" color = speaker_colors[seg['speaker']]\n",
|
| 224 |
+
" ax.barh(0, seg['duration'], left=seg['start'], height=0.8, \n",
|
| 225 |
+
" color=color, edgecolor='black', linewidth=0.5)\n",
|
| 226 |
+
" \n",
|
| 227 |
+
" ax.set_xlabel('Time (seconds)', fontsize=12)\n",
|
| 228 |
+
" ax.set_yticks([])\n",
|
| 229 |
+
" ax.set_title('Speaker Timeline', fontsize=14, fontweight='bold')\n",
|
| 230 |
+
" ax.grid(True, axis='x', alpha=0.3)\n",
|
| 231 |
+
" \n",
|
| 232 |
+
" # Legend\n",
|
| 233 |
+
" from matplotlib.patches import Patch\n",
|
| 234 |
+
" legend_patches = [Patch(color=speaker_colors[s], label=s) for s in speakers]\n",
|
| 235 |
+
" ax.legend(handles=legend_patches, loc='upper right')\n",
|
| 236 |
+
" \n",
|
| 237 |
+
" plt.tight_layout()\n",
|
| 238 |
+
" plt.show()\n",
|
| 239 |
+
"else:\n",
|
| 240 |
+
" print(\"β οΈ No results to visualize\")"
|
| 241 |
+
]
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"cell_type": "markdown",
|
| 245 |
+
"metadata": {},
|
| 246 |
+
"source": [
|
| 247 |
+
"## 7. Export Results"
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "code",
|
| 252 |
+
"execution_count": null,
|
| 253 |
+
"metadata": {},
|
| 254 |
+
"outputs": [],
|
| 255 |
+
"source": [
|
| 256 |
+
"if 'result' in locals():\n",
|
| 257 |
+
" # Export as JSON\n",
|
| 258 |
+
" pipeline.save_results(result, 'output.json', format='json')\n",
|
| 259 |
+
" \n",
|
| 260 |
+
" # Export as RTTM\n",
|
| 261 |
+
" pipeline.save_results(result, 'output.rttm', format='rttm')\n",
|
| 262 |
+
" \n",
|
| 263 |
+
" # Export as text\n",
|
| 264 |
+
" pipeline.save_results(result, 'output.txt', format='text')\n",
|
| 265 |
+
" \n",
|
| 266 |
+
" print(\"β
Results exported in multiple formats\")\n",
|
| 267 |
+
"else:\n",
|
| 268 |
+
" print(\"β οΈ No results to export\")"
|
| 269 |
+
]
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"cell_type": "markdown",
|
| 273 |
+
"metadata": {},
|
| 274 |
+
"source": [
|
| 275 |
+
"## Summary\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"This notebook demonstrated:\n",
|
| 278 |
+
"1. β
VAD with <100ms latency\n",
|
| 279 |
+
"2. β
Speaker diarization with state-of-the-art accuracy\n",
|
| 280 |
+
"3. β
Integrated pipeline processing\n",
|
| 281 |
+
"4. β
Visualization and export\n",
|
| 282 |
+
"\n",
|
| 283 |
+
"Next steps:\n",
|
| 284 |
+
"- Test on real audio files\n",
|
| 285 |
+
"- Benchmark on FEARLESS STEPS dataset\n",
|
| 286 |
+
"- Deploy with Gradio interface\n",
|
| 287 |
+
"- Containerize with Docker"
|
| 288 |
+
]
|
| 289 |
+
}
|
| 290 |
+
],
|
| 291 |
+
"metadata": {
|
| 292 |
+
"kernelspec": {
|
| 293 |
+
"display_name": "Python 3",
|
| 294 |
+
"language": "python",
|
| 295 |
+
"name": "python3"
|
| 296 |
+
},
|
| 297 |
+
"language_info": {
|
| 298 |
+
"codemirror_mode": {
|
| 299 |
+
"name": "ipython",
|
| 300 |
+
"version": 3
|
| 301 |
+
},
|
| 302 |
+
"file_extension": ".py",
|
| 303 |
+
"mimetype": "text/x-python",
|
| 304 |
+
"name": "python",
|
| 305 |
+
"nbconvert_exporter": "python",
|
| 306 |
+
"pygments_lexer": "ipython3",
|
| 307 |
+
"version": "3.10.0"
|
| 308 |
+
}
|
| 309 |
+
},
|
| 310 |
+
"nbformat": 4,
|
| 311 |
+
"nbformat_minor": 4
|
| 312 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
torchaudio>=2.0.0
|
| 4 |
+
|
| 5 |
+
# VAD
|
| 6 |
+
silero-vad>=5.0.0
|
| 7 |
+
|
| 8 |
+
# Speaker Diarization
|
| 9 |
+
pyannote.audio>=3.1.0
|
| 10 |
+
pyannote.core>=5.0.0
|
| 11 |
+
pyannote.metrics>=3.2.0
|
| 12 |
+
|
| 13 |
+
# Audio processing
|
| 14 |
+
librosa>=0.10.0
|
| 15 |
+
soundfile>=0.12.0
|
| 16 |
+
numpy>=1.24.0
|
| 17 |
+
|
| 18 |
+
# Web interface
|
| 19 |
+
gradio>=4.0.0
|
| 20 |
+
|
| 21 |
+
# Visualization
|
| 22 |
+
matplotlib>=3.7.0
|
| 23 |
+
|
| 24 |
+
# Utilities
|
| 25 |
+
tqdm>=4.65.0
|
| 26 |
+
pyyaml>=6.0
|
| 27 |
+
|
| 28 |
+
# Testing
|
| 29 |
+
pytest>=7.0.0
|
| 30 |
+
pytest-cov>=4.0.0
|
| 31 |
+
|
| 32 |
+
# System utilities
|
| 33 |
+
psutil>=5.9.0
|
run_app.sh
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Quick launcher for Gradio app with HF token

echo "=========================================="
echo "VAD + Speaker Diarization - Gradio App"
echo "=========================================="

# Check if .env file exists
if [ -f ".env" ]; then
    echo "β Found .env file"
    # Load token from .env.
    # Anchor to '^HF_TOKEN=' so commented-out lines (e.g. '# HF_TOKEN=...')
    # or unrelated mentions of HF_TOKEN are not picked up, take only the
    # first match, and cut from field 2 onward so token values that happen
    # to contain '=' are not truncated.
    export HF_TOKEN=$(grep -m 1 '^HF_TOKEN=' .env | cut -d '=' -f2-)
fi

# Check if token is set
if [ -z "$HF_TOKEN" ]; then
    echo ""
    echo "β HF_TOKEN not set!"
    echo ""
    echo "Please set your Hugging Face token:"
    echo " export HF_TOKEN='your_token_here'"
    echo ""
    echo "Or create a .env file with:"
    echo " HF_TOKEN=your_token_here"
    echo ""
    echo "Get your token at: https://huggingface.co/settings/tokens"
    echo "Accept model at: https://huggingface.co/pyannote/speaker-diarization-3.1"
    echo ""
    exit 1
fi

echo "β HF_TOKEN is set"
echo ""
echo "Starting Gradio app..."
echo "Open browser to: http://localhost:7860"
echo ""
echo "Press Ctrl+C to stop"
echo "=========================================="
echo ""

# Run the app
python app.py
|
setup.sh
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Quick setup script for VAD + Speaker Diarization

set -e

echo "=========================================="
echo "VAD + Speaker Diarization Setup"
echo "=========================================="

# Check if conda environment is active
if [[ -n "$CONDA_DEFAULT_ENV" ]]; then
    # Use 'echo -e' so "\n" is rendered as a newline (plain echo would
    # print the two characters literally); matches the rest of the script.
    echo -e "\nβ Conda environment detected: $CONDA_DEFAULT_ENV"
    USE_CONDA=true
else
    echo -e "\nβ οΈ No conda environment detected"
    USE_CONDA=false
fi

# Check Python version
echo -e "\n[1/6] Checking Python version..."
python_version=$(python --version 2>&1 | awk '{print $2}')
echo "Found Python $python_version"

if ! python -c "import sys; assert sys.version_info >= (3, 10)" 2>/dev/null; then
    echo "β Error: Python 3.10+ required"
    exit 1
fi
echo "β Python version OK"

# Check CUDA (optional)
echo -e "\n[2/6] Checking CUDA..."
if command -v nvidia-smi &> /dev/null; then
    cuda_version=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}')
    echo "β CUDA $cuda_version detected"
    USE_CUDA=true
else
    echo "β οΈ No CUDA detected, will use CPU"
    USE_CUDA=false
fi

# Check FFmpeg
echo -e "\n[3/6] Checking FFmpeg..."
if command -v ffmpeg &> /dev/null; then
    echo "β FFmpeg installed"
else
    echo "β οΈ FFmpeg not found"
    echo "Install with: sudo apt-get install ffmpeg"
fi

# Setup environment
echo -e "\n[4/6] Setting up Python environment..."
if [ "$USE_CONDA" = true ]; then
    echo "β Using conda environment: $CONDA_DEFAULT_ENV"
else
    # Create virtual environment
    if [ ! -d "venv" ]; then
        python -m venv venv
        echo "β Virtual environment created"
    else
        echo "β Virtual environment already exists"
    fi
    # Activate virtual environment
    source venv/bin/activate
fi

# Install PyTorch
echo -e "\n[5/6] Installing PyTorch..."
if [ "$USE_CUDA" = true ]; then
    echo "Installing PyTorch with CUDA support..."
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
else
    echo "Installing PyTorch (CPU only)..."
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
fi

# Install dependencies
echo -e "\n[6/6] Installing dependencies..."
pip install -r requirements.txt

# Create directories
mkdir -p data outputs benchmarks

# Setup environment file
if [ ! -f ".env" ]; then
    cp .env.example .env
    echo "β Created .env file"
    echo "β οΈ Please edit .env and add your HF_TOKEN"
fi

echo -e "\n=========================================="
echo "β Setup complete!"
echo "=========================================="
echo -e "\nNext steps:"
if [ "$USE_CONDA" = true ]; then
    echo "1. Environment already active: $CONDA_DEFAULT_ENV β"
else
    echo "1. Activate environment: source venv/bin/activate"
fi
echo "2. Set HF token: export HF_TOKEN='your_token_here'"
echo "   Get token at: https://huggingface.co/settings/tokens"
echo "3. Accept model conditions at:"
echo "   https://huggingface.co/pyannote/speaker-diarization-3.1"
echo "4. Run demo: python vad_diarization.py"
echo "5. Run Gradio app: python app.py"
echo -e "\nFor more info, see README.md"
echo "=========================================="
|
src/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Real-Time VAD + Speaker Diarization System
|
| 3 |
+
Production-ready pipeline for voice activity detection and speaker identification
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .vad import SileroVAD
|
| 7 |
+
from .diarization import SpeakerDiarization
|
| 8 |
+
from .pipeline import VADDiarizationPipeline
|
| 9 |
+
from . import utils
|
| 10 |
+
|
| 11 |
+
__version__ = "1.0.0"
|
| 12 |
+
__all__ = [
|
| 13 |
+
'SileroVAD',
|
| 14 |
+
'SpeakerDiarization',
|
| 15 |
+
'VADDiarizationPipeline',
|
| 16 |
+
'utils'
|
| 17 |
+
]
|
src/diarization.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Pyannote Speaker Diarization Wrapper
|
| 4 |
+
Optimized for accuracy and performance
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
from typing import List, Dict, Optional, Tuple
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SpeakerDiarization:
    """
    Production-ready Pyannote speaker diarization wrapper.

    Features:
    - State-of-the-art speaker diarization
    - GPU acceleration support
    - Configurable parameters for accuracy/speed tradeoff
    - Overlap detection
    """

    def __init__(
        self,
        model_name: str = "pyannote/speaker-diarization-3.1",
        use_auth_token: Optional[str] = None,
        token: Optional[str] = None,
        device: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ):
        """
        Initialize speaker diarization pipeline.

        Args:
            model_name: Hugging Face model name
            use_auth_token: (Deprecated) Hugging Face authentication token
            token: Hugging Face authentication token (new parameter name)
            device: Device to use ('cuda' or 'cpu')
            num_speakers: Fixed number of speakers (if known)
            min_speakers: Minimum number of speakers
            max_speakers: Maximum number of speakers
        """
        self.model_name = model_name
        self.num_speakers = num_speakers
        self.min_speakers = min_speakers
        self.max_speakers = max_speakers

        # Handle both old and new parameter names; 'token' wins when both set.
        auth_token = token or use_auth_token

        # Pick GPU automatically when available unless caller pins a device.
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)

        # Load pipeline (downloads model weights on first use).
        self.pipeline = self._load_pipeline(auth_token)

        print(f"β Speaker diarization initialized on {self.device}")

    def _load_pipeline(self, auth_token: Optional[str]):
        """Load the Pyannote diarization pipeline and move it to self.device."""
        from pyannote.audio import Pipeline

        try:
            # Use 'token' parameter for pyannote.audio 4.0+
            pipeline = Pipeline.from_pretrained(
                self.model_name,
                token=auth_token
            )

            # Move to device
            pipeline.to(self.device)

            return pipeline
        except Exception as e:
            print(f"β Error loading pipeline: {e}")
            print("Make sure you have:")
            print("1. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1")
            print("2. Valid HF token from https://huggingface.co/settings/tokens")
            raise

    def process_file(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ) -> Tuple[List[Dict], float, Dict]:
        """
        Process an audio file and return speaker segments.

        Args:
            audio_path: Path to audio file
            num_speakers: Override number of speakers
            min_speakers: Override minimum speakers
            max_speakers: Override maximum speakers

        Returns:
            Tuple of (segments, processing_time_ms, metadata)
        """
        # Fall back to the instance defaults only when no explicit override
        # is given. 'is None' (rather than 'x or self.x') keeps an explicit
        # falsy argument from being silently replaced by the default.
        if num_speakers is None:
            num_speakers = self.num_speakers
        if min_speakers is None:
            min_speakers = self.min_speakers
        if max_speakers is None:
            max_speakers = self.max_speakers

        # Only pass parameters that are actually set.
        params = {}
        if num_speakers is not None:
            params['num_speakers'] = num_speakers
        if min_speakers is not None:
            params['min_speakers'] = min_speakers
        if max_speakers is not None:
            params['max_speakers'] = max_speakers

        # Process and time the call (wall-clock, reported in milliseconds).
        start_time = time.time()
        diarization = self.pipeline(audio_path, **params)
        processing_time = (time.time() - start_time) * 1000  # Convert to ms

        # Extract segments
        segments = []
        speakers = set()

        # Handle different output formats from pyannote.audio:
        # version 4.0+ returns a DiarizeOutput wrapper, earlier versions
        # return a pyannote.core.Annotation directly.
        if hasattr(diarization, 'speaker_diarization'):
            # pyannote.audio 4.0+ format - DiarizeOutput object
            annotation = diarization.speaker_diarization
        elif hasattr(diarization, 'itertracks'):
            # pyannote.audio 3.x format - Annotation object
            annotation = diarization
        else:
            raise ValueError(f"Unknown diarization output format: {type(diarization)}")

        # Extract segments from annotation
        for turn, _, speaker in annotation.itertracks(yield_label=True):
            segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker,
                'duration': turn.end - turn.start
            })
            speakers.add(speaker)

        # Metadata
        metadata = {
            'num_speakers': len(speakers),
            'total_speech_time': sum(seg['duration'] for seg in segments),
            'num_segments': len(segments)
        }

        return segments, processing_time, metadata

    def process_with_vad_segments(
        self,
        audio_path: str,
        vad_segments: List[Dict],
        **kwargs
    ) -> List[Dict]:
        """
        Process audio using VAD segments to optimize diarization.

        Args:
            audio_path: Path to audio file
            vad_segments: List of VAD segments with 'start' and 'end'
            **kwargs: Additional parameters for diarization

        Returns:
            List of speaker segments
        """
        # For now, process full file
        # TODO: Implement segment-wise processing for optimization
        segments, _, _ = self.process_file(audio_path, **kwargs)

        # Keep only speaker segments that overlap at least one VAD region.
        filtered_segments = []
        for seg in segments:
            for vad_seg in vad_segments:
                vad_start = vad_seg['start']
                vad_end = vad_seg['end']

                # Half-open interval overlap test.
                if seg['start'] < vad_end and seg['end'] > vad_start:
                    filtered_segments.append(seg)
                    break

        return filtered_segments

    def get_speaker_statistics(self, segments: List[Dict]) -> Dict:
        """
        Calculate speaker statistics from segments.

        Args:
            segments: List of speaker segments

        Returns:
            Dict with per-speaker statistics
            (total_time, num_segments, avg_segment_duration)
        """
        stats = {}

        for seg in segments:
            speaker = seg['speaker']
            if speaker not in stats:
                stats[speaker] = {
                    'total_time': 0.0,
                    'num_segments': 0,
                    'avg_segment_duration': 0.0
                }

            stats[speaker]['total_time'] += seg['duration']
            stats[speaker]['num_segments'] += 1

        # Calculate averages
        for speaker in stats:
            stats[speaker]['avg_segment_duration'] = (
                stats[speaker]['total_time'] / stats[speaker]['num_segments']
            )

        return stats

    def format_timeline(self, segments: List[Dict]) -> str:
        """
        Format segments as a readable timeline.

        Args:
            segments: List of speaker segments

        Returns:
            Formatted timeline string
        """
        lines = ["Speaker Timeline:", "=" * 50]

        for seg in segments:
            line = f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']} ({seg['duration']:.2f}s)"
            lines.append(line)

        return "\n".join(lines)

    def calculate_der(
        self,
        predicted_segments: List[Dict],
        reference_segments: List[Dict],
        collar: float = 0.25
    ) -> float:
        """
        Calculate Diarization Error Rate (DER).

        Args:
            predicted_segments: Predicted speaker segments
            reference_segments: Ground truth segments
            collar: Collar size in seconds for forgiveness

        Returns:
            DER value (0.0-1.0), or -1.0 when pyannote.metrics is missing
        """
        # This is a simplified DER calculation
        # For production, use pyannote.metrics
        try:
            from pyannote.metrics.diarization import DiarizationErrorRate
            from pyannote.core import Annotation, Segment

            # Convert to pyannote format
            reference = Annotation()
            for seg in reference_segments:
                reference[Segment(seg['start'], seg['end'])] = seg['speaker']

            hypothesis = Annotation()
            for seg in predicted_segments:
                hypothesis[Segment(seg['start'], seg['end'])] = seg['speaker']

            # Calculate DER
            metric = DiarizationErrorRate(collar=collar)
            der = metric(reference, hypothesis)

            return der
        except ImportError:
            print("β οΈ pyannote.metrics not available, skipping DER calculation")
            return -1.0
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def demo():
    """Interactive demo: load the diarization pipeline from the HF_TOKEN env var."""
    import os

    banner = "=" * 60
    print("\n" + banner)
    print("SPEAKER DIARIZATION DEMO")
    print(banner)

    print("\nβ οΈ This demo requires:")
    print("1. Hugging Face account")
    print("2. Accepted model conditions at:")
    print(" https://huggingface.co/pyannote/speaker-diarization-3.1")
    print("3. Valid HF token from:")
    print(" https://huggingface.co/settings/tokens")

    # Bail out early when no token is configured.
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        print("\nβ No HF_TOKEN found in environment")
        print("Set it with: export HF_TOKEN='your_token_here'")
        return

    try:
        # Best-effort load; any failure is reported but not re-raised.
        diarization = SpeakerDiarization(use_auth_token=hf_token)
        print("\nβ Diarization pipeline loaded successfully")
    except Exception as e:
        print(f"\nβ Failed to load pipeline: {e}")

    print("\n" + banner)
| 320 |
+
|
| 321 |
+
# Run the interactive demo when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
src/pipeline.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Integrated VAD + Speaker Diarization Pipeline
|
| 4 |
+
Real-time processing with optimized performance
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
from .vad import SileroVAD
|
| 15 |
+
from .diarization import SpeakerDiarization
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class VADDiarizationPipeline:
|
| 19 |
+
"""
|
| 20 |
+
Integrated pipeline combining VAD and speaker diarization.
|
| 21 |
+
|
| 22 |
+
Features:
|
| 23 |
+
- Two-stage processing: VAD first, then diarization
|
| 24 |
+
- Optimized for real-time performance
|
| 25 |
+
- Configurable parameters
|
| 26 |
+
- Comprehensive output format
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
def __init__(
|
| 30 |
+
self,
|
| 31 |
+
vad_threshold: float = 0.5,
|
| 32 |
+
use_auth_token: Optional[str] = None,
|
| 33 |
+
token: Optional[str] = None,
|
| 34 |
+
device: Optional[str] = None,
|
| 35 |
+
num_speakers: Optional[int] = None,
|
| 36 |
+
min_speakers: Optional[int] = None,
|
| 37 |
+
max_speakers: Optional[int] = None,
|
| 38 |
+
use_onnx_vad: bool = False
|
| 39 |
+
):
|
| 40 |
+
"""
|
| 41 |
+
Initialize the integrated pipeline.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
vad_threshold: VAD sensitivity threshold
|
| 45 |
+
use_auth_token: (Deprecated) Hugging Face token for diarization
|
| 46 |
+
token: Hugging Face token for diarization (new parameter name)
|
| 47 |
+
device: Device to use ('cuda' or 'cpu')
|
| 48 |
+
num_speakers: Fixed number of speakers
|
| 49 |
+
min_speakers: Minimum number of speakers
|
| 50 |
+
max_speakers: Maximum number of speakers
|
| 51 |
+
use_onnx_vad: Use ONNX for VAD (faster)
|
| 52 |
+
"""
|
| 53 |
+
print("\n" + "="*60)
|
| 54 |
+
print("INITIALIZING VAD + DIARIZATION PIPELINE")
|
| 55 |
+
print("="*60)
|
| 56 |
+
|
| 57 |
+
# Handle both old and new parameter names
|
| 58 |
+
auth_token = token or use_auth_token
|
| 59 |
+
|
| 60 |
+
# Initialize VAD
|
| 61 |
+
print("\n[1/2] Loading Voice Activity Detection...")
|
| 62 |
+
self.vad = SileroVAD(
|
| 63 |
+
threshold=vad_threshold,
|
| 64 |
+
use_onnx=use_onnx_vad
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Initialize Diarization
|
| 68 |
+
print("\n[2/2] Loading Speaker Diarization...")
|
| 69 |
+
self.diarization = SpeakerDiarization(
|
| 70 |
+
token=auth_token,
|
| 71 |
+
device=device,
|
| 72 |
+
num_speakers=num_speakers,
|
| 73 |
+
min_speakers=min_speakers,
|
| 74 |
+
max_speakers=max_speakers
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
print("\n" + "="*60)
|
| 78 |
+
print("β
PIPELINE READY")
|
| 79 |
+
print("="*60 + "\n")
|
| 80 |
+
|
| 81 |
+
def process_file(
|
| 82 |
+
self,
|
| 83 |
+
audio_path: str,
|
| 84 |
+
num_speakers: Optional[int] = None,
|
| 85 |
+
return_vad: bool = True,
|
| 86 |
+
return_stats: bool = True
|
| 87 |
+
) -> Dict:
|
| 88 |
+
"""
|
| 89 |
+
Process an audio file through the complete pipeline.
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
audio_path: Path to audio file
|
| 93 |
+
num_speakers: Number of speakers (if known)
|
| 94 |
+
return_vad: Include VAD segments in output
|
| 95 |
+
return_stats: Include statistics in output
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
Dict with results and metadata
|
| 99 |
+
"""
|
| 100 |
+
print(f"\nπ Processing: {audio_path}")
|
| 101 |
+
print("-" * 60)
|
| 102 |
+
|
| 103 |
+
total_start = time.time()
|
| 104 |
+
|
| 105 |
+
# Stage 1: VAD
|
| 106 |
+
print("Stage 1: Voice Activity Detection...")
|
| 107 |
+
vad_start = time.time()
|
| 108 |
+
vad_segments, vad_time = self.vad.process_file(audio_path)
|
| 109 |
+
vad_duration = (time.time() - vad_start) * 1000
|
| 110 |
+
|
| 111 |
+
print(f" β Found {len(vad_segments)} speech segments")
|
| 112 |
+
print(f" β Processing time: {vad_duration:.2f}ms")
|
| 113 |
+
|
| 114 |
+
# Stage 2: Diarization
|
| 115 |
+
print("\nStage 2: Speaker Diarization...")
|
| 116 |
+
diar_start = time.time()
|
| 117 |
+
speaker_segments, diar_time, diar_metadata = self.diarization.process_file(
|
| 118 |
+
audio_path,
|
| 119 |
+
num_speakers=num_speakers
|
| 120 |
+
)
|
| 121 |
+
diar_duration = (time.time() - diar_start) * 1000
|
| 122 |
+
|
| 123 |
+
print(f" β Identified {diar_metadata['num_speakers']} speakers")
|
| 124 |
+
print(f" β Found {diar_metadata['num_segments']} speaker segments")
|
| 125 |
+
print(f" β Processing time: {diar_duration:.2f}ms")
|
| 126 |
+
|
| 127 |
+
# Calculate total time
|
| 128 |
+
total_duration = (time.time() - total_start) * 1000
|
| 129 |
+
|
| 130 |
+
print(f"\nβ±οΈ Total processing time: {total_duration:.2f}ms")
|
| 131 |
+
print("-" * 60)
|
| 132 |
+
|
| 133 |
+
# Build result
|
| 134 |
+
result = {
|
| 135 |
+
'audio_path': audio_path,
|
| 136 |
+
'speaker_segments': speaker_segments,
|
| 137 |
+
'processing_time': {
|
| 138 |
+
'vad_ms': vad_duration,
|
| 139 |
+
'diarization_ms': diar_duration,
|
| 140 |
+
'total_ms': total_duration
|
| 141 |
+
},
|
| 142 |
+
'metadata': diar_metadata
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
if return_vad:
|
| 146 |
+
result['vad_segments'] = vad_segments
|
| 147 |
+
|
| 148 |
+
if return_stats:
|
| 149 |
+
result['speaker_statistics'] = self.diarization.get_speaker_statistics(
|
| 150 |
+
speaker_segments
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
return result
|
| 154 |
+
|
| 155 |
+
def process_batch(
|
| 156 |
+
self,
|
| 157 |
+
audio_paths: List[str],
|
| 158 |
+
**kwargs
|
| 159 |
+
) -> List[Dict]:
|
| 160 |
+
"""
|
| 161 |
+
Process multiple audio files.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
audio_paths: List of audio file paths
|
| 165 |
+
**kwargs: Additional arguments for process_file
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
List of results
|
| 169 |
+
"""
|
| 170 |
+
results = []
|
| 171 |
+
|
| 172 |
+
print(f"\nπ¦ Batch processing {len(audio_paths)} files...")
|
| 173 |
+
print("="*60)
|
| 174 |
+
|
| 175 |
+
for i, path in enumerate(audio_paths, 1):
|
| 176 |
+
print(f"\n[{i}/{len(audio_paths)}]")
|
| 177 |
+
result = self.process_file(path, **kwargs)
|
| 178 |
+
results.append(result)
|
| 179 |
+
|
| 180 |
+
print("\n" + "="*60)
|
| 181 |
+
print(f"β
Batch processing complete ({len(results)} files)")
|
| 182 |
+
print("="*60 + "\n")
|
| 183 |
+
|
| 184 |
+
return results
|
| 185 |
+
|
| 186 |
+
def format_output(self, result: Dict, format: str = 'text') -> str:
    """
    Render a pipeline result as text, JSON, or RTTM.

    Args:
        result: Result dict from process_file.
        format: Output format ('text', 'json', 'rttm'); any other
            value falls back to 'text'.

    Returns:
        Formatted string.
    """
    if format == 'json':
        return json.dumps(result, indent=2)

    if format == 'rttm':
        # RTTM row layout (NIST evaluation format):
        # SPEAKER file 1 start duration <NA> <NA> speaker <NA> <NA>
        file_id = Path(result['audio_path']).stem
        return "\n".join(
            f"SPEAKER {file_id} 1 {seg['start']:.3f} {seg['duration']:.3f} "
            f"<NA> <NA> {seg['speaker']} <NA> <NA>"
            for seg in result['speaker_segments']
        )

    # Default: human-readable text report.
    sep = "=" * 60
    out = [sep, "VAD + SPEAKER DIARIZATION RESULTS", sep]
    out.append(f"\nFile: {result['audio_path']}")

    # Run-level metadata.
    out.append(f"\nMetadata:")
    out.append(f" Speakers: {result['metadata']['num_speakers']}")
    out.append(f" Segments: {result['metadata']['num_segments']}")
    out.append(f" Total speech: {result['metadata']['total_speech_time']:.2f}s")

    # Per-stage wall-clock timings.
    out.append(f"\nProcessing Time:")
    out.append(f" VAD: {result['processing_time']['vad_ms']:.2f}ms")
    out.append(f" Diarization: {result['processing_time']['diarization_ms']:.2f}ms")
    out.append(f" Total: {result['processing_time']['total_ms']:.2f}ms")

    # Optional per-speaker statistics block.
    if 'speaker_statistics' in result:
        out.append(f"\nSpeaker Statistics:")
        for speaker, stats in result['speaker_statistics'].items():
            out.append(f" {speaker}:")
            out.append(f" Total time: {stats['total_time']:.2f}s")
            out.append(f" Segments: {stats['num_segments']}")
            out.append(f" Avg duration: {stats['avg_segment_duration']:.2f}s")

    # Chronological speaker timeline.
    out.append(f"\nSpeaker Timeline:")
    out.append("-" * 60)
    for seg in result['speaker_segments']:
        out.append(f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']}")

    out.append(sep)

    return "\n".join(out)
def save_results(
|
| 248 |
+
self,
|
| 249 |
+
result: Dict,
|
| 250 |
+
output_path: str,
|
| 251 |
+
format: str = 'json'
|
| 252 |
+
):
|
| 253 |
+
"""
|
| 254 |
+
Save results to file.
|
| 255 |
+
|
| 256 |
+
Args:
|
| 257 |
+
result: Result from process_file
|
| 258 |
+
output_path: Output file path
|
| 259 |
+
format: Output format ('json', 'rttm', 'text')
|
| 260 |
+
"""
|
| 261 |
+
output = self.format_output(result, format=format)
|
| 262 |
+
|
| 263 |
+
with open(output_path, 'w') as f:
|
| 264 |
+
f.write(output)
|
| 265 |
+
|
| 266 |
+
print(f"β Results saved to: {output_path}")
|
| 267 |
+
|
| 268 |
+
def benchmark(
|
| 269 |
+
self,
|
| 270 |
+
test_audio_path: Optional[str] = None,
|
| 271 |
+
duration_seconds: float = 10.0
|
| 272 |
+
) -> Dict:
|
| 273 |
+
"""
|
| 274 |
+
Benchmark pipeline performance.
|
| 275 |
+
|
| 276 |
+
Args:
|
| 277 |
+
test_audio_path: Path to test audio (optional)
|
| 278 |
+
duration_seconds: Duration for synthetic test
|
| 279 |
+
|
| 280 |
+
Returns:
|
| 281 |
+
Benchmark metrics
|
| 282 |
+
"""
|
| 283 |
+
print("\n" + "="*60)
|
| 284 |
+
print("PIPELINE BENCHMARK")
|
| 285 |
+
print("="*60)
|
| 286 |
+
|
| 287 |
+
# VAD benchmark
|
| 288 |
+
print("\n[1/2] Benchmarking VAD...")
|
| 289 |
+
vad_metrics = self.vad.benchmark_latency(duration_seconds)
|
| 290 |
+
print(f" Latency: {vad_metrics['latency_per_second_ms']:.2f}ms per second")
|
| 291 |
+
print(f" Real-time factor: {vad_metrics['real_time_factor']:.4f}x")
|
| 292 |
+
|
| 293 |
+
if vad_metrics['latency_per_second_ms'] < 100:
|
| 294 |
+
print(" β
VAD latency target achieved (<100ms)")
|
| 295 |
+
else:
|
| 296 |
+
print(" β οΈ VAD latency above target")
|
| 297 |
+
|
| 298 |
+
# Full pipeline benchmark (if test audio provided)
|
| 299 |
+
if test_audio_path:
|
| 300 |
+
print("\n[2/2] Benchmarking full pipeline...")
|
| 301 |
+
result = self.process_file(test_audio_path, return_stats=False)
|
| 302 |
+
|
| 303 |
+
print(f" Total time: {result['processing_time']['total_ms']:.2f}ms")
|
| 304 |
+
|
| 305 |
+
print("\n" + "="*60)
|
| 306 |
+
|
| 307 |
+
return {
|
| 308 |
+
'vad_metrics': vad_metrics,
|
| 309 |
+
'pipeline_metrics': result['processing_time'] if test_audio_path else None
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def demo():
    """Demo the integrated pipeline (VAD-only when no HF token is set)."""
    sep = "=" * 60
    print("\n" + sep)
    print("INTEGRATED PIPELINE DEMO")
    print(sep)

    import os

    # Diarization needs a Hugging Face token; without one we can only
    # exercise the VAD half of the pipeline.
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        print("\nβ οΈ No HF_TOKEN found in environment")
        print("Set it with: export HF_TOKEN='your_token_here'")
        print("\nFor now, will demo VAD only...")

        vad = SileroVAD()
        metrics = vad.benchmark_latency()
        print(f"\nβ VAD latency: {metrics['latency_per_second_ms']:.2f}ms per second")
        return

    try:
        pipeline = VADDiarizationPipeline(
            use_auth_token=hf_token,
            vad_threshold=0.5
        )
        pipeline.benchmark()
        print("\nβ Pipeline demo complete!")
    except Exception as e:
        print(f"\nβ Error: {e}")

    print("\n" + sep)
if __name__ == "__main__":
    # Run the interactive demo when executed as a script.
    demo()
src/utils.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Utility functions for VAD + Diarization pipeline
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
from typing import List, Dict, Optional, Tuple
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_audio(
    path: str,
    sampling_rate: int = 16000,
    mono: bool = True
) -> Tuple[np.ndarray, int]:
    """
    Load an audio file with automatic format detection.

    Tries librosa first; on any failure falls back to soundfile with
    optional librosa resampling and mono downmix.

    Args:
        path: Path to audio file.
        sampling_rate: Target sample rate.
        mono: Convert to mono.

    Returns:
        Tuple of (audio_data, sample_rate).
    """
    try:
        import librosa
        return librosa.load(path, sr=sampling_rate, mono=mono)
    except Exception as e:
        print(f"Error loading audio with librosa: {e}")

    # Fallback: soundfile read, then resample/downmix as required.
    try:
        import soundfile as sf
        audio, sr = sf.read(path)

        if sr != sampling_rate:
            import librosa
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
            sr = sampling_rate

        if mono and len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        return audio, sr
    except Exception as e:
        print(f"Error loading audio with soundfile: {e}")
        raise
def save_audio(
    audio: np.ndarray,
    path: str,
    sampling_rate: int = 16000
):
    """
    Write audio samples to a file.

    Args:
        audio: Audio data.
        path: Output path.
        sampling_rate: Sample rate.
    """
    import soundfile as sf

    sf.write(path, audio, sampling_rate)
def merge_segments(
    segments: List[Dict],
    gap_threshold: float = 0.5
) -> List[Dict]:
    """
    Merge nearby segments from the same speaker.

    Same-speaker segments that overlap or are separated by at most
    `gap_threshold` seconds are fused into a single segment.

    Args:
        segments: List of segments with 'start', 'end', 'speaker'.
        gap_threshold: Maximum gap to merge (seconds).

    Returns:
        New list of merged segments, sorted by start time; the input
        list and its dicts are not modified.
    """
    if not segments:
        return []

    sorted_segments = sorted(segments, key=lambda x: x['start'])

    merged = [sorted_segments[0].copy()]

    for seg in sorted_segments[1:]:
        last = merged[-1]

        if (seg['speaker'] == last['speaker'] and
                seg['start'] - last['end'] <= gap_threshold):
            # Extend the previous segment.  max() is essential: a segment
            # fully contained in the previous one must not shrink it (the
            # original unconditional `last['end'] = seg['end']` did).
            last['end'] = max(last['end'], seg['end'])
            last['duration'] = last['end'] - last['start']
        else:
            merged.append(seg.copy())

    return merged
def filter_short_segments(
    segments: List[Dict],
    min_duration: float = 0.5
) -> List[Dict]:
    """
    Drop segments shorter than a minimum duration.

    Args:
        segments: List of segments (each must carry a 'duration' key).
        min_duration: Minimum duration (seconds).

    Returns:
        New list containing only the segments that are long enough.
    """
    return list(filter(lambda seg: seg['duration'] >= min_duration, segments))
def calculate_overlap(
    seg1: Dict,
    seg2: Dict
) -> float:
    """
    Compute the temporal overlap between two segments.

    Args:
        seg1: First segment with 'start' and 'end'.
        seg2: Second segment with 'start' and 'end'.

    Returns:
        Overlap duration in seconds (0 for disjoint segments).
    """
    latest_start = seg1['start'] if seg1['start'] > seg2['start'] else seg2['start']
    earliest_end = seg1['end'] if seg1['end'] < seg2['end'] else seg2['end']

    overlap = earliest_end - latest_start
    return overlap if overlap > 0 else 0
def segment_to_rttm(
    segments: List[Dict],
    file_id: str = "audio"
) -> str:
    """
    Serialize segments to NIST RTTM format.

    Args:
        segments: Segments with 'start', 'duration', 'speaker'.
        file_id: Recording identifier placed in each row.

    Returns:
        RTTM formatted string (one SPEAKER row per segment).
    """
    # RTTM row layout: SPEAKER file 1 start duration <NA> <NA> speaker <NA> <NA>
    rows = (
        f"SPEAKER {file_id} 1 {seg['start']:.3f} {seg['duration']:.3f} "
        f"<NA> <NA> {seg['speaker']} <NA> <NA>"
        for seg in segments
    )
    return "\n".join(rows)
def rttm_to_segments(rttm_text: str) -> List[Dict]:
    """
    Parse RTTM text into segment dicts.

    Blank lines, non-SPEAKER lines and malformed rows (fewer than 8
    whitespace-separated fields) are skipped instead of raising.

    Args:
        rttm_text: RTTM formatted text.

    Returns:
        List of segments with 'start', 'end', 'duration', 'speaker'.
    """
    segments = []

    for line in rttm_text.strip().split('\n'):
        parts = line.split()
        # A valid SPEAKER row needs at least 8 fields; the original code
        # indexed parts[7] unconditionally and raised IndexError on
        # truncated rows.
        if len(parts) < 8 or parts[0] != 'SPEAKER':
            continue

        start = float(parts[3])
        duration = float(parts[4])

        segments.append({
            'start': start,
            'end': start + duration,
            'duration': duration,
            'speaker': parts[7]
        })

    return segments
def visualize_timeline(
    segments: List[Dict],
    duration: Optional[float] = None,
    width: int = 80
) -> str:
    """
    Create an ASCII visualization of the speaker timeline.

    Args:
        segments: Segments with 'start', 'end', 'speaker'.
        duration: Total duration (inferred from the latest 'end' if None).
        width: Width of the visualization in character columns.

    Returns:
        Multi-line ASCII timeline string.
    """
    if not segments:
        return "No segments to visualize"

    if duration is None:
        duration = max(seg['end'] for seg in segments)

    # Assign one marker character per speaker, cycling past 10 speakers.
    speakers = sorted(set(seg['speaker'] for seg in segments))
    chars = ['β', 'β', 'β', 'β', 'β', 'β', 'β ', 'β‘', 'βͺ', 'β«']
    speaker_chars = {sp: chars[i % len(chars)] for i, sp in enumerate(speakers)}

    lines = []
    lines.append(f"\nTimeline (0.00s - {duration:.2f}s):")
    lines.append("β" * width)

    # Time markers every 10 columns.  The ruler is built as a fixed-width
    # character array so a multi-character label cannot shift later
    # markers (the original appended labels and spaces sequentially,
    # pushing every subsequent marker to the right).
    ruler = [' '] * width
    for col in range(0, width, 10):
        label = f"{(col / width) * duration:.0f}s"
        for offset, ch in enumerate(label):
            if col + offset < width:
                ruler[col + offset] = ch
    lines.append(''.join(ruler))

    # One row per speaker, its marker drawn over each active span.
    for speaker in speakers:
        row = [' '] * width
        for seg in segments:
            if seg['speaker'] != speaker:
                continue
            start_pos = int((seg['start'] / duration) * width)
            end_pos = int((seg['end'] / duration) * width)
            for i in range(start_pos, min(end_pos, width)):
                row[i] = speaker_chars[speaker]
        lines.append(f"{speaker}: {''.join(row)}")

    lines.append("β" * width)

    return "\n".join(lines)
def export_results(
    result: Dict,
    output_dir: str,
    formats: Tuple[str, ...] = ('json', 'rttm', 'txt')
):
    """
    Export a pipeline result in one or more formats.

    Args:
        result: Pipeline result dict; must contain 'audio_path',
            'speaker_segments' and 'metadata'.
        output_dir: Output directory (created if missing).
        formats: Formats to export ('json', 'rttm', 'txt'); unknown
            entries are ignored.  Default is an immutable tuple — the
            original used a mutable list default (anti-pattern).
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    base_name = Path(result['audio_path']).stem

    for fmt in formats:
        if fmt == 'json':
            # Machine-readable dump of the full result dict.
            json_path = output_path / f"{base_name}.json"
            json_path.write_text(json.dumps(result, indent=2))
            print(f"β Saved JSON: {json_path}")

        elif fmt == 'rttm':
            # NIST RTTM format (uses the module-level segment_to_rttm).
            rttm_path = output_path / f"{base_name}.rttm"
            rttm_path.write_text(
                segment_to_rttm(result['speaker_segments'], base_name)
            )
            print(f"β Saved RTTM: {rttm_path}")

        elif fmt == 'txt':
            # Human-readable summary report.
            txt_path = output_path / f"{base_name}.txt"

            report = [
                "=" * 60,
                "SPEAKER DIARIZATION RESULTS",
                "=" * 60,
                f"\nFile: {result['audio_path']}",
                f"Speakers: {result['metadata']['num_speakers']}",
                f"Segments: {result['metadata']['num_segments']}",
                "\nTimeline:",
                "-" * 60,
            ]
            report.extend(
                f"{seg['start']:7.2f}s - {seg['end']:7.2f}s: {seg['speaker']}"
                for seg in result['speaker_segments']
            )

            txt_path.write_text("\n".join(report))
            print(f"β Saved TXT: {txt_path}")
def create_test_audio(
    output_path: str = "test_audio.wav",
    duration: float = 10.0,
    sampling_rate: int = 16000
) -> str:
    """
    Write a synthetic WAV with alternating tone/silence "speakers".

    Layout (each region clipped when `duration` < 10s): 440 Hz tone
    0-3s, silence 3-4s, 880 Hz tone 4-7s, silence 7-8s, 440 Hz tone
    from 8s to the end, plus low-level white noise throughout.

    Args:
        output_path: Output file path.
        duration: Duration in seconds.
        sampling_rate: Sample rate.

    Returns:
        Path to the created file.
    """
    import soundfile as sf

    t = np.linspace(0, duration, int(sampling_rate * duration))
    signal = np.zeros_like(t)
    n = len(signal)

    # "Speaker 1": 440 Hz from 0s up to 3s (or the end of the clip).
    a_end = min(int(sampling_rate * 3), n)
    if a_end > 0:
        signal[0:a_end] = 0.3 * np.sin(2 * np.pi * 440 * t[0:a_end])

    # Silence 3-4s, then "Speaker 2": 880 Hz from 4s up to 7s.
    b_start = int(sampling_rate * 4)
    b_end = min(int(sampling_rate * 7), n)
    if b_start < n and b_end > b_start:
        signal[b_start:b_end] = 0.3 * np.sin(2 * np.pi * 880 * t[b_start:b_end])

    # Silence 7-8s, then "Speaker 1" again from 8s to the end.
    c_start = min(int(sampling_rate * 8), n)
    if c_start < n:
        signal[c_start:] = 0.3 * np.sin(2 * np.pi * 440 * t[c_start:])

    # Low-level white noise so the clip has a realistic floor.
    signal += 0.01 * np.random.randn(n)

    sf.write(output_path, signal, sampling_rate)

    return output_path
if __name__ == "__main__":
    # Smoke test: confirm the module imports and can write a synthetic
    # test clip into the current working directory.
    print("Utility functions loaded")

    test_path = create_test_audio()
    print(f"β Created test audio: {test_path}")
|
src/vad.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Silero VAD Wrapper for Real-Time Voice Activity Detection
|
| 4 |
+
Optimized for <100ms latency with streaming support
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
from typing import List, Dict, Optional, Tuple
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SileroVAD:
|
| 15 |
+
"""
|
| 16 |
+
Production-ready Silero VAD wrapper with streaming support.
|
| 17 |
+
|
| 18 |
+
Features:
|
| 19 |
+
- Real-time processing with <100ms latency
|
| 20 |
+
- Configurable sensitivity thresholds
|
| 21 |
+
- Streaming audio buffer management
|
| 22 |
+
- ONNX runtime support for optimization
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(
    self,
    threshold: float = 0.5,
    sampling_rate: int = 16000,
    min_speech_duration_ms: int = 250,
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 1536,
    use_onnx: bool = False
):
    """
    Initialize Silero VAD.

    Args:
        threshold: Speech probability threshold (0.0-1.0).
        sampling_rate: Audio sample rate (8000 or 16000).
        min_speech_duration_ms: Minimum speech segment duration.
        min_silence_duration_ms: Minimum silence duration between segments.
        window_size_samples: VAD window size (512, 1024, or 1536).
        use_onnx: Use ONNX runtime for faster inference.
    """
    # Detection configuration.
    self.threshold = threshold
    self.sampling_rate = sampling_rate
    self.min_speech_duration_ms = min_speech_duration_ms
    self.min_silence_duration_ms = min_silence_duration_ms
    self.window_size_samples = window_size_samples
    self.use_onnx = use_onnx

    # Load the model once up front.
    self.model = self._load_model()

    # Clear any streaming state carried by the model.
    self.reset_states()

    print(f"β Silero VAD initialized (threshold={threshold}, sr={sampling_rate}Hz)")
def _load_model(self):
    """Load the Silero VAD model: pip package first, torch.hub fallback."""
    try:
        from silero_vad import load_silero_vad
        return load_silero_vad(onnx=self.use_onnx)
    except ImportError:
        # silero_vad package not installed: fetch the model via torch hub.
        model, _utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad',
            force_reload=False,
            onnx=self.use_onnx
        )
        return model
def reset_states(self):
    """Reset internal states for streaming.

    Delegates to the underlying model's reset_states(); call this
    before processing a new, unrelated audio stream.
    """
    self.model.reset_states()
def process_chunk(self, audio_chunk: np.ndarray) -> float:
|
| 82 |
+
"""
|
| 83 |
+
Process a single audio chunk and return speech probability.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
audio_chunk: Audio data (numpy array, float32, mono)
|
| 87 |
+
|
| 88 |
+
Returns:
|
| 89 |
+
Speech probability (0.0-1.0)
|
| 90 |
+
"""
|
| 91 |
+
# Convert to torch tensor
|
| 92 |
+
if isinstance(audio_chunk, np.ndarray):
|
| 93 |
+
audio_tensor = torch.from_numpy(audio_chunk).float()
|
| 94 |
+
else:
|
| 95 |
+
audio_tensor = audio_chunk
|
| 96 |
+
|
| 97 |
+
# Get speech probability
|
| 98 |
+
with torch.no_grad():
|
| 99 |
+
speech_prob = self.model(audio_tensor, self.sampling_rate).item()
|
| 100 |
+
|
| 101 |
+
return speech_prob
|
| 102 |
+
|
| 103 |
+
def get_speech_timestamps(
|
| 104 |
+
self,
|
| 105 |
+
audio: np.ndarray,
|
| 106 |
+
return_seconds: bool = False
|
| 107 |
+
) -> List[Dict[str, float]]:
|
| 108 |
+
"""
|
| 109 |
+
Get speech timestamps from audio.
|
| 110 |
+
|
| 111 |
+
Args:
|
| 112 |
+
audio: Audio data (numpy array, float32, mono)
|
| 113 |
+
return_seconds: Return timestamps in seconds instead of samples
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
List of dicts with 'start' and 'end' keys
|
| 117 |
+
"""
|
| 118 |
+
try:
|
| 119 |
+
from silero_vad import get_speech_timestamps
|
| 120 |
+
|
| 121 |
+
# Convert to torch tensor
|
| 122 |
+
if isinstance(audio, np.ndarray):
|
| 123 |
+
audio_tensor = torch.from_numpy(audio).float()
|
| 124 |
+
else:
|
| 125 |
+
audio_tensor = audio
|
| 126 |
+
|
| 127 |
+
# Get timestamps
|
| 128 |
+
timestamps = get_speech_timestamps(
|
| 129 |
+
audio_tensor,
|
| 130 |
+
self.model,
|
| 131 |
+
threshold=self.threshold,
|
| 132 |
+
sampling_rate=self.sampling_rate,
|
| 133 |
+
min_speech_duration_ms=self.min_speech_duration_ms,
|
| 134 |
+
min_silence_duration_ms=self.min_silence_duration_ms,
|
| 135 |
+
window_size_samples=self.window_size_samples,
|
| 136 |
+
return_seconds=return_seconds
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
return timestamps
|
| 140 |
+
except ImportError:
|
| 141 |
+
# Fallback: manual implementation
|
| 142 |
+
return self._get_speech_timestamps_manual(audio, return_seconds)
|
| 143 |
+
|
| 144 |
+
def _get_speech_timestamps_manual(
|
| 145 |
+
self,
|
| 146 |
+
audio: np.ndarray,
|
| 147 |
+
return_seconds: bool = False
|
| 148 |
+
) -> List[Dict[str, float]]:
|
| 149 |
+
"""Manual implementation of speech timestamp detection."""
|
| 150 |
+
if isinstance(audio, np.ndarray):
|
| 151 |
+
audio_tensor = torch.from_numpy(audio).float()
|
| 152 |
+
else:
|
| 153 |
+
audio_tensor = audio
|
| 154 |
+
|
| 155 |
+
# Process in windows
|
| 156 |
+
window_size = self.window_size_samples
|
| 157 |
+
speech_probs = []
|
| 158 |
+
|
| 159 |
+
self.reset_states()
|
| 160 |
+
|
| 161 |
+
for i in range(0, len(audio_tensor), window_size):
|
| 162 |
+
chunk = audio_tensor[i:i + window_size]
|
| 163 |
+
if len(chunk) < window_size:
|
| 164 |
+
# Pad last chunk
|
| 165 |
+
chunk = torch.nn.functional.pad(chunk, (0, window_size - len(chunk)))
|
| 166 |
+
|
| 167 |
+
prob = self.process_chunk(chunk)
|
| 168 |
+
speech_probs.append(prob)
|
| 169 |
+
|
| 170 |
+
# Find speech segments
|
| 171 |
+
timestamps = []
|
| 172 |
+
in_speech = False
|
| 173 |
+
speech_start = 0
|
| 174 |
+
|
| 175 |
+
for i, prob in enumerate(speech_probs):
|
| 176 |
+
sample_idx = i * window_size
|
| 177 |
+
|
| 178 |
+
if prob >= self.threshold and not in_speech:
|
| 179 |
+
# Speech start
|
| 180 |
+
in_speech = True
|
| 181 |
+
speech_start = sample_idx
|
| 182 |
+
elif prob < self.threshold and in_speech:
|
| 183 |
+
# Speech end
|
| 184 |
+
in_speech = False
|
| 185 |
+
speech_end = sample_idx
|
| 186 |
+
|
| 187 |
+
# Check minimum duration
|
| 188 |
+
duration_ms = (speech_end - speech_start) / self.sampling_rate * 1000
|
| 189 |
+
if duration_ms >= self.min_speech_duration_ms:
|
| 190 |
+
if return_seconds:
|
| 191 |
+
timestamps.append({
|
| 192 |
+
'start': speech_start / self.sampling_rate,
|
| 193 |
+
'end': speech_end / self.sampling_rate
|
| 194 |
+
})
|
| 195 |
+
else:
|
| 196 |
+
timestamps.append({
|
| 197 |
+
'start': speech_start,
|
| 198 |
+
'end': speech_end
|
| 199 |
+
})
|
| 200 |
+
|
| 201 |
+
# Handle case where speech continues to end
|
| 202 |
+
if in_speech:
|
| 203 |
+
speech_end = len(audio_tensor)
|
| 204 |
+
if return_seconds:
|
| 205 |
+
timestamps.append({
|
| 206 |
+
'start': speech_start / self.sampling_rate,
|
| 207 |
+
'end': speech_end / self.sampling_rate
|
| 208 |
+
})
|
| 209 |
+
else:
|
| 210 |
+
timestamps.append({
|
| 211 |
+
'start': speech_start,
|
| 212 |
+
'end': speech_end
|
| 213 |
+
})
|
| 214 |
+
|
| 215 |
+
return timestamps
|
| 216 |
+
|
| 217 |
+
def process_file(self, audio_path: str) -> Tuple[List[Dict], float]:
|
| 218 |
+
"""
|
| 219 |
+
Process an audio file and return speech segments with latency.
|
| 220 |
+
|
| 221 |
+
Args:
|
| 222 |
+
audio_path: Path to audio file
|
| 223 |
+
|
| 224 |
+
Returns:
|
| 225 |
+
Tuple of (timestamps, processing_time_ms)
|
| 226 |
+
"""
|
| 227 |
+
# Load audio
|
| 228 |
+
audio = self.read_audio(audio_path)
|
| 229 |
+
|
| 230 |
+
# Measure processing time
|
| 231 |
+
start_time = time.time()
|
| 232 |
+
timestamps = self.get_speech_timestamps(audio, return_seconds=True)
|
| 233 |
+
processing_time = (time.time() - start_time) * 1000 # Convert to ms
|
| 234 |
+
|
| 235 |
+
return timestamps, processing_time
|
| 236 |
+
|
| 237 |
+
@staticmethod
|
| 238 |
+
def read_audio(path: str, sampling_rate: int = 16000) -> torch.Tensor:
|
| 239 |
+
"""
|
| 240 |
+
Read audio file and convert to required format.
|
| 241 |
+
|
| 242 |
+
Args:
|
| 243 |
+
path: Path to audio file
|
| 244 |
+
sampling_rate: Target sample rate
|
| 245 |
+
|
| 246 |
+
Returns:
|
| 247 |
+
Audio tensor (mono, float32)
|
| 248 |
+
"""
|
| 249 |
+
try:
|
| 250 |
+
from silero_vad import read_audio
|
| 251 |
+
return read_audio(path, sampling_rate=sampling_rate)
|
| 252 |
+
except ImportError:
|
| 253 |
+
# Fallback: use librosa
|
| 254 |
+
import librosa
|
| 255 |
+
audio, sr = librosa.load(path, sr=sampling_rate, mono=True)
|
| 256 |
+
return torch.from_numpy(audio).float()
|
| 257 |
+
|
| 258 |
+
def benchmark_latency(self, duration_seconds: float = 10.0) -> Dict[str, float]:
|
| 259 |
+
"""
|
| 260 |
+
Benchmark VAD latency on synthetic audio.
|
| 261 |
+
|
| 262 |
+
Args:
|
| 263 |
+
duration_seconds: Duration of test audio
|
| 264 |
+
|
| 265 |
+
Returns:
|
| 266 |
+
Dict with latency metrics
|
| 267 |
+
"""
|
| 268 |
+
# Generate test audio
|
| 269 |
+
num_samples = int(duration_seconds * self.sampling_rate)
|
| 270 |
+
test_audio = torch.randn(num_samples)
|
| 271 |
+
|
| 272 |
+
# Warm-up
|
| 273 |
+
self.reset_states()
|
| 274 |
+
_ = self.get_speech_timestamps(test_audio.numpy())
|
| 275 |
+
|
| 276 |
+
# Benchmark
|
| 277 |
+
self.reset_states()
|
| 278 |
+
start_time = time.time()
|
| 279 |
+
timestamps = self.get_speech_timestamps(test_audio.numpy())
|
| 280 |
+
end_time = time.time()
|
| 281 |
+
|
| 282 |
+
processing_time_ms = (end_time - start_time) * 1000
|
| 283 |
+
latency_per_second = processing_time_ms / duration_seconds
|
| 284 |
+
|
| 285 |
+
return {
|
| 286 |
+
'total_processing_time_ms': processing_time_ms,
|
| 287 |
+
'audio_duration_s': duration_seconds,
|
| 288 |
+
'latency_per_second_ms': latency_per_second,
|
| 289 |
+
'real_time_factor': processing_time_ms / (duration_seconds * 1000),
|
| 290 |
+
'num_segments': len(timestamps)
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def demo():
|
| 295 |
+
"""Demo VAD functionality."""
|
| 296 |
+
print("\n" + "="*60)
|
| 297 |
+
print("SILERO VAD DEMO")
|
| 298 |
+
print("="*60)
|
| 299 |
+
|
| 300 |
+
# Initialize VAD
|
| 301 |
+
vad = SileroVAD(threshold=0.5)
|
| 302 |
+
|
| 303 |
+
# Benchmark latency
|
| 304 |
+
print("\nπ Benchmarking latency...")
|
| 305 |
+
metrics = vad.benchmark_latency(duration_seconds=10.0)
|
| 306 |
+
print(f" Total processing time: {metrics['total_processing_time_ms']:.2f}ms")
|
| 307 |
+
print(f" Audio duration: {metrics['audio_duration_s']:.1f}s")
|
| 308 |
+
print(f" Latency per second: {metrics['latency_per_second_ms']:.2f}ms")
|
| 309 |
+
print(f" Real-time factor: {metrics['real_time_factor']:.4f}x")
|
| 310 |
+
|
| 311 |
+
if metrics['latency_per_second_ms'] < 100:
|
| 312 |
+
print(" β
Target latency achieved (<100ms)")
|
| 313 |
+
else:
|
| 314 |
+
print(" β οΈ Latency above target (>100ms)")
|
| 315 |
+
|
| 316 |
+
print("\n" + "="*60)
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
if __name__ == "__main__":
|
| 320 |
+
demo()
|
tests/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test suite for VAD + Speaker Diarization system
|
| 3 |
+
"""
|
tests/test_pipeline.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unit tests for integrated pipeline
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
import numpy as np
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import sys
|
| 10 |
+
import tempfile
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
|
| 13 |
+
# Add src to path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 15 |
+
|
| 16 |
+
from src.pipeline import VADDiarizationPipeline
|
| 17 |
+
from src.vad import SileroVAD
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class TestPipeline:
|
| 21 |
+
"""Test cases for integrated pipeline."""
|
| 22 |
+
|
| 23 |
+
@pytest.fixture
|
| 24 |
+
def test_audio_file(self):
|
| 25 |
+
"""Create a temporary test audio file."""
|
| 26 |
+
# Generate test audio
|
| 27 |
+
sr = 16000
|
| 28 |
+
duration = 5
|
| 29 |
+
audio = 0.1 * np.random.randn(sr * duration).astype(np.float32)
|
| 30 |
+
|
| 31 |
+
# Save to temp file
|
| 32 |
+
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
|
| 33 |
+
sf.write(f.name, audio, sr)
|
| 34 |
+
yield f.name
|
| 35 |
+
|
| 36 |
+
# Cleanup
|
| 37 |
+
Path(f.name).unlink(missing_ok=True)
|
| 38 |
+
|
| 39 |
+
def test_vad_only(self, test_audio_file):
|
| 40 |
+
"""Test VAD-only processing (no HF token needed)."""
|
| 41 |
+
vad = SileroVAD()
|
| 42 |
+
|
| 43 |
+
# Process file
|
| 44 |
+
timestamps, processing_time = vad.process_file(test_audio_file)
|
| 45 |
+
|
| 46 |
+
# Verify
|
| 47 |
+
assert isinstance(timestamps, list)
|
| 48 |
+
assert isinstance(processing_time, float)
|
| 49 |
+
assert processing_time > 0
|
| 50 |
+
|
| 51 |
+
def test_format_output_text(self):
|
| 52 |
+
"""Test text output formatting."""
|
| 53 |
+
# Mock result
|
| 54 |
+
result = {
|
| 55 |
+
'audio_path': 'test.wav',
|
| 56 |
+
'speaker_segments': [
|
| 57 |
+
{'start': 0.0, 'end': 2.0, 'speaker': 'SPEAKER_00', 'duration': 2.0},
|
| 58 |
+
{'start': 3.0, 'end': 5.0, 'speaker': 'SPEAKER_01', 'duration': 2.0}
|
| 59 |
+
],
|
| 60 |
+
'metadata': {
|
| 61 |
+
'num_speakers': 2,
|
| 62 |
+
'num_segments': 2,
|
| 63 |
+
'total_speech_time': 4.0
|
| 64 |
+
},
|
| 65 |
+
'processing_time': {
|
| 66 |
+
'vad_ms': 50.0,
|
| 67 |
+
'diarization_ms': 1000.0,
|
| 68 |
+
'total_ms': 1050.0
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
# Test with VAD only (no full pipeline needed)
|
| 73 |
+
from src.pipeline import VADDiarizationPipeline
|
| 74 |
+
|
| 75 |
+
# Format output (doesn't require initialized pipeline)
|
| 76 |
+
output = format_result_text(result)
|
| 77 |
+
|
| 78 |
+
assert 'test.wav' in output
|
| 79 |
+
assert 'SPEAKER_00' in output
|
| 80 |
+
assert 'SPEAKER_01' in output
|
| 81 |
+
|
| 82 |
+
def test_vad_latency_target(self):
|
| 83 |
+
"""Test that VAD meets latency target."""
|
| 84 |
+
vad = SileroVAD()
|
| 85 |
+
|
| 86 |
+
# Benchmark
|
| 87 |
+
metrics = vad.benchmark_latency(duration_seconds=10.0)
|
| 88 |
+
|
| 89 |
+
# Check latency target (<100ms per second)
|
| 90 |
+
assert metrics['latency_per_second_ms'] < 100, \
|
| 91 |
+
f"VAD latency {metrics['latency_per_second_ms']:.2f}ms exceeds 100ms target"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def format_result_text(result):
|
| 95 |
+
"""Helper function to format results as text."""
|
| 96 |
+
lines = []
|
| 97 |
+
lines.append(f"File: {result['audio_path']}")
|
| 98 |
+
lines.append(f"Speakers: {result['metadata']['num_speakers']}")
|
| 99 |
+
lines.append(f"Segments: {result['metadata']['num_segments']}")
|
| 100 |
+
|
| 101 |
+
for seg in result['speaker_segments']:
|
| 102 |
+
lines.append(f"{seg['start']:.2f}s - {seg['end']:.2f}s: {seg['speaker']}")
|
| 103 |
+
|
| 104 |
+
return "\n".join(lines)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
|
| 108 |
+
pytest.main([__file__, "-v"])
|
tests/test_vad.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unit tests for VAD module
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
# Add src to path
|
| 13 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 14 |
+
|
| 15 |
+
from src.vad import SileroVAD
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TestSileroVAD:
|
| 19 |
+
"""Test cases for Silero VAD."""
|
| 20 |
+
|
| 21 |
+
@pytest.fixture
|
| 22 |
+
def vad(self):
|
| 23 |
+
"""Create VAD instance for testing."""
|
| 24 |
+
return SileroVAD(threshold=0.5)
|
| 25 |
+
|
| 26 |
+
def test_initialization(self, vad):
|
| 27 |
+
"""Test VAD initialization."""
|
| 28 |
+
assert vad is not None
|
| 29 |
+
assert vad.threshold == 0.5
|
| 30 |
+
assert vad.sampling_rate == 16000
|
| 31 |
+
assert vad.model is not None
|
| 32 |
+
|
| 33 |
+
def test_process_chunk(self, vad):
|
| 34 |
+
"""Test processing a single audio chunk."""
|
| 35 |
+
# Create test audio
|
| 36 |
+
chunk = np.random.randn(1536).astype(np.float32)
|
| 37 |
+
|
| 38 |
+
# Process
|
| 39 |
+
prob = vad.process_chunk(chunk)
|
| 40 |
+
|
| 41 |
+
# Verify
|
| 42 |
+
assert isinstance(prob, float)
|
| 43 |
+
assert 0.0 <= prob <= 1.0
|
| 44 |
+
|
| 45 |
+
def test_get_speech_timestamps(self, vad):
|
| 46 |
+
"""Test getting speech timestamps."""
|
| 47 |
+
# Create test audio with speech-like pattern
|
| 48 |
+
sr = 16000
|
| 49 |
+
duration = 5
|
| 50 |
+
audio = np.zeros(sr * duration, dtype=np.float32)
|
| 51 |
+
|
| 52 |
+
# Add "speech" in middle (higher energy)
|
| 53 |
+
audio[sr:sr*3] = 0.5 * np.random.randn(sr * 2)
|
| 54 |
+
|
| 55 |
+
# Get timestamps
|
| 56 |
+
timestamps = vad.get_speech_timestamps(audio, return_seconds=True)
|
| 57 |
+
|
| 58 |
+
# Verify
|
| 59 |
+
assert isinstance(timestamps, list)
|
| 60 |
+
for ts in timestamps:
|
| 61 |
+
assert 'start' in ts
|
| 62 |
+
assert 'end' in ts
|
| 63 |
+
assert ts['end'] > ts['start']
|
| 64 |
+
|
| 65 |
+
def test_reset_states(self, vad):
|
| 66 |
+
"""Test state reset."""
|
| 67 |
+
# Process some audio
|
| 68 |
+
chunk = np.random.randn(1536).astype(np.float32)
|
| 69 |
+
vad.process_chunk(chunk)
|
| 70 |
+
|
| 71 |
+
# Reset
|
| 72 |
+
vad.reset_states()
|
| 73 |
+
|
| 74 |
+
# Should work without error
|
| 75 |
+
prob = vad.process_chunk(chunk)
|
| 76 |
+
assert isinstance(prob, float)
|
| 77 |
+
|
| 78 |
+
def test_benchmark_latency(self, vad):
|
| 79 |
+
"""Test latency benchmarking."""
|
| 80 |
+
metrics = vad.benchmark_latency(duration_seconds=1.0)
|
| 81 |
+
|
| 82 |
+
# Verify metrics
|
| 83 |
+
assert 'total_processing_time_ms' in metrics
|
| 84 |
+
assert 'audio_duration_s' in metrics
|
| 85 |
+
assert 'latency_per_second_ms' in metrics
|
| 86 |
+
assert 'real_time_factor' in metrics
|
| 87 |
+
|
| 88 |
+
# Check latency target
|
| 89 |
+
assert metrics['latency_per_second_ms'] < 1000 # Should be much faster
|
| 90 |
+
|
| 91 |
+
def test_different_thresholds(self):
|
| 92 |
+
"""Test VAD with different thresholds."""
|
| 93 |
+
thresholds = [0.3, 0.5, 0.7]
|
| 94 |
+
|
| 95 |
+
for threshold in thresholds:
|
| 96 |
+
vad = SileroVAD(threshold=threshold)
|
| 97 |
+
assert vad.threshold == threshold
|
| 98 |
+
|
| 99 |
+
# Test processing
|
| 100 |
+
audio = np.random.randn(16000).astype(np.float32)
|
| 101 |
+
timestamps = vad.get_speech_timestamps(audio)
|
| 102 |
+
assert isinstance(timestamps, list)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def test_vad_import():
|
| 106 |
+
"""Test that VAD can be imported."""
|
| 107 |
+
from src.vad import SileroVAD
|
| 108 |
+
assert SileroVAD is not None
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
|
| 112 |
+
pytest.main([__file__, "-v"])
|
vad_diarization.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Voice Activity Detection + Speaker Diarization
|
| 4 |
+
Simple demo script using the modular pipeline
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import librosa
|
| 9 |
+
import numpy as np
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
# Import from modular components
|
| 15 |
+
from src.vad import SileroVAD
|
| 16 |
+
from src.diarization import SpeakerDiarization
|
| 17 |
+
from src.pipeline import VADDiarizationPipeline
|
| 18 |
+
from src.utils import create_test_audio
|
| 19 |
+
|
| 20 |
+
def setup_vad():
|
| 21 |
+
"""Setup Silero VAD using modular wrapper"""
|
| 22 |
+
print("Setting up Voice Activity Detection...")
|
| 23 |
+
|
| 24 |
+
vad = SileroVAD(threshold=0.5)
|
| 25 |
+
print("β Silero VAD loaded (40 MB)")
|
| 26 |
+
|
| 27 |
+
return vad
|
| 28 |
+
|
| 29 |
+
def setup_diarization():
|
| 30 |
+
"""Setup Speaker Diarization using modular wrapper"""
|
| 31 |
+
print("Setting up Speaker Diarization...")
|
| 32 |
+
print("β οΈ First download requires 1GB+ bandwidth (one-time)")
|
| 33 |
+
|
| 34 |
+
# Get token from environment or use provided one
|
| 35 |
+
token = os.environ.get('HF_TOKEN', 'your_token_here')
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
diarization = SpeakerDiarization(
|
| 39 |
+
model_name="pyannote/speaker-diarization-3.1",
|
| 40 |
+
use_auth_token=token
|
| 41 |
+
)
|
| 42 |
+
print("β Diarization pipeline loaded")
|
| 43 |
+
return diarization
|
| 44 |
+
except Exception as e:
|
| 45 |
+
print(f"β Error: {e}")
|
| 46 |
+
print("Get your HF token: https://huggingface.co/settings/tokens")
|
| 47 |
+
print("Or set it: export HF_TOKEN='your_token_here'")
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
def demo_vad(audio_path, vad_model):
|
| 51 |
+
"""Demo VAD on an audio file using modular wrapper"""
|
| 52 |
+
print(f"\nVAD Analysis: {audio_path}")
|
| 53 |
+
|
| 54 |
+
timestamps, processing_time = vad_model.process_file(audio_path)
|
| 55 |
+
|
| 56 |
+
print(f"Found {len(timestamps)} speech segments:")
|
| 57 |
+
print(f"Processing time: {processing_time:.2f}ms")
|
| 58 |
+
|
| 59 |
+
for i, ts in enumerate(timestamps, 1):
|
| 60 |
+
start_s = ts['start']
|
| 61 |
+
end_s = ts['end']
|
| 62 |
+
duration_s = end_s - start_s
|
| 63 |
+
print(f" Segment {i}: {start_s:6.2f}s - {end_s:6.2f}s ({duration_s:6.2f}s)")
|
| 64 |
+
|
| 65 |
+
return timestamps
|
| 66 |
+
|
| 67 |
+
def demo_diarization(audio_path, diar_pipeline):
|
| 68 |
+
"""Demo Diarization on an audio file using modular wrapper"""
|
| 69 |
+
print(f"\nDiarization Analysis: {audio_path}")
|
| 70 |
+
|
| 71 |
+
segments, processing_time, metadata = diar_pipeline.process_file(audio_path)
|
| 72 |
+
|
| 73 |
+
print(f"Found {metadata['num_speakers']} speakers")
|
| 74 |
+
print(f"Processing time: {processing_time:.2f}ms")
|
| 75 |
+
print("\nSpeaker timeline:")
|
| 76 |
+
for seg in segments:
|
| 77 |
+
print(f" {seg['start']:6.2f}s - {seg['end']:6.2f}s: {seg['speaker']}")
|
| 78 |
+
|
| 79 |
+
def demo_full_pipeline(audio_path):
|
| 80 |
+
"""Demo the full integrated pipeline"""
|
| 81 |
+
print(f"\n{'='*60}")
|
| 82 |
+
print("FULL PIPELINE DEMO")
|
| 83 |
+
print(f"{'='*60}")
|
| 84 |
+
|
| 85 |
+
token = os.environ.get('HF_TOKEN')
|
| 86 |
+
if not token:
|
| 87 |
+
print("\nβ οΈ No HF_TOKEN found. Running VAD only...")
|
| 88 |
+
vad = SileroVAD()
|
| 89 |
+
demo_vad(audio_path, vad)
|
| 90 |
+
return
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
# Initialize full pipeline
|
| 94 |
+
pipeline = VADDiarizationPipeline(
|
| 95 |
+
use_auth_token=token,
|
| 96 |
+
vad_threshold=0.5
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
# Process file
|
| 100 |
+
result = pipeline.process_file(audio_path)
|
| 101 |
+
|
| 102 |
+
# Display formatted output
|
| 103 |
+
print("\n" + pipeline.format_output(result, format='text'))
|
| 104 |
+
|
| 105 |
+
except Exception as e:
|
| 106 |
+
print(f"\nβ Error: {e}")
|
| 107 |
+
print("Falling back to VAD only...")
|
| 108 |
+
vad = SileroVAD()
|
| 109 |
+
demo_vad(audio_path, vad)
|
| 110 |
+
|
| 111 |
+
def main():
|
| 112 |
+
print("\n" + "=" * 60)
|
| 113 |
+
print("VOICE ACTIVITY DETECTION + SPEAKER DIARIZATION")
|
| 114 |
+
print("=" * 60)
|
| 115 |
+
|
| 116 |
+
# Create test audio
|
| 117 |
+
print("\nCreating test audio...")
|
| 118 |
+
audio_path = create_test_audio("test_audio.wav", duration=10.0)
|
| 119 |
+
print(f"β Created {audio_path}")
|
| 120 |
+
|
| 121 |
+
# Option 1: Quick VAD demo
|
| 122 |
+
print("\n" + "=" * 60)
|
| 123 |
+
print("OPTION 1: VAD ONLY (No HF token needed)")
|
| 124 |
+
print("=" * 60)
|
| 125 |
+
vad_model = setup_vad()
|
| 126 |
+
demo_vad(audio_path, vad_model)
|
| 127 |
+
|
| 128 |
+
# Option 2: Full pipeline (requires HF token)
|
| 129 |
+
print("\n" + "=" * 60)
|
| 130 |
+
print("OPTION 2: FULL PIPELINE (VAD + Diarization)")
|
| 131 |
+
print("=" * 60)
|
| 132 |
+
demo_full_pipeline(audio_path)
|
| 133 |
+
|
| 134 |
+
print("\n" + "=" * 60)
|
| 135 |
+
print("β
Demo complete!")
|
| 136 |
+
print("\nNext steps:")
|
| 137 |
+
print("1. Set HF_TOKEN: export HF_TOKEN='your_token_here'")
|
| 138 |
+
print("2. Run Gradio demo: python app.py")
|
| 139 |
+
print("3. Test on real audio files")
|
| 140 |
+
print("4. Deploy with Docker: docker build -t vad-diarization .")
|
| 141 |
+
print("5. Check notebooks/demo.ipynb for detailed examples")
|
| 142 |
+
print("=" * 60 + "\n")
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
main()
|
verify_installation.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Installation verification script
|
| 4 |
+
Checks that all components are properly installed and configured
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import importlib
|
| 10 |
+
|
| 11 |
+
def check_python_version():
|
| 12 |
+
"""Check Python version."""
|
| 13 |
+
print("Checking Python version...")
|
| 14 |
+
version = sys.version_info
|
| 15 |
+
if version.major >= 3 and version.minor >= 10:
|
| 16 |
+
print(f" β
Python {version.major}.{version.minor}.{version.micro}")
|
| 17 |
+
return True
|
| 18 |
+
else:
|
| 19 |
+
print(f" β Python {version.major}.{version.minor}.{version.micro} (requires 3.10+)")
|
| 20 |
+
return False
|
| 21 |
+
|
| 22 |
+
def check_package(package_name, import_name=None):
|
| 23 |
+
"""Check if a package is installed."""
|
| 24 |
+
if import_name is None:
|
| 25 |
+
import_name = package_name
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
mod = importlib.import_module(import_name)
|
| 29 |
+
version = getattr(mod, '__version__', 'unknown')
|
| 30 |
+
print(f" β
{package_name} ({version})")
|
| 31 |
+
return True
|
| 32 |
+
except ImportError:
|
| 33 |
+
print(f" β {package_name} not found")
|
| 34 |
+
return False
|
| 35 |
+
|
| 36 |
+
def check_cuda():
|
| 37 |
+
"""Check CUDA availability."""
|
| 38 |
+
print("Checking CUDA...")
|
| 39 |
+
try:
|
| 40 |
+
import torch
|
| 41 |
+
if torch.cuda.is_available():
|
| 42 |
+
print(f" β
CUDA available (version {torch.version.cuda})")
|
| 43 |
+
print(f" GPU: {torch.cuda.get_device_name(0)}")
|
| 44 |
+
return True
|
| 45 |
+
else:
|
| 46 |
+
print(" β οΈ CUDA not available (CPU mode)")
|
| 47 |
+
return False
|
| 48 |
+
except ImportError:
|
| 49 |
+
print(" β PyTorch not installed")
|
| 50 |
+
return False
|
| 51 |
+
|
| 52 |
+
def check_files():
|
| 53 |
+
"""Check that all required files exist."""
|
| 54 |
+
print("Checking project files...")
|
| 55 |
+
|
| 56 |
+
required_files = [
|
| 57 |
+
'src/__init__.py',
|
| 58 |
+
'src/vad.py',
|
| 59 |
+
'src/diarization.py',
|
| 60 |
+
'src/pipeline.py',
|
| 61 |
+
'src/utils.py',
|
| 62 |
+
'app.py',
|
| 63 |
+
'vad_diarization.py',
|
| 64 |
+
'requirements.txt',
|
| 65 |
+
'Dockerfile',
|
| 66 |
+
'README.md'
|
| 67 |
+
]
|
| 68 |
+
|
| 69 |
+
all_exist = True
|
| 70 |
+
for file in required_files:
|
| 71 |
+
path = Path(file)
|
| 72 |
+
if path.exists():
|
| 73 |
+
print(f" β
{file}")
|
| 74 |
+
else:
|
| 75 |
+
print(f" β {file} missing")
|
| 76 |
+
all_exist = False
|
| 77 |
+
|
| 78 |
+
return all_exist
|
| 79 |
+
|
| 80 |
+
def check_hf_token():
|
| 81 |
+
"""Check for Hugging Face token."""
|
| 82 |
+
print("Checking Hugging Face token...")
|
| 83 |
+
import os
|
| 84 |
+
token = os.environ.get('HF_TOKEN')
|
| 85 |
+
if token:
|
| 86 |
+
print(f" β
HF_TOKEN found (length: {len(token)})")
|
| 87 |
+
return True
|
| 88 |
+
else:
|
| 89 |
+
print(" β οΈ HF_TOKEN not set (required for full pipeline)")
|
| 90 |
+
print(" Set with: export HF_TOKEN='your_token_here'")
|
| 91 |
+
return False
|
| 92 |
+
|
| 93 |
+
def test_vad():
|
| 94 |
+
"""Test VAD functionality."""
|
| 95 |
+
print("Testing VAD...")
|
| 96 |
+
try:
|
| 97 |
+
from src.vad import SileroVAD
|
| 98 |
+
vad = SileroVAD(threshold=0.5)
|
| 99 |
+
print(" β
VAD initialized successfully")
|
| 100 |
+
|
| 101 |
+
# Quick benchmark
|
| 102 |
+
metrics = vad.benchmark_latency(duration_seconds=1.0)
|
| 103 |
+
latency = metrics['latency_per_second_ms']
|
| 104 |
+
print(f" β
VAD latency: {latency:.2f}ms per second")
|
| 105 |
+
|
| 106 |
+
if latency < 100:
|
| 107 |
+
print(" β
Latency target achieved (<100ms)")
|
| 108 |
+
else:
|
| 109 |
+
print(" β οΈ Latency above target")
|
| 110 |
+
|
| 111 |
+
return True
|
| 112 |
+
except Exception as e:
|
| 113 |
+
print(f" β VAD test failed: {e}")
|
| 114 |
+
return False
|
| 115 |
+
|
| 116 |
+
def main():
|
| 117 |
+
"""Run all verification checks."""
|
| 118 |
+
print("\n" + "="*60)
|
| 119 |
+
print("INSTALLATION VERIFICATION")
|
| 120 |
+
print("="*60 + "\n")
|
| 121 |
+
|
| 122 |
+
results = {}
|
| 123 |
+
|
| 124 |
+
# Python version
|
| 125 |
+
results['python'] = check_python_version()
|
| 126 |
+
print()
|
| 127 |
+
|
| 128 |
+
# Required packages
|
| 129 |
+
print("Checking required packages...")
|
| 130 |
+
packages = [
|
| 131 |
+
('torch', 'torch'),
|
| 132 |
+
('numpy', 'numpy'),
|
| 133 |
+
('librosa', 'librosa'),
|
| 134 |
+
('soundfile', 'soundfile'),
|
| 135 |
+
('gradio', 'gradio'),
|
| 136 |
+
('matplotlib', 'matplotlib'),
|
| 137 |
+
('silero-vad', 'silero_vad'),
|
| 138 |
+
('pyannote.audio', 'pyannote.audio')
|
| 139 |
+
]
|
| 140 |
+
|
| 141 |
+
results['packages'] = all(check_package(name, imp) for name, imp in packages)
|
| 142 |
+
print()
|
| 143 |
+
|
| 144 |
+
# CUDA
|
| 145 |
+
results['cuda'] = check_cuda()
|
| 146 |
+
print()
|
| 147 |
+
|
| 148 |
+
# Files
|
| 149 |
+
results['files'] = check_files()
|
| 150 |
+
print()
|
| 151 |
+
|
| 152 |
+
# HF Token
|
| 153 |
+
results['token'] = check_hf_token()
|
| 154 |
+
print()
|
| 155 |
+
|
| 156 |
+
# VAD test
|
| 157 |
+
results['vad'] = test_vad()
|
| 158 |
+
print()
|
| 159 |
+
|
| 160 |
+
# Summary
|
| 161 |
+
print("="*60)
|
| 162 |
+
print("VERIFICATION SUMMARY")
|
| 163 |
+
print("="*60)
|
| 164 |
+
|
| 165 |
+
total = len(results)
|
| 166 |
+
passed = sum(1 for v in results.values() if v)
|
| 167 |
+
|
| 168 |
+
for check, result in results.items():
|
| 169 |
+
status = "β
PASS" if result else "β FAIL"
|
| 170 |
+
print(f"{check.upper():20s}: {status}")
|
| 171 |
+
|
| 172 |
+
print()
|
| 173 |
+
print(f"Results: {passed}/{total} checks passed")
|
| 174 |
+
|
| 175 |
+
if passed == total:
|
| 176 |
+
print("\nπ All checks passed! System is ready to use.")
|
| 177 |
+
print("\nNext steps:")
|
| 178 |
+
print("1. Run demo: python vad_diarization.py")
|
| 179 |
+
print("2. Launch Gradio: python app.py")
|
| 180 |
+
print("3. Run benchmarks: python benchmarks/run_benchmarks.py --quick")
|
| 181 |
+
elif results['python'] and results['packages'] and results['files']:
|
| 182 |
+
print("\nβ
Core system is functional.")
|
| 183 |
+
if not results['token']:
|
| 184 |
+
print("β οΈ Set HF_TOKEN for full pipeline functionality")
|
| 185 |
+
if not results['cuda']:
|
| 186 |
+
print("β οΈ CUDA not available, will use CPU (slower)")
|
| 187 |
+
else:
|
| 188 |
+
print("\nβ Installation incomplete. Please fix the issues above.")
|
| 189 |
+
print("\nTry running: ./setup.sh")
|
| 190 |
+
|
| 191 |
+
print("="*60 + "\n")
|
| 192 |
+
|
| 193 |
+
return passed == total
|
| 194 |
+
|
| 195 |
+
if __name__ == "__main__":
|
| 196 |
+
success = main()
|
| 197 |
+
sys.exit(0 if success else 1)
|