Spaces:
Running
Running
Add SOP Audio Analyzer app files
Browse files- Full Streamlit app with audio analysis
- SpeechBrain VAD, diarization, voiceprint
- Fraud detection modules
- Database models
- Dockerfile +21 -9
- README.md +67 -12
- app.py +1361 -0
- requirements.txt +28 -3
- src/__init__.py +27 -0
- src/analyzer.py +597 -0
- src/database/__init__.py +3 -0
- src/database/models.py +320 -0
- src/fraud_detection/__init__.py +14 -0
- src/fraud_detection/pause_detector.py +235 -0
- src/fraud_detection/reading_pattern.py +238 -0
- src/fraud_detection/whisper_detector.py +203 -0
- src/phase1_foundation/__init__.py +11 -0
- src/phase1_foundation/diarization.py +199 -0
- src/phase1_foundation/preprocessor.py +102 -0
- src/phase1_foundation/vad.py +117 -0
- src/phase1_foundation/voiceprint.py +199 -0
- src/phase2_background/__init__.py +3 -0
- src/phase2_background/analyzer.py +253 -0
- src/phase6_synthetic/__init__.py +8 -0
- src/phase6_synthetic/detector.py +494 -0
- src/phase6_synthetic/wake_words.py +235 -0
- src/ui/__init__.py +1 -0
Dockerfile
CHANGED
|
@@ -1,20 +1,32 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
-
|
| 7 |
-
|
| 8 |
git \
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
|
| 12 |
-
COPY
|
| 13 |
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install system dependencies for audio processing
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
+
ffmpeg \
|
| 8 |
+
libsndfile1 \
|
| 9 |
git \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Copy requirements first for better caching
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
|
| 15 |
+
# Install Python dependencies
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
|
| 18 |
+
# Copy application code
|
| 19 |
+
COPY . .
|
| 20 |
|
| 21 |
+
# Create necessary directories
|
| 22 |
+
RUN mkdir -p data/db data/clips pretrained_models
|
| 23 |
|
| 24 |
+
# Expose Streamlit port (HF Spaces uses 7860)
|
| 25 |
+
EXPOSE 7860
|
| 26 |
+
|
| 27 |
+
# Set environment variables
|
| 28 |
+
ENV STREAMLIT_SERVER_PORT=7860
|
| 29 |
+
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
| 30 |
+
|
| 31 |
+
# Run Streamlit
|
| 32 |
+
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
README.md
CHANGED
|
@@ -1,19 +1,74 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: red
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
pinned: false
|
| 11 |
-
short_description: Audio Analyzer
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SOP Audio Analyzer
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: red
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.29.0
|
| 8 |
+
python_version: "3.11"
|
| 9 |
+
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# SOP Audio Analyzer
|
| 14 |
|
| 15 |
+
Test Integrity Analysis - Voice fraud detection for take-at-home tests.
|
| 16 |
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- 🎤 **Record or upload** audio files
|
| 20 |
+
- 🗣️ **Speaker diarization** - detect multiple voices
|
| 21 |
+
- 🎯 **Voiceprint extraction** - unique ID per speaker
|
| 22 |
+
- 🔈 **Background analysis** - detect whispers, distant voices
|
| 23 |
+
- 🤖 **Synthetic detection** - identify TTS/AI voices
|
| 24 |
+
- 📢 **Wake word detection** - Alexa, Siri, Google
|
| 25 |
+
- 🗄️ **Cross-test tracking** - find same voice across tests
|
| 26 |
+
|
| 27 |
+
## Installation
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
# Create virtual environment
|
| 31 |
+
python -m venv venv
|
| 32 |
+
source venv/bin/activate # Linux/Mac
|
| 33 |
+
# or: venv\Scripts\activate # Windows
|
| 34 |
+
|
| 35 |
+
# Install dependencies
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Run
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
streamlit run app.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
## Project Structure
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
sop-audio-analyzer/
|
| 49 |
+
├── app.py # Main Streamlit app
|
| 50 |
+
├── requirements.txt
|
| 51 |
+
├── src/
|
| 52 |
+
│ ├── phase1_foundation/ # VAD, Diarization, Voiceprint
|
| 53 |
+
│ ├── phase2_background/ # Background analysis
|
| 54 |
+
│ ├── phase6_synthetic/ # Synthetic & wake word detection
|
| 55 |
+
│ ├── database/ # SQLite models & queries
|
| 56 |
+
│ └── ui/ # UI components
|
| 57 |
+
├── data/
|
| 58 |
+
│ ├── db/ # SQLite database
|
| 59 |
+
│ └── clips/ # Extracted audio clips
|
| 60 |
+
└── tests/
|
| 61 |
+
└── audio/ # Test audio files
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Usage
|
| 65 |
+
|
| 66 |
+
1. **Analyzer tab**: Upload or record audio → Analyze → View results
|
| 67 |
+
2. **Database tab**: Browse all voiceprints → Track across tests
|
| 68 |
+
|
| 69 |
+
## Tech Stack
|
| 70 |
+
|
| 71 |
+
- **SpeechBrain**: VAD, diarization, speaker recognition
|
| 72 |
+
- **Whisper**: Transcription, wake word detection
|
| 73 |
+
- **Streamlit**: Web UI
|
| 74 |
+
- **SQLite**: Voiceprint database
|
app.py
ADDED
|
@@ -0,0 +1,1361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SOP Audio Analyzer - Streamlit UI
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import tempfile
|
| 7 |
+
import wave
|
| 8 |
+
import numpy as np
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
|
| 11 |
+
import av
|
| 12 |
+
import queue
|
| 13 |
+
import threading
|
| 14 |
+
import time
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
|
| 17 |
+
# Page config
|
| 18 |
+
st.set_page_config(
|
| 19 |
+
page_title="Test Integrity Analysis",
|
| 20 |
+
page_icon="🎙️",
|
| 21 |
+
layout="wide"
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# ============ SIMPLE LOGIN ============
# NOTE(security): credentials were hard-coded in the source. They can now be
# overridden via environment variables; the defaults preserve the original
# behavior so existing deployments keep working.
APP_USERNAME = os.environ.get("APP_USERNAME", "PTEXAdmin")
APP_PASSWORD = os.environ.get("APP_PASSWORD", "T3st@26")

def check_login():
    """Simple username/password gate for the app.

    Renders a login form until the user authenticates, storing the outcome
    in ``st.session_state['authenticated']``.

    Returns:
        bool: True once the user is authenticated, False while the login
        form is still being shown.
    """
    import hmac  # local import: only needed for the credential comparison

    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = False

    if not st.session_state['authenticated']:
        st.markdown("## 🔐 Login")
        st.markdown("Enter credentials to access the application")

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            username = st.text_input("Username", key="login_username")
            password = st.text_input("Password", type="password", key="login_password")
            if st.button("Login", use_container_width=True):
                # Constant-time comparison avoids trivial timing side-channels.
                user_ok = hmac.compare_digest(username, APP_USERNAME)
                pass_ok = hmac.compare_digest(password, APP_PASSWORD)
                if user_ok and pass_ok:
                    st.session_state['authenticated'] = True
                    st.session_state['username'] = username
                    st.rerun()
                else:
                    st.error("Invalid credentials")
        return False
    return True
|
| 50 |
+
|
| 51 |
+
# Initialize analyzer (lazy) - v2 forces reload
@st.cache_resource
def get_analyzer():
    """Build and cache a single AudioAnalyzer for the app process.

    The import is deferred to first call so the heavy model loading inside
    ``src.analyzer`` does not run at app startup.
    """
    from src.analyzer import AudioAnalyzer
    return AudioAnalyzer()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class AudioProcessor(AudioProcessorBase):
    """Audio processor that records WebRTC frames and exposes level data.

    Frames arrive on the WebRTC worker thread while the UI thread reads
    them, so all access to ``audio_frames`` goes through ``self.lock``.
    """

    # Cap the level queue so a UI that stops draining it cannot grow memory
    # without bound. BUGFIX: the original used an unbounded queue.Queue(),
    # which never raises queue.Full, making the except branch dead code.
    MAX_LEVEL_SAMPLES = 100

    def __init__(self):
        self.audio_frames = []      # raw ndarray chunks, in arrival order
        self.sample_rate = 48000    # WebRTC default capture rate
        self.lock = threading.Lock()
        self.level_queue = queue.Queue(maxsize=self.MAX_LEVEL_SAMPLES)

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Store the incoming frame and push its RMS level for the UI."""
        sound = frame.to_ndarray()

        with self.lock:
            self.audio_frames.append(sound.copy())

        # RMS -> dB; floor at 1e-10 to avoid log10(0).
        rms = np.sqrt(np.mean(sound.astype(np.float32) ** 2))
        level_db = 20 * np.log10(max(rms, 1e-10))

        try:
            self.level_queue.put_nowait(level_db)
        except queue.Full:
            pass  # UI is not consuming levels fast enough; drop this sample

        return frame

    def get_audio_data(self):
        """Return all recorded audio concatenated along time, or None."""
        with self.lock:
            if not self.audio_frames:
                return None
            return np.concatenate(self.audio_frames, axis=1)

    def get_frame_count(self):
        """Return the number of frames recorded so far."""
        with self.lock:
            return len(self.audio_frames)

    def clear(self):
        """Discard all recorded frames."""
        with self.lock:
            self.audio_frames = []

    def save_to_wav(self, filepath: str) -> bool:
        """Save the recording to *filepath* as 16-bit mono PCM WAV.

        Multi-channel audio is averaged down to mono and the signal is
        peak-normalized before quantization.

        Returns:
            bool: False when nothing has been recorded, True on success.
        """
        audio_data = self.get_audio_data()
        if audio_data is None or audio_data.size == 0:
            return False

        # Convert to mono if stereo
        if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
            audio_data = audio_data.mean(axis=0)
        else:
            audio_data = audio_data.flatten()

        # Peak-normalize, then quantize to signed 16-bit
        audio_data = audio_data.astype(np.float32)
        max_val = np.abs(audio_data).max()
        if max_val > 0:
            audio_data = audio_data / max_val
        audio_int16 = (audio_data * 32767).astype(np.int16)

        # Save WAV
        with wave.open(filepath, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio_int16.tobytes())

        return True
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def render_waveform(audio_data, sample_rate):
    """Build a plotly waveform figure for *audio_data*, or None if empty.

    The signal is mixed down to mono, downsampled to at most ~1000 points
    for display, and peak-normalized to [-1, 1].
    """
    if audio_data is None:
        return None

    # Flatten to mono
    if len(audio_data.shape) > 1:
        audio_data = audio_data.mean(axis=0)
    else:
        audio_data = audio_data.flatten()

    # Downsample for display. BUGFIX: the original probed `'step' in dir()`
    # to decide whether downsampling had happened; track the stride
    # explicitly instead so the duration math is always well-defined.
    max_points = 1000
    step = 1
    if len(audio_data) > max_points:
        step = len(audio_data) // max_points
        audio_data = audio_data[::step]

    # Time axis in seconds of the (possibly downsampled) signal
    duration = len(audio_data) / (sample_rate / step)
    time_axis = np.linspace(0, duration, len(audio_data))

    # Peak-normalize for a stable y range
    max_val = np.abs(audio_data).max()
    if max_val > 0:
        audio_data = audio_data / max_val

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=time_axis,
        y=audio_data,
        mode='lines',
        line=dict(color='#2563eb', width=1),
        fill='tozeroy',
        fillcolor='rgba(37, 99, 235, 0.3)'
    ))

    fig.update_layout(
        height=150,
        margin=dict(l=0, r=0, t=10, b=30),
        xaxis=dict(title='Time (s)', showgrid=True),
        yaxis=dict(visible=False, range=[-1, 1]),
        showlegend=False
    )

    return fig
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def format_time(seconds: float) -> str:
    """Format a duration in seconds as M:SS, or H:MM:SS for >= 1 hour."""
    total = int(seconds)
    hours = total // 3600
    minutes, secs = divmod(total % 3600, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def save_audio_to_wav(audio_data, sample_rate, filepath):
    """Write a numpy audio buffer to *filepath* as 16-bit mono PCM WAV.

    Multi-channel input is averaged down to mono and the signal is
    peak-normalized before quantization.

    Returns:
        bool: False when there is no audio to write, True on success.
    """
    if audio_data is None or audio_data.size == 0:
        return False

    # Collapse to a single channel: average across channels if stereo+.
    if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
        mono = audio_data.mean(axis=0)
    else:
        mono = audio_data.flatten()

    # Peak-normalize, then quantize to signed 16-bit.
    mono = mono.astype(np.float32)
    peak = np.abs(mono).max()
    if peak > 0:
        mono = mono / peak
    samples = (mono * 32767).astype(np.int16)

    with wave.open(filepath, 'wb') as wav_out:
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)
        wav_out.setframerate(sample_rate)
        wav_out.writeframes(samples.tobytes())

    return True
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def render_analyzer_tab():
    """Render the analyzer tab: record or upload audio, then analyze it."""

    # Initialize session state
    if 'recorded_audio_path' not in st.session_state:
        st.session_state.recorded_audio_path = None

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .recording-container {
        background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
        border-radius: 16px;
        padding: 2rem;
        text-align: center;
        margin-bottom: 1rem;
    }
    .upload-icon {
        font-size: 1.5rem;
        cursor: pointer;
    }
    .stAudio > div {
        border-radius: 8px;
    }
    </style>
    """, unsafe_allow_html=True)

    # Main recording area - centered and prominent
    col_left, col_main, col_right = st.columns([1, 3, 1])

    with col_main:
        st.markdown("#### 🎙️ Record Audio")
        st.caption("Minimum 20 seconds required")

        # Audio recorder with visual feedback
        recorded_audio = st.audio_input("", key="audio_recorder", label_visibility="collapsed")

        if recorded_audio:
            # BUGFIX: tempfile.mktemp() is deprecated and race-prone; create
            # the file atomically with NamedTemporaryFile instead.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
                tmp.write(recorded_audio.getbuffer())
                temp_path = tmp.name
            st.session_state.recorded_audio_path = temp_path

            # Show audio duration (uses the module-level `wave` import; the
            # original re-imported wave locally for no reason).
            with wave.open(temp_path, 'rb') as wf:
                frames = wf.getnframes()
                rate = wf.getframerate()
                duration = frames / float(rate)

            if duration < 20:
                st.warning(f"⚠️ Audio: {duration:.1f}s - Need at least 20s")
            else:
                st.success(f"✅ Audio ready: {duration:.1f}s")

            # Analyze button
            if st.button("🔍 Analyze", type="primary", use_container_width=True):
                analyze_recorded_audio(temp_path)

    # Upload button - opens modal dialog
    with col_right:
        if st.button("📤", help="Upload audio file", key="open_upload_modal"):
            st.session_state['show_upload_modal'] = True

    # Upload modal dialog
    @st.dialog("Upload Audio File")
    def upload_dialog():
        uploaded_file = st.file_uploader(
            "Select audio file",
            type=['wav', 'mp3', 'm4a', 'ogg', 'flac'],
            key="audio_uploader_modal"
        )
        if uploaded_file is not None:
            st.audio(uploaded_file, format=f'audio/{uploaded_file.type.split("/")[-1]}')
            if st.button("🔍 Analyze", use_container_width=True):
                st.session_state['show_upload_modal'] = False
                analyze_audio(uploaded_file)

    if st.session_state.get('show_upload_modal', False):
        upload_dialog()
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def analyze_recorded_audio(audio_path: str):
    """Run analysis on a recorded audio file and render the results.

    The temp recording is always deleted afterwards. BUGFIX: the original
    only cleaned up on success, leaking the temp file (and a stale
    ``recorded_audio_path`` in session state) whenever analysis raised.
    """
    try:
        # Clear analyzer cache to ensure fresh analysis
        st.cache_resource.clear()

        # Progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        def update_progress(msg, pct):
            progress_bar.progress(pct / 100)
            status_text.text(msg)

        # Run analysis
        analyzer = get_analyzer()
        result = analyzer.analyze(audio_path, progress_callback=update_progress)

        # Clear progress
        progress_bar.empty()
        status_text.empty()

        # Store result in session state
        st.session_state['last_result'] = result

        # Display results
        render_results(result)

    except Exception as e:
        st.error(f"Analysis failed: {str(e)}")
    finally:
        # Always clear the recorded audio, even when analysis failed
        if st.session_state.recorded_audio_path and os.path.exists(st.session_state.recorded_audio_path):
            os.remove(st.session_state.recorded_audio_path)
        st.session_state.recorded_audio_path = None
        st.session_state.recording_complete = False
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def analyze_audio(uploaded_file):
    """Run analysis on an uploaded audio file and render the results."""
    # Persist the upload to disk so the analyzer can read it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name

    try:
        # Drop any cached analyzer so this run starts fresh.
        st.cache_resource.clear()

        # Progress widgets updated by the analyzer callback.
        progress = st.progress(0)
        status = st.empty()

        def on_progress(message, percent):
            progress.progress(percent / 100)
            status.text(message)

        result = get_analyzer().analyze(tmp_path, progress_callback=on_progress)

        progress.empty()
        status.empty()

        st.session_state['last_result'] = result

        # Keep the raw bytes around so the UI can replay the audio later.
        with open(tmp_path, 'rb') as f:
            st.session_state['last_audio_bytes'] = f.read()

        render_results(result)

    finally:
        # Remove the temp copy whether or not analysis succeeded.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def calc_speaking_time(speaker, max_duration):
    """Return the speaker's net talking time in seconds.

    Overlapping diarization segments are merged and each segment is clipped
    to [0, max_duration], so the result can never exceed the clip length.
    Falls back to the reported total (capped) when no segments exist.
    """
    if not speaker.segments:
        return min(speaker.total_seconds, max_duration)

    intervals = []
    for seg in sorted(speaker.segments, key=lambda s: s['start']):
        start = max(0, seg['start'])
        end = min(seg['end'], max_duration)
        if start >= max_duration:
            continue  # segment lies entirely past the end of the audio
        if intervals and start <= intervals[-1][1]:
            # Overlaps (or touches) the previous interval: extend it.
            prev_start, prev_end = intervals[-1]
            intervals[-1] = (prev_start, max(prev_end, end))
        else:
            intervals.append((start, end))
    return sum(end - start for start, end in intervals)
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def render_results(result):
    """Render analysis results in a compact three-column layout."""
    st.markdown("---")
    st.markdown("## Analysis Results")

    # Row 1: main speaker | detection flags | alerts
    left, middle, right = st.columns([1.2, 1, 1])

    with left:
        st.markdown("#### 🎤 Main Speaker")
        if result.main_speaker:
            spk = result.main_speaker
            st.code(spk.voiceprint_id, language=None)

            quality_badge = {"High": "🟢", "Medium": "🟡", "Low": "🔴"}.get(spk.quality, "⚪")
            synth_badge = "✅" if not spk.is_synthetic else "⚠️"

            talk_secs = calc_speaking_time(spk, result.duration_seconds)
            st.markdown(f"{quality_badge} Quality: **{spk.quality}** · {talk_secs:.1f}s")
            st.markdown(f"{synth_badge} Synthetic risk: **{spk.synthetic_score:.0%}**")

            # Voice sample playback, if a clip was extracted
            sample_path = getattr(spk, 'clip_path', None)
            if sample_path and os.path.exists(sample_path):
                with open(sample_path, 'rb') as sample:
                    st.audio(sample.read(), format='audio/wav')

    with middle:
        st.markdown("#### 🔍 Detection")
        # Synthetic-voice flag from the main speaker
        is_synth = result.main_speaker and result.main_speaker.is_synthetic
        synth_badge = "⚠️" if is_synth else "✅"
        st.markdown(f"{synth_badge} **Synthetic voice:** {'Yes' if is_synth else 'No'}")
        st.caption("AI-generated (ElevenLabs, clones)")

        # Playback (audio routed through speakers)
        has_playback = getattr(result, 'playback_detected', False)
        playback_pct = getattr(result, 'playback_score', 0.0)
        playback_badge = "🔊" if has_playback else "✅"
        st.markdown(f"{playback_badge} **Playback:** {'Yes' if has_playback else 'No'} ({playback_pct:.0%})")
        st.caption("Audio from speakers")

        # Reading-aloud rhythm
        is_reading = getattr(result, 'reading_pattern_detected', False)
        reading_pct = getattr(result, 'reading_confidence', 0.0)
        reading_badge = "📖" if is_reading else "✅"
        st.markdown(f"{reading_badge} **Reading:** {'Yes' if is_reading else 'No'} ({reading_pct:.0%})")
        st.caption("Unnatural speech rhythm")

    with right:
        st.markdown("#### 🚨 Alerts")
        # Wake words (show at most two)
        if result.wake_words:
            for hit in result.wake_words[:2]:
                st.markdown(f"🔴 **\"{hit['word']}\"** @ {format_time(hit['time'])}")
        else:
            st.markdown("✅ No wake words")
        st.caption("Alexa, Siri, transfer...")

        # Whispering in the background
        has_whispers = getattr(result, 'whisper_detected', False)
        whispers = getattr(result, 'whisper_instances', []) or []
        if has_whispers:
            st.markdown(f"🔇 **{len(whispers)} whispers**")
            for inst in whispers[:2]:
                st.markdown(f"· @ {format_time(inst['start'])} ({inst['confidence']:.0%})")
        else:
            st.markdown("✅ No whispers")
        st.caption("Background voices")

        # Long silences
        has_pauses = getattr(result, 'suspicious_pauses_detected', False)
        pause_list = getattr(result, 'suspicious_pauses', []) or []
        max_pause = getattr(result, 'longest_pause', 0.0)
        if has_pauses:
            st.markdown(f"⏸️ **{len(pause_list)} long pauses** (max {max_pause:.0f}s)")
        else:
            st.markdown("✅ No suspicious pauses")
        st.caption("Silences > 5 seconds")

    # Row 2: additional speakers
    st.markdown("---")

    extras = result.additional_speakers
    if extras:
        combined = sum(calc_speaking_time(s, result.duration_seconds) for s in extras)
        st.markdown(f"#### 👥 Additional Speakers ({len(extras)}) · {combined:.1f}s total")

        letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

        # Lay speakers out in rows of up to three cards
        per_row = min(3, len(extras))
        for base in range(0, len(extras), per_row):
            chunk = extras[base:base + per_row]
            cards = st.columns(per_row)

            for offset, spk in enumerate(chunk):
                idx = base + offset
                label = f"Speaker {letters[idx]}" if idx < len(letters) else f"Speaker {idx+1}"

                with cards[offset]:
                    kind = "⚠️ synthetic" if spk.is_synthetic else "human"
                    seen = f"🚨 {spk.times_seen}x" if spk.times_seen > 1 else "1st"
                    talk_secs = calc_speaking_time(spk, result.duration_seconds)

                    st.markdown(f"**{label}** · {talk_secs:.1f}s · {kind} · {seen}")
                    st.code(spk.voiceprint_id, language=None)

                    # Voice sample playback
                    sample_path = getattr(spk, 'clip_path', None)
                    if sample_path and os.path.exists(sample_path):
                        with open(sample_path, 'rb') as sample:
                            st.audio(sample.read(), format='audio/wav')

                    # Cross-test history shortcut for repeat voices
                    if spk.times_seen > 1:
                        if st.button("History", key=f"hist_{spk.voiceprint_id}_{idx}"):
                            st.session_state['view_voiceprint'] = spk.voiceprint_id
                            st.session_state['active_tab'] = 'database'
                            st.rerun()
    else:
        st.markdown("✅ No additional speakers detected")

    # Timeline + export
    st.markdown("---")
    st.markdown("#### 📊 Timeline")

    render_timeline(result)

    st.download_button(
        label="📥 Download JSON",
        data=result.to_json(),
        file_name=f"{result.test_id}_analysis.json",
        mime="application/json"
    )
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def render_timeline(result):
    """Render a per-speaker timeline of the analyzed audio.

    Draws one horizontal row per speaker (main speaker first, then
    additional speakers labelled A, B, C, ... to match the diarization
    labels) plus a bottom "Events" row holding wake words, background
    anomalies, whispers and suspicious pauses.  Finishes with a legend
    caption and, when available, an audio player for synchronized review.

    Args:
        result: Analysis result object exposing ``duration_seconds``,
            ``main_speaker``, ``additional_speakers`` and (optionally)
            the event lists read below.
    """
    import plotly.graph_objects as go

    fig = go.Figure()
    duration = result.duration_seconds

    # Build the y-axis rows: main speaker on top, then additional speakers.
    speakers = []
    speaker_colors = {}

    if result.main_speaker:
        speakers.append(('Main Speaker', result.main_speaker))
        speaker_colors['Main Speaker'] = '#2563eb'

    # Letters (A, B, C...) match the labels used by the diarization view.
    additional_colors = ['#dc2626', '#ea580c', '#ca8a04', '#16a34a', '#9333ea']
    speaker_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    for i, speaker in enumerate(result.additional_speakers):
        label = f"Speaker {speaker_letters[i]}" if i < len(speaker_letters) else f"Speaker {i+1}"
        speakers.append((label, speaker))
        speaker_colors[label] = additional_colors[i % len(additional_colors)]

    # Dedicated bottom row for point events.
    speakers.append(('Events', None))

    y_positions = {label: i for i, (label, _) in enumerate(speakers)}
    y_labels = [label for label, _ in speakers]

    bar_height = 0.4

    # Speech segments as rectangle shapes; plotly shapes have no hover text,
    # so each gets an invisible companion marker carrying the tooltip.
    for label, speaker in speakers:
        if speaker is None or not hasattr(speaker, 'segments') or not speaker.segments:
            continue

        y_pos = y_positions[label]
        color = speaker_colors.get(label, '#6b7280')

        for seg in speaker.segments:
            fig.add_shape(
                type="rect",
                x0=seg['start'],
                x1=seg['end'],
                y0=y_pos - bar_height,
                y1=y_pos + bar_height,
                fillcolor=color,
                line=dict(width=0),
                opacity=0.8
            )
            _add_timeline_marker(
                fig,
                x=(seg['start'] + seg['end']) / 2,
                y=y_pos,
                marker=dict(size=1, opacity=0),
                text=f"{label}: {seg['start']:.1f}s - {seg['end']:.1f}s ({seg['end']-seg['start']:.1f}s)",
            )

    events_y = y_positions['Events']

    # Read every event list defensively (the original guarded only two of
    # the four) so results from older analyzer versions still render.
    wake_words = getattr(result, 'wake_words', []) or []
    background_anomalies = getattr(result, 'background_anomalies', []) or []
    whisper_instances = getattr(result, 'whisper_instances', []) or []
    suspicious_pauses = getattr(result, 'suspicious_pauses', []) or []

    # Wake words as red diamonds.
    for ww in wake_words:
        _add_timeline_marker(
            fig, x=ww['time'], y=events_y,
            marker=dict(size=14, color='#dc2626', symbol='diamond'),
            text=f"Wake Word: {ww['word']} ({ww['confidence']:.0%})",
        )

    # Anomalies as yellow triangles.
    for anom in background_anomalies:
        _add_timeline_marker(
            fig, x=anom['start'], y=events_y,
            marker=dict(size=12, color='#eab308', symbol='triangle-up'),
            text=f"Anomaly: {anom['type']} ({anom['confidence']:.0%})",
        )

    # Whispers as purple circles.
    for whisper in whisper_instances:
        _add_timeline_marker(
            fig, x=whisper['start'], y=events_y,
            marker=dict(size=12, color='#9333ea', symbol='circle'),
            text=f"Whisper: {whisper['start']:.1f}s - {whisper['end']:.1f}s ({whisper['confidence']:.0%})",
        )

    # Suspicious pauses as semi-transparent gray bars plus a hover marker.
    for pause in suspicious_pauses:
        fig.add_shape(
            type="rect",
            x0=pause['start'],
            x1=pause['end'],
            y0=events_y - bar_height,
            y1=events_y + bar_height,
            fillcolor='rgba(107, 114, 128, 0.5)',
            line=dict(color='#6b7280', width=1),
        )
        _add_timeline_marker(
            fig, x=(pause['start'] + pause['end']) / 2, y=events_y,
            marker=dict(size=10, color='#6b7280', symbol='square'),
            text=f"Pause: {pause['duration']:.1f}s ({pause['start']:.1f}s - {pause['end']:.1f}s)",
        )

    # Dynamic chart height so every speaker row stays readable.
    row_height = 50
    chart_height = max(180, len(speakers) * row_height + 60)

    fig.update_layout(
        height=chart_height,
        margin=dict(l=100, r=20, t=20, b=40),
        xaxis=dict(
            range=[0, duration],
            title='Time (seconds)',
            showgrid=True,
            gridcolor='rgba(128,128,128,0.2)'
        ),
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(len(y_labels))),
            ticktext=y_labels,
            range=[-0.8, len(y_labels) - 0.2],
            showgrid=True,
            gridcolor='rgba(128,128,128,0.1)'
        ),
        showlegend=False,
        plot_bgcolor='rgba(0,0,0,0)'
    )

    # Legend caption for whichever event types are present.
    if wake_words or background_anomalies or whisper_instances or suspicious_pauses:
        legend_text = []
        if wake_words:
            legend_text.append("◆ Wake Words")
        if background_anomalies:
            legend_text.append("▲ Anomalies")
        if whisper_instances:
            legend_text.append("● Whispers")
        if suspicious_pauses:
            legend_text.append("■ Long Pauses")
        st.caption(" | ".join(legend_text))

    st.plotly_chart(fig, use_container_width=True)

    # Audio player below the timeline for synchronized playback.
    if st.session_state.get('last_audio_bytes'):
        st.markdown("**🔊 Audio Playback**")
        st.audio(st.session_state['last_audio_bytes'], format='audio/wav')
        st.caption("Play audio while viewing the timeline above to follow speaker changes")


def _add_timeline_marker(fig, x, y, marker, text):
    """Add a single hoverable scatter marker to *fig* at (x, y)."""
    import plotly.graph_objects as go

    fig.add_trace(go.Scatter(
        x=[x],
        y=[y],
        mode='markers',
        marker=marker,
        hoverinfo='text',
        hovertext=text,
        showlegend=False
    ))
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
def render_database_tab():
    """Render the voiceprint database tab.

    Shows aggregate KPI cards, a search/filter box, and one compact row per
    stored voiceprint with an inline audio sample, a flag toggle and an
    expander listing every test the voice appeared in (with editable
    name/notes and per-appearance audio clips).
    """
    st.markdown("### Voiceprint Database")

    analyzer = get_analyzer()
    stats = analyzer.get_database_stats()

    # KPI cards with short descriptions.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        with st.container(border=True):
            st.metric("Tests Analyzed", stats['total_tests'])
            st.caption("Audio files processed")
    with col2:
        with st.container(border=True):
            st.metric("Unique Voices", stats['total_voiceprints'])
            st.caption("Distinct speakers identified")
    with col3:
        with st.container(border=True):
            st.metric("Flagged", stats['flagged_voiceprints'])
            st.caption("Suspicious voices marked")
    with col4:
        with st.container(border=True):
            st.metric("Recurring", stats['multi_appearance'])
            st.caption("Voices in 2+ tests")

    # Honor a pending "clear search" request set on the previous run
    # (the text_input widget state can only be reset across a rerun).
    if st.session_state.get('clear_db_search', False):
        st.session_state['clear_db_search'] = False
        default_search = ""
    else:
        default_search = st.session_state.get('db_search_value', "")

    # Filter input with a clear button.
    filter_col1, filter_col2 = st.columns([5, 1])
    with filter_col1:
        search_query = st.text_input("🔍 Filter", value=default_search, placeholder="Voice ID or File name...", key="db_search_input", label_visibility="collapsed")
        st.session_state['db_search_value'] = search_query
    with filter_col2:
        if search_query and st.button("✕ Clear", key="clear_search", use_container_width=True):
            st.session_state['clear_db_search'] = True
            st.rerun()

    all_vps = analyzer.db.get_all_voiceprints()
    if not all_vps:
        st.info("No voiceprints yet. Analyze audio to get started!")
        return

    # Apply the text filter (voice id, label, or any appearance filename).
    if search_query:
        all_vps = [vp for vp in all_vps if _voiceprint_matches(vp, search_query, analyzer)]
        if not all_vps:
            st.warning(f"No results for '{search_query}'")
            return

    st.caption(f"Showing {len(all_vps)} voiceprint(s)")
    st.markdown("---")

    # One compact row per voiceprint: Flag | Name | Audio | Tests | Button
    for i, vp in enumerate(all_vps):
        flag_icon = "🚨" if vp.is_flagged else ("🟡" if vp.times_seen >= 2 else "✅")
        label = vp.label if hasattr(vp, 'label') and vp.label else ""
        display_name = label if label else vp.id

        appearances = analyzer.get_voiceprint_history(vp.id)

        cols = st.columns([0.3, 1.5, 2.5, 0.5, 0.8])

        with cols[0]:
            st.write(flag_icon)

        with cols[1]:
            st.write(f"**{display_name}** · {vp.total_audio_seconds:.0f}s")

        with cols[2]:
            # Voice sample: most recent clip, if it still exists on disk.
            if appearances and appearances[0].get('clip_path'):
                clip_path = appearances[0]['clip_path']
                if os.path.exists(clip_path):
                    with open(clip_path, 'rb') as f:
                        st.audio(f.read(), format='audio/wav')
                else:
                    st.caption("—")
            else:
                st.caption("—")

        with cols[3]:
            st.write(f"**{vp.times_seen}** tests")

        with cols[4]:
            # Flag toggle button.
            btn_label = "Unflag" if vp.is_flagged else "Flag"
            if st.button(btn_label, key=f"flag_{vp.id}_{i}"):
                analyzer.db.toggle_voiceprint_flag(vp.id, not vp.is_flagged, "Manual" if not vp.is_flagged else None)
                st.rerun()

        # Accordion with editable metadata and per-test appearances.
        if appearances and len(appearances) > 0:
            with st.expander(f"📋 {len(appearances)} test appearances", expanded=False):
                edit_cols = st.columns([1, 2])
                with edit_cols[0]:
                    new_label = st.text_input("Name", value=label, key=f"label_{vp.id}_{i}", placeholder="Add name...")
                    if new_label != label:
                        if st.button("Save name", key=f"save_label_{vp.id}_{i}"):
                            analyzer.db.update_voiceprint_label(vp.id, new_label)
                            st.rerun()
                with edit_cols[1]:
                    notes = vp.notes if hasattr(vp, 'notes') and vp.notes else ""
                    new_notes = st.text_input("Notes", value=notes, key=f"notes_{vp.id}_{i}", placeholder="Add notes...")
                    if new_notes != notes:
                        if st.button("Save notes", key=f"save_notes_{vp.id}_{i}"):
                            analyzer.db.update_voiceprint_notes(vp.id, new_notes)
                            st.rerun()

                # Appearances table header.
                hdr_cols = st.columns([2, 2.5, 1, 1, 1.5])
                hdr_cols[0].caption("**Date**")
                hdr_cols[1].caption("**File**")
                hdr_cols[2].caption("**Role**")
                hdr_cols[3].caption("**Duration**")
                hdr_cols[4].caption("**Audio**")

                for j, app in enumerate(appearances):
                    row_cols = st.columns([2, 2.5, 1, 1, 1.5])
                    row_cols[0].write(_format_appearance_date(app['date']))
                    row_cols[1].write(app['filename'][:30] if app['filename'] else '-')
                    row_cols[2].write('👤' if app['role'] == 'main' else '👥')
                    row_cols[3].write(f"{app['duration']:.0f}s")

                    # Per-appearance audio clip, when still on disk.
                    clip_path = app.get('clip_path')
                    if clip_path and os.path.exists(clip_path):
                        with open(clip_path, 'rb') as f:
                            row_cols[4].audio(f.read(), format='audio/wav')
                    else:
                        row_cols[4].button("▶", disabled=True, key=f"no_audio_{vp.id}_{j}")


def _voiceprint_matches(vp, query, analyzer):
    """Return True when *query* matches the voiceprint id, label or any
    filename of a test it appeared in (case-insensitive substring match)."""
    q = query.lower()
    if q in vp.id.lower():
        return True
    if vp.label and q in vp.label.lower():
        return True
    for app in analyzer.get_voiceprint_history(vp.id):
        if app.get('filename') and q in app['filename'].lower():
            return True
    return False


def _format_appearance_date(raw):
    """Format an ISO timestamp string as ``dd/mm/yyyy HH:MM``.

    Falls back to the raw prefix when parsing fails instead of swallowing
    every exception with a bare ``except``.
    """
    from datetime import datetime

    if not raw:
        return '-'
    try:
        date_obj = datetime.fromisoformat(raw.replace('Z', '+00:00'))
        return date_obj.strftime('%d/%m/%Y %H:%M')
    except (ValueError, AttributeError):
        # Malformed or non-string timestamp — show a best-effort prefix.
        return raw[:16] if len(raw) >= 16 else raw
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
def render_voiceprint_card(vp, analyzer, compact=False, key_suffix=""):
    """Render a voiceprint summary card with an inline audio player.

    In the full layout the card shows a first-seen caption plus "View" and
    flag-toggle buttons; the compact layout collapses this to a single
    arrow button.
    """
    # Status icon: flagged > recurring > normal.
    if vp.is_flagged:
        flag_icon = "🚨"
    elif vp.times_seen >= 2:
        flag_icon = "🟡"
    else:
        flag_icon = "✅"

    label = vp.label if hasattr(vp, 'label') and vp.label else None
    display_name = f"{label} ({vp.id})" if label else vp.id
    unique_key = f"{vp.id}_{key_suffix}"

    if compact:
        cols = st.columns([4, 1, 1])
    else:
        cols = st.columns([3, 1, 1, 1])

    with cols[0]:
        st.markdown(f"**{flag_icon} {display_name}**")
        if not compact:
            st.caption(f"First: {vp.first_seen.strftime('%Y-%m-%d') if vp.first_seen else '-'} · {vp.total_audio_seconds:.0f}s total")

        # Most recent clip (if it still exists on disk) doubles as a sample.
        appearances = analyzer.get_voiceprint_history(vp.id)
        if appearances and appearances[0].get('clip_path'):
            clip_path = appearances[0]['clip_path']
            if os.path.exists(clip_path):
                with open(clip_path, 'rb') as f:
                    st.audio(f.read(), format='audio/wav')

    with cols[1]:
        st.metric("Tests", vp.times_seen, label_visibility="collapsed")

    if compact:
        with cols[2]:
            if st.button("→", key=f"view_{unique_key}"):
                st.session_state['view_voiceprint'] = vp.id
                st.rerun()
    else:
        with cols[2]:
            if st.button("View", key=f"view_{unique_key}"):
                st.session_state['view_voiceprint'] = vp.id
                st.rerun()
        with cols[3]:
            # Quick flag toggle.
            new_flag = not vp.is_flagged
            flag_label = "Unflag" if vp.is_flagged else "Flag"
            if st.button(flag_label, key=f"flag_{unique_key}"):
                analyzer.db.toggle_voiceprint_flag(vp.id, new_flag, "Manual flag" if new_flag else None)
                st.rerun()
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
def render_appearance_timeline(timeline_data):
    """Render a small bar chart of voiceprint appearances per day."""
    import plotly.express as px
    import pandas as pd

    if not timeline_data:
        return

    frame = pd.DataFrame(timeline_data)
    frame['date'] = pd.to_datetime(frame['date'])
    frame['day'] = frame['date'].dt.strftime('%Y-%m-%d')

    # Appearances per calendar day, oldest first.
    per_day = (
        frame.groupby('day')
        .size()
        .reset_index(name='count')
        .sort_values('day')
    )

    fig = px.bar(
        per_day,
        x='day',
        y='count',
        labels={'day': 'Date', 'count': 'Appearances'},
    )
    fig.update_layout(
        height=120,
        margin=dict(l=0, r=0, t=10, b=30),
        showlegend=False,
        xaxis=dict(
            type='category',
            tickangle=-45
        )
    )
    fig.update_traces(marker_color='#2563eb')

    st.plotly_chart(fig, use_container_width=True)
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
def render_voiceprint_detail(vp_id: str):
    """Render the detail view for a single voiceprint.

    Shows flag status, an editable name and notes, aggregate stats, the
    per-day appearance timeline, and a list of all test appearances with
    audio clips.

    Args:
        vp_id: Database id of the voiceprint to display.
    """
    analyzer = get_analyzer()

    vp = analyzer.db.get_voiceprint(vp_id)
    if not vp:
        st.error(f"Voiceprint {vp_id} not found")
        return

    # Header: status icon + id, flag toggle on the right.
    flag_icon = "🚨" if vp.is_flagged else "🟡" if vp.times_seen >= 2 else "✅"
    current_label = vp.label if hasattr(vp, 'label') and vp.label else ""

    col_title, col_flag = st.columns([4, 1])
    with col_title:
        st.markdown(f"## {flag_icon} {vp_id}")
    with col_flag:
        new_flag = not vp.is_flagged
        flag_btn = "🚩 Unflag" if vp.is_flagged else "🚩 Flag"
        if st.button(flag_btn, key="detail_flag"):
            analyzer.db.toggle_voiceprint_flag(vp_id, new_flag, "Manual flag" if new_flag else None)
            st.rerun()

    # Editable name/label.
    new_label = st.text_input("Name/Label", value=current_label, placeholder="e.g., Juan Pérez")
    if new_label != current_label:
        if st.button("💾 Save name"):
            analyzer.db.update_voiceprint_label(vp_id, new_label)
            st.success("Name saved!")
            st.rerun()

    # Stats row.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Times Seen", vp.times_seen)
    with col2:
        st.metric("Total Audio", f"{vp.total_audio_seconds:.0f}s")
    with col3:
        st.metric("First Seen", vp.first_seen.strftime('%Y-%m-%d') if vp.first_seen else '-')

    if vp.flag_reason:
        st.warning(f"Flag reason: {vp.flag_reason}")

    # Notes/Comments.  Use a real (hidden) label instead of "" — an empty
    # label triggers Streamlit's empty-label accessibility warning.
    st.markdown("#### 📝 Notes")
    current_notes = vp.notes if hasattr(vp, 'notes') and vp.notes else ""
    new_notes = st.text_area(
        "Notes",
        value=current_notes,
        placeholder="Add notes about this voiceprint...",
        height=80,
        label_visibility="collapsed",
    )
    if new_notes != current_notes:
        if st.button("💾 Save notes"):
            analyzer.db.update_voiceprint_notes(vp_id, new_notes)
            st.success("Notes saved!")
            st.rerun()

    # Per-day appearance chart for this voiceprint.
    st.markdown("#### 📈 Appearance Timeline")
    vp_timeline = analyzer.db.get_appearance_timeline(vp_id)
    if vp_timeline:
        render_appearance_timeline(vp_timeline)

    # Per-test appearance list.
    st.markdown("#### 📋 Appearances")
    appearances = analyzer.get_voiceprint_history(vp_id)

    for app in appearances:
        with st.container():
            cols = st.columns([1, 2, 1, 2])

            with cols[0]:
                date_str = app['date'][:10] if app['date'] else '-'
                st.markdown(f"**{date_str}**")

            with cols[1]:
                st.caption(f"{app['filename']}")

            with cols[2]:
                role_icon = "👤" if app['role'] == 'main' else "👥"
                st.markdown(f"{role_icon} {app['duration']:.0f}s")

            with cols[3]:
                # Audio clip of this appearance, when still on disk.
                if app['clip_path'] and os.path.exists(app['clip_path']):
                    with open(app['clip_path'], 'rb') as f:
                        st.audio(f.read(), format='audio/wav')
|
| 1058 |
+
|
| 1059 |
+
|
| 1060 |
+
def render_about_tab():
    """Render the About tab: technical explanations of each detection stage."""

    st.markdown("## How It Works")
    st.markdown("This tool analyzes audio recordings to detect fraud patterns in voice-based assessments.")

    st.markdown("---")

    # Section 1: Speaker Diarization
    st.markdown("### 🎭 Speaker Diarization")
    with st.container(border=True):
        text_col, art_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**What is it?**
Speaker diarization answers the question *"who spoke when?"* — it segments audio by speaker identity.

**How we do it:**
- Extract voice embeddings using **ECAPA-TDNN** neural network
- Cluster similar embeddings to group speech by speaker
- Label speakers as A, B, C... based on speaking time

**Key metric:**
`Speaking time` — total seconds each speaker talks
""")
        with art_col:
            st.markdown("""
```
Audio Timeline:
├─ Speaker A ████░░████░░
├─ Speaker B ░░░░██░░░░██
└─ Speaker C ░░░░░░░░██░░
```
""")

    st.markdown("")

    # Section 2: Voiceprint Matching
    st.markdown("### 🔐 Voiceprint Matching")
    with st.container(border=True):
        text_col, stat_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**What is it?**
A voiceprint is a unique numerical representation (embedding) of a person's voice characteristics.

**How matching works:**
1. Extract 192-dimensional embedding vector
2. Compare with stored voiceprints using **cosine similarity**
3. If similarity > threshold → same person

**Threshold:**
""")
            st.code("similarity_threshold = 0.80 # 80% match required", language="python")
        with stat_col:
            st.metric("Embedding Size", "192-dim")
            st.metric("Match Threshold", "80%")
            st.metric("Model", "ECAPA-TDNN")

    st.markdown("")

    # Section 3: Synthetic Voice Detection
    st.markdown("### 🤖 Synthetic Voice Detection")
    with st.container(border=True):
        st.markdown("""
**What we detect:**
| Type | Description | Indicators |
|------|-------------|------------|
| **TTS** | Text-to-Speech (ElevenLabs, etc.) | Flat pitch, regular timing, smooth spectrum |
| **Voice Clone** | AI-generated voice copy | Unnatural prosody, artifacts |
| **Playback** | Pre-recorded audio through speakers | Room acoustics, compression artifacts |

**Detection methods:**
""")
        pitch_col, timing_col, spectral_col = st.columns(3)
        with pitch_col:
            st.markdown("""
**Pitch Analysis**
```python
# TTS has very consistent pitch
pitch_cv < 0.08 # Coefficient of variation
```
""")
        with timing_col:
            st.markdown("""
**Timing Regularity**
```python
# TTS has robotic timing
timing_std < 0.05 # seconds
```
""")
        with spectral_col:
            st.markdown("""
**Spectral Smoothness**
```python
# Natural speech has texture
spectral_flux > threshold
```
""")

    st.markdown("")

    # Section 4: Wake Word Detection
    st.markdown("### 🎯 Wake Word Detection")
    with st.container(border=True):
        text_col, alert_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:**
Detect if someone is using voice assistants or getting external help during the test.

**Words we detect:**
""")
            st.code("""
WAKE_WORDS = [
    "alexa", "siri", "hey google", "ok google",
    "cortana", "hey chat", "transfer", "send money"
]
""", language="python")
        with alert_col:
            st.warning("🔴 **Alert triggered** when wake words detected")

    st.markdown("")

    # Section 5: Fraud Detection Module
    st.markdown("### 🕵️ Fraud Detection Module")
    with st.container(border=True):
        st.markdown("""
Three specialized detectors analyze speech patterns to identify potential cheating:
""")

        # Whisper Detection
        st.markdown("#### 🔇 Whisper Detection")
        text_col, code_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Detect low-volume background voices that may indicate someone is being prompted.

**How it works:**
- Analyzes audio energy in frames (25ms windows)
- Calculates spectral centroid (whispers have higher frequencies)
- Measures zero-crossing rate (breathy sounds have higher ZCR)
- Filters segments that overlap with main speaker
""")
        with code_col:
            st.code("""
# Whisper characteristics
energy < 30% of main speech
spectral_centroid > 0.15
zero_crossing_rate > 0.1
""", language="python")

        st.markdown("---")

        # Reading Pattern Detection
        st.markdown("#### 📖 Reading Pattern Detection")
        text_col, code_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Detect if someone is reading prepared answers vs speaking naturally.

**Indicators analyzed:**
| Indicator | Natural Speech | Reading |
|-----------|---------------|---------|
| Speech rate | Variable (CV > 0.15) | Constant |
| Filler words | 2+ per minute | Few/none |
| Pause pattern | Irregular | Regular |
| Self-corrections | Present | Absent |
""")
        with code_col:
            st.code("""
FILLER_WORDS = [
    'um', 'uh', 'like',
    'you know', 'basically',
    'i mean', 'sort of'
]
# < 2 fillers/min = suspicious
""", language="python")

        st.markdown("---")

        # Suspicious Pause Detection
        st.markdown("#### ⏸️ Suspicious Pause Detection")
        text_col, stat_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Identify abnormally long silences that may indicate looking up answers.

**Pause classification:**
| Duration | Classification |
|----------|---------------|
| < 2s | Natural thinking pause |
| 2-5s | Extended pause (warning) |
| > 5s | **Suspicious** - may indicate cheating |

**Context captured:** What was said before/after the pause
""")
        with stat_col:
            st.metric("Suspicious Threshold", "> 5 seconds")
            st.caption("Long silences may indicate:")
            st.markdown("""
- Looking up answers
- Receiving external help
- Reading from a source
- Searching on phone/computer
""")

    st.markdown("")

    # Section 6: Technology Stack
    st.markdown("### 🛠️ Technology Stack")
    with st.container(border=True):
        sb_col, models_col, thresh_col = st.columns(3)
        with sb_col:
            st.markdown("""
**🧠 SpeechBrain**
- Open-source speech toolkit
- PyTorch-based
- Pre-trained models
- [speechbrain.github.io](https://speechbrain.github.io)
""")
        with models_col:
            st.markdown("""
**🎤 Models Used**
- `spkrec-ecapa-voxceleb` — Speaker embedding
- `vad-crdnn-libriparty` — Voice Activity Detection
- `asr-wav2vec2` — Transcription
""")
        with thresh_col:
            st.markdown("""
**📊 Thresholds**
| Parameter | Value |
|-----------|-------|
| Min audio | 20s |
| Voice match | 80% |
| Synthetic | 45% |
| Voice sample | 10s |
""")

    st.markdown("")

    # Section 7: Flags & Alerts
    st.markdown("### 🚨 Flags & Alert System")
    with st.container(border=True):
        st.markdown("""
| Icon | Status | Meaning |
|------|--------|---------|
| ✅ | OK | Voice appears normal, seen 1 time |
| 🟡 | Review | Voice seen in 2-3 tests — verify identity |
| 🚨 | Flagged | Voice seen 4+ times OR manually flagged — investigate |

**Detection indicators:**
| Icon | Detection | Description |
|------|-----------|-------------|
| 🔊 | Playback | Audio played through speakers |
| 📖 | Reading | Unnatural speech rhythm (reading prepared text) |
| 🔇 | Whispers | Background voices detected |
| ⏸️ | Long Pauses | Silences > 5 seconds |
| 🔴 | Wake Words | "Alexa", "Siri", etc. detected |

**Auto-flag conditions:**
- Same voice in 4+ different tests
- High synthetic voice score (>45%)
- Wake words detected during test
- Multiple fraud indicators triggered
""")

    st.markdown("---")
    st.caption("Built with SpeechBrain, Streamlit, and PyTorch · [GitHub](https://github.com/daasime/sop-audio-analyzer)")
|
| 1329 |
+
|
| 1330 |
+
|
| 1331 |
+
def main():
    """Streamlit entry point: login gate, sidebar session controls, tab layout.

    Renders nothing past the login form until ``check_login()`` reports an
    authenticated session. Side effects: mutates ``st.session_state`` on
    logout and triggers a rerun.
    """
    # Check login first — bail out early so unauthenticated users see only
    # whatever check_login() itself rendered.
    if not check_login():
        return

    st.title("🎙️ Test Integrity Analysis")
    st.markdown("Monitor and review voice authentication results")

    # Logout button in sidebar.
    with st.sidebar:
        # Plain string: the original used an f-string with no placeholders
        # (ruff F541). No username is tracked in session state to display
        # here — TODO confirm whether one should be.
        st.markdown("**Logged in**")
        if st.button("🚪 Logout"):
            st.session_state['authenticated'] = False
            st.rerun()

    # Main navigation tabs; each renderer owns its tab's full content.
    tab1, tab2, tab3 = st.tabs(["Analyzer", "Database", "About"])

    with tab1:
        render_analyzer_tab()

    with tab2:
        render_database_tab()

    with tab3:
        render_about_tab()


if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,28 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML - compatible with Python 3.11 (for HF Spaces)
|
| 2 |
+
torch==2.5.1
|
| 3 |
+
torchaudio==2.5.1
|
| 4 |
+
speechbrain>=1.0.0
|
| 5 |
+
|
| 6 |
+
# Audio processing
|
| 7 |
+
librosa>=0.10.0
|
| 8 |
+
soundfile>=0.12.1
|
| 9 |
+
pydub>=0.25.1
|
| 10 |
+
|
| 11 |
+
# Transcription
|
| 12 |
+
openai-whisper>=20231117
|
| 13 |
+
|
| 14 |
+
# Scientific computing
|
| 15 |
+
numpy>=1.24.0
|
| 16 |
+
scipy>=1.10.0
|
| 17 |
+
scikit-learn>=1.3.0
|
| 18 |
+
|
| 19 |
+
# Database
|
| 20 |
+
sqlalchemy>=2.0.0
|
| 21 |
+
|
| 22 |
+
# UI
|
| 23 |
+
streamlit>=1.29.0
|
| 24 |
+
streamlit-webrtc>=0.47.0
|
| 25 |
+
plotly>=5.18.0
|
| 26 |
+
|
| 27 |
+
# Utilities
|
| 28 |
+
python-dotenv>=1.0.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Patch huggingface_hub to handle the deprecated `use_auth_token` parameter.
# This is needed because speechbrain still calls the old API, while newer
# huggingface_hub releases only accept `token`.
import huggingface_hub


def _normalize_token_kwarg(kwargs):
    """Translate the legacy `use_auth_token` kwarg to `token` in place.

    An explicitly supplied `token` wins: the original patch unconditionally
    overwrote it with the deprecated value, which could silently replace a
    valid credential.
    """
    if 'use_auth_token' in kwargs:
        legacy_value = kwargs.pop('use_auth_token')
        kwargs.setdefault('token', legacy_value)
    return kwargs


_original_hf_hub_download = huggingface_hub.hf_hub_download


def _patched_hf_hub_download(*args, **kwargs):
    """Drop-in wrapper around hf_hub_download that accepts the legacy kwarg."""
    return _original_hf_hub_download(*args, **_normalize_token_kwarg(kwargs))


huggingface_hub.hf_hub_download = _patched_hf_hub_download

# Also patch snapshot_download if this huggingface_hub version exposes it.
if hasattr(huggingface_hub, 'snapshot_download'):
    _original_snapshot_download = huggingface_hub.snapshot_download

    def _patched_snapshot_download(*args, **kwargs):
        """Drop-in wrapper around snapshot_download that accepts the legacy kwarg."""
        return _original_snapshot_download(*args, **_normalize_token_kwarg(kwargs))

    huggingface_hub.snapshot_download = _patched_snapshot_download

from .analyzer import AudioAnalyzer, AnalysisResult, SpeakerResult

__all__ = ['AudioAnalyzer', 'AnalysisResult', 'SpeakerResult']
|
src/analyzer.py
ADDED
|
@@ -0,0 +1,597 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main Audio Analyzer - orchestrates all analysis phases.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import uuid
|
| 6 |
+
import json
|
| 7 |
+
import tempfile
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Dict, List, Optional, Callable
|
| 10 |
+
from dataclasses import dataclass, asdict
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
import torchaudio
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def to_python_type(obj):
    """Recursively convert numpy types to Python native types for JSON serialization.

    json.dumps cannot handle numpy scalars or arrays; this walks dicts,
    lists and tuples, converting every numpy value it finds. Tuples are
    returned as lists (matching what json.dumps emits for sequences anyway).
    Unrecognized objects are returned unchanged.
    """
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: to_python_type(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        # Fix: tuples previously fell through unconverted, so a tuple of
        # numpy scalars would still break json.dumps downstream.
        return [to_python_type(i) for i in obj]
    return obj
|
| 31 |
+
|
| 32 |
+
from .phase1_foundation import (
|
| 33 |
+
AudioPreprocessor,
|
| 34 |
+
VoiceActivityDetector,
|
| 35 |
+
SpeakerDiarizer,
|
| 36 |
+
VoiceprintExtractor,
|
| 37 |
+
VoiceprintResult
|
| 38 |
+
)
|
| 39 |
+
from .phase2_background import BackgroundAnalyzer, BackgroundAnomaly
|
| 40 |
+
from .phase6_synthetic import SyntheticDetector, WakeWordDetector, PlaybackDetector
|
| 41 |
+
from .fraud_detection import (
|
| 42 |
+
WhisperDetector, WhisperResult,
|
| 43 |
+
ReadingPatternAnalyzer, ReadingPatternResult,
|
| 44 |
+
SuspiciousPauseDetector, PauseResult
|
| 45 |
+
)
|
| 46 |
+
from .database import Database
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class SpeakerResult:
    """Result for a detected speaker.

    One instance per distinct voice found by diarization; the speaker with
    the most talk time gets role "main", everyone else "additional".
    """
    voiceprint_id: str            # stable ID used to track this voice across tests
    label: str                    # diarizer-assigned label for this recording
    role: str  # "main" or "additional"
    total_seconds: float          # total speaking time attributed to this speaker
    quality: str                  # human-readable voiceprint quality label
    is_synthetic: bool            # True when the combined synthetic score crosses threshold
    synthetic_score: float        # combined synthetic/TTS score
    is_playback: bool = False     # True when playback (speaker replay) was detected
    playback_score: float = 0.0
    playback_indicators: Optional[List[str]] = None   # None until populated by a detector
    times_seen: int = 1           # number of tests this voiceprint has appeared in
    is_flagged: bool = False      # manually flagged OR auto-flagged (seen in 4+ tests)
    segments: Optional[List[dict]] = None   # [{'start': float, 'end': float}, ...]
    clip_path: Optional[str] = None  # Path to audio sample for this speaker
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@dataclass
class AnalysisResult:
    """Complete analysis result for one audio file.

    Aggregates the outputs of every pipeline phase (speakers, background,
    wake words, playback, fraud heuristics) into a single JSON-serializable
    record.
    """
    test_id: str                # unique ID for this analysis run
    filename: str               # original uploaded filename
    duration_seconds: float     # duration of the normalized audio
    analyzed_at: str            # ISO-8601 timestamp of when analysis ran

    # Speakers
    main_speaker: Optional[SpeakerResult]      # None when no speaker yielded an embedding
    additional_speakers: List[SpeakerResult]

    # Background
    background_anomalies: List[dict]           # dicts with start/end/type/amplitude_db/confidence

    # Wake words
    wake_words: List[dict]                     # dicts with word/assistant/time/confidence/context
    assistant_responses: List[dict]

    # Prompt voice (audio from question prompts)
    prompt_voice_detected: bool
    prompt_voice_seconds: float

    # Playback detection (global)
    playback_detected: bool = False
    playback_score: float = 0.0
    playback_indicators: Optional[List[str]] = None

    # Fraud detection - Whisper (background voices)
    whisper_detected: bool = False
    whisper_instances: Optional[List[dict]] = None     # dicts with start/end/confidence

    # Fraud detection - Reading pattern
    reading_pattern_detected: bool = False
    reading_confidence: float = 0.0
    reading_indicators: Optional[List[str]] = None

    # Fraud detection - Suspicious pauses
    suspicious_pauses_detected: bool = False
    suspicious_pauses: Optional[List[dict]] = None     # dicts with start/end/duration/context
    longest_pause: float = 0.0

    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dictionary.

        Every value is passed through explicit casts or to_python_type so
        numpy scalars never leak into the output; Optional list fields fall
        back to [] instead of None.
        """
        result = {
            'test_id': self.test_id,
            'filename': self.filename,
            'duration_seconds': float(self.duration_seconds),
            'analyzed_at': self.analyzed_at,
            # asdict recurses into the nested SpeakerResult dataclasses.
            'main_speaker': to_python_type(asdict(self.main_speaker)) if self.main_speaker else None,
            'additional_speakers': [to_python_type(asdict(s)) for s in self.additional_speakers],
            'background_anomalies': to_python_type(self.background_anomalies),
            'wake_words': to_python_type(self.wake_words),
            'assistant_responses': to_python_type(self.assistant_responses),
            'prompt_voice_detected': bool(self.prompt_voice_detected),
            'prompt_voice_seconds': float(self.prompt_voice_seconds),
            'playback_detected': bool(self.playback_detected),
            'playback_score': float(self.playback_score),
            'playback_indicators': self.playback_indicators or [],
            # Fraud detection fields
            'whisper_detected': bool(self.whisper_detected),
            'whisper_instances': to_python_type(self.whisper_instances or []),
            'reading_pattern_detected': bool(self.reading_pattern_detected),
            'reading_confidence': float(self.reading_confidence),
            'reading_indicators': self.reading_indicators or [],
            'suspicious_pauses_detected': bool(self.suspicious_pauses_detected),
            'suspicious_pauses': to_python_type(self.suspicious_pauses or []),
            'longest_pause': float(self.longest_pause)
        }
        return result

    def to_json(self) -> str:
        """Convert to a pretty-printed JSON string."""
        return json.dumps(self.to_dict(), indent=2)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class AudioAnalyzer:
    """Main analyzer that orchestrates all phases.

    Pipeline order (see analyze): preprocess -> VAD -> diarization ->
    voiceprint extraction + synthetic detection per speaker -> background
    anomalies -> playback detection -> wake words -> fraud heuristics ->
    result compilation + persistence. Heavy model components are lazily
    constructed via properties so importing this class stays cheap.
    """

    def __init__(self, db_path: str = "data/db/voiceprints.db",
                 clips_dir: str = "data/clips",
                 device: str = None):
        """
        Initialize analyzer.

        Args:
            db_path: Path to SQLite database
            clips_dir: Directory to save audio clips
            device: torch device (cuda/cpu)
        """
        self.device = device
        self.clips_dir = clips_dir
        os.makedirs(clips_dir, exist_ok=True)

        # Initialize database
        self.db = Database(db_path)

        # Initialize components (lazy loaded — each stays None until first use)
        self._preprocessor = None
        self._vad = None
        self._diarizer = None
        self._voiceprint = None
        self._background = None
        self._synthetic = None
        self._playback = None
        self._wake_words = None
        # Fraud detectors
        self._whisper_detector = None
        self._reading_pattern = None
        self._pause_detector = None

    @property
    def preprocessor(self):
        # Lazily constructed audio normalizer.
        if self._preprocessor is None:
            self._preprocessor = AudioPreprocessor()
        return self._preprocessor

    @property
    def vad(self):
        # Lazily constructed voice activity detector.
        if self._vad is None:
            self._vad = VoiceActivityDetector(device=self.device)
        return self._vad

    @property
    def diarizer(self):
        # Lazily constructed speaker diarizer.
        if self._diarizer is None:
            self._diarizer = SpeakerDiarizer(device=self.device)
        return self._diarizer

    @property
    def voiceprint_extractor(self):
        # Lazily constructed speaker-embedding extractor.
        if self._voiceprint is None:
            self._voiceprint = VoiceprintExtractor(device=self.device)
        return self._voiceprint

    @property
    def background_analyzer(self):
        # Lazily constructed background-anomaly analyzer.
        if self._background is None:
            self._background = BackgroundAnalyzer()
        return self._background

    @property
    def synthetic_detector(self):
        # Lazily constructed synthetic-voice detector.
        if self._synthetic is None:
            self._synthetic = SyntheticDetector(device=self.device)
        return self._synthetic

    @property
    def playback_detector(self):
        # Lazily constructed speaker-playback/replay detector.
        if self._playback is None:
            self._playback = PlaybackDetector()
        return self._playback

    @property
    def wake_word_detector(self):
        # Lazily constructed wake-word detector (also supplies transcription).
        if self._wake_words is None:
            self._wake_words = WakeWordDetector(model_size="base")
        return self._wake_words

    @property
    def whisper_detector(self):
        # Lazily constructed background-whisper detector.
        if self._whisper_detector is None:
            self._whisper_detector = WhisperDetector()
        return self._whisper_detector

    @property
    def reading_pattern_analyzer(self):
        # Lazily constructed reading-cadence analyzer.
        if self._reading_pattern is None:
            self._reading_pattern = ReadingPatternAnalyzer()
        return self._reading_pattern

    @property
    def pause_detector(self):
        # Lazily constructed suspicious-pause detector.
        if self._pause_detector is None:
            self._pause_detector = SuspiciousPauseDetector()
        return self._pause_detector

    def analyze(self, audio_path: str,
                test_id: str = None,
                progress_callback: Callable[[str, int], None] = None) -> AnalysisResult:
        """
        Run full analysis on audio file.

        Args:
            audio_path: Path to audio file
            test_id: Optional test ID (generated if not provided)
            progress_callback: Optional callback for progress updates;
                called as progress_callback(message, percent)

        Returns:
            AnalysisResult with all findings

        Raises:
            ValueError: if the audio is shorter than the 20s minimum.

        Side effects: writes speaker clips under self.clips_dir and persists
        voiceprints + the full analysis to self.db.
        """
        def update_progress(msg: str, pct: int):
            # No-op when no callback was supplied.
            if progress_callback:
                progress_callback(msg, pct)

        # Generate test ID (timestamp plus a short random suffix for uniqueness)
        if test_id is None:
            test_id = f"test_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"

        filename = os.path.basename(audio_path)

        # Step 1: Preprocess (resample/normalize; returns waveform + metadata)
        update_progress("Preprocessing audio...", 5)
        waveform, sample_rate, metadata = self.preprocessor.process_file(audio_path)
        duration = metadata['normalized_duration']

        # Validate minimum audio duration (20 seconds)
        MIN_DURATION = 20.0
        if duration < MIN_DURATION:
            raise ValueError(f"Audio too short: {duration:.1f}s. Minimum required: {MIN_DURATION:.0f}s")

        # Save normalized audio to temp file for components that take a path.
        # delete=False so the path survives the with-block; cleaned up in finally.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        torchaudio.save(temp_path, waveform, sample_rate)

        try:
            # Step 2: VAD — find speech segments
            update_progress("Detecting voice activity...", 15)
            speech_segments = self.vad.detect(temp_path)

            # Step 3: Speaker Diarization — who spoke when
            update_progress("Identifying speakers...", 30)
            speakers = self.diarizer.diarize(temp_path, speech_segments)

            # Step 4: Process speakers
            update_progress("Extracting voiceprints...", 45)
            main_speaker_result = None
            additional_speakers = []

            speaker_list = list(speakers.values())

            # First pass: recalculate actual speaking time for all speakers
            # from their segments (the diarizer's own total may drift).
            for speaker_info in speaker_list:
                actual_speaking_time = sum(seg.end - seg.start for seg in speaker_info.segments)
                actual_speaking_time = min(actual_speaking_time, duration)  # Cap to audio duration
                speaker_info.total_seconds = actual_speaking_time

            # Re-sort by speaking time (most speaking = main speaker)
            speaker_list = sorted(speaker_list, key=lambda s: s.total_seconds, reverse=True)

            for i, speaker_info in enumerate(speaker_list):
                # Speakers with no embedding are skipped entirely — they get
                # neither a DB record nor a SpeakerResult.
                if speaker_info.embedding is not None:
                    vp_result = self.voiceprint_extractor.extract_from_embedding(
                        speaker_info.embedding,
                        speaker_info.total_seconds
                    )

                    # Check for synthetic voice on this speaker's own audio
                    synthetic_result = self._detect_synthetic_for_speaker(
                        waveform, sample_rate, speaker_info
                    )

                    role = "main" if i == 0 else "additional"

                    # Look for a previously-seen matching voiceprint.
                    # NOTE(review): `similarity` is unused — confirm whether it
                    # should be surfaced in the result.
                    existing_vp, similarity = self.db.find_matching_voiceprint(
                        vp_result.to_bytes(),
                        threshold=0.75
                    )

                    if existing_vp:
                        vp_id = existing_vp.id
                        times_seen = existing_vp.times_seen + 1
                        # Auto-flag after 4 sightings, or keep an existing flag.
                        is_flagged = existing_vp.is_flagged or times_seen >= 4
                    else:
                        vp_id = vp_result.voiceprint_id
                        times_seen = 1
                        is_flagged = False

                    # Save a listening clip for this speaker (~10s target)
                    clip_path = self._save_speaker_clip(
                        waveform, sample_rate, speaker_info, test_id, vp_id
                    )

                    # Persist the voiceprint/appearance for this test
                    self.db.add_voiceprint(
                        vp_id=vp_id,
                        embedding=vp_result.to_bytes(),
                        test_id=test_id,
                        filename=filename,
                        role=role,
                        duration=speaker_info.total_seconds,
                        clip_path=clip_path
                    )

                    speaker_result = SpeakerResult(
                        voiceprint_id=vp_id,
                        label=speaker_info.speaker_id,
                        role=role,
                        total_seconds=speaker_info.total_seconds,
                        quality=self.voiceprint_extractor.quality_label(vp_result.quality_score),
                        is_synthetic=synthetic_result.is_synthetic,
                        synthetic_score=synthetic_result.score,
                        times_seen=times_seen,
                        is_flagged=is_flagged,
                        segments=[{'start': s.start, 'end': s.end} for s in speaker_info.segments],
                        clip_path=clip_path
                    )

                    if i == 0:
                        main_speaker_result = speaker_result
                    else:
                        additional_speakers.append(speaker_result)

            # Step 5: Background Analysis (non-speech anomalies)
            update_progress("Analyzing background audio...", 55)
            waveform_np = waveform.squeeze().numpy()
            anomalies = self.background_analyzer.detect_anomalies(
                waveform_np, speech_segments
            )

            # Step 6: Playback Detection (detect if audio is from speakers)
            update_progress("Detecting playback/replay...", 65)
            playback_result = self.playback_detector.detect(waveform_np)

            # Step 7: Wake Word Detection (also yields transcription used below)
            update_progress("Detecting wake words...", 70)
            wake_analysis = self.wake_word_detector.analyze(temp_path)

            # Step 8: Fraud Detection - Whisper, Reading Pattern, Suspicious Pauses
            update_progress("Running fraud detection...", 80)

            # 8a: Whisper detection (background voices outside main speaker's turns)
            main_speaker_segs = []
            if main_speaker_result and main_speaker_result.segments:
                main_speaker_segs = main_speaker_result.segments
            whisper_result = self.whisper_detector.detect(
                waveform_np, sample_rate, main_speaker_segs
            )

            # 8b: Reading pattern detection (uses wake word transcription)
            word_timestamps = wake_analysis.get('word_timestamps', [])
            transcription = wake_analysis.get('transcription', '')
            reading_result = self.reading_pattern_analyzer.analyze(
                transcription, word_timestamps, duration
            )

            # 8c: Suspicious pause detection on the raw speech timeline
            speech_segments_dict = [{'start': s.start, 'end': s.end} for s in speech_segments]
            pause_result = self.pause_detector.detect(speech_segments_dict, duration)

            # Step 9: Compile results
            update_progress("Compiling results...", 90)

            # Detect prompt voice (simplified heuristic: any speech starting in
            # the first 5 seconds is assumed to possibly be the question prompt)
            prompt_seconds = sum(
                s.duration for s in speech_segments
                if s.start < 5.0  # First 5 seconds
            )

            result = AnalysisResult(
                test_id=test_id,
                filename=filename,
                duration_seconds=duration,
                analyzed_at=datetime.now().isoformat(),
                main_speaker=main_speaker_result,
                additional_speakers=additional_speakers,
                background_anomalies=[
                    {
                        'start': a.start,
                        'end': a.end,
                        'type': a.anomaly_type.value,
                        'amplitude_db': a.amplitude_db,
                        'confidence': a.confidence
                    }
                    for a in anomalies
                ],
                wake_words=[
                    {
                        'word': w.word,
                        'assistant': w.assistant,
                        'time': w.time,
                        'confidence': w.confidence,
                        'context': w.context
                    }
                    for w in wake_analysis['wake_words']
                ],
                assistant_responses=wake_analysis['assistant_responses'],
                prompt_voice_detected=prompt_seconds > 0,
                prompt_voice_seconds=prompt_seconds,
                playback_detected=playback_result.is_playback,
                playback_score=playback_result.score,
                playback_indicators=playback_result.indicators,
                # Fraud detection results
                whisper_detected=whisper_result.detected,
                whisper_instances=[
                    {'start': w.start, 'end': w.end, 'confidence': w.confidence}
                    for w in whisper_result.instances
                ],
                reading_pattern_detected=reading_result.is_reading,
                reading_confidence=reading_result.confidence,
                reading_indicators=reading_result.indicators,
                suspicious_pauses_detected=pause_result.detected,
                suspicious_pauses=[
                    {'start': p.start, 'end': p.end, 'duration': p.duration, 'context': p.context}
                    for p in pause_result.pauses
                ],
                longest_pause=pause_result.longest_pause
            )

            # Save the full analysis record to the database
            self.db.save_test_analysis(
                test_id=test_id,
                filename=filename,
                duration=duration,
                results=result.to_dict()
            )

            update_progress("Analysis complete!", 100)

            return result

        finally:
            # Cleanup temp file even when analysis raised
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def _detect_synthetic_for_speaker(self, waveform, sample_rate, speaker_info):
        """Run synthetic detection on speaker's audio.

        Combines both SyntheticDetector (voice characteristics) and
        PlaybackDetector (TTS/speaker playback) for better detection.
        Returns a SyntheticResult built from the combined score.
        """
        from .phase6_synthetic import SyntheticResult

        # Concatenate speaker segments
        segments_audio = []

        for seg in speaker_info.segments[:5]:  # Limit to first 5 segments for speed
            start_sample = int(seg.start * sample_rate)
            end_sample = int(seg.end * sample_rate)
            # Guard against segments extending past the waveform end
            if end_sample <= waveform.shape[1]:
                segments_audio.append(waveform[:, start_sample:end_sample])

        if not segments_audio:
            # Nothing usable — report a zero score rather than failing.
            return SyntheticResult.from_score(0.0)

        speaker_audio = np.concatenate([s.squeeze().numpy() for s in segments_audio])

        # Run both detectors on speaker's audio
        synthetic_result = self.synthetic_detector.detect(speaker_audio)
        playback_result = self.playback_detector.detect(speaker_audio)

        # Combine scores: if either detects synthetic/TTS, flag it.
        # Playback with TTS indicators is strong evidence of synthetic.
        tts_indicators = ['tts_flat_pitch', 'tts_low_pitch_variation', 'tts_regular_timing',
                          'smooth_spectrum', 'slightly_smooth_spectrum']
        has_tts_indicators = any(ind in playback_result.indicators for ind in tts_indicators)

        # Calculate combined score
        if has_tts_indicators:
            # Strong TTS evidence from playback detector
            combined_score = max(synthetic_result.score, playback_result.score * 0.9)
        else:
            # Weight synthetic detector more, but consider playback
            combined_score = synthetic_result.score * 0.7 + playback_result.score * 0.3

        # Boost if both detectors agree (capped at 1.0)
        if synthetic_result.score > 0.4 and playback_result.score > 0.4:
            combined_score = min(1.0, combined_score * 1.2)

        return SyntheticResult.from_score(combined_score, threshold=0.45)

    def _save_speaker_clip(self, waveform, sample_rate, speaker_info, test_id, vp_id):
        """Save audio clip for a speaker (minimum 10 seconds for voice sample).

        Returns the saved clip path, or None when the speaker has no usable
        segments.
        """
        segments = sorted(speaker_info.segments, key=lambda s: s.start)

        if not segments:
            return None

        # Merge overlapping segments first so the clip has no duplicated audio
        merged_segments = []
        for seg in segments:
            if merged_segments and seg.start <= merged_segments[-1][1]:
                # Overlap - extend previous segment
                merged_segments[-1] = (merged_segments[-1][0], max(merged_segments[-1][1], seg.end))
            else:
                merged_segments.append((seg.start, seg.end))

        # Concatenate segments until we have at least 10 seconds for voice sample
        target_duration = 10.0
        clips = []
        total_duration = 0.0

        for start, end in merged_segments:
            start_sample = int(start * sample_rate)
            end_sample = int(end * sample_rate)

            # Skip segments that run past the waveform end
            if end_sample <= waveform.shape[1]:
                clips.append(waveform[:, start_sample:end_sample])
                total_duration += (end - start)

            if total_duration >= target_duration:
                break

        if not clips:
            return None

        # Concatenate all clips along the time axis
        clip = torch.cat(clips, dim=1)

        # Save clip; filename encodes test, voiceprint and clip length
        clip_filename = f"{test_id}_{vp_id}_{total_duration:.1f}s.wav"
        clip_path = os.path.join(self.clips_dir, clip_filename)

        torchaudio.save(clip_path, clip, sample_rate)

        return clip_path

    def get_voiceprint_history(self, vp_id: str) -> List[dict]:
        """Get appearance history for a voiceprint as plain dicts."""
        appearances = self.db.get_voiceprint_appearances(vp_id)
        return [
            {
                'test_id': a.test_id,
                'filename': a.test_filename,
                'role': a.role,
                'duration': a.duration_seconds,
                'date': a.detected_at.isoformat() if a.detected_at else None,
                'clip_path': a.clip_path
            }
            for a in appearances
        ]

    def get_database_stats(self) -> dict:
        """Get database statistics (delegates to the Database layer)."""
        return self.db.get_stats()
|
src/database/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import Database, Voiceprint, VoiceprintAppearance, TestAnalysis
|
| 2 |
+
|
| 3 |
+
__all__ = ['Database', 'Voiceprint', 'VoiceprintAppearance', 'TestAnalysis']
|
src/database/models.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database models for voiceprint tracking.
|
| 3 |
+
"""
|
| 4 |
+
from sqlalchemy import create_engine, Column, String, Float, Integer, DateTime, Boolean, ForeignKey, LargeBinary
|
| 5 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 6 |
+
from sqlalchemy.orm import sessionmaker, relationship
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
Base = declarative_base()
|
| 11 |
+
|
| 12 |
+
class Voiceprint(Base):
    """Unique voice identity.

    One row per distinct speaker; repeat detections update the counters
    (see Database.add_voiceprint) instead of inserting new rows.
    """
    __tablename__ = 'voiceprints'

    id = Column(String(20), primary_key=True)  # vp_xxxxxxxx
    embedding = Column(LargeBinary, nullable=False)  # 192-dim vector as bytes (read back as float32 by find_matching_voiceprint)
    first_seen = Column(DateTime, default=datetime.utcnow)
    times_seen = Column(Integer, default=1)  # number of tests this voice appeared in
    total_audio_seconds = Column(Float, default=0.0)  # cumulative speech attributed to this voice
    is_flagged = Column(Boolean, default=False)  # set automatically after repeated appearances, or manually
    flag_reason = Column(String(200), nullable=True)  # human-readable reason when is_flagged is True

    # User-editable fields
    label = Column(String(100), nullable=True)  # Human-friendly name (e.g., "Juan Pérez")
    notes = Column(String(1000), nullable=True)  # User comments/notes

    # Relationships
    appearances = relationship("VoiceprintAppearance", back_populates="voiceprint")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class VoiceprintAppearance(Base):
    """Track where a voiceprint appears.

    One row per (voiceprint, test) detection, recorded by
    Database.add_voiceprint.
    """
    __tablename__ = 'voiceprint_appearances'

    id = Column(Integer, primary_key=True, autoincrement=True)
    voiceprint_id = Column(String(20), ForeignKey('voiceprints.id'), nullable=False)
    test_id = Column(String(50), nullable=False)  # test in which the voice was heard
    test_filename = Column(String(200), nullable=False)  # original audio filename of that test
    role = Column(String(20), nullable=False)  # 'main' or 'additional'
    duration_seconds = Column(Float, nullable=False)  # speech attributed to this voice in this test
    detected_at = Column(DateTime, default=datetime.utcnow)
    clip_path = Column(String(500), nullable=True)  # Path to extracted audio clip

    # Relationships
    voiceprint = relationship("Voiceprint", back_populates="appearances")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class TestAnalysis(Base):
    """Store analysis results per test.

    Summary columns are denormalized for quick listing; the full analysis
    payload is kept verbatim in results_json.
    """
    __tablename__ = 'test_analyses'

    id = Column(Integer, primary_key=True, autoincrement=True)
    test_id = Column(String(50), unique=True, nullable=False)  # one analysis row per test
    filename = Column(String(200), nullable=False)
    duration_seconds = Column(Float, nullable=False)
    analyzed_at = Column(DateTime, default=datetime.utcnow)

    # Main speaker
    main_voiceprint_id = Column(String(20), ForeignKey('voiceprints.id'), nullable=True)
    main_speech_seconds = Column(Float, default=0.0)
    main_quality = Column(String(20), nullable=True)

    # Detection counts
    additional_speakers_count = Column(Integer, default=0)
    background_anomalies_count = Column(Integer, default=0)
    wake_words_count = Column(Integer, default=0)

    # Synthetic detection
    synthetic_score = Column(Float, default=0.0)
    is_synthetic = Column(Boolean, default=False)

    # JSON results (full analysis) — unbounded String is fine for SQLite (stored as TEXT)
    results_json = Column(String, nullable=True)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class Database:
    """SQLite-backed store for voiceprints, their appearances and test analyses.

    Every public method opens a short-lived session and closes it before
    returning.  Sessions are created with ``expire_on_commit=False`` so the
    ORM objects handed back to callers keep their already-loaded attributes
    after the session is closed; with the default setting, reading an
    attribute of the object returned by :meth:`add_voiceprint` (whose
    session commits) would raise ``DetachedInstanceError``.
    """

    def __init__(self, db_path: str = "data/db/voiceprints.db"):
        """Create/open the SQLite file at ``db_path`` and ensure all tables exist."""
        self.db_path = db_path
        os.makedirs(os.path.dirname(db_path), exist_ok=True)
        self.engine = create_engine(f'sqlite:///{db_path}')
        Base.metadata.create_all(self.engine)
        # expire_on_commit=False keeps returned ORM objects readable after close().
        self.Session = sessionmaker(bind=self.engine, expire_on_commit=False)

    def get_session(self):
        """Return a new session; the caller is responsible for closing it."""
        return self.Session()

    def add_voiceprint(self, vp_id: str, embedding: bytes,
                       test_id: str, filename: str, role: str,
                       duration: float, clip_path: str = None):
        """Add or update a voiceprint and record its appearance in a test.

        Args:
            vp_id: Stable voiceprint identifier (``vp_xxxxxxxx``).
            embedding: Speaker embedding serialized as raw bytes.
            test_id: Identifier of the test the voice was heard in.
            filename: Audio filename of that test.
            role: 'main' or 'additional'.
            duration: Seconds of speech attributed to this voice in the test.
            clip_path: Optional path to an extracted audio clip.

        Returns:
            The new or updated Voiceprint row.
        """
        session = self.get_session()
        try:
            # Check if voiceprint exists
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()

            if vp:
                # Update existing identity counters.
                vp.times_seen += 1
                vp.total_audio_seconds += duration

                # Auto-flag voices that keep reappearing across tests.
                if vp.times_seen >= 4:
                    vp.is_flagged = True
                    vp.flag_reason = f"Seen in {vp.times_seen} tests"
            else:
                # First time we see this voice: create the identity row.
                vp = Voiceprint(
                    id=vp_id,
                    embedding=embedding,
                    total_audio_seconds=duration
                )
                session.add(vp)

            # Record this specific appearance.
            appearance = VoiceprintAppearance(
                voiceprint_id=vp_id,
                test_id=test_id,
                test_filename=filename,
                role=role,
                duration_seconds=duration,
                clip_path=clip_path
            )
            session.add(appearance)

            session.commit()
            return vp
        except Exception:
            session.rollback()
            raise  # bare raise preserves the original traceback
        finally:
            session.close()

    def get_voiceprint(self, vp_id: str):
        """Get voiceprint by ID, or None if unknown."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter_by(id=vp_id).first()
        finally:
            session.close()

    def get_all_voiceprints(self):
        """Get all voiceprints, most frequently seen first."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).order_by(Voiceprint.times_seen.desc()).all()
        finally:
            session.close()

    def get_flagged_voiceprints(self):
        """Get all voiceprints currently flagged (automatically or manually)."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter_by(is_flagged=True).all()
        finally:
            session.close()

    def get_multi_appearance_voiceprints(self, min_appearances: int = 2):
        """Get voiceprints seen in at least ``min_appearances`` tests."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter(
                Voiceprint.times_seen >= min_appearances
            ).order_by(Voiceprint.times_seen.desc()).all()
        finally:
            session.close()

    def get_voiceprint_appearances(self, vp_id: str):
        """Get all appearances of a voiceprint, newest first."""
        session = self.get_session()
        try:
            return session.query(VoiceprintAppearance).filter_by(
                voiceprint_id=vp_id
            ).order_by(VoiceprintAppearance.detected_at.desc()).all()
        finally:
            session.close()

    def find_matching_voiceprint(self, embedding: bytes, threshold: float = 0.80):
        """Find an existing voiceprint matching ``embedding``.

        Performs a linear scan with cosine similarity and returns the first
        voiceprint at or above ``threshold`` as ``(voiceprint, similarity)``,
        or ``(None, 0.0)`` when nothing matches.  Degenerate (all-zero)
        embeddings are skipped instead of dividing by zero.
        """
        import numpy as np

        session = self.get_session()
        try:
            new_emb = np.frombuffer(embedding, dtype=np.float32)
            new_norm = np.linalg.norm(new_emb)  # hoisted: invariant across the scan

            for vp in session.query(Voiceprint).all():
                stored_emb = np.frombuffer(vp.embedding, dtype=np.float32)

                # Cosine similarity with a zero-norm guard.
                denom = new_norm * np.linalg.norm(stored_emb)
                if denom == 0:
                    continue
                similarity = np.dot(new_emb, stored_emb) / denom

                if similarity >= threshold:
                    return vp, similarity

            return None, 0.0
        finally:
            session.close()

    def save_test_analysis(self, test_id: str, filename: str,
                           duration: float, results: dict):
        """Save a full test analysis (summary columns + raw JSON payload).

        Raises on a duplicate ``test_id`` (unique constraint).
        """
        import json

        session = self.get_session()
        try:
            analysis = TestAnalysis(
                test_id=test_id,
                filename=filename,
                duration_seconds=duration,
                main_voiceprint_id=results.get('main_voiceprint_id'),
                main_speech_seconds=results.get('main_speech_seconds', 0),
                main_quality=results.get('main_quality'),
                additional_speakers_count=len(results.get('additional_speakers', [])),
                background_anomalies_count=len(results.get('background_anomalies', [])),
                wake_words_count=len(results.get('wake_words', [])),
                synthetic_score=results.get('synthetic_score', 0),
                is_synthetic=results.get('is_synthetic', False),
                results_json=json.dumps(results)
            )
            session.add(analysis)
            session.commit()
            return analysis
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def get_stats(self):
        """Get database statistics as a plain dict of counts."""
        session = self.get_session()
        try:
            return {
                'total_tests': session.query(TestAnalysis).count(),
                'total_voiceprints': session.query(Voiceprint).count(),
                'flagged_voiceprints': session.query(Voiceprint).filter_by(is_flagged=True).count(),
                'multi_appearance': session.query(Voiceprint).filter(Voiceprint.times_seen >= 2).count()
            }
        finally:
            session.close()

    def update_voiceprint_label(self, vp_id: str, label: str):
        """Update a voiceprint's human-friendly label.

        Returns True on success, False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.label = label
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def update_voiceprint_notes(self, vp_id: str, notes: str):
        """Update a voiceprint's free-form notes.

        Returns True on success, False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.notes = notes
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def toggle_voiceprint_flag(self, vp_id: str, flagged: bool, reason: str = None):
        """Manually flag/unflag a voiceprint.

        Unflagging clears the stored reason.  Returns True on success,
        False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.is_flagged = flagged
                vp.flag_reason = reason if flagged else None
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def get_similarity_threshold(self):
        """Get the current similarity threshold (default 0.80)."""
        # Could be stored in a settings table; for now return the default.
        return 0.80

    def get_appearance_timeline(self, vp_id: str = None):
        """Get appearances over time for a timeline chart.

        When ``vp_id`` is given, restrict to that voiceprint; otherwise
        return every appearance, ordered by detection time (ascending).
        """
        session = self.get_session()
        try:
            query = session.query(VoiceprintAppearance)
            if vp_id:
                query = query.filter_by(voiceprint_id=vp_id)
            appearances = query.order_by(VoiceprintAppearance.detected_at).all()

            return [
                {
                    'date': a.detected_at,
                    'voiceprint_id': a.voiceprint_id,
                    'test_id': a.test_id,
                    'role': a.role,
                    'duration': a.duration_seconds
                }
                for a in appearances
            ]
        finally:
            session.close()
|
src/fraud_detection/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fraud Detection Module
|
| 3 |
+
Detects suspicious patterns in audio that may indicate cheating.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .whisper_detector import WhisperDetector, WhisperResult, WhisperInstance
|
| 7 |
+
from .reading_pattern import ReadingPatternAnalyzer, ReadingPatternResult
|
| 8 |
+
from .pause_detector import SuspiciousPauseDetector, PauseResult, SuspiciousPause
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
'WhisperDetector', 'WhisperResult', 'WhisperInstance',
|
| 12 |
+
'ReadingPatternAnalyzer', 'ReadingPatternResult',
|
| 13 |
+
'SuspiciousPauseDetector', 'PauseResult', 'SuspiciousPause'
|
| 14 |
+
]
|
src/fraud_detection/pause_detector.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Suspicious Pause Detector
|
| 3 |
+
Detects abnormally long silences that may indicate the speaker is looking up
|
| 4 |
+
answers or receiving help during a test.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class SuspiciousPause:
    """A detected suspicious pause (a long silent gap in the audio)."""
    start: float  # pause start time, seconds from audio start
    end: float  # pause end time, seconds
    duration: float  # gap length in seconds (rounded to 2 decimals by the detector)
    context: str = ""  # What happened before/after (transcribed text when available)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class PauseResult:
    """Result of suspicious pause detection."""
    detected: bool  # True when at least one suspicious pause was found
    pauses: List[SuspiciousPause] = field(default_factory=list)
    total_suspicious_time: float = 0.0  # sum of all suspicious pause durations, seconds
    longest_pause: float = 0.0  # duration of the single longest suspicious pause, seconds

    @property
    def count(self) -> int:
        """Number of suspicious pauses detected."""
        return len(self.pauses)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class SuspiciousPauseDetector:
    """
    Flags abnormally long silences between speech segments.

    Natural conversational pauses are short (< 2 s) for thinking or medium
    (2-4 s) for complex thoughts.  Gaps at or above
    ``min_suspicious_duration`` are reported, since they may indicate
    looking up answers, receiving external help, or reading from a source.
    """

    def __init__(self,
                 min_suspicious_duration: float = 5.0,
                 warning_duration: float = 3.0,
                 max_natural_pause: float = 2.0):
        """
        Args:
            min_suspicious_duration: Minimum gap length (s) flagged as suspicious.
            warning_duration: Gap length (s) treated as a warning only.
            max_natural_pause: Upper bound (s) for a natural pause.
        """
        self.min_suspicious_duration = min_suspicious_duration
        self.warning_duration = warning_duration
        self.max_natural_pause = max_natural_pause

    def detect(self, speech_segments: List[dict],
               total_duration: float,
               transcription_segments: List[dict] = None) -> PauseResult:
        """
        Find suspicious gaps before, between and after speech segments.

        Args:
            speech_segments: List of {'start': float, 'end': float} for speech.
            total_duration: Total audio duration in seconds.
            transcription_segments: Optional timestamped transcription used to
                attach textual context to each pause.

        Returns:
            PauseResult listing every gap >= ``min_suspicious_duration``.
        """
        if not speech_segments:
            return PauseResult(detected=False)

        ordered = sorted(speech_segments, key=lambda seg: seg.get('start', 0))
        flagged = []

        # Leading silence before the first speech segment.
        lead = ordered[0].get('start', 0)
        if lead >= self.min_suspicious_duration:
            flagged.append(SuspiciousPause(
                start=0,
                end=lead,
                duration=round(lead, 2),
                context=self._get_context(0, lead, transcription_segments, "start"),
            ))

        # Gaps between consecutive speech segments.
        for earlier, later in zip(ordered, ordered[1:]):
            gap_start = earlier.get('end', 0)
            gap_end = later.get('start', 0)
            if gap_end - gap_start >= self.min_suspicious_duration:
                flagged.append(SuspiciousPause(
                    start=round(gap_start, 2),
                    end=round(gap_end, 2),
                    duration=round(gap_end - gap_start, 2),
                    context=self._get_context(gap_start, gap_end,
                                              transcription_segments, "middle"),
                ))

        # Trailing silence after the last speech segment.
        tail_start = ordered[-1].get('end', 0)
        if total_duration - tail_start >= self.min_suspicious_duration:
            flagged.append(SuspiciousPause(
                start=round(tail_start, 2),
                end=round(total_duration, 2),
                duration=round(total_duration - tail_start, 2),
                context=self._get_context(tail_start, total_duration,
                                          transcription_segments, "end"),
            ))

        return PauseResult(
            detected=bool(flagged),
            pauses=flagged,
            total_suspicious_time=round(sum(p.duration for p in flagged), 2),
            longest_pause=round(max((p.duration for p in flagged), default=0), 2),
        )

    def detect_from_vad(self, vad_result: dict, total_duration: float) -> PauseResult:
        """
        Run :meth:`detect` on the ``segments`` list of a VAD result.

        Args:
            vad_result: VAD output dict with a 'segments' list.
            total_duration: Total audio duration in seconds.

        Returns:
            PauseResult with the detected suspicious pauses.
        """
        return self.detect(vad_result.get('segments', []), total_duration)

    def _get_context(self, start: float, end: float,
                     transcription_segments: List[dict],
                     position: str) -> str:
        """Describe what was said just before/after the pause, when known."""
        if not transcription_segments:
            generic = {
                "start": "Long silence at audio start",
                "end": "Long silence at audio end",
            }
            return generic.get(position, "Long silence mid-conversation")

        before_text = ""
        after_text = ""

        for seg in transcription_segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            words = seg.get('text', '').strip()

            # A segment ending right at the pause boundary supplies the lead-in.
            if start - 1.0 <= seg_end <= start + 0.5:
                before_text = words[-50:]

            # A segment starting right after the pause supplies the follow-up.
            if end - 0.5 <= seg_start <= end + 1.0:
                after_text = words[:50]

        if before_text and after_text:
            return f"After: '{before_text}...' | Before: '...{after_text}'"
        if before_text:
            return f"After: '{before_text}...'"
        if after_text:
            return f"Before: '...{after_text}'"
        return f"Silence at {position} of audio"

    def analyze_pause_pattern(self, speech_segments: List[dict],
                              total_duration: float) -> dict:
        """
        Summarize the overall pause behaviour in the audio.

        Returns a dict of pause statistics (average/max gap, counts per
        severity band, and the fraction of time spent speaking).
        """
        if not speech_segments or len(speech_segments) < 2:
            return {
                'avg_pause': 0,
                'max_pause': 0,
                'pause_count': 0,
                'speech_ratio': 0
            }

        ordered = sorted(speech_segments, key=lambda seg: seg.get('start', 0))

        # Collect inter-segment gaps, ignoring negligible ones (<= 0.1 s).
        gaps = [
            later.get('start', 0) - earlier.get('end', 0)
            for earlier, later in zip(ordered, ordered[1:])
            if later.get('start', 0) - earlier.get('end', 0) > 0.1
        ]

        if not gaps:
            return {
                'avg_pause': 0,
                'max_pause': 0,
                'pause_count': 0,
                'speech_ratio': 1.0
            }

        voiced = sum(seg.get('end', 0) - seg.get('start', 0) for seg in ordered)

        return {
            'avg_pause': round(np.mean(gaps), 2),
            'max_pause': round(max(gaps), 2),
            'pause_count': len(gaps),
            'speech_ratio': round(voiced / total_duration, 2) if total_duration > 0 else 0,
            'natural_pauses': sum(1 for g in gaps if g <= self.max_natural_pause),
            'warning_pauses': sum(1 for g in gaps
                                  if self.max_natural_pause < g < self.min_suspicious_duration),
            'suspicious_pauses': sum(1 for g in gaps if g >= self.min_suspicious_duration),
        }
|
src/fraud_detection/reading_pattern.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reading Pattern Analyzer
|
| 3 |
+
Detects if someone is reading prepared answers vs speaking naturally.
|
| 4 |
+
|
| 5 |
+
Key indicators of reading:
|
| 6 |
+
- Consistent speech rate (no natural variation)
|
| 7 |
+
- Lack of filler words ("um", "uh", "like", "you know")
|
| 8 |
+
- Regular pause patterns
|
| 9 |
+
- Monotonic rhythm
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from typing import List, Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Common filler words in English.
# Matched case-insensitively on word boundaries by _analyze_filler_words;
# multi-word entries ("you know", "i mean") are matched as whole phrases.
FILLER_WORDS = [
    'um', 'uh', 'uhm', 'umm', 'er', 'ah', 'like', 'you know',
    'basically', 'actually', 'so', 'well', 'i mean', 'kind of',
    'sort of', 'right', 'okay'
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class ReadingPatternResult:
    """Result of reading pattern analysis."""
    is_reading: bool  # True when confidence >= the analyzer's reading_threshold
    confidence: float  # 0.0 to 1.0
    indicators: List[str] = field(default_factory=list)  # human-readable reasons behind the verdict
    speech_rate_cv: float = 0.0  # Coefficient of variation of speech rate (low = unnaturally constant pacing)
    filler_word_rate: float = 0.0  # Fillers per minute
    pause_regularity: float = 0.0  # How regular pauses are (higher = more regular)
|
| 35 |
+
|
| 36 |
+
class ReadingPatternAnalyzer:
|
| 37 |
+
"""
|
| 38 |
+
Analyzes speech patterns to detect if someone is reading.
|
| 39 |
+
|
| 40 |
+
Uses transcription with timestamps to analyze:
|
| 41 |
+
- Speech rate variation
|
| 42 |
+
- Filler word frequency
|
| 43 |
+
- Pause patterns
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
def __init__(self,
|
| 47 |
+
min_speech_rate_cv: float = 0.15,
|
| 48 |
+
min_filler_rate: float = 2.0,
|
| 49 |
+
reading_threshold: float = 0.6):
|
| 50 |
+
"""
|
| 51 |
+
Args:
|
| 52 |
+
min_speech_rate_cv: Minimum coefficient of variation for natural speech
|
| 53 |
+
min_filler_rate: Minimum filler words per minute for natural speech
|
| 54 |
+
reading_threshold: Confidence threshold to flag as reading
|
| 55 |
+
"""
|
| 56 |
+
self.min_speech_rate_cv = min_speech_rate_cv
|
| 57 |
+
self.min_filler_rate = min_filler_rate
|
| 58 |
+
self.reading_threshold = reading_threshold
|
| 59 |
+
|
| 60 |
+
def analyze(self, transcription: str, word_timestamps: List[dict],
|
| 61 |
+
duration_seconds: float) -> ReadingPatternResult:
|
| 62 |
+
"""
|
| 63 |
+
Analyze transcription for reading patterns.
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
transcription: Full transcription text
|
| 67 |
+
word_timestamps: List of {'word': str, 'start': float, 'end': float}
|
| 68 |
+
duration_seconds: Total audio duration
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
ReadingPatternResult with analysis
|
| 72 |
+
"""
|
| 73 |
+
if not word_timestamps or len(word_timestamps) < 10:
|
| 74 |
+
return ReadingPatternResult(
|
| 75 |
+
is_reading=False,
|
| 76 |
+
confidence=0.0,
|
| 77 |
+
indicators=["Insufficient data for analysis"]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
indicators = []
|
| 81 |
+
scores = []
|
| 82 |
+
|
| 83 |
+
# 1. Analyze speech rate variation
|
| 84 |
+
speech_rate_cv = self._analyze_speech_rate(word_timestamps)
|
| 85 |
+
if speech_rate_cv < self.min_speech_rate_cv:
|
| 86 |
+
indicators.append(f"Constant speech rate (CV={speech_rate_cv:.2f})")
|
| 87 |
+
scores.append(0.8)
|
| 88 |
+
else:
|
| 89 |
+
scores.append(0.2)
|
| 90 |
+
|
| 91 |
+
# 2. Analyze filler word frequency
|
| 92 |
+
filler_rate = self._analyze_filler_words(transcription, duration_seconds)
|
| 93 |
+
if filler_rate < self.min_filler_rate:
|
| 94 |
+
indicators.append(f"Few filler words ({filler_rate:.1f}/min)")
|
| 95 |
+
scores.append(0.7)
|
| 96 |
+
else:
|
| 97 |
+
scores.append(0.2)
|
| 98 |
+
|
| 99 |
+
# 3. Analyze pause patterns
|
| 100 |
+
pause_regularity = self._analyze_pause_patterns(word_timestamps)
|
| 101 |
+
if pause_regularity > 0.7:
|
| 102 |
+
indicators.append(f"Regular pause pattern ({pause_regularity:.0%})")
|
| 103 |
+
scores.append(0.6)
|
| 104 |
+
else:
|
| 105 |
+
scores.append(0.2)
|
| 106 |
+
|
| 107 |
+
# 4. Check for natural speech markers
|
| 108 |
+
has_corrections = self._has_self_corrections(transcription)
|
| 109 |
+
if not has_corrections:
|
| 110 |
+
indicators.append("No self-corrections detected")
|
| 111 |
+
scores.append(0.5)
|
| 112 |
+
else:
|
| 113 |
+
scores.append(0.1)
|
| 114 |
+
|
| 115 |
+
# Calculate overall confidence
|
| 116 |
+
confidence = np.mean(scores)
|
| 117 |
+
is_reading = confidence >= self.reading_threshold
|
| 118 |
+
|
| 119 |
+
return ReadingPatternResult(
|
| 120 |
+
is_reading=is_reading,
|
| 121 |
+
confidence=round(confidence, 2),
|
| 122 |
+
indicators=indicators,
|
| 123 |
+
speech_rate_cv=round(speech_rate_cv, 3),
|
| 124 |
+
filler_word_rate=round(filler_rate, 2),
|
| 125 |
+
pause_regularity=round(pause_regularity, 2)
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
def _analyze_speech_rate(self, word_timestamps: List[dict]) -> float:
|
| 129 |
+
"""
|
| 130 |
+
Calculate coefficient of variation of speech rate.
|
| 131 |
+
Natural speech has variable rate, reading is more constant.
|
| 132 |
+
"""
|
| 133 |
+
if len(word_timestamps) < 5:
|
| 134 |
+
return 0.0
|
| 135 |
+
|
| 136 |
+
# Calculate words per second in sliding windows
|
| 137 |
+
window_size = 3.0 # seconds
|
| 138 |
+
hop = 1.0 # seconds
|
| 139 |
+
|
| 140 |
+
rates = []
|
| 141 |
+
max_time = word_timestamps[-1].get('end', 0)
|
| 142 |
+
|
| 143 |
+
for start in np.arange(0, max_time - window_size, hop):
|
| 144 |
+
end = start + window_size
|
| 145 |
+
words_in_window = [
|
| 146 |
+
w for w in word_timestamps
|
| 147 |
+
if w.get('start', 0) >= start and w.get('end', 0) <= end
|
| 148 |
+
]
|
| 149 |
+
if words_in_window:
|
| 150 |
+
rate = len(words_in_window) / window_size
|
| 151 |
+
rates.append(rate)
|
| 152 |
+
|
| 153 |
+
if len(rates) < 3:
|
| 154 |
+
return 0.0
|
| 155 |
+
|
| 156 |
+
# Coefficient of variation (std / mean)
|
| 157 |
+
mean_rate = np.mean(rates)
|
| 158 |
+
if mean_rate == 0:
|
| 159 |
+
return 0.0
|
| 160 |
+
|
| 161 |
+
cv = np.std(rates) / mean_rate
|
| 162 |
+
return cv
|
| 163 |
+
|
| 164 |
+
def _analyze_filler_words(self, transcription: str,
                          duration_seconds: float) -> float:
    """
    Count filler words ("um", "uh", ...) per minute of audio.

    Natural speech contains frequent fillers; read-aloud speech has few.
    Matching is case-insensitive on whole words (regex word boundaries).

    Args:
        transcription: Full transcript text.
        duration_seconds: Length of the audio in seconds.

    Returns:
        Filler words per minute, or 0.0 for clips shorter than ~6 seconds
        (a per-minute rate would be meaningless / numerically unstable).
    """
    # Hoisted out of the loop: the original re-imported `re` on every
    # iteration of the FILLER_WORDS loop.
    import re

    text_lower = transcription.lower()

    # \b boundaries so e.g. "um" does not match inside "umbrella".
    filler_count = sum(
        len(re.findall(r'\b' + re.escape(filler) + r'\b', text_lower))
        for filler in FILLER_WORDS
    )

    minutes = duration_seconds / 60.0
    if minutes < 0.1:
        return 0.0

    return filler_count / minutes
|
| 186 |
+
|
| 187 |
+
def _analyze_pause_patterns(self, word_timestamps: List[dict]) -> float:
|
| 188 |
+
"""
|
| 189 |
+
Analyze regularity of pauses between words.
|
| 190 |
+
Reading tends to have more regular pauses.
|
| 191 |
+
"""
|
| 192 |
+
if len(word_timestamps) < 5:
|
| 193 |
+
return 0.0
|
| 194 |
+
|
| 195 |
+
# Calculate gaps between consecutive words
|
| 196 |
+
gaps = []
|
| 197 |
+
for i in range(1, len(word_timestamps)):
|
| 198 |
+
prev_end = word_timestamps[i-1].get('end', 0)
|
| 199 |
+
curr_start = word_timestamps[i].get('start', 0)
|
| 200 |
+
gap = curr_start - prev_end
|
| 201 |
+
if gap > 0.05: # Ignore very small gaps
|
| 202 |
+
gaps.append(gap)
|
| 203 |
+
|
| 204 |
+
if len(gaps) < 3:
|
| 205 |
+
return 0.0
|
| 206 |
+
|
| 207 |
+
# Calculate regularity (inverse of coefficient of variation)
|
| 208 |
+
mean_gap = np.mean(gaps)
|
| 209 |
+
if mean_gap == 0:
|
| 210 |
+
return 0.0
|
| 211 |
+
|
| 212 |
+
cv = np.std(gaps) / mean_gap
|
| 213 |
+
regularity = 1.0 / (1.0 + cv) # Higher = more regular
|
| 214 |
+
|
| 215 |
+
return regularity
|
| 216 |
+
|
| 217 |
+
def _has_self_corrections(self, transcription: str) -> bool:
|
| 218 |
+
"""
|
| 219 |
+
Check for self-corrections which indicate natural speech.
|
| 220 |
+
E.g., "I went to the... I mean, I was going to the store"
|
| 221 |
+
"""
|
| 222 |
+
correction_markers = [
|
| 223 |
+
'i mean', 'sorry', 'no wait', 'actually', 'let me',
|
| 224 |
+
'what i meant', 'no no', 'sorry i', 'wait'
|
| 225 |
+
]
|
| 226 |
+
|
| 227 |
+
text_lower = transcription.lower()
|
| 228 |
+
for marker in correction_markers:
|
| 229 |
+
if marker in text_lower:
|
| 230 |
+
return True
|
| 231 |
+
|
| 232 |
+
# Check for repeated words (stammering/correction)
|
| 233 |
+
words = text_lower.split()
|
| 234 |
+
for i in range(1, len(words)):
|
| 235 |
+
if words[i] == words[i-1] and len(words[i]) > 2:
|
| 236 |
+
return True
|
| 237 |
+
|
| 238 |
+
return False
|
src/fraud_detection/whisper_detector.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Whisper Detector
|
| 3 |
+
Detects low-volume background voices (whispers) that may indicate someone
|
| 4 |
+
is being prompted or helped during a test.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import librosa
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import List, Tuple
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class WhisperInstance:
    """One detected whisper event: where it occurred and how confident we are."""
    start: float       # event start time in seconds
    end: float         # event end time in seconds
    confidence: float  # likelihood (0.0-1.0) that this really is a whisper
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class WhisperResult:
    """Outcome of one whisper-detection pass over an audio clip."""
    detected: bool  # True when at least one whisper instance was found
    instances: List[WhisperInstance] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Number of whisper events that were detected."""
        return len(self.instances)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class WhisperDetector:
    """
    Detects whispers/low background voices in audio.

    Whispers have distinct characteristics:
    - Lower amplitude than normal speech
    - More high-frequency content (less voiced, more fricative)
    - Often occur during pauses in main speaker's speech

    The detector runs a frame-level feature analysis (RMS energy, spectral
    centroid, zero-crossing rate), flags frames matching a whisper profile,
    groups consecutive flagged frames into segments, and keeps segments that
    are long enough, quiet relative to the main speaker, and do not overlap
    the main speaker's own speech.
    """

    def __init__(self,
                 energy_threshold: float = 0.02,
                 min_duration: float = 0.3,
                 max_amplitude_ratio: float = 0.3):
        """
        Args:
            energy_threshold: Minimum energy to consider as potential whisper
                (as a fraction of the main-speech amplitude; frames quieter
                than this are treated as silence, not whisper)
            min_duration: Minimum duration in seconds for a whisper
            max_amplitude_ratio: Max ratio vs main speech (whispers are quieter)
        """
        self.energy_threshold = energy_threshold
        self.min_duration = min_duration
        self.max_amplitude_ratio = max_amplitude_ratio

    def detect(self, waveform: np.ndarray, sample_rate: int,
               main_speaker_segments: List[dict] = None) -> WhisperResult:
        """
        Detect whispers in audio.

        Args:
            waveform: Audio waveform as numpy array
            sample_rate: Sample rate of audio
            main_speaker_segments: Segments where main speaker is talking
                (whispers are checked outside these). Each dict is expected
                to carry 'start'/'end' keys in seconds.

        Returns:
            WhisperResult with detected whisper instances
        """
        # Ensure mono
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=0)

        # Calculate main speech amplitude for comparison.
        # 95th percentile of |x| is a robust "loud speech" level that is
        # insensitive to isolated clicks/peaks.
        main_amplitude = np.percentile(np.abs(waveform), 95)

        # Frame-based analysis
        frame_length = int(0.025 * sample_rate)  # 25ms frames
        hop_length = int(0.010 * sample_rate)  # 10ms hop

        # Calculate energy per frame
        energy = librosa.feature.rms(y=waveform, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Calculate spectral centroid (whispers have higher centroid)
        spectral_centroid = librosa.feature.spectral_centroid(
            y=waveform, sr=sample_rate,
            n_fft=frame_length, hop_length=hop_length
        )[0]

        # Calculate zero crossing rate (whispers have higher ZCR)
        zcr = librosa.feature.zero_crossing_rate(
            y=waveform, frame_length=frame_length, hop_length=hop_length
        )[0]

        # Normalize features (1e-10 guards against division by zero on
        # silent input; centroid is scaled by the Nyquist frequency).
        energy_norm = energy / (main_amplitude + 1e-10)
        centroid_norm = spectral_centroid / (sample_rate / 2)

        # Identify whisper candidates:
        # - Low energy (but not silent)
        # - High spectral centroid (breathy)
        # - High zero crossing rate
        # NOTE(review): the 0.15 centroid and 0.1 ZCR cutoffs look like
        # empirically tuned constants — confirm against evaluation data.
        whisper_frames = (
            (energy > self.energy_threshold * main_amplitude) &
            (energy_norm < self.max_amplitude_ratio) &
            (centroid_norm > 0.15) &
            (zcr > 0.1)
        )

        # Convert frames to time segments
        frame_times = librosa.frames_to_time(
            np.arange(len(energy)), sr=sample_rate, hop_length=hop_length
        )

        # Group consecutive whisper frames into [start_time, time) segments
        # with a simple run-length state machine.
        instances = []
        in_whisper = False
        start_time = 0

        for i, is_whisper in enumerate(whisper_frames):
            time = frame_times[i] if i < len(frame_times) else frame_times[-1]

            if is_whisper and not in_whisper:
                # Rising edge: a whisper run begins.
                start_time = time
                in_whisper = True
            elif not is_whisper and in_whisper:
                # Falling edge: run ended — validate and possibly record it.
                duration = time - start_time
                if duration >= self.min_duration:
                    # Check if this overlaps with main speaker
                    if not self._overlaps_main_speaker(start_time, time, main_speaker_segments):
                        confidence = self._calculate_confidence(
                            waveform, sample_rate, start_time, time, main_amplitude
                        )
                        if confidence > 0.5:
                            instances.append(WhisperInstance(
                                start=round(start_time, 2),
                                end=round(time, 2),
                                confidence=round(confidence, 2)
                            ))
                in_whisper = False

        # Handle case where audio ends during whisper
        if in_whisper:
            end_time = frame_times[-1] if len(frame_times) > 0 else 0
            duration = end_time - start_time
            if duration >= self.min_duration:
                if not self._overlaps_main_speaker(start_time, end_time, main_speaker_segments):
                    confidence = self._calculate_confidence(
                        waveform, sample_rate, start_time, end_time, main_amplitude
                    )
                    if confidence > 0.5:
                        instances.append(WhisperInstance(
                            start=round(start_time, 2),
                            end=round(end_time, 2),
                            confidence=round(confidence, 2)
                        ))

        return WhisperResult(
            detected=len(instances) > 0,
            instances=instances
        )

    def _overlaps_main_speaker(self, start: float, end: float,
                               segments: List[dict]) -> bool:
        """Check if time range overlaps with main speaker segments.

        Returns False when no segments are supplied (nothing to overlap).
        """
        if not segments:
            return False

        for seg in segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            # Check for overlap (standard half-open interval intersection test)
            if start < seg_end and end > seg_start:
                return True
        return False

    def _calculate_confidence(self, waveform: np.ndarray, sample_rate: int,
                              start: float, end: float,
                              main_amplitude: float) -> float:
        """Calculate confidence that this segment is a whisper.

        Confidence is driven purely by how quiet the segment is relative
        to the main speaker: ratio >= 0.5 scores 0.0, ratio 0.0 scores 1.0,
        linear in between.
        """
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)

        # Clamp to the waveform; degenerate ranges get zero confidence.
        if end_sample > len(waveform):
            end_sample = len(waveform)
        if start_sample >= end_sample:
            return 0.0

        segment = waveform[start_sample:end_sample]

        # Calculate features for this segment
        seg_amplitude = np.percentile(np.abs(segment), 95)
        amplitude_ratio = seg_amplitude / (main_amplitude + 1e-10)

        # Whisper confidence based on amplitude ratio
        # Lower ratio = more likely whisper
        if amplitude_ratio > 0.5:
            return 0.0

        # Scale confidence: 0.1-0.3 ratio = high confidence
        confidence = 1.0 - (amplitude_ratio / 0.5)
        return min(1.0, max(0.0, confidence))
|
src/phase1_foundation/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .preprocessor import AudioPreprocessor
|
| 2 |
+
from .vad import VoiceActivityDetector, SpeechSegment
|
| 3 |
+
from .diarization import SpeakerDiarizer, SpeakerInfo, SpeakerSegment
|
| 4 |
+
from .voiceprint import VoiceprintExtractor, VoiceprintResult
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'AudioPreprocessor',
|
| 8 |
+
'VoiceActivityDetector', 'SpeechSegment',
|
| 9 |
+
'SpeakerDiarizer', 'SpeakerInfo', 'SpeakerSegment',
|
| 10 |
+
'VoiceprintExtractor', 'VoiceprintResult'
|
| 11 |
+
]
|
src/phase1_foundation/diarization.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speaker Diarization - identify who spoke when.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import List, Dict, Optional
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class SpeakerSegment:
    """A contiguous stretch of speech attributed to a single speaker."""
    start: float     # segment start time in seconds
    end: float       # segment end time in seconds
    speaker_id: str  # label such as 'speaker_A'

    @property
    def duration(self) -> float:
        """Length of this segment in seconds."""
        return self.end - self.start
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class SpeakerInfo:
    """Aggregate view of one diarized speaker: segments, total time, embedding."""
    speaker_id: str             # label such as 'speaker_A'
    total_seconds: float = 0.0  # cumulative speech time across all segments
    segments: List[SpeakerSegment] = field(default_factory=list)
    embedding: Optional[np.ndarray] = None  # mean speaker embedding vector

    def add_segment(self, segment: SpeakerSegment):
        """Attach a segment and fold its duration into the running total."""
        self.total_seconds += segment.duration
        self.segments.append(segment)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SpeakerDiarizer:
    """Speaker diarization using embedding clustering.

    Pipeline: slide a fixed window over the audio, extract an ECAPA speaker
    embedding per window (skipping non-speech windows when VAD segments are
    supplied), cluster the embeddings with agglomerative clustering, then
    aggregate windows per cluster into per-speaker statistics.
    """

    def __init__(self, device: str = None):
        # Prefer GPU when available; SpeechBrain accepts 'cuda'/'cpu' strings.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        # Lazily-created SpeechBrain model (see `embedding_model` property).
        self._embedding_model = None

    @property
    def embedding_model(self):
        """Lazy load embedding model (downloads from the hub on first use)."""
        if self._embedding_model is None:
            from speechbrain.inference.speaker import SpeakerRecognition
            self._embedding_model = SpeakerRecognition.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir="pretrained_models/spkrec",
                run_opts={"device": self.device}
            )
        return self._embedding_model

    def diarize(self, audio_path: str,
                speech_segments: List = None,
                window_size: float = 2.0,
                hop_size: float = 0.5,
                num_speakers: Optional[int] = None,
                min_speakers: int = 1,
                max_speakers: int = 5) -> Dict[str, SpeakerInfo]:
        """
        Perform speaker diarization.

        Args:
            audio_path: Path to audio file
            speech_segments: Optional list of speech segments (from VAD);
                each element must expose `.start` and `.end` attributes
            window_size: Window size for embedding extraction (seconds)
            hop_size: Hop size between windows (seconds)
            num_speakers: Known number of speakers (None to estimate)
            min_speakers: Minimum speakers to detect
                NOTE(review): this parameter is currently accepted but never
                used by the implementation — confirm whether a lower bound
                on cluster count should be enforced.
            max_speakers: Maximum speakers to detect

        Returns:
            Dict mapping speaker_id to SpeakerInfo, ordered by total speech
            time (main speaker first)
        """
        import torchaudio

        # Load audio (use soundfile backend to avoid torchcodec dependency)
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        if waveform.shape[0] > 1:
            # Downmix multi-channel audio to mono.
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        duration = waveform.shape[1] / sample_rate

        # Extract embeddings for windows
        windows = []
        embeddings = []

        current = 0.0
        while current + window_size <= duration:
            start_sample = int(current * sample_rate)
            end_sample = int((current + window_size) * sample_rate)

            window_audio = waveform[:, start_sample:end_sample]

            # Check if this window has speech (if VAD provided).
            # A window counts as speech when its midpoint falls inside
            # any VAD segment.
            has_speech = True
            if speech_segments:
                has_speech = any(
                    s.start <= current + window_size/2 <= s.end
                    for s in speech_segments
                )

            if has_speech and window_audio.shape[1] > 0:
                # Extract embedding
                emb = self.embedding_model.encode_batch(window_audio)
                emb = emb.squeeze().cpu().numpy()

                windows.append({'start': current, 'end': current + window_size})
                embeddings.append(emb)

            current += hop_size

        if len(embeddings) < 2:
            # Not enough data for clustering — fall back to a single speaker
            # covering all VAD segments (if any).
            speaker_info = SpeakerInfo(speaker_id="speaker_A")
            for seg in (speech_segments or []):
                speaker_info.add_segment(SpeakerSegment(
                    start=seg.start, end=seg.end, speaker_id="speaker_A"
                ))
            if embeddings:
                speaker_info.embedding = embeddings[0]
            return {"speaker_A": speaker_info}

        embeddings_array = np.array(embeddings)

        # Cluster embeddings
        if num_speakers is None:
            # Estimate number of speakers: clusters are merged until the
            # cosine distance between them exceeds 0.7.
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=0.7,
                metric='cosine',
                linkage='average'
            )
        else:
            clustering = AgglomerativeClustering(
                n_clusters=num_speakers,
                metric='cosine',
                linkage='average'
            )

        labels = clustering.fit_predict(embeddings_array)

        # Clamp number of speakers
        unique_labels = np.unique(labels)
        if len(unique_labels) > max_speakers:
            # Re-cluster with max speakers
            clustering = AgglomerativeClustering(
                n_clusters=max_speakers,
                metric='cosine',
                linkage='average'
            )
            labels = clustering.fit_predict(embeddings_array)
            unique_labels = np.unique(labels)

        # Build speaker info
        speakers = {}
        speaker_names = ['speaker_A', 'speaker_B', 'speaker_C', 'speaker_D', 'speaker_E']

        for label in unique_labels:
            speaker_id = speaker_names[label] if label < len(speaker_names) else f"speaker_{label}"
            speakers[speaker_id] = SpeakerInfo(speaker_id=speaker_id)

            # Calculate mean embedding for this speaker
            mask = labels == label
            speaker_embeddings = embeddings_array[mask]
            speakers[speaker_id].embedding = np.mean(speaker_embeddings, axis=0)

        # Assign windows to speakers. Note: windows overlap (hop < window),
        # so per-speaker totals can exceed the wall-clock duration.
        for i, (window, label) in enumerate(zip(windows, labels)):
            speaker_id = speaker_names[label] if label < len(speaker_names) else f"speaker_{label}"
            segment = SpeakerSegment(
                start=window['start'],
                end=window['end'],
                speaker_id=speaker_id
            )
            speakers[speaker_id].add_segment(segment)

        # Sort by total speech time (main speaker first)
        speakers = dict(sorted(
            speakers.items(),
            key=lambda x: x[1].total_seconds,
            reverse=True
        ))

        return speakers

    def get_main_speaker(self, speakers: Dict[str, SpeakerInfo]) -> Optional[SpeakerInfo]:
        """Get the speaker with most speech time.

        Relies on `diarize` returning the dict already sorted by speech time.
        """
        if not speakers:
            return None
        return next(iter(speakers.values()))

    def get_additional_speakers(self, speakers: Dict[str, SpeakerInfo]) -> List[SpeakerInfo]:
        """Get all speakers except the main one."""
        items = list(speakers.values())
        return items[1:] if len(items) > 1 else []
|
src/phase1_foundation/preprocessor.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio preprocessor - normalize audio for analysis.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
import numpy as np
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Tuple, Optional
|
| 9 |
+
import tempfile
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class AudioPreprocessor:
    """Normalize audio to standard format for analysis.

    Target format: mono, 16 kHz, peak amplitude scaled to 0.95. All file I/O
    goes through torchaudio's soundfile backend.
    """

    TARGET_SAMPLE_RATE = 16000  # Hz — format expected by downstream models
    TARGET_CHANNELS = 1         # mono

    def __init__(self):
        pass

    def load_audio(self, audio_path: str) -> Tuple[torch.Tensor, int]:
        """
        Load audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            Tuple of (waveform, sample_rate)
        """
        # Use soundfile backend to avoid torchcodec dependency
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        return waveform, sample_rate

    def normalize(self, waveform: torch.Tensor, sample_rate: int) -> Tuple[torch.Tensor, int]:
        """
        Normalize audio to mono, 16kHz, normalized amplitude.

        Args:
            waveform: Input waveform of shape (channels, samples).
            sample_rate: Input sample rate in Hz.

        Returns:
            Tuple of (normalized_waveform, target_sample_rate)
        """
        # Convert to mono by averaging channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16kHz if needed.
        if sample_rate != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=self.TARGET_SAMPLE_RATE
            )
            waveform = resampler(waveform)

        # Peak-normalize to 0.95 (headroom against clipping). Silent input
        # (max amplitude 0) is left untouched to avoid division by zero.
        max_amp = waveform.abs().max()
        if max_amp > 0:
            waveform = waveform / max_amp * 0.95

        return waveform, self.TARGET_SAMPLE_RATE

    def process_file(self, audio_path: str, output_path: Optional[str] = None) -> Tuple[torch.Tensor, int, dict]:
        """
        Load and normalize audio file.

        Args:
            audio_path: Input file to load.
            output_path: If given, the normalized audio is also written here.

        Returns:
            Tuple of (waveform, sample_rate, metadata) where metadata records
            the original and normalized format parameters.
        """
        # Load
        waveform, orig_sr = self.load_audio(audio_path)
        orig_duration = waveform.shape[1] / orig_sr
        orig_channels = waveform.shape[0]

        # Normalize
        waveform, sample_rate = self.normalize(waveform, orig_sr)

        # Save if output path provided
        if output_path:
            torchaudio.save(output_path, waveform, sample_rate)

        metadata = {
            'original_sample_rate': orig_sr,
            'original_channels': orig_channels,
            'original_duration': orig_duration,
            'normalized_sample_rate': sample_rate,
            'normalized_duration': waveform.shape[1] / sample_rate
        }

        return waveform, sample_rate, metadata

    def get_duration(self, waveform: torch.Tensor, sample_rate: int) -> float:
        """Get duration in seconds."""
        return waveform.shape[1] / sample_rate

    def save_audio(self, waveform: torch.Tensor, sample_rate: int, output_path: str):
        """Save audio to file, creating parent directories as needed."""
        # Fix: os.path.dirname returns '' for a bare filename, and
        # os.makedirs('') raises FileNotFoundError — only create the
        # directory when there actually is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torchaudio.save(output_path, waveform, sample_rate)

    def extract_segment(self, waveform: torch.Tensor, sample_rate: int,
                        start: float, end: float) -> torch.Tensor:
        """Extract a [start, end) time slice (seconds) from the waveform."""
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)
        return waveform[:, start_sample:end_sample]
|
src/phase1_foundation/vad.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Activity Detection - detect speech segments.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
from typing import List, Tuple, Optional
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class SpeechSegment:
    """A single span of detected speech, in seconds from the start of the audio."""
    start: float  # speech onset time
    end: float    # speech offset time

    @property
    def duration(self) -> float:
        """Length of the speech span in seconds."""
        return self.end - self.start
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class VoiceActivityDetector:
    """Detect speech segments using SpeechBrain VAD.

    The CRDNN model is downloaded from the HuggingFace hub on first access
    and cached under ``pretrained_models/vad``; model loading is deferred
    until the ``model`` property is first used.
    """

    def __init__(self, device: str = None):
        # Prefer GPU when available; SpeechBrain accepts 'cuda'/'cpu' strings.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        # Lazily-created SpeechBrain VAD instance (see `model` property).
        self._model = None

    @property
    def model(self):
        """Lazy load VAD model."""
        if self._model is None:
            from speechbrain.inference.VAD import VAD
            import warnings
            # Suppress the use_auth_token deprecation warning from speechbrain
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*use_auth_token.*")
                self._model = VAD.from_hparams(
                    source="speechbrain/vad-crdnn-libriparty",
                    savedir="pretrained_models/vad",
                    run_opts={"device": self.device}
                )
        return self._model

    def detect(self, audio_path: str,
               min_speech_duration: float = 0.25,
               min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments in audio.

        Args:
            audio_path: Path to audio file
            min_speech_duration: Minimum speech duration to keep
            min_silence_duration: Minimum silence to consider as gap

        Returns:
            List of SpeechSegment objects
        """
        # Use get_speech_segments which does full pipeline.
        # close_th merges segments separated by shorter silences than the
        # threshold; len_th discards segments shorter than the minimum.
        boundaries = self.model.get_speech_segments(
            audio_path,
            large_chunk_size=30,
            small_chunk_size=10,
            overlap_small_chunk=True,
            apply_energy_VAD=True,
            double_check=True,
            close_th=min_silence_duration,
            len_th=min_speech_duration
        )

        # Convert to segments
        segments = []
        # boundaries is a tensor with shape [N, 2] where each row is [start, end]
        if boundaries is not None and len(boundaries) > 0:
            for boundary in boundaries:
                start, end = float(boundary[0]), float(boundary[1])
                # Re-apply the duration filter defensively, in case the
                # model returns spans shorter than len_th.
                if end - start >= min_speech_duration:
                    segments.append(SpeechSegment(start=start, end=end))

        return segments

    def detect_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                             min_speech_duration: float = 0.25) -> List[SpeechSegment]:
        """
        Detect speech segments from waveform tensor.

        Args:
            waveform: Audio waveform tensor
            sample_rate: Sample rate
            min_speech_duration: Minimum speech duration

        Returns:
            List of SpeechSegment objects
        """
        import tempfile
        import torchaudio
        import os

        # Save to temp file (SpeechBrain VAD needs file path)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name

        try:
            torchaudio.save(temp_path, waveform, sample_rate)
            return self.detect(temp_path, min_speech_duration)
        finally:
            # Always remove the temp file, even if detection raises.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def get_total_speech(self, segments: List[SpeechSegment]) -> float:
        """Get total speech duration from segments."""
        return sum(s.duration for s in segments)

    def get_speech_ratio(self, segments: List[SpeechSegment],
                         total_duration: float) -> float:
        """Get ratio of speech to total duration (0.0 for zero-length audio)."""
        if total_duration == 0:
            return 0.0
        return self.get_total_speech(segments) / total_duration
|
src/phase1_foundation/voiceprint.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voiceprint extraction - generate unique voice identifiers.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
import hashlib
|
| 7 |
+
from typing import Tuple, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class VoiceprintResult:
    """Container for an extracted voiceprint.

    Holds a stable id, the raw speaker embedding, a 0-1 quality estimate,
    and the amount of speech (seconds) the embedding was computed from.
    """
    voiceprint_id: str       # stable id derived from the embedding
    embedding: np.ndarray    # speaker embedding vector
    quality_score: float     # 0-1 quality estimate
    speech_duration: float   # seconds of speech behind the embedding

    def to_bytes(self) -> bytes:
        """Serialize the embedding as raw float32 bytes for storage."""
        as_f32 = self.embedding.astype(np.float32)
        return as_f32.tobytes()

    @classmethod
    def from_bytes(cls, vp_id: str, embedding_bytes: bytes,
                   quality: float = 0.0, duration: float = 0.0):
        """Rebuild a VoiceprintResult from bytes produced by to_bytes()."""
        restored = np.frombuffer(embedding_bytes, dtype=np.float32)
        return cls(voiceprint_id=vp_id, embedding=restored,
                   quality_score=quality, speech_duration=duration)
+
|
| 36 |
+
class VoiceprintExtractor:
    """Extract speaker voiceprints (embeddings) using ECAPA-TDNN.

    The SpeechBrain model is loaded lazily on first use, so constructing
    the extractor is cheap and does not trigger a weight download.
    """

    # Dimensionality of the ECAPA-TDNN speaker embedding.
    EMBEDDING_DIM = 192

    def __init__(self, device: str = None):
        """
        Args:
            device: Torch device string; defaults to CUDA when available.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._model = None

    @property
    def model(self):
        """Lazily load and cache the speaker-recognition model."""
        if self._model is None:
            from speechbrain.inference.speaker import SpeakerRecognition
            self._model = SpeakerRecognition.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir="pretrained_models/spkrec",
                run_opts={"device": self.device}
            )
        return self._model

    def extract_from_file(self, audio_path: str) -> "VoiceprintResult":
        """
        Extract a voiceprint from an audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            VoiceprintResult with ID, embedding and quality score.
        """
        import torchaudio

        # Load audio (soundfile backend avoids a torchcodec dependency).
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        duration = waveform.shape[1] / sample_rate

        return self.extract_from_waveform(waveform, sample_rate, duration)

    def extract_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                              duration: float = None) -> "VoiceprintResult":
        """
        Extract a voiceprint from a waveform tensor.

        Args:
            waveform: Audio waveform tensor of shape (channels, samples).
            sample_rate: Sample rate of the waveform in Hz.
            duration: Optional duration in seconds (derived if omitted).

        Returns:
            VoiceprintResult
        """
        # BUGFIX: torchaudio was previously imported only inside
        # extract_from_file, so calling this method directly with a
        # non-16kHz waveform raised NameError at the resampling step.
        import torchaudio

        if duration is None:
            duration = waveform.shape[1] / sample_rate

        # The embedding model expects mono input.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the model's expected 16 kHz rate.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Run the model and flatten the embedding to a 1-D numpy array.
        embedding = self.model.encode_batch(waveform)
        embedding = embedding.squeeze().cpu().numpy()

        # Deterministic ID derived from the embedding itself.
        vp_id = self._generate_id(embedding)

        # Quality is a heuristic based on duration and signal energy.
        quality = self._calculate_quality(waveform, duration)

        return VoiceprintResult(
            voiceprint_id=vp_id,
            embedding=embedding,
            quality_score=quality,
            speech_duration=duration
        )

    def extract_from_embedding(self, embedding: np.ndarray,
                               duration: float = 0.0) -> "VoiceprintResult":
        """Create a VoiceprintResult from an already-computed embedding."""
        vp_id = self._generate_id(embedding)
        quality = min(1.0, duration / 30.0)  # simple duration-based quality

        return VoiceprintResult(
            voiceprint_id=vp_id,
            embedding=embedding,
            quality_score=quality,
            speech_duration=duration
        )

    def _generate_id(self, embedding: np.ndarray) -> str:
        """Derive a short, deterministic ID by hashing the embedding bytes."""
        emb_bytes = embedding.astype(np.float32).tobytes()
        hash_hex = hashlib.sha256(emb_bytes).hexdigest()[:8]
        return f"vp_{hash_hex}"

    def _calculate_quality(self, waveform: torch.Tensor, duration: float) -> float:
        """
        Heuristic quality score in [0, 1].

        Based on:
        - Duration (more is better, saturates at 30 s)
        - Signal energy (penalizes near-silence and likely clipping)
        """
        # Duration factor (0-1, saturates at 30 s).
        duration_score = min(1.0, duration / 30.0)

        # Energy factor from the overall RMS level.
        rms = torch.sqrt(torch.mean(waveform ** 2)).item()
        if rms < 0.01:    # too quiet
            energy_score = 0.3
        elif rms > 0.9:   # likely clipping
            energy_score = 0.5
        else:
            energy_score = 1.0

        # Weighted blend, rounded for stable display/storage.
        quality = 0.7 * duration_score + 0.3 * energy_score
        return round(quality, 2)

    def compare(self, vp1: "VoiceprintResult", vp2: "VoiceprintResult") -> float:
        """
        Compare two voiceprints.

        Returns:
            Similarity score (0-1); higher means more similar.
        """
        return self.cosine_similarity(vp1.embedding, vp2.embedding)

    def cosine_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
        """Cosine similarity between two embeddings (0.0 if either is zero)."""
        dot = np.dot(emb1, emb2)
        norm1 = np.linalg.norm(emb1)
        norm2 = np.linalg.norm(emb2)

        # Guard against zero vectors, which have no defined direction.
        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(dot / (norm1 * norm2))

    def is_same_speaker(self, emb1: np.ndarray, emb2: np.ndarray,
                        threshold: float = 0.75) -> Tuple[bool, float]:
        """
        Check whether two embeddings belong to the same speaker.

        Returns:
            Tuple of (is_same, similarity_score)
        """
        similarity = self.cosine_similarity(emb1, emb2)
        return similarity >= threshold, similarity

    def quality_label(self, score: float) -> str:
        """Map a numeric quality score to 'High' / 'Medium' / 'Low'."""
        if score >= 0.8:
            return "High"
        elif score >= 0.5:
            return "Medium"
        else:
            return "Low"
src/phase2_background/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .analyzer import BackgroundAnalyzer, BackgroundAnomaly, AnomalyType, AudioSource
|
| 2 |
+
|
| 3 |
+
__all__ = ['BackgroundAnalyzer', 'BackgroundAnomaly', 'AnomalyType', 'AudioSource']
|
src/phase2_background/analyzer.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Background Audio Analysis - detect subtle anomalies.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AnomalyType(Enum):
    """Kind of sound detected in the amplified background audio."""
    WHISPER = "whisper"              # whisper-like: weak lows, strong highs
    DISTANT_VOICE = "distant_voice"  # voice energy concentrated in the mid band
    SPEAKER_AUDIO = "speaker_audio"  # bandwidth-limited (little high-freq energy)
    UNKNOWN = "unknown"              # no classification heuristic matched
+
class AudioSource(Enum):
    """How the captured audio reached the microphone."""
    DIRECT = "direct"    # live voice straight into the mic (full bandwidth)
    SPEAKER = "speaker"  # played back through a loudspeaker (limited highs)
    PHONE = "phone"      # phone-bandwidth audio (~3.4 kHz cutoff)
    UNKNOWN = "unknown"  # could not be classified
+
@dataclass
class BackgroundAnomaly:
    """A detected background anomaly within a recording."""
    # Time bounds of the anomaly within the recording, in seconds.
    start: float
    end: float
    # Classification produced by the analyzer's spectral heuristics.
    anomaly_type: AnomalyType
    # Window amplitude in dB (20*log10 of window RMS in the analyzer).
    amplitude_db: float
    # Detector confidence in [0, 1].
    confidence: float

    @property
    def duration(self) -> float:
        # Length of the anomaly in seconds.
        return self.end - self.start
+
class BackgroundAnalyzer:
    """Analyze quiet background audio for anomalies (whispers, distant
    voices, audio played through a speaker)."""

    def __init__(self):
        # All analysis below assumes 16 kHz mono audio.
        self.sample_rate = 16000

    def amplify_background(self, waveform: np.ndarray,
                           threshold_db: float = -40,
                           boost_db: float = 25) -> np.ndarray:
        """
        Amplify quiet background regions so faint sounds become audible.

        Args:
            waveform: Audio waveform (1-D numpy array).
            threshold_db: Windows whose RMS level falls below this are boosted.
            boost_db: Gain (dB) applied to quiet windows.

        Returns:
            Amplified waveform, peak-normalized to avoid clipping.
        """
        rms = np.sqrt(np.mean(waveform ** 2))
        if rms == 0:
            return waveform

        window_size = int(0.1 * self.sample_rate)  # 100 ms analysis windows
        hop = window_size // 2
        boost_factor = 10 ** (boost_db / 20)

        # BUGFIX: the previous implementation multiplied the signal in place
        # per window; with 50% window overlap, samples covered by two quiet
        # windows were boosted twice (2 * boost_db, ~+50 dB instead of the
        # documented +25 dB). Build a per-sample gain mask instead so each
        # sample is boosted at most once.
        gain = np.ones(len(waveform))

        for i in range(0, len(waveform) - window_size, hop):
            window = waveform[i:i + window_size]
            window_rms = np.sqrt(np.mean(window ** 2))

            if window_rms > 0:
                window_db = 20 * np.log10(window_rms + 1e-10)
                if window_db < threshold_db:
                    # Assign (not multiply) so overlaps don't compound.
                    gain[i:i + window_size] = boost_factor

        amplified = waveform * gain

        # Peak-normalize to prevent clipping after the boost.
        max_amp = np.abs(amplified).max()
        if max_amp > 0.95:
            amplified = amplified * 0.95 / max_amp

        return amplified

    def detect_anomalies(self, waveform: np.ndarray,
                         speech_segments: List = None,
                         threshold_db: float = -50) -> List["BackgroundAnomaly"]:
        """
        Detect anomalies in background audio.

        Args:
            waveform: Audio waveform.
            speech_segments: Optional VAD segments (objects with .start/.end
                in seconds) to exclude from analysis.
            threshold_db: Minimum amplitude (dB) to consider as an anomaly.

        Returns:
            List of detected anomalies, merged where adjacent.
        """
        anomalies = []

        # Boost quiet regions first so faint events cross the threshold.
        amplified = self.amplify_background(waveform)

        window_size = int(0.5 * self.sample_rate)  # 500 ms analysis windows
        hop = window_size // 4

        for i in range(0, len(amplified) - window_size, hop):
            start_time = i / self.sample_rate
            end_time = (i + window_size) / self.sample_rate

            # Skip windows whose anchor point falls inside main speech.
            if speech_segments:
                in_speech = any(
                    s.start <= start_time + 0.25 <= s.end
                    for s in speech_segments
                )
                if in_speech:
                    continue

            window = amplified[i:i + window_size]
            window_rms = np.sqrt(np.mean(window ** 2))

            if window_rms == 0:
                continue

            window_db = 20 * np.log10(window_rms + 1e-10)

            if window_db > threshold_db:
                anomaly_type = self._classify_anomaly(window)
                confidence = self._calculate_confidence(window, window_db, threshold_db)

                if confidence > 0.3:  # discard very weak detections
                    anomalies.append(BackgroundAnomaly(
                        start=start_time,
                        end=end_time,
                        anomaly_type=anomaly_type,
                        amplitude_db=window_db,
                        confidence=confidence
                    ))

        # Collapse overlapping/adjacent windows of the same type.
        return self._merge_anomalies(anomalies)

    def _classify_anomaly(self, window: np.ndarray) -> "AnomalyType":
        """Classify the anomaly type from the window's frequency-band energies."""
        if len(window) < 512:
            return AnomalyType.UNKNOWN

        # Magnitude spectrum of the window.
        spectrum = np.abs(np.fft.rfft(window))
        freqs = np.fft.rfftfreq(len(window), 1 / self.sample_rate)

        # Energy in low / mid / high bands.
        low_mask = freqs < 300
        mid_mask = (freqs >= 300) & (freqs < 3000)
        high_mask = freqs >= 3000

        low_energy = np.sum(spectrum[low_mask] ** 2)
        mid_energy = np.sum(spectrum[mid_mask] ** 2)
        high_energy = np.sum(spectrum[high_mask] ** 2)

        total = low_energy + mid_energy + high_energy + 1e-10

        # Whisper: little low-frequency energy, relatively strong highs.
        if low_energy / total < 0.1 and high_energy / total > 0.3:
            return AnomalyType.WHISPER

        # Speaker/phone playback: bandwidth-limited, missing highs.
        if high_energy / total < 0.1:
            return AnomalyType.SPEAKER_AUDIO

        # Distant voice: energy dominated by the mid (voice) band.
        if mid_energy / total > 0.5:
            return AnomalyType.DISTANT_VOICE

        return AnomalyType.UNKNOWN

    def _calculate_confidence(self, window: np.ndarray,
                              db: float, threshold: float) -> float:
        """Confidence in [0, 1]: how far the window rises above the threshold."""
        # Saturates 20 dB above the threshold; `window` is currently unused
        # but kept for interface stability.
        db_above = db - threshold
        confidence = min(1.0, db_above / 20)
        return max(0.0, confidence)

    def _merge_anomalies(self, anomalies: List["BackgroundAnomaly"],
                         max_gap: float = 0.5) -> List["BackgroundAnomaly"]:
        """Merge anomalies of the same type separated by less than max_gap s."""
        if not anomalies:
            return []

        anomalies = sorted(anomalies, key=lambda a: a.start)
        merged = [anomalies[0]]

        for anomaly in anomalies[1:]:
            last = merged[-1]

            if (anomaly.anomaly_type == last.anomaly_type and
                    anomaly.start - last.end < max_gap):
                # Extend the last anomaly, keeping the strongest readings.
                merged[-1] = BackgroundAnomaly(
                    start=last.start,
                    end=anomaly.end,
                    anomaly_type=last.anomaly_type,
                    amplitude_db=max(last.amplitude_db, anomaly.amplitude_db),
                    confidence=max(last.confidence, anomaly.confidence)
                )
            else:
                merged.append(anomaly)

        return merged

    def classify_audio_source(self, waveform: np.ndarray) -> "AudioSource":
        """Classify the audio source (direct, speaker, phone) by bandwidth."""
        if len(waveform) < 1024:
            return AudioSource.UNKNOWN

        spectrum = np.abs(np.fft.rfft(waveform))
        freqs = np.fft.rfftfreq(len(waveform), 1 / self.sample_rate)

        total_energy = np.sum(spectrum ** 2)
        if total_energy == 0:
            return AudioSource.UNKNOWN

        # Frequency below which 95% of the energy lies (effective bandwidth).
        cumsum = np.cumsum(spectrum ** 2)
        idx_95 = np.searchsorted(cumsum, 0.95 * total_energy)
        max_freq = freqs[min(idx_95, len(freqs) - 1)]

        # Phone audio typically cuts off around 3.4 kHz.
        if max_freq < 4000:
            return AudioSource.PHONE

        # Loudspeakers typically have limited high-frequency response.
        if max_freq < 8000:
            return AudioSource.SPEAKER

        return AudioSource.DIRECT
src/phase6_synthetic/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .detector import SyntheticDetector, SyntheticResult, PlaybackDetector, PlaybackResult
|
| 2 |
+
from .wake_words import WakeWordDetector, WakeWordDetection, TranscriptionSegment
|
| 3 |
+
|
| 4 |
+
__all__ = [
|
| 5 |
+
'SyntheticDetector', 'SyntheticResult',
|
| 6 |
+
'PlaybackDetector', 'PlaybackResult',
|
| 7 |
+
'WakeWordDetector', 'WakeWordDetection', 'TranscriptionSegment'
|
| 8 |
+
]
|
src/phase6_synthetic/detector.py
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synthetic Voice Detection - detect TTS, AI-generated speech, and playback attacks.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import Tuple
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class SyntheticResult:
    """Outcome of synthetic-voice detection."""
    score: float        # 0 = genuine, 1 = synthetic
    is_synthetic: bool  # score exceeded the decision threshold
    confidence: str     # "high", "medium" or "low"

    @classmethod
    def from_score(cls, score: float, threshold: float = 0.5):
        """Build a result from a raw score, deriving the flag and confidence.

        Scores far from the 0.5 midpoint are considered more trustworthy.
        """
        if score < 0.2 or score > 0.8:
            band = "high"
        elif score < 0.35 or score > 0.65:
            band = "medium"
        else:
            band = "low"
        return cls(score=score, is_synthetic=score > threshold, confidence=band)
+
|
| 31 |
+
@dataclass
class PlaybackResult:
    """Outcome of playback/replay detection."""
    score: float       # 0 = live, 1 = playback
    is_playback: bool  # score exceeded the decision threshold
    confidence: str    # "high", "medium" or "low"
    indicators: list   # names of the triggered playback indicators

    @classmethod
    def from_score(cls, score: float, indicators: list = None, threshold: float = 0.5):
        """Build a result from a raw score plus any triggered indicators.

        Scores far from the 0.5 midpoint are considered more trustworthy.
        """
        if score < 0.2 or score > 0.8:
            band = "high"
        elif score < 0.35 or score > 0.65:
            band = "medium"
        else:
            band = "low"
        return cls(score=score,
                   is_playback=score > threshold,
                   confidence=band,
                   indicators=indicators if indicators else [])
|
| 58 |
+
class SyntheticDetector:
    """
    Detect synthetic/AI-generated speech.

    Note: For MVP this uses simple spectral heuristics. Production use
    should rely on trained anti-spoofing models (e.g. ASVspoof baselines).
    """

    def __init__(self, device: str = None):
        # Device is kept for future model-based detection; the current
        # heuristics run on CPU via librosa/numpy.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.sample_rate = 16000

    def detect(self, waveform: np.ndarray) -> "SyntheticResult":
        """
        Score how likely the waveform is synthetic speech.

        Args:
            waveform: Audio waveform (numpy array), assumed 16 kHz.

        Returns:
            SyntheticResult with score and classification.
        """
        if len(waveform) < self.sample_rate:  # under one second of audio
            return SyntheticResult.from_score(0.5)  # not enough signal: uncertain

        # Extract features that tend to differ between real and synthetic.
        features = self._extract_features(waveform)

        # Simple rule-based scoring (a trained classifier should replace this).
        score = self._calculate_score(features)

        return SyntheticResult.from_score(score)

    def _extract_features(self, waveform: np.ndarray) -> dict:
        """Extract spectral features used by the synthetic-voice heuristics."""
        import librosa

        features = {}

        # 1. Spectral flatness (synthetic speech is often more uniform).
        spectral_flatness = librosa.feature.spectral_flatness(y=waveform)
        features['spectral_flatness_mean'] = np.mean(spectral_flatness)
        features['spectral_flatness_std'] = np.std(spectral_flatness)

        # 2. Zero-crossing-rate variability.
        zcr = librosa.feature.zero_crossing_rate(waveform)
        features['zcr_std'] = np.std(zcr)

        # 3. MFCC variance (real speech varies more frame to frame).
        mfccs = librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=13)
        features['mfcc_var'] = np.mean(np.var(mfccs, axis=1))

        # 4. Pitch variation (synthetic pitch contours are often too regular).
        try:
            pitches, magnitudes = librosa.piptrack(y=waveform, sr=self.sample_rate)
            pitch_values = pitches[magnitudes > np.median(magnitudes)]
            if len(pitch_values) > 0:
                features['pitch_std'] = np.std(pitch_values[pitch_values > 0])
            else:
                features['pitch_std'] = 0
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Keep the best-effort fallback,
            # but only for ordinary errors from pitch tracking.
            features['pitch_std'] = 0

        return features

    def _calculate_score(self, features: dict) -> float:
        """Combine the feature heuristics into a 0-1 synthetic-likelihood score."""
        score = 0.0
        count = 0

        # High spectral flatness => more synthetic.
        if features.get('spectral_flatness_mean', 0) > 0.3:
            score += 0.7
        elif features.get('spectral_flatness_mean', 0) > 0.15:
            score += 0.3
        count += 1

        # Low spectral-flatness variation => more synthetic.
        if features.get('spectral_flatness_std', 0) < 0.05:
            score += 0.6
        count += 1

        # Low MFCC variance => more synthetic.
        if features.get('mfcc_var', 0) < 50:
            score += 0.5
        count += 1

        # Low pitch variation => more synthetic.
        if features.get('pitch_std', 0) < 20:
            score += 0.4
        count += 1

        return score / count if count > 0 else 0.5

    def detect_from_file(self, audio_path: str) -> "SyntheticResult":
        """Load an audio file (resampled to 16 kHz mono) and run detection."""
        import librosa
        waveform, _ = librosa.load(audio_path, sr=self.sample_rate)
        return self.detect(waveform)
|
| 160 |
+
class PlaybackDetector:
|
| 161 |
+
"""
|
| 162 |
+
Detect if audio is being played back through speakers (replay attack).
|
| 163 |
+
|
| 164 |
+
Analyzes:
|
| 165 |
+
- Reverberation characteristics (room acoustics from speaker playback)
|
| 166 |
+
- High frequency roll-off (speakers have limited frequency response)
|
| 167 |
+
- Compression artifacts (from audio encoding)
|
| 168 |
+
- TTS/synthetic voice characteristics
|
| 169 |
+
- Spectral unnaturalness
|
| 170 |
+
- Double-talk/echo patterns
|
| 171 |
+
"""
|
| 172 |
+
|
| 173 |
+
    def __init__(self, sample_rate: int = 16000):
        # Sample rate (Hz) assumed by all spectral checks in this detector.
        self.sample_rate = sample_rate
+
def detect(self, waveform: np.ndarray) -> PlaybackResult:
|
| 177 |
+
"""
|
| 178 |
+
Detect if audio is from speaker playback.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
waveform: Audio waveform (numpy array)
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
PlaybackResult with score and indicators
|
| 185 |
+
"""
|
| 186 |
+
if len(waveform) < self.sample_rate:
|
| 187 |
+
return PlaybackResult.from_score(0.5, ["audio_too_short"])
|
| 188 |
+
|
| 189 |
+
indicators = []
|
| 190 |
+
scores = []
|
| 191 |
+
weights = []
|
| 192 |
+
|
| 193 |
+
# 1. Check for high-frequency roll-off (speakers cut off high frequencies)
|
| 194 |
+
hf_score, hf_indicator = self._check_high_freq_rolloff(waveform)
|
| 195 |
+
scores.append(hf_score)
|
| 196 |
+
weights.append(1.5) # Higher weight - very indicative
|
| 197 |
+
if hf_indicator:
|
| 198 |
+
indicators.append(hf_indicator)
|
| 199 |
+
|
| 200 |
+
# 2. Check for reverberation (speaker playback adds room reverb)
|
| 201 |
+
reverb_score, reverb_indicator = self._check_reverberation(waveform)
|
| 202 |
+
scores.append(reverb_score)
|
| 203 |
+
weights.append(1.0)
|
| 204 |
+
if reverb_indicator:
|
| 205 |
+
indicators.append(reverb_indicator)
|
| 206 |
+
|
| 207 |
+
# 3. Check for compression artifacts
|
| 208 |
+
comp_score, comp_indicator = self._check_compression_artifacts(waveform)
|
| 209 |
+
scores.append(comp_score)
|
| 210 |
+
weights.append(1.2)
|
| 211 |
+
if comp_indicator:
|
| 212 |
+
indicators.append(comp_indicator)
|
| 213 |
+
|
| 214 |
+
# 4. Check for unnatural silence patterns (digital silence)
|
| 215 |
+
silence_score, silence_indicator = self._check_digital_silence(waveform)
|
| 216 |
+
scores.append(silence_score)
|
| 217 |
+
weights.append(0.8)
|
| 218 |
+
if silence_indicator:
|
| 219 |
+
indicators.append(silence_indicator)
|
| 220 |
+
|
| 221 |
+
# 5. Check for clipping (common in playback through speakers)
|
| 222 |
+
clip_score, clip_indicator = self._check_clipping(waveform)
|
| 223 |
+
scores.append(clip_score)
|
| 224 |
+
weights.append(0.7)
|
| 225 |
+
if clip_indicator:
|
| 226 |
+
indicators.append(clip_indicator)
|
| 227 |
+
|
| 228 |
+
# 6. Check for TTS characteristics (ElevenLabs, etc.)
|
| 229 |
+
tts_score, tts_indicator = self._check_tts_characteristics(waveform)
|
| 230 |
+
scores.append(tts_score)
|
| 231 |
+
weights.append(2.0) # High weight - very important
|
| 232 |
+
if tts_indicator:
|
| 233 |
+
indicators.append(tts_indicator)
|
| 234 |
+
|
| 235 |
+
# 7. Check spectral smoothness (TTS has unnaturally smooth spectra)
|
| 236 |
+
smooth_score, smooth_indicator = self._check_spectral_smoothness(waveform)
|
| 237 |
+
scores.append(smooth_score)
|
| 238 |
+
weights.append(1.5)
|
| 239 |
+
if smooth_indicator:
|
| 240 |
+
indicators.append(smooth_indicator)
|
| 241 |
+
|
| 242 |
+
# 8. Check for room acoustics double-bounce
|
| 243 |
+
room_score, room_indicator = self._check_room_acoustics(waveform)
|
| 244 |
+
scores.append(room_score)
|
| 245 |
+
weights.append(1.0)
|
| 246 |
+
if room_indicator:
|
| 247 |
+
indicators.append(room_indicator)
|
| 248 |
+
|
| 249 |
+
# Calculate weighted final score
|
| 250 |
+
if scores:
|
| 251 |
+
final_score = np.average(scores, weights=weights)
|
| 252 |
+
else:
|
| 253 |
+
final_score = 0.5
|
| 254 |
+
|
| 255 |
+
# Boost score if multiple indicators detected
|
| 256 |
+
if len(indicators) >= 3:
|
| 257 |
+
final_score = min(1.0, final_score * 1.2)
|
| 258 |
+
if len(indicators) >= 4:
|
| 259 |
+
final_score = min(1.0, final_score * 1.1)
|
| 260 |
+
|
| 261 |
+
return PlaybackResult.from_score(final_score, indicators, threshold=0.45)
|
| 262 |
+
|
| 263 |
+
def _check_high_freq_rolloff(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for high frequency roll-off typical of speakers.

    Live voice has energy up to 8kHz+, while speakers often cut off
    around 4-6kHz, so a weak high/low band energy ratio is evidence
    of playback through a loudspeaker.

    Args:
        waveform: Mono audio samples (assumed sampled at ``self.sample_rate``).

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Magnitude spectrogram; rows correspond to FFT frequency bins.
    S = np.abs(librosa.stft(waveform))
    freqs = librosa.fft_frequencies(sr=self.sample_rate)

    # Mean energy in the low and high speech bands. (The 2-4kHz mid band
    # computed by an earlier revision was never used and has been dropped.)
    low_band = S[(freqs >= 100) & (freqs < 2000)].mean()
    high_band = S[(freqs >= 4000) & (freqs < 8000)].mean()

    # No low-band energy at all: nothing to compare against, inconclusive.
    if low_band == 0:
        return 0.5, None

    # Ratio of high to low frequency energy; epsilon avoids divide-by-zero.
    high_low_ratio = high_band / (low_band + 1e-10)

    # Very low high-frequency energy suggests speaker playback.
    if high_low_ratio < 0.05:
        return 0.8, "severe_hf_rolloff"
    elif high_low_ratio < 0.15:
        return 0.6, "moderate_hf_rolloff"
    elif high_low_ratio < 0.3:
        return 0.4, None

    return 0.2, None
|
| 295 |
+
|
| 296 |
+
def _check_reverberation(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for excessive reverberation typical of speaker playback.
    Audio played through speakers picks up room acoustics twice.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).

    NOTE(review): the variance/contrast thresholds below look hand-tuned;
    confirm they were calibrated for this project's sample rate.
    """
    import librosa

    # Calculate spectral centroid variance (reverb smooths this)
    centroid = librosa.feature.spectral_centroid(y=waveform, sr=self.sample_rate)
    centroid_var = np.var(centroid)

    # Calculate spectral contrast (reverb reduces contrast)
    contrast = librosa.feature.spectral_contrast(y=waveform, sr=self.sample_rate)
    contrast_mean = np.mean(contrast)

    # Low variance and contrast suggest reverb/playback
    if centroid_var < 100000 and contrast_mean < 15:
        return 0.7, "high_reverb_detected"
    elif centroid_var < 500000 and contrast_mean < 20:
        return 0.5, "moderate_reverb"

    # Nothing suspicious found: low baseline score, no indicator.
    return 0.2, None
|
| 318 |
+
|
| 319 |
+
def _check_compression_artifacts(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for audio compression artifacts (MP3, AAC, etc.).
    Playback often involves compressed audio.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Check for spectral holes (common in lossy compression)
    S = np.abs(librosa.stft(waveform))

    # Count very low energy bins (spectral holes): bins below 0.1% of the
    # spectrogram peak, expressed as a fraction of all bins.
    threshold = np.max(S) * 0.001
    spectral_holes = np.sum(S < threshold) / S.size

    # High number of spectral holes suggests compression
    if spectral_holes > 0.4:
        return 0.7, "compression_artifacts"
    elif spectral_holes > 0.25:
        # Borderline: slight score bump but no named indicator.
        return 0.5, None

    return 0.2, None
|
| 340 |
+
|
| 341 |
+
def _check_digital_silence(self, waveform: np.ndarray) -> Tuple[float, str]:
|
| 342 |
+
"""
|
| 343 |
+
Check for perfectly digital silence (exactly zero values).
|
| 344 |
+
Natural recordings have noise floor, playback may have digital silence.
|
| 345 |
+
"""
|
| 346 |
+
# Count exactly zero samples
|
| 347 |
+
zero_count = np.sum(waveform == 0)
|
| 348 |
+
zero_ratio = zero_count / len(waveform)
|
| 349 |
+
|
| 350 |
+
# Significant perfect zeros suggest digital source
|
| 351 |
+
if zero_ratio > 0.1:
|
| 352 |
+
return 0.8, "digital_silence_detected"
|
| 353 |
+
elif zero_ratio > 0.05:
|
| 354 |
+
return 0.5, "some_digital_silence"
|
| 355 |
+
|
| 356 |
+
return 0.1, None
|
| 357 |
+
|
| 358 |
+
def _check_clipping(self, waveform: np.ndarray) -> Tuple[float, str]:
|
| 359 |
+
"""
|
| 360 |
+
Check for audio clipping (common in speaker playback at high volume).
|
| 361 |
+
"""
|
| 362 |
+
# Normalize
|
| 363 |
+
max_val = np.max(np.abs(waveform))
|
| 364 |
+
if max_val == 0:
|
| 365 |
+
return 0.5, None
|
| 366 |
+
|
| 367 |
+
normalized = waveform / max_val
|
| 368 |
+
|
| 369 |
+
# Count samples at or very near max amplitude
|
| 370 |
+
clip_threshold = 0.99
|
| 371 |
+
clipped_samples = np.sum(np.abs(normalized) > clip_threshold)
|
| 372 |
+
clip_ratio = clipped_samples / len(waveform)
|
| 373 |
+
|
| 374 |
+
if clip_ratio > 0.01:
|
| 375 |
+
return 0.7, "audio_clipping"
|
| 376 |
+
elif clip_ratio > 0.005:
|
| 377 |
+
return 0.5, None
|
| 378 |
+
|
| 379 |
+
return 0.2, None
|
| 380 |
+
|
| 381 |
+
def _check_tts_characteristics(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for TTS/synthetic voice characteristics.
    ElevenLabs and similar TTS have very consistent pitch and timing.

    Two heuristics are tried in order; the first one that fires wins:
    1. Pitch (F0) coefficient of variation — flat pitch suggests TTS.
    2. Onset-interval regularity — metronomic timing suggests TTS.
    Either stage may raise inside librosa (e.g. too-short input); failures
    are swallowed and the next stage (or the 0.3 baseline) is used.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Extract pitch (F0)
    try:
        f0, voiced_flag, voiced_probs = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )
        # pyin marks unvoiced frames as NaN; keep only voiced estimates.
        f0_valid = f0[~np.isnan(f0)]

        # Need a minimum number of voiced frames for a stable statistic.
        if len(f0_valid) > 10:
            # TTS has very low pitch variation
            pitch_std = np.std(f0_valid)
            pitch_mean = np.mean(f0_valid)
            pitch_cv = pitch_std / (pitch_mean + 1e-10)  # Coefficient of variation

            # Natural speech has CV > 0.1, TTS often < 0.08
            if pitch_cv < 0.05:
                return 0.85, "tts_flat_pitch"
            elif pitch_cv < 0.08:
                return 0.7, "tts_low_pitch_variation"
            elif pitch_cv < 0.12:
                return 0.5, None
    except Exception:
        # Best-effort: pitch tracking failed, fall through to timing check.
        pass

    # Check for unnaturally regular timing (TTS has consistent phoneme duration)
    try:
        onset_env = librosa.onset.onset_strength(y=waveform, sr=self.sample_rate)
        onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=self.sample_rate)

        if len(onset_frames) > 5:
            # Frame-index gaps between consecutive onsets.
            intervals = np.diff(onset_frames)
            interval_cv = np.std(intervals) / (np.mean(intervals) + 1e-10)

            # TTS has very regular intervals
            if interval_cv < 0.3:
                return 0.75, "tts_regular_timing"
            elif interval_cv < 0.5:
                return 0.55, None
    except Exception:
        # Best-effort: onset detection failed, fall through to baseline.
        pass

    # Neither heuristic fired: slightly-above-floor baseline score.
    return 0.3, None
|
| 432 |
+
|
| 433 |
+
def _check_spectral_smoothness(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for unnaturally smooth spectrum (common in TTS and compressed playback).

    Measures mel-spectrogram flux (mean absolute frame-to-frame change in
    dB); the lower the flux, the smoother — and more suspicious — the audio.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Mel spectrogram in dB, referenced to its own peak.
    power = librosa.feature.melspectrogram(y=waveform, sr=self.sample_rate, n_mels=128)
    db = librosa.power_to_db(power, ref=np.max)

    # Spectral flux: average frame-to-frame variation across mel bands.
    spectral_flux = np.mean(np.abs(np.diff(db, axis=1)))

    # Walk the threshold ladder from smoothest (most suspicious) down.
    ladder = (
        (3.0, 0.8, "smooth_spectrum"),
        (5.0, 0.6, "slightly_smooth_spectrum"),
        (8.0, 0.4, None),
    )
    for upper_bound, score, indicator in ladder:
        if spectral_flux < upper_bound:
            return score, indicator

    return 0.2, None
|
| 455 |
+
|
| 456 |
+
def _check_room_acoustics(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for room acoustics characteristics from speaker playback.
    Audio played through speakers picks up room reverb.

    Accumulates a score from a 0.3 baseline: +0.3 for low spectral
    bandwidth variance, and up to +0.3 for a low spectral rolloff mean
    (severe rolloff returns immediately with its own indicator).

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Calculate spectral bandwidth variation
    bandwidth = librosa.feature.spectral_bandwidth(y=waveform, sr=self.sample_rate)
    bandwidth_var = np.var(bandwidth)

    # Calculate spectral rolloff
    rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=self.sample_rate)
    rolloff_mean = np.mean(rolloff)

    # Speaker playback tends to have limited bandwidth and lower rolloff
    score = 0.3

    # Low bandwidth variation suggests processed audio
    if bandwidth_var < 50000:
        score += 0.3

    # Low rolloff frequency suggests speaker limitations
    if rolloff_mean < 3000:
        # Severe rolloff: short-circuit with a capped score and its own label.
        score += 0.3
        return min(score, 0.85), "limited_frequency_range"
    elif rolloff_mean < 4500:
        score += 0.15

    # Moderate combined evidence gets the generic room-acoustics label.
    if score > 0.5:
        return score, "room_acoustics_detected"

    return score, None
|
| 489 |
+
|
| 490 |
+
def detect_from_file(self, audio_path: str) -> PlaybackResult:
    """Load *audio_path* at the detector's sample rate and run detection on it."""
    import librosa

    samples, _sr = librosa.load(audio_path, sr=self.sample_rate)
    return self.detect(samples)
|
src/phase6_synthetic/wake_words.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Wake Word Detection - detect voice assistant usage.
|
| 3 |
+
"""
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class WakeWordDetection:
    """A single wake-word hit found in a transcription."""
    word: str  # The wake word that matched, e.g. "alexa"
    assistant: str  # amazon, apple, google, microsoft
    time: float  # Start time (seconds) of the segment containing the match
    confidence: float  # Heuristic match confidence in [0, 1]
    context: str  # Surrounding text
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class TranscriptionSegment:
    """A segment of transcription."""
    start: float  # Segment start time in seconds
    end: float  # Segment end time in seconds
    text: str  # Transcribed text, whitespace-stripped
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class WakeWordDetector:
    """Detect voice assistant wake words using Whisper transcription."""

    # Known wake words per vendor, matched as substrings of the
    # lower-cased transcript (see _calculate_confidence for weighting).
    WAKE_WORDS = {
        "amazon": ["alexa", "echo", "amazon"],
        "apple": ["hey siri", "siri"],
        "google": ["ok google", "hey google", "google"],
        "microsoft": ["cortana", "hey cortana"]
    }

    # Patterns that suggest assistant response
    RESPONSE_PATTERNS = [
        r"here'?s what i found",
        r"according to",
        r"the answer is",
        r"i found this",
        r"let me search",
        r"searching for",
        r"playing .+ by",
        r"the weather (is|today|tomorrow)",
        r"it'?s currently",
        r"\d+ degrees"
    ]

    def __init__(self, model_size: str = "base"):
        """
        Initialize wake word detector.

        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
        """
        self.model_size = model_size
        self._model = None  # loaded lazily by the `model` property

    @property
    def model(self):
        """Lazy load Whisper model on first access and cache it."""
        if self._model is None:
            import whisper
            self._model = whisper.load_model(self.model_size)
        return self._model

    def transcribe(self, audio_path: str,
                   language: str = "en") -> "List[TranscriptionSegment]":
        """
        Transcribe audio file with Whisper.

        Args:
            audio_path: Path to audio file
            language: Language code

        Returns:
            List of transcription segments with start/end times in seconds.
        """
        result = self.model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True
        )

        return [
            TranscriptionSegment(
                start=seg["start"],
                end=seg["end"],
                text=seg["text"].strip()
            )
            for seg in result.get("segments", [])
        ]

    def detect_wake_words(self, segments: "List[TranscriptionSegment]") -> "List[WakeWordDetection]":
        """
        Detect wake words in transcription.

        Args:
            segments: Transcription segments

        Returns:
            Deduplicated list of wake word detections, ordered by time.
        """
        detections = []

        # NOTE: substring matching means e.g. "echo" also matches "echoes";
        # _calculate_confidence down-weights such partial matches to 0.6.
        for segment in segments:
            text_lower = segment.text.lower()

            for assistant, words in self.WAKE_WORDS.items():
                for word in words:
                    if word in text_lower:
                        # Calculate confidence based on word clarity
                        confidence = self._calculate_confidence(word, text_lower)

                        detections.append(WakeWordDetection(
                            word=word,
                            assistant=assistant,
                            time=segment.start,
                            confidence=confidence,
                            # Keep surrounding text so a reviewer can judge
                            # whether this was a real assistant invocation.
                            context=self._get_context(segment, segments)
                        ))

        # Remove duplicates (same word at similar times)
        return self._deduplicate(detections)

    def detect_assistant_responses(self, segments: "List[TranscriptionSegment]") -> List[dict]:
        """
        Detect patterns that suggest assistant responses.

        Args:
            segments: Transcription segments

        Returns:
            List of dicts with 'time', 'end', 'pattern' and 'text' keys,
            at most one per segment.
        """
        responses = []

        for segment in segments:
            text_lower = segment.text.lower()

            for pattern in self.RESPONSE_PATTERNS:
                if re.search(pattern, text_lower):
                    responses.append({
                        'time': segment.start,
                        'end': segment.end,
                        'pattern': pattern,
                        'text': segment.text
                    })
                    break  # One match per segment

        return responses

    def _calculate_confidence(self, word: str, text: str) -> float:
        """Calculate detection confidence for `word` found inside `text`."""
        # Whole-word match (space-delimited) = highest confidence.
        if f" {word} " in f" {text} ":
            return 0.9

        # Word at start of the segment.
        if text.startswith(word):
            return 0.85

        # Substring inside a longer word: possible false positive.
        return 0.6

    def _get_context(self, segment: "TranscriptionSegment",
                     all_segments: "List[TranscriptionSegment]",
                     context_window: float = 5.0) -> str:
        """Concatenate text of all segments starting within `context_window` seconds."""
        context_parts = []

        for s in all_segments:
            if abs(s.start - segment.start) <= context_window:
                context_parts.append(s.text)

        return " ".join(context_parts)

    def _deduplicate(self, detections: "List[WakeWordDetection]",
                     time_threshold: float = 2.0) -> "List[WakeWordDetection]":
        """Collapse same-word detections closer than `time_threshold` seconds,
        keeping the higher-confidence one. Output is sorted by time."""
        if not detections:
            return []

        detections = sorted(detections, key=lambda d: d.time)

        unique = [detections[0]]

        for detection in detections[1:]:
            last = unique[-1]

            if (detection.word == last.word and
                    abs(detection.time - last.time) < time_threshold):
                # Duplicate: keep whichever has higher confidence.
                if detection.confidence > last.confidence:
                    unique[-1] = detection
            else:
                unique.append(detection)

        return unique

    def analyze(self, audio_path: str) -> dict:
        """
        Full wake word analysis: transcribe, then scan for wake words
        and assistant-response phrasings.

        Returns:
            Dict with 'transcription', 'wake_words' and 'assistant_responses'.
            On any failure the three lists are returned empty (best-effort).
        """
        try:
            segments = self.transcribe(audio_path)
            wake_words = self.detect_wake_words(segments)
            responses = self.detect_assistant_responses(segments)

            return {
                'transcription': segments,
                'wake_words': wake_words,
                'assistant_responses': responses
            }
        except Exception as e:
            # Return empty results on error
            print(f"Wake word detection error: {e}")
            return {
                'transcription': [],
                'wake_words': [],
                'assistant_responses': []
            }
|
src/ui/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# UI components placeholder
|