Spaces:
Sleeping
Sleeping
Commit ·
1cb0653
0
Parent(s):
Added AI_Voice_Detector
Browse files- .dockerimage +24 -0
- .gitignore +80 -0
- DockerFile +46 -0
- README.md +488 -0
- app.py +1053 -0
- client.py +209 -0
- detector.py +875 -0
- download_models.py +92 -0
- pytest.ini +4 -0
- requirements.txt +14 -0
- self_learning_train.py +245 -0
- tests/conftest.py +144 -0
- tests/test_api.py +177 -0
- tests/test_feedback.py +67 -0
- tests/test_integration_model.py +60 -0
- tests/test_streaming.py +92 -0
- try.ipynb +0 -0
.dockerimage
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.so
|
| 7 |
+
*.egg
|
| 8 |
+
*.egg-info
|
| 9 |
+
dist
|
| 10 |
+
build
|
| 11 |
+
.pytest_cache
|
| 12 |
+
.coverage
|
| 13 |
+
htmlcov
|
| 14 |
+
.env.local
|
| 15 |
+
.DS_Store
|
| 16 |
+
*.log
|
| 17 |
+
test_audio/
|
| 18 |
+
logs/
|
| 19 |
+
*.md
|
| 20 |
+
.git
|
| 21 |
+
.gitignore
|
| 22 |
+
docker-compose.yml
|
| 23 |
+
test_api.py
|
| 24 |
+
client.py
|
.gitignore
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
AI_voice_dataset/
|
| 2 |
+
Deepfake-audio-detection-V2/
|
| 3 |
+
wav2vec2_finetuned_model/
|
| 4 |
+
wav2vec2-deepfake-voice-detector/
|
| 5 |
+
trained_voice_features.csv
|
| 6 |
+
voice_auth_model.pkl
|
| 7 |
+
|
| 8 |
+
.env
|
| 9 |
+
test.py
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Python
|
| 13 |
+
__pycache__/
|
| 14 |
+
*.py[cod]
|
| 15 |
+
*$py.class
|
| 16 |
+
*.so
|
| 17 |
+
.Python
|
| 18 |
+
build/
|
| 19 |
+
develop-eggs/
|
| 20 |
+
dist/
|
| 21 |
+
downloads/
|
| 22 |
+
eggs/
|
| 23 |
+
.eggs/
|
| 24 |
+
lib/
|
| 25 |
+
lib64/
|
| 26 |
+
parts/
|
| 27 |
+
sdist/
|
| 28 |
+
var/
|
| 29 |
+
wheels/
|
| 30 |
+
*.egg-info/
|
| 31 |
+
.installed.cfg
|
| 32 |
+
*.egg
|
| 33 |
+
|
| 34 |
+
# Virtual Environment
|
| 35 |
+
venv/
|
| 36 |
+
ENV/
|
| 37 |
+
env/
|
| 38 |
+
.venv
|
| 39 |
+
|
| 40 |
+
# IDE
|
| 41 |
+
.vscode/
|
| 42 |
+
.idea/
|
| 43 |
+
*.swp
|
| 44 |
+
*.swo
|
| 45 |
+
*~
|
| 46 |
+
|
| 47 |
+
# Testing
|
| 48 |
+
.pytest_cache/
|
| 49 |
+
.coverage
|
| 50 |
+
htmlcov/
|
| 51 |
+
.tox/
|
| 52 |
+
|
| 53 |
+
# Environment
|
| 54 |
+
.env
|
| 55 |
+
.env.local
|
| 56 |
+
.env.*.local
|
| 57 |
+
|
| 58 |
+
# Logs
|
| 59 |
+
*.log
|
| 60 |
+
logs/
|
| 61 |
+
|
| 62 |
+
# OS
|
| 63 |
+
.DS_Store
|
| 64 |
+
Thumbs.db
|
| 65 |
+
|
| 66 |
+
# Audio files (for testing)
|
| 67 |
+
test_audio/
|
| 68 |
+
*.mp3
|
| 69 |
+
*.wav
|
| 70 |
+
|
| 71 |
+
# Self-learning data
|
| 72 |
+
data/
|
| 73 |
+
|
| 74 |
+
# Docker
|
| 75 |
+
.dockerignore
|
| 76 |
+
|
| 77 |
+
# Temporary files
|
| 78 |
+
*.tmp
|
| 79 |
+
temp/
|
| 80 |
+
tmp/
|
DockerFile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Set up a new user named "user" with user ID 1000 (required by HuggingFace Spaces)
|
| 4 |
+
RUN useradd -m -u 1000 user
|
| 5 |
+
|
| 6 |
+
# Set working directory
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# System dependencies for audio processing
|
| 10 |
+
RUN apt-get update && apt-get install -y \
|
| 11 |
+
libsndfile1 \
|
| 12 |
+
ffmpeg \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy requirements and install as root
|
| 16 |
+
COPY --chown=user requirements.txt /app/requirements.txt
|
| 17 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 18 |
+
pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy application files with correct ownership
|
| 21 |
+
COPY --chown=user app.py /app/
|
| 22 |
+
COPY --chown=user detector.py /app/
|
| 23 |
+
COPY --chown=user self_learning_train.py /app/
|
| 24 |
+
|
| 25 |
+
# Switch to the "user" user
|
| 26 |
+
USER user
|
| 27 |
+
|
| 28 |
+
# Set home to the user's home directory
|
| 29 |
+
ENV HOME=/home/user \
|
| 30 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 31 |
+
PYTHONUNBUFFERED=1
|
| 32 |
+
|
| 33 |
+
# Pre-download models (will be cached in user's home)
|
| 34 |
+
RUN python -c "from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration; \
|
| 35 |
+
print('Downloading models...'); \
|
| 36 |
+
AutoModelForAudioClassification.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector'); \
|
| 37 |
+
AutoFeatureExtractor.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector'); \
|
| 38 |
+
WhisperProcessor.from_pretrained('openai/whisper-base'); \
|
| 39 |
+
WhisperForConditionalGeneration.from_pretrained('openai/whisper-base'); \
|
| 40 |
+
print('Models downloaded successfully')"
|
| 41 |
+
|
| 42 |
+
# Expose HuggingFace Spaces port
|
| 43 |
+
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
# Run with uvicorn (FastAPI)
|
| 46 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🎙️ Voice Detection API
|
| 2 |
+
|
| 3 |
+
A production-ready REST API that detects whether a voice recording is AI-generated or human using hybrid analysis (physics-based + deep learning).
|
| 4 |
+
|
| 5 |
+
## 🌟 Features
|
| 6 |
+
|
| 7 |
+
- ✅ **Multi-language Support**: Tamil, English, Hindi, Malayalam, Telugu
|
| 8 |
+
- ✅ **Hybrid Detection**: Combines physics analysis + Wav2Vec2 deepfake detection
|
| 9 |
+
- ✅ **Language Detection**: Automatic language identification using Whisper
|
| 10 |
+
- ✅ **Secure**: API key authentication
|
| 11 |
+
- ✅ **Fast**: Auto-truncates to 30 seconds for quick processing
|
| 12 |
+
- ✅ **Production Ready**: Docker support, logging, health checks
|
| 13 |
+
- ✅ **Realtime Streaming**: WebSocket streaming with partial results
|
| 14 |
+
- ✅ **Self-Learning Ready**: Feedback collection + calibration training
|
| 15 |
+
|
| 16 |
+
## 📁 Project Structure
|
| 17 |
+
|
| 18 |
+
```
|
| 19 |
+
voice-detection-api/
|
| 20 |
+
├── app.py # Flask API application
|
| 21 |
+
├── detector.py # Your HybridEnsembleDetector class
|
| 22 |
+
├── self_learning_train.py # Calibration training from feedback data
|
| 23 |
+
├── client.py # Example Python client
|
| 24 |
+
├── test_api.py # Automated test suite
|
| 25 |
+
├── requirements.txt # Python dependencies
|
| 26 |
+
├── Dockerfile # Docker configuration
|
| 27 |
+
├── docker-compose.yml # Docker Compose setup
|
| 28 |
+
├── .env # Environment variables
|
| 29 |
+
├── DEPLOYMENT.md # Detailed deployment guide
|
| 30 |
+
└── README.md # This file
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## 🚀 Quick Start
|
| 34 |
+
|
| 35 |
+
### Prerequisites
|
| 36 |
+
|
| 37 |
+
- Python 3.10+
|
| 38 |
+
- pip
|
| 39 |
+
- (Optional) Docker & Docker Compose
|
| 40 |
+
|
| 41 |
+
### Installation
|
| 42 |
+
|
| 43 |
+
1. **Clone the repository**
|
| 44 |
+
```bash
|
| 45 |
+
git clone <your-repo-url>
|
| 46 |
+
cd voice-detection-api
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
2. **Install dependencies**
|
| 50 |
+
```bash
|
| 51 |
+
pip install -r requirements.txt
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
3. **Set up environment variables**
|
| 55 |
+
```bash
|
| 56 |
+
# Copy the example .env file
|
| 57 |
+
cp .env.example .env
|
| 58 |
+
|
| 59 |
+
# Edit .env and set your API key
|
| 60 |
+
nano .env
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
4. **Run the API**
|
| 64 |
+
```bash
|
| 65 |
+
python app.py
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
The API will start at `http://localhost:5000`
|
| 69 |
+
|
| 70 |
+
## 🐳 Docker Deployment (Recommended)
|
| 71 |
+
|
| 72 |
+
### Quick Start with Docker Compose
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
# Start the API
|
| 76 |
+
docker-compose up -d
|
| 77 |
+
|
| 78 |
+
# Check status
|
| 79 |
+
docker-compose ps
|
| 80 |
+
|
| 81 |
+
# View logs
|
| 82 |
+
docker-compose logs -f
|
| 83 |
+
|
| 84 |
+
# Stop the API
|
| 85 |
+
docker-compose down
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Manual Docker Build
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
# Build image
|
| 92 |
+
docker build -t voice-detection-api .
|
| 93 |
+
|
| 94 |
+
# Run container
|
| 95 |
+
docker run -p 5000:5000 \
|
| 96 |
+
-e API_KEY="your_secret_key" \
|
| 97 |
+
voice-detection-api
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
## 📡 API Usage
|
| 101 |
+
|
| 102 |
+
### Health Check
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
curl http://localhost:5000/health
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
### Voice Detection
|
| 109 |
+
|
| 110 |
+
**Using cURL:**
|
| 111 |
+
```bash
|
| 112 |
+
curl -X POST http://localhost:5000/api/voice-detection \
|
| 113 |
+
-H "Content-Type: application/json" \
|
| 114 |
+
-H "x-api-key: sk_test_123456789" \
|
| 115 |
+
-d '{
|
| 116 |
+
"language": "English",
|
| 117 |
+
"audioFormat": "mp3",
|
| 118 |
+
"audioBase64": "'"$(base64 -w 0 your_audio.mp3)"'"
|
| 119 |
+
}'
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
**Using Python Client:**
|
| 123 |
+
```bash
|
| 124 |
+
# Single file
|
| 125 |
+
python client.py --audio test_audio.mp3 --language English
|
| 126 |
+
|
| 127 |
+
# Multiple files
|
| 128 |
+
python client.py \
|
| 129 |
+
--audio file1.mp3 \
|
| 130 |
+
--audio file2.mp3 \
|
| 131 |
+
--language Tamil
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
**Using Python Requests:**
|
| 135 |
+
```python
|
| 136 |
+
import requests
|
| 137 |
+
import base64
|
| 138 |
+
|
| 139 |
+
# Encode audio
|
| 140 |
+
with open('audio.mp3', 'rb') as f:
|
| 141 |
+
audio_base64 = base64.b64encode(f.read()).decode()
|
| 142 |
+
|
| 143 |
+
# Make request
|
| 144 |
+
response = requests.post(
|
| 145 |
+
'http://localhost:5000/api/voice-detection',
|
| 146 |
+
headers={
|
| 147 |
+
'Content-Type': 'application/json',
|
| 148 |
+
'x-api-key': 'sk_test_123456789'
|
| 149 |
+
},
|
| 150 |
+
json={
|
| 151 |
+
'language': 'English',
|
| 152 |
+
'audioFormat': 'mp3',
|
| 153 |
+
'audioBase64': audio_base64
|
| 154 |
+
}
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
result = response.json()
|
| 158 |
+
print(f"Classification: {result['classification']}")
|
| 159 |
+
print(f"Confidence: {result['confidenceScore']}")
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
### Realtime Streaming (WebSocket)
|
| 163 |
+
|
| 164 |
+
Endpoint: `ws://localhost:5000/ws/voice-stream`
|
| 165 |
+
|
| 166 |
+
Authentication:
|
| 167 |
+
- Query param: `?api_key=sk_test_123456789`
|
| 168 |
+
- Or header: `x-api-key` (non-browser clients)
|
| 169 |
+
|
| 170 |
+
Recommended streaming format: `pcm16` (16kHz, mono). This allows partial
|
| 171 |
+
results while the audio is still streaming.
|
| 172 |
+
If you stream `mp3` or `wav`, partial results are disabled and analysis runs
|
| 173 |
+
on the final buffer.
|
| 174 |
+
|
| 175 |
+
**Client -> Server messages:**
|
| 176 |
+
```json
|
| 177 |
+
{ "type": "start", "audioFormat": "pcm16", "sampleRate": 16000, "channels": 1,
|
| 178 |
+
"enablePartial": true, "partialIntervalSec": 10 }
|
| 179 |
+
```
|
| 180 |
+
```json
|
| 181 |
+
{ "type": "audio_chunk", "audioChunkBase64": "<base64_pcm_chunk>" }
|
| 182 |
+
```
|
| 183 |
+
```json
|
| 184 |
+
{ "type": "audio_chunk", "audioChunkBase64": "<base64_pcm_chunk>", "final": true }
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
**Server -> Client messages:**
|
| 188 |
+
```json
|
| 189 |
+
{ "type": "ack", "sessionId": "...", "status": "ready" }
|
| 190 |
+
```
|
| 191 |
+
```json
|
| 192 |
+
{ "type": "progress", "receivedBytes": 12345, "bufferBytes": 12345, "bufferSeconds": 2.1 }
|
| 193 |
+
```
|
| 194 |
+
```json
|
| 195 |
+
{ "type": "partial_result", "result": { "status": "success", "classification": "AI_GENERATED" } }
|
| 196 |
+
```
|
| 197 |
+
```json
|
| 198 |
+
{ "type": "final_result", "result": { "status": "success", "classification": "HUMAN" } }
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
**Browser example:**
|
| 202 |
+
```javascript
|
| 203 |
+
const ws = new WebSocket("ws://localhost:5000/ws/voice-stream?api_key=sk_test_123456789");
|
| 204 |
+
ws.onopen = () => {
|
| 205 |
+
ws.send(JSON.stringify({
|
| 206 |
+
type: "start",
|
| 207 |
+
audioFormat: "pcm16",
|
| 208 |
+
sampleRate: 16000,
|
| 209 |
+
channels: 1,
|
| 210 |
+
enablePartial: true
|
| 211 |
+
}));
|
| 212 |
+
// Send base64-encoded PCM16 chunks as they arrive
|
| 213 |
+
ws.send(JSON.stringify({ type: "audio_chunk", audioChunkBase64: chunkBase64 }));
|
| 214 |
+
ws.send(JSON.stringify({ type: "audio_chunk", audioChunkBase64: lastChunkBase64, final: true }));
|
| 215 |
+
};
|
| 216 |
+
ws.onmessage = (event) => console.log(event.data);
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
### Feedback (Self-Learning)
|
| 220 |
+
|
| 221 |
+
Send labeled audio samples so the model can periodically recalibrate.
|
| 222 |
+
|
| 223 |
+
```bash
|
| 224 |
+
curl -X POST http://localhost:5000/api/feedback \
|
| 225 |
+
-H "Content-Type: application/json" \
|
| 226 |
+
-H "x-api-key: sk_test_123456789" \
|
| 227 |
+
-d '{
|
| 228 |
+
"label": "AI_GENERATED",
|
| 229 |
+
"audioFormat": "mp3",
|
| 230 |
+
"audioBase64": "'"$(base64 -w 0 new_ai_sample.mp3)"'"
|
| 231 |
+
}'
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
Stored samples are written to `data/feedback/<LABEL>/YYYYMMDD/` along with
|
| 235 |
+
metadata JSON files and an index.
|
| 236 |
+
|
| 237 |
+
### Train Calibration (Self-Learning)
|
| 238 |
+
|
| 239 |
+
This trains a lightweight calibration layer using feedback samples:
|
| 240 |
+
```bash
|
| 241 |
+
python self_learning_train.py --data-dir data/feedback --output data/calibration.json
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
If `CALIBRATION_PATH` exists, the API loads it on startup.
|
| 245 |
+
|
| 246 |
+
When retraining, the script will automatically archive the previous calibration
|
| 247 |
+
to `CALIBRATION_HISTORY_DIR` before writing the new file.
|
| 248 |
+
|
| 249 |
+
Reload calibration without restarting the API:
|
| 250 |
+
```bash
|
| 251 |
+
curl -X POST http://localhost:5000/api/reload-calibration \
|
| 252 |
+
-H "x-api-key: sk_test_123456789"
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
Backup the current calibration (creates a timestamped copy):
|
| 256 |
+
```bash
|
| 257 |
+
curl -X POST http://localhost:5000/api/backup-calibration \
|
| 258 |
+
-H "x-api-key: sk_test_123456789" \
|
| 259 |
+
-d '{"reason": "pre_retrain"}'
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
List calibration history:
|
| 263 |
+
```bash
|
| 264 |
+
curl -X GET http://localhost:5000/api/calibration-history \
|
| 265 |
+
-H "x-api-key: sk_test_123456789"
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
Rollback to a previous calibration:
|
| 269 |
+
```bash
|
| 270 |
+
curl -X POST http://localhost:5000/api/rollback-calibration \
|
| 271 |
+
-H "x-api-key: sk_test_123456789" \
|
| 272 |
+
-d '{"versionId": "20260207T120000Z_ab12cd34"}'
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
## 📊 Response Format
|
| 276 |
+
|
| 277 |
+
### Success Response
|
| 278 |
+
```json
|
| 279 |
+
{
|
| 280 |
+
"status": "success",
|
| 281 |
+
"language": "English",
|
| 282 |
+
"classification": "AI_GENERATED",
|
| 283 |
+
"confidenceScore": 0.91,
|
| 284 |
+
"explanation": "Deep learning model detected synthetic voice patterns (confidence: 92.5%)"
|
| 285 |
+
}
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
### Error Response
|
| 289 |
+
```json
|
| 290 |
+
{
|
| 291 |
+
"status": "error",
|
| 292 |
+
"message": "Invalid API key"
|
| 293 |
+
}
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
## 🔑 Authentication
|
| 297 |
+
|
| 298 |
+
All requests to `/api/voice-detection` require an API key in the header:
|
| 299 |
+
|
| 300 |
+
```
|
| 301 |
+
x-api-key: your_api_key_here
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
**Setting API Key:**
|
| 305 |
+
```bash
|
| 306 |
+
# In .env file
|
| 307 |
+
API_KEY=sk_test_123456789
|
| 308 |
+
|
| 309 |
+
# Or as environment variable
|
| 310 |
+
export API_KEY="your_secure_key"
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
## 🧪 Testing
|
| 314 |
+
|
| 315 |
+
### Run Test Suite
|
| 316 |
+
```bash
|
| 317 |
+
pytest
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
### Integration Tests (full model)
|
| 321 |
+
```bash
|
| 322 |
+
RUN_MODEL_TESTS=true pytest -m integration
|
| 323 |
+
```
|
| 324 |
+
Set `AI_MISS_AUDIO_PATH` to point at a known false-negative AI sample to
|
| 325 |
+
track improvements after recalibration.
|
| 326 |
+
|
| 327 |
+
### Manual Testing
|
| 328 |
+
```bash
|
| 329 |
+
# Health check
|
| 330 |
+
curl http://localhost:5000/health
|
| 331 |
+
|
| 332 |
+
# Test with sample audio
|
| 333 |
+
python client.py --audio test_audio.mp3
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
## 📝 Supported Features
|
| 337 |
+
|
| 338 |
+
### Languages
|
| 339 |
+
- Tamil
|
| 340 |
+
- English
|
| 341 |
+
- Hindi
|
| 342 |
+
- Malayalam
|
| 343 |
+
- Telugu
|
| 344 |
+
|
| 345 |
+
### Classifications
|
| 346 |
+
- `AI_GENERATED` - Synthetic/AI voice
|
| 347 |
+
- `HUMAN` - Real human voice
|
| 348 |
+
|
| 349 |
+
### Audio Requirements
|
| 350 |
+
- Format: MP3 only
|
| 351 |
+
- Input: Base64 encoded
|
| 352 |
+
- Max duration: 30 seconds (auto-truncated)
|
| 353 |
+
|
| 354 |
+
## ⚙️ Configuration
|
| 355 |
+
|
| 356 |
+
### Environment Variables
|
| 357 |
+
|
| 358 |
+
| Variable | Default | Description |
|
| 359 |
+
|----------|---------|-------------|
|
| 360 |
+
| `API_KEY` | `sk_test_123456789` | API authentication key |
|
| 361 |
+
| `PORT` | `5000` | Server port |
|
| 362 |
+
| `FLASK_ENV` | `production` | Flask environment |
|
| 363 |
+
| `ENABLE_STREAMING` | `true` | Enable WebSocket streaming endpoint |
|
| 364 |
+
| `STREAMING_MAX_BUFFER_SECONDS` | `30` | Max audio seconds buffered for streaming |
|
| 365 |
+
| `STREAMING_PARTIAL_INTERVAL_SECONDS` | `10` | Partial result interval for streaming |
|
| 366 |
+
| `STREAMING_PARTIAL_MODE` | `physics` | Partial mode: `full`, `physics`, or `dl` |
|
| 367 |
+
| `STREAMING_MAX_CHUNK_BYTES` | `2097152` | Max size per streaming chunk |
|
| 368 |
+
| `ENABLE_FEEDBACK_STORAGE` | `true` | Enable feedback storage for self-learning |
|
| 369 |
+
| `FEEDBACK_STORAGE_DIR` | `data/feedback` | Feedback storage directory |
|
| 370 |
+
| `FEEDBACK_MAX_BYTES` | `15728640` | Max feedback payload size |
|
| 371 |
+
| `CALIBRATION_PATH` | `data/calibration.json` | Calibration file path |
|
| 372 |
+
| `SKIP_MODEL_LOAD` | `false` | Skip loading models at startup (useful for tests) |
|
| 373 |
+
| `CALIBRATION_HISTORY_DIR` | `data/calibration_history` | Calibration backup directory |
|
| 374 |
+
| `CALIBRATION_HISTORY_MAX` | `50` | Max calibration backups retained |
|
| 375 |
+
|
| 376 |
+
### Model Configuration
|
| 377 |
+
|
| 378 |
+
Edit the detector initialization in `app.py`:
|
| 379 |
+
|
| 380 |
+
```python
|
| 381 |
+
detector = HybridEnsembleDetector(
|
| 382 |
+
physics_weight=0.4, # Physics model weight
|
| 383 |
+
dl_weight=0.6, # Deep learning weight
|
| 384 |
+
max_audio_duration=30 # Max seconds to process
|
| 385 |
+
)
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
## 🏗️ Architecture
|
| 389 |
+
|
| 390 |
+
### Detection Pipeline
|
| 391 |
+
|
| 392 |
+
1. **Audio Input** → Base64 MP3
|
| 393 |
+
2. **Preprocessing** → Decode, convert to 16kHz mono
|
| 394 |
+
3. **Language Detection** → Whisper model identifies language
|
| 395 |
+
4. **Physics Analysis** → Acoustic feature extraction
|
| 396 |
+
5. **Deep Learning** → Wav2Vec2 deepfake detection
|
| 397 |
+
6. **Ensemble** → Weighted combination of scores
|
| 398 |
+
7. **Classification** → AI_GENERATED or HUMAN
|
| 399 |
+
|
| 400 |
+
### Models Used
|
| 401 |
+
|
| 402 |
+
- **Deepfake Detector**: `garystafford/wav2vec2-deepfake-voice-detector`
|
| 403 |
+
- **Language Detector**: `openai/whisper-base`
|
| 404 |
+
|
| 405 |
+
## 📈 Performance
|
| 406 |
+
|
| 407 |
+
- **Processing Time**: 2-10 seconds per audio
|
| 408 |
+
- **Memory**: ~2GB RAM minimum
|
| 409 |
+
- **Accuracy**: Varies by language and audio quality
|
| 410 |
+
- **Throughput**: ~5-10 requests/minute per worker
|
| 411 |
+
|
| 412 |
+
## 🔧 Troubleshooting
|
| 413 |
+
|
| 414 |
+
### Models Not Loading
|
| 415 |
+
```bash
|
| 416 |
+
# Pre-download models
|
| 417 |
+
python -c "from transformers import AutoModelForAudioClassification; \
|
| 418 |
+
AutoModelForAudioClassification.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector')"
|
| 419 |
+
```
|
| 420 |
+
|
| 421 |
+
### Port Already in Use
|
| 422 |
+
```bash
|
| 423 |
+
# Change port in .env
|
| 424 |
+
PORT=8000
|
| 425 |
+
|
| 426 |
+
# Or use environment variable
|
| 427 |
+
PORT=8000 python app.py
|
| 428 |
+
```
|
| 429 |
+
|
| 430 |
+
### Memory Issues
|
| 431 |
+
- Reduce `max_audio_duration` to 15 seconds
|
| 432 |
+
- Use fewer Docker workers
|
| 433 |
+
- Increase system RAM
|
| 434 |
+
|
| 435 |
+
## 📖 Documentation
|
| 436 |
+
|
| 437 |
+
- **Full Deployment Guide**: See [DEPLOYMENT.md](DEPLOYMENT.md)
|
| 438 |
+
- **API Reference**: See API section above
|
| 439 |
+
- **Model Details**: See `detector.py` comments
|
| 440 |
+
|
| 441 |
+
## 🛡️ Security Notes
|
| 442 |
+
|
| 443 |
+
- Never commit API keys to version control
|
| 444 |
+
- Use strong, random API keys in production
|
| 445 |
+
- Enable HTTPS/TLS for production deployments
|
| 446 |
+
- Implement rate limiting for production use
|
| 447 |
+
- Regularly update dependencies
|
| 448 |
+
|
| 449 |
+
## 🚀 Production Deployment
|
| 450 |
+
|
| 451 |
+
### Using Gunicorn
|
| 452 |
+
```bash
|
| 453 |
+
gunicorn --bind 0.0.0.0:5000 --workers 2 --timeout 120 app:app
|
| 454 |
+
```
|
| 455 |
+
|
| 456 |
+
### With Nginx Reverse Proxy
|
| 457 |
+
See [DEPLOYMENT.md](DEPLOYMENT.md) for Nginx configuration
|
| 458 |
+
|
| 459 |
+
### Cloud Platforms
|
| 460 |
+
- AWS: EC2 + Docker or Elastic Beanstalk
|
| 461 |
+
- Google Cloud: Cloud Run or Compute Engine
|
| 462 |
+
- Azure: App Service or Container Instances
|
| 463 |
+
- Heroku: Supports Python + Docker
|
| 464 |
+
|
| 465 |
+
## 📞 Support
|
| 466 |
+
|
| 467 |
+
For issues or questions:
|
| 468 |
+
1. Check [DEPLOYMENT.md](DEPLOYMENT.md)
|
| 469 |
+
2. Run test suite: `python test_api.py`
|
| 470 |
+
3. Check logs: `docker-compose logs`
|
| 471 |
+
|
| 472 |
+
## 📄 License
|
| 473 |
+
|
| 474 |
+
This project uses open-source models:
|
| 475 |
+
- Wav2Vec2: Apache 2.0
|
| 476 |
+
- Whisper: MIT
|
| 477 |
+
|
| 478 |
+
## 🙏 Credits
|
| 479 |
+
|
| 480 |
+
- **Models**: HuggingFace transformers
|
| 481 |
+
- **Framework**: Flask
|
| 482 |
+
- **Audio Processing**: Librosa, SoundFile
|
| 483 |
+
|
| 484 |
+
---
|
| 485 |
+
|
| 486 |
+
**Version**: 1.0.0
|
| 487 |
+
**Status**: Production Ready ✅
|
| 488 |
+
**Last Updated**: February 2026
|
app.py
ADDED
|
@@ -0,0 +1,1053 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Detection API - Flask Application (HuggingFace Spaces Version)
|
| 3 |
+
Accepts Base64-encoded MP3 audio and returns AI vs Human classification
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from flask import Flask, request, jsonify
|
| 7 |
+
from flask_cors import CORS
|
| 8 |
+
from flask_sock import Sock
|
| 9 |
+
from functools import wraps
|
| 10 |
+
import base64
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
import logging
|
| 14 |
+
import shutil
|
| 15 |
+
import tempfile
|
| 16 |
+
import uuid
|
| 17 |
+
import wave
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
from urllib.parse import parse_qs
|
| 20 |
+
|
| 21 |
+
# Import the detector
|
| 22 |
+
from detector import HybridEnsembleDetector
|
| 23 |
+
|
| 24 |
+
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize Flask app
app = Flask(__name__)
CORS(app)  # allow cross-origin browser clients to call the API
sock = Sock(app)  # WebSocket support for the /ws/voice-stream endpoint

# Load API key from environment variable (HuggingFace Secrets)
# NOTE(review): the fallback 'sk_test_123456789' is a publicly visible default —
# confirm API_KEY is always set via secrets in production deployments.
API_KEY = os.environ.get('API_KEY', 'sk_test_123456789')
logger.info(f"API initialized with key: {API_KEY[:10]}...")
|
| 39 |
+
|
| 40 |
+
def parse_bool(value, default=False):
    """Coerce an arbitrary value to a boolean.

    None falls back to *default*; real booleans pass through unchanged;
    anything else is stringified and matched against common truthy
    spellings ("1", "true", "yes", "y", "on"), case-insensitively.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    truthy_spellings = {"1", "true", "yes", "y", "on"}
    return str(value).strip().lower() in truthy_spellings
|
| 46 |
+
|
| 47 |
+
# Streaming configuration
STREAMING_ENABLED = parse_bool(os.environ.get("ENABLE_STREAMING", "true"))
# Rolling-buffer cap (seconds) applied to pcm16 WebSocket streams.
STREAMING_MAX_BUFFER_SECONDS = int(os.environ.get("STREAMING_MAX_BUFFER_SECONDS", 30))
# How often an interim (partial) analysis may be emitted during a stream.
STREAMING_PARTIAL_INTERVAL_SECONDS = float(os.environ.get("STREAMING_PARTIAL_INTERVAL_SECONDS", 10))
STREAMING_PARTIAL_MODE = os.environ.get("STREAMING_PARTIAL_MODE", "physics").lower()
# Upper bound on a single WebSocket audio chunk (bytes).
STREAMING_MAX_CHUNK_BYTES = int(os.environ.get("STREAMING_MAX_CHUNK_BYTES", 2 * 1024 * 1024))
STREAMING_SUPPORTED_FORMATS = {"pcm16", "wav", "mp3"}

# Self-learning / feedback configuration
ENABLE_FEEDBACK_STORAGE = parse_bool(os.environ.get("ENABLE_FEEDBACK_STORAGE", "true"))
FEEDBACK_STORAGE_DIR = os.environ.get("FEEDBACK_STORAGE_DIR", "data/feedback")
# Per-sample upload size cap (bytes) for /api/feedback.
FEEDBACK_MAX_BYTES = int(os.environ.get("FEEDBACK_MAX_BYTES", 15 * 1024 * 1024))
CALIBRATION_PATH = os.environ.get("CALIBRATION_PATH", "data/calibration.json")
CALIBRATION_HISTORY_DIR = os.environ.get("CALIBRATION_HISTORY_DIR", "data/calibration_history")
# Maximum number of calibration snapshots retained; older ones are pruned.
CALIBRATION_HISTORY_MAX = int(os.environ.get("CALIBRATION_HISTORY_MAX", 50))

# Initialize the detector globally (load models once at startup)
logger.info("Loading AI detection models...")
detector = None
# When true, model loading is deferred to the first request instead of boot.
SKIP_MODEL_LOAD = parse_bool(os.environ.get("SKIP_MODEL_LOAD", "false"))
|
| 67 |
+
|
| 68 |
+
def init_detector():
    """Initialize the global HybridEnsembleDetector.

    The deepfake model location is read from the DEEPFAKE_MODEL_PATH
    environment variable (defaulting to a repo-relative directory) so the
    same code runs locally and on HuggingFace Spaces. The previous
    hard-coded absolute Windows path (``D:\\hackathons\\...``) only existed
    on one development machine and guaranteed startup failure on the
    deployment target this file is written for.

    Returns:
        bool: True when the detector loaded successfully, False otherwise
        (the failure is logged; callers fall back to lazy loading).
    """
    global detector
    # Configurable model location; the default matches the repo layout
    # referenced in .gitignore (wav2vec2-deepfake-voice-detector/).
    deepfake_model_path = os.environ.get(
        "DEEPFAKE_MODEL_PATH",
        "wav2vec2-deepfake-voice-detector"
    )
    try:
        detector = HybridEnsembleDetector(
            deepfake_model_path=deepfake_model_path,
            whisper_model_path="openai/whisper-base",
            physics_weight=0.4,
            dl_weight=0.6,
            use_local_deepfake_model=True,
            use_local_whisper_model=False,
            calibration_path=CALIBRATION_PATH,
            max_audio_duration=30
        )
        logger.info("✅ Detector initialized successfully")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to initialize detector: {str(e)}")
        return False
|
| 87 |
+
|
| 88 |
+
# Initialize detector at startup
if SKIP_MODEL_LOAD:
    logger.info("⚠️ Skipping detector initialization (SKIP_MODEL_LOAD=true)")
elif not init_detector():
    # Non-fatal: voice_detection() retries via lazy init on first request.
    logger.warning("⚠️ API starting without detector - models will be loaded on first request")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ==========================================================
|
| 96 |
+
# AUTHENTICATION DECORATOR
|
| 97 |
+
# ==========================================================
|
| 98 |
+
def require_api_key(f):
    """Decorator enforcing the shared API key on an endpoint.

    A missing 'x-api-key' header yields 401; a mismatching key yields
    403; both attempts are logged with the caller's address.
    """
    @wraps(f)
    def decorated_function(*args, **kwargs):
        supplied = request.headers.get('x-api-key')

        if not supplied:
            logger.warning(f"Request without API key from {request.remote_addr}")
            return jsonify({
                "status": "error",
                "message": "Missing API key. Please provide 'x-api-key' in request headers."
            }), 401

        if supplied != API_KEY:
            logger.warning(f"Invalid API key attempt from {request.remote_addr}")
            return jsonify({
                "status": "error",
                "message": "Invalid API key"
            }), 403

        return f(*args, **kwargs)

    return decorated_function
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_ws_api_key(environ):
    """Extract an API key from a WebSocket WSGI environ.

    Checks, in order: the x-api-key header, a Bearer Authorization
    header, and an api_key query-string parameter. Returns None when
    no key is present.
    """
    if not environ:
        return None

    direct = environ.get("HTTP_X_API_KEY")
    if direct:
        return direct

    auth_header = environ.get("HTTP_AUTHORIZATION")
    if auth_header and auth_header.lower().startswith("bearer "):
        return auth_header.split(" ", 1)[1]

    keys = parse_qs(environ.get("QUERY_STRING", "")).get("api_key")
    if keys:
        return keys[0]

    return None
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def normalize_label(label):
    """Map assorted label spellings onto canonical AI_GENERATED / HUMAN.

    Returns None for unrecognized or missing labels.
    """
    if label is None:
        return None
    canonical = str(label).strip().upper()
    if canonical in ("AI_GENERATED", "AI", "FAKE", "SYNTHETIC"):
        return "AI_GENERATED"
    if canonical in ("HUMAN", "REAL"):
        return "HUMAN"
    return None
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def decode_audio_base64(audio_base64):
    """Decode a base64 audio payload, handling optional data: URI prefixes.

    Returns a tuple (raw_bytes, detected_format) where detected_format is
    "wav", "mp3", or None when the payload carried no recognizable MIME
    header.
    """
    detected_format = None
    if isinstance(audio_base64, str) and audio_base64.startswith("data:"):
        header, audio_base64 = audio_base64.split(",", 1)
        mime = header.lower()
        if "audio/wav" in mime or "audio/x-wav" in mime:
            detected_format = "wav"
        elif "audio/mpeg" in mime or "audio/mp3" in mime:
            detected_format = "mp3"
    return base64.b64decode(audio_base64), detected_format
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def write_bytes_to_temp_file(data, suffix):
    """Persist *data* to a fresh temporary file and return its path.

    The file is left on disk (delete=False); the caller owns cleanup.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        handle.write(data)
        return handle.name
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def write_pcm16_to_wav_file(pcm_bytes, sample_rate, channels):
    """Wrap raw 16-bit little-endian PCM samples in a WAV container.

    A trailing odd byte (an incomplete sample) is discarded. Returns the
    path of the temporary .wav file; the caller must delete it.
    """
    # Samples are 2 bytes wide; drop a dangling half-sample if present.
    usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
    pcm_bytes = pcm_bytes[:usable]

    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    wav_path = handle.name
    handle.close()

    with wave.open(wav_path, "wb") as wav_out:
        wav_out.setnchannels(channels)
        wav_out.setsampwidth(2)  # 16-bit samples
        wav_out.setframerate(sample_rate)
        wav_out.writeframes(pcm_bytes)

    return wav_path
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def format_detection_payload(result, requested_language=None):
    """Convert a detector result dict into the streaming/API response shape.

    Failed results collapse to {"status": "error", "message": ...};
    successful ones expose classification, confidence, explanation,
    detected language, and analysis mode, plus the caller's requested
    language when supplied.
    """
    if result.get("status") != "success":
        message = result.get("error") or result.get("message") or "Unknown error"
        return {"status": "error", "message": message}

    payload = {
        "status": "success",
        "classification": result.get("classification"),
        "confidenceScore": result.get("confidenceScore"),
        "explanation": result.get("explanation"),
        "detectedLanguage": result.get("language", "Unknown"),
        "analysisMode": result.get("analysisMode", "full"),
    }
    if requested_language:
        payload["requestedLanguage"] = requested_language
    return payload
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def ensure_dir(path):
    """Create *path* (with parents) if given; a falsy path is a no-op."""
    if not path:
        return
    os.makedirs(path, exist_ok=True)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def build_calibration_version_id():
    """Return a unique, time-sortable version id.

    Format: UTC timestamp (YYYYMMDDTHHMMSSZ) + '_' + 8 random hex chars.
    """
    stamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    return f"{stamp}_{uuid.uuid4().hex[:8]}"
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def calibration_history_files():
    """List archived calibration snapshot paths, newest first.

    Matches calibration_*.json inside CALIBRATION_HISTORY_DIR while
    skipping the companion *.meta.json sidecar files.
    """
    if not os.path.isdir(CALIBRATION_HISTORY_DIR):
        return []

    snapshots = [
        os.path.join(CALIBRATION_HISTORY_DIR, name)
        for name in os.listdir(CALIBRATION_HISTORY_DIR)
        if name.startswith("calibration_")
        and name.endswith(".json")
        and not name.endswith(".meta.json")
    ]
    snapshots.sort(key=os.path.getmtime, reverse=True)
    return snapshots
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def archive_calibration(reason=None):
    """Snapshot the live calibration file into the history directory.

    Copies CALIBRATION_PATH to a timestamped calibration_<version>.json,
    writes a .meta.json sidecar describing when and why, then prunes the
    oldest snapshots beyond CALIBRATION_HISTORY_MAX. Returns a dict with
    "versionId" and "path", or None when no calibration file exists.
    """
    if not os.path.exists(CALIBRATION_PATH):
        return None

    ensure_dir(CALIBRATION_HISTORY_DIR)
    version_id = build_calibration_version_id()
    dest_path = os.path.join(CALIBRATION_HISTORY_DIR, f"calibration_{version_id}.json")
    shutil.copy2(CALIBRATION_PATH, dest_path)

    # Sidecar metadata recording provenance of the snapshot.
    meta = {
        "versionId": version_id,
        "source": CALIBRATION_PATH,
        "archivedAt": datetime.utcnow().isoformat() + "Z",
        "reason": reason or "manual"
    }
    sidecar_path = os.path.join(CALIBRATION_HISTORY_DIR, f"calibration_{version_id}.meta.json")
    with open(sidecar_path, "w", encoding="utf-8") as handle:
        json.dump(meta, handle, indent=2)

    # Retention: keep only the newest CALIBRATION_HISTORY_MAX snapshots.
    if CALIBRATION_HISTORY_MAX > 0:
        for stale in calibration_history_files()[CALIBRATION_HISTORY_MAX:]:
            try:
                os.unlink(stale)
            except Exception:
                pass  # best-effort pruning
            stale_meta = stale.replace(".json", ".meta.json")
            if os.path.exists(stale_meta):
                try:
                    os.unlink(stale_meta)
                except Exception:
                    pass

    return {
        "versionId": version_id,
        "path": dest_path
    }
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def list_calibration_history():
    """Describe every archived calibration snapshot, newest first.

    Each entry carries versionId, path, and (when the sidecar is present
    and readable) archivedAt / reason; unreadable sidecars degrade to
    None values rather than failing.
    """
    entries = []
    for path in calibration_history_files():
        version_id = os.path.basename(path).replace("calibration_", "").replace(".json", "")
        meta = {}
        sidecar = path.replace(".json", ".meta.json")
        if os.path.exists(sidecar):
            try:
                with open(sidecar, "r", encoding="utf-8") as handle:
                    meta = json.load(handle)
            except Exception:
                meta = {}  # tolerate corrupt sidecars
        entries.append({
            "versionId": version_id,
            "path": path,
            "archivedAt": meta.get("archivedAt"),
            "reason": meta.get("reason")
        })
    return entries
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def resolve_history_path(version_id):
    """Map a calibration version id to its snapshot path (None if no id)."""
    if not version_id:
        return None
    return os.path.join(CALIBRATION_HISTORY_DIR, f"calibration_{version_id}.json")
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class StreamSession:
    """Per-connection state for the realtime voice-stream WebSocket.

    Accumulates incoming audio chunks in a rolling buffer. For raw pcm16
    audio the buffer is capped at max_seconds (oldest bytes dropped) and
    partial analyses can be triggered on a timed interval; for mp3/wav
    the bytes are kept verbatim and durations are not tracked.
    """

    def __init__(
        self,
        audio_format,
        sample_rate,
        channels,
        max_seconds,
        enable_partial,
        partial_interval_seconds,
        partial_mode
    ):
        self.session_id = str(uuid.uuid4())
        self.audio_format = audio_format
        self.sample_rate = sample_rate
        self.channels = channels
        self.max_seconds = max_seconds
        self.enable_partial = enable_partial
        self.partial_interval_seconds = partial_interval_seconds
        self.partial_mode = partial_mode
        self.buffer = bytearray()          # rolling window of raw audio bytes
        self.total_bytes_received = 0      # lifetime byte count (never trimmed)
        self.total_seconds_received = 0.0  # lifetime duration (pcm16 only)
        self.last_partial_seconds = 0.0    # stream position of last partial run

    def _pcm_bytes_per_second(self):
        # 16-bit samples: 2 bytes per sample, per channel.
        return self.sample_rate * self.channels * 2

    def add_chunk(self, chunk_bytes):
        """Append a chunk; trim pcm16 buffers to the rolling window.

        Returns the buffered duration in seconds (pcm16) or None.
        """
        self.total_bytes_received += len(chunk_bytes)
        self.buffer.extend(chunk_bytes)

        if self.audio_format == "pcm16":
            rate = self._pcm_bytes_per_second()
            if rate > 0:
                self.total_seconds_received = self.total_bytes_received / rate
                cap = int(self.max_seconds * rate)
                if cap > 0 and len(self.buffer) > cap:
                    # Drop the oldest bytes so only the newest window remains.
                    del self.buffer[:len(self.buffer) - cap]

        return self.current_buffer_seconds()

    def current_buffer_seconds(self):
        """Duration of buffered audio in seconds (pcm16 only, else None)."""
        if self.audio_format != "pcm16":
            return None
        rate = self._pcm_bytes_per_second()
        if rate <= 0:
            return None
        return len(self.buffer) / rate

    def should_run_partial(self):
        """True when enough new pcm16 audio arrived for a partial analysis.

        Advances the partial checkpoint as a side effect when it fires.
        """
        if not self.enable_partial or self.audio_format != "pcm16":
            return False
        if self.partial_interval_seconds <= 0:
            return False
        if (self.total_seconds_received - self.last_partial_seconds) >= self.partial_interval_seconds:
            self.last_partial_seconds = self.total_seconds_received
            return True
        return False

    def write_temp_audio_file(self):
        """Dump the current buffer to a temp file; returns (path, format)."""
        if self.audio_format == "pcm16":
            return write_pcm16_to_wav_file(self.buffer, self.sample_rate, self.channels), "wav"

        suffix = ".mp3" if self.audio_format == "mp3" else ".wav"
        return write_bytes_to_temp_file(self.buffer, suffix), self.audio_format
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# ==========================================================
|
| 375 |
+
# ROOT ENDPOINT (HuggingFace Spaces Homepage)
|
| 376 |
+
# ==========================================================
|
| 377 |
+
@app.route('/', methods=['GET'])
def home():
    """Root endpoint: static service metadata and an endpoint directory."""
    endpoint_map = {
        "health": "/health",
        "detection": "/api/voice-detection",
        "streaming": "/ws/voice-stream",
        "feedback": "/api/feedback",
        "reload_calibration": "/api/reload-calibration",
        "backup_calibration": "/api/backup-calibration",
        "rollback_calibration": "/api/rollback-calibration",
        "calibration_history": "/api/calibration-history"
    }
    info = {
        "name": "Voice Detection API",
        "version": "1.0.0",
        "description": "AI-powered voice detection system for identifying AI-generated vs human voices",
        "endpoints": endpoint_map,
        "supported_languages": ["Tamil", "English", "Hindi", "Malayalam", "Telugu"],
        "authentication": "Required - use 'x-api-key' header",
        "documentation": "See README for full API documentation"
    }
    return jsonify(info), 200
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
# ==========================================================
|
| 401 |
+
# HEALTH CHECK ENDPOINT
|
| 402 |
+
# ==========================================================
|
| 403 |
+
@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint (no authentication required)."""
    calibration_ready = bool(
        detector and detector.calibrator and detector.calibrator.ready
    )
    return jsonify({
        "status": "healthy",
        "service": "Voice Detection API",
        "timestamp": datetime.utcnow().isoformat(),
        "models_loaded": detector is not None,
        "calibration_loaded": calibration_ready,
        "streaming_enabled": STREAMING_ENABLED,
        "platform": "HuggingFace Spaces"
    }), 200
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ==========================================================
|
| 418 |
+
# MAIN VOICE DETECTION ENDPOINT
|
| 419 |
+
# ==========================================================
|
| 420 |
+
@app.route('/api/voice-detection', methods=['POST'])
@require_api_key
def voice_detection():
    """
    Main voice detection endpoint

    Expected JSON Body:
    {
        "language": "Tamil" | "English" | "Hindi" | "Malayalam" | "Telugu",
        "audioFormat": "mp3",
        "audioBase64": "base64_encoded_audio_string"
    }

    Returns:
    {
        "status": "success",
        "language": "Tamil",
        "classification": "AI_GENERATED" | "HUMAN",
        "confidenceScore": 0.0-1.0,
        "explanation": "..."
    }

    Error responses use {"status": "error", "message": ...} with
    400 (bad request), 503 (models unavailable) or 500 (analysis/internal
    failure).
    """
    global detector

    try:
        # Validate Content-Type
        if not request.is_json:
            return jsonify({
                "status": "error",
                "message": "Content-Type must be application/json"
            }), 400

        # Get request data
        data = request.get_json()

        # Validate required fields
        required_fields = ['language', 'audioFormat', 'audioBase64']
        missing_fields = [field for field in required_fields if field not in data]

        if missing_fields:
            return jsonify({
                "status": "error",
                "message": f"Missing required fields: {', '.join(missing_fields)}"
            }), 400

        # Validate language
        supported_languages = ['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu']
        if data['language'] not in supported_languages:
            return jsonify({
                "status": "error",
                "message": f"Unsupported language. Must be one of: {', '.join(supported_languages)}"
            }), 400

        # Validate audio format (this REST endpoint accepts mp3 only; the
        # WebSocket stream additionally handles wav/pcm16)
        if data['audioFormat'].lower() != 'mp3':
            return jsonify({
                "status": "error",
                "message": "Only MP3 audio format is supported"
            }), 400

        # Validate base64 string
        # NOTE(review): the <100-char check is a heuristic minimum payload
        # size, not a base64 validity check — decoding errors surface later.
        audio_base64 = data['audioBase64']
        if not audio_base64 or len(audio_base64) < 100:
            return jsonify({
                "status": "error",
                "message": "Invalid or empty audio data"
            }), 400

        # Initialize detector if not already loaded (lazy init covers the
        # SKIP_MODEL_LOAD=true startup path and earlier init failures)
        if detector is None:
            logger.info("Lazy loading detector on first request...")
            if not init_detector():
                return jsonify({
                    "status": "error",
                    "message": "Failed to load AI detection models. Please try again later."
                }), 503

        # Log request
        logger.info(f"Processing voice detection request for language: {data['language']}")

        # Analyze audio
        result = detector.analyze(
            audio_base64,
            input_type="base64",
            audio_format=data['audioFormat']
        )

        # Check if analysis was successful
        if result['status'] != 'success':
            error_msg = result.get('error', 'Unknown error during analysis')
            logger.error(f"Analysis failed: {error_msg}")
            return jsonify({
                "status": "error",
                "message": f"Audio analysis failed: {error_msg}"
            }), 500

        # Prepare response (API compliant format - NO DEBUG INFO in production)
        response = {
            "status": "success",
            "language": data['language'],  # Use requested language from input
            "classification": result['classification'],
            "confidenceScore": result['confidenceScore'],
            "explanation": result['explanation']
        }

        logger.info(f"✅ Analysis complete: {result['classification']} (confidence: {result['confidenceScore']})")

        return jsonify(response), 200

    except Exception as e:
        # Catch-all boundary: log with traceback, return a generic 500
        # without leaking internals to the client.
        logger.error(f"Unexpected error in voice_detection: {str(e)}", exc_info=True)
        return jsonify({
            "status": "error",
            "message": "Internal server error occurred during processing"
        }), 500
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
# ==========================================================
|
| 538 |
+
# FEEDBACK / SELF-LEARNING ENDPOINT
|
| 539 |
+
# ==========================================================
|
| 540 |
+
@app.route('/api/feedback', methods=['POST'])
@require_api_key
def feedback():
    """
    Collect labeled audio samples for periodic self-learning.

    Expected JSON Body:
    {
        "label": "AI_GENERATED" | "HUMAN",
        "audioFormat": "mp3" | "wav",
        "audioBase64": "base64_encoded_audio_string",
        "runDetection": false,
        "metadata": { ... }
    }

    Stores the decoded audio under FEEDBACK_STORAGE_DIR/<label>/<date>/,
    writes a per-sample JSON sidecar, and appends the metadata to a
    shared index.jsonl. When runDetection is truthy, the sample is also
    scored by the detector and the scores are recorded in the metadata.
    """
    if not ENABLE_FEEDBACK_STORAGE:
        return jsonify({
            "status": "error",
            "message": "Feedback storage is disabled"
        }), 403

    if not request.is_json:
        return jsonify({
            "status": "error",
            "message": "Content-Type must be application/json"
        }), 400

    data = request.get_json()
    label = normalize_label(data.get("label"))
    if not label:
        return jsonify({
            "status": "error",
            "message": "Invalid label. Use AI_GENERATED or HUMAN."
        }), 400

    audio_format = str(data.get("audioFormat", "mp3")).lower()
    if audio_format not in ["mp3", "wav"]:
        return jsonify({
            "status": "error",
            "message": "audioFormat must be 'mp3' or 'wav'"
        }), 400

    audio_base64 = data.get("audioBase64")
    if not audio_base64 or len(audio_base64) < 100:
        return jsonify({
            "status": "error",
            "message": "Invalid or empty audio data"
        }), 400

    try:
        audio_bytes, detected_format = decode_audio_base64(audio_base64)
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": f"Failed to decode audio: {str(e)}"
        }), 400

    # A data: URI MIME header, when present, overrides the declared format.
    if detected_format:
        audio_format = detected_format

    if len(audio_bytes) > FEEDBACK_MAX_BYTES:
        return jsonify({
            "status": "error",
            "message": "Audio payload exceeds maximum size"
        }), 413

    # Layout: FEEDBACK_STORAGE_DIR/<label>/<YYYYMMDD>/<uuid>.<ext>
    now = datetime.utcnow()
    date_dir = now.strftime("%Y%m%d")
    label_dir = os.path.join(FEEDBACK_STORAGE_DIR, label, date_dir)
    os.makedirs(label_dir, exist_ok=True)

    sample_id = str(uuid.uuid4())
    extension = ".mp3" if audio_format == "mp3" else ".wav"
    file_path = os.path.join(label_dir, f"{sample_id}{extension}")

    with open(file_path, "wb") as handle:
        handle.write(audio_bytes)

    metadata = {
        "id": sample_id,
        "label": label,
        "audio_format": audio_format,
        "created_at": now.isoformat() + "Z",
        "bytes": len(audio_bytes),
        "path": file_path,
        "client_metadata": data.get("metadata", {})
    }

    # Optionally score the sample now so training data carries model scores.
    if parse_bool(data.get("runDetection", False)):
        global detector
        if detector is None:
            logger.info("Lazy loading detector for feedback scoring...")
            if not init_detector():
                return jsonify({
                    "status": "error",
                    "message": "Failed to load AI detection models for scoring"
                }), 503

        scores = detector.extract_scores(file_path, input_type="file")
        if scores.get("status") == "success":
            metadata["physics_score"] = scores.get("physics_score")
            metadata["dl_score"] = scores.get("dl_score")
            metadata["dl_label"] = scores.get("dl_label")
            metadata["audio_duration"] = scores.get("audio_duration")
            metadata["was_truncated"] = scores.get("was_truncated")

    # Per-sample sidecar next to the audio file.
    meta_path = os.path.join(label_dir, f"{sample_id}.json")
    with open(meta_path, "w", encoding="utf-8") as handle:
        json.dump(metadata, handle, indent=2)

    # Append-only global index for the self-learning trainer.
    index_path = os.path.join(FEEDBACK_STORAGE_DIR, "index.jsonl")
    with open(index_path, "a", encoding="utf-8") as handle:
        handle.write(json.dumps(metadata) + "\n")

    return jsonify({
        "status": "success",
        "id": sample_id,
        "label": label,
        "audioFormat": audio_format,
        "stored": True
    }), 200
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
# ==========================================================
|
| 664 |
+
# CALIBRATION RELOAD ENDPOINT
|
| 665 |
+
# ==========================================================
|
| 666 |
+
@app.route('/api/reload-calibration', methods=['POST'])
@require_api_key
def reload_calibration():
    """Hot-reload the calibration file into the live detector."""
    global detector

    if detector is None:
        logger.info("Lazy loading detector for calibration reload...")
        if not init_detector():
            return jsonify({
                "status": "error",
                "message": "Failed to load AI detection models"
            }), 503

    if not detector.reload_calibration(CALIBRATION_PATH):
        return jsonify({
            "status": "error",
            "message": "Calibration file not found or invalid"
        }), 404

    return jsonify({
        "status": "success",
        "calibrationPath": detector.calibrator.calibration_path
    }), 200
|
| 690 |
+
|
| 691 |
+
|
| 692 |
+
@app.route('/api/backup-calibration', methods=['POST'])
@require_api_key
def backup_calibration():
    """Archive the current calibration file into the history directory."""
    body = request.get_json(silent=True) or {}

    if not os.path.exists(CALIBRATION_PATH):
        return jsonify({
            "status": "error",
            "message": "Calibration file not found"
        }), 404

    backup = archive_calibration(reason=body.get("reason") or "api_backup")
    if not backup:
        return jsonify({
            "status": "error",
            "message": "Failed to archive calibration"
        }), 500

    return jsonify({
        "status": "success",
        "versionId": backup["versionId"],
        "path": backup["path"]
    }), 200
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
@app.route('/api/calibration-history', methods=['GET'])
@require_api_key
def calibration_history():
    """Return metadata for all archived calibration versions."""
    return jsonify({
        "status": "success",
        "history": list_calibration_history()
    }), 200
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
@app.route('/api/rollback-calibration', methods=['POST'])
@require_api_key
def rollback_calibration():
    """Restore an archived calibration version and hot-reload it.

    Expected JSON Body: {"versionId": "<archived version id>"}.
    Copies the snapshot over CALIBRATION_PATH, then reloads it into the
    detector. 400 on missing versionId, 404 on unknown version, 503 if
    models cannot load, 500 if the restored file fails to load.
    """
    payload = request.get_json(silent=True) or {}
    version_id = payload.get("versionId")

    if not version_id:
        return jsonify({
            "status": "error",
            "message": "Missing versionId"
        }), 400

    source_path = resolve_history_path(version_id)
    if not source_path or not os.path.exists(source_path):
        return jsonify({
            "status": "error",
            "message": "Calibration version not found"
        }), 404

    # Overwrite the live calibration with the archived snapshot.
    ensure_dir(os.path.dirname(CALIBRATION_PATH))
    shutil.copy2(source_path, CALIBRATION_PATH)

    global detector
    if detector is None:
        logger.info("Lazy loading detector for rollback...")
        if not init_detector():
            return jsonify({
                "status": "error",
                "message": "Failed to load AI detection models"
            }), 503

    loaded = detector.reload_calibration(CALIBRATION_PATH)
    if not loaded:
        # The file was already copied; only the in-memory reload failed.
        return jsonify({
            "status": "error",
            "message": "Failed to load calibration after rollback"
        }), 500

    return jsonify({
        "status": "success",
        "versionId": version_id,
        "calibrationPath": CALIBRATION_PATH
    }), 200
|
| 771 |
+
|
| 772 |
+
|
| 773 |
+
# ==========================================================
|
| 774 |
+
# REALTIME STREAMING ENDPOINT (WEBSOCKET)
|
| 775 |
+
# ==========================================================
|
| 776 |
+
@sock.route('/ws/voice-stream')
def voice_stream(ws):
    """WebSocket handler for realtime voice-stream analysis.

    Protocol (JSON text frames):
      - ``{"type": "start", ...}``      configure a session (format/rate/...)
      - ``{"type": "audio_chunk", ...}`` append base64 audio; may set "final"
      - ``{"type": "stop"}``            finalize and run the full analysis
      - ``{"type": "ping"}``            liveness check, answered with "pong"

    The handler replies with "ack", "progress", "partial_result",
    "final_result", or "error" frames. The connection loop ends when the
    client disconnects, a fatal error occurs, or a final result is sent.
    """
    if not STREAMING_ENABLED:
        ws.send(json.dumps({
            "type": "error",
            "message": "Streaming is disabled"
        }))
        return

    # Authenticate once per connection using a key pulled from the WS environ.
    api_key = get_ws_api_key(ws.environ)
    if api_key != API_KEY:
        ws.send(json.dumps({
            "type": "error",
            "message": "Invalid API key"
        }))
        return

    session = None              # created on the first "start" message
    requested_language = None   # optional language hint from "start"

    while True:
        message = ws.receive()
        if message is None:
            # Client closed the socket.
            break

        try:
            payload = json.loads(message)
        except Exception:
            ws.send(json.dumps({
                "type": "error",
                "message": "Invalid JSON message"
            }))
            continue

        msg_type = payload.get("type")

        if msg_type == "start":
            # Only one session per connection is allowed.
            if session is not None:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Stream already started"
                }))
                continue

            # Normalize common PCM16 format aliases before validation.
            audio_format = str(payload.get("audioFormat", "pcm16")).lower()
            if audio_format in ["pcm_s16le", "s16le", "pcm16le"]:
                audio_format = "pcm16"
            if audio_format not in STREAMING_SUPPORTED_FORMATS:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Unsupported audioFormat for streaming"
                }))
                continue

            sample_rate = int(payload.get("sampleRate", 16000))
            channels = int(payload.get("channels", 1))
            # Raw PCM carries no header, so rate/channels must be sane.
            if audio_format == "pcm16":
                if sample_rate <= 0 or channels <= 0:
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "sampleRate and channels must be positive for pcm16"
                    }))
                    continue
                if channels not in [1, 2]:
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "channels must be 1 or 2 for pcm16"
                    }))
                    continue
            requested_language = payload.get("language")
            enable_partial = parse_bool(payload.get("enablePartial", True))
            partial_interval = float(payload.get("partialIntervalSec", STREAMING_PARTIAL_INTERVAL_SECONDS))
            max_seconds = int(payload.get("maxSeconds", STREAMING_MAX_BUFFER_SECONDS))
            # Partial analyses may run a cheaper mode; silently fall back
            # to "physics" for unknown values rather than rejecting.
            partial_mode = str(payload.get("partialMode", STREAMING_PARTIAL_MODE)).lower()
            if partial_mode not in ["full", "physics", "dl"]:
                partial_mode = "physics"

            session = StreamSession(
                audio_format=audio_format,
                sample_rate=sample_rate,
                channels=channels,
                max_seconds=max_seconds,
                enable_partial=enable_partial,
                partial_interval_seconds=partial_interval,
                partial_mode=partial_mode
            )

            # Echo back the effective settings so the client can confirm.
            ws.send(json.dumps({
                "type": "ack",
                "status": "ready",
                "sessionId": session.session_id,
                "streaming": {
                    "audioFormat": audio_format,
                    "sampleRate": sample_rate,
                    "channels": channels,
                    "maxSeconds": max_seconds,
                    "partialIntervalSec": partial_interval,
                    "partialMode": partial_mode,
                    "enablePartial": enable_partial
                }
            }))
            continue

        if msg_type == "ping":
            ws.send(json.dumps({"type": "pong"}))
            continue

        if msg_type not in ["audio_chunk", "stop"]:
            ws.send(json.dumps({
                "type": "error",
                "message": "Unsupported message type"
            }))
            continue

        # Both "audio_chunk" and "stop" require an active session.
        if session is None:
            ws.send(json.dumps({
                "type": "error",
                "message": "Stream not started"
            }))
            continue

        # "stop" is treated as a chunk-less final message.
        finalize_only = False
        if msg_type == "stop":
            payload["final"] = True
            finalize_only = True

        chunk_b64 = payload.get("audioChunkBase64")
        chunk_bytes = None
        if not chunk_b64:
            # A missing chunk is only an error for plain audio_chunk frames;
            # "stop" legitimately carries no audio.
            if not finalize_only:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Missing audioChunkBase64"
                }))
                continue
        else:
            try:
                chunk_bytes = base64.b64decode(chunk_b64)
            except Exception:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Invalid base64 audio chunk"
                }))
                continue

            if len(chunk_bytes) > STREAMING_MAX_CHUNK_BYTES:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Audio chunk exceeds maximum size"
                }))
                continue

            # Accumulate the chunk and report buffer state back to the client.
            buffer_seconds = session.add_chunk(chunk_bytes)
            ws.send(json.dumps({
                "type": "progress",
                "receivedBytes": session.total_bytes_received,
                "bufferBytes": len(session.buffer),
                "bufferSeconds": buffer_seconds
            }))

            # Periodic partial analysis while the stream is still running.
            if session.should_run_partial():
                if detector is None:
                    logger.info("Lazy loading detector for streaming...")
                    if not init_detector():
                        ws.send(json.dumps({
                            "type": "error",
                            "message": "Failed to load AI detection models"
                        }))
                        break

                # The detector consumes files, so snapshot the buffer to a
                # temp file and always clean it up afterwards.
                temp_path = None
                try:
                    temp_path, file_format = session.write_temp_audio_file()
                    result = detector.analyze(
                        temp_path,
                        input_type="file",
                        audio_format=file_format,
                        analysis_mode=session.partial_mode
                    )
                    ws.send(json.dumps({
                        "type": "partial_result",
                        "result": format_detection_payload(result, requested_language=requested_language)
                    }))
                finally:
                    if temp_path and os.path.exists(temp_path):
                        try:
                            os.unlink(temp_path)
                        except Exception:
                            pass

        # Final pass: runs for "stop" or for an audio_chunk flagged final.
        if parse_bool(payload.get("final", False)):
            if not session.buffer:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "No audio received"
                }))
                break

            if detector is None:
                logger.info("Lazy loading detector for streaming...")
                if not init_detector():
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "Failed to load AI detection models"
                    }))
                    break

            # The final analysis always uses the "full" mode regardless of
            # the partial_mode configured at session start.
            temp_path = None
            try:
                temp_path, file_format = session.write_temp_audio_file()
                result = detector.analyze(
                    temp_path,
                    input_type="file",
                    audio_format=file_format,
                    analysis_mode="full"
                )
                ws.send(json.dumps({
                    "type": "final_result",
                    "result": format_detection_payload(result, requested_language=requested_language)
                }))
            finally:
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.unlink(temp_path)
                    except Exception:
                        pass
            break
|
| 1003 |
+
|
| 1004 |
+
|
| 1005 |
+
# ==========================================================
|
| 1006 |
+
# ERROR HANDLERS
|
| 1007 |
+
# ==========================================================
|
| 1008 |
+
@app.errorhandler(404)
def not_found(error):
    """Respond with a JSON body when no route matches the request."""
    body = {
        "status": "error",
        "message": "Endpoint not found"
    }
    return jsonify(body), 404
|
| 1015 |
+
|
| 1016 |
+
|
| 1017 |
+
@app.errorhandler(405)
def method_not_allowed(error):
    """Respond with a JSON body when the HTTP method is unsupported."""
    body = {
        "status": "error",
        "message": "Method not allowed for this endpoint"
    }
    return jsonify(body), 405
|
| 1024 |
+
|
| 1025 |
+
|
| 1026 |
+
@app.errorhandler(500)
def internal_error(error):
    """Log the unexpected failure, then respond with a generic JSON error."""
    logger.error(f"Internal server error: {str(error)}")
    body = {
        "status": "error",
        "message": "Internal server error"
    }
    return jsonify(body), 500
|
| 1034 |
+
|
| 1035 |
+
|
| 1036 |
+
# ==========================================================
|
| 1037 |
+
# RUN APPLICATION
|
| 1038 |
+
# ==========================================================
|
| 1039 |
+
if __name__ == '__main__':
    # HuggingFace Spaces routes traffic to port 7860 by default;
    # allow PORT to override for local runs.
    port = int(os.environ.get('PORT', 7860))

    logger.info(f"🚀 Starting Voice Detection API on port {port}")
    logger.info(f"📍 Endpoint: http://0.0.0.0:{port}/api/voice-detection")
    # SECURITY FIX: never write the raw API key to the logs — anyone with
    # log access could replay it. Only record whether a key is configured.
    logger.info(f"🔑 API Key configured: {'yes' if API_KEY else 'no'}")
    logger.info("🌐 Platform: HuggingFace Spaces")

    # Run the development server; debug stays off in production.
    app.run(
        host='0.0.0.0',
        port=port,
        debug=False  # Always False in production
    )
|
client.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example Client Script for Voice Detection API
|
| 3 |
+
Demonstrates how to use the API from Python
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
import base64
|
| 8 |
+
import json
|
| 9 |
+
import argparse
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
class VoiceDetectionClient:
    """Client for interacting with the Voice Detection API."""

    def __init__(self, api_url, api_key):
        """
        Initialize the client.

        Args:
            api_url: Base URL of the API (e.g., http://localhost:5000);
                a trailing slash is stripped so paths join cleanly.
            api_key: API authentication key sent in the x-api-key header.
        """
        self.api_url = api_url.rstrip('/')
        self.api_key = api_key
        self.headers = {
            'Content-Type': 'application/json',
            'x-api-key': self.api_key
        }

    def check_health(self):
        """Check if the API is healthy.

        Returns:
            dict: The decoded /health response, or an error dict if the
            request fails for any reason.
        """
        try:
            response = requests.get(f"{self.api_url}/health", timeout=5)
            return response.json()
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def detect_voice(self, audio_path, language="English"):
        """
        Detect if a voice recording is AI-generated or human.

        Args:
            audio_path: Path to an MP3 audio file.
            language: Language of the audio
                (Tamil/English/Hindi/Malayalam/Telugu).

        Returns:
            dict: API response, or an error dict for local failures
            (missing file, bad language, unreadable audio, network error).
        """
        # Validate file exists before doing any work.
        if not Path(audio_path).exists():
            return {"status": "error", "message": f"File not found: {audio_path}"}

        # Validate language client-side so bad requests never hit the API.
        supported_languages = ['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu']
        if language not in supported_languages:
            return {
                "status": "error",
                "message": f"Unsupported language. Use: {', '.join(supported_languages)}"
            }

        # Read and base64-encode the audio for the JSON payload.
        try:
            with open(audio_path, 'rb') as f:
                audio_data = f.read()
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        except Exception as e:
            return {"status": "error", "message": f"Failed to read audio file: {str(e)}"}

        payload = {
            "language": language,
            "audioFormat": "mp3",
            "audioBase64": audio_base64
        }

        # Send the request; map transport failures to error dicts so the
        # caller always receives a uniform shape.
        try:
            response = requests.post(
                f"{self.api_url}/api/voice-detection",
                headers=self.headers,
                json=payload,
                timeout=120  # 2 minutes timeout
            )

            return response.json()

        except requests.exceptions.Timeout:
            return {"status": "error", "message": "Request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "message": "Could not connect to API"}
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def print_result(self, result):
        """Pretty-print an API response.

        BUG FIX: the previous version indexed result['language'],
        result['classification'], result['confidenceScore'] and
        result['explanation'] directly, so any success payload missing one
        of those keys crashed the client with a KeyError. All reads now use
        .get() with safe defaults.
        """
        print("\n" + "="*70)
        print("🎙️ VOICE DETECTION RESULT")
        print("="*70)

        if result.get('status') == 'success':
            classification = result.get('classification', 'UNKNOWN')
            confidence = float(result.get('confidenceScore', 0.0))

            print(f"✅ Status: {result['status'].upper()}")
            print(f"🌐 Language: {result.get('language', 'Unknown')}")
            print(f"🎯 Classification: {classification}")
            print(f"📊 Confidence Score: {confidence:.2f} / 1.00")
            print(f"💡 Explanation: {result.get('explanation', '')}")

            # Human-readable interpretation of the score bands.
            print("\n" + "-"*70)
            if classification == 'AI_GENERATED':
                print("⚠️ This voice appears to be AI-generated or synthetic")
                if confidence > 0.8:
                    print(" High confidence - Strong indicators of AI generation")
                elif confidence > 0.65:
                    print(" Medium confidence - Multiple suspicious patterns detected")
                else:
                    print(" Low confidence - Some indicators present but not conclusive")
            else:
                print("✅ This voice appears to be human/real")
                if confidence < 0.35:
                    print(" High confidence - Strong human characteristics")
                elif confidence < 0.5:
                    print(" Medium confidence - Mostly human patterns")
                else:
                    print(" Low confidence - Close to threshold")
        else:
            print(f"❌ Status: ERROR")
            print(f"💬 Message: {result.get('message', 'Unknown error')}")

        print("="*70 + "\n")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
    """CLI entry point: parse arguments, then run a health check or
    classify one or more audio files via VoiceDetectionClient."""
    parser = argparse.ArgumentParser(
        description='Voice Detection API Client',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check API health
  python client.py --health

  # Detect single audio file
  python client.py --audio test_audio.mp3 --language English

  # Process multiple files
  python client.py --audio file1.mp3 --audio file2.mp3 --language Tamil

  # Use custom API URL and key
  python client.py --audio test.mp3 --url http://api.example.com --key your_api_key
    """
    )

    parser.add_argument(
        '--url',
        default='http://localhost:5000',
        help='API base URL (default: http://localhost:5000)'
    )

    # NOTE(review): shipping a default API key in a CLI tool is only safe
    # for local testing; confirm this is never a production credential.
    parser.add_argument(
        '--key',
        default='sk_test_123456789',
        help='API key (default: sk_test_123456789)'
    )

    parser.add_argument(
        '--health',
        action='store_true',
        help='Check API health'
    )

    # 'append' lets the flag be repeated to queue multiple files.
    parser.add_argument(
        '--audio',
        action='append',
        help='Path to MP3 audio file (can be used multiple times)'
    )

    parser.add_argument(
        '--language',
        default='English',
        choices=['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu'],
        help='Language of the audio (default: English)'
    )

    args = parser.parse_args()

    # Initialize client
    client = VoiceDetectionClient(args.url, args.key)

    # --health short-circuits: print the health payload and exit.
    if args.health:
        print("🏥 Checking API health...")
        health = client.check_health()
        print(json.dumps(health, indent=2))
        return

    # Process each requested audio file in order; otherwise show usage.
    if args.audio:
        for audio_file in args.audio:
            print(f"\n🎵 Processing: {audio_file}")
            print(f" Language: {args.language}")

            result = client.detect_voice(audio_file, args.language)
            client.print_result(result)
    else:
        parser.print_help()
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# Script entry point: only run the CLI when executed directly.
if __name__ == '__main__':
    main()
|
detector.py
ADDED
|
@@ -0,0 +1,875 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import scipy.stats as stats
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
|
| 7 |
+
import base64
|
| 8 |
+
import io
|
| 9 |
+
import json
|
| 10 |
+
import math
|
| 11 |
+
import tempfile
|
| 12 |
+
import os
|
| 13 |
+
import soundfile as sf
|
| 14 |
+
import warnings
|
| 15 |
+
|
| 16 |
+
# Suppress librosa warnings
|
| 17 |
+
warnings.filterwarnings('ignore')
|
| 18 |
+
|
| 19 |
+
class ScoreCalibrator:
    """
    Small logistic calibration layer that maps the physics score and the
    deep-learning score onto a single calibrated probability.
    """

    def __init__(self, calibration_path=None):
        self.calibration_path = calibration_path
        self.ready = False
        self.weights = None
        self.bias = 0.0
        self.threshold = 0.5
        self.metadata = {}

        # Load eagerly when a path is supplied at construction time.
        if calibration_path:
            self.load(calibration_path)

    def load(self, path=None):
        """Read calibration weights from a JSON file.

        Falls back to the stored calibration_path when no path is given.
        Returns True on success; on any failure the calibrator is marked
        not ready and False is returned.
        """
        target = path or self.calibration_path
        if not target or not os.path.exists(target):
            self.ready = False
            return False

        try:
            with open(target, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except Exception:
            # Unreadable or malformed file — keep the calibrator disabled.
            self.ready = False
            return False

        raw_weights = payload.get("weights")
        has_valid_weights = isinstance(raw_weights, list) and len(raw_weights) == 2
        if not has_valid_weights:
            self.ready = False
            return False

        self.weights = [float(raw_weights[0]), float(raw_weights[1])]
        self.bias = float(payload.get("bias", 0.0))
        self.threshold = float(payload.get("threshold", 0.5))
        self.metadata = payload
        self.calibration_path = target
        self.ready = True
        return True

    def predict(self, physics_score, dl_score):
        """Return the calibrated probability, or None when not loaded."""
        if not self.ready or self.weights is None:
            return None

        logit = self.bias + self.weights[0] * physics_score + self.weights[1] * dl_score
        # Numerically stable sigmoid: pick the branch whose exp() argument
        # is non-positive so it can never overflow.
        if logit < 0:
            scaled = math.exp(logit)
            return float(scaled / (1.0 + scaled))
        return float(1.0 / (1.0 + math.exp(-logit)))
|
| 74 |
+
|
| 75 |
+
class HybridEnsembleDetector:
|
| 76 |
+
"""
|
| 77 |
+
Hybrid AI Voice Detection System with Language Detection
|
| 78 |
+
|
| 79 |
+
Features:
|
| 80 |
+
1. Physics-based acoustic analysis
|
| 81 |
+
2. Deep Learning deepfake detection
|
| 82 |
+
3. Language identification using Whisper (focus on Indian languages)
|
| 83 |
+
4. Auto-truncation to 30 seconds for faster processing
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
    def __init__(
        self,
        deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=False,
        use_local_whisper_model=False,
        calibration_path=None,
        max_audio_duration=30  # seconds
    ):
        """
        Initialize the hybrid detector.

        Loads (1) a wav2vec2 deepfake-classification model and (2) a Whisper
        model used for language identification. Either load may fail: a DL
        failure degrades to physics-only scoring (dl_weight forced to 0),
        and a Whisper failure disables language detection (lang_ready=False).

        Args:
            deepfake_model_path: HF hub id or local path of the deepfake model
            whisper_model_path: HF hub id or local path of the Whisper model
            physics_weight: Weight for physics score (0-1); normalized below
            dl_weight: Weight for DL score (0-1); normalized below
            use_local_deepfake_model: Load deepfake model with local_files_only
            use_local_whisper_model: Load Whisper with local_files_only
            calibration_path: Optional path to calibration JSON file
            max_audio_duration: Maximum audio duration to process (seconds)
        """
        # Prefer GPU when available; all models are moved to this device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_duration = max_audio_duration

        # Normalize weights so they always sum to 1 regardless of inputs.
        total_weight = physics_weight + dl_weight
        self.physics_weight = physics_weight / total_weight
        self.dl_weight = dl_weight / total_weight

        # Optional logistic calibration over (physics, dl) scores.
        self.calibrator = ScoreCalibrator(calibration_path)
        if self.calibrator.ready:
            print(f" Calibration loaded from: {self.calibrator.calibration_path}")

        print(f"🔧 Initializing Hybrid Detector with Language Detection")
        print(f" Device: {self.device}")
        print(f" Physics Weight: {self.physics_weight*100:.0f}%")
        print(f" DL Weight: {self.dl_weight*100:.0f}%")
        print(f" Max Audio Duration: {self.max_duration}s")

        # --- LOAD DEEPFAKE DETECTION MODEL ---
        try:
            print(f"📥 Loading deepfake detection model from '{deepfake_model_path}'...")

            if use_local_deepfake_model:
                # local_files_only avoids any network access at load time.
                self.dl_model = AutoModelForAudioClassification.from_pretrained(
                    deepfake_model_path,
                    local_files_only=True
                )
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    deepfake_model_path,
                    local_files_only=True
                )
            else:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(deepfake_model_path)
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(deepfake_model_path)

            self.dl_model.to(self.device)
            self.dl_model.eval()  # inference mode only
            self.dl_ready = True
            print("✅ Deepfake Detection Model Loaded")

        except Exception as e:
            # Graceful degradation: score with physics features alone.
            print(f"⚠️ DL Model Load Failed: {e}")
            print(" Running in Physics-Only mode")
            self.dl_ready = False
            self.dl_weight = 0
            self.physics_weight = 1.0

        # --- LOAD WHISPER FOR LANGUAGE DETECTION ---
        try:
            print(f"📥 Loading Whisper model for language detection from '{whisper_model_path}'...")

            if use_local_whisper_model:
                self.whisper_processor = WhisperProcessor.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
            else:
                self.whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)

            self.whisper_model.to(self.device)
            self.whisper_model.eval()
            self.lang_ready = True
            print("✅ Whisper Language Detection Model Loaded")

            # Whisper language codes -> display names, covering Indian
            # languages plus English. Codes not listed here will need a
            # fallback wherever this map is consumed.
            self.language_map = {
                'hi': 'Hindi',
                'bn': 'Bengali',
                'te': 'Telugu',
                'mr': 'Marathi',
                'ta': 'Tamil',
                'gu': 'Gujarati',
                'kn': 'Kannada',
                'ml': 'Malayalam',
                'or': 'Odia',
                'pa': 'Punjabi',
                'as': 'Assamese',
                'ur': 'Urdu',
                'en': 'English',
                'ne': 'Nepali',
                'si': 'Sinhala',
                'sa': 'Sanskrit',
                'sd': 'Sindhi',
                'ks': 'Kashmiri'
            }

        except Exception as e:
            # NOTE(review): on this path self.language_map is never set —
            # confirm that no consumer reads it when lang_ready is False.
            print(f"⚠️ Whisper Model Load Failed: {e}")
            print(" Running without language detection")
            self.lang_ready = False

        # --- PHYSICS ENGINE PARAMETERS ---
        # Coefficient-of-variation cutoffs: below CV_AI_THRESHOLD leans AI,
        # above CV_HUMAN_THRESHOLD leans human (exact use is in the physics
        # analysis methods, not visible here).
        self.CV_AI_THRESHOLD = 0.20
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.05
        self.INTENSITY_MAX_STD = 0.15

        print("✅ Hybrid Detector Ready\n")
|
| 213 |
+
|
| 214 |
+
def reload_calibration(self, calibration_path=None):
|
| 215 |
+
"""
|
| 216 |
+
Reload calibration weights from disk.
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
calibration_path: Optional override path
|
| 220 |
+
|
| 221 |
+
Returns:
|
| 222 |
+
bool: True if calibration loaded
|
| 223 |
+
"""
|
| 224 |
+
if self.calibrator is None:
|
| 225 |
+
self.calibrator = ScoreCalibrator(calibration_path)
|
| 226 |
+
return self.calibrator.ready
|
| 227 |
+
return self.calibrator.load(calibration_path)
|
| 228 |
+
|
| 229 |
+
# ==========================================================
|
| 230 |
+
# HELPER: Audio Preprocessing
|
| 231 |
+
# ==========================================================
|
| 232 |
+
def preprocess_audio(self, audio_path, target_sr=16000):
|
| 233 |
+
"""
|
| 234 |
+
Load and preprocess audio:
|
| 235 |
+
1. Load audio
|
| 236 |
+
2. Convert to mono
|
| 237 |
+
3. Truncate to max_duration if needed
|
| 238 |
+
4. Resample to target_sr
|
| 239 |
+
|
| 240 |
+
Args:
|
| 241 |
+
audio_path: Path to audio file
|
| 242 |
+
target_sr: Target sample rate
|
| 243 |
+
|
| 244 |
+
Returns:
|
| 245 |
+
tuple: (waveform_array, sample_rate, duration, was_truncated)
|
| 246 |
+
"""
|
| 247 |
+
try:
|
| 248 |
+
# Load audio
|
| 249 |
+
y, sr = librosa.load(audio_path, sr=None, mono=True)
|
| 250 |
+
|
| 251 |
+
# Calculate duration
|
| 252 |
+
duration = len(y) / sr
|
| 253 |
+
was_truncated = False
|
| 254 |
+
|
| 255 |
+
# Truncate if longer than max_duration
|
| 256 |
+
if duration > self.max_duration:
|
| 257 |
+
print(f" ⚠️ Audio is {duration:.1f}s, truncating to {self.max_duration}s")
|
| 258 |
+
max_samples = int(self.max_duration * sr)
|
| 259 |
+
y = y[:max_samples]
|
| 260 |
+
duration = self.max_duration
|
| 261 |
+
was_truncated = True
|
| 262 |
+
|
| 263 |
+
# Resample if needed
|
| 264 |
+
if sr != target_sr:
|
| 265 |
+
y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
|
| 266 |
+
sr = target_sr
|
| 267 |
+
|
| 268 |
+
return y, sr, duration, was_truncated
|
| 269 |
+
|
| 270 |
+
except Exception as e:
|
| 271 |
+
raise ValueError(f"Failed to preprocess audio: {str(e)}")
|
| 272 |
+
|
| 273 |
+
# ==========================================================
|
| 274 |
+
# HELPER: Base64 Decoding
|
| 275 |
+
# ==========================================================
|
| 276 |
+
def decode_base64_audio(self, base64_string, audio_format="mp3"):
|
| 277 |
+
"""
|
| 278 |
+
Decode base64 audio and save to temporary file
|
| 279 |
+
|
| 280 |
+
Args:
|
| 281 |
+
base64_string: Base64 encoded audio data
|
| 282 |
+
|
| 283 |
+
Returns:
|
| 284 |
+
str: Path to temporary audio file
|
| 285 |
+
"""
|
| 286 |
+
try:
|
| 287 |
+
detected_format = audio_format
|
| 288 |
+
if isinstance(base64_string, str) and base64_string.startswith("data:"):
|
| 289 |
+
header, base64_string = base64_string.split(",", 1)
|
| 290 |
+
header_lower = header.lower()
|
| 291 |
+
if "audio/wav" in header_lower or "audio/x-wav" in header_lower:
|
| 292 |
+
detected_format = "wav"
|
| 293 |
+
elif "audio/mpeg" in header_lower or "audio/mp3" in header_lower:
|
| 294 |
+
detected_format = "mp3"
|
| 295 |
+
|
| 296 |
+
# Decode base64
|
| 297 |
+
audio_data = base64.b64decode(base64_string)
|
| 298 |
+
|
| 299 |
+
file_suffix = ".wav" if str(detected_format).lower() in ["wav", "wave"] else ".mp3"
|
| 300 |
+
|
| 301 |
+
# Create temporary file
|
| 302 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix)
|
| 303 |
+
temp_file.write(audio_data)
|
| 304 |
+
temp_file.close()
|
| 305 |
+
|
| 306 |
+
return temp_file.name
|
| 307 |
+
|
| 308 |
+
except Exception as e:
|
| 309 |
+
raise ValueError(f"Failed to decode base64 audio: {str(e)}")
|
| 310 |
+
|
| 311 |
+
# ==========================================================
|
| 312 |
+
# LANGUAGE DETECTION
|
| 313 |
+
# ==========================================================
|
| 314 |
+
    def detect_language(self, audio_path):
        """
        Detect the spoken language in an audio file using the Whisper model.

        Runs Whisper's transcription pass and parses the language token
        (``<|xx|>``) out of the raw decoder output. Only languages present
        in ``self.language_map`` are recognized; otherwise falls back to
        "English" (if anything was transcribed) or "Unknown".

        Args:
            audio_path: Path to audio file.

        Returns:
            str: Detected language name, or "Unknown" when the Whisper
            model is unavailable or detection fails.
        """
        # Whisper was not loaded at init time — nothing to detect with.
        if not self.lang_ready:
            return "Unknown"

        try:
            # Whisper operates on 16 kHz mono; the first 30 s is enough
            # for language identification.
            audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)

            # Convert raw samples into Whisper's log-mel input features.
            input_features = self.whisper_processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt"
            ).input_features

            input_features = input_features.to(self.device)

            # Generate without forcing a language so the model emits its
            # own language token in the special-token stream.
            with torch.no_grad():
                generated_ids = self.whisper_model.generate(
                    input_features,
                    task="transcribe",
                    return_dict_in_generate=True
                )

            # Decode WITH special tokens so the language marker survives.
            full_output = self.whisper_processor.batch_decode(
                generated_ids.sequences,
                skip_special_tokens=False
            )[0]

            # Output format: <|startoftranscript|><|en|><|transcribe|>...
            detected_lang = None

            # Look for two-letter language tokens in the form <|xx|>.
            import re
            lang_pattern = r'<\|([a-z]{2})\|>'
            matches = re.findall(lang_pattern, full_output)

            if matches:
                # First match that is a known language code wins — it is
                # the token right after <|startoftranscript|>.
                for match in matches:
                    if match in self.language_map:
                        detected_lang = match
                        break

            if detected_lang:
                lang_name = self.language_map.get(detected_lang, detected_lang.upper())
                print(f" 🌐 Detected Language: {lang_name} ({detected_lang})")
                return lang_name
            else:
                # Fallback: no recognizable language token. If Whisper
                # still produced text, assume English rather than failing.
                transcription = self.whisper_processor.batch_decode(
                    generated_ids.sequences,
                    skip_special_tokens=True
                )[0]

                if len(transcription.strip()) > 0:
                    print(f" 🌐 Detected Language: English (default)")
                    return "English"
                else:
                    return "Unknown"

        except Exception as e:
            # Language detection is best-effort: never let it break analysis.
            print(f" ⚠️ Language detection error: {str(e)}")
            return "Unknown"
|
| 392 |
+
|
| 393 |
+
def extract_scores(self, audio_input, input_type="file", audio_format="mp3"):
|
| 394 |
+
"""
|
| 395 |
+
Extract physics and deep learning scores without language detection.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
audio_input: Either file path or base64 string
|
| 399 |
+
input_type: "file" or "base64"
|
| 400 |
+
audio_format: "mp3" or "wav" when using base64
|
| 401 |
+
|
| 402 |
+
Returns:
|
| 403 |
+
dict: Score details
|
| 404 |
+
"""
|
| 405 |
+
temp_file = None
|
| 406 |
+
try:
|
| 407 |
+
if input_type == "base64":
|
| 408 |
+
temp_file = self.decode_base64_audio(audio_input, audio_format=audio_format)
|
| 409 |
+
audio_path = temp_file
|
| 410 |
+
elif input_type == "file":
|
| 411 |
+
audio_path = audio_input
|
| 412 |
+
if not os.path.exists(audio_path):
|
| 413 |
+
return {
|
| 414 |
+
"status": "error",
|
| 415 |
+
"error": f"Audio file not found: {audio_path}"
|
| 416 |
+
}
|
| 417 |
+
else:
|
| 418 |
+
return {
|
| 419 |
+
"status": "error",
|
| 420 |
+
"error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
|
| 424 |
+
dl_score, dl_label = self.get_dl_score(audio_path)
|
| 425 |
+
|
| 426 |
+
return {
|
| 427 |
+
"status": "success",
|
| 428 |
+
"physics_score": float(phys_score),
|
| 429 |
+
"dl_score": float(dl_score),
|
| 430 |
+
"dl_label": dl_label,
|
| 431 |
+
"physics_method": phys_method,
|
| 432 |
+
"audio_duration": float(phys_feats.get("duration", 0)),
|
| 433 |
+
"was_truncated": bool(phys_feats.get("was_truncated", False))
|
| 434 |
+
}
|
| 435 |
+
except Exception as e:
|
| 436 |
+
return {
|
| 437 |
+
"status": "error",
|
| 438 |
+
"error": str(e)
|
| 439 |
+
}
|
| 440 |
+
finally:
|
| 441 |
+
if temp_file and os.path.exists(temp_file):
|
| 442 |
+
try:
|
| 443 |
+
os.unlink(temp_file)
|
| 444 |
+
except Exception:
|
| 445 |
+
pass
|
| 446 |
+
|
| 447 |
+
# ==========================================================
|
| 448 |
+
# PART A: PHYSICS ENGINE (FIXED)
|
| 449 |
+
# ==========================================================
|
| 450 |
+
def get_linear_score(self, val, min_val, max_val):
|
| 451 |
+
"""Linear interpolation for scoring"""
|
| 452 |
+
if val <= min_val:
|
| 453 |
+
return 1.0
|
| 454 |
+
if val >= max_val:
|
| 455 |
+
return 0.0
|
| 456 |
+
return 1.0 - ((val - min_val) / (max_val - min_val))
|
| 457 |
+
|
| 458 |
+
    def get_physics_score(self, audio_path):
        """
        Analyze audio using physics-based acoustic features.

        Combines pitch variability (PYIN), RMS energy variability, and
        spectral-centroid skew into a single AI-likeness score. If pitch
        tracking fails or yields too few voiced frames, falls back to a
        pitch-free feature set (energy, spectral skew, zero-crossing rate).

        Args:
            audio_path: Path to audio file.

        Returns:
            tuple: (ai_score, method, features_dict) where ai_score is in
            [0, 1] (higher = more AI-like) and method names the analysis
            path taken ("Physics Analysis", "Physics Analysis (Limited)",
            or an error string).
        """
        try:
            # Load audio at NATIVE sample rate (don't resample for physics analysis)
            y, sr = librosa.load(audio_path, sr=None, mono=True)

            # Calculate original duration
            duration = len(y) / sr
            was_truncated = False

            # Truncate if needed to bound analysis cost.
            if duration > self.max_duration:
                max_samples = int(self.max_duration * sr)
                y = y[:max_samples]
                duration = self.max_duration
                was_truncated = True

            print(f" 🔬 Running physics analysis on {duration:.1f}s audio at {sr}Hz")

            # Robust pitch tracking using PYIN
            try:
                f0, voiced_flag, voiced_probs = librosa.pyin(
                    y,
                    fmin=librosa.note_to_hz('C2'),  # ~65 Hz
                    fmax=librosa.note_to_hz('C7'),  # ~2093 Hz
                    sr=sr,
                    frame_length=2048
                )
                # PYIN returns NaN for unvoiced frames — keep voiced only.
                valid_f0 = f0[~np.isnan(f0)]
            except Exception as pitch_error:
                print(f" ⚠️ Pitch detection failed: {pitch_error}, using fallback method")
                # Fallback: use simpler pitch detection
                valid_f0 = np.array([])

            if len(valid_f0) < 10:  # Need at least 10 valid pitch points
                print(f" ⚠️ Insufficient pitch data ({len(valid_f0)} points), using alternative features")
                # Fall back to non-pitch features
                rms = librosa.feature.rms(y=y)[0]
                centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
                zcr = librosa.feature.zero_crossing_rate(y)[0]

                feats = {
                    'pitch_cv': 0.25,  # Neutral value
                    'intensity_std': np.std(rms),
                    'freq_skew': stats.skew(centroid),
                    'zcr_std': np.std(zcr),
                    'mean_pitch': 0,
                    'std_pitch': 0,
                    'duration': duration,
                    'was_truncated': was_truncated
                }

                # Score based on available features
                intensity_score = self.get_linear_score(
                    feats['intensity_std'],
                    self.INTENSITY_MIN_STD,
                    self.INTENSITY_MAX_STD
                )

                zcr_score = self.get_linear_score(
                    feats['zcr_std'],
                    0.01,
                    0.08
                )

                skew_score = self.get_linear_score(
                    abs(feats['freq_skew']),
                    0.1,
                    1.0
                )

                # Weighted combination (no pitch)
                final_score = (intensity_score * 0.5 + zcr_score * 0.2 + skew_score * 0.3)

                print(f" 🔬 Physics score (no pitch): {final_score:.3f}")
                return round(final_score, 3), "Physics Analysis (Limited)", feats

            # Full analysis with pitch
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)

            # Calculate feature metrics. pitch_cv is the coefficient of
            # variation: low values indicate an unnaturally flat pitch.
            feats = {
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mean_pitch': mean_pitch,
                'std_pitch': std_pitch,
                'duration': duration,
                'was_truncated': was_truncated
            }

            # Individual feature scores (higher = more AI-like)
            intensity_score = self.get_linear_score(
                feats['intensity_std'],
                self.INTENSITY_MIN_STD,
                self.INTENSITY_MAX_STD
            )

            pitch_score = self.get_linear_score(
                feats['pitch_cv'],
                self.CV_AI_THRESHOLD,
                self.CV_HUMAN_THRESHOLD
            )

            skew_score = self.get_linear_score(
                abs(feats['freq_skew']),
                0.1,
                1.0
            )

            # Weighted combination
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20

            base_score = (
                intensity_score * W_INTENSITY +
                pitch_score * W_PITCH +
                skew_score * W_SKEW
            )

            # Synergy bonus: if both intensity and pitch are suspicious,
            # bump the score (capped at 1.0).
            if intensity_score > 0.4 and pitch_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            print(f" 🔬 Physics score: {final_score:.3f} (intensity:{intensity_score:.2f}, pitch:{pitch_score:.2f})")
            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            # Never propagate: a physics failure degrades to a zero score
            # so the DL engine can still carry the ensemble.
            print(f" ❌ Physics analysis failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return 0.0, f"Physics Error: {str(e)}", {'duration': 0, 'was_truncated': False}
|
| 602 |
+
|
| 603 |
+
# ==========================================================
|
| 604 |
+
# PART B: DEEP LEARNING ENGINE
|
| 605 |
+
# ==========================================================
|
| 606 |
+
def get_dl_score(self, audio_path):
|
| 607 |
+
"""
|
| 608 |
+
Analyze audio using deep learning model
|
| 609 |
+
|
| 610 |
+
Returns:
|
| 611 |
+
tuple: (ai_score, label)
|
| 612 |
+
"""
|
| 613 |
+
if not self.dl_ready:
|
| 614 |
+
return 0.0, "Model not loaded"
|
| 615 |
+
|
| 616 |
+
try:
|
| 617 |
+
# Load and preprocess audio
|
| 618 |
+
waveform_np, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=16000)
|
| 619 |
+
|
| 620 |
+
# Process with feature extractor
|
| 621 |
+
inputs = self.feature_extractor(
|
| 622 |
+
waveform_np,
|
| 623 |
+
sampling_rate=16000,
|
| 624 |
+
return_tensors="pt",
|
| 625 |
+
padding=True
|
| 626 |
+
)
|
| 627 |
+
|
| 628 |
+
# Move to device
|
| 629 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 630 |
+
|
| 631 |
+
# Run inference
|
| 632 |
+
with torch.no_grad():
|
| 633 |
+
outputs = self.dl_model(**inputs)
|
| 634 |
+
logits = outputs.logits
|
| 635 |
+
probs = F.softmax(logits, dim=-1)
|
| 636 |
+
|
| 637 |
+
# Get predictions
|
| 638 |
+
# Class 0: Real, Class 1: Fake
|
| 639 |
+
prob_real = probs[0][0].item()
|
| 640 |
+
prob_fake = probs[0][1].item()
|
| 641 |
+
|
| 642 |
+
# AI score is the fake probability
|
| 643 |
+
ai_score = prob_fake
|
| 644 |
+
|
| 645 |
+
label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"
|
| 646 |
+
|
| 647 |
+
return round(ai_score, 3), label
|
| 648 |
+
|
| 649 |
+
except Exception as e:
|
| 650 |
+
print(f" ❌ DL analysis failed: {str(e)}")
|
| 651 |
+
return 0.0, f"DL Error: {str(e)}"
|
| 652 |
+
|
| 653 |
+
# ==========================================================
|
| 654 |
+
# PART C: EXPLANATION GENERATOR
|
| 655 |
+
# ==========================================================
|
| 656 |
+
def generate_explanation(self, final_score, phys_score, dl_score, dl_label, phys_feats, ai_threshold=0.55):
|
| 657 |
+
"""
|
| 658 |
+
Generate human-readable explanation for the classification
|
| 659 |
+
|
| 660 |
+
Returns:
|
| 661 |
+
str: Explanation text
|
| 662 |
+
"""
|
| 663 |
+
explanations = []
|
| 664 |
+
|
| 665 |
+
if final_score > ai_threshold:
|
| 666 |
+
# AI GENERATED
|
| 667 |
+
|
| 668 |
+
# Deep Learning contributions
|
| 669 |
+
if dl_score > 0.55 and self.dl_ready:
|
| 670 |
+
if "Fake" in dl_label or "Deepfake" in dl_label:
|
| 671 |
+
explanations.append(
|
| 672 |
+
f"Deep learning model detected synthetic voice patterns "
|
| 673 |
+
f"(confidence: {dl_score*100:.1f}%)"
|
| 674 |
+
)
|
| 675 |
+
|
| 676 |
+
# Physics contributions
|
| 677 |
+
if phys_score > 0.55:
|
| 678 |
+
p_cv = phys_feats.get('pitch_cv', 0)
|
| 679 |
+
i_std = phys_feats.get('intensity_std', 0)
|
| 680 |
+
|
| 681 |
+
if i_std < 0.06:
|
| 682 |
+
explanations.append(
|
| 683 |
+
f"Unnaturally consistent energy levels detected "
|
| 684 |
+
f"(std: {i_std:.3f}, expected: >0.06)"
|
| 685 |
+
)
|
| 686 |
+
|
| 687 |
+
if p_cv < 0.22 and p_cv > 0:
|
| 688 |
+
explanations.append(
|
| 689 |
+
f"Robotic pitch modulation patterns "
|
| 690 |
+
f"(CV: {p_cv:.2f}, expected: >0.22)"
|
| 691 |
+
)
|
| 692 |
+
|
| 693 |
+
if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
|
| 694 |
+
explanations.append(
|
| 695 |
+
"Acoustic parameters lack natural human variability"
|
| 696 |
+
)
|
| 697 |
+
|
| 698 |
+
if not explanations:
|
| 699 |
+
explanations.append(
|
| 700 |
+
"Voice exhibits characteristics consistent with AI generation"
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
+
else:
|
| 704 |
+
# HUMAN
|
| 705 |
+
explanations.append(
|
| 706 |
+
"Voice exhibits natural acoustic variability and human speech characteristics"
|
| 707 |
+
)
|
| 708 |
+
|
| 709 |
+
return "; ".join(explanations)
|
| 710 |
+
|
| 711 |
+
# ==========================================================
|
| 712 |
+
# PART D: MAIN ANALYSIS FUNCTION
|
| 713 |
+
# ==========================================================
|
| 714 |
+
def analyze(self, audio_input, input_type="file", audio_format="mp3", analysis_mode="full"):
|
| 715 |
+
"""
|
| 716 |
+
Main analysis function with configurable input types
|
| 717 |
+
|
| 718 |
+
Args:
|
| 719 |
+
audio_input: Either file path or base64 string
|
| 720 |
+
input_type: "file" or "base64"
|
| 721 |
+
audio_format: "mp3" or "wav" when using base64 input
|
| 722 |
+
analysis_mode: "full", "physics", or "dl"
|
| 723 |
+
|
| 724 |
+
Returns:
|
| 725 |
+
dict: Analysis results following API response format
|
| 726 |
+
"""
|
| 727 |
+
temp_file = None
|
| 728 |
+
|
| 729 |
+
try:
|
| 730 |
+
analysis_mode = (analysis_mode or "full")
|
| 731 |
+
analysis_mode = str(analysis_mode).lower().strip()
|
| 732 |
+
if analysis_mode not in ["full", "physics", "dl"]:
|
| 733 |
+
return {
|
| 734 |
+
"status": "error",
|
| 735 |
+
"error": f"Invalid analysis_mode: {analysis_mode}. Use 'full', 'physics', or 'dl'"
|
| 736 |
+
}
|
| 737 |
+
|
| 738 |
+
# Handle input type
|
| 739 |
+
if input_type == "base64":
|
| 740 |
+
temp_file = self.decode_base64_audio(audio_input, audio_format=audio_format)
|
| 741 |
+
audio_path = temp_file
|
| 742 |
+
elif input_type == "file":
|
| 743 |
+
audio_path = audio_input
|
| 744 |
+
if not os.path.exists(audio_path):
|
| 745 |
+
return {
|
| 746 |
+
"status": "error",
|
| 747 |
+
"error": f"Audio file not found: {audio_path}"
|
| 748 |
+
}
|
| 749 |
+
else:
|
| 750 |
+
return {
|
| 751 |
+
"status": "error",
|
| 752 |
+
"error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
|
| 753 |
+
}
|
| 754 |
+
|
| 755 |
+
print(f"🎵 Analyzing: {os.path.basename(audio_path)}")
|
| 756 |
+
|
| 757 |
+
# 1. Detect Language
|
| 758 |
+
detected_language = "Unknown"
|
| 759 |
+
if analysis_mode == "full":
|
| 760 |
+
detected_language = self.detect_language(audio_path)
|
| 761 |
+
|
| 762 |
+
# 2. Run Physics Analysis
|
| 763 |
+
phys_score = 0.0
|
| 764 |
+
phys_method = "Physics Skipped"
|
| 765 |
+
phys_feats = {'duration': 0, 'was_truncated': False}
|
| 766 |
+
if analysis_mode in ["full", "physics"]:
|
| 767 |
+
phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
|
| 768 |
+
|
| 769 |
+
# 3. Run Deep Learning Analysis
|
| 770 |
+
dl_score = 0.0
|
| 771 |
+
dl_label = "DL Skipped"
|
| 772 |
+
if analysis_mode in ["full", "dl"]:
|
| 773 |
+
dl_score, dl_label = self.get_dl_score(audio_path)
|
| 774 |
+
|
| 775 |
+
# 4. Calculate weighted ensemble score
|
| 776 |
+
used_calibration = False
|
| 777 |
+
threshold = 0.55
|
| 778 |
+
|
| 779 |
+
if analysis_mode == "full" and self.calibrator and self.calibrator.ready:
|
| 780 |
+
calibrated_score = self.calibrator.predict(phys_score, dl_score)
|
| 781 |
+
if calibrated_score is not None:
|
| 782 |
+
final_score = calibrated_score
|
| 783 |
+
used_calibration = True
|
| 784 |
+
threshold = float(self.calibrator.threshold)
|
| 785 |
+
else:
|
| 786 |
+
final_score = (
|
| 787 |
+
self.physics_weight * phys_score +
|
| 788 |
+
self.dl_weight * dl_score
|
| 789 |
+
)
|
| 790 |
+
elif analysis_mode == "physics":
|
| 791 |
+
final_score = phys_score
|
| 792 |
+
elif analysis_mode == "dl":
|
| 793 |
+
final_score = dl_score
|
| 794 |
+
else:
|
| 795 |
+
final_score = (
|
| 796 |
+
self.physics_weight * phys_score +
|
| 797 |
+
self.dl_weight * dl_score
|
| 798 |
+
)
|
| 799 |
+
|
| 800 |
+
# Round to 2 decimal places
|
| 801 |
+
final_score = round(float(final_score), 2)
|
| 802 |
+
|
| 803 |
+
# 5. Determine classification
|
| 804 |
+
classification = "AI_GENERATED" if final_score > threshold else "HUMAN"
|
| 805 |
+
|
| 806 |
+
# 6. Generate explanation
|
| 807 |
+
explanation = self.generate_explanation(
|
| 808 |
+
final_score,
|
| 809 |
+
phys_score,
|
| 810 |
+
dl_score,
|
| 811 |
+
dl_label,
|
| 812 |
+
phys_feats,
|
| 813 |
+
ai_threshold=threshold
|
| 814 |
+
)
|
| 815 |
+
|
| 816 |
+
# 7. Return API-compliant response (ensure all values are JSON serializable)
|
| 817 |
+
return {
|
| 818 |
+
"status": "success",
|
| 819 |
+
"language": detected_language,
|
| 820 |
+
"classification": classification,
|
| 821 |
+
"confidenceScore": float(final_score), # Convert to Python float
|
| 822 |
+
"explanation": explanation,
|
| 823 |
+
"analysisMode": analysis_mode,
|
| 824 |
+
"debug": {
|
| 825 |
+
"physics_score": float(phys_score),
|
| 826 |
+
"dl_score": float(dl_score),
|
| 827 |
+
"dl_label": dl_label,
|
| 828 |
+
"physics_weight": f"{self.physics_weight*100:.0f}%",
|
| 829 |
+
"dl_weight": f"{self.dl_weight*100:.0f}%",
|
| 830 |
+
"analysis_mode": analysis_mode,
|
| 831 |
+
"used_calibration": used_calibration,
|
| 832 |
+
"calibration_threshold": float(threshold) if used_calibration else None,
|
| 833 |
+
"calibration_path": self.calibrator.calibration_path if used_calibration else None,
|
| 834 |
+
"audio_duration": float(phys_feats.get('duration', 0)),
|
| 835 |
+
"was_truncated": bool(phys_feats.get('was_truncated', False)),
|
| 836 |
+
"physics_features": {k: float(v) if isinstance(v, (np.floating, np.integer)) else v
|
| 837 |
+
for k, v in phys_feats.items()
|
| 838 |
+
if k not in ['duration', 'was_truncated']}
|
| 839 |
+
}
|
| 840 |
+
}
|
| 841 |
+
|
| 842 |
+
except Exception as e:
|
| 843 |
+
import traceback
|
| 844 |
+
return {
|
| 845 |
+
"status": "error",
|
| 846 |
+
"error": str(e),
|
| 847 |
+
"traceback": traceback.format_exc()
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
finally:
|
| 851 |
+
# Clean up temporary file
|
| 852 |
+
if temp_file and os.path.exists(temp_file):
|
| 853 |
+
try:
|
| 854 |
+
os.unlink(temp_file)
|
| 855 |
+
except:
|
| 856 |
+
pass
|
| 857 |
+
|
| 858 |
+
# ==========================================================
|
| 859 |
+
# UTILITY: Update Weights
|
| 860 |
+
# ==========================================================
|
| 861 |
+
def update_weights(self, physics_weight, dl_weight):
|
| 862 |
+
"""
|
| 863 |
+
Update ensemble weights dynamically
|
| 864 |
+
|
| 865 |
+
Args:
|
| 866 |
+
physics_weight: New physics weight (0-1)
|
| 867 |
+
dl_weight: New DL weight (0-1)
|
| 868 |
+
"""
|
| 869 |
+
total = physics_weight + dl_weight
|
| 870 |
+
self.physics_weight = physics_weight / total
|
| 871 |
+
self.dl_weight = dl_weight / total
|
| 872 |
+
|
| 873 |
+
print(f"⚙️ Weights updated:")
|
| 874 |
+
print(f" Physics: {self.physics_weight*100:.0f}%")
|
| 875 |
+
print(f" DL: {self.dl_weight*100:.0f}%")
|
download_models.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pre-download Models Script
|
| 3 |
+
Downloads all required AI models before deployment
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
print("="*70)
|
| 11 |
+
print("Voice Detection API - Model Download Script")
|
| 12 |
+
print("="*70)
|
| 13 |
+
print()
|
| 14 |
+
|
| 15 |
+
# Check if we're in the right directory
|
| 16 |
+
if not Path("requirements.txt").exists():
|
| 17 |
+
print("ERROR: requirements.txt not found!")
|
| 18 |
+
print("Please run this script from the project root directory.")
|
| 19 |
+
sys.exit(1)
|
| 20 |
+
|
| 21 |
+
print("This script will download the following models:")
|
| 22 |
+
print("1. Wav2Vec2 Deepfake Detector (~1.2 GB)")
|
| 23 |
+
print("2. Whisper Base Language Model (~500 MB)")
|
| 24 |
+
print()
|
| 25 |
+
print("Total download size: ~1.7 GB")
|
| 26 |
+
print("This may take 5-15 minutes depending on your internet speed.")
|
| 27 |
+
print()
|
| 28 |
+
|
| 29 |
+
response = input("Continue? (y/n): ")
|
| 30 |
+
if response.lower() != 'y':
|
| 31 |
+
print("Download cancelled.")
|
| 32 |
+
sys.exit(0)
|
| 33 |
+
|
| 34 |
+
print()
|
| 35 |
+
print("="*70)
|
| 36 |
+
print("Step 1/2: Downloading Wav2Vec2 Deepfake Detector")
|
| 37 |
+
print("="*70)
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
|
| 41 |
+
|
| 42 |
+
print("Downloading model...")
|
| 43 |
+
model = AutoModelForAudioClassification.from_pretrained(
|
| 44 |
+
'garystafford/wav2vec2-deepfake-voice-detector'
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
print("Downloading feature extractor...")
|
| 48 |
+
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
| 49 |
+
'garystafford/wav2vec2-deepfake-voice-detector'
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
print("✅ Wav2Vec2 model downloaded successfully!")
|
| 53 |
+
print()
|
| 54 |
+
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"❌ Failed to download Wav2Vec2 model: {str(e)}")
|
| 57 |
+
print("Please check your internet connection and try again.")
|
| 58 |
+
sys.exit(1)
|
| 59 |
+
|
| 60 |
+
print("="*70)
|
| 61 |
+
print("Step 2/2: Downloading Whisper Language Detection Model")
|
| 62 |
+
print("="*70)
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 66 |
+
|
| 67 |
+
print("Downloading processor...")
|
| 68 |
+
processor = WhisperProcessor.from_pretrained('openai/whisper-base')
|
| 69 |
+
|
| 70 |
+
print("Downloading model...")
|
| 71 |
+
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-base')
|
| 72 |
+
|
| 73 |
+
print("✅ Whisper model downloaded successfully!")
|
| 74 |
+
print()
|
| 75 |
+
|
| 76 |
+
except Exception as e:
|
| 77 |
+
print(f"❌ Failed to download Whisper model: {str(e)}")
|
| 78 |
+
print("Please check your internet connection and try again.")
|
| 79 |
+
sys.exit(1)
|
| 80 |
+
|
| 81 |
+
print("="*70)
|
| 82 |
+
print("✅ All models downloaded successfully!")
|
| 83 |
+
print("="*70)
|
| 84 |
+
print()
|
| 85 |
+
print("Models are cached in:", Path.home() / ".cache" / "huggingface")
|
| 86 |
+
print()
|
| 87 |
+
print("Next steps:")
|
| 88 |
+
print("1. The models will be automatically used by the API")
|
| 89 |
+
print("2. Start the API: python app.py")
|
| 90 |
+
print("3. Test the API: python test_api.py")
|
| 91 |
+
print()
|
| 92 |
+
print("="*70)
|
pytest.ini
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
testpaths = tests
|
| 3 |
+
markers =
|
| 4 |
+
integration: tests that require full models and data
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask
|
| 2 |
+
flask-cors
|
| 3 |
+
flask-sock
|
| 4 |
+
Werkzeug
|
| 5 |
+
transformers
|
| 6 |
+
librosa
|
| 7 |
+
soundfile
|
| 8 |
+
scipy
|
| 9 |
+
numpy
|
| 10 |
+
pydub
|
| 11 |
+
python-dotenv
|
| 12 |
+
gunicorn
|
| 13 |
+
pytest
|
| 14 |
+
# Note: torch, torchaudio are handled in Dockerfile
|
self_learning_train.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train a lightweight calibration model from feedback audio samples.
|
| 3 |
+
|
| 4 |
+
This script builds a simple logistic regression calibration layer that
|
| 5 |
+
maps physics and deep learning scores to a calibrated probability.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import shutil
|
| 12 |
+
import sys
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
from detector import HybridEnsembleDetector
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Maps feedback directory names to binary training labels:
# 1 = synthetic/AI-generated audio, 0 = genuine human audio.
LABEL_MAP = {
    "AI_GENERATED": 1,
    "AI": 1,
    "FAKE": 1,
    "SYNTHETIC": 1,
    "HUMAN": 0,
    "REAL": 0
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def sigmoid(z):
    """Numerically stable logistic function.

    The argument is clipped to [-30, 30] before exponentiating so that
    ``np.exp`` cannot overflow for extreme logits.
    """
    bounded = np.clip(z, -30.0, 30.0)
    return 1.0 / (1.0 + np.exp(-bounded))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def train_logreg(X, y, lr=0.5, epochs=300, l2=0.001):
    """Fit a logistic regression by full-batch gradient descent.

    Args:
        X: (n_samples, n_features) float design matrix.
        y: (n_samples,) array of 0/1 targets.
        lr: learning rate for the gradient step.
        epochs: number of full-batch gradient updates.
        l2: L2 penalty coefficient applied to the weights.

    Returns:
        (weights, bias) tuple: a float64 weight vector and a scalar bias.
    """
    weights = np.zeros(X.shape[1], dtype=np.float64)
    bias = 0.0
    sample_count = float(X.shape[0])

    for _ in range(epochs):
        logits = X.dot(weights) + bias
        probs = sigmoid(logits)
        residual = probs - y
        # L2 regularization is applied to the weights only, never the bias.
        weight_grad = (X.T.dot(residual) / sample_count) + (l2 * weights)
        bias_grad = residual.mean()
        weights -= lr * weight_grad
        bias -= lr * bias_grad

    return weights, bias
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def best_threshold(y_true, y_prob):
    """Grid-search the decision threshold that maximizes F1.

    Scans 81 evenly spaced thresholds in [0.1, 0.9]; ties keep the
    earliest (lowest) threshold because only a strictly better F1
    replaces the incumbent.

    Args:
        y_true: (n,) array of 0/1 ground-truth labels.
        y_prob: (n,) array of predicted probabilities.

    Returns:
        (threshold, f1) for the best-scoring cut-off.
    """
    chosen_t = 0.5
    top_f1 = -1.0

    for candidate in np.linspace(0.1, 0.9, 81):
        predicted = (y_prob >= candidate).astype(int)
        tp = float(((predicted == 1) & (y_true == 1)).sum())
        fp = float(((predicted == 1) & (y_true == 0)).sum())
        fn = float(((predicted == 0) & (y_true == 1)).sum())
        # Epsilon terms guard against division by zero on degenerate splits.
        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = (2 * precision * recall) / (precision + recall + 1e-9)
        if f1 > top_f1:
            top_f1 = f1
            chosen_t = float(candidate)

    return chosen_t, top_f1
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def iter_audio_files(data_dir, max_per_class=0):
    """Collect labeled audio samples from a feedback directory tree.

    Expects one subdirectory per label name in LABEL_MAP under
    ``data_dir`` and walks each recursively for .mp3/.wav files.
    A sidecar ``<name>.json`` next to an audio file may carry
    precomputed ``physics_score``/``dl_score`` values, which are
    copied onto the sample so re-scoring can be skipped.

    Args:
        data_dir: root directory containing label-named subfolders.
        max_per_class: cap per binary class; 0 means unlimited.

    Returns:
        List of dicts with at least ``path`` and ``label`` keys.
    """
    collected = []
    per_class = {0: 0, 1: 0}

    for label_name, label_value in LABEL_MAP.items():
        label_dir = os.path.join(data_dir, label_name)
        if not os.path.isdir(label_dir):
            continue

        for root, _, files in os.walk(label_dir):
            for name in files:
                if not name.lower().endswith((".mp3", ".wav")):
                    continue
                # Cap is per binary class, so aliases (AI/FAKE/...) share it.
                if max_per_class and per_class[label_value] >= max_per_class:
                    continue

                file_path = os.path.join(root, name)
                entry = {
                    "path": file_path,
                    "label": label_value
                }

                meta_path = os.path.splitext(file_path)[0] + ".json"
                if os.path.exists(meta_path):
                    try:
                        with open(meta_path, "r", encoding="utf-8") as handle:
                            meta = json.load(handle)
                            if "physics_score" in meta and "dl_score" in meta:
                                entry["physics_score"] = float(meta["physics_score"])
                                entry["dl_score"] = float(meta["dl_score"])
                    except Exception:
                        # Best effort: a corrupt sidecar only means the
                        # sample will be re-scored by the detector later.
                        pass

                collected.append(entry)
                per_class[label_value] += 1

    return collected
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def main():
    """Train the calibration layer from stored feedback samples.

    Pipeline: parse CLI options -> gather labeled audio (with any
    precomputed scores) -> score the remainder with the hybrid
    detector -> fit a 2-feature logistic regression -> pick the
    F1-optimal threshold -> archive the previous calibration file ->
    write the new calibration JSON.

    Returns:
        Process exit code: 0 on success, 1 on any validation failure.
    """
    parser = argparse.ArgumentParser(description="Train calibration layer from feedback samples")
    parser.add_argument("--data-dir", default="data/feedback", help="Feedback dataset directory")
    parser.add_argument("--output", default="data/calibration.json", help="Output calibration JSON file")
    parser.add_argument("--history-dir", default=os.environ.get(
        "CALIBRATION_HISTORY_DIR",
        "data/calibration_history"
    ), help="Directory to store calibration history backups")
    parser.add_argument("--epochs", type=int, default=300, help="Training epochs")
    parser.add_argument("--lr", type=float, default=0.5, help="Learning rate")
    parser.add_argument("--l2", type=float, default=0.001, help="L2 regularization")
    parser.add_argument("--min-samples", type=int, default=20, help="Minimum samples required")
    parser.add_argument("--max-per-class", type=int, default=0, help="Max samples per class (0 = all)")
    # Model locations default to env vars so the script matches the serving config.
    parser.add_argument("--deepfake-model-path", default=os.environ.get(
        "DEEPFAKE_MODEL_PATH",
        "garystafford/wav2vec2-deepfake-voice-detector"
    ))
    parser.add_argument("--whisper-model-path", default=os.environ.get(
        "WHISPER_MODEL_PATH",
        "openai/whisper-base"
    ))
    parser.add_argument("--use-local-deepfake-model", action="store_true", default=False)
    parser.add_argument("--use-local-whisper-model", action="store_true", default=False)
    parser.add_argument("--max-audio-duration", type=int, default=30)

    args = parser.parse_args()

    if args.history_dir:
        os.makedirs(args.history_dir, exist_ok=True)

    if not os.path.isdir(args.data_dir):
        print(f"Data directory not found: {args.data_dir}")
        return 1

    samples = iter_audio_files(args.data_dir, max_per_class=args.max_per_class)
    if not samples:
        print("No audio samples found.")
        return 1

    # Only instantiate the (heavy) detector when at least one sample
    # lacks precomputed scores from its sidecar JSON.
    needs_scoring = any("physics_score" not in sample for sample in samples)
    detector = None
    if needs_scoring:
        detector = HybridEnsembleDetector(
            deepfake_model_path=args.deepfake_model_path,
            whisper_model_path=args.whisper_model_path,
            use_local_deepfake_model=args.use_local_deepfake_model,
            use_local_whisper_model=args.use_local_whisper_model,
            max_audio_duration=args.max_audio_duration
        )

    features = []
    labels = []
    skipped = 0

    for sample in samples:
        if "physics_score" in sample and "dl_score" in sample:
            # Reuse scores recorded at feedback time.
            phys_score = sample["physics_score"]
            dl_score = sample["dl_score"]
        else:
            if detector is None:
                skipped += 1
                continue
            scores = detector.extract_scores(sample["path"], input_type="file")
            if scores.get("status") != "success":
                skipped += 1
                continue
            phys_score = scores["physics_score"]
            dl_score = scores["dl_score"]

        features.append([phys_score, dl_score])
        labels.append(sample["label"])

    if skipped:
        print(f"Skipped {skipped} samples due to scoring errors.")

    if len(features) < args.min_samples:
        print(f"Not enough samples to train. Found {len(features)}.")
        return 1

    X = np.array(features, dtype=np.float64)
    y = np.array(labels, dtype=np.float64)

    w, b = train_logreg(X, y, lr=args.lr, epochs=args.epochs, l2=args.l2)
    # NOTE(review): threshold and metrics are computed on the training
    # set itself (no held-out split), so they are optimistic estimates.
    probs = sigmoid(X.dot(w) + b)
    threshold, f1 = best_threshold(y, probs)
    predictions = (probs >= threshold).astype(int)
    accuracy = float((predictions == y).mean())

    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Archive the existing calibration (plus a small meta record) before
    # overwriting, so the API's rollback endpoint can restore it.
    if os.path.exists(args.output):
        version_id = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ") + "_" + os.urandom(4).hex()
        history_name = f"calibration_{version_id}.json"
        history_path = os.path.join(args.history_dir, history_name)
        shutil.copy2(args.output, history_path)
        meta_path = os.path.join(args.history_dir, f"calibration_{version_id}.meta.json")
        meta = {
            "versionId": version_id,
            "source": args.output,
            "archivedAt": datetime.utcnow().isoformat() + "Z",
            "reason": "self_learning_train"
        }
        with open(meta_path, "w", encoding="utf-8") as handle:
            json.dump(meta, handle, indent=2)

    calibration = {
        "version": 1,
        "trained_at": datetime.utcnow().isoformat() + "Z",
        "weights": [float(w[0]), float(w[1])],
        "bias": float(b),
        "threshold": float(threshold),
        "feature_order": ["physics_score", "dl_score"],
        "metrics": {
            "accuracy": accuracy,
            "f1": float(f1)
        },
        "samples": {
            "count": int(len(features)),
            "ai": int((y == 1).sum()),
            "human": int((y == 0).sum())
        }
    }

    with open(args.output, "w", encoding="utf-8") as handle:
        json.dump(calibration, handle, indent=2)

    print(f"Calibration saved to {args.output}")
    print(f"Accuracy: {accuracy:.3f} | F1: {f1:.3f} | Threshold: {threshold:.2f}")
    return 0
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
if __name__ == "__main__":
    # Propagate main()'s return value (0 = success, 1 = failure) as the
    # process exit code for shell/CI callers.
    sys.exit(main())
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import importlib
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
TEST_API_KEY = "test_key_123"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DummyCalibrator:
    """Minimal stand-in for the detector's calibration component."""
    # ready=False makes the app treat calibration as not loaded, so the
    # uncalibrated code path is exercised by default in tests.
    ready = False
    calibration_path = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DummyDetector:
    """Deterministic stub used by the test-suite instead of the real hybrid detector."""

    def __init__(self):
        self.calibrator = DummyCalibrator()

    def analyze(self, audio_input, input_type="file", audio_format="mp3", analysis_mode="full"):
        """Return a canned successful detection result echoing the analysis mode."""
        debug_info = {
            "analysis_mode": analysis_mode,
            "used_calibration": False
        }
        return {
            "status": "success",
            "language": "English",
            "classification": "AI_GENERATED",
            "confidenceScore": 0.87,
            "explanation": "Dummy detector response",
            "analysisMode": analysis_mode,
            "debug": debug_info
        }

    def extract_scores(self, audio_input, input_type="file", audio_format="mp3"):
        """Return fixed physics/deep-learning scores for calibration tests."""
        return {
            "status": "success",
            "physics_score": 0.42,
            "dl_score": 0.84,
            "dl_label": "Fake/Deepfake",
            "physics_method": "Physics Analysis",
            "audio_duration": 1.0,
            "was_truncated": False
        }

    def reload_calibration(self, calibration_path=None):
        """Report success only when the given calibration file actually exists."""
        if not calibration_path:
            return False
        return os.path.exists(calibration_path)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_app(tmp_path, monkeypatch, overrides=None):
    """Import the Flask app module fresh under a test-friendly environment.

    Sets env vars (model loading skipped, feedback/calibration storage
    rooted under ``tmp_path``), force-reimports ``app`` so its
    module-level config re-reads the environment, then swaps its
    detector for a DummyDetector so no ML models are ever loaded.

    Args:
        tmp_path: pytest temporary directory for all on-disk storage.
        monkeypatch: pytest monkeypatch fixture (auto-restores env).
        overrides: optional env additions; a value of None deletes
            that variable instead of setting it.

    Returns:
        The freshly imported ``app`` module with the stub installed.
    """
    env = {
        "API_KEY": TEST_API_KEY,
        "SKIP_MODEL_LOAD": "true",
        "ENABLE_STREAMING": "true",
        "ENABLE_FEEDBACK_STORAGE": "true",
        "FEEDBACK_STORAGE_DIR": str(tmp_path / "feedback"),
        "FEEDBACK_MAX_BYTES": "2048",
        "CALIBRATION_PATH": str(tmp_path / "calibration.json"),
        "CALIBRATION_HISTORY_DIR": str(tmp_path / "calibration_history"),
        "CALIBRATION_HISTORY_MAX": "5",
        "STREAMING_PARTIAL_INTERVAL_SECONDS": "0.5"
    }
    if overrides:
        env.update(overrides)

    for key, value in env.items():
        if value is None:
            # None override means "remove this variable entirely".
            monkeypatch.delenv(key, raising=False)
        else:
            monkeypatch.setenv(key, str(value))

    # Drop any cached import so module-level env reads happen again.
    if "app" in sys.modules:
        del sys.modules["app"]

    app_module = importlib.import_module("app")
    importlib.reload(app_module)

    # Replace the real detector with the deterministic stub.
    dummy = DummyDetector()
    app_module.detector = dummy

    def init_detector():
        # Keep any lazy re-initialization path pointing at the stub too.
        app_module.detector = dummy
        return True

    app_module.init_detector = init_detector

    return app_module
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@pytest.fixture
def app_factory(tmp_path, monkeypatch):
    """Fixture factory: build a freshly imported app module with env overrides."""
    def _factory(**overrides):
        return load_app(tmp_path, monkeypatch, overrides=overrides)
    return _factory
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@pytest.fixture
def app_module(app_factory):
    """Default app module instance (no environment overrides)."""
    return app_factory()
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@pytest.fixture
def client(app_module):
    """Flask test client bound to the default app module."""
    return app_module.app.test_client()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@pytest.fixture
def api_headers():
    """Standard JSON request headers carrying the test API key."""
    return {
        "Content-Type": "application/json",
        "x-api-key": TEST_API_KEY
    }
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@pytest.fixture
def sample_audio_base64():
    """Small base64 blob of 200 zero bytes standing in for mp3 audio."""
    return base64.b64encode(b"\x00" * 200).decode("utf-8")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def find_test_audio_files():
    """Return sorted Paths of .mp3/.wav files in the repo-level test_audio/ directory.

    Returns an empty list when the directory does not exist, so callers
    can skip gracefully on checkouts without sample audio.
    """
    audio_dir = Path(__file__).resolve().parent.parent / "test_audio"
    if not audio_dir.exists():
        return []
    return sorted(entry for entry in audio_dir.iterdir()
                  if entry.suffix.lower() in [".mp3", ".wav"])
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def load_test_audio_base64(prefer_extension=".mp3"):
    """Return (path, base64 string) for a test audio file, preferring *prefer_extension*.

    Falls back to the first available file of any supported extension,
    and to (None, None) when no test audio exists at all.
    """
    files = find_test_audio_files()

    preferred = next((p for p in files if p.suffix.lower() == prefer_extension), None)
    if preferred is not None:
        return preferred, base64.b64encode(preferred.read_bytes()).decode("utf-8")

    if files:
        fallback = files[0]
        return fallback, base64.b64encode(fallback.read_bytes()).decode("utf-8")

    return None, None
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@pytest.fixture
def test_audio_base64():
    """(path, base64) for a real file from test_audio/; skips when none exist."""
    path, b64_data = load_test_audio_base64(".mp3")
    if not b64_data:
        pytest.skip("No audio files found in test_audio/")
    return path, b64_data
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_health(client):
    """/health responds 200, reports healthy status and streaming enabled."""
    response = client.get("/health")
    assert response.status_code == 200
    payload = response.get_json()
    assert payload["status"] == "healthy"
    assert payload["streaming_enabled"] is True


def test_voice_detection_success_with_sample_base64(client, api_headers, sample_audio_base64):
    """A well-formed mp3 payload yields the dummy detector's AI_GENERATED verdict."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", data=json.dumps(payload), headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"
    assert data["classification"] == "AI_GENERATED"


def test_voice_detection_success_with_test_audio(client, api_headers, test_audio_base64):
    """Real audio from test_audio/ passes end-to-end (skipped when only non-mp3 exists)."""
    path, audio_b64 = test_audio_base64
    if path.suffix.lower() != ".mp3":
        pytest.skip("test_audio file is not mp3 (endpoint only supports mp3).")

    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": audio_b64
    }
    response = client.post("/api/voice-detection", data=json.dumps(payload), headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"


def test_voice_detection_missing_api_key(client, sample_audio_base64):
    """Requests without an x-api-key header are rejected with 401."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload)
    assert response.status_code == 401


def test_voice_detection_invalid_api_key(client, api_headers, sample_audio_base64):
    """A wrong API key is rejected with 403."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    headers = dict(api_headers)
    headers["x-api-key"] = "wrong_key"
    response = client.post("/api/voice-detection", json=payload, headers=headers)
    assert response.status_code == 403


def test_voice_detection_invalid_content_type(client, api_headers):
    """A body that is not valid JSON is rejected with 400."""
    response = client.post("/api/voice-detection", data="not json", headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_missing_fields(client, api_headers):
    """Payloads missing the audio fields are rejected with 400."""
    payload = {"language": "English"}
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_unsupported_language(client, api_headers, sample_audio_base64):
    """Languages outside the supported set are rejected with 400."""
    payload = {
        "language": "Spanish",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_unsupported_audio_format(client, api_headers, sample_audio_base64):
    """Formats other than mp3 are rejected with 400 on this endpoint."""
    payload = {
        "language": "English",
        "audioFormat": "wav",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_invalid_audio_payload(client, api_headers):
    """An audioBase64 value that is too short/invalid is rejected with 400."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": "short"
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_analysis_error(app_module, client, api_headers, sample_audio_base64):
    """Detector-level errors surface as HTTP 500."""
    def error_analyze(*args, **kwargs):
        return {"status": "error", "error": "boom"}

    # Patch the stub detector so the endpoint sees a failed analysis.
    app_module.detector.analyze = error_analyze

    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 500


def test_reload_calibration_not_found(client, api_headers):
    """Reloading when no calibration file exists yields 404."""
    response = client.post("/api/reload-calibration", headers=api_headers)
    assert response.status_code == 404


def test_reload_calibration_success(app_module, client, api_headers):
    """Reload succeeds once a calibration file is present on disk."""
    calibration_file = Path(app_module.CALIBRATION_PATH)
    calibration_file.parent.mkdir(parents=True, exist_ok=True)
    calibration_file.write_text("{}", encoding="utf-8")

    response = client.post("/api/reload-calibration", headers=api_headers)
    assert response.status_code == 200


def test_backup_and_rollback_calibration(app_module, client, api_headers):
    """Backup archives the current calibration; rollback restores it by versionId."""
    calibration_file = Path(app_module.CALIBRATION_PATH)
    calibration_file.parent.mkdir(parents=True, exist_ok=True)
    calibration_file.write_text('{"version": "original"}', encoding="utf-8")

    backup_response = client.post("/api/backup-calibration", headers=api_headers)
    assert backup_response.status_code == 200
    backup_payload = backup_response.get_json()
    version_id = backup_payload["versionId"]

    # Overwrite, then roll back to the archived version.
    calibration_file.write_text('{"version": "new"}', encoding="utf-8")

    rollback_response = client.post(
        "/api/rollback-calibration",
        json={"versionId": version_id},
        headers=api_headers
    )
    assert rollback_response.status_code == 200
    assert calibration_file.read_text(encoding="utf-8") == '{"version": "original"}'


def test_backup_calibration_missing_file(client, api_headers):
    """Backing up when no calibration file exists yields 404."""
    response = client.post("/api/backup-calibration", headers=api_headers)
    assert response.status_code == 404


def test_rollback_calibration_missing_version(client, api_headers):
    """Rollback without a versionId in the body is rejected with 400."""
    response = client.post("/api/rollback-calibration", json={}, headers=api_headers)
    assert response.status_code == 400


def test_calibration_history_list(app_module, client, api_headers):
    """The history endpoint lists archived calibration versions."""
    history_dir = Path(app_module.CALIBRATION_HISTORY_DIR)
    history_dir.mkdir(parents=True, exist_ok=True)
    history_file = history_dir / "calibration_20260207T120000Z_deadbeef.json"
    history_file.write_text("{}", encoding="utf-8")

    response = client.get("/api/calibration-history", headers=api_headers)
    assert response.status_code == 200
    payload = response.get_json()
    assert payload["status"] == "success"
    assert payload["history"]
|
tests/test_feedback.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_feedback_success_with_scoring(client, api_headers, app_module):
    """Feedback with runDetection stores the audio plus metadata with detector scores."""
    audio_bytes = b"\x01" * 400
    payload = {
        "label": "AI_GENERATED",
        "audioFormat": "mp3",
        "audioBase64": base64.b64encode(audio_bytes).decode("utf-8"),
        "runDetection": True,
        "metadata": {"source": "unit-test"}
    }

    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"

    # Both the raw audio and a JSON metadata sidecar must be persisted.
    storage_dir = Path(app_module.FEEDBACK_STORAGE_DIR)
    assert storage_dir.exists()
    stored_files = list(storage_dir.rglob("*.mp3"))
    assert stored_files, "Expected feedback audio file to be stored"
    meta_files = list(storage_dir.rglob("*.json"))
    assert meta_files, "Expected feedback metadata to be stored"

    metadata = json.loads(meta_files[0].read_text(encoding="utf-8"))
    assert metadata["label"] == "AI_GENERATED"
    assert "physics_score" in metadata
    assert "dl_score" in metadata


def test_feedback_invalid_label(client, api_headers, sample_audio_base64):
    """Labels outside the accepted set are rejected with 400."""
    payload = {
        "label": "UNKNOWN",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_feedback_disabled(app_factory, sample_audio_base64, api_headers):
    """The feedback endpoint returns 403 when storage is disabled via env."""
    app_module = app_factory(ENABLE_FEEDBACK_STORAGE="false")
    client = app_module.app.test_client()

    payload = {
        "label": "HUMAN",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 403


def test_feedback_too_large_payload(app_module, client, api_headers):
    """Audio exceeding FEEDBACK_MAX_BYTES is rejected with 413."""
    big_audio = base64.b64encode(b"\x00" * (app_module.FEEDBACK_MAX_BYTES + 10)).decode("utf-8")
    payload = {
        "label": "AI_GENERATED",
        "audioFormat": "mp3",
        "audioBase64": big_audio
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 413
|
tests/test_integration_model.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from detector import HybridEnsembleDetector
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Opt-in integration tests: heavyweight model downloads are gated behind
# RUN_MODEL_TESTS so the default suite stays fast and offline.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        os.environ.get("RUN_MODEL_TESTS", "").lower() not in ["1", "true", "yes"],
        reason="Integration tests require RUN_MODEL_TESTS=true and model weights available."
    )
]


def find_ai_miss_audio():
    """Locate a known-problematic AI sample.

    Checks the AI_MISS_AUDIO_PATH env override first, then falls back to
    filename heuristics ("miss"/"false"/"hard") in the repo test_audio/
    directory. Returns a Path or None when nothing is found.
    """
    env_path = os.environ.get("AI_MISS_AUDIO_PATH")
    if env_path and Path(env_path).exists():
        return Path(env_path)

    base_dir = Path(__file__).resolve().parent.parent / "test_audio"
    if not base_dir.exists():
        return None

    candidates = []
    for path in base_dir.iterdir():
        if path.suffix.lower() not in [".mp3", ".wav"]:
            continue
        name = path.stem.lower()
        # Heuristic: filenames hinting at misses / false negatives / hard cases.
        if "miss" in name or "false" in name or "hard" in name:
            candidates.append(path)

    return candidates[0] if candidates else None


@pytest.mark.xfail(reason="Known false negative before retraining", strict=False)
def test_known_false_negative_ai_sample():
    """Tracks a known false negative; expected to fail until the model is retrained."""
    audio_path = find_ai_miss_audio()
    if audio_path is None:
        pytest.skip("No known false-negative AI sample provided.")

    # Build the real detector from the same env configuration as serving.
    detector = HybridEnsembleDetector(
        deepfake_model_path=os.environ.get(
            "DEEPFAKE_MODEL_PATH",
            "garystafford/wav2vec2-deepfake-voice-detector"
        ),
        whisper_model_path=os.environ.get(
            "WHISPER_MODEL_PATH",
            "openai/whisper-base"
        ),
        use_local_deepfake_model=os.environ.get("USE_LOCAL_DEEPFAKE_MODEL", "false").lower() in ["1", "true"],
        use_local_whisper_model=os.environ.get("USE_LOCAL_WHISPER_MODEL", "false").lower() in ["1", "true"],
        max_audio_duration=int(os.environ.get("MAX_AUDIO_DURATION", "30"))
    )

    result = detector.analyze(str(audio_path), input_type="file")
    assert result["status"] == "success"
    assert result["classification"] == "AI_GENERATED"
|
tests/test_streaming.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class FakeWebSocket:
    """Scripted stand-in for a websocket connection.

    Replays a fixed sequence of inbound messages and records everything
    the handler sends back (JSON-decoded when possible).
    """

    def __init__(self, messages, api_key):
        self._messages = iter(messages)
        self.sent = []
        # Mimic the WSGI environ so the handler can read ?api_key=... from
        # the query string, as the real websocket server would provide.
        self.environ = {"QUERY_STRING": f"api_key={api_key}"}

    def receive(self):
        """Return the next scripted message, or None once exhausted (disconnect)."""
        return next(self._messages, None)

    def send(self, message):
        """Record an outbound message; JSON payloads are stored decoded."""
        try:
            decoded = json.loads(message)
        except Exception:
            decoded = message
        self.sent.append(decoded)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def build_pcm16_chunk(sample_rate=16000, channels=1, seconds=1.0):
    """Base64-encode a silent PCM16 buffer of the requested duration.

    PCM16 uses 2 bytes per sample per channel, so the buffer length is
    sample_rate * channels * 2 * seconds (truncated to an int).
    """
    total_bytes = int(sample_rate * channels * 2 * seconds)
    return base64.b64encode(b"\x00" * total_bytes).decode("utf-8")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_streaming_success_with_partial_and_final(app_module):
    """A start + final-chunk exchange yields ack, progress, partial and final results."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "pcm16",
        "sampleRate": 16000,
        "channels": 1,
        "enablePartial": True,
        "partialIntervalSec": 0.5
    })

    chunk_msg = json.dumps({
        "type": "audio_chunk",
        "audioChunkBase64": build_pcm16_chunk(seconds=1.0),
        "final": True
    })

    ws = FakeWebSocket([start_msg, chunk_msg], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    # Only JSON-decoded messages are dicts; ignore anything sent raw.
    types = [msg.get("type") for msg in ws.sent if isinstance(msg, dict)]
    assert "ack" in types
    assert "progress" in types
    assert "partial_result" in types
    assert "final_result" in types


def test_streaming_invalid_api_key(app_module):
    """A bad api_key query parameter produces an immediate error message."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "pcm16",
        "sampleRate": 16000,
        "channels": 1
    })

    ws = FakeWebSocket([start_msg], api_key="bad_key")
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"


def test_streaming_invalid_format(app_module):
    """An unsupported audioFormat in the start message produces an error."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "aac",
        "sampleRate": 16000,
        "channels": 1
    })

    ws = FakeWebSocket([start_msg], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"


def test_streaming_disabled(app_factory):
    """With ENABLE_STREAMING=false the handler refuses with an error message."""
    app_module = app_factory(ENABLE_STREAMING="false")
    ws = FakeWebSocket([json.dumps({"type": "start"})], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"
|
try.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|