arshan123 commited on
Commit
1cb0653
·
0 Parent(s):

Added AI_Voice_Detector

Browse files
.dockerignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info
9
+ dist
10
+ build
11
+ .pytest_cache
12
+ .coverage
13
+ htmlcov
14
+ .env.local
15
+ .DS_Store
16
+ *.log
17
+ test_audio/
18
+ logs/
19
+ *.md
20
+ .git
21
+ .gitignore
22
+ docker-compose.yml
23
+ test_api.py
24
+ client.py
.gitignore ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AI_voice_dataset/
2
+ Deepfake-audio-detection-V2/
3
+ wav2vec2_finetuned_model/
4
+ wav2vec2-deepfake-voice-detector/
5
+ trained_voice_features.csv
6
+ voice_auth_model.pkl
7
+
8
+ .env
9
+ test.py
10
+
11
+
12
+ # Python
13
+ __pycache__/
14
+ *.py[cod]
15
+ *$py.class
16
+ *.so
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+
34
+ # Virtual Environment
35
+ venv/
36
+ ENV/
37
+ env/
38
+ .venv
39
+
40
+ # IDE
41
+ .vscode/
42
+ .idea/
43
+ *.swp
44
+ *.swo
45
+ *~
46
+
47
+ # Testing
48
+ .pytest_cache/
49
+ .coverage
50
+ htmlcov/
51
+ .tox/
52
+
53
+ # Environment
54
+ .env
55
+ .env.local
56
+ .env.*.local
57
+
58
+ # Logs
59
+ *.log
60
+ logs/
61
+
62
+ # OS
63
+ .DS_Store
64
+ Thumbs.db
65
+
66
+ # Audio files (for testing)
67
+ test_audio/
68
+ *.mp3
69
+ *.wav
70
+
71
+ # Self-learning data
72
+ data/
73
+
74
+ # Docker
75
+ .dockerignore
76
+
77
+ # Temporary files
78
+ *.tmp
79
+ temp/
80
+ tmp/
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Set up a new user named "user" with user ID 1000 (required by HuggingFace Spaces)
4
+ RUN useradd -m -u 1000 user
5
+
6
+ # Set working directory
7
+ WORKDIR /app
8
+
9
+ # System dependencies for audio processing
10
+ RUN apt-get update && apt-get install -y \
11
+ libsndfile1 \
12
+ ffmpeg \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Copy requirements and install as root
16
+ COPY --chown=user requirements.txt /app/requirements.txt
17
+ RUN pip install --no-cache-dir --upgrade pip && \
18
+ pip install --no-cache-dir -r requirements.txt
19
+
20
+ # Copy application files with correct ownership
21
+ COPY --chown=user app.py /app/
22
+ COPY --chown=user detector.py /app/
23
+ COPY --chown=user self_learning_train.py /app/
24
+
25
+ # Switch to the "user" user
26
+ USER user
27
+
28
+ # Set home to the user's home directory
29
+ ENV HOME=/home/user \
30
+ PATH=/home/user/.local/bin:$PATH \
31
+ PYTHONUNBUFFERED=1
32
+
33
+ # Pre-download models (will be cached in user's home)
34
+ RUN python -c "from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration; \
35
+ print('Downloading models...'); \
36
+ AutoModelForAudioClassification.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector'); \
37
+ AutoFeatureExtractor.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector'); \
38
+ WhisperProcessor.from_pretrained('openai/whisper-base'); \
39
+ WhisperForConditionalGeneration.from_pretrained('openai/whisper-base'); \
40
+ print('Models downloaded successfully')"
41
+
42
+ # Expose HuggingFace Spaces port
43
+ EXPOSE 7860
44
+
45
+ # Run with uvicorn (FastAPI)
46
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎙️ Voice Detection API
2
+
3
+ A production-ready REST API that detects whether a voice recording is AI-generated or human using hybrid analysis (physics-based + deep learning).
4
+
5
+ ## 🌟 Features
6
+
7
+ - ✅ **Multi-language Support**: Tamil, English, Hindi, Malayalam, Telugu
8
+ - ✅ **Hybrid Detection**: Combines physics analysis + Wav2Vec2 deepfake detection
9
+ - ✅ **Language Detection**: Automatic language identification using Whisper
10
+ - ✅ **Secure**: API key authentication
11
+ - ✅ **Fast**: Auto-truncates to 30 seconds for quick processing
12
+ - ✅ **Production Ready**: Docker support, logging, health checks
13
+ - ✅ **Realtime Streaming**: WebSocket streaming with partial results
14
+ - ✅ **Self-Learning Ready**: Feedback collection + calibration training
15
+
16
+ ## 📁 Project Structure
17
+
18
+ ```
19
+ voice-detection-api/
20
+ ├── app.py # Flask API application
21
+ ├── detector.py # Your HybridEnsembleDetector class
22
+ ├── self_learning_train.py # Calibration training from feedback data
23
+ ├── client.py # Example Python client
24
+ ├── test_api.py # Automated test suite
25
+ ├── requirements.txt # Python dependencies
26
+ ├── Dockerfile # Docker configuration
27
+ ├── docker-compose.yml # Docker Compose setup
28
+ ├── .env # Environment variables
29
+ ├── DEPLOYMENT.md # Detailed deployment guide
30
+ └── README.md # This file
31
+ ```
32
+
33
+ ## 🚀 Quick Start
34
+
35
+ ### Prerequisites
36
+
37
+ - Python 3.10+
38
+ - pip
39
+ - (Optional) Docker & Docker Compose
40
+
41
+ ### Installation
42
+
43
+ 1. **Clone the repository**
44
+ ```bash
45
+ git clone <your-repo-url>
46
+ cd voice-detection-api
47
+ ```
48
+
49
+ 2. **Install dependencies**
50
+ ```bash
51
+ pip install -r requirements.txt
52
+ ```
53
+
54
+ 3. **Set up environment variables**
55
+ ```bash
56
+ # Copy the example .env file
57
+ cp .env.example .env
58
+
59
+ # Edit .env and set your API key
60
+ nano .env
61
+ ```
62
+
63
+ 4. **Run the API**
64
+ ```bash
65
+ python app.py
66
+ ```
67
+
68
+ The API will start at `http://localhost:5000`
69
+
70
+ ## 🐳 Docker Deployment (Recommended)
71
+
72
+ ### Quick Start with Docker Compose
73
+
74
+ ```bash
75
+ # Start the API
76
+ docker-compose up -d
77
+
78
+ # Check status
79
+ docker-compose ps
80
+
81
+ # View logs
82
+ docker-compose logs -f
83
+
84
+ # Stop the API
85
+ docker-compose down
86
+ ```
87
+
88
+ ### Manual Docker Build
89
+
90
+ ```bash
91
+ # Build image
92
+ docker build -t voice-detection-api .
93
+
94
+ # Run container
95
+ docker run -p 5000:5000 \
96
+ -e API_KEY="your_secret_key" \
97
+ voice-detection-api
98
+ ```
99
+
100
+ ## 📡 API Usage
101
+
102
+ ### Health Check
103
+
104
+ ```bash
105
+ curl http://localhost:5000/health
106
+ ```
107
+
108
+ ### Voice Detection
109
+
110
+ **Using cURL:**
111
+ ```bash
112
+ curl -X POST http://localhost:5000/api/voice-detection \
113
+ -H "Content-Type: application/json" \
114
+ -H "x-api-key: sk_test_123456789" \
115
+ -d '{
116
+ "language": "English",
117
+ "audioFormat": "mp3",
118
+ "audioBase64": "'"$(base64 -w 0 your_audio.mp3)"'"
119
+ }'
120
+ ```
121
+
122
+ **Using Python Client:**
123
+ ```bash
124
+ # Single file
125
+ python client.py --audio test_audio.mp3 --language English
126
+
127
+ # Multiple files
128
+ python client.py \
129
+ --audio file1.mp3 \
130
+ --audio file2.mp3 \
131
+ --language Tamil
132
+ ```
133
+
134
+ **Using Python Requests:**
135
+ ```python
136
+ import requests
137
+ import base64
138
+
139
+ # Encode audio
140
+ with open('audio.mp3', 'rb') as f:
141
+ audio_base64 = base64.b64encode(f.read()).decode()
142
+
143
+ # Make request
144
+ response = requests.post(
145
+ 'http://localhost:5000/api/voice-detection',
146
+ headers={
147
+ 'Content-Type': 'application/json',
148
+ 'x-api-key': 'sk_test_123456789'
149
+ },
150
+ json={
151
+ 'language': 'English',
152
+ 'audioFormat': 'mp3',
153
+ 'audioBase64': audio_base64
154
+ }
155
+ )
156
+
157
+ result = response.json()
158
+ print(f"Classification: {result['classification']}")
159
+ print(f"Confidence: {result['confidenceScore']}")
160
+ ```
161
+
162
+ ### Realtime Streaming (WebSocket)
163
+
164
+ Endpoint: `ws://localhost:5000/ws/voice-stream`
165
+
166
+ Authentication:
167
+ - Query param: `?api_key=sk_test_123456789`
168
+ - Or header: `x-api-key` (non-browser clients)
169
+
170
+ Recommended streaming format: `pcm16` (16kHz, mono). This allows partial
171
+ results while the audio is still streaming.
172
+ If you stream `mp3` or `wav`, partial results are disabled and analysis runs
173
+ on the final buffer.
174
+
175
+ **Client -> Server messages:**
176
+ ```json
177
+ { "type": "start", "audioFormat": "pcm16", "sampleRate": 16000, "channels": 1,
178
+ "enablePartial": true, "partialIntervalSec": 10 }
179
+ ```
180
+ ```json
181
+ { "type": "audio_chunk", "audioChunkBase64": "<base64_pcm_chunk>" }
182
+ ```
183
+ ```json
184
+ { "type": "audio_chunk", "audioChunkBase64": "<base64_pcm_chunk>", "final": true }
185
+ ```
186
+
187
+ **Server -> Client messages:**
188
+ ```json
189
+ { "type": "ack", "sessionId": "...", "status": "ready" }
190
+ ```
191
+ ```json
192
+ { "type": "progress", "receivedBytes": 12345, "bufferBytes": 12345, "bufferSeconds": 2.1 }
193
+ ```
194
+ ```json
195
+ { "type": "partial_result", "result": { "status": "success", "classification": "AI_GENERATED" } }
196
+ ```
197
+ ```json
198
+ { "type": "final_result", "result": { "status": "success", "classification": "HUMAN" } }
199
+ ```
200
+
201
+ **Browser example:**
202
+ ```javascript
203
+ const ws = new WebSocket("ws://localhost:5000/ws/voice-stream?api_key=sk_test_123456789");
204
+ ws.onopen = () => {
205
+ ws.send(JSON.stringify({
206
+ type: "start",
207
+ audioFormat: "pcm16",
208
+ sampleRate: 16000,
209
+ channels: 1,
210
+ enablePartial: true
211
+ }));
212
+ // Send base64-encoded PCM16 chunks as they arrive
213
+ ws.send(JSON.stringify({ type: "audio_chunk", audioChunkBase64: chunkBase64 }));
214
+ ws.send(JSON.stringify({ type: "audio_chunk", audioChunkBase64: lastChunkBase64, final: true }));
215
+ };
216
+ ws.onmessage = (event) => console.log(event.data);
217
+ ```
218
+
219
+ ### Feedback (Self-Learning)
220
+
221
+ Send labeled audio samples so the model can periodically recalibrate.
222
+
223
+ ```bash
224
+ curl -X POST http://localhost:5000/api/feedback \
225
+ -H "Content-Type: application/json" \
226
+ -H "x-api-key: sk_test_123456789" \
227
+ -d '{
228
+ "label": "AI_GENERATED",
229
+ "audioFormat": "mp3",
230
+ "audioBase64": "'"$(base64 -w 0 new_ai_sample.mp3)"'"
231
+ }'
232
+ ```
233
+
234
+ Stored samples are written to `data/feedback/<LABEL>/YYYYMMDD/` along with
235
+ metadata JSON files and an index.
236
+
237
+ ### Train Calibration (Self-Learning)
238
+
239
+ This trains a lightweight calibration layer using feedback samples:
240
+ ```bash
241
+ python self_learning_train.py --data-dir data/feedback --output data/calibration.json
242
+ ```
243
+
244
+ If `CALIBRATION_PATH` exists, the API loads it on startup.
245
+
246
+ When retraining, the script will automatically archive the previous calibration
247
+ to `CALIBRATION_HISTORY_DIR` before writing the new file.
248
+
249
+ Reload calibration without restarting the API:
250
+ ```bash
251
+ curl -X POST http://localhost:5000/api/reload-calibration \
252
+ -H "x-api-key: sk_test_123456789"
253
+ ```
254
+
255
+ Backup the current calibration (creates a timestamped copy):
256
+ ```bash
257
+ curl -X POST http://localhost:5000/api/backup-calibration \
258
+ -H "x-api-key: sk_test_123456789" \
259
+ -d '{"reason": "pre_retrain"}'
260
+ ```
261
+
262
+ List calibration history:
263
+ ```bash
264
+ curl -X GET http://localhost:5000/api/calibration-history \
265
+ -H "x-api-key: sk_test_123456789"
266
+ ```
267
+
268
+ Rollback to a previous calibration:
269
+ ```bash
270
+ curl -X POST http://localhost:5000/api/rollback-calibration \
271
+ -H "x-api-key: sk_test_123456789" \
272
+ -d '{"versionId": "20260207T120000Z_ab12cd34"}'
273
+ ```
274
+
275
+ ## 📊 Response Format
276
+
277
+ ### Success Response
278
+ ```json
279
+ {
280
+ "status": "success",
281
+ "language": "English",
282
+ "classification": "AI_GENERATED",
283
+ "confidenceScore": 0.91,
284
+ "explanation": "Deep learning model detected synthetic voice patterns (confidence: 92.5%)"
285
+ }
286
+ ```
287
+
288
+ ### Error Response
289
+ ```json
290
+ {
291
+ "status": "error",
292
+ "message": "Invalid API key"
293
+ }
294
+ ```
295
+
296
+ ## 🔑 Authentication
297
+
298
+ All requests to `/api/voice-detection` require an API key in the header:
299
+
300
+ ```
301
+ x-api-key: your_api_key_here
302
+ ```
303
+
304
+ **Setting API Key:**
305
+ ```bash
306
+ # In .env file
307
+ API_KEY=sk_test_123456789
308
+
309
+ # Or as environment variable
310
+ export API_KEY="your_secure_key"
311
+ ```
312
+
313
+ ## 🧪 Testing
314
+
315
+ ### Run Test Suite
316
+ ```bash
317
+ pytest
318
+ ```
319
+
320
+ ### Integration Tests (full model)
321
+ ```bash
322
+ RUN_MODEL_TESTS=true pytest -m integration
323
+ ```
324
+ Set `AI_MISS_AUDIO_PATH` to point at a known false-negative AI sample to
325
+ track improvements after recalibration.
326
+
327
+ ### Manual Testing
328
+ ```bash
329
+ # Health check
330
+ curl http://localhost:5000/health
331
+
332
+ # Test with sample audio
333
+ python client.py --audio test_audio.mp3
334
+ ```
335
+
336
+ ## 📝 Supported Features
337
+
338
+ ### Languages
339
+ - Tamil
340
+ - English
341
+ - Hindi
342
+ - Malayalam
343
+ - Telugu
344
+
345
+ ### Classifications
346
+ - `AI_GENERATED` - Synthetic/AI voice
347
+ - `HUMAN` - Real human voice
348
+
349
+ ### Audio Requirements
350
+ - Format: MP3 only
351
+ - Input: Base64 encoded
352
+ - Max duration: 30 seconds (auto-truncated)
353
+
354
+ ## ⚙️ Configuration
355
+
356
+ ### Environment Variables
357
+
358
+ | Variable | Default | Description |
359
+ |----------|---------|-------------|
360
+ | `API_KEY` | `sk_test_123456789` | API authentication key |
361
+ | `PORT` | `5000` | Server port |
362
+ | `FLASK_ENV` | `production` | Flask environment |
363
+ | `ENABLE_STREAMING` | `true` | Enable WebSocket streaming endpoint |
364
+ | `STREAMING_MAX_BUFFER_SECONDS` | `30` | Max audio seconds buffered for streaming |
365
+ | `STREAMING_PARTIAL_INTERVAL_SECONDS` | `10` | Partial result interval for streaming |
366
+ | `STREAMING_PARTIAL_MODE` | `physics` | Partial mode: `full`, `physics`, or `dl` |
367
+ | `STREAMING_MAX_CHUNK_BYTES` | `2097152` | Max size per streaming chunk |
368
+ | `ENABLE_FEEDBACK_STORAGE` | `true` | Enable feedback storage for self-learning |
369
+ | `FEEDBACK_STORAGE_DIR` | `data/feedback` | Feedback storage directory |
370
+ | `FEEDBACK_MAX_BYTES` | `15728640` | Max feedback payload size |
371
+ | `CALIBRATION_PATH` | `data/calibration.json` | Calibration file path |
372
+ | `SKIP_MODEL_LOAD` | `false` | Skip loading models at startup (useful for tests) |
373
+ | `CALIBRATION_HISTORY_DIR` | `data/calibration_history` | Calibration backup directory |
374
+ | `CALIBRATION_HISTORY_MAX` | `50` | Max calibration backups retained |
375
+
376
+ ### Model Configuration
377
+
378
+ Edit the detector initialization in `app.py`:
379
+
380
+ ```python
381
+ detector = HybridEnsembleDetector(
382
+ physics_weight=0.4, # Physics model weight
383
+ dl_weight=0.6, # Deep learning weight
384
+ max_audio_duration=30 # Max seconds to process
385
+ )
386
+ ```
387
+
388
+ ## 🏗️ Architecture
389
+
390
+ ### Detection Pipeline
391
+
392
+ 1. **Audio Input** → Base64 MP3
393
+ 2. **Preprocessing** → Decode, convert to 16kHz mono
394
+ 3. **Language Detection** → Whisper model identifies language
395
+ 4. **Physics Analysis** → Acoustic feature extraction
396
+ 5. **Deep Learning** → Wav2Vec2 deepfake detection
397
+ 6. **Ensemble** → Weighted combination of scores
398
+ 7. **Classification** → AI_GENERATED or HUMAN
399
+
400
+ ### Models Used
401
+
402
+ - **Deepfake Detector**: `garystafford/wav2vec2-deepfake-voice-detector`
403
+ - **Language Detector**: `openai/whisper-base`
404
+
405
+ ## 📈 Performance
406
+
407
+ - **Processing Time**: 2-10 seconds per audio
408
+ - **Memory**: ~2GB RAM minimum
409
+ - **Accuracy**: Varies by language and audio quality
410
+ - **Throughput**: ~5-10 requests/minute per worker
411
+
412
+ ## 🔧 Troubleshooting
413
+
414
+ ### Models Not Loading
415
+ ```bash
416
+ # Pre-download models
417
+ python -c "from transformers import AutoModelForAudioClassification; \
418
+ AutoModelForAudioClassification.from_pretrained('garystafford/wav2vec2-deepfake-voice-detector')"
419
+ ```
420
+
421
+ ### Port Already in Use
422
+ ```bash
423
+ # Change port in .env
424
+ PORT=8000
425
+
426
+ # Or use environment variable
427
+ PORT=8000 python app.py
428
+ ```
429
+
430
+ ### Memory Issues
431
+ - Reduce `max_audio_duration` to 15 seconds
432
+ - Use fewer Docker workers
433
+ - Increase system RAM
434
+
435
+ ## 📖 Documentation
436
+
437
+ - **Full Deployment Guide**: See [DEPLOYMENT.md](DEPLOYMENT.md)
438
+ - **API Reference**: See API section above
439
+ - **Model Details**: See `detector.py` comments
440
+
441
+ ## 🛡️ Security Notes
442
+
443
+ - Never commit API keys to version control
444
+ - Use strong, random API keys in production
445
+ - Enable HTTPS/TLS for production deployments
446
+ - Implement rate limiting for production use
447
+ - Regularly update dependencies
448
+
449
+ ## 🚀 Production Deployment
450
+
451
+ ### Using Gunicorn
452
+ ```bash
453
+ gunicorn --bind 0.0.0.0:5000 --workers 2 --timeout 120 app:app
454
+ ```
455
+
456
+ ### With Nginx Reverse Proxy
457
+ See [DEPLOYMENT.md](DEPLOYMENT.md) for Nginx configuration
458
+
459
+ ### Cloud Platforms
460
+ - AWS: EC2 + Docker or Elastic Beanstalk
461
+ - Google Cloud: Cloud Run or Compute Engine
462
+ - Azure: App Service or Container Instances
463
+ - Heroku: Supports Python + Docker
464
+
465
+ ## 📞 Support
466
+
467
+ For issues or questions:
468
+ 1. Check [DEPLOYMENT.md](DEPLOYMENT.md)
469
+ 2. Run test suite: `python test_api.py`
470
+ 3. Check logs: `docker-compose logs`
471
+
472
+ ## 📄 License
473
+
474
+ This project uses open-source models:
475
+ - Wav2Vec2: Apache 2.0
476
+ - Whisper: MIT
477
+
478
+ ## 🙏 Credits
479
+
480
+ - **Models**: HuggingFace transformers
481
+ - **Framework**: Flask
482
+ - **Audio Processing**: Librosa, SoundFile
483
+
484
+ ---
485
+
486
+ **Version**: 1.0.0
487
+ **Status**: Production Ready ✅
488
+ **Last Updated**: February 2026
app.py ADDED
@@ -0,0 +1,1053 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Detection API - Flask Application (HuggingFace Spaces Version)
3
+ Accepts Base64-encoded MP3 audio and returns AI vs Human classification
4
+ """
5
+
6
+ from flask import Flask, request, jsonify
7
+ from flask_cors import CORS
8
+ from flask_sock import Sock
9
+ from functools import wraps
10
+ import base64
11
+ import json
12
+ import os
13
+ import logging
14
+ import shutil
15
+ import tempfile
16
+ import uuid
17
+ import wave
18
+ from datetime import datetime
19
+ from urllib.parse import parse_qs
20
+
21
+ # Import the detector
22
+ from detector import HybridEnsembleDetector
23
+
24
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize Flask app (CORS for browser clients, Sock for WebSocket support)
app = Flask(__name__)
CORS(app)
sock = Sock(app)

# Load API key from environment variable (HuggingFace Secrets).
# Never log key material — even a short prefix narrows a brute-force search.
API_KEY = os.environ.get('API_KEY', 'sk_test_123456789')
if API_KEY == 'sk_test_123456789':
    logger.warning("API initialized with the DEFAULT test key - set API_KEY in production")
else:
    logger.info("API initialized with a custom API key")
39
+
40
def parse_bool(value, default=False):
    """Coerce an environment-style value to a bool.

    None yields *default*; genuine booleans pass through unchanged; any
    other value is stringified, whitespace-stripped, lower-cased, and
    matched against the common truthy spellings.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    truthy = {"1", "true", "yes", "y", "on"}
    normalized = str(value).strip().lower()
    return normalized in truthy
46
+
47
# Streaming configuration
# Master switch for the WebSocket /ws/voice-stream endpoint.
STREAMING_ENABLED = parse_bool(os.environ.get("ENABLE_STREAMING", "true"))
# Cap on buffered audio duration; older pcm16 audio is dropped beyond this.
STREAMING_MAX_BUFFER_SECONDS = int(os.environ.get("STREAMING_MAX_BUFFER_SECONDS", 30))
# How much new audio (seconds) must arrive before another partial result runs.
STREAMING_PARTIAL_INTERVAL_SECONDS = float(os.environ.get("STREAMING_PARTIAL_INTERVAL_SECONDS", 10))
# Partial analysis mode; README documents "full", "physics", or "dl".
STREAMING_PARTIAL_MODE = os.environ.get("STREAMING_PARTIAL_MODE", "physics").lower()
# Reject individual streamed chunks larger than this (default 2 MiB).
STREAMING_MAX_CHUNK_BYTES = int(os.environ.get("STREAMING_MAX_CHUNK_BYTES", 2 * 1024 * 1024))
# Accepted "audioFormat" values for a streaming session.
STREAMING_SUPPORTED_FORMATS = {"pcm16", "wav", "mp3"}

# Self-learning / feedback configuration
# Master switch for storing labeled feedback samples.
ENABLE_FEEDBACK_STORAGE = parse_bool(os.environ.get("ENABLE_FEEDBACK_STORAGE", "true"))
FEEDBACK_STORAGE_DIR = os.environ.get("FEEDBACK_STORAGE_DIR", "data/feedback")
# Reject feedback payloads larger than this (default 15 MiB).
FEEDBACK_MAX_BYTES = int(os.environ.get("FEEDBACK_MAX_BYTES", 15 * 1024 * 1024))
# Calibration file loaded by the detector at startup (if present).
CALIBRATION_PATH = os.environ.get("CALIBRATION_PATH", "data/calibration.json")
# Where calibration snapshots are archived, and how many to retain.
CALIBRATION_HISTORY_DIR = os.environ.get("CALIBRATION_HISTORY_DIR", "data/calibration_history")
CALIBRATION_HISTORY_MAX = int(os.environ.get("CALIBRATION_HISTORY_MAX", 50))

# Initialize the detector globally (load models once at startup)
logger.info("Loading AI detection models...")
detector = None  # set by init_detector(); None means models not loaded yet
# When true, skip model loading at import time (useful for unit tests).
SKIP_MODEL_LOAD = parse_bool(os.environ.get("SKIP_MODEL_LOAD", "false"))
67
+
68
def init_detector():
    """Create the global HybridEnsembleDetector, loading all models.

    The deepfake model location is configurable via the
    DEEPFAKE_MODEL_PATH environment variable: an existing local
    directory is used as-is; anything else is treated as a HuggingFace
    Hub model id (the Dockerfile pre-downloads the default id).

    Returns:
        bool: True on success, False on failure (the error is logged,
        not raised, so the API can still start and retry later).
    """
    global detector
    # Previously a hard-coded Windows path (D:\hackathons\...), which broke
    # every environment except the author's machine, including the Docker
    # image that pre-downloads this exact hub model.
    deepfake_model = os.environ.get(
        "DEEPFAKE_MODEL_PATH",
        "garystafford/wav2vec2-deepfake-voice-detector"
    )
    try:
        detector = HybridEnsembleDetector(
            deepfake_model_path=deepfake_model,
            whisper_model_path="openai/whisper-base",
            physics_weight=0.4,
            dl_weight=0.6,
            # Local directory -> load from disk; otherwise pull from the hub.
            use_local_deepfake_model=os.path.isdir(deepfake_model),
            use_local_whisper_model=False,
            calibration_path=CALIBRATION_PATH,
            max_audio_duration=30
        )
        logger.info("✅ Detector initialized successfully")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to initialize detector: {str(e)}")
        return False
87
+
88
# Initialize detector at startup.
# SKIP_MODEL_LOAD bypasses loading entirely (tests); a failed init is logged
# but not fatal, so the process can still come up and serve /health.
if SKIP_MODEL_LOAD:
    logger.info("⚠️ Skipping detector initialization (SKIP_MODEL_LOAD=true)")
elif not init_detector():
    logger.warning("⚠️ API starting without detector - models will be loaded on first request")
93
+
94
+
95
+ # ==========================================================
96
+ # AUTHENTICATION DECORATOR
97
+ # ==========================================================
98
def require_api_key(f):
    """Decorator enforcing the 'x-api-key' header on an endpoint.

    Responds 401 when the header is missing and 403 when it does not
    match the configured API_KEY. The comparison uses
    hmac.compare_digest so response timing does not leak how many
    leading characters of a guessed key were correct.
    """
    import hmac  # local import: hmac is not in the module's import block

    @wraps(f)
    def decorated_function(*args, **kwargs):
        # Get API key from headers
        provided_key = request.headers.get('x-api-key')

        if not provided_key:
            logger.warning(f"Request without API key from {request.remote_addr}")
            return jsonify({
                "status": "error",
                "message": "Missing API key. Please provide 'x-api-key' in request headers."
            }), 401

        # Constant-time comparison guards against timing attacks.
        if not hmac.compare_digest(provided_key, API_KEY):
            logger.warning(f"Invalid API key attempt from {request.remote_addr}")
            return jsonify({
                "status": "error",
                "message": "Invalid API key"
            }), 403

        return f(*args, **kwargs)

    return decorated_function
122
+
123
+
124
def get_ws_api_key(environ):
    """Extract an API key from a WebSocket handshake environ.

    Precedence: 'x-api-key' header, then 'Authorization: Bearer <key>',
    then an 'api_key' query parameter. Returns None when nothing usable
    is present (including a falsy environ).
    """
    if not environ:
        return None

    header_key = environ.get("HTTP_X_API_KEY")
    if header_key:
        return header_key

    authorization = environ.get("HTTP_AUTHORIZATION")
    if authorization and authorization.lower().startswith("bearer "):
        return authorization.split(" ", 1)[1]

    params = parse_qs(environ.get("QUERY_STRING", ""))
    candidates = params.get("api_key")
    if candidates:
        return candidates[0]

    return None
141
+
142
+
143
def normalize_label(label):
    """Map a free-form feedback label onto the two canonical classes.

    Returns "AI_GENERATED", "HUMAN", or None for anything unrecognized
    (including a None input). Matching is case-insensitive and
    whitespace-tolerant.
    """
    if label is None:
        return None

    canonical = str(label).strip().upper()
    ai_aliases = {"AI_GENERATED", "AI", "FAKE", "SYNTHETIC"}
    human_aliases = {"HUMAN", "REAL"}

    if canonical in ai_aliases:
        return "AI_GENERATED"
    if canonical in human_aliases:
        return "HUMAN"
    return None
152
+
153
+
154
def decode_audio_base64(audio_base64):
    """Decode a base64 audio payload, tolerating data-URI prefixes.

    When the payload is a "data:" URI, the MIME header is inspected to
    sniff "wav" or "mp3"; for plain base64 strings the detected format
    is None. Returns (raw_bytes, detected_format_or_None).
    """
    detected_format = None

    if isinstance(audio_base64, str) and audio_base64.startswith("data:"):
        # Split "data:<mime>;base64" header off the actual payload.
        header, audio_base64 = audio_base64.split(",", 1)
        header = header.lower()
        if "audio/wav" in header or "audio/x-wav" in header:
            detected_format = "wav"
        elif "audio/mpeg" in header or "audio/mp3" in header:
            detected_format = "mp3"

    return base64.b64decode(audio_base64), detected_format
165
+
166
+
167
def write_bytes_to_temp_file(data, suffix):
    """Persist *data* to a new temporary file and return its path.

    The file is created with delete=False, so it survives the call;
    the caller owns cleanup.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        handle.write(data)
        return handle.name
172
+
173
+
174
def write_pcm16_to_wav_file(pcm_bytes, sample_rate, channels):
    """Wrap raw little-endian PCM16 samples in a temporary WAV container.

    A trailing odd byte (half a sample) is dropped before writing.
    Returns the path of the new .wav file; the caller is responsible
    for deleting it.
    """
    # Samples are 2 bytes wide; discard a dangling half-sample.
    if len(pcm_bytes) % 2:
        pcm_bytes = pcm_bytes[:-1]

    # Reserve a temp path first, then let the wave module write into it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        wav_path = handle.name

    with wave.open(wav_path, "wb") as wav_out:
        wav_out.setnchannels(channels)
        wav_out.setsampwidth(2)
        wav_out.setframerate(sample_rate)
        wav_out.writeframes(pcm_bytes)

    return wav_path
189
+
190
+
191
def format_detection_payload(result, requested_language=None):
    """Shape a raw detector result dict into the public API response.

    Non-success results collapse to {"status": "error", "message": ...}
    (preferring "error", then "message", then a generic fallback).
    Successes expose classification, confidence, explanation, detected
    language, and analysis mode, plus the caller-requested language
    when one was given.
    """
    if result.get("status") != "success":
        message = result.get("error") or result.get("message") or "Unknown error"
        return {"status": "error", "message": message}

    payload = {
        "status": "success",
        "classification": result.get("classification"),
        "confidenceScore": result.get("confidenceScore"),
        "explanation": result.get("explanation"),
        "detectedLanguage": result.get("language", "Unknown"),
        "analysisMode": result.get("analysisMode", "full"),
    }

    if requested_language:
        payload["requestedLanguage"] = requested_language

    return payload
211
+
212
+
213
def ensure_dir(path):
    """Create *path* (and any parents) when it is truthy; no-op otherwise."""
    if not path:
        return
    os.makedirs(path, exist_ok=True)
216
+
217
+
218
def build_calibration_version_id():
    """Return a unique, lexicographically sortable calibration version id.

    Format: UTC timestamp "YYYYMMDDTHHMMSSZ" + "_" + 8 random hex chars.
    Uses timezone-aware UTC time — datetime.utcnow() is deprecated since
    Python 3.12; the rendered string is unchanged.
    """
    from datetime import timezone  # local: module imports only `datetime`

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    suffix = uuid.uuid4().hex[:8]
    return f"{timestamp}_{suffix}"
222
+
223
+
224
def calibration_history_files():
    """List archived calibration snapshot paths, newest first.

    Only "calibration_*.json" files count; their "*.meta.json" side-cars
    are excluded. Returns [] when the history directory does not exist.
    Ordering is by file mtime, descending.
    """
    if not os.path.isdir(CALIBRATION_HISTORY_DIR):
        return []

    snapshots = [
        os.path.join(CALIBRATION_HISTORY_DIR, name)
        for name in os.listdir(CALIBRATION_HISTORY_DIR)
        if name.startswith("calibration_")
        and name.endswith(".json")
        and not name.endswith(".meta.json")
    ]
    snapshots.sort(key=os.path.getmtime, reverse=True)
    return snapshots
236
+
237
+
238
def archive_calibration(reason=None):
    """Copy the live calibration file into the history directory.

    Args:
        reason: Optional string recorded in the snapshot's metadata
            (defaults to "manual").

    Returns:
        dict with "versionId" and "path" for the new snapshot, or None
        when there is no calibration file to archive. Also writes a
        ".meta.json" side-car and prunes snapshots beyond
        CALIBRATION_HISTORY_MAX.
    """
    if not os.path.exists(CALIBRATION_PATH):
        return None

    ensure_dir(CALIBRATION_HISTORY_DIR)
    version_id = build_calibration_version_id()
    filename = f"calibration_{version_id}.json"
    dest_path = os.path.join(CALIBRATION_HISTORY_DIR, filename)
    # copy2 preserves mtime, which calibration_history_files() sorts by.
    shutil.copy2(CALIBRATION_PATH, dest_path)

    meta = {
        "versionId": version_id,
        "source": CALIBRATION_PATH,
        "archivedAt": datetime.utcnow().isoformat() + "Z",
        "reason": reason or "manual"
    }
    meta_path = os.path.join(CALIBRATION_HISTORY_DIR, f"calibration_{version_id}.meta.json")
    with open(meta_path, "w", encoding="utf-8") as handle:
        json.dump(meta, handle, indent=2)

    # Retention: drop the oldest snapshots (and their side-cars) beyond the cap.
    if CALIBRATION_HISTORY_MAX > 0:
        history = calibration_history_files()
        for path in history[CALIBRATION_HISTORY_MAX:]:
            try:
                os.unlink(path)
            except Exception:
                # Best-effort cleanup: a stale snapshot is harmless.
                pass
            meta_path = path.replace(".json", ".meta.json")
            if os.path.exists(meta_path):
                try:
                    os.unlink(meta_path)
                except Exception:
                    pass

    return {
        "versionId": version_id,
        "path": dest_path
    }
276
+
277
+
278
def list_calibration_history():
    """Summarize archived calibration snapshots, newest first.

    Each entry carries "versionId", "path", and — when the side-car
    meta file exists and parses — "archivedAt" and "reason" (otherwise
    those fields are None).
    """
    summaries = []
    for snapshot_path in calibration_history_files():
        base_name = os.path.basename(snapshot_path)
        version_id = base_name.replace("calibration_", "").replace(".json", "")

        meta = {}
        meta_path = snapshot_path.replace(".json", ".meta.json")
        if os.path.exists(meta_path):
            try:
                with open(meta_path, "r", encoding="utf-8") as handle:
                    meta = json.load(handle)
            except Exception:
                # Unreadable/corrupt metadata is treated as absent.
                meta = {}

        summaries.append({
            "versionId": version_id,
            "path": snapshot_path,
            "archivedAt": meta.get("archivedAt"),
            "reason": meta.get("reason"),
        })
    return summaries
298
+
299
+
300
def resolve_history_path(version_id):
    """Map a calibration version id to its snapshot path (None for no id)."""
    if not version_id:
        return None
    return os.path.join(CALIBRATION_HISTORY_DIR, f"calibration_{version_id}.json")
305
+
306
+
307
class StreamSession:
    """Per-connection state for the WebSocket voice-streaming endpoint.

    Buffers incoming audio, tracks how much has been received, and
    decides when a partial analysis should run. Only "pcm16" streams
    support duration accounting, buffer trimming, and partial results;
    other formats ("wav", "mp3") are buffered verbatim and analyzed
    once, on the final buffer.
    """

    def __init__(
        self,
        audio_format,
        sample_rate,
        channels,
        max_seconds,
        enable_partial,
        partial_interval_seconds,
        partial_mode
    ):
        self.session_id = str(uuid.uuid4())
        self.audio_format = audio_format  # "pcm16", "wav", or "mp3"
        self.sample_rate = sample_rate
        self.channels = channels
        self.max_seconds = max_seconds  # buffer cap, enforced for pcm16 only
        self.enable_partial = enable_partial
        self.partial_interval_seconds = partial_interval_seconds
        self.partial_mode = partial_mode
        self.buffer = bytearray()  # rolling audio buffer (newest audio kept)
        self.total_bytes_received = 0  # monotonically increasing, never trimmed
        self.total_seconds_received = 0.0  # derived from bytes, pcm16 only
        self.last_partial_seconds = 0.0  # stream position of the last partial run

    def add_chunk(self, chunk_bytes):
        """Append a chunk, trim the buffer to max_seconds (pcm16 only),
        and return the current buffered duration (None for non-pcm16)."""
        self.total_bytes_received += len(chunk_bytes)
        self.buffer.extend(chunk_bytes)

        if self.audio_format == "pcm16":
            # 16-bit PCM: 2 bytes per sample per channel.
            bytes_per_second = self.sample_rate * self.channels * 2
            if bytes_per_second > 0:
                self.total_seconds_received = self.total_bytes_received / bytes_per_second
                max_bytes = int(self.max_seconds * bytes_per_second)
                if max_bytes > 0 and len(self.buffer) > max_bytes:
                    # Keep the newest audio; drop the oldest overflow.
                    overflow = len(self.buffer) - max_bytes
                    del self.buffer[:overflow]

        return self.current_buffer_seconds()

    def current_buffer_seconds(self):
        """Duration of the buffered audio in seconds (pcm16 only, else None)."""
        if self.audio_format != "pcm16":
            return None
        bytes_per_second = self.sample_rate * self.channels * 2
        if bytes_per_second <= 0:
            return None
        return len(self.buffer) / bytes_per_second

    def should_run_partial(self):
        """Return True — and advance the bookmark — when at least
        partial_interval_seconds of new audio arrived since the last
        partial result. Always False for non-pcm16 streams or when
        partials are disabled."""
        if not self.enable_partial:
            return False
        if self.audio_format != "pcm16":
            return False
        if self.partial_interval_seconds <= 0:
            return False
        if (self.total_seconds_received - self.last_partial_seconds) >= self.partial_interval_seconds:
            self.last_partial_seconds = self.total_seconds_received
            return True
        return False

    def write_temp_audio_file(self):
        """Materialize the buffer as a temporary audio file.

        Returns (path, format): pcm16 buffers are wrapped into a WAV
        container; wav/mp3 buffers are written verbatim. The caller
        owns deletion of the file."""
        if self.audio_format == "pcm16":
            return write_pcm16_to_wav_file(self.buffer, self.sample_rate, self.channels), "wav"

        suffix = ".mp3" if self.audio_format == "mp3" else ".wav"
        return write_bytes_to_temp_file(self.buffer, suffix), self.audio_format
372
+
373
+
374
+ # ==========================================================
375
+ # ROOT ENDPOINT (HuggingFace Spaces Homepage)
376
+ # ==========================================================
377
@app.route('/', methods=['GET'])
def home():
    """Root endpoint - API information"""
    api_info = {
        "name": "Voice Detection API",
        "version": "1.0.0",
        "description": "AI-powered voice detection system for identifying AI-generated vs human voices",
        "endpoints": {
            "health": "/health",
            "detection": "/api/voice-detection",
            "streaming": "/ws/voice-stream",
            "feedback": "/api/feedback",
            "reload_calibration": "/api/reload-calibration",
            "backup_calibration": "/api/backup-calibration",
            "rollback_calibration": "/api/rollback-calibration",
            "calibration_history": "/api/calibration-history"
        },
        "supported_languages": ["Tamil", "English", "Hindi", "Malayalam", "Telugu"],
        "authentication": "Required - use 'x-api-key' header",
        "documentation": "See README for full API documentation"
    }
    return jsonify(api_info), 200
398
+
399
+
400
+ # ==========================================================
401
+ # HEALTH CHECK ENDPOINT
402
+ # ==========================================================
403
@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint (no authentication required)"""
    # calibration_loaded is only True when the detector exists AND its
    # calibrator has successfully loaded a calibration file.
    calibration_ok = bool(detector and detector.calibrator and detector.calibrator.ready)
    status_body = {
        "status": "healthy",
        "service": "Voice Detection API",
        "timestamp": datetime.utcnow().isoformat(),
        "models_loaded": detector is not None,
        "calibration_loaded": calibration_ok,
        "streaming_enabled": STREAMING_ENABLED,
        "platform": "HuggingFace Spaces"
    }
    return jsonify(status_body), 200
415
+
416
+
417
+ # ==========================================================
418
+ # MAIN VOICE DETECTION ENDPOINT
419
+ # ==========================================================
420
@app.route('/api/voice-detection', methods=['POST'])
@require_api_key
def voice_detection():
    """
    Main voice detection endpoint

    Expected JSON Body:
        {
            "language": "Tamil" | "English" | "Hindi" | "Malayalam" | "Telugu",
            "audioFormat": "mp3",
            "audioBase64": "base64_encoded_audio_string"
        }

    Returns:
        {
            "status": "success",
            "language": "Tamil",
            "classification": "AI_GENERATED" | "HUMAN",
            "confidenceScore": 0.0-1.0,
            "explanation": "..."
        }
    """
    global detector

    try:
        # Validate Content-Type
        if not request.is_json:
            return jsonify({
                "status": "error",
                "message": "Content-Type must be application/json"
            }), 400

        # get_json() can return None (malformed body) or a non-dict
        # (e.g. a top-level JSON array); guard before key lookups so a
        # bad payload yields a 400 instead of an unhandled TypeError/500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({
                "status": "error",
                "message": "Request body must be a JSON object"
            }), 400

        # Validate required fields
        required_fields = ['language', 'audioFormat', 'audioBase64']
        missing_fields = [field for field in required_fields if field not in data]

        if missing_fields:
            return jsonify({
                "status": "error",
                "message": f"Missing required fields: {', '.join(missing_fields)}"
            }), 400

        # Validate language
        supported_languages = ['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu']
        if data['language'] not in supported_languages:
            return jsonify({
                "status": "error",
                "message": f"Unsupported language. Must be one of: {', '.join(supported_languages)}"
            }), 400

        # Validate audio format. str() guards against non-string JSON
        # values (e.g. a number), which would otherwise raise on .lower().
        if str(data['audioFormat']).lower() != 'mp3':
            return jsonify({
                "status": "error",
                "message": "Only MP3 audio format is supported"
            }), 400

        # Validate base64 string (a real MP3 payload is far longer than
        # 100 base64 characters).
        audio_base64 = data['audioBase64']
        if not audio_base64 or len(audio_base64) < 100:
            return jsonify({
                "status": "error",
                "message": "Invalid or empty audio data"
            }), 400

        # Initialize detector if not already loaded (models are lazy-loaded
        # on first use to keep startup fast).
        if detector is None:
            logger.info("Lazy loading detector on first request...")
            if not init_detector():
                return jsonify({
                    "status": "error",
                    "message": "Failed to load AI detection models. Please try again later."
                }), 503

        # Log request
        logger.info(f"Processing voice detection request for language: {data['language']}")

        # Analyze audio
        result = detector.analyze(
            audio_base64,
            input_type="base64",
            audio_format=data['audioFormat']
        )

        # Check if analysis was successful
        if result['status'] != 'success':
            error_msg = result.get('error', 'Unknown error during analysis')
            logger.error(f"Analysis failed: {error_msg}")
            return jsonify({
                "status": "error",
                "message": f"Audio analysis failed: {error_msg}"
            }), 500

        # Prepare response (API compliant format - NO DEBUG INFO in production)
        response = {
            "status": "success",
            "language": data['language'],  # Use requested language from input
            "classification": result['classification'],
            "confidenceScore": result['confidenceScore'],
            "explanation": result['explanation']
        }

        logger.info(f"✅ Analysis complete: {result['classification']} (confidence: {result['confidenceScore']})")

        return jsonify(response), 200

    except Exception as e:
        logger.error(f"Unexpected error in voice_detection: {str(e)}", exc_info=True)
        return jsonify({
            "status": "error",
            "message": "Internal server error occurred during processing"
        }), 500
535
+
536
+
537
+ # ==========================================================
538
+ # FEEDBACK / SELF-LEARNING ENDPOINT
539
+ # ==========================================================
540
@app.route('/api/feedback', methods=['POST'])
@require_api_key
def feedback():
    """
    Collect labeled audio samples for periodic self-learning.

    Expected JSON Body:
        {
            "label": "AI_GENERATED" | "HUMAN",
            "audioFormat": "mp3" | "wav",
            "audioBase64": "base64_encoded_audio_string",
            "runDetection": false,
            "metadata": { ... }
        }

    Side effects (in order): writes the audio file under
    FEEDBACK_STORAGE_DIR/<label>/<YYYYMMDD>/, writes a per-sample
    metadata JSON next to it, and appends one JSON line to the global
    index.jsonl. Returns 403 when feedback storage is disabled.
    """
    if not ENABLE_FEEDBACK_STORAGE:
        return jsonify({
            "status": "error",
            "message": "Feedback storage is disabled"
        }), 403

    if not request.is_json:
        return jsonify({
            "status": "error",
            "message": "Content-Type must be application/json"
        }), 400

    data = request.get_json()
    # normalize_label maps the client string onto a canonical label or a
    # falsy value when it is not recognized.
    label = normalize_label(data.get("label"))
    if not label:
        return jsonify({
            "status": "error",
            "message": "Invalid label. Use AI_GENERATED or HUMAN."
        }), 400

    audio_format = str(data.get("audioFormat", "mp3")).lower()
    if audio_format not in ["mp3", "wav"]:
        return jsonify({
            "status": "error",
            "message": "audioFormat must be 'mp3' or 'wav'"
        }), 400

    audio_base64 = data.get("audioBase64")
    if not audio_base64 or len(audio_base64) < 100:
        return jsonify({
            "status": "error",
            "message": "Invalid or empty audio data"
        }), 400

    try:
        audio_bytes, detected_format = decode_audio_base64(audio_base64)
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": f"Failed to decode audio: {str(e)}"
        }), 400

    # The format sniffed from the decoded bytes wins over the client's
    # declared format when available.
    if detected_format:
        audio_format = detected_format

    if len(audio_bytes) > FEEDBACK_MAX_BYTES:
        return jsonify({
            "status": "error",
            "message": "Audio payload exceeds maximum size"
        }), 413

    # Samples are bucketed by label and by UTC date for easy batching.
    now = datetime.utcnow()
    date_dir = now.strftime("%Y%m%d")
    label_dir = os.path.join(FEEDBACK_STORAGE_DIR, label, date_dir)
    os.makedirs(label_dir, exist_ok=True)

    sample_id = str(uuid.uuid4())
    extension = ".mp3" if audio_format == "mp3" else ".wav"
    file_path = os.path.join(label_dir, f"{sample_id}{extension}")

    with open(file_path, "wb") as handle:
        handle.write(audio_bytes)

    metadata = {
        "id": sample_id,
        "label": label,
        "audio_format": audio_format,
        "created_at": now.isoformat() + "Z",
        "bytes": len(audio_bytes),
        "path": file_path,
        "client_metadata": data.get("metadata", {})
    }

    # Optionally score the sample immediately so the stored metadata
    # also carries the model's view of it.
    if parse_bool(data.get("runDetection", False)):
        global detector
        if detector is None:
            logger.info("Lazy loading detector for feedback scoring...")
            if not init_detector():
                return jsonify({
                    "status": "error",
                    "message": "Failed to load AI detection models for scoring"
                }), 503

        scores = detector.extract_scores(file_path, input_type="file")
        if scores.get("status") == "success":
            metadata["physics_score"] = scores.get("physics_score")
            metadata["dl_score"] = scores.get("dl_score")
            metadata["dl_label"] = scores.get("dl_label")
            metadata["audio_duration"] = scores.get("audio_duration")
            metadata["was_truncated"] = scores.get("was_truncated")

    # Per-sample sidecar metadata file.
    meta_path = os.path.join(label_dir, f"{sample_id}.json")
    with open(meta_path, "w", encoding="utf-8") as handle:
        json.dump(metadata, handle, indent=2)

    # Append-only global index (one JSON object per line).
    index_path = os.path.join(FEEDBACK_STORAGE_DIR, "index.jsonl")
    with open(index_path, "a", encoding="utf-8") as handle:
        handle.write(json.dumps(metadata) + "\n")

    return jsonify({
        "status": "success",
        "id": sample_id,
        "label": label,
        "audioFormat": audio_format,
        "stored": True
    }), 200
661
+
662
+
663
+ # ==========================================================
664
+ # CALIBRATION RELOAD ENDPOINT
665
+ # ==========================================================
666
@app.route('/api/reload-calibration', methods=['POST'])
@require_api_key
def reload_calibration():
    """Re-read the calibration file from disk into the live detector."""
    global detector

    # The detector is lazy-loaded; create it if no request has done so yet.
    if detector is None:
        logger.info("Lazy loading detector for calibration reload...")
        if not init_detector():
            return jsonify({
                "status": "error",
                "message": "Failed to load AI detection models"
            }), 503

    if not detector.reload_calibration(CALIBRATION_PATH):
        return jsonify({
            "status": "error",
            "message": "Calibration file not found or invalid"
        }), 404

    return jsonify({
        "status": "success",
        "calibrationPath": detector.calibrator.calibration_path
    }), 200
690
+
691
+
692
@app.route('/api/backup-calibration', methods=['POST'])
@require_api_key
def backup_calibration():
    """Archive the current calibration file and report the new version id."""
    # The body is optional; an absent/invalid body simply means no reason.
    body = request.get_json(silent=True) or {}
    reason = body.get("reason")

    if not os.path.exists(CALIBRATION_PATH):
        return jsonify({
            "status": "error",
            "message": "Calibration file not found"
        }), 404

    snapshot = archive_calibration(reason=reason or "api_backup")
    if not snapshot:
        return jsonify({
            "status": "error",
            "message": "Failed to archive calibration"
        }), 500

    return jsonify({
        "status": "success",
        "versionId": snapshot["versionId"],
        "path": snapshot["path"]
    }), 200
716
+
717
+
718
@app.route('/api/calibration-history', methods=['GET'])
@require_api_key
def calibration_history():
    """List archived calibration versions with their metadata."""
    return jsonify({
        "status": "success",
        "history": list_calibration_history()
    }), 200
726
+
727
+
728
@app.route('/api/rollback-calibration', methods=['POST'])
@require_api_key
def rollback_calibration():
    """Restore an archived calibration version and reload it into the detector."""
    global detector

    body = request.get_json(silent=True) or {}
    version_id = body.get("versionId")
    if not version_id:
        return jsonify({
            "status": "error",
            "message": "Missing versionId"
        }), 400

    source_path = resolve_history_path(version_id)
    if not (source_path and os.path.exists(source_path)):
        return jsonify({
            "status": "error",
            "message": "Calibration version not found"
        }), 404

    # Overwrite the live calibration file with the archived snapshot.
    ensure_dir(os.path.dirname(CALIBRATION_PATH))
    shutil.copy2(source_path, CALIBRATION_PATH)

    if detector is None:
        logger.info("Lazy loading detector for rollback...")
        if not init_detector():
            return jsonify({
                "status": "error",
                "message": "Failed to load AI detection models"
            }), 503

    if not detector.reload_calibration(CALIBRATION_PATH):
        return jsonify({
            "status": "error",
            "message": "Failed to load calibration after rollback"
        }), 500

    return jsonify({
        "status": "success",
        "versionId": version_id,
        "calibrationPath": CALIBRATION_PATH
    }), 200
771
+
772
+
773
+ # ==========================================================
774
+ # REALTIME STREAMING ENDPOINT (WEBSOCKET)
775
+ # ==========================================================
776
@sock.route('/ws/voice-stream')
def voice_stream(ws):
    """Websocket endpoint for incremental voice detection.

    Protocol (all messages are JSON text frames):
      client -> server:
        {"type": "start", "audioFormat", "sampleRate", "channels",
         "language", "enablePartial", "partialIntervalSec",
         "maxSeconds", "partialMode"}
        {"type": "audio_chunk", "audioChunkBase64", "final"?}
        {"type": "stop"}   # equivalent to a final empty chunk
        {"type": "ping"}
      server -> client:
        "ack", "progress", "partial_result", "final_result",
        "pong", "error"
    """
    if not STREAMING_ENABLED:
        ws.send(json.dumps({
            "type": "error",
            "message": "Streaming is disabled"
        }))
        return

    # Websocket auth: key comes from the WSGI environ (query/header),
    # not from the Flask request decorator used by the HTTP routes.
    api_key = get_ws_api_key(ws.environ)
    if api_key != API_KEY:
        ws.send(json.dumps({
            "type": "error",
            "message": "Invalid API key"
        }))
        return

    session = None            # StreamSession, created on "start"
    requested_language = None # echoed back in result payloads

    while True:
        message = ws.receive()
        # receive() returns None when the client disconnects.
        if message is None:
            break

        try:
            payload = json.loads(message)
        except Exception:
            ws.send(json.dumps({
                "type": "error",
                "message": "Invalid JSON message"
            }))
            continue

        msg_type = payload.get("type")

        if msg_type == "start":
            # Only one session per connection.
            if session is not None:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Stream already started"
                }))
                continue

            audio_format = str(payload.get("audioFormat", "pcm16")).lower()
            # Accept common aliases for raw signed 16-bit little-endian PCM.
            if audio_format in ["pcm_s16le", "s16le", "pcm16le"]:
                audio_format = "pcm16"
            if audio_format not in STREAMING_SUPPORTED_FORMATS:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Unsupported audioFormat for streaming"
                }))
                continue

            sample_rate = int(payload.get("sampleRate", 16000))
            channels = int(payload.get("channels", 1))
            # pcm16 duration accounting needs sane rate/channel values.
            if audio_format == "pcm16":
                if sample_rate <= 0 or channels <= 0:
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "sampleRate and channels must be positive for pcm16"
                    }))
                    continue
                if channels not in [1, 2]:
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "channels must be 1 or 2 for pcm16"
                    }))
                    continue
            requested_language = payload.get("language")
            enable_partial = parse_bool(payload.get("enablePartial", True))
            partial_interval = float(payload.get("partialIntervalSec", STREAMING_PARTIAL_INTERVAL_SECONDS))
            max_seconds = int(payload.get("maxSeconds", STREAMING_MAX_BUFFER_SECONDS))
            partial_mode = str(payload.get("partialMode", STREAMING_PARTIAL_MODE)).lower()
            # Unknown modes fall back to the cheap physics-only analysis.
            if partial_mode not in ["full", "physics", "dl"]:
                partial_mode = "physics"

            session = StreamSession(
                audio_format=audio_format,
                sample_rate=sample_rate,
                channels=channels,
                max_seconds=max_seconds,
                enable_partial=enable_partial,
                partial_interval_seconds=partial_interval,
                partial_mode=partial_mode
            )

            # Echo the effective settings so the client knows what was applied.
            ws.send(json.dumps({
                "type": "ack",
                "status": "ready",
                "sessionId": session.session_id,
                "streaming": {
                    "audioFormat": audio_format,
                    "sampleRate": sample_rate,
                    "channels": channels,
                    "maxSeconds": max_seconds,
                    "partialIntervalSec": partial_interval,
                    "partialMode": partial_mode,
                    "enablePartial": enable_partial
                }
            }))
            continue

        if msg_type == "ping":
            ws.send(json.dumps({"type": "pong"}))
            continue

        if msg_type not in ["audio_chunk", "stop"]:
            ws.send(json.dumps({
                "type": "error",
                "message": "Unsupported message type"
            }))
            continue

        # Everything past this point requires an active session.
        if session is None:
            ws.send(json.dumps({
                "type": "error",
                "message": "Stream not started"
            }))
            continue

        # "stop" is handled as an implicit final chunk with no payload.
        finalize_only = False
        if msg_type == "stop":
            payload["final"] = True
            finalize_only = True

        chunk_b64 = payload.get("audioChunkBase64")
        chunk_bytes = None
        if not chunk_b64:
            if not finalize_only:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Missing audioChunkBase64"
                }))
                continue
        else:
            try:
                chunk_bytes = base64.b64decode(chunk_b64)
            except Exception:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Invalid base64 audio chunk"
                }))
                continue

            if len(chunk_bytes) > STREAMING_MAX_CHUNK_BYTES:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "Audio chunk exceeds maximum size"
                }))
                continue

            buffer_seconds = session.add_chunk(chunk_bytes)
            ws.send(json.dumps({
                "type": "progress",
                "receivedBytes": session.total_bytes_received,
                "bufferBytes": len(session.buffer),
                "bufferSeconds": buffer_seconds
            }))

            # Interim analysis on the rolling buffer, throttled by the
            # session's partial interval.
            if session.should_run_partial():
                if detector is None:
                    logger.info("Lazy loading detector for streaming...")
                    if not init_detector():
                        ws.send(json.dumps({
                            "type": "error",
                            "message": "Failed to load AI detection models"
                        }))
                        break

                temp_path = None
                try:
                    temp_path, file_format = session.write_temp_audio_file()
                    result = detector.analyze(
                        temp_path,
                        input_type="file",
                        audio_format=file_format,
                        analysis_mode=session.partial_mode
                    )
                    ws.send(json.dumps({
                        "type": "partial_result",
                        "result": format_detection_payload(result, requested_language=requested_language)
                    }))
                finally:
                    # Best-effort temp cleanup; never mask the real error.
                    if temp_path and os.path.exists(temp_path):
                        try:
                            os.unlink(temp_path)
                        except Exception:
                            pass

        # Final full analysis: triggered by "stop" or an audio_chunk
        # carrying "final": true. Always terminates the loop.
        if parse_bool(payload.get("final", False)):
            if not session.buffer:
                ws.send(json.dumps({
                    "type": "error",
                    "message": "No audio received"
                }))
                break

            if detector is None:
                logger.info("Lazy loading detector for streaming...")
                if not init_detector():
                    ws.send(json.dumps({
                        "type": "error",
                        "message": "Failed to load AI detection models"
                    }))
                    break

            temp_path = None
            try:
                temp_path, file_format = session.write_temp_audio_file()
                result = detector.analyze(
                    temp_path,
                    input_type="file",
                    audio_format=file_format,
                    analysis_mode="full"
                )
                ws.send(json.dumps({
                    "type": "final_result",
                    "result": format_detection_payload(result, requested_language=requested_language)
                }))
            finally:
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.unlink(temp_path)
                    except Exception:
                        pass
            break
1003
+
1004
+
1005
+ # ==========================================================
1006
+ # ERROR HANDLERS
1007
+ # ==========================================================
1008
@app.errorhandler(404)
def not_found(error):
    """Handle 404 errors"""
    # JSON body instead of Flask's default HTML error page.
    body = {"status": "error", "message": "Endpoint not found"}
    return jsonify(body), 404
1015
+
1016
+
1017
@app.errorhandler(405)
def method_not_allowed(error):
    """Handle 405 errors"""
    body = {"status": "error", "message": "Method not allowed for this endpoint"}
    return jsonify(body), 405
1024
+
1025
+
1026
@app.errorhandler(500)
def internal_error(error):
    """Handle 500 errors"""
    # Log the original exception; the client gets only a generic message.
    logger.error(f"Internal server error: {str(error)}")
    body = {"status": "error", "message": "Internal server error"}
    return jsonify(body), 500
1034
+
1035
+
1036
+ # ==========================================================
1037
+ # RUN APPLICATION
1038
+ # ==========================================================
1039
if __name__ == '__main__':
    # HuggingFace Spaces serves on port 7860 by default.
    port = int(os.environ.get('PORT', 7860))

    logger.info(f"🚀 Starting Voice Detection API on port {port}")
    logger.info(f"📍 Endpoint: http://0.0.0.0:{port}/api/voice-detection")
    # SECURITY: never write the API key itself to the logs — anyone with
    # log access could authenticate. Only record that one is configured.
    logger.info(f"🔑 API key configured: {bool(API_KEY)}")
    logger.info("🌐 Platform: HuggingFace Spaces")

    app.run(
        host='0.0.0.0',
        port=port,
        debug=False  # Always False in production
    )
client.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example Client Script for Voice Detection API
3
+ Demonstrates how to use the API from Python
4
+ """
5
+
6
+ import requests
7
+ import base64
8
+ import json
9
+ import argparse
10
+ from pathlib import Path
11
+
12
class VoiceDetectionClient:
    """Client for interacting with Voice Detection API"""

    def __init__(self, api_url, api_key):
        """
        Initialize the client

        Args:
            api_url: Base URL of the API (e.g., http://localhost:5000)
            api_key: API authentication key
        """
        # Normalize the base URL so endpoint paths can be appended safely.
        self.api_url = api_url.rstrip('/')
        self.api_key = api_key
        # The server authenticates via the 'x-api-key' header.
        self.headers = {
            'Content-Type': 'application/json',
            'x-api-key': self.api_key
        }

    def check_health(self):
        """Check if the API is healthy.

        Returns the server's /health JSON, or an error dict if the
        request itself fails (connection refused, timeout, ...).
        """
        try:
            response = requests.get(f"{self.api_url}/health", timeout=5)
            return response.json()
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def detect_voice(self, audio_path, language="English"):
        """
        Detect if voice is AI-generated or human

        Args:
            audio_path: Path to MP3 audio file
            language: Language of the audio (Tamil/English/Hindi/Malayalam/Telugu)

        Returns:
            dict: API response (or a locally constructed error dict when
            validation or the HTTP request fails)
        """
        # Validate file exists
        if not Path(audio_path).exists():
            return {"status": "error", "message": f"File not found: {audio_path}"}

        # Validate language
        supported_languages = ['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu']
        if language not in supported_languages:
            return {
                "status": "error",
                "message": f"Unsupported language. Use: {', '.join(supported_languages)}"
            }

        # Read and encode audio (the API expects base64 text, not bytes).
        try:
            with open(audio_path, 'rb') as f:
                audio_data = f.read()
                audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        except Exception as e:
            return {"status": "error", "message": f"Failed to read audio file: {str(e)}"}

        # Prepare request
        payload = {
            "language": language,
            "audioFormat": "mp3",
            "audioBase64": audio_base64
        }

        # Send request
        try:
            response = requests.post(
                f"{self.api_url}/api/voice-detection",
                headers=self.headers,
                json=payload,
                timeout=120  # 2 minutes timeout
            )

            return response.json()

        except requests.exceptions.Timeout:
            return {"status": "error", "message": "Request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "message": "Could not connect to API"}
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def print_result(self, result):
        """Pretty print the result"""
        print("\n" + "="*70)
        print("🎙️ VOICE DETECTION RESULT")
        print("="*70)

        if result.get('status') == 'success':
            print(f"✅ Status: {result['status'].upper()}")
            print(f"🌐 Language: {result['language']}")
            print(f"🎯 Classification: {result['classification']}")
            print(f"📊 Confidence Score: {result['confidenceScore']:.2f} / 1.00")
            print(f"💡 Explanation: {result['explanation']}")

            # Interpretation — thresholds here mirror how the score is
            # read: high score leans AI, low score leans human.
            print("\n" + "-"*70)
            if result['classification'] == 'AI_GENERATED':
                print("⚠️ This voice appears to be AI-generated or synthetic")
                if result['confidenceScore'] > 0.8:
                    print("   High confidence - Strong indicators of AI generation")
                elif result['confidenceScore'] > 0.65:
                    print("   Medium confidence - Multiple suspicious patterns detected")
                else:
                    print("   Low confidence - Some indicators present but not conclusive")
            else:
                print("✅ This voice appears to be human/real")
                if result['confidenceScore'] < 0.35:
                    print("   High confidence - Strong human characteristics")
                elif result['confidenceScore'] < 0.5:
                    print("   Medium confidence - Mostly human patterns")
                else:
                    print("   Low confidence - Close to threshold")
        else:
            print(f"❌ Status: ERROR")
            print(f"💬 Message: {result.get('message', 'Unknown error')}")

        print("="*70 + "\n")
130
+
131
+
132
def main():
    """Main function with CLI interface.

    Supports a health check (--health) and one or more --audio files,
    all analyzed with the same --language.
    """
    parser = argparse.ArgumentParser(
        description='Voice Detection API Client',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check API health
  python client.py --health

  # Detect single audio file
  python client.py --audio test_audio.mp3 --language English

  # Process multiple files
  python client.py --audio file1.mp3 --audio file2.mp3 --language Tamil

  # Use custom API URL and key
  python client.py --audio test.mp3 --url http://api.example.com --key your_api_key
        """
    )

    parser.add_argument(
        '--url',
        default='http://localhost:5000',
        help='API base URL (default: http://localhost:5000)'
    )

    parser.add_argument(
        '--key',
        default='sk_test_123456789',
        help='API key (default: sk_test_123456789)'
    )

    parser.add_argument(
        '--health',
        action='store_true',
        help='Check API health'
    )

    # action='append' lets --audio be repeated to queue several files.
    parser.add_argument(
        '--audio',
        action='append',
        help='Path to MP3 audio file (can be used multiple times)'
    )

    parser.add_argument(
        '--language',
        default='English',
        choices=['Tamil', 'English', 'Hindi', 'Malayalam', 'Telugu'],
        help='Language of the audio (default: English)'
    )

    args = parser.parse_args()

    # Initialize client
    client = VoiceDetectionClient(args.url, args.key)

    # Health check takes precedence over audio processing.
    if args.health:
        print("🏥 Checking API health...")
        health = client.check_health()
        print(json.dumps(health, indent=2))
        return

    # Process audio files
    if args.audio:
        for audio_file in args.audio:
            print(f"\n🎵 Processing: {audio_file}")
            print(f"   Language: {args.language}")

            result = client.detect_voice(audio_file, args.language)
            client.print_result(result)
    else:
        # No action requested: show usage.
        parser.print_help()
206
+
207
+
208
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
detector.py ADDED
@@ -0,0 +1,875 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import librosa
3
+ import numpy as np
4
+ import scipy.stats as stats
5
+ import torch.nn.functional as F
6
+ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
7
+ import base64
8
+ import io
9
+ import json
10
+ import math
11
+ import tempfile
12
+ import os
13
+ import soundfile as sf
14
+ import warnings
15
+
16
+ # Suppress librosa warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
class ScoreCalibrator:
    """
    Lightweight calibration model to adapt the final score using
    physics and deep learning scores.
    """

    def __init__(self, calibration_path=None):
        self.calibration_path = calibration_path
        self.ready = False       # becomes True only after a valid load()
        self.weights = None      # [physics_weight, dl_weight]
        self.bias = 0.0
        self.threshold = 0.5
        self.metadata = {}       # full parsed calibration payload

        if calibration_path:
            self.load(calibration_path)

    def load(self, path=None):
        """Load calibration parameters from a JSON file.

        Falls back to the path supplied at construction time when
        *path* is None. Returns True on success; any failure (missing
        file, unparseable JSON, malformed weights) leaves the
        calibrator not-ready and returns False.
        """
        target = path or self.calibration_path
        if not target or not os.path.exists(target):
            self.ready = False
            return False

        try:
            with open(target, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except Exception:
            self.ready = False
            return False

        # Exactly two weights are expected: physics and deep-learning.
        raw_weights = payload.get("weights")
        if not isinstance(raw_weights, list) or len(raw_weights) != 2:
            self.ready = False
            return False

        self.weights = [float(raw_weights[0]), float(raw_weights[1])]
        self.bias = float(payload.get("bias", 0.0))
        self.threshold = float(payload.get("threshold", 0.5))
        self.metadata = payload
        self.calibration_path = target
        self.ready = True
        return True

    def predict(self, physics_score, dl_score):
        """Calibrated probability for the two component scores.

        Returns None when no calibration is loaded. Otherwise applies
        a logistic model sigmoid(w0*physics + w1*dl + bias), branching
        on the sign of the logit so math.exp never overflows.
        """
        if not self.ready or self.weights is None:
            return None

        z = self.weights[0] * physics_score + self.weights[1] * dl_score + self.bias
        if z >= 0:
            prob = 1.0 / (1.0 + math.exp(-z))
        else:
            e = math.exp(z)
            prob = e / (1.0 + e)
        return float(prob)
74
+
75
class HybridEnsembleDetector:
    """
    Hybrid AI Voice Detection System with Language Detection

    Features:
    1. Physics-based acoustic analysis
    2. Deep Learning deepfake detection
    3. Language identification using Whisper (focus on Indian languages)
    4. Auto-truncation to 30 seconds for faster processing
    """

    def __init__(
        self,
        deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=False,
        use_local_whisper_model=False,
        calibration_path=None,
        max_audio_duration=30  # seconds
    ):
        """
        Initialize the hybrid detector

        Args:
            deepfake_model_path: Path to deepfake detection model
            whisper_model_path: Path to Whisper model for language detection
            physics_weight: Weight for physics score (0-1)
            dl_weight: Weight for DL score (0-1)
            use_local_deepfake_model: Whether to load deepfake model from local path
            use_local_whisper_model: Whether to load Whisper from local path
            calibration_path: Optional path to calibration JSON file
            max_audio_duration: Maximum audio duration to process (seconds)

        Notes:
            Model loading degrades gracefully: a failed deepfake-model load
            switches the detector to physics-only scoring, and a failed
            Whisper load disables language detection, rather than raising.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_duration = max_audio_duration

        # Normalize weights so they always sum to 1 regardless of input scale.
        total_weight = physics_weight + dl_weight
        self.physics_weight = physics_weight / total_weight
        self.dl_weight = dl_weight / total_weight

        # Optional learned calibration layer; when not ready, analyze()
        # falls back to the fixed weighted average of the two scores.
        self.calibrator = ScoreCalibrator(calibration_path)
        if self.calibrator.ready:
            print(f"   Calibration loaded from: {self.calibrator.calibration_path}")

        print(f"🔧 Initializing Hybrid Detector with Language Detection")
        print(f"   Device: {self.device}")
        print(f"   Physics Weight: {self.physics_weight*100:.0f}%")
        print(f"   DL Weight: {self.dl_weight*100:.0f}%")
        print(f"   Max Audio Duration: {self.max_duration}s")

        # --- LOAD DEEPFAKE DETECTION MODEL ---
        try:
            print(f"📥 Loading deepfake detection model from '{deepfake_model_path}'...")

            if use_local_deepfake_model:
                # local_files_only avoids network access for offline deploys.
                self.dl_model = AutoModelForAudioClassification.from_pretrained(
                    deepfake_model_path,
                    local_files_only=True
                )
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    deepfake_model_path,
                    local_files_only=True
                )
            else:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(deepfake_model_path)
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(deepfake_model_path)

            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("✅ Deepfake Detection Model Loaded")

        except Exception as e:
            # Graceful degradation: physics engine still works on its own,
            # so shift the full ensemble weight onto it.
            print(f"⚠️ DL Model Load Failed: {e}")
            print("   Running in Physics-Only mode")
            self.dl_ready = False
            self.dl_weight = 0
            self.physics_weight = 1.0

        # --- LOAD WHISPER FOR LANGUAGE DETECTION ---
        try:
            print(f"📥 Loading Whisper model for language detection from '{whisper_model_path}'...")

            if use_local_whisper_model:
                self.whisper_processor = WhisperProcessor.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
            else:
                self.whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)

            self.whisper_model.to(self.device)
            self.whisper_model.eval()
            self.lang_ready = True
            print("✅ Whisper Language Detection Model Loaded")

            # Language code mapping for Indian languages and common languages
            # (ISO 639-1 code -> display name used in API responses).
            # detect_language() only accepts codes present in this map.
            self.language_map = {
                'hi': 'Hindi',
                'bn': 'Bengali',
                'te': 'Telugu',
                'mr': 'Marathi',
                'ta': 'Tamil',
                'gu': 'Gujarati',
                'kn': 'Kannada',
                'ml': 'Malayalam',
                'or': 'Odia',
                'pa': 'Punjabi',
                'as': 'Assamese',
                'ur': 'Urdu',
                'en': 'English',
                'ne': 'Nepali',
                'si': 'Sinhala',
                'sa': 'Sanskrit',
                'sd': 'Sindhi',
                'ks': 'Kashmiri'
            }

        except Exception as e:
            print(f"⚠️ Whisper Model Load Failed: {e}")
            print("   Running without language detection")
            self.lang_ready = False

        # --- PHYSICS ENGINE PARAMETERS ---
        # Pitch coefficient-of-variation bounds: values at/below
        # CV_AI_THRESHOLD score as fully AI-like, at/above
        # CV_HUMAN_THRESHOLD as fully human (see get_linear_score).
        self.CV_AI_THRESHOLD = 0.20
        self.CV_HUMAN_THRESHOLD = 0.32
        # Expected std-dev range of RMS energy for natural speech.
        self.INTENSITY_MIN_STD = 0.05
        self.INTENSITY_MAX_STD = 0.15

        print("✅ Hybrid Detector Ready\n")
213
+
214
+ def reload_calibration(self, calibration_path=None):
215
+ """
216
+ Reload calibration weights from disk.
217
+
218
+ Args:
219
+ calibration_path: Optional override path
220
+
221
+ Returns:
222
+ bool: True if calibration loaded
223
+ """
224
+ if self.calibrator is None:
225
+ self.calibrator = ScoreCalibrator(calibration_path)
226
+ return self.calibrator.ready
227
+ return self.calibrator.load(calibration_path)
228
+
229
+ # ==========================================================
230
+ # HELPER: Audio Preprocessing
231
+ # ==========================================================
232
+ def preprocess_audio(self, audio_path, target_sr=16000):
233
+ """
234
+ Load and preprocess audio:
235
+ 1. Load audio
236
+ 2. Convert to mono
237
+ 3. Truncate to max_duration if needed
238
+ 4. Resample to target_sr
239
+
240
+ Args:
241
+ audio_path: Path to audio file
242
+ target_sr: Target sample rate
243
+
244
+ Returns:
245
+ tuple: (waveform_array, sample_rate, duration, was_truncated)
246
+ """
247
+ try:
248
+ # Load audio
249
+ y, sr = librosa.load(audio_path, sr=None, mono=True)
250
+
251
+ # Calculate duration
252
+ duration = len(y) / sr
253
+ was_truncated = False
254
+
255
+ # Truncate if longer than max_duration
256
+ if duration > self.max_duration:
257
+ print(f" ⚠️ Audio is {duration:.1f}s, truncating to {self.max_duration}s")
258
+ max_samples = int(self.max_duration * sr)
259
+ y = y[:max_samples]
260
+ duration = self.max_duration
261
+ was_truncated = True
262
+
263
+ # Resample if needed
264
+ if sr != target_sr:
265
+ y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
266
+ sr = target_sr
267
+
268
+ return y, sr, duration, was_truncated
269
+
270
+ except Exception as e:
271
+ raise ValueError(f"Failed to preprocess audio: {str(e)}")
272
+
273
+ # ==========================================================
274
+ # HELPER: Base64 Decoding
275
+ # ==========================================================
276
+ def decode_base64_audio(self, base64_string, audio_format="mp3"):
277
+ """
278
+ Decode base64 audio and save to temporary file
279
+
280
+ Args:
281
+ base64_string: Base64 encoded audio data
282
+
283
+ Returns:
284
+ str: Path to temporary audio file
285
+ """
286
+ try:
287
+ detected_format = audio_format
288
+ if isinstance(base64_string, str) and base64_string.startswith("data:"):
289
+ header, base64_string = base64_string.split(",", 1)
290
+ header_lower = header.lower()
291
+ if "audio/wav" in header_lower or "audio/x-wav" in header_lower:
292
+ detected_format = "wav"
293
+ elif "audio/mpeg" in header_lower or "audio/mp3" in header_lower:
294
+ detected_format = "mp3"
295
+
296
+ # Decode base64
297
+ audio_data = base64.b64decode(base64_string)
298
+
299
+ file_suffix = ".wav" if str(detected_format).lower() in ["wav", "wave"] else ".mp3"
300
+
301
+ # Create temporary file
302
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix)
303
+ temp_file.write(audio_data)
304
+ temp_file.close()
305
+
306
+ return temp_file.name
307
+
308
+ except Exception as e:
309
+ raise ValueError(f"Failed to decode base64 audio: {str(e)}")
310
+
311
    # ==========================================================
    # LANGUAGE DETECTION
    # ==========================================================
    def detect_language(self, audio_path):
        """
        Detect language using Whisper model

        Args:
            audio_path: Path to audio file

        Returns:
            str: Detected language name; "Unknown" when Whisper is not
            loaded or detection fails, "English" as a fallback when a
            transcription was produced without a recognised language token.
        """
        if not self.lang_ready:
            return "Unknown"

        try:
            # Load and preprocess audio for Whisper (uses 16kHz)
            # Use first 30 seconds for language detection
            audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)

            # Process audio with Whisper processor
            input_features = self.whisper_processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt"
            ).input_features

            input_features = input_features.to(self.device)

            # Whisper language detection: no language is forced, so the
            # model emits its own language token during generation.
            with torch.no_grad():
                # Generate with language detection enabled
                generated_ids = self.whisper_model.generate(
                    input_features,
                    task="transcribe",
                    return_dict_in_generate=True
                )

            # Decode the output, keeping special tokens — the language
            # marker lives in them.
            full_output = self.whisper_processor.batch_decode(
                generated_ids.sequences,
                skip_special_tokens=False
            )[0]

            # Parse language from special tokens
            # Format: <|startoftranscript|><|en|><|transcribe|>...
            # NOTE(review): token layout assumed from Whisper's tokenizer
            # conventions — confirm against the installed transformers version.
            detected_lang = None

            # Look for language tokens in the format <|xx|>
            import re
            lang_pattern = r'<\|([a-z]{2})\|>'
            matches = re.findall(lang_pattern, full_output)

            if matches:
                # First match after startoftranscript is usually the language;
                # only codes present in language_map are accepted.
                for match in matches:
                    if match in self.language_map:
                        detected_lang = match
                        break

            if detected_lang:
                lang_name = self.language_map.get(detected_lang, detected_lang.upper())
                print(f"   🌐 Detected Language: {lang_name} ({detected_lang})")
                return lang_name
            else:
                # Fallback: if transcription successful, assume English
                transcription = self.whisper_processor.batch_decode(
                    generated_ids.sequences,
                    skip_special_tokens=True
                )[0]

                if len(transcription.strip()) > 0:
                    print(f"   🌐 Detected Language: English (default)")
                    return "English"
                else:
                    return "Unknown"

        except Exception as e:
            # Language detection is best-effort; never fail the analysis.
            print(f"   ⚠️ Language detection error: {str(e)}")
            return "Unknown"
392
+
393
+ def extract_scores(self, audio_input, input_type="file", audio_format="mp3"):
394
+ """
395
+ Extract physics and deep learning scores without language detection.
396
+
397
+ Args:
398
+ audio_input: Either file path or base64 string
399
+ input_type: "file" or "base64"
400
+ audio_format: "mp3" or "wav" when using base64
401
+
402
+ Returns:
403
+ dict: Score details
404
+ """
405
+ temp_file = None
406
+ try:
407
+ if input_type == "base64":
408
+ temp_file = self.decode_base64_audio(audio_input, audio_format=audio_format)
409
+ audio_path = temp_file
410
+ elif input_type == "file":
411
+ audio_path = audio_input
412
+ if not os.path.exists(audio_path):
413
+ return {
414
+ "status": "error",
415
+ "error": f"Audio file not found: {audio_path}"
416
+ }
417
+ else:
418
+ return {
419
+ "status": "error",
420
+ "error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
421
+ }
422
+
423
+ phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
424
+ dl_score, dl_label = self.get_dl_score(audio_path)
425
+
426
+ return {
427
+ "status": "success",
428
+ "physics_score": float(phys_score),
429
+ "dl_score": float(dl_score),
430
+ "dl_label": dl_label,
431
+ "physics_method": phys_method,
432
+ "audio_duration": float(phys_feats.get("duration", 0)),
433
+ "was_truncated": bool(phys_feats.get("was_truncated", False))
434
+ }
435
+ except Exception as e:
436
+ return {
437
+ "status": "error",
438
+ "error": str(e)
439
+ }
440
+ finally:
441
+ if temp_file and os.path.exists(temp_file):
442
+ try:
443
+ os.unlink(temp_file)
444
+ except Exception:
445
+ pass
446
+
447
+ # ==========================================================
448
+ # PART A: PHYSICS ENGINE (FIXED)
449
+ # ==========================================================
450
+ def get_linear_score(self, val, min_val, max_val):
451
+ """Linear interpolation for scoring"""
452
+ if val <= min_val:
453
+ return 1.0
454
+ if val >= max_val:
455
+ return 0.0
456
+ return 1.0 - ((val - min_val) / (max_val - min_val))
457
+
458
    def get_physics_score(self, audio_path):
        """
        Analyze audio using physics-based acoustic features

        Combines pitch variability (PYIN), RMS-energy variability and
        spectral-centroid skewness into an AI-likeness score; when too few
        voiced frames are found it falls back to a pitch-free feature set.

        Returns:
            tuple: (ai_score, method, features_dict)
            ai_score is in [0, 1]; higher means more AI-like.
        """
        try:
            # Load audio at NATIVE sample rate (don't resample for physics analysis)
            y, sr = librosa.load(audio_path, sr=None, mono=True)

            # Calculate original duration
            duration = len(y) / sr
            was_truncated = False

            # Truncate if needed
            if duration > self.max_duration:
                max_samples = int(self.max_duration * sr)
                y = y[:max_samples]
                duration = self.max_duration
                was_truncated = True

            print(f"   🔬 Running physics analysis on {duration:.1f}s audio at {sr}Hz")

            # Robust pitch tracking using PYIN
            try:
                f0, voiced_flag, voiced_probs = librosa.pyin(
                    y,
                    fmin=librosa.note_to_hz('C2'),  # ~65 Hz
                    fmax=librosa.note_to_hz('C7'),  # ~2093 Hz
                    sr=sr,
                    frame_length=2048
                )
                # PYIN marks unvoiced frames as NaN; keep voiced frames only.
                valid_f0 = f0[~np.isnan(f0)]
            except Exception as pitch_error:
                print(f"   ⚠️ Pitch detection failed: {pitch_error}, using fallback method")
                # Fallback: use simpler pitch detection
                valid_f0 = np.array([])

            if len(valid_f0) < 10:  # Need at least 10 valid pitch points
                print(f"   ⚠️ Insufficient pitch data ({len(valid_f0)} points), using alternative features")
                # Fall back to non-pitch features
                rms = librosa.feature.rms(y=y)[0]
                centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
                zcr = librosa.feature.zero_crossing_rate(y)[0]

                feats = {
                    'pitch_cv': 0.25,  # Neutral value
                    'intensity_std': np.std(rms),
                    'freq_skew': stats.skew(centroid),
                    'zcr_std': np.std(zcr),
                    'mean_pitch': 0,
                    'std_pitch': 0,
                    'duration': duration,
                    'was_truncated': was_truncated
                }

                # Score based on available features
                intensity_score = self.get_linear_score(
                    feats['intensity_std'],
                    self.INTENSITY_MIN_STD,
                    self.INTENSITY_MAX_STD
                )

                zcr_score = self.get_linear_score(
                    feats['zcr_std'],
                    0.01,
                    0.08
                )

                skew_score = self.get_linear_score(
                    abs(feats['freq_skew']),
                    0.1,
                    1.0
                )

                # Weighted combination (no pitch)
                final_score = (intensity_score * 0.5 + zcr_score * 0.2 + skew_score * 0.3)

                print(f"   🔬 Physics score (no pitch): {final_score:.3f}")
                return round(final_score, 3), "Physics Analysis (Limited)", feats

            # Full analysis with pitch
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)

            # Calculate feature metrics
            feats = {
                # Coefficient of variation of pitch: low CV = monotone,
                # AI-like delivery.
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mean_pitch': mean_pitch,
                'std_pitch': std_pitch,
                'duration': duration,
                'was_truncated': was_truncated
            }

            # Individual feature scores (higher = more AI-like)
            intensity_score = self.get_linear_score(
                feats['intensity_std'],
                self.INTENSITY_MIN_STD,
                self.INTENSITY_MAX_STD
            )

            pitch_score = self.get_linear_score(
                feats['pitch_cv'],
                self.CV_AI_THRESHOLD,
                self.CV_HUMAN_THRESHOLD
            )

            skew_score = self.get_linear_score(
                abs(feats['freq_skew']),
                0.1,
                1.0
            )

            # Weighted combination
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20

            base_score = (
                intensity_score * W_INTENSITY +
                pitch_score * W_PITCH +
                skew_score * W_SKEW
            )

            # Synergy bonus: if both intensity and pitch are suspicious
            if intensity_score > 0.4 and pitch_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            print(f"   🔬 Physics score: {final_score:.3f} (intensity:{intensity_score:.2f}, pitch:{pitch_score:.2f})")
            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            # Physics analysis is one leg of the ensemble: report a neutral
            # 0.0 score instead of failing the whole request.
            print(f"   ❌ Physics analysis failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return 0.0, f"Physics Error: {str(e)}", {'duration': 0, 'was_truncated': False}
602
+
603
    # ==========================================================
    # PART B: DEEP LEARNING ENGINE
    # ==========================================================
    def get_dl_score(self, audio_path):
        """
        Analyze audio using deep learning model

        Returns:
            tuple: (ai_score, label) — ai_score in [0, 1], higher means
            more AI-like; label is "Fake/Deepfake" or "Real/Human", or an
            error description when the model is unavailable/fails.
        """
        if not self.dl_ready:
            return 0.0, "Model not loaded"

        try:
            # Load and preprocess audio (mono, 16 kHz, truncated)
            waveform_np, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=16000)

            # Process with feature extractor
            inputs = self.feature_extractor(
                waveform_np,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )

            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.dl_model(**inputs)
                logits = outputs.logits
                probs = F.softmax(logits, dim=-1)

            # Get predictions
            # Class 0: Real, Class 1: Fake
            # NOTE(review): this label order is assumed for the bundled
            # checkpoint — confirm via self.dl_model.config.id2label.
            prob_real = probs[0][0].item()
            prob_fake = probs[0][1].item()

            # AI score is the fake probability
            ai_score = prob_fake

            label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"

            return round(ai_score, 3), label

        except Exception as e:
            # Like the physics leg, fail soft with a neutral score.
            print(f"   ❌ DL analysis failed: {str(e)}")
            return 0.0, f"DL Error: {str(e)}"
652
+
653
+ # ==========================================================
654
+ # PART C: EXPLANATION GENERATOR
655
+ # ==========================================================
656
+ def generate_explanation(self, final_score, phys_score, dl_score, dl_label, phys_feats, ai_threshold=0.55):
657
+ """
658
+ Generate human-readable explanation for the classification
659
+
660
+ Returns:
661
+ str: Explanation text
662
+ """
663
+ explanations = []
664
+
665
+ if final_score > ai_threshold:
666
+ # AI GENERATED
667
+
668
+ # Deep Learning contributions
669
+ if dl_score > 0.55 and self.dl_ready:
670
+ if "Fake" in dl_label or "Deepfake" in dl_label:
671
+ explanations.append(
672
+ f"Deep learning model detected synthetic voice patterns "
673
+ f"(confidence: {dl_score*100:.1f}%)"
674
+ )
675
+
676
+ # Physics contributions
677
+ if phys_score > 0.55:
678
+ p_cv = phys_feats.get('pitch_cv', 0)
679
+ i_std = phys_feats.get('intensity_std', 0)
680
+
681
+ if i_std < 0.06:
682
+ explanations.append(
683
+ f"Unnaturally consistent energy levels detected "
684
+ f"(std: {i_std:.3f}, expected: >0.06)"
685
+ )
686
+
687
+ if p_cv < 0.22 and p_cv > 0:
688
+ explanations.append(
689
+ f"Robotic pitch modulation patterns "
690
+ f"(CV: {p_cv:.2f}, expected: >0.22)"
691
+ )
692
+
693
+ if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
694
+ explanations.append(
695
+ "Acoustic parameters lack natural human variability"
696
+ )
697
+
698
+ if not explanations:
699
+ explanations.append(
700
+ "Voice exhibits characteristics consistent with AI generation"
701
+ )
702
+
703
+ else:
704
+ # HUMAN
705
+ explanations.append(
706
+ "Voice exhibits natural acoustic variability and human speech characteristics"
707
+ )
708
+
709
+ return "; ".join(explanations)
710
+
711
+ # ==========================================================
712
+ # PART D: MAIN ANALYSIS FUNCTION
713
+ # ==========================================================
714
+ def analyze(self, audio_input, input_type="file", audio_format="mp3", analysis_mode="full"):
715
+ """
716
+ Main analysis function with configurable input types
717
+
718
+ Args:
719
+ audio_input: Either file path or base64 string
720
+ input_type: "file" or "base64"
721
+ audio_format: "mp3" or "wav" when using base64 input
722
+ analysis_mode: "full", "physics", or "dl"
723
+
724
+ Returns:
725
+ dict: Analysis results following API response format
726
+ """
727
+ temp_file = None
728
+
729
+ try:
730
+ analysis_mode = (analysis_mode or "full")
731
+ analysis_mode = str(analysis_mode).lower().strip()
732
+ if analysis_mode not in ["full", "physics", "dl"]:
733
+ return {
734
+ "status": "error",
735
+ "error": f"Invalid analysis_mode: {analysis_mode}. Use 'full', 'physics', or 'dl'"
736
+ }
737
+
738
+ # Handle input type
739
+ if input_type == "base64":
740
+ temp_file = self.decode_base64_audio(audio_input, audio_format=audio_format)
741
+ audio_path = temp_file
742
+ elif input_type == "file":
743
+ audio_path = audio_input
744
+ if not os.path.exists(audio_path):
745
+ return {
746
+ "status": "error",
747
+ "error": f"Audio file not found: {audio_path}"
748
+ }
749
+ else:
750
+ return {
751
+ "status": "error",
752
+ "error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
753
+ }
754
+
755
+ print(f"🎵 Analyzing: {os.path.basename(audio_path)}")
756
+
757
+ # 1. Detect Language
758
+ detected_language = "Unknown"
759
+ if analysis_mode == "full":
760
+ detected_language = self.detect_language(audio_path)
761
+
762
+ # 2. Run Physics Analysis
763
+ phys_score = 0.0
764
+ phys_method = "Physics Skipped"
765
+ phys_feats = {'duration': 0, 'was_truncated': False}
766
+ if analysis_mode in ["full", "physics"]:
767
+ phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
768
+
769
+ # 3. Run Deep Learning Analysis
770
+ dl_score = 0.0
771
+ dl_label = "DL Skipped"
772
+ if analysis_mode in ["full", "dl"]:
773
+ dl_score, dl_label = self.get_dl_score(audio_path)
774
+
775
+ # 4. Calculate weighted ensemble score
776
+ used_calibration = False
777
+ threshold = 0.55
778
+
779
+ if analysis_mode == "full" and self.calibrator and self.calibrator.ready:
780
+ calibrated_score = self.calibrator.predict(phys_score, dl_score)
781
+ if calibrated_score is not None:
782
+ final_score = calibrated_score
783
+ used_calibration = True
784
+ threshold = float(self.calibrator.threshold)
785
+ else:
786
+ final_score = (
787
+ self.physics_weight * phys_score +
788
+ self.dl_weight * dl_score
789
+ )
790
+ elif analysis_mode == "physics":
791
+ final_score = phys_score
792
+ elif analysis_mode == "dl":
793
+ final_score = dl_score
794
+ else:
795
+ final_score = (
796
+ self.physics_weight * phys_score +
797
+ self.dl_weight * dl_score
798
+ )
799
+
800
+ # Round to 2 decimal places
801
+ final_score = round(float(final_score), 2)
802
+
803
+ # 5. Determine classification
804
+ classification = "AI_GENERATED" if final_score > threshold else "HUMAN"
805
+
806
+ # 6. Generate explanation
807
+ explanation = self.generate_explanation(
808
+ final_score,
809
+ phys_score,
810
+ dl_score,
811
+ dl_label,
812
+ phys_feats,
813
+ ai_threshold=threshold
814
+ )
815
+
816
+ # 7. Return API-compliant response (ensure all values are JSON serializable)
817
+ return {
818
+ "status": "success",
819
+ "language": detected_language,
820
+ "classification": classification,
821
+ "confidenceScore": float(final_score), # Convert to Python float
822
+ "explanation": explanation,
823
+ "analysisMode": analysis_mode,
824
+ "debug": {
825
+ "physics_score": float(phys_score),
826
+ "dl_score": float(dl_score),
827
+ "dl_label": dl_label,
828
+ "physics_weight": f"{self.physics_weight*100:.0f}%",
829
+ "dl_weight": f"{self.dl_weight*100:.0f}%",
830
+ "analysis_mode": analysis_mode,
831
+ "used_calibration": used_calibration,
832
+ "calibration_threshold": float(threshold) if used_calibration else None,
833
+ "calibration_path": self.calibrator.calibration_path if used_calibration else None,
834
+ "audio_duration": float(phys_feats.get('duration', 0)),
835
+ "was_truncated": bool(phys_feats.get('was_truncated', False)),
836
+ "physics_features": {k: float(v) if isinstance(v, (np.floating, np.integer)) else v
837
+ for k, v in phys_feats.items()
838
+ if k not in ['duration', 'was_truncated']}
839
+ }
840
+ }
841
+
842
+ except Exception as e:
843
+ import traceback
844
+ return {
845
+ "status": "error",
846
+ "error": str(e),
847
+ "traceback": traceback.format_exc()
848
+ }
849
+
850
+ finally:
851
+ # Clean up temporary file
852
+ if temp_file and os.path.exists(temp_file):
853
+ try:
854
+ os.unlink(temp_file)
855
+ except:
856
+ pass
857
+
858
+ # ==========================================================
859
+ # UTILITY: Update Weights
860
+ # ==========================================================
861
+ def update_weights(self, physics_weight, dl_weight):
862
+ """
863
+ Update ensemble weights dynamically
864
+
865
+ Args:
866
+ physics_weight: New physics weight (0-1)
867
+ dl_weight: New DL weight (0-1)
868
+ """
869
+ total = physics_weight + dl_weight
870
+ self.physics_weight = physics_weight / total
871
+ self.dl_weight = dl_weight / total
872
+
873
+ print(f"⚙️ Weights updated:")
874
+ print(f" Physics: {self.physics_weight*100:.0f}%")
875
+ print(f" DL: {self.dl_weight*100:.0f}%")
download_models.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pre-download Models Script
3
+ Downloads all required AI models before deployment
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+
10
+ print("="*70)
11
+ print("Voice Detection API - Model Download Script")
12
+ print("="*70)
13
+ print()
14
+
15
+ # Check if we're in the right directory
16
+ if not Path("requirements.txt").exists():
17
+ print("ERROR: requirements.txt not found!")
18
+ print("Please run this script from the project root directory.")
19
+ sys.exit(1)
20
+
21
+ print("This script will download the following models:")
22
+ print("1. Wav2Vec2 Deepfake Detector (~1.2 GB)")
23
+ print("2. Whisper Base Language Model (~500 MB)")
24
+ print()
25
+ print("Total download size: ~1.7 GB")
26
+ print("This may take 5-15 minutes depending on your internet speed.")
27
+ print()
28
+
29
+ response = input("Continue? (y/n): ")
30
+ if response.lower() != 'y':
31
+ print("Download cancelled.")
32
+ sys.exit(0)
33
+
34
+ print()
35
+ print("="*70)
36
+ print("Step 1/2: Downloading Wav2Vec2 Deepfake Detector")
37
+ print("="*70)
38
+
39
+ try:
40
+ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
41
+
42
+ print("Downloading model...")
43
+ model = AutoModelForAudioClassification.from_pretrained(
44
+ 'garystafford/wav2vec2-deepfake-voice-detector'
45
+ )
46
+
47
+ print("Downloading feature extractor...")
48
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
49
+ 'garystafford/wav2vec2-deepfake-voice-detector'
50
+ )
51
+
52
+ print("✅ Wav2Vec2 model downloaded successfully!")
53
+ print()
54
+
55
+ except Exception as e:
56
+ print(f"❌ Failed to download Wav2Vec2 model: {str(e)}")
57
+ print("Please check your internet connection and try again.")
58
+ sys.exit(1)
59
+
60
+ print("="*70)
61
+ print("Step 2/2: Downloading Whisper Language Detection Model")
62
+ print("="*70)
63
+
64
+ try:
65
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
66
+
67
+ print("Downloading processor...")
68
+ processor = WhisperProcessor.from_pretrained('openai/whisper-base')
69
+
70
+ print("Downloading model...")
71
+ model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-base')
72
+
73
+ print("✅ Whisper model downloaded successfully!")
74
+ print()
75
+
76
+ except Exception as e:
77
+ print(f"❌ Failed to download Whisper model: {str(e)}")
78
+ print("Please check your internet connection and try again.")
79
+ sys.exit(1)
80
+
81
+ print("="*70)
82
+ print("✅ All models downloaded successfully!")
83
+ print("="*70)
84
+ print()
85
+ print("Models are cached in:", Path.home() / ".cache" / "huggingface")
86
+ print()
87
+ print("Next steps:")
88
+ print("1. The models will be automatically used by the API")
89
+ print("2. Start the API: python app.py")
90
+ print("3. Test the API: python test_api.py")
91
+ print()
92
+ print("="*70)
pytest.ini ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [pytest]
2
+ testpaths = tests
3
+ markers =
4
+ integration: tests that require full models and data
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask
2
+ flask-cors
3
+ flask-sock
4
+ Werkzeug
5
+ transformers
6
+ librosa
7
+ soundfile
8
+ scipy
9
+ numpy
10
+ pydub
11
+ python-dotenv
12
+ gunicorn
13
+ pytest
14
+ # Note: torch, torchaudio are handled in Dockerfile
self_learning_train.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train a lightweight calibration model from feedback audio samples.
3
+
4
+ This script builds a simple logistic regression calibration layer that
5
+ maps physics and deep learning scores to a calibrated probability.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import shutil
12
+ import sys
13
+ from datetime import datetime
14
+
15
+ import numpy as np
16
+
17
+ from detector import HybridEnsembleDetector
18
+
19
+
20
# Map feedback-directory names to binary training labels:
#   1 = AI-generated / synthetic audio, 0 = genuine human audio.
# Several directory-name aliases map to the same label so feedback
# folders named differently are all picked up by iter_audio_files().
LABEL_MAP = {
    "AI_GENERATED": 1,
    "AI": 1,
    "FAKE": 1,
    "SYNTHETIC": 1,
    "HUMAN": 0,
    "REAL": 0
}
28
+
29
+
30
def sigmoid(z):
    """Numerically stable logistic function.

    The argument is clamped to [-30, 30] before exponentiation so that
    ``np.exp`` never overflows; outside that range the sigmoid is already
    saturated to ~0 or ~1 within float64 precision.
    """
    clamped = np.clip(z, -30.0, 30.0)
    return np.reciprocal(1.0 + np.exp(-clamped))
33
+
34
+
35
def train_logreg(X, y, lr=0.5, epochs=300, l2=0.001):
    """Fit a logistic-regression model by full-batch gradient descent.

    Args:
        X: (n_samples, n_features) float array of inputs.
        y: (n_samples,) array of 0/1 targets.
        lr: learning rate.
        epochs: number of full-batch gradient steps.
        l2: L2 regularization strength (applied to weights only, not bias).

    Returns:
        Tuple ``(w, b)`` of learned weight vector and scalar bias.
    """
    n_samples = float(X.shape[0])
    weights = np.zeros(X.shape[1], dtype=np.float64)
    bias = 0.0

    for _ in range(epochs):
        # Forward pass, then gradient of the (regularized) log-loss.
        residual = sigmoid(X.dot(weights) + bias) - y
        weights -= lr * ((X.T.dot(residual) / n_samples) + (l2 * weights))
        bias -= lr * residual.mean()

    return weights, bias
50
+
51
+
52
def best_threshold(y_true, y_prob):
    """Pick the decision threshold in [0.1, 0.9] that maximizes F1.

    Scans 81 evenly spaced candidates (step 0.01) and keeps the first
    threshold that achieves the highest F1 (small epsilons guard against
    division by zero when a class is never predicted).

    Returns:
        Tuple ``(threshold, f1)``.
    """
    best = (0.5, -1.0)  # (threshold, f1)

    for candidate in np.linspace(0.1, 0.9, 81):
        predicted = (y_prob >= candidate).astype(int)
        true_pos = float(((predicted == 1) & (y_true == 1)).sum())
        false_pos = float(((predicted == 1) & (y_true == 0)).sum())
        false_neg = float(((predicted == 0) & (y_true == 1)).sum())
        precision = true_pos / (true_pos + false_pos + 1e-9)
        recall = true_pos / (true_pos + false_neg + 1e-9)
        f1 = (2 * precision * recall) / (precision + recall + 1e-9)
        if f1 > best[1]:
            best = (float(candidate), f1)

    return best
70
+
71
+
72
def iter_audio_files(data_dir, max_per_class=0):
    """Collect labelled audio samples from the feedback directory tree.

    Walks ``data_dir/<LABEL_NAME>`` for every alias in ``LABEL_MAP`` and
    gathers ``.mp3``/``.wav`` files.  If a sidecar ``<name>.json`` exists
    and contains both ``physics_score`` and ``dl_score``, those precomputed
    scores are attached so the caller can skip re-scoring.

    Args:
        data_dir: Root of the feedback dataset.
        max_per_class: Cap on samples kept per label value (0 = unlimited).

    Returns:
        List of dicts with at least ``path`` and ``label`` keys.
    """
    collected = []
    per_label_counts = {0: 0, 1: 0}

    for dir_name, label in LABEL_MAP.items():
        class_dir = os.path.join(data_dir, dir_name)
        if not os.path.isdir(class_dir):
            continue

        for current_root, _, filenames in os.walk(class_dir):
            for filename in filenames:
                if not filename.lower().endswith((".mp3", ".wav")):
                    continue
                if max_per_class and per_label_counts[label] >= max_per_class:
                    continue

                audio_path = os.path.join(current_root, filename)
                entry = {"path": audio_path, "label": label}

                sidecar = os.path.splitext(audio_path)[0] + ".json"
                if os.path.exists(sidecar):
                    # Best-effort: a corrupt sidecar simply means the
                    # sample gets re-scored by the detector later.
                    try:
                        with open(sidecar, "r", encoding="utf-8") as handle:
                            meta = json.load(handle)
                        if "physics_score" in meta and "dl_score" in meta:
                            entry["physics_score"] = float(meta["physics_score"])
                            entry["dl_score"] = float(meta["dl_score"])
                    except Exception:
                        pass

                collected.append(entry)
                per_label_counts[label] += 1

    return collected
109
+
110
+
111
def main():
    """CLI entry point: train and persist the calibration layer.

    Pipeline: gather labelled feedback samples, score any that lack
    precomputed scores via HybridEnsembleDetector, fit a 2-feature
    logistic regression (physics_score, dl_score), pick the best F1
    threshold, archive the previous calibration file, and write the new
    calibration JSON.

    Returns:
        0 on success, 1 on any recoverable failure (missing data,
        not enough samples).
    """
    parser = argparse.ArgumentParser(description="Train calibration layer from feedback samples")
    parser.add_argument("--data-dir", default="data/feedback", help="Feedback dataset directory")
    parser.add_argument("--output", default="data/calibration.json", help="Output calibration JSON file")
    parser.add_argument("--history-dir", default=os.environ.get(
        "CALIBRATION_HISTORY_DIR",
        "data/calibration_history"
    ), help="Directory to store calibration history backups")
    parser.add_argument("--epochs", type=int, default=300, help="Training epochs")
    parser.add_argument("--lr", type=float, default=0.5, help="Learning rate")
    parser.add_argument("--l2", type=float, default=0.001, help="L2 regularization")
    parser.add_argument("--min-samples", type=int, default=20, help="Minimum samples required")
    parser.add_argument("--max-per-class", type=int, default=0, help="Max samples per class (0 = all)")
    parser.add_argument("--deepfake-model-path", default=os.environ.get(
        "DEEPFAKE_MODEL_PATH",
        "garystafford/wav2vec2-deepfake-voice-detector"
    ))
    parser.add_argument("--whisper-model-path", default=os.environ.get(
        "WHISPER_MODEL_PATH",
        "openai/whisper-base"
    ))
    parser.add_argument("--use-local-deepfake-model", action="store_true", default=False)
    parser.add_argument("--use-local-whisper-model", action="store_true", default=False)
    parser.add_argument("--max-audio-duration", type=int, default=30)

    args = parser.parse_args()

    if args.history_dir:
        os.makedirs(args.history_dir, exist_ok=True)

    if not os.path.isdir(args.data_dir):
        print(f"Data directory not found: {args.data_dir}")
        return 1

    samples = iter_audio_files(args.data_dir, max_per_class=args.max_per_class)
    if not samples:
        print("No audio samples found.")
        return 1

    # Only pay the cost of loading the full detector when at least one
    # sample is missing precomputed scores from its metadata sidecar.
    needs_scoring = any("physics_score" not in sample for sample in samples)
    detector = None
    if needs_scoring:
        detector = HybridEnsembleDetector(
            deepfake_model_path=args.deepfake_model_path,
            whisper_model_path=args.whisper_model_path,
            use_local_deepfake_model=args.use_local_deepfake_model,
            use_local_whisper_model=args.use_local_whisper_model,
            max_audio_duration=args.max_audio_duration
        )

    features = []
    labels = []
    skipped = 0

    for sample in samples:
        if "physics_score" in sample and "dl_score" in sample:
            # Use the cached scores stored alongside the feedback audio.
            phys_score = sample["physics_score"]
            dl_score = sample["dl_score"]
        else:
            if detector is None:
                skipped += 1
                continue
            scores = detector.extract_scores(sample["path"], input_type="file")
            if scores.get("status") != "success":
                skipped += 1
                continue
            phys_score = scores["physics_score"]
            dl_score = scores["dl_score"]

        features.append([phys_score, dl_score])
        labels.append(sample["label"])

    if skipped:
        print(f"Skipped {skipped} samples due to scoring errors.")

    if len(features) < args.min_samples:
        print(f"Not enough samples to train. Found {len(features)}.")
        return 1

    X = np.array(features, dtype=np.float64)
    y = np.array(labels, dtype=np.float64)

    # Fit the calibration layer and evaluate it on the training set.
    w, b = train_logreg(X, y, lr=args.lr, epochs=args.epochs, l2=args.l2)
    probs = sigmoid(X.dot(w) + b)
    threshold, f1 = best_threshold(y, probs)
    predictions = (probs >= threshold).astype(int)
    accuracy = float((predictions == y).mean())

    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Archive the previous calibration (with a metadata sidecar) before
    # overwriting, so /api/rollback-calibration can restore it.
    if os.path.exists(args.output):
        version_id = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ") + "_" + os.urandom(4).hex()
        history_name = f"calibration_{version_id}.json"
        history_path = os.path.join(args.history_dir, history_name)
        shutil.copy2(args.output, history_path)
        meta_path = os.path.join(args.history_dir, f"calibration_{version_id}.meta.json")
        meta = {
            "versionId": version_id,
            "source": args.output,
            "archivedAt": datetime.utcnow().isoformat() + "Z",
            "reason": "self_learning_train"
        }
        with open(meta_path, "w", encoding="utf-8") as handle:
            json.dump(meta, handle, indent=2)

    calibration = {
        "version": 1,
        "trained_at": datetime.utcnow().isoformat() + "Z",
        "weights": [float(w[0]), float(w[1])],
        "bias": float(b),
        "threshold": float(threshold),
        "feature_order": ["physics_score", "dl_score"],
        "metrics": {
            "accuracy": accuracy,
            "f1": float(f1)
        },
        "samples": {
            "count": int(len(features)),
            "ai": int((y == 1).sum()),
            "human": int((y == 0).sum())
        }
    }

    with open(args.output, "w", encoding="utf-8") as handle:
        json.dump(calibration, handle, indent=2)

    print(f"Calibration saved to {args.output}")
    print(f"Accuracy: {accuracy:.3f} | F1: {f1:.3f} | Threshold: {threshold:.2f}")
    return 0
242
+
243
+
244
+ if __name__ == "__main__":
245
+ sys.exit(main())
tests/conftest.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import importlib
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+
10
+ TEST_API_KEY = "test_key_123"
11
+
12
+
13
class DummyCalibrator:
    """Minimal calibrator stand-in: reports "no calibration loaded"."""
    # ready=False / calibration_path=None mimic a detector that has not
    # loaded any calibration file, so tests exercise uncalibrated paths.
    ready = False
    calibration_path = None
16
+
17
+
18
class DummyDetector:
    """Deterministic detector double used by the API tests.

    Mirrors the methods the app calls on the real detector (``analyze``,
    ``extract_scores``, ``reload_calibration``) but returns canned values,
    so tests never need model weights or real audio decoding.
    """

    def __init__(self):
        self.calibrator = DummyCalibrator()

    def analyze(self, audio_input, input_type="file", audio_format="mp3", analysis_mode="full"):
        """Return a fixed successful AI_GENERATED verdict for any input."""
        verdict = {
            "status": "success",
            "language": "English",
            "classification": "AI_GENERATED",
            "confidenceScore": 0.87,
            "explanation": "Dummy detector response",
            "analysisMode": analysis_mode,
        }
        verdict["debug"] = {
            "analysis_mode": analysis_mode,
            "used_calibration": False
        }
        return verdict

    def extract_scores(self, audio_input, input_type="file", audio_format="mp3"):
        """Return fixed physics/deep-learning scores for any input."""
        return {
            "status": "success",
            "physics_score": 0.42,
            "dl_score": 0.84,
            "dl_label": "Fake/Deepfake",
            "physics_method": "Physics Analysis",
            "audio_duration": 1.0,
            "was_truncated": False
        }

    def reload_calibration(self, calibration_path=None):
        """Report success only when the given calibration file exists."""
        return bool(calibration_path and os.path.exists(calibration_path))
49
+
50
+
51
def load_app(tmp_path, monkeypatch, overrides=None):
    """Import the Flask ``app`` module fresh with a test environment.

    Sets the env vars the app reads (pointing all storage at ``tmp_path``),
    forces a clean re-import of ``app``, then swaps in a ``DummyDetector``
    so no real models are loaded.  ``overrides`` lets a test change a
    variable or delete it by passing ``None`` as the value.
    """
    env = {
        "API_KEY": TEST_API_KEY,
        "SKIP_MODEL_LOAD": "true",
        "ENABLE_STREAMING": "true",
        "ENABLE_FEEDBACK_STORAGE": "true",
        "FEEDBACK_STORAGE_DIR": str(tmp_path / "feedback"),
        "FEEDBACK_MAX_BYTES": "2048",
        "CALIBRATION_PATH": str(tmp_path / "calibration.json"),
        "CALIBRATION_HISTORY_DIR": str(tmp_path / "calibration_history"),
        "CALIBRATION_HISTORY_MAX": "5",
        "STREAMING_PARTIAL_INTERVAL_SECONDS": "0.5"
    }
    if overrides:
        env.update(overrides)

    for key, value in env.items():
        if value is None:
            # ``None`` override means "unset this variable".
            monkeypatch.delenv(key, raising=False)
        else:
            monkeypatch.setenv(key, str(value))

    # Drop any cached module so import-time configuration is re-read.
    if "app" in sys.modules:
        del sys.modules["app"]

    app_module = importlib.import_module("app")
    importlib.reload(app_module)

    dummy = DummyDetector()
    app_module.detector = dummy

    def init_detector():
        # Keep the dummy in place even if the app re-initializes itself.
        app_module.detector = dummy
        return True

    app_module.init_detector = init_detector

    return app_module
89
+
90
+
91
@pytest.fixture
def app_factory(tmp_path, monkeypatch):
    """Factory fixture: build the app module with per-test env overrides."""
    def _factory(**overrides):
        return load_app(tmp_path, monkeypatch, overrides=overrides)
    return _factory
96
+
97
+
98
@pytest.fixture
def app_module(app_factory):
    """App module loaded with the default test environment (no overrides)."""
    return app_factory()
101
+
102
+
103
@pytest.fixture
def client(app_module):
    """Flask test client bound to the freshly loaded app."""
    return app_module.app.test_client()
106
+
107
+
108
@pytest.fixture
def api_headers():
    """Headers for an authenticated JSON request against the API."""
    return {
        "Content-Type": "application/json",
        "x-api-key": TEST_API_KEY
    }
114
+
115
+
116
@pytest.fixture
def sample_audio_base64():
    """200 zero bytes, base64-encoded (DummyDetector never decodes them)."""
    return base64.b64encode(b"\x00" * 200).decode("utf-8")
119
+
120
+
121
def find_test_audio_files():
    """Return sorted .mp3/.wav paths in the repo-level ``test_audio`` dir.

    Returns an empty list when the directory does not exist.
    """
    audio_dir = Path(__file__).resolve().parent.parent / "test_audio"
    if not audio_dir.exists():
        return []
    wanted = {".mp3", ".wav"}
    return sorted(p for p in audio_dir.iterdir() if p.suffix.lower() in wanted)
126
+
127
+
128
+ def load_test_audio_base64(prefer_extension=".mp3"):
129
+ candidates = find_test_audio_files()
130
+ for path in candidates:
131
+ if path.suffix.lower() == prefer_extension:
132
+ return path, base64.b64encode(path.read_bytes()).decode("utf-8")
133
+ if candidates:
134
+ path = candidates[0]
135
+ return path, base64.b64encode(path.read_bytes()).decode("utf-8")
136
+ return None, None
137
+
138
+
139
@pytest.fixture
def test_audio_base64():
    """Real audio fixture: skips the test when test_audio/ has no samples."""
    path, b64_data = load_test_audio_base64(".mp3")
    if not b64_data:
        pytest.skip("No audio files found in test_audio/")
    return path, b64_data
tests/test_api.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+
6
+
7
def test_health(client):
    """/health is unauthenticated and reports the streaming feature flag."""
    response = client.get("/health")
    assert response.status_code == 200
    payload = response.get_json()
    assert payload["status"] == "healthy"
    assert payload["streaming_enabled"] is True


def test_voice_detection_success_with_sample_base64(client, api_headers, sample_audio_base64):
    """Happy path with synthetic base64 audio; DummyDetector always says AI."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", data=json.dumps(payload), headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"
    assert data["classification"] == "AI_GENERATED"


def test_voice_detection_success_with_test_audio(client, api_headers, test_audio_base64):
    """Happy path with a real file from test_audio/ (mp3 only)."""
    path, audio_b64 = test_audio_base64
    if path.suffix.lower() != ".mp3":
        pytest.skip("test_audio file is not mp3 (endpoint only supports mp3).")

    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": audio_b64
    }
    response = client.post("/api/voice-detection", data=json.dumps(payload), headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"
42
+
43
+
44
def test_voice_detection_missing_api_key(client, sample_audio_base64):
    """Requests without an x-api-key header are rejected with 401."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload)
    assert response.status_code == 401


def test_voice_detection_invalid_api_key(client, api_headers, sample_audio_base64):
    """A wrong x-api-key value is rejected with 403."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    headers = dict(api_headers)
    headers["x-api-key"] = "wrong_key"
    response = client.post("/api/voice-detection", json=payload, headers=headers)
    assert response.status_code == 403


def test_voice_detection_invalid_content_type(client, api_headers):
    """A body that is not valid JSON is rejected with 400."""
    response = client.post("/api/voice-detection", data="not json", headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_missing_fields(client, api_headers):
    """A payload missing audioFormat/audioBase64 is rejected with 400."""
    payload = {"language": "English"}
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_unsupported_language(client, api_headers, sample_audio_base64):
    """An unsupported language (here Spanish) is rejected with 400."""
    payload = {
        "language": "Spanish",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_unsupported_audio_format(client, api_headers, sample_audio_base64):
    """An audioFormat other than mp3 (here wav) is rejected with 400."""
    payload = {
        "language": "English",
        "audioFormat": "wav",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_voice_detection_invalid_audio_payload(client, api_headers):
    """An audioBase64 value that is too short/invalid is rejected with 400."""
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": "short"
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 400
105
+
106
+
107
def test_voice_detection_analysis_error(app_module, client, api_headers, sample_audio_base64):
    """Detector-level failures surface as HTTP 500."""
    def error_analyze(*args, **kwargs):
        return {"status": "error", "error": "boom"}

    app_module.detector.analyze = error_analyze

    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/voice-detection", json=payload, headers=api_headers)
    assert response.status_code == 500


def test_reload_calibration_not_found(client, api_headers):
    """Reload reports 404 when no calibration file exists yet."""
    response = client.post("/api/reload-calibration", headers=api_headers)
    assert response.status_code == 404


def test_reload_calibration_success(app_module, client, api_headers):
    """Reload succeeds once a calibration file is present on disk."""
    calibration_file = Path(app_module.CALIBRATION_PATH)
    calibration_file.parent.mkdir(parents=True, exist_ok=True)
    calibration_file.write_text("{}", encoding="utf-8")

    response = client.post("/api/reload-calibration", headers=api_headers)
    assert response.status_code == 200


def test_backup_and_rollback_calibration(app_module, client, api_headers):
    """Backup archives the current file; rollback restores it by versionId."""
    calibration_file = Path(app_module.CALIBRATION_PATH)
    calibration_file.parent.mkdir(parents=True, exist_ok=True)
    calibration_file.write_text('{"version": "original"}', encoding="utf-8")

    backup_response = client.post("/api/backup-calibration", headers=api_headers)
    assert backup_response.status_code == 200
    backup_payload = backup_response.get_json()
    version_id = backup_payload["versionId"]

    # Overwrite the live file, then roll back to the archived version.
    calibration_file.write_text('{"version": "new"}', encoding="utf-8")

    rollback_response = client.post(
        "/api/rollback-calibration",
        json={"versionId": version_id},
        headers=api_headers
    )
    assert rollback_response.status_code == 200
    assert calibration_file.read_text(encoding="utf-8") == '{"version": "original"}'


def test_backup_calibration_missing_file(client, api_headers):
    """Backup reports 404 when there is no calibration file to archive."""
    response = client.post("/api/backup-calibration", headers=api_headers)
    assert response.status_code == 404


def test_rollback_calibration_missing_version(client, api_headers):
    """Rollback without a versionId in the payload is rejected with 400."""
    response = client.post("/api/rollback-calibration", json={}, headers=api_headers)
    assert response.status_code == 400


def test_calibration_history_list(app_module, client, api_headers):
    """History endpoint lists archived calibration files."""
    history_dir = Path(app_module.CALIBRATION_HISTORY_DIR)
    history_dir.mkdir(parents=True, exist_ok=True)
    history_file = history_dir / "calibration_20260207T120000Z_deadbeef.json"
    history_file.write_text("{}", encoding="utf-8")

    response = client.get("/api/calibration-history", headers=api_headers)
    assert response.status_code == 200
    payload = response.get_json()
    assert payload["status"] == "success"
    assert payload["history"]
tests/test_feedback.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+
8
def test_feedback_success_with_scoring(client, api_headers, app_module):
    """Feedback stores the audio plus a metadata sidecar with scores."""
    audio_bytes = b"\x01" * 400
    payload = {
        "label": "AI_GENERATED",
        "audioFormat": "mp3",
        "audioBase64": base64.b64encode(audio_bytes).decode("utf-8"),
        "runDetection": True,
        "metadata": {"source": "unit-test"}
    }

    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 200
    data = response.get_json()
    assert data["status"] == "success"

    storage_dir = Path(app_module.FEEDBACK_STORAGE_DIR)
    assert storage_dir.exists()
    stored_files = list(storage_dir.rglob("*.mp3"))
    assert stored_files, "Expected feedback audio file to be stored"
    meta_files = list(storage_dir.rglob("*.json"))
    assert meta_files, "Expected feedback metadata to be stored"

    # runDetection=True should add the detector scores to the sidecar.
    metadata = json.loads(meta_files[0].read_text(encoding="utf-8"))
    assert metadata["label"] == "AI_GENERATED"
    assert "physics_score" in metadata
    assert "dl_score" in metadata


def test_feedback_invalid_label(client, api_headers, sample_audio_base64):
    """A label outside the accepted set is rejected with 400."""
    payload = {
        "label": "UNKNOWN",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 400


def test_feedback_disabled(app_factory, sample_audio_base64, api_headers):
    """With ENABLE_FEEDBACK_STORAGE=false the endpoint returns 403."""
    app_module = app_factory(ENABLE_FEEDBACK_STORAGE="false")
    client = app_module.app.test_client()

    payload = {
        "label": "HUMAN",
        "audioFormat": "mp3",
        "audioBase64": sample_audio_base64
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 403


def test_feedback_too_large_payload(app_module, client, api_headers):
    """Audio larger than FEEDBACK_MAX_BYTES is rejected with 413."""
    big_audio = base64.b64encode(b"\x00" * (app_module.FEEDBACK_MAX_BYTES + 10)).decode("utf-8")
    payload = {
        "label": "AI_GENERATED",
        "audioFormat": "mp3",
        "audioBase64": big_audio
    }
    response = client.post("/api/feedback", json=payload, headers=api_headers)
    assert response.status_code == 413
tests/test_integration_model.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+
6
+ from detector import HybridEnsembleDetector
7
+
8
+
9
# All tests in this module are opt-in integration tests: they need real
# model weights and are enabled only via RUN_MODEL_TESTS=true.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        os.environ.get("RUN_MODEL_TESTS", "").lower() not in ["1", "true", "yes"],
        reason="Integration tests require RUN_MODEL_TESTS=true and model weights available."
    )
]


def find_ai_miss_audio():
    """Locate a known-hard AI sample: AI_MISS_AUDIO_PATH env var first,
    else the first test_audio/ file whose stem contains miss/false/hard.
    Returns None when no candidate exists."""
    env_path = os.environ.get("AI_MISS_AUDIO_PATH")
    if env_path and Path(env_path).exists():
        return Path(env_path)

    base_dir = Path(__file__).resolve().parent.parent / "test_audio"
    if not base_dir.exists():
        return None

    candidates = []
    for path in base_dir.iterdir():
        if path.suffix.lower() not in [".mp3", ".wav"]:
            continue
        name = path.stem.lower()
        if "miss" in name or "false" in name or "hard" in name:
            candidates.append(path)

    return candidates[0] if candidates else None


@pytest.mark.xfail(reason="Known false negative before retraining", strict=False)
def test_known_false_negative_ai_sample():
    """Regression marker: an AI sample the detector currently misses.

    xfail (non-strict) so it documents the gap without failing CI; flips
    to passing once retraining fixes the false negative."""
    audio_path = find_ai_miss_audio()
    if audio_path is None:
        pytest.skip("No known false-negative AI sample provided.")

    detector = HybridEnsembleDetector(
        deepfake_model_path=os.environ.get(
            "DEEPFAKE_MODEL_PATH",
            "garystafford/wav2vec2-deepfake-voice-detector"
        ),
        whisper_model_path=os.environ.get(
            "WHISPER_MODEL_PATH",
            "openai/whisper-base"
        ),
        use_local_deepfake_model=os.environ.get("USE_LOCAL_DEEPFAKE_MODEL", "false").lower() in ["1", "true"],
        use_local_whisper_model=os.environ.get("USE_LOCAL_WHISPER_MODEL", "false").lower() in ["1", "true"],
        max_audio_duration=int(os.environ.get("MAX_AUDIO_DURATION", "30"))
    )

    result = detector.analyze(str(audio_path), input_type="file")
    assert result["status"] == "success"
    assert result["classification"] == "AI_GENERATED"
tests/test_streaming.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+
4
+
5
class FakeWebSocket:
    """In-memory stand-in for a flask-sock WebSocket connection.

    Feeds a scripted sequence of incoming messages to the handler under
    test and records everything the handler sends back (JSON-decoded
    when the payload parses, raw otherwise).
    """

    def __init__(self, messages, api_key):
        self._messages = iter(messages)
        self.sent = []
        # The handler authenticates via the api_key query parameter.
        self.environ = {"QUERY_STRING": f"api_key={api_key}"}

    def receive(self):
        """Return the next scripted message, or None when exhausted."""
        return next(self._messages, None)

    def send(self, message):
        """Record an outgoing message, decoding JSON when possible."""
        try:
            decoded = json.loads(message)
        except Exception:
            decoded = message
        self.sent.append(decoded)
22
+
23
+
24
def build_pcm16_chunk(sample_rate=16000, channels=1, seconds=1.0):
    """Base64-encoded silent PCM16 audio of the requested duration.

    PCM16 is 2 bytes per sample, so the payload is
    ``sample_rate * channels * 2 * seconds`` zero bytes.
    """
    total_bytes = int(sample_rate * channels * 2 * seconds)
    return base64.b64encode(bytes(total_bytes)).decode("utf-8")
28
+
29
+
30
def test_streaming_success_with_partial_and_final(app_module):
    """A full session emits ack, progress, partial_result and final_result."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "pcm16",
        "sampleRate": 16000,
        "channels": 1,
        "enablePartial": True,
        "partialIntervalSec": 0.5
    })

    # One second of audio marked final so the handler closes the session.
    chunk_msg = json.dumps({
        "type": "audio_chunk",
        "audioChunkBase64": build_pcm16_chunk(seconds=1.0),
        "final": True
    })

    ws = FakeWebSocket([start_msg, chunk_msg], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    types = [msg.get("type") for msg in ws.sent if isinstance(msg, dict)]
    assert "ack" in types
    assert "progress" in types
    assert "partial_result" in types
    assert "final_result" in types


def test_streaming_invalid_api_key(app_module):
    """A bad api_key query parameter yields an error as the first message."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "pcm16",
        "sampleRate": 16000,
        "channels": 1
    })

    ws = FakeWebSocket([start_msg], api_key="bad_key")
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"


def test_streaming_invalid_format(app_module):
    """An unsupported audioFormat (aac) yields an error message."""
    start_msg = json.dumps({
        "type": "start",
        "audioFormat": "aac",
        "sampleRate": 16000,
        "channels": 1
    })

    ws = FakeWebSocket([start_msg], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"


def test_streaming_disabled(app_factory):
    """With ENABLE_STREAMING=false every session is refused with an error."""
    app_module = app_factory(ENABLE_STREAMING="false")
    ws = FakeWebSocket([json.dumps({"type": "start"})], api_key=app_module.API_KEY)
    app_module.voice_stream(ws)

    assert ws.sent
    assert ws.sent[0]["type"] == "error"
try.ipynb ADDED
The diff for this file is too large to render. See raw diff