Spaces:
Running
Running
shivam0897-i commited on
Commit ·
3b6fefe
0
Parent(s):
fix: correct id2label key-type mismatch causing inverted classifications
Browse filesHuggingFace model.config.id2label uses string keys ('0','1') but
torch.argmax().item() returns int. The .get() always missed and fell
through to a hardcoded fallback with opposite label polarity, inverting
every single classification (human->AI, AI->human).
Fix: normalise id2label keys to int before lookup. Add diagnostic logging.
- .env.example +79 -0
- .gitattributes +3 -0
- .gitignore +60 -0
- Dockerfile +40 -0
- README.md +93 -0
- audio_utils.py +182 -0
- config.py +185 -0
- fraud_language.py +191 -0
- llm_semantic_analyzer.py +253 -0
- main.py +1903 -0
- model.py +563 -0
- privacy_utils.py +54 -0
- requirements.txt +23 -0
- speech_to_text.py +158 -0
.env.example
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Variables
|
| 2 |
+
# Copy this file to .env and update values
|
| 3 |
+
|
| 4 |
+
# API Key for authentication (Must be set!)
|
| 5 |
+
API_KEY=your_secure_api_key_here
|
| 6 |
+
|
| 7 |
+
# Server port (Hugging Face uses 7860)
|
| 8 |
+
PORT=7860
|
| 9 |
+
|
| 10 |
+
# Optional CORS origins
|
| 11 |
+
# Example: ALLOWED_ORIGINS=https://your-ui.vercel.app,http://localhost:5173
|
| 12 |
+
ALLOWED_ORIGINS=*
|
| 13 |
+
|
| 14 |
+
# Realtime ASR settings
|
| 15 |
+
ASR_ENABLED=true
|
| 16 |
+
ASR_MODEL_SIZE=tiny
|
| 17 |
+
ASR_COMPUTE_TYPE=int8
|
| 18 |
+
ASR_BEAM_SIZE=1
|
| 19 |
+
ASR_TIMEOUT_MS=1200
|
| 20 |
+
ASR_MAX_INFLIGHT_TASKS=1
|
| 21 |
+
ASR_WARMUP_ENABLED=true
|
| 22 |
+
AUDIO_PIPELINE_WARMUP_ENABLED=true
|
| 23 |
+
VOICE_WARMUP_ENABLED=true
|
| 24 |
+
|
| 25 |
+
# Voice model settings
|
| 26 |
+
VOICE_MODEL_ID=shivam-2211/voice-detection-model
|
| 27 |
+
VOICE_MODEL_BACKUP_ID=mo-thecreator/Deepfake-audio-detection
|
| 28 |
+
VOICE_MODEL_LOCAL_PATH=./fine_tuned_model
|
| 29 |
+
REALTIME_LIGHTWEIGHT_AUDIO=true
|
| 30 |
+
LEGACY_FALLBACK_RETURNS_UNCERTAIN=true
|
| 31 |
+
|
| 32 |
+
# Privacy and retention defaults
|
| 33 |
+
MASK_TRANSCRIPT_OUTPUT=true
|
| 34 |
+
SESSION_ACTIVE_RETENTION_SECONDS=1800
|
| 35 |
+
SESSION_ENDED_RETENTION_SECONDS=300
|
| 36 |
+
|
| 37 |
+
# Realtime risk policy tuning
|
| 38 |
+
RISK_POLICY_VERSION=v1.2
|
| 39 |
+
RISK_WEIGHT_AUDIO=0.45
|
| 40 |
+
RISK_WEIGHT_KEYWORD=0.20
|
| 41 |
+
RISK_WEIGHT_SEMANTIC=0.15
|
| 42 |
+
RISK_WEIGHT_BEHAVIOUR=0.20
|
| 43 |
+
RISK_DELTA_BOOST_FACTOR=0.30
|
| 44 |
+
|
| 45 |
+
# Optional LLM semantic verifier (second-layer, disabled by default)
|
| 46 |
+
LLM_SEMANTIC_ENABLED=false
|
| 47 |
+
LLM_PROVIDER=gemini
|
| 48 |
+
# Optional override (openai example: gpt-4o-mini, gemini example: gemini-1.5-flash)
|
| 49 |
+
LLM_SEMANTIC_MODEL=
|
| 50 |
+
LLM_SEMANTIC_TIMEOUT_MS=900
|
| 51 |
+
LLM_SEMANTIC_MIN_ASR_CONFIDENCE=0.35
|
| 52 |
+
LLM_SEMANTIC_CHUNK_INTERVAL=2
|
| 53 |
+
LLM_SEMANTIC_BLEND_WEIGHT=0.20
|
| 54 |
+
OPENAI_API_KEY=
|
| 55 |
+
|
| 56 |
+
# Gemini provider key (used when LLM_PROVIDER=gemini)
|
| 57 |
+
GEMINI_API_KEY=
|
| 58 |
+
|
| 59 |
+
# Session store backend
|
| 60 |
+
# memory = current single-instance behavior
|
| 61 |
+
# redis = required for multi-worker / restart-safe sessions
|
| 62 |
+
SESSION_STORE_BACKEND=memory
|
| 63 |
+
REDIS_URL=
|
| 64 |
+
REDIS_PREFIX=ai_call_shield
|
| 65 |
+
REDIS_CONNECT_TIMEOUT_MS=2000
|
| 66 |
+
REDIS_IO_TIMEOUT_MS=2000
|
| 67 |
+
|
| 68 |
+
# Deep-lane async verification (future-ready toggles)
|
| 69 |
+
DEEP_LANE_ENABLED=false
|
| 70 |
+
DEEP_LANE_QUEUE_BACKEND=memory
|
| 71 |
+
DEEP_LANE_MAX_WORKERS=2
|
| 72 |
+
DEEP_LANE_MAX_RETRIES=1
|
| 73 |
+
DEEP_LANE_RETRY_BACKOFF_MS=500
|
| 74 |
+
DEEP_LANE_TARGET_LATENCY_MS=3000
|
| 75 |
+
|
| 76 |
+
# Performance budgets for harness and CI gates
|
| 77 |
+
PERF_CHUNK_P95_TARGET_MS=1200
|
| 78 |
+
PERF_ALERT_P95_TARGET_MS=2500
|
| 79 |
+
|
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fine_tuned_model/model.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
.venv/
|
| 3 |
+
venv/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
.pytest_cache/
|
| 9 |
+
.coverage
|
| 10 |
+
.coverage.*
|
| 11 |
+
htmlcov/
|
| 12 |
+
|
| 13 |
+
# Environment and secrets
|
| 14 |
+
.env
|
| 15 |
+
.env.*
|
| 16 |
+
!.env.example
|
| 17 |
+
|
| 18 |
+
# Local AI/tooling folders
|
| 19 |
+
.agent/
|
| 20 |
+
.agents/
|
| 21 |
+
.codex/
|
| 22 |
+
.claude/
|
| 23 |
+
.gemini/
|
| 24 |
+
.trae/
|
| 25 |
+
.windsurf/
|
| 26 |
+
|
| 27 |
+
# OS / editor
|
| 28 |
+
.DS_Store
|
| 29 |
+
Thumbs.db
|
| 30 |
+
*.log
|
| 31 |
+
.vscode/
|
| 32 |
+
.idea/
|
| 33 |
+
*.swp
|
| 34 |
+
|
| 35 |
+
# Large artifacts
|
| 36 |
+
*.mp4
|
| 37 |
+
*.wav
|
| 38 |
+
*.mp3
|
| 39 |
+
fine_tuned_model/
|
| 40 |
+
training/
|
| 41 |
+
|
| 42 |
+
# === Non-production files (keep out of HF Space) ===
|
| 43 |
+
|
| 44 |
+
# Tests
|
| 45 |
+
tests/
|
| 46 |
+
pytest.ini
|
| 47 |
+
|
| 48 |
+
# Docs and reports
|
| 49 |
+
docs/
|
| 50 |
+
|
| 51 |
+
# Dev/validation scripts
|
| 52 |
+
scripts/
|
| 53 |
+
scenario_validation_cases.py
|
| 54 |
+
|
| 55 |
+
# Test request fixtures
|
| 56 |
+
test_request.json
|
| 57 |
+
test_valid.json
|
| 58 |
+
|
| 59 |
+
# Helper/patch scripts
|
| 60 |
+
_fix_*.py
|
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies for librosa and audio processing
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
libsndfile1 \
|
| 8 |
+
ffmpeg \
|
| 9 |
+
git \
|
| 10 |
+
git-lfs \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Initialize git lfs
|
| 14 |
+
RUN git lfs install
|
| 15 |
+
|
| 16 |
+
# Copy requirements first for better caching
|
| 17 |
+
COPY requirements.txt .
|
| 18 |
+
|
| 19 |
+
# Install CPU-only PyTorch first (smaller size)
|
| 20 |
+
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 21 |
+
|
| 22 |
+
# Install other dependencies
|
| 23 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 24 |
+
|
| 25 |
+
# Copy application code and model
|
| 26 |
+
COPY . .
|
| 27 |
+
|
| 28 |
+
# Create a non-root user for HF Spaces
|
| 29 |
+
RUN useradd -m -u 1000 user
|
| 30 |
+
USER user
|
| 31 |
+
ENV HOME=/home/user \
|
| 32 |
+
PATH=/home/user/.local/bin:$PATH
|
| 33 |
+
|
| 34 |
+
WORKDIR /app
|
| 35 |
+
|
| 36 |
+
# Hugging Face Spaces uses port 7860
|
| 37 |
+
EXPOSE 7860
|
| 38 |
+
|
| 39 |
+
# Run the application
|
| 40 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Voice Detection API
|
| 3 |
+
emoji: 🎤
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
app_port: 7860
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# AI Voice Detection API
|
| 13 |
+
|
| 14 |
+
Detects whether a voice sample is AI-generated or spoken by a real human using a fine-tuned Wav2Vec2 model.
|
| 15 |
+
|
| 16 |
+
## API Endpoint
|
| 17 |
+
|
| 18 |
+
`POST /api/voice-detection`
|
| 19 |
+
|
| 20 |
+
### Headers
|
| 21 |
+
- `x-api-key`: Your API key (set via environment variable `API_KEY`)
|
| 22 |
+
|
| 23 |
+
### Request Body
|
| 24 |
+
```json
|
| 25 |
+
{
|
| 26 |
+
"language": "English",
|
| 27 |
+
"audioFormat": "mp3",
|
| 28 |
+
"audioBase64": "<base64-encoded-audio>"
|
| 29 |
+
}
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Response
|
| 33 |
+
```json
|
| 34 |
+
{
|
| 35 |
+
"status": "success",
|
| 36 |
+
"language": "English",
|
| 37 |
+
"classification": "AI_GENERATED" | "HUMAN",
|
| 38 |
+
"confidenceScore": 0.95,
|
| 39 |
+
"explanation": "AI voice indicators: ..."
|
| 40 |
+
}
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Supported Languages
|
| 44 |
+
- English
|
| 45 |
+
- Tamil
|
| 46 |
+
- Hindi
|
| 47 |
+
- Malayalam
|
| 48 |
+
- Telugu
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
## Realtime Session APIs
|
| 53 |
+
|
| 54 |
+
The backend also supports session-based realtime analysis:
|
| 55 |
+
|
| 56 |
+
- `POST /v1/session/start`
|
| 57 |
+
- `POST /v1/session/{session_id}/chunk`
|
| 58 |
+
- `GET /v1/session/{session_id}/summary`
|
| 59 |
+
- `GET /v1/session/{session_id}/alerts`
|
| 60 |
+
- `POST /v1/session/{session_id}/end`
|
| 61 |
+
|
| 62 |
+
Compatibility aliases are available under `/api/voice-detection/v1/...`.
|
| 63 |
+
|
| 64 |
+
## Optional LLM Semantic Verifier
|
| 65 |
+
|
| 66 |
+
A second-layer semantic verifier can be enabled to improve ambiguous chunk scoring:
|
| 67 |
+
|
| 68 |
+
- `LLM_SEMANTIC_ENABLED=true`
|
| 69 |
+
- `LLM_PROVIDER=openai` with `OPENAI_API_KEY=<your_key>`, or
|
| 70 |
+
- `LLM_PROVIDER=gemini` with `GEMINI_API_KEY=<your_key>`
|
| 71 |
+
- Tune with `LLM_SEMANTIC_*` env variables in `.env.example`.
|
| 72 |
+
|
| 73 |
+
If `LLM_SEMANTIC_MODEL` is empty, provider defaults are used (`gpt-4o-mini` for OpenAI, `gemini-1.5-flash` for Gemini).
|
| 74 |
+
|
| 75 |
+
The LLM layer is optional and the API continues to work when disabled.
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
## Session Store Backend
|
| 79 |
+
|
| 80 |
+
Realtime sessions support two backends:
|
| 81 |
+
|
| 82 |
+
- `memory` (default): single-instance, volatile
|
| 83 |
+
- `redis`: multi-worker and restart-safe (recommended for finals)
|
| 84 |
+
|
| 85 |
+
Backend env settings:
|
| 86 |
+
|
| 87 |
+
- `SESSION_STORE_BACKEND=redis`
|
| 88 |
+
- `REDIS_URL=redis://...` (or `rediss://...`)
|
| 89 |
+
- `REDIS_PREFIX=ai_call_shield`
|
| 90 |
+
|
| 91 |
+
`GET /health` now includes `session_store_backend` so you can verify active backend.
|
| 92 |
+
|
| 93 |
+
See `docs/architecture/redis-credentials-guide.md` for credential formats and setup steps.
|
audio_utils.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio utility functions for Base64 decoding and audio loading.
|
| 3 |
+
"""
|
| 4 |
+
import base64
|
| 5 |
+
import io
|
| 6 |
+
import tempfile
|
| 7 |
+
import os
|
| 8 |
+
import logging
|
| 9 |
+
from typing import Tuple, Optional
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# Magic bytes for common audio formats
|
| 16 |
+
AUDIO_MAGIC_BYTES = {
|
| 17 |
+
b'\xff\xfb': 'mp3', # MP3 (MPEG Audio Layer 3)
|
| 18 |
+
b'\xff\xfa': 'mp3', # MP3 variant
|
| 19 |
+
b'\xff\xf3': 'mp3', # MP3 variant
|
| 20 |
+
b'\xff\xf2': 'mp3', # MP3 variant
|
| 21 |
+
b'ID3': 'mp3', # MP3 with ID3 tag
|
| 22 |
+
b'RIFF': 'wav', # WAV
|
| 23 |
+
b'fLaC': 'flac', # FLAC
|
| 24 |
+
b'OggS': 'ogg', # OGG
|
| 25 |
+
b'\x00\x00\x00': 'm4a', # M4A/MP4 (ftyp box)
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def validate_audio_content(audio_bytes: bytes) -> Tuple[bool, str]:
|
| 30 |
+
"""
|
| 31 |
+
Validate that the bytes actually contain audio data.
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
audio_bytes: Raw bytes to validate
|
| 35 |
+
|
| 36 |
+
Returns:
|
| 37 |
+
Tuple of (is_valid, detected_format_or_error_message)
|
| 38 |
+
"""
|
| 39 |
+
if len(audio_bytes) < 12:
|
| 40 |
+
return False, "Audio data too small to be valid"
|
| 41 |
+
|
| 42 |
+
# Check for text content (common mistake: uploading CSV/JSON as audio)
|
| 43 |
+
# ASCII printable range check on first 100 bytes
|
| 44 |
+
sample = audio_bytes[:100]
|
| 45 |
+
printable_ratio = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13)) / len(sample)
|
| 46 |
+
if printable_ratio > 0.9:
|
| 47 |
+
# Likely text content
|
| 48 |
+
preview = sample[:50].decode('utf-8', errors='replace')
|
| 49 |
+
return False, f"File appears to be text, not audio. Preview: {preview[:30]}..."
|
| 50 |
+
|
| 51 |
+
# Check magic bytes
|
| 52 |
+
for magic, fmt in AUDIO_MAGIC_BYTES.items():
|
| 53 |
+
if audio_bytes.startswith(magic):
|
| 54 |
+
return True, fmt
|
| 55 |
+
|
| 56 |
+
# Check for M4A/MP4 (ftyp at offset 4)
|
| 57 |
+
if len(audio_bytes) > 8 and audio_bytes[4:8] == b'ftyp':
|
| 58 |
+
return True, "m4a"
|
| 59 |
+
|
| 60 |
+
# Unknown format but not text - allow it and let librosa try
|
| 61 |
+
logger.warning("Unknown audio format, attempting to load anyway")
|
| 62 |
+
return True, "unknown"
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def decode_base64_audio(base64_string: str) -> bytes:
|
| 66 |
+
"""
|
| 67 |
+
Decode a Base64-encoded audio string to raw bytes.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
base64_string: Base64-encoded audio data
|
| 71 |
+
|
| 72 |
+
Returns:
|
| 73 |
+
Raw audio bytes
|
| 74 |
+
|
| 75 |
+
Raises:
|
| 76 |
+
ValueError: If the Base64 string is invalid
|
| 77 |
+
"""
|
| 78 |
+
try:
|
| 79 |
+
# Strip data URI prefix if present
|
| 80 |
+
if "," in base64_string:
|
| 81 |
+
base64_string = base64_string.split(",", 1)[1]
|
| 82 |
+
|
| 83 |
+
# Remove any whitespace
|
| 84 |
+
base64_string = base64_string.strip()
|
| 85 |
+
|
| 86 |
+
return base64.b64decode(base64_string)
|
| 87 |
+
except Exception as e:
|
| 88 |
+
raise ValueError(f"Invalid Base64 encoding: {str(e)}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_audio_from_bytes(audio_bytes: bytes, target_sr: int = 22050, audio_format: str = "mp3") -> Tuple[np.ndarray, int]:
|
| 92 |
+
"""
|
| 93 |
+
Load audio from bytes into a numpy array using librosa.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
audio_bytes: Raw audio file bytes
|
| 97 |
+
target_sr: Target sample rate (default 22050 Hz)
|
| 98 |
+
audio_format: Audio format extension (mp3, wav, flac, ogg, m4a, mp4)
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Tuple of (audio waveform as numpy array, sample rate)
|
| 102 |
+
|
| 103 |
+
Raises:
|
| 104 |
+
ValueError: If audio cannot be loaded or is invalid
|
| 105 |
+
"""
|
| 106 |
+
# Validate audio content BEFORE attempting to decode
|
| 107 |
+
is_valid, validation_result = validate_audio_content(audio_bytes)
|
| 108 |
+
if not is_valid:
|
| 109 |
+
raise ValueError(f"Invalid audio file: {validation_result}")
|
| 110 |
+
|
| 111 |
+
logger.info(f"Audio validation passed. Detected format hint: {validation_result}")
|
| 112 |
+
|
| 113 |
+
tmp_path = None
|
| 114 |
+
try:
|
| 115 |
+
import librosa
|
| 116 |
+
import soundfile as sf
|
| 117 |
+
|
| 118 |
+
# Normalize format
|
| 119 |
+
audio_format = audio_format.lower().strip()
|
| 120 |
+
if audio_format.startswith("."):
|
| 121 |
+
audio_format = audio_format[1:]
|
| 122 |
+
|
| 123 |
+
# Validate format (security)
|
| 124 |
+
if not audio_format.isalnum() or len(audio_format) > 5:
|
| 125 |
+
raise ValueError(f"Invalid audio format: {audio_format}")
|
| 126 |
+
|
| 127 |
+
# Write to temp file for librosa
|
| 128 |
+
with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False) as tmp_file:
|
| 129 |
+
tmp_file.write(audio_bytes)
|
| 130 |
+
tmp_path = tmp_file.name
|
| 131 |
+
|
| 132 |
+
# Load audio with librosa
|
| 133 |
+
audio, sr = librosa.load(tmp_path, sr=target_sr, mono=True)
|
| 134 |
+
|
| 135 |
+
# Validate loaded audio
|
| 136 |
+
if len(audio) == 0:
|
| 137 |
+
raise ValueError("Audio file is empty or could not be decoded")
|
| 138 |
+
|
| 139 |
+
duration = len(audio) / sr
|
| 140 |
+
logger.info(f"Audio loaded successfully: {duration:.2f}s at {sr}Hz")
|
| 141 |
+
|
| 142 |
+
return audio, sr
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
raise ValueError(f"Failed to load audio: {str(e)}")
|
| 146 |
+
finally:
|
| 147 |
+
# Always clean up temp file, even on exceptions
|
| 148 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 149 |
+
try:
|
| 150 |
+
os.remove(tmp_path)
|
| 151 |
+
except OSError:
|
| 152 |
+
pass # Best effort cleanup
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def get_audio_duration(audio: np.ndarray, sr: int) -> float:
|
| 156 |
+
"""
|
| 157 |
+
Calculate the duration of audio in seconds.
|
| 158 |
+
|
| 159 |
+
Args:
|
| 160 |
+
audio: Audio waveform
|
| 161 |
+
sr: Sample rate
|
| 162 |
+
|
| 163 |
+
Returns:
|
| 164 |
+
Duration in seconds
|
| 165 |
+
"""
|
| 166 |
+
return len(audio) / sr
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def normalize_audio(audio: np.ndarray) -> np.ndarray:
|
| 170 |
+
"""
|
| 171 |
+
Normalize audio to have maximum amplitude of 1.0.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
audio: Audio waveform
|
| 175 |
+
|
| 176 |
+
Returns:
|
| 177 |
+
Normalized audio
|
| 178 |
+
"""
|
| 179 |
+
max_val = np.max(np.abs(audio))
|
| 180 |
+
if max_val > 0:
|
| 181 |
+
return audio / max_val
|
| 182 |
+
return audio
|
config.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management using Pydantic Settings.
|
| 3 |
+
"""
|
| 4 |
+
from pydantic_settings import BaseSettings
|
| 5 |
+
from typing import List
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Settings(BaseSettings):
|
| 10 |
+
"""Application configuration."""
|
| 11 |
+
|
| 12 |
+
# Core API Settings
|
| 13 |
+
API_KEY: str = Field(..., description="API Key for authentication")
|
| 14 |
+
PORT: int = Field(7860, description="Server port")
|
| 15 |
+
WEBSITE_URL: str = Field(
|
| 16 |
+
default="https://voice-detection-nu.vercel.app/",
|
| 17 |
+
description="Project or Portfolio URL"
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# CORS Settings
|
| 21 |
+
# Use str field with alias to read env var safely (avoids Pydantic trying to parse as JSON)
|
| 22 |
+
ALLOWED_ORIGINS_RAW: str = Field(default="*", alias="ALLOWED_ORIGINS")
|
| 23 |
+
|
| 24 |
+
@property
|
| 25 |
+
def ALLOWED_ORIGINS(self) -> List[str]:
|
| 26 |
+
"""Parse the raw CORS origins string into a list."""
|
| 27 |
+
raw_value: str = self.ALLOWED_ORIGINS_RAW
|
| 28 |
+
if raw_value.strip().startswith("["):
|
| 29 |
+
import json
|
| 30 |
+
try:
|
| 31 |
+
return json.loads(raw_value)
|
| 32 |
+
except json.JSONDecodeError:
|
| 33 |
+
pass
|
| 34 |
+
return [origin.strip() for origin in raw_value.split(",") if origin.strip()]
|
| 35 |
+
|
| 36 |
+
# Audio Constraints
|
| 37 |
+
MAX_AUDIO_SIZE_MB: int = 10
|
| 38 |
+
SUPPORTED_LANGUAGES: List[str] = [
|
| 39 |
+
"Tamil", "English", "Hindi", "Malayalam", "Telugu"
|
| 40 |
+
]
|
| 41 |
+
SUPPORTED_FORMATS: List[str] = [
|
| 42 |
+
"mp3", "wav", "flac", "ogg", "m4a", "mp4"
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
# ASR settings
|
| 46 |
+
ASR_ENABLED: bool = Field(default=True, description="Enable speech-to-text analysis for realtime sessions")
|
| 47 |
+
ASR_MODEL_SIZE: str = Field(default="tiny", description="faster-whisper model size")
|
| 48 |
+
ASR_COMPUTE_TYPE: str = Field(default="int8", description="faster-whisper compute type")
|
| 49 |
+
ASR_BEAM_SIZE: int = Field(default=1, description="Beam size for ASR decoding")
|
| 50 |
+
ASR_TIMEOUT_MS: int = Field(
|
| 51 |
+
default=2500,
|
| 52 |
+
ge=200,
|
| 53 |
+
le=15000,
|
| 54 |
+
description="Max realtime ASR duration per chunk before timeout fallback"
|
| 55 |
+
)
|
| 56 |
+
ASR_MAX_INFLIGHT_TASKS: int = Field(
|
| 57 |
+
default=1,
|
| 58 |
+
ge=1,
|
| 59 |
+
le=8,
|
| 60 |
+
description="Maximum concurrent ASR background tasks allowed to prevent thread pileups"
|
| 61 |
+
)
|
| 62 |
+
ASR_WARMUP_ENABLED: bool = Field(
|
| 63 |
+
default=True,
|
| 64 |
+
description="Warm faster-whisper model during startup to avoid first-chunk latency spike"
|
| 65 |
+
)
|
| 66 |
+
AUDIO_PIPELINE_WARMUP_ENABLED: bool = Field(
|
| 67 |
+
default=True,
|
| 68 |
+
description="Warm audio decoding/resampling pipeline during startup"
|
| 69 |
+
)
|
| 70 |
+
VOICE_WARMUP_ENABLED: bool = Field(
|
| 71 |
+
default=True,
|
| 72 |
+
description="Run one startup inference through voice analyzer to avoid first-chunk latency spikes"
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
# Voice classification model settings
|
| 76 |
+
VOICE_MODEL_ID: str = Field(
|
| 77 |
+
default="shivam-2211/voice-detection-model",
|
| 78 |
+
description="Primary Hugging Face model id for AI voice detection"
|
| 79 |
+
)
|
| 80 |
+
VOICE_MODEL_BACKUP_ID: str = Field(
|
| 81 |
+
default="mo-thecreator/Deepfake-audio-detection",
|
| 82 |
+
description="Backup model id if primary model load fails"
|
| 83 |
+
)
|
| 84 |
+
VOICE_MODEL_LOCAL_PATH: str = Field(
|
| 85 |
+
default="./fine_tuned_model",
|
| 86 |
+
description="Optional local model path that takes priority when present"
|
| 87 |
+
)
|
| 88 |
+
REALTIME_LIGHTWEIGHT_AUDIO: bool = Field(
|
| 89 |
+
default=False,
|
| 90 |
+
description="Use lightweight audio analysis path for realtime chunk processing (set true for throughput-first mode)"
|
| 91 |
+
)
|
| 92 |
+
LEGACY_FALLBACK_RETURNS_UNCERTAIN: bool = Field(
|
| 93 |
+
default=True,
|
| 94 |
+
description="Return UNCERTAIN classification on legacy endpoint when ML fallback occurs"
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
# Risk policy (versioned + configurable weights)
|
| 98 |
+
RISK_POLICY_VERSION: str = Field(default="v1.2", description="Version tag for realtime risk policy")
|
| 99 |
+
RISK_WEIGHT_AUDIO: float = Field(default=0.45, ge=0.0, le=1.0)
|
| 100 |
+
RISK_WEIGHT_KEYWORD: float = Field(default=0.20, ge=0.0, le=1.0)
|
| 101 |
+
RISK_WEIGHT_SEMANTIC: float = Field(default=0.15, ge=0.0, le=1.0)
|
| 102 |
+
RISK_WEIGHT_BEHAVIOUR: float = Field(default=0.20, ge=0.0, le=1.0)
|
| 103 |
+
RISK_DELTA_BOOST_FACTOR: float = Field(
|
| 104 |
+
default=0.30,
|
| 105 |
+
ge=0.0,
|
| 106 |
+
le=1.0,
|
| 107 |
+
description="How strongly risk increases when per-chunk delta is positive"
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
# Optional LLM semantic verifier (second-layer, not primary classifier)
|
| 111 |
+
LLM_SEMANTIC_ENABLED: bool = Field(default=False)
|
| 112 |
+
LLM_PROVIDER: str = Field(default="openai", description="LLM provider: openai or gemini")
|
| 113 |
+
LLM_SEMANTIC_MODEL: str = Field(default="", description="Model name for selected LLM provider (optional)")
|
| 114 |
+
LLM_SEMANTIC_TIMEOUT_MS: int = Field(default=900, ge=100, le=5000)
|
| 115 |
+
LLM_SEMANTIC_MIN_ASR_CONFIDENCE: float = Field(default=0.35, ge=0.0, le=1.0)
|
| 116 |
+
LLM_SEMANTIC_CHUNK_INTERVAL: int = Field(default=2, ge=1, le=20)
|
| 117 |
+
LLM_SEMANTIC_BLEND_WEIGHT: float = Field(
|
| 118 |
+
default=0.20,
|
| 119 |
+
ge=0.0,
|
| 120 |
+
le=1.0,
|
| 121 |
+
description="Weight assigned to LLM semantic score in fused semantic score"
|
| 122 |
+
)
|
| 123 |
+
OPENAI_API_KEY: str | None = Field(default=None, description="Optional OpenAI API key for LLM semantic verifier")
|
| 124 |
+
GEMINI_API_KEY: str | None = Field(default=None, description="Optional Gemini API key for LLM semantic verifier")
|
| 125 |
+
|
| 126 |
+
# Session store backend
|
| 127 |
+
SESSION_STORE_BACKEND: str = Field(
|
| 128 |
+
default="memory",
|
| 129 |
+
description="Session store backend: memory or redis"
|
| 130 |
+
)
|
| 131 |
+
REDIS_URL: str | None = Field(
|
| 132 |
+
default=None,
|
| 133 |
+
description="Redis URL for session state and queue (required when SESSION_STORE_BACKEND=redis)"
|
| 134 |
+
)
|
| 135 |
+
REDIS_PREFIX: str = Field(
|
| 136 |
+
default="ai_call_shield",
|
| 137 |
+
description="Redis key prefix namespace"
|
| 138 |
+
)
|
| 139 |
+
REDIS_CONNECT_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)
|
| 140 |
+
REDIS_IO_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)
|
| 141 |
+
|
| 142 |
+
# Deep-lane async verification controls
|
| 143 |
+
DEEP_LANE_ENABLED: bool = Field(
|
| 144 |
+
default=False,
|
| 145 |
+
description="Enable asynchronous deep-lane verification after fast-lane decision"
|
| 146 |
+
)
|
| 147 |
+
DEEP_LANE_QUEUE_BACKEND: str = Field(
|
| 148 |
+
default="memory",
|
| 149 |
+
description="Queue backend: memory or redis"
|
| 150 |
+
)
|
| 151 |
+
DEEP_LANE_MAX_WORKERS: int = Field(default=2, ge=1, le=16)
|
| 152 |
+
DEEP_LANE_MAX_RETRIES: int = Field(default=1, ge=0, le=10)
|
| 153 |
+
DEEP_LANE_RETRY_BACKOFF_MS: int = Field(default=500, ge=0, le=60000)
|
| 154 |
+
DEEP_LANE_TARGET_LATENCY_MS: int = Field(default=3000, ge=200, le=10000)
|
| 155 |
+
|
| 156 |
+
# Performance targets (for harness/reporting and CI gates)
|
| 157 |
+
PERF_CHUNK_P95_TARGET_MS: int = Field(default=1200, ge=100, le=10000)
|
| 158 |
+
PERF_ALERT_P95_TARGET_MS: int = Field(default=2500, ge=100, le=10000)
|
| 159 |
+
|
| 160 |
+
# Session retention and privacy controls
|
| 161 |
+
SESSION_ACTIVE_RETENTION_SECONDS: int = Field(
|
| 162 |
+
default=1800,
|
| 163 |
+
description="Retention TTL for active sessions with no updates"
|
| 164 |
+
)
|
| 165 |
+
SESSION_ENDED_RETENTION_SECONDS: int = Field(
|
| 166 |
+
default=300,
|
| 167 |
+
description="Retention TTL for ended sessions before purge"
|
| 168 |
+
)
|
| 169 |
+
MASK_TRANSCRIPT_OUTPUT: bool = Field(
|
| 170 |
+
default=True,
|
| 171 |
+
description="Mask sensitive entities from transcript before returning response"
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# Environment Specific
|
| 175 |
+
SPACE_ID: str | None = Field(default=None, description="Hugging Face Space ID if running in Spaces")
|
| 176 |
+
|
| 177 |
+
model_config = {
|
| 178 |
+
"env_file": ".env",
|
| 179 |
+
"case_sensitive": True,
|
| 180 |
+
"extra": "ignore"
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# Global settings instance
|
| 185 |
+
settings = Settings()
|
fraud_language.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Keyword and semantic fraud signal extraction from transcripts.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import string
|
| 8 |
+
from typing import Any, Dict, List, Set
|
| 9 |
+
|
| 10 |
+
# Baseline keywords that are language-agnostic or commonly spoken in English/Hinglish.
# Maps fraud-signal category -> set of trigger keywords. Multi-word entries are
# matched as substrings of the normalized transcript; single words are matched
# against the transcript's token set (see _contains_keyword).
COMMON_FRAUD_KEYWORDS: Dict[str, Set[str]] = {
    # Mentions of accounts and financial products.
    "financial": {
        "bank account", "account", "credit card", "debit card", "loan", "khata",
    },
    # Payment rails and money-movement phrases common in India.
    "payment": {
        "upi", "upi id", "gpay", "google pay", "phonepe", "paytm", "neft", "rtgs",
        "send money", "transfer money", "payment",
    },
    # Requests for credentials or one-time secrets.
    "authentication": {
        "otp", "pin", "password", "cvv", "verification code", "passcode",
    },
    # Urgency/pressure language (includes Hinglish: abhi/turant/jaldi).
    "urgency": {
        "urgent", "immediately", "right now", "now", "last chance", "today only",
        "abhi", "turant", "jaldi",
    },
    # Coercive threats of account or legal consequences.
    "threat": {
        "blocked", "suspended", "legal action", "police", "arrest", "freeze",
    },
    # Authority-impersonation cues.
    "impersonation": {
        "rbi", "bank manager", "government", "income tax", "customs", "official",
    },
    # Incentive/lure vocabulary.
    "offer_lure": {
        "lottery", "prize", "winner", "cashback", "free", "reward",
    },
}
|
| 36 |
+
|
| 37 |
+
# Language-specific script and phrase variants to improve 5-language support.
# Same category scheme as COMMON_FRAUD_KEYWORDS; each top-level key is a
# language whose keywords are written in that language's native script.
# NOTE: the Telugu "urgency" entry previously contained mojibake
# (replacement characters in the word for "now"); restored to "ఇప్పుడే".
LANGUAGE_FRAUD_KEYWORDS: Dict[str, Dict[str, Set[str]]] = {
    "Hindi": {
        "financial": {"बैंक", "खाता", "अकाउंट", "लोन"},
        "payment": {"यूपीआई", "युपीआई", "भुगतान", "पैसे भेजो", "ट्रांसफर", "गूगल पे", "फोनपे", "पेटीएम"},
        "authentication": {"ओटीपी", "पिन", "पासवर्ड", "सत्यापन कोड"},
        "urgency": {"अभी", "तुरंत", "जल्दी", "फौरन", "अंतिम मौका"},
        "threat": {"ब्लॉक", "निलंबित", "कानूनी कार्रवाई", "गिरफ्तार", "फ्रीज"},
        "impersonation": {"आरबीआई", "सरकारी अधिकारी", "बैंक मैनेजर", "इनकम टैक्स"},
        "offer_lure": {"लॉटरी", "इनाम", "कैशबैक", "फ्री", "रिवॉर्ड"},
    },
    "Tamil": {
        "financial": {"வங்கி", "கணக்கு", "அக்கவுண்ட்", "கடன்"},
        "payment": {"யுபிஐ", "கூகுள் பே", "போன்பே", "பேடிஎம்", "பணம் அனுப்பு", "பணம் பரிமாற்றம்", "கட்டணம்"},
        "authentication": {"ஓடிபி", "பின்", "கடவுச்சொல்", "சரிபார்ப்பு குறியீடு"},
        "urgency": {"உடனே", "இப்போதே", "விரைவாக", "இப்போது", "அவசரம்"},
        "threat": {"முடக்கப்படும்", "தடைசெய்யப்படும்", "சட்ட நடவடிக்கை", "காவல்", "உறையவைக்கப்படும்"},
        "impersonation": {"ஆர்பிஐ", "அரசு அதிகாரி", "வங்கி மேலாளர்", "வருமானவரி"},
        "offer_lure": {"லாட்டரி", "பரிசு", "கேஷ்பேக்", "இலவசம்", "வெற்றி"},
    },
    "Malayalam": {
        "financial": {"ബാങ്ക്", "അക്കൗണ്ട്", "ഖാത", "ലോൺ"},
        "payment": {"യുപിഐ", "ഗൂഗിൾ പേ", "ഫോൺപേ", "പേടിഎം", "പണം അയക്കൂ", "പേയ്മെന്റ്", "ട്രാൻസ്ഫർ"},
        "authentication": {"ഒടിപി", "പിൻ", "പാസ്വേഡ്", "സ്ഥിരീകരണ കോഡ്"},
        "urgency": {"ഉടൻ", "ഇപ്പോള്", "തൽക്ഷണം", "വേഗം", "അവസരം"},
        "threat": {"ബ്ലോക്ക്", "സസ്പെൻഡ്", "നിയമ നടപടി", "അറസ്റ്റ്", "ഫ്രീസ്"},
        "impersonation": {"ആർബിഐ", "സർക്കാർ ഓഫീസർ", "ബാങ്ക് മാനേജർ", "ഇൻകം ടാക്സ്"},
        "offer_lure": {"ലോട്ടറി", "സമ്മാനം", "കാഷ്ബാക്ക്", "ഫ്രീ", "റിവാർഡ്"},
    },
    "Telugu": {
        "financial": {"బ్యాంక్", "ఖాతా", "అకౌంట్", "లోన్"},
        "payment": {"యూపీఐ", "గూగుల్ పే", "ఫోన్పే", "పేటిఎం", "డబ్బు పంపండి", "చెల్లింపు", "ట్రాన్స్ఫర్"},
        "authentication": {"ఓటిపి", "పిన్", "పాస్వర్డ్", "ధృవీకరణ కోడ్"},
        "urgency": {"వెంటనే", "ఇప్పుడే", "తక్షణం", "త్వరగా", "చివరి అవకాశం"},
        "threat": {"బ్లాక్", "సస్పెండ్", "చట్టపరమైన చర్య", "అరెస్ట్", "ఫ్రీజ్"},
        "impersonation": {"ఆర్బిఐ", "ప్రభుత్వ అధికారి", "బ్యాంక్ మేనేజర్", "ఇన్కమ్ ట్యాక్స్"},
        "offer_lure": {"లాటరీ", "బహుమతి", "క్యాష్బ్యాక్", "ఉచితం", "రివార్డు"},
    },
}
|
| 76 |
+
|
| 77 |
+
PUNCT_TRANSLATION = str.maketrans({ch: " " for ch in (string.punctuation + "“”‘’…–—।॥،؛")})
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _normalize_text(text: str) -> str:
|
| 81 |
+
"""
|
| 82 |
+
Normalize text while preserving non-Latin scripts.
|
| 83 |
+
|
| 84 |
+
We avoid ASCII-only regex stripping so Indic scripts remain searchable.
|
| 85 |
+
"""
|
| 86 |
+
normalized = text.casefold().translate(PUNCT_TRANSLATION)
|
| 87 |
+
normalized = re.sub(r"\s+", " ", normalized).strip()
|
| 88 |
+
return normalized
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _combined_keyword_catalog(language: str | None) -> Dict[str, Set[str]]:
    """Build the category -> keyword-set catalog for *language*.

    Starts from a copy of the common English/Hinglish keywords, then layers on
    either the requested language's script-specific keywords or, when the
    language is unknown/unsupported, every known language map so that
    mixed-language transcripts are still covered.
    """
    catalog: Dict[str, Set[str]] = {name: set(words) for name, words in COMMON_FRAUD_KEYWORDS.items()}

    selected = LANGUAGE_FRAUD_KEYWORDS.get(language) if language else None
    maps_to_apply = [selected] if selected is not None else list(LANGUAGE_FRAUD_KEYWORDS.values())

    for lang_map in maps_to_apply:
        for name, words in lang_map.items():
            catalog.setdefault(name, set()).update(words)

    return catalog
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _contains_keyword(normalized_text: str, token_set: Set[str], keyword: str) -> bool:
    """Return True when *keyword* appears in the normalized transcript.

    Single-word keywords are matched against the pre-split token set.
    Multi-word phrases are matched on whole-token boundaries: the haystack and
    needle are both space-padded so a phrase like "send money" no longer fires
    inside unrelated words (previously a raw substring test matched e.g.
    "resend money").
    """
    key = _normalize_text(keyword)
    if not key:
        return False
    if " " in key:
        # Space padding makes the phrase start and end on token boundaries of
        # the already-normalized (single-space-separated) text.
        return f" {key} " in f" {normalized_text} "
    return key in token_set
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _match_keywords(normalized_text: str, catalog: Dict[str, Set[str]]) -> Dict[str, List[str]]:
    """Return {category: sorted keyword hits} for every category with a match."""
    tokens = set(normalized_text.split())
    matches: Dict[str, List[str]] = {}

    for category, keywords in catalog.items():
        found = sorted(kw for kw in keywords if _contains_keyword(normalized_text, tokens, kw))
        if found:
            matches[category] = found

    return matches
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def analyze_transcript(transcript: str, language: str | None = None) -> Dict[str, Any]:
    """Extract keyword and semantic signals from transcript text.

    Args:
        transcript: Raw transcript text (any supported script).
        language: Optional language name ("Hindi", "Tamil", ...) used to pick
            the script-specific keyword map; when None, all maps are checked.

    Returns:
        Dict with keyword_hits (list of "category:keyword" strings),
        keyword_categories, keyword_score (0-100), semantic_flags,
        semantic_score (0-100), and behaviour_signals (sorted, de-duplicated).
    """
    # Empty/None transcript: return a zeroed result with the same shape.
    if not transcript:
        return {
            "keyword_hits": [],
            "keyword_categories": [],
            "keyword_score": 0,
            "semantic_flags": [],
            "semantic_score": 0,
            "behaviour_signals": [],
        }

    text = _normalize_text(transcript)
    category_hits = _match_keywords(text, _combined_keyword_catalog(language))

    # Flatten hits to "category:keyword", sorted by category for determinism.
    keyword_hits: List[str] = []
    for category, hits in sorted(category_hits.items()):
        keyword_hits.extend([f"{category}:{hit}" for hit in hits])

    categories = sorted(category_hits.keys())
    # Heuristic score: 7 points per hit + 12 per distinct category, capped at 100.
    keyword_score = min(100, len(keyword_hits) * 7 + len(categories) * 12)

    semantic_flags: List[str] = []
    behaviour_signals: List[str] = []

    has_urgency = "urgency" in category_hits
    has_impersonation = "impersonation" in category_hits
    has_auth = "authentication" in category_hits
    has_payment = "payment" in category_hits
    has_threat = "threat" in category_hits

    # Map each triggered category to a named semantic flag.
    if has_urgency:
        semantic_flags.append("urgency_language")
        behaviour_signals.append("urgency_escalation")
    if has_impersonation:
        semantic_flags.append("authority_impersonation")
    if has_auth:
        semantic_flags.append("credential_request")
    if has_payment:
        semantic_flags.append("payment_redirection")
    if has_threat:
        semantic_flags.append("coercive_threat_language")
    if "offer_lure" in category_hits:
        semantic_flags.append("incentive_lure")

    # Base semantic score (14 per flag), then boost for high-risk category
    # combinations; every step is re-capped at 100.
    semantic_score = min(100, len(semantic_flags) * 14)
    if has_impersonation and has_auth:
        semantic_score = min(100, semantic_score + 18)
        behaviour_signals.append("authority_with_credential_request")
    if has_payment and has_urgency:
        semantic_score = min(100, semantic_score + 14)
        behaviour_signals.append("urgent_payment_pressure")
    if has_threat and has_urgency:
        semantic_score = min(100, semantic_score + 10)
        behaviour_signals.append("threat_plus_urgency")

    return {
        "keyword_hits": keyword_hits,
        "keyword_categories": categories,
        "keyword_score": keyword_score,
        "semantic_flags": semantic_flags,
        "semantic_score": semantic_score,
        # De-duplicated and sorted for stable output across runs.
        "behaviour_signals": sorted(set(behaviour_signals)),
    }
|
llm_semantic_analyzer.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Optional LLM semantic verifier for realtime transcript analysis.
|
| 3 |
+
|
| 4 |
+
This is a second-layer signal meant for ambiguous/uncertain chunks.
|
| 5 |
+
It must never block realtime flow.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import re
|
| 12 |
+
from typing import Any, Dict, Optional
|
| 13 |
+
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
from config import settings
|
| 17 |
+
from privacy_utils import mask_sensitive_entities
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _clamp_int(value: Any, lo: int = 0, hi: int = 100) -> int:
|
| 23 |
+
try:
|
| 24 |
+
parsed = int(round(float(value)))
|
| 25 |
+
except (TypeError, ValueError):
|
| 26 |
+
return lo
|
| 27 |
+
return max(lo, min(hi, parsed))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _clamp_float(value: Any, lo: float = 0.0, hi: float = 1.0) -> float:
|
| 31 |
+
try:
|
| 32 |
+
parsed = float(value)
|
| 33 |
+
except (TypeError, ValueError):
|
| 34 |
+
return lo
|
| 35 |
+
return max(lo, min(hi, parsed))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
|
| 39 |
+
if not text:
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
text = text.strip()
|
| 43 |
+
try:
|
| 44 |
+
parsed = json.loads(text)
|
| 45 |
+
if isinstance(parsed, dict):
|
| 46 |
+
return parsed
|
| 47 |
+
except json.JSONDecodeError:
|
| 48 |
+
pass
|
| 49 |
+
|
| 50 |
+
match = re.search(r"\{[\s\S]*\}", text)
|
| 51 |
+
if not match:
|
| 52 |
+
return None
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
parsed = json.loads(match.group(0))
|
| 56 |
+
return parsed if isinstance(parsed, dict) else None
|
| 57 |
+
except json.JSONDecodeError:
|
| 58 |
+
return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _resolve_provider() -> str:
    """Return the configured LLM provider, normalized to 'openai' or 'gemini'."""
    raw = getattr(settings, "LLM_PROVIDER", "openai") or "openai"
    normalized = str(raw).strip().lower()
    return "gemini" if normalized in ("gemini", "google") else "openai"
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _resolve_model(provider: str) -> str:
    """Return the configured semantic model name, or a per-provider default."""
    override = str(getattr(settings, "LLM_SEMANTIC_MODEL", "") or "").strip()
    if override:
        return override
    return "gemini-1.5-flash" if provider == "gemini" else "gpt-4o-mini"
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _provider_api_key(provider: str) -> Optional[str]:
    """Return the API key for *provider* from settings, or None when unset."""
    attr = "GEMINI_API_KEY" if provider == "gemini" else "OPENAI_API_KEY"
    return getattr(settings, attr, None)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def is_llm_semantic_provider_ready() -> bool:
    """Return True when the selected LLM provider has an API key configured."""
    return bool(_provider_api_key(_resolve_provider()))
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _normalized_response(data: Dict[str, Any], model_name: str, engine_name: str) -> Dict[str, Any]:
    """Coerce a raw LLM JSON payload into the analyzer's normalized response.

    Non-list fields are dropped to empty lists, scores are clamped to their
    valid ranges, and falsy list items are filtered out.
    """

    def _string_list(key: str) -> list[str]:
        # Tolerate missing keys, None, and wrong types from the model output.
        values = data.get(key) or []
        if not isinstance(values, list):
            return []
        return [str(item) for item in values if item]

    return {
        "available": True,
        "semantic_score": _clamp_int(data.get("semantic_score", 0)),
        "confidence": _clamp_float(data.get("confidence", 0.0)),
        "semantic_flags": _string_list("semantic_flags"),
        "behaviour_signals": _string_list("behaviour_signals"),
        "keyword_hints": _string_list("keyword_hints"),
        "model": model_name,
        "engine": engine_name,
    }
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _build_prompts(language: str, safe_transcript: str) -> tuple[str, str]:
|
| 114 |
+
system_prompt = (
|
| 115 |
+
"You are a telecom fraud intent classifier. "
|
| 116 |
+
"Return ONLY strict JSON with keys: "
|
| 117 |
+
"semantic_score (0-100), confidence (0-1), semantic_flags (string[]), "
|
| 118 |
+
"behaviour_signals (string[]), keyword_hints (string[])."
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
user_prompt = (
|
| 122 |
+
f"Language: {language}\n"
|
| 123 |
+
"Task: detect coercion, impersonation, credential request, and payment pressure.\n"
|
| 124 |
+
f"Transcript: {safe_transcript}"
|
| 125 |
+
)
|
| 126 |
+
return system_prompt, user_prompt
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _call_openai_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """Call the OpenAI Chat Completions API and normalize the JSON reply.

    Raises httpx.HTTPStatusError on non-2xx responses; the caller
    (analyze_semantic_with_llm) wraps this in a broad try/except.
    """
    # temperature=0 plus response_format=json_object nudges the model toward a
    # deterministic, machine-parseable reply.
    payload = {
        "model": model_name,
        "temperature": 0,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }

    response = client.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=payload,
    )
    response.raise_for_status()
    data = response.json()
    # Chained .get() with defaults so missing keys degrade to "". NOTE(review):
    # an explicitly empty "choices" list would still raise IndexError here; it
    # is caught by the caller's try/except.
    content = (
        data.get("choices", [{}])[0]
        .get("message", {})
        .get("content", "")
    )
    parsed = _extract_json_object(content)
    if parsed is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(parsed, model_name=model_name, engine_name="openai-chat-completions")
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _call_gemini_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """Call the Gemini generateContent REST API and normalize the JSON reply.

    Gemini v1beta has no separate system role here, so the system and user
    prompts are concatenated into a single user part. Raises
    httpx.HTTPStatusError on non-2xx responses (caught by the caller).
    """
    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": f"{system_prompt}\n\n{user_prompt}"},
                ],
            }
        ],
        # temperature=0 and a JSON MIME type request deterministic,
        # machine-parseable output.
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
        },
    }

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"
    # API key is passed as a query parameter per the Generative Language API.
    response = client.post(url, params={"key": api_key}, json=payload)
    response.raise_for_status()
    data = response.json()

    # Chained .get() with defaults so missing keys degrade to "". NOTE(review):
    # explicitly empty "candidates"/"parts" lists would still raise IndexError;
    # the caller's broad try/except absorbs that.
    content = (
        data.get("candidates", [{}])[0]
        .get("content", {})
        .get("parts", [{}])[0]
        .get("text", "")
    )
    parsed = _extract_json_object(content)
    if parsed is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(parsed, model_name=model_name, engine_name="gemini-generate-content")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def analyze_semantic_with_llm(transcript: str, language: str, timeout_ms: Optional[int] = None) -> Dict[str, Any]:
    """
    Analyze transcript semantics via an optional LLM.

    Returns a normalized dict with `available` bool and semantic fields.
    Never raises: any network/runtime failure is converted into
    {"available": False, "reason": ...} so the realtime flow is never blocked.

    Args:
        transcript: Raw transcript text (will be masked before leaving the box).
        language: Language name forwarded to the prompt.
        timeout_ms: Optional override for the request timeout; defaults to
            settings.LLM_SEMANTIC_TIMEOUT_MS.
    """
    # Feature flag: semantic verification is opt-in.
    if not settings.LLM_SEMANTIC_ENABLED:
        return {"available": False, "reason": "disabled"}

    # Skip very short chunks — not enough signal to justify an LLM call.
    if not transcript or len(transcript.strip()) < 8:
        return {"available": False, "reason": "insufficient_transcript"}

    provider = _resolve_provider()
    api_key = _provider_api_key(provider)
    if not api_key:
        return {"available": False, "reason": f"missing_{provider}_api_key"}

    # Privacy: mask sensitive entities before the text leaves the service.
    safe_transcript = mask_sensitive_entities(transcript).strip()
    if not safe_transcript:
        return {"available": False, "reason": "empty_after_masking"}

    # Floor of 0.1s guards against zero/negative configured timeouts.
    timeout_seconds = max(0.1, (timeout_ms or settings.LLM_SEMANTIC_TIMEOUT_MS) / 1000.0)
    model_name = _resolve_model(provider)
    system_prompt, user_prompt = _build_prompts(language, safe_transcript)

    try:
        with httpx.Client(timeout=timeout_seconds) as client:
            if provider == "openai":
                return _call_openai_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )
            if provider == "gemini":
                return _call_gemini_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )

        # Defensive: _resolve_provider only yields the two handled values,
        # so this is unreachable unless a new provider is added.
        return {"available": False, "reason": "unsupported_provider"}
    except Exception as exc:  # pragma: no cover - network/runtime dependent
        # Deliberately broad: this verifier is best-effort and must never
        # interrupt the realtime pipeline.
        logger.warning("LLM semantic verifier unavailable (%s): %s", provider, exc)
        return {"available": False, "reason": "request_failed"}
|
main.py
ADDED
|
@@ -0,0 +1,1903 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for AI-Generated Voice Detection.
|
| 3 |
+
|
| 4 |
+
Endpoint: POST /api/voice-detection
|
| 5 |
+
- Accepts Base64-encoded MP3 audio
|
| 6 |
+
- Returns classification (AI_GENERATED or HUMAN) with confidence score
|
| 7 |
+
"""
|
| 8 |
+
import logging
|
| 9 |
+
import asyncio
|
| 10 |
+
import uuid
|
| 11 |
+
import time
|
| 12 |
+
import json
|
| 13 |
+
import io
|
| 14 |
+
from dataclasses import dataclass, field, asdict
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
from typing import Optional, Any, Dict, List
|
| 17 |
+
from contextlib import asynccontextmanager
|
| 18 |
+
import numpy as np
|
| 19 |
+
from fastapi import FastAPI, HTTPException, Request, Depends, WebSocket, WebSocketDisconnect
|
| 20 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
+
from fastapi.responses import JSONResponse
|
| 22 |
+
from pydantic import BaseModel, Field, field_validator, ValidationError
|
| 23 |
+
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 24 |
+
from slowapi.util import get_remote_address
|
| 25 |
+
from slowapi.errors import RateLimitExceeded
|
| 26 |
+
|
| 27 |
+
# Configure logging
|
| 28 |
+
logging.basicConfig(
|
| 29 |
+
level=logging.INFO,
|
| 30 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 31 |
+
)
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
# Rate limiting
|
| 35 |
+
limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
|
| 36 |
+
|
| 37 |
+
from audio_utils import decode_base64_audio, load_audio_from_bytes
|
| 38 |
+
from model import analyze_voice, AnalysisResult
|
| 39 |
+
from speech_to_text import transcribe_audio
|
| 40 |
+
from fraud_language import analyze_transcript
|
| 41 |
+
from llm_semantic_analyzer import analyze_semantic_with_llm, is_llm_semantic_provider_ready
|
| 42 |
+
from privacy_utils import mask_sensitive_entities, sanitize_for_logging
|
| 43 |
+
from config import settings
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
import redis # type: ignore
|
| 47 |
+
except Exception: # pragma: no cover - optional dependency
|
| 48 |
+
redis = None
|
| 49 |
+
|
| 50 |
+
# Computed constraints
|
| 51 |
+
MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@dataclass
class SessionState:
    """In-memory state for a real-time analysis session (derived data only).

    No raw audio or full transcripts are persisted here beyond the last
    normalized transcript used for repetition detection.
    """
    session_id: str
    language: str
    started_at: str  # ISO-8601 UTC timestamp set at session creation
    status: str = "active"  # "active" or "ended" (drives retention TTL)
    chunks_processed: int = 0
    alerts_triggered: int = 0
    max_risk_score: int = 0
    max_cpi: float = 0.0  # peak Conversational Pressure Index seen so far
    final_call_label: str = "UNCERTAIN"
    final_voice_classification: str = "UNCERTAIN"
    final_voice_confidence: float = 0.0
    max_voice_ai_confidence: float = 0.0
    voice_ai_chunks: int = 0
    voice_human_chunks: int = 0
    llm_checks_performed: int = 0
    risk_policy_version: str = settings.RISK_POLICY_VERSION
    risk_history: List[int] = field(default_factory=list)
    # Counts keyed by normalized transcript text (repetition detection).
    transcript_counts: Dict[str, int] = field(default_factory=dict)
    semantic_flag_counts: Dict[str, int] = field(default_factory=dict)
    keyword_category_counts: Dict[str, int] = field(default_factory=dict)
    behaviour_score: int = 0
    session_behaviour_signals: List[str] = field(default_factory=list)
    last_transcript: str = ""  # normalized form of the most recent chunk
    last_update: Optional[str] = None  # ISO-8601 UTC of last chunk processed
    alert_history: List[Dict[str, Any]] = field(default_factory=list)
    llm_last_engine: Optional[str] = None
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Process-local session store, used when the redis backend is not active.
SESSION_STORE: Dict[str, SessionState] = {}
SESSION_LOCK = asyncio.Lock()
# Active backend name ("memory" or "redis"); set by initialize_session_store_backend().
SESSION_STORE_BACKEND_ACTIVE = "memory"
REDIS_CLIENT: Any = None
# In-flight ASR tasks, tracked so concurrency can be bounded (see transcribe_audio_guarded).
ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
ASR_INFLIGHT_LOCK = asyncio.Lock()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def use_redis_session_store() -> bool:
    """Return whether redis-backed session store is active."""
    backend_is_redis = SESSION_STORE_BACKEND_ACTIVE == "redis"
    client_present = REDIS_CLIENT is not None
    return backend_is_redis and client_present
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def initialize_session_store_backend() -> None:
    """Initialize configured session backend with safe fallback to memory.

    Reads settings.SESSION_STORE_BACKEND; anything other than "redis"
    (including a missing/empty value) selects the in-memory store. The
    redis path verifies connectivity with a ping before committing, so a
    misconfigured or unreachable redis never leaves the app half-wired.
    Mutates module globals SESSION_STORE_BACKEND_ACTIVE and REDIS_CLIENT.
    """
    global SESSION_STORE_BACKEND_ACTIVE, REDIS_CLIENT

    configured = str(getattr(settings, "SESSION_STORE_BACKEND", "memory") or "memory").strip().lower()
    if configured != "redis":
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        logger.info("Session store backend: memory")
        return

    # redis is an optional dependency; fall back rather than fail startup.
    if redis is None:
        logger.warning("Redis backend requested but redis package is not installed. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    redis_url = getattr(settings, "REDIS_URL", None)
    if not redis_url:
        logger.warning("Redis backend requested but REDIS_URL is empty. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    try:
        REDIS_CLIENT = redis.Redis.from_url(
            redis_url,
            decode_responses=True,
            # Settings are in milliseconds; redis-py expects seconds, floor at 100ms.
            socket_connect_timeout=max(0.1, float(settings.REDIS_CONNECT_TIMEOUT_MS) / 1000.0),
            socket_timeout=max(0.1, float(settings.REDIS_IO_TIMEOUT_MS) / 1000.0),
        )
        # Fail fast if the server is unreachable before declaring redis active.
        REDIS_CLIENT.ping()
        SESSION_STORE_BACKEND_ACTIVE = "redis"
        logger.info("Session store backend: redis")
    except Exception as exc:
        logger.warning("Failed to initialize redis session store (%s). Falling back to memory store.", exc)
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _session_redis_key(session_id: str) -> str:
    """Build the namespaced redis key for one session."""
    return "{}:session:{}".format(settings.REDIS_PREFIX, session_id)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _serialize_session(session: SessionState) -> str:
    """Serialize a session dataclass to compact JSON for redis storage."""
    payload = asdict(session)
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _deserialize_session(raw: Optional[str]) -> Optional[SessionState]:
    """Rebuild a SessionState from its JSON form; returns None on any failure."""
    if not raw:
        return None
    try:
        payload = json.loads(raw)
    except Exception as exc:
        logger.warning("Failed to deserialize session payload: %s", exc)
        return None
    if not isinstance(payload, dict):
        return None
    try:
        return SessionState(**payload)
    except Exception as exc:
        logger.warning("Failed to deserialize session payload: %s", exc)
        return None
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def get_session_state(session_id: str) -> Optional[SessionState]:
    """Fetch session state from active backend.

    Returns None when the session does not exist (or, for redis, when the
    key expired or its payload failed to deserialize).
    """
    if use_redis_session_store():
        raw = REDIS_CLIENT.get(_session_redis_key(session_id))
        return _deserialize_session(raw)
    return SESSION_STORE.get(session_id)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def save_session_state(session: SessionState) -> None:
    """Persist session state to active backend.

    For redis, the key TTL is refreshed on every save from the session's
    status-specific retention policy, so stale sessions self-expire.
    """
    if use_redis_session_store():
        ttl_seconds = max(1, int(session_retention_seconds(session)))
        REDIS_CLIENT.set(_session_redis_key(session.session_id), _serialize_session(session), ex=ttl_seconds)
        return
    SESSION_STORE[session.session_id] = session
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def delete_session_state(session_id: str) -> None:
    """Delete session from active backend (no-op if absent)."""
    if not use_redis_session_store():
        SESSION_STORE.pop(session_id, None)
        return
    REDIS_CLIENT.delete(_session_redis_key(session_id))
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _asr_fallback_result(engine: str) -> Dict[str, Any]:
|
| 186 |
+
return {
|
| 187 |
+
"transcript": "",
|
| 188 |
+
"confidence": 0.0,
|
| 189 |
+
"engine": engine,
|
| 190 |
+
"available": False,
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _discard_asr_task(task: asyncio.Task) -> None:
    """Done-callback: drop a finished ASR task from the in-flight set."""
    ASR_INFLIGHT_TASKS.discard(task)
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
async def transcribe_audio_guarded(
    audio: np.ndarray,
    sr: int,
    language: str,
    timeout_seconds: float,
    request_id: str,
) -> Dict[str, Any]:
    """Run ASR with timeout and bounded in-flight tasks to avoid thread pileups.

    Args:
        audio: mono audio samples (assumed float array at sample rate ``sr`` —
            shape/dtype not validated here; verify at call sites).
        sr: sample rate of ``audio``.
        language: language hint forwarded to transcribe_audio.
        timeout_seconds: per-request wait budget; the underlying task keeps
            running past this (it is shielded) but the caller gets a fallback.
        request_id: correlation id for log lines.

    Returns:
        transcribe_audio's result dict, or an _asr_fallback_result with
        engine "busy"/"timeout"/"error" when the ASR path is skipped.
    """
    max_inflight = max(1, int(getattr(settings, "ASR_MAX_INFLIGHT_TASKS", 1)))

    async with ASR_INFLIGHT_LOCK:
        # Prune completed tasks whose done-callback may not have fired yet.
        stale_tasks = [task for task in ASR_INFLIGHT_TASKS if task.done()]
        for stale in stale_tasks:
            ASR_INFLIGHT_TASKS.discard(stale)

        # At capacity: skip ASR entirely rather than queueing more threads.
        if len(ASR_INFLIGHT_TASKS) >= max_inflight:
            logger.warning(
                "[%s] Realtime ASR skipped (inflight=%s, max=%s); continuing without transcript",
                request_id,
                len(ASR_INFLIGHT_TASKS),
                max_inflight,
            )
            return _asr_fallback_result("busy")

    asr_task = asyncio.create_task(asyncio.to_thread(transcribe_audio, audio, sr, language))
    ASR_INFLIGHT_TASKS.add(asr_task)
    asr_task.add_done_callback(_discard_asr_task)

    try:
        # shield() keeps the thread-backed task alive on timeout (it cannot be
        # interrupted anyway); it stays counted in ASR_INFLIGHT_TASKS until done.
        return await asyncio.wait_for(asyncio.shield(asr_task), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        logger.warning(
            "[%s] Realtime ASR timed out after %.0fms; continuing without transcript",
            request_id,
            timeout_seconds * 1000,
        )
        return _asr_fallback_result("timeout")
    except Exception as exc:
        logger.warning("[%s] Realtime ASR path failed: %s; continuing without transcript", request_id, exc)
        return _asr_fallback_result("error")
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def warmup_audio_pipeline() -> None:
    """Warm audio decoding stack to reduce first-request latency spikes.

    Encodes one second of silence to WAV and runs it through
    load_audio_from_bytes so codec/resampler code paths are imported and
    JIT-warm before the first real request. Best-effort: failures only log.
    """
    if not settings.AUDIO_PIPELINE_WARMUP_ENABLED:
        return
    try:
        import soundfile as sf

        warm_audio = np.zeros(16000, dtype=np.float32)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, warm_audio, 16000, format="WAV", subtype="PCM_16")
        load_audio_from_bytes(wav_buffer.getvalue(), 22050, "wav")
        logger.info("Audio pipeline warm-up complete")
    except Exception as exc:
        logger.warning("Audio pipeline warm-up skipped: %s", exc)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def warmup_asr_pipeline() -> None:
    """Warm ASR model and transcription path on startup.

    Runs one second of silence through transcribe_audio so model weights
    are loaded before the first realtime chunk. Best-effort: failures only log.
    """
    if not settings.ASR_ENABLED or not settings.ASR_WARMUP_ENABLED:
        return
    try:
        warm_audio = np.zeros(16000, dtype=np.float32)
        transcribe_audio(warm_audio, 16000, "English")
        logger.info("ASR warm-up complete")
    except Exception as exc:
        logger.warning("ASR warm-up skipped: %s", exc)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def warmup_voice_pipeline() -> None:
    """Run one inference pass to avoid first realtime-model cold latency spike.

    Feeds one second of a quiet 220 Hz tone through analyze_voice.
    Best-effort: failures only log.
    """
    if not settings.VOICE_WARMUP_ENABLED:
        return
    try:
        sr = 16000
        duration_sec = 1.0
        sample_count = max(1, int(sr * duration_sec))
        t = np.linspace(0.0, duration_sec, sample_count, endpoint=False, dtype=np.float32)
        # Non-silent tone avoids edge-case feature paths and mirrors short speech chunks.
        warm_audio = (0.08 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)
        analyze_voice(warm_audio, sr, "English", True)
        logger.info("Voice model warm-up complete")
    except Exception as exc:
        logger.warning("Voice model warm-up skipped: %s", exc)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def run_startup_warmups() -> None:
    """Run non-critical startup warm-ups for latency-sensitive paths.

    Order matters: audio decoding first, then the voice model, then ASR.
    """
    warmups = (warmup_audio_pipeline, warmup_voice_pipeline, warmup_asr_pipeline)
    for warmup in warmups:
        warmup()
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
# Detect environment: SPACE_ID is populated when running on Hugging Face Spaces.
if settings.SPACE_ID:
    logger.info(f"Running on HuggingFace Spaces: {settings.SPACE_ID}")
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifespan events.

    Startup: selects the session-store backend, preloads the ML model, and
    runs latency warm-ups off the event loop. Each step is best-effort so a
    warm-up failure never blocks the server from serving requests.
    """
    logger.info("Starting up - preloading ML model...")
    initialize_session_store_backend()
    try:
        from model import preload_model
        preload_model()
        logger.info("ML model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to preload model: {e}")

    try:
        # Warm-ups run in a worker thread to keep startup responsive.
        await asyncio.to_thread(run_startup_warmups)
    except Exception as exc:
        logger.warning("Startup warm-ups encountered an issue: %s", exc)

    yield
    # Shutdown
    logger.info("Shutting down...")
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
from fastapi.responses import RedirectResponse

# Initialize FastAPI app with lifespan
app = FastAPI(
    title="AI Voice Detection API",
    description="Detects whether a voice sample is AI-generated or spoken by a real human",
    version="1.0.0",
    contact={
        "name": "Shivam",
        "url": settings.WEBSITE_URL,
    },
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# Add rate limiter to app state so slowapi decorators can find it.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# Middleware configuration
# CORS
# Note: Set ALLOWED_ORIGINS env var in production
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["Content-Type", "x-api-key", "Authorization"],
)
|
| 349 |
+
|
| 350 |
+
# Request Logging & Timing Middleware
|
| 351 |
+
# Request Logging & Timing Middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Attach a short request id, time the request, and add security headers.

    Only POST requests get start/end log lines to keep log volume low; all
    responses get X-Request-ID / X-Response-Time plus hardening headers.
    """
    # Generate request ID and start timer
    request_id = str(uuid.uuid4())[:8]
    request.state.request_id = request_id
    start_time = time.perf_counter()

    # Log request start
    method = request.method
    path = request.url.path
    if method == "POST":
        logger.info(f"[{request_id}] [START] {method} {path}")

    # Process request (async)
    response = await call_next(request)

    # Calculate duration
    duration_ms = (time.perf_counter() - start_time) * 1000
    status_code = response.status_code

    # Log request completion with timing
    if method == "POST":
        status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
        logger.info(f"[{request_id}] {status_label} END {method} {path} -> {status_code} ({duration_ms:.0f}ms)")

    # Add headers
    response.headers["X-Request-ID"] = request_id
    response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
    response.headers["X-Content-Type-Options"] = "nosniff"
    # Allow embedding in Hugging Face iframe
    # response.headers["X-Frame-Options"] = "DENY"
    response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    # Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
    response.headers["Content-Security-Policy"] = (
        "default-src 'self'; "
        "script-src 'self' 'unsafe-inline' 'unsafe-eval' https://cdn.jsdelivr.net; "
        "style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net https://fonts.googleapis.com; "
        "font-src 'self' https://fonts.gstatic.com; "
        "img-src 'self' data: https://fastapi.tiangolo.com;"
    )
    return response
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
# Request/Response Models
|
| 395 |
+
class VoiceDetectionRequest(BaseModel):
    """Request body for voice detection."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
    audioFormat: str = Field(default="mp3", description="Audio format (must be mp3)")
    audioBase64: str = Field(..., description="Base64-encoded MP3 audio")

    @field_validator('audioBase64')
    @classmethod
    def validate_audio_size(cls, v: str) -> str:
        """Validate audio data is not too small or too large.

        Bounds are checked on the base64 string length, which is ~4/3 of the
        decoded byte size (see MAX_AUDIO_BASE64_LENGTH).
        """
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
class ForensicMetrics(BaseModel):
    """Detailed forensic analysis metrics (all on a 0-100 scale)."""
    authenticity_score: float = Field(..., description="Overall voice naturalness score (0-100)")
    pitch_naturalness: float = Field(..., description="Pitch stability and jitter score (0-100)")
    spectral_naturalness: float = Field(..., description="Spectral entropy and flatness score (0-100)")
    temporal_naturalness: float = Field(..., description="Rhythm and silence score (0-100)")
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
class VoiceDetectionResponse(BaseModel):
    """Successful response from voice detection."""
    status: str = "success"
    language: str
    classification: str  # AI_GENERATED or HUMAN
    confidenceScore: float = Field(..., ge=0.0, le=1.0)
    explanation: str  # human-readable reasoning for the classification
    forensic_metrics: Optional[ForensicMetrics] = None
    modelUncertain: bool = False  # true when confidence is too low to trust
    recommendedAction: Optional[str] = None
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
class ErrorResponse(BaseModel):
    """Error response envelope returned for failed requests."""
    status: str = "error"
    message: str
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
class SessionStartRequest(BaseModel):
    """Request body for creating a real-time analysis session."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
class SessionStartResponse(BaseModel):
    """Response body after creating a session."""
    status: str = "success"
    session_id: str  # opaque id used for subsequent chunk/summary calls
    language: str
    started_at: str  # ISO-8601 UTC timestamp
    message: str
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
class SessionChunkRequest(BaseModel):
    """Audio chunk request for real-time analysis."""
    audioFormat: str = Field(default="mp3", description="Audio format (must be one of supported formats)")
    audioBase64: str = Field(..., description="Base64-encoded audio chunk")
    language: Optional[str] = Field(default=None, description="Optional override. Defaults to session language")

    @field_validator("audioBase64")
    @classmethod
    def validate_chunk_size(cls, v: str) -> str:
        """Reject chunks outside the accepted base64 size window."""
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
class RiskEvidence(BaseModel):
    """Model evidence used to produce risk score, grouped by detector."""
    audio_patterns: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    behaviour: List[str] = Field(default_factory=list)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
class RealTimeLanguageAnalysis(BaseModel):
    """Transcript and language risk signals for the current chunk."""
    transcript: str = ""
    transcript_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    asr_engine: str = "unavailable"  # which ASR path produced the transcript
    keyword_hits: List[str] = Field(default_factory=list)
    keyword_categories: List[str] = Field(default_factory=list)
    semantic_flags: List[str] = Field(default_factory=list)
    # Sub-scores are clamped to 0-100 like the fused risk score.
    keyword_score: int = Field(default=0, ge=0, le=100)
    semantic_score: int = Field(default=0, ge=0, le=100)
    behaviour_score: int = Field(default=0, ge=0, le=100)
    session_behaviour_signals: List[str] = Field(default_factory=list)
    llm_semantic_used: bool = False  # true when the LLM semantic check ran
    llm_semantic_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    llm_semantic_model: Optional[str] = None
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
class RealTimeAlert(BaseModel):
    """Alert details emitted by the risk engine.

    The optional fields are populated only when ``triggered`` is True.
    """
    triggered: bool
    alert_type: Optional[str] = None
    severity: Optional[str] = None
    reason_summary: Optional[str] = None
    recommended_action: Optional[str] = None
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
class ExplainabilitySignal(BaseModel):
    """Per-signal contribution to fused risk score (weighted_score = raw_score * weight)."""
    signal: str
    raw_score: int = Field(..., ge=0, le=100)
    weight: float = Field(..., ge=0.0, le=1.0)
    weighted_score: float = Field(..., ge=0.0, le=100.0)
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
class RealTimeExplainability(BaseModel):
    """Human-readable explainability block for chunk risk output."""
    summary: str
    top_indicators: List[str] = Field(default_factory=list)
    signal_contributions: List[ExplainabilitySignal] = Field(default_factory=list)
    uncertainty_note: Optional[str] = None  # set when the model flags low confidence
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
class RealTimeUpdateResponse(BaseModel):
    """Chunk-by-chunk update response."""
    status: str = "success"
    session_id: str
    timestamp: str  # ISO-8601 UTC of this update
    risk_score: int = Field(..., ge=0, le=100)
    cpi: float = Field(..., ge=0.0, le=100.0, description="Conversational Pressure Index")
    risk_level: str
    call_label: str
    model_uncertain: bool = False
    voice_classification: str = "UNCERTAIN"
    voice_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    evidence: RiskEvidence
    language_analysis: RealTimeLanguageAnalysis
    alert: RealTimeAlert
    explainability: RealTimeExplainability
    chunks_processed: int = Field(..., ge=1)
    risk_policy_version: str = settings.RISK_POLICY_VERSION
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
class SessionSummaryResponse(BaseModel):
|
| 538 |
+
"""Summary response for a completed or active session."""
|
| 539 |
+
status: str = "success"
|
| 540 |
+
session_id: str
|
| 541 |
+
language: str
|
| 542 |
+
session_status: str
|
| 543 |
+
started_at: str
|
| 544 |
+
last_update: Optional[str] = None
|
| 545 |
+
chunks_processed: int = 0
|
| 546 |
+
alerts_triggered: int = 0
|
| 547 |
+
max_risk_score: int = 0
|
| 548 |
+
max_cpi: float = 0.0
|
| 549 |
+
final_call_label: str = "UNCERTAIN"
|
| 550 |
+
final_voice_classification: str = "UNCERTAIN"
|
| 551 |
+
final_voice_confidence: float = 0.0
|
| 552 |
+
max_voice_ai_confidence: float = 0.0
|
| 553 |
+
voice_ai_chunks: int = 0
|
| 554 |
+
voice_human_chunks: int = 0
|
| 555 |
+
llm_checks_performed: int = 0
|
| 556 |
+
risk_policy_version: str = settings.RISK_POLICY_VERSION
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
class AlertHistoryItem(BaseModel):
    """One alert event emitted during session analysis."""
    timestamp: str  # ISO-8601 UTC when the alert fired
    risk_score: int = Field(..., ge=0, le=100)
    risk_level: str
    call_label: str
    alert_type: str
    severity: str
    reason_summary: str
    recommended_action: str
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
class AlertHistoryResponse(BaseModel):
    """Paginated alert history for one session."""
    status: str = "success"
    session_id: str
    total_alerts: int  # total count, which may exceed len(alerts) when paginated
    alerts: List[AlertHistoryItem] = Field(default_factory=list)
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
class RetentionPolicyResponse(BaseModel):
    """Explicit privacy and retention behavior for session processing."""
    status: str = "success"
    raw_audio_storage: str = "not_persisted"  # raw audio is never stored
    active_session_retention_seconds: int
    ended_session_retention_seconds: int
    stored_derived_fields: List[str]  # see STORED_DERIVED_FIELDS
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix."""
    stamp = datetime.now(timezone.utc).isoformat()
    return stamp.replace("+00:00", "Z")
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
# SessionState fields that are persisted as derived data (exposed by the
# retention-policy endpoint). Raw audio and full transcripts are excluded.
STORED_DERIVED_FIELDS = [
    "risk_history",
    "behaviour_score",
    "session_behaviour_signals",
    "transcript_counts",
    "semantic_flag_counts",
    "keyword_category_counts",
    "chunks_processed",
    "alerts_triggered",
    "max_risk_score",
    "final_call_label",
    "voice_ai_chunks",
    "voice_human_chunks",
    "max_voice_ai_confidence",
    "final_voice_classification",
    "llm_checks_performed",
]
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def parse_iso_timestamp(value: Optional[str]) -> Optional[float]:
    """Convert an ISO-8601 timestamp (optionally 'Z'-suffixed) to epoch seconds.

    Returns None for None input or an unparseable string.
    """
    if value is None:
        return None
    normalized = value.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
    except ValueError:
        return None
    return parsed.timestamp()
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def session_reference_timestamp(session: SessionState) -> Optional[float]:
    """Return the best available timestamp for retention checks.

    Prefers last_update, falling back to started_at. Uses an explicit
    ``is not None`` check instead of ``or`` so a legitimate epoch-0
    timestamp (0.0, which is falsy) is not incorrectly skipped.
    """
    reference = parse_iso_timestamp(session.last_update)
    if reference is not None:
        return reference
    return parse_iso_timestamp(session.started_at)
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
def session_retention_seconds(session: SessionState) -> int:
    """Resolve retention TTL (seconds) from the session's status."""
    ended = session.status == "ended"
    return (
        settings.SESSION_ENDED_RETENTION_SECONDS
        if ended
        else settings.SESSION_ACTIVE_RETENTION_SECONDS
    )
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
def is_session_expired(session: SessionState, now_ts: Optional[float] = None) -> bool:
    """Check if a session exceeded its status-specific retention TTL.

    Sessions whose timestamps cannot be parsed are treated as not expired.
    """
    reference_ts = session_reference_timestamp(session)
    if reference_ts is None:
        return False
    if now_ts is None:
        now_ts = time.time()
    age_seconds = now_ts - reference_ts
    return age_seconds > session_retention_seconds(session)
|
| 641 |
+
|
| 642 |
+
|
| 643 |
+
def purge_expired_sessions(now_ts: Optional[float] = None) -> int:
    """Best-effort retention purge for stale sessions (memory backend).

    Args:
        now_ts: epoch seconds to treat as "now" (defaults to time.time()).

    Returns:
        Number of sessions deleted (always 0 for the redis backend).
    """
    if use_redis_session_store():
        # Redis keys self-expire by TTL; no in-process purge needed.
        return 0

    current = now_ts if now_ts is not None else time.time()
    # Collect ids first so we never mutate SESSION_STORE while iterating it.
    expired_ids = [sid for sid, state in SESSION_STORE.items() if is_session_expired(state, current)]
    for expired_id in expired_ids:
        delete_session_state(expired_id)
    return len(expired_ids)
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
def validate_supported_language(language: str) -> None:
    """Validate supported language.

    Raises:
        HTTPException: 400 with an ErrorResponse-shaped detail when the
            language is not in settings.SUPPORTED_LANGUAGES (case-sensitive).
    """
    if language not in settings.SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=400,
            detail={
                "status": "error",
                "message": f"Unsupported language. Must be one of: {', '.join(settings.SUPPORTED_LANGUAGES)}"
            }
        )
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
def validate_supported_format(audio_format: str) -> None:
    """Validate supported audio format (case-insensitive).

    Raises:
        HTTPException: 400 with an ErrorResponse-shaped detail when the
            lowercased format is not in settings.SUPPORTED_FORMATS.
    """
    normalized = audio_format.lower()
    if normalized not in settings.SUPPORTED_FORMATS:
        raise HTTPException(
            status_code=400,
            detail={
                "status": "error",
                "message": f"Unsupported audio format. Must be one of: {', '.join(settings.SUPPORTED_FORMATS)}"
            }
        )
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
def normalize_transcript_for_behavior(transcript: str) -> str:
    """Normalize transcript for repetition and trend analysis.

    Lowercases, replaces every non-alphanumeric/non-space character with a
    space, and collapses runs of whitespace to single spaces.
    """
    pieces = []
    for ch in transcript.lower():
        pieces.append(ch if ch.isalnum() or ch.isspace() else " ")
    collapsed = "".join(pieces).split()
    return " ".join(collapsed)
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def token_overlap_ratio(text_a: str, text_b: str) -> float:
    """Compute Jaccard overlap between the whitespace-token sets of two strings.

    Returns 0.0 when either side has no tokens.
    """
    tokens_a = set(text_a.split())
    tokens_b = set(text_b.split())
    if not (tokens_a and tokens_b):
        return 0.0
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
|
| 695 |
+
|
| 696 |
+
|
| 697 |
+
def dedupe_preserve_order(items: List[str]) -> List[str]:
    """Return unique string items while preserving first-seen order.

    dict.fromkeys keeps insertion order (guaranteed since Python 3.7), so
    this is equivalent to the manual seen-set loop.
    """
    return list(dict.fromkeys(items))
|
| 707 |
+
|
| 708 |
+
def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Update session-level behaviour score from transcript and semantic trends.

    Mutates ``session`` in place: accumulates per-flag / per-category counters,
    tracks transcript repetition, and recomputes ``session.behaviour_score``
    (0-100) plus ``session.session_behaviour_signals``. Returns a small dict
    snapshot of both fields for merging back into the chunk's language result.

    Args:
        session: Mutable per-call session state (counters, last transcript).
        language_analysis: Chunk-level analysis; reads ``transcript_raw`` (or
            ``transcript`` as fallback), ``semantic_flags`` and
            ``keyword_categories``.

    Returns:
        Dict with ``behaviour_score`` and ``session_behaviour_signals``.
    """
    # Prefer the unmasked transcript so masking never hides repetition.
    transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
    transcript = normalize_transcript_for_behavior(transcript_source)
    semantic_flags = list(language_analysis.get("semantic_flags", []))
    keyword_categories = list(language_analysis.get("keyword_categories", []))

    # Accumulate session-lifetime counters before deriving any signals,
    # so this chunk's own flags count toward the thresholds below.
    for flag in semantic_flags:
        session.semantic_flag_counts[flag] = session.semantic_flag_counts.get(flag, 0) + 1
    for category in keyword_categories:
        session.keyword_category_counts[category] = session.keyword_category_counts.get(category, 0) + 1

    behavior_signals: List[str] = []

    if transcript:
        # Exact-repeat detection via normalized-transcript counts.
        count = session.transcript_counts.get(transcript, 0) + 1
        session.transcript_counts[transcript] = count
        if count >= 2:
            behavior_signals.append("repetition_loop")
        # Near-repeat detection: high token overlap with the previous chunk
        # (only for transcripts long enough to be meaningful).
        if session.last_transcript:
            overlap = token_overlap_ratio(transcript, session.last_transcript)
            if overlap >= 0.75 and len(transcript.split()) >= 4:
                behavior_signals.append("repetition_loop")
        session.last_transcript = transcript

    urgency_count = session.semantic_flag_counts.get("urgency_language", 0)
    if urgency_count >= 2:
        behavior_signals.append("sustained_urgency")

    # Presence flags over the whole session, not just this chunk.
    has_impersonation = session.semantic_flag_counts.get("authority_impersonation", 0) > 0
    has_credentials = session.semantic_flag_counts.get("credential_request", 0) > 0
    has_payment = session.semantic_flag_counts.get("payment_redirection", 0) > 0
    has_threat = session.semantic_flag_counts.get("coercive_threat_language", 0) > 0
    has_urgency = urgency_count > 0

    # High-risk signal combinations (classic social-engineering patterns).
    if has_impersonation and has_credentials:
        behavior_signals.append("impersonation_plus_credential_request")
    if has_payment and has_urgency:
        behavior_signals.append("persistent_payment_pressure")
    if has_threat and has_urgency:
        behavior_signals.append("repeated_threat_urgency")

    # Two or more keyword categories seen at least twice each.
    repeated_categories = sum(1 for count in session.keyword_category_counts.values() if count >= 2)
    if repeated_categories >= 2:
        behavior_signals.append("repeated_fraud_categories")

    # Deduplicate ("repetition_loop" can be appended twice) and keep a
    # deterministic order for scoring and output.
    behavior_signals = sorted(set(behavior_signals))

    # Additive score; each signal contributes a fixed (or capped, scaled)
    # amount, clamped to 0-100 at the end.
    score = 0
    if "repetition_loop" in behavior_signals:
        max_repetition = max(session.transcript_counts.values()) if session.transcript_counts else 2
        score += 25 + min(15, (max_repetition - 2) * 5)
    if "sustained_urgency" in behavior_signals:
        score += 15 + min(10, (urgency_count - 2) * 5)
    if "impersonation_plus_credential_request" in behavior_signals:
        score += 30
    if "persistent_payment_pressure" in behavior_signals:
        score += 20
    if "repeated_threat_urgency" in behavior_signals:
        score += 15
    if "repeated_fraud_categories" in behavior_signals:
        score += 10

    session.behaviour_score = max(0, min(100, score))
    session.session_behaviour_signals = behavior_signals

    return {
        "behaviour_score": session.behaviour_score,
        "session_behaviour_signals": session.session_behaviour_signals,
    }
|
| 778 |
+
|
| 779 |
+
|
| 780 |
+
def map_score_to_level(score: int) -> str:
    """Map a numeric risk score (0-100) to a coarse risk level.

    Bands: <35 LOW, <60 MEDIUM, <80 HIGH, otherwise CRITICAL.
    """
    for threshold, level in ((35, "LOW"), (60, "MEDIUM"), (80, "HIGH")):
        if score < threshold:
            return level
    return "CRITICAL"
|
| 789 |
+
|
| 790 |
+
|
| 791 |
+
def map_level_to_label(risk_level: str, model_uncertain: bool) -> str:
    """Map a risk level to a user-friendly call label.

    Model uncertainty always wins and yields "UNCERTAIN"; otherwise LOW maps
    to "SAFE", MEDIUM to "SPAM", and everything else to "FRAUD".
    """
    if model_uncertain:
        return "UNCERTAIN"
    return {"LOW": "SAFE", "MEDIUM": "SPAM"}.get(risk_level, "FRAUD")
|
| 800 |
+
|
| 801 |
+
|
| 802 |
+
def recommendation_for_level(risk_level: str, model_uncertain: bool) -> str:
    """Return a user action recommendation for the given severity.

    Uncertainty takes precedence over the level-specific messages; unknown
    or LOW levels fall through to the benign default message.
    """
    if model_uncertain:
        return "Model uncertainty detected. Avoid sharing OTP/PIN and verify caller via official channel."
    by_level = {
        "CRITICAL": "High fraud risk. End the call and verify through an official support number.",
        "HIGH": "Fraud indicators detected. Do not share OTP, PIN, passwords, or UPI credentials.",
        "MEDIUM": "Suspicious call behavior detected. Verify caller identity before taking action.",
    }
    return by_level.get(risk_level, "No high-risk fraud indicators detected in current chunk.")
|
| 813 |
+
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def should_invoke_llm_semantic(
    provisional_scored: Dict[str, Any],
    transcript: str,
    transcript_confidence: float,
    next_chunk_index: int,
) -> bool:
    """Gate optional LLM semantic calls for ambiguous/uncertain chunks.

    The call is allowed only when the feature is enabled, a provider is
    ready, the transcript is substantial enough (>= 8 chars) and confidently
    transcribed, the chunk falls on the configured interval, and the
    provisional result is either in the ambiguous 35-79 risk band or the
    model flagged itself as uncertain.
    """
    text = transcript.strip()
    preconditions = (
        settings.LLM_SEMANTIC_ENABLED
        and is_llm_semantic_provider_ready()
        and bool(text)
        and len(text) >= 8
        and transcript_confidence >= settings.LLM_SEMANTIC_MIN_ASR_CONFIDENCE
    )
    if not preconditions:
        return False

    # Rate-limit: after the first chunk, only every Nth chunk is eligible.
    interval = max(1, settings.LLM_SEMANTIC_CHUNK_INTERVAL)
    if next_chunk_index > 1 and next_chunk_index % interval != 0:
        return False

    score = int(provisional_scored.get("risk_score", 0))
    uncertain = bool(provisional_scored.get("model_uncertain", False))
    return uncertain or (35 <= score < 80)
|
| 843 |
+
|
| 844 |
+
|
| 845 |
+
def normalize_voice_classification(classification: str, model_uncertain: bool) -> str:
    """Normalize a realtime voice-authenticity classification.

    Uncertain chunks always report "UNCERTAIN". Otherwise any value other
    than the two recognized labels (case-insensitively) collapses to "HUMAN".
    """
    if model_uncertain:
        return "UNCERTAIN"
    candidate = str(classification or "HUMAN").upper()
    return candidate if candidate in ("AI_GENERATED", "HUMAN") else "HUMAN"
|
| 853 |
+
|
| 854 |
+
def build_explainability_payload(
    risk_level: str,
    call_label: str,
    model_uncertain: bool,
    cpi: float,
    audio_score: int,
    keyword_score: int,
    semantic_score: int,
    behaviour_score: int,
    has_language_signals: bool,
    behaviour_signals: List[str],
    keyword_hits: List[str],
    acoustic_anomaly: float,
) -> RealTimeExplainability:
    """Build explicit explainability signals and a concise summary.

    Produces per-channel weighted contributions, a short list of the top
    indicators, a human-readable summary sentence list, and an optional
    uncertainty note.

    NOTE(review): the display weights below are hardcoded (0.45/0.20/0.15/0.20)
    while the actual scoring in build_risk_update reads settings.RISK_WEIGHT_* —
    confirm these are intended to stay in sync.
    """
    # When no language signal exists the assessment is audio-only, so audio
    # carries the full weight in the reported contributions.
    if has_language_signals:
        weights = {
            "audio": 0.45,
            "keywords": 0.20,
            "semantic": 0.15,
            "behaviour": 0.20,
        }
    else:
        weights = {
            "audio": 1.00,
            "keywords": 0.00,
            "semantic": 0.00,
            "behaviour": 0.00,
        }

    # One entry per fusion channel: raw score, its weight, and the product.
    contributions = [
        ExplainabilitySignal(
            signal="audio",
            raw_score=audio_score,
            weight=weights["audio"],
            weighted_score=round(audio_score * weights["audio"], 2),
        ),
        ExplainabilitySignal(
            signal="keywords",
            raw_score=keyword_score,
            weight=weights["keywords"],
            weighted_score=round(keyword_score * weights["keywords"], 2),
        ),
        ExplainabilitySignal(
            signal="semantic_intent",
            raw_score=semantic_score,
            weight=weights["semantic"],
            weighted_score=round(semantic_score * weights["semantic"], 2),
        ),
        ExplainabilitySignal(
            signal="behaviour",
            raw_score=behaviour_score,
            weight=weights["behaviour"],
            weighted_score=round(behaviour_score * weights["behaviour"], 2),
        ),
    ]

    # Top indicators: acoustic anomaly flag first, then behaviour signals,
    # then up to 3 keyword hits; dedupe preserving order and cap at 6.
    indicators: List[str] = []
    if acoustic_anomaly >= 60:
        indicators.append("acoustic_anomaly_detected")
    indicators.extend(behaviour_signals)
    indicators.extend(keyword_hits[:3])
    deduped_indicators = list(dict.fromkeys(indicators))[:6]

    # Build the summary incrementally; each threshold appends one sentence.
    summary_parts: List[str] = [
        f"{risk_level.title()} risk classified as {call_label}."
    ]
    summary_parts.append(f"CPI at {cpi:.1f}/100.")
    if acoustic_anomaly >= 60:
        summary_parts.append("Audio anomalies are materially elevated.")
    if keyword_score >= 45:
        summary_parts.append("Fraud-related keywords contribute to the score.")
    if semantic_score >= 45:
        summary_parts.append("Semantic coercion patterns were detected.")
    if behaviour_score >= 40:
        summary_parts.append("Session behavior trend increases risk.")
    if cpi >= 70:
        summary_parts.append("Pressure escalation velocity is high; early warning triggered.")
    if not has_language_signals:
        summary_parts.append("Assessment is currently audio-dominant.")

    uncertainty_note = None
    if model_uncertain:
        uncertainty_note = (
            "Model confidence is limited for this chunk. Treat this result conservatively and verify through trusted channels."
        )

    return RealTimeExplainability(
        summary=" ".join(summary_parts),
        top_indicators=deduped_indicators,
        signal_contributions=contributions,
        uncertainty_note=uncertainty_note,
    )
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
def build_risk_update(
    result_features: Dict[str, float],
    classification: str,
    confidence: float,
    language_analysis: Dict[str, Any],
    previous_score: Optional[int],
    llm_semantic: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build risk score, evidence and alert from model outputs and session trend.

    Fuses four channels (audio, keywords, semantic intent, behaviour) into a
    0-100 risk score, derives a conversational pressure index (CPI), decides
    whether an alert fires, and packages evidence, language analysis and
    explainability into response models.

    Args:
        result_features: Numeric features from the voice model (authenticity,
            anomaly, fallback flags); missing keys get conservative defaults.
        classification: Voice model label (e.g. "AI_GENERATED" / "HUMAN").
        confidence: Voice model confidence in [0, 1].
        language_analysis: Keyword/semantic/behaviour scores plus transcript
            metadata produced earlier in the pipeline.
        previous_score: Risk score of the previous chunk, or None for the first.
        llm_semantic: Optional LLM semantic result to blend in when available.

    Returns:
        Dict with risk_score, cpi, risk_level, call_label, model_uncertain,
        and response-model objects under "evidence", "language_analysis",
        "alert" and "explainability".
    """
    authenticity = float(result_features.get("authenticity_score", 50.0))
    acoustic_anomaly = float(result_features.get("acoustic_anomaly_score", 0.0))
    ml_fallback = bool(result_features.get("ml_fallback", 0.0))
    realtime_heuristic_mode = bool(result_features.get("realtime_heuristic_mode", 0.0))
    normalized_classification = str(classification or "").upper()
    # Uncertain when a non-AI verdict is low-confidence AND no language
    # channel contributes anything at all.
    low_confidence_uncertain = bool(
        normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.65
        and int(language_analysis.get("keyword_score", 0)) == 0
        and int(language_analysis.get("semantic_score", 0)) == 0
        and int(language_analysis.get("behaviour_score", 0)) == 0
    )
    # Heuristic-only mode demands higher confidence before trusting a
    # non-AI verdict.
    heuristic_uncertain = bool(
        realtime_heuristic_mode
        and normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.90
    )
    model_uncertain = ml_fallback or low_confidence_uncertain or heuristic_uncertain
    keyword_score = int(language_analysis.get("keyword_score", 0))
    semantic_score = int(language_analysis.get("semantic_score", 0))
    behaviour_score = int(language_analysis.get("behaviour_score", 0))
    keyword_hits = dedupe_preserve_order(list(language_analysis.get("keyword_hits", [])))
    behavior_from_language = dedupe_preserve_order(list(language_analysis.get("behaviour_signals", [])))
    behavior_from_session = dedupe_preserve_order(list(language_analysis.get("session_behaviour_signals", [])))
    keyword_categories = dedupe_preserve_order(list(language_analysis.get("keyword_categories", [])))
    semantic_flags = dedupe_preserve_order(list(language_analysis.get("semantic_flags", [])))
    transcript = str(language_analysis.get("transcript", "")).strip()

    # Optional LLM blend: mixes the LLM semantic score with the local one by
    # LLM_SEMANTIC_BLEND_WEIGHT and merges its hints/flags/signals.
    llm_semantic_used = False
    llm_semantic_confidence = 0.0
    llm_semantic_model: Optional[str] = None
    if llm_semantic and llm_semantic.get("available"):
        blend_weight = max(0.0, min(1.0, settings.LLM_SEMANTIC_BLEND_WEIGHT))
        llm_score = int(max(0, min(100, llm_semantic.get("semantic_score", semantic_score))))
        semantic_score = int(round((semantic_score * (1.0 - blend_weight)) + (llm_score * blend_weight)))
        llm_semantic_confidence = float(max(0.0, min(1.0, llm_semantic.get("confidence", 0.0))))
        llm_semantic_model = str(llm_semantic.get("model") or settings.LLM_SEMANTIC_MODEL)
        llm_semantic_used = True

        # LLM keyword hints also bump the keyword score (capped +18).
        keyword_hints = dedupe_preserve_order([str(x) for x in llm_semantic.get("keyword_hints", [])])
        if keyword_hints:
            keyword_hits = dedupe_preserve_order(keyword_hits + keyword_hints)
            keyword_score = min(100, keyword_score + min(18, len(keyword_hints) * 6))

        llm_flags = dedupe_preserve_order([str(x) for x in llm_semantic.get("semantic_flags", [])])
        if llm_flags:
            semantic_flags = dedupe_preserve_order(semantic_flags + llm_flags)

        llm_behaviour = dedupe_preserve_order([str(x) for x in llm_semantic.get("behaviour_signals", [])])
        if llm_behaviour:
            behavior_from_language = dedupe_preserve_order(behavior_from_language + llm_behaviour)

    # Audio signal risk.
    if classification == "AI_GENERATED":
        # AI verdict: take the larger of model confidence and scaled anomaly.
        audio_score = max(
            int(round(confidence * 100)),
            int(max(0.0, min(100.0, acoustic_anomaly * 0.85))),
        )
    else:
        # Human/other: risk grows as authenticity drops below 50 or anomaly rises.
        authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
        anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * 0.90)))
        audio_score = max(authenticity_audio_score, anomaly_audio_score)

    # Weighted fusion only when any language channel contributes; otherwise
    # the audio channel alone decides the base score.
    has_language_signals = bool(transcript) or keyword_score > 0 or semantic_score > 0 or behaviour_score > 0
    if has_language_signals:
        raw_weights = {
            "audio": settings.RISK_WEIGHT_AUDIO,
            "keywords": settings.RISK_WEIGHT_KEYWORD,
            "semantic": settings.RISK_WEIGHT_SEMANTIC,
            "behaviour": settings.RISK_WEIGHT_BEHAVIOUR,
        }
        total_weight = sum(raw_weights.values())
        # Guard against zero/negative configured weights.
        if total_weight <= 0:
            raw_weights = {"audio": 0.45, "keywords": 0.20, "semantic": 0.15, "behaviour": 0.20}
            total_weight = 1.0
        normalized = {k: v / total_weight for k, v in raw_weights.items()}

        base_score = int(
            round(
                (audio_score * normalized["audio"])
                + (keyword_score * normalized["keywords"])
                + (semantic_score * normalized["semantic"])
                + (behaviour_score * normalized["behaviour"])
            )
        )
    else:
        base_score = audio_score

    # Model fallback forces at least a mid-range score (conservative floor).
    if ml_fallback:
        base_score = max(base_score, 55)

    risk_score = max(0, min(100, base_score))
    behaviour_signals: List[str] = list(behavior_from_language) + list(behavior_from_session)

    # Threshold-derived signals from the individual channels.
    if keyword_score >= 60:
        behaviour_signals.append("keyword_cluster_detected")
    if semantic_score >= 60:
        behaviour_signals.append("semantic_coercion_detected")
    if behaviour_score >= 40:
        behaviour_signals.append("behaviour_risk_elevated")
    if acoustic_anomaly >= 60:
        behaviour_signals.append("acoustic_anomaly_detected")

    # Trend signals versus the previous chunk's score.
    if previous_score is not None:
        delta = risk_score - previous_score
        if delta >= 15:
            behaviour_signals.append("rapid_risk_escalation")
        if risk_score >= 70 and previous_score >= 70:
            behaviour_signals.append("sustained_high_risk")
    else:
        delta = 0

    # Positive momentum boosts the score slightly.
    if delta > 0:
        risk_score = min(100, risk_score + int(delta * settings.RISK_DELTA_BOOST_FACTOR))

    # Conversational Pressure Index: first chunk uses behaviour+semantic only;
    # later chunks also reward upward score deltas.
    if previous_score is None:
        cpi = min(100.0, max(0.0, (behaviour_score * 0.35) + (semantic_score * 0.20)))
    else:
        cpi = min(
            100.0,
            max(
                0.0,
                (max(0, delta) * 3.2)
                + (behaviour_score * 0.35)
                + (semantic_score * 0.15),
            ),
        )
    if cpi >= 70:
        behaviour_signals.append("cpi_spike_detected")

    behaviour_signals = dedupe_preserve_order(behaviour_signals)
    risk_level = map_score_to_level(risk_score)
    call_label = map_level_to_label(risk_level, model_uncertain)

    # Machine-readable evidence strings for the audio channel.
    audio_patterns = [
        f"classification:{classification.lower()}",
        f"model_confidence:{confidence:.2f}",
        f"authenticity_score:{authenticity:.1f}",
        f"acoustic_anomaly_score:{acoustic_anomaly:.1f}",
        f"audio_score:{audio_score}",
    ]
    if ml_fallback:
        audio_patterns.append("model_fallback:true")
    audio_patterns = dedupe_preserve_order(audio_patterns)

    # Any of these behaviour signals is alone sufficient to trigger an alert.
    strong_intent = {
        "authority_with_credential_request",
        "urgent_payment_pressure",
        "threat_plus_urgency",
        "impersonation_plus_credential_request",
        "persistent_payment_pressure",
        "repeated_threat_urgency",
    }
    alert_triggered = (
        risk_level in {"HIGH", "CRITICAL"}
        or "rapid_risk_escalation" in behaviour_signals
        or cpi >= 70
        or any(signal in behaviour_signals for signal in strong_intent)
    )
    alert_type = None
    severity = None
    reason_summary = None
    recommended_action = None

    if alert_triggered:
        # Alert type priority: CRITICAL > CPI spike > escalation > plain HIGH.
        if risk_level == "CRITICAL":
            alert_type = "FRAUD_RISK_CRITICAL"
        elif cpi >= 70:
            alert_type = "EARLY_PRESSURE_WARNING"
        elif "rapid_risk_escalation" in behaviour_signals:
            alert_type = "RISK_ESCALATION"
        else:
            alert_type = "FRAUD_RISK_HIGH"
        severity = risk_level.lower()
        # Human-readable reason list, joined into a single sentence summary.
        reasons: List[str] = []
        if keyword_hits:
            reasons.append("fraud keywords detected")
        if semantic_score >= 45:
            reasons.append("coercive intent patterns detected")
        if behaviour_score >= 40:
            reasons.append("session behavior risk elevated")
        if "repetition_loop" in behaviour_signals:
            reasons.append("repetition loop detected")
        if "rapid_risk_escalation" in behaviour_signals:
            reasons.append("risk escalated rapidly across chunks")
        if cpi >= 70:
            reasons.append("conversational pressure index spiked")
        if not reasons:
            reasons.append("high-risk audio pattern detected")
        reason_summary = ". ".join(reasons).capitalize() + "."
        recommended_action = recommendation_for_level(risk_level, model_uncertain)

    explainability = build_explainability_payload(
        risk_level=risk_level,
        call_label=call_label,
        model_uncertain=model_uncertain,
        cpi=cpi,
        audio_score=audio_score,
        keyword_score=keyword_score,
        semantic_score=semantic_score,
        behaviour_score=behaviour_score,
        has_language_signals=has_language_signals,
        behaviour_signals=behaviour_signals,
        keyword_hits=keyword_hits,
        acoustic_anomaly=acoustic_anomaly,
    )

    return {
        "risk_score": risk_score,
        "cpi": round(cpi, 1),
        "risk_level": risk_level,
        "call_label": call_label,
        "model_uncertain": model_uncertain,
        "evidence": RiskEvidence(
            audio_patterns=audio_patterns,
            keywords=keyword_hits,
            behaviour=behaviour_signals
        ),
        "language_analysis": RealTimeLanguageAnalysis(
            transcript=transcript,
            transcript_confidence=float(language_analysis.get("transcript_confidence", 0.0)),
            asr_engine=str(language_analysis.get("asr_engine", "unavailable")),
            keyword_hits=keyword_hits,
            keyword_categories=keyword_categories,
            semantic_flags=semantic_flags,
            keyword_score=keyword_score,
            semantic_score=semantic_score,
            behaviour_score=behaviour_score,
            session_behaviour_signals=behavior_from_session,
            llm_semantic_used=llm_semantic_used,
            llm_semantic_confidence=llm_semantic_confidence,
            llm_semantic_model=llm_semantic_model,
        ),
        "alert": RealTimeAlert(
            triggered=alert_triggered,
            alert_type=alert_type,
            severity=severity,
            reason_summary=reason_summary,
            recommended_action=recommended_action
        ),
        "explainability": explainability,
    }
|
| 1200 |
+
|
| 1201 |
+
|
| 1202 |
+
async def process_audio_chunk(
    session_id: str,
    chunk_request: SessionChunkRequest,
    default_language: str,
    request_id: str
) -> RealTimeUpdateResponse:
    """Decode, analyze and score a real-time audio chunk.

    Pipeline: validate -> base64 decode -> audio load -> voice-authenticity
    model (with conservative fallback) -> ASR -> language analysis ->
    optional gated LLM semantic check -> session update -> scored response.

    The session is locked twice: once for a read-only snapshot used for
    provisional scoring / LLM gating, and once to commit the final scoring
    and counters. The LLM call itself runs outside the lock.

    Args:
        session_id: Identifier of an existing, active realtime session.
        chunk_request: Base64 audio payload and optional per-chunk language.
        default_language: Session language used when the chunk has none.
        request_id: Correlation id for log lines.

    Raises:
        HTTPException: 400 (unsupported language/format), 404 (session
            missing/expired), 409 (session not active).
    """
    # Per-chunk language override falls back to the session default.
    chunk_language = chunk_request.language or default_language
    validate_supported_language(chunk_language)
    validate_supported_format(chunk_request.audioFormat)

    # Approximate decoded size from base64 length (3/4 ratio) for logging.
    audio_size_kb = len(chunk_request.audioBase64) * 3 / 4 / 1024
    logger.info(
        f"[{request_id}] Realtime chunk: session={session_id}, language={chunk_language}, "
        f"format={chunk_request.audioFormat}, size~{audio_size_kb:.1f}KB"
    )

    # CPU-bound decode/load work runs off the event loop.
    decode_start = time.perf_counter()
    audio_bytes = await asyncio.to_thread(decode_base64_audio, chunk_request.audioBase64)
    decode_ms = (time.perf_counter() - decode_start) * 1000

    load_start = time.perf_counter()
    audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, chunk_request.audioFormat)
    load_ms = (time.perf_counter() - load_start) * 1000

    duration_sec = len(audio) / sr
    logger.info(
        f"[{request_id}] Realtime analyze {duration_sec:.2f}s (decode {decode_ms:.0f}ms, load {load_ms:.0f}ms)"
    )

    # Voice-authenticity model. On any failure, fall back to a neutral
    # HUMAN/0.5 result flagged ml_fallback so scoring treats it as uncertain
    # rather than failing the whole chunk.
    analyze_start = time.perf_counter()
    try:
        analysis_result = await asyncio.to_thread(analyze_voice, audio, sr, chunk_language, True)
    except Exception as exc:
        logger.warning("[%s] Realtime model path failed: %s; using conservative fallback", request_id, exc)
        analysis_result = AnalysisResult(
            classification="HUMAN",
            confidence_score=0.5,
            explanation="Realtime model path unavailable; conservative fallback applied.",
            features={
                "ml_fallback": 1.0,
                "authenticity_score": 50.0,
                "pitch_naturalness": 50.0,
                "spectral_naturalness": 50.0,
                "temporal_naturalness": 50.0,
                "acoustic_anomaly_score": 50.0,
            },
        )
    analyze_ms = (time.perf_counter() - analyze_start) * 1000
    logger.info(
        f"[{request_id}] Realtime result: {analysis_result.classification} "
        f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
    )

    # ASR with a hard timeout so a slow transcription cannot stall the chunk.
    asr_start = time.perf_counter()
    asr_timeout_seconds = max(0.1, float(settings.ASR_TIMEOUT_MS) / 1000.0)
    asr_result = await transcribe_audio_guarded(
        audio=audio,
        sr=sr,
        language=chunk_language,
        timeout_seconds=asr_timeout_seconds,
        request_id=request_id,
    )
    asr_ms = (time.perf_counter() - asr_start) * 1000
    raw_transcript = str(asr_result.get("transcript", ""))
    # The client-facing transcript is optionally PII-masked; the raw text is
    # still used internally for keyword/semantic/behaviour analysis.
    response_transcript = (
        mask_sensitive_entities(raw_transcript)
        if settings.MASK_TRANSCRIPT_OUTPUT
        else raw_transcript
    )
    language_result = analyze_transcript(raw_transcript, chunk_language)
    language_result["transcript_raw"] = raw_transcript
    language_result["transcript"] = response_transcript
    language_result["transcript_confidence"] = asr_result.get("confidence", 0.0)
    language_result["asr_engine"] = asr_result.get("engine", "unavailable")
    transcript_preview = sanitize_for_logging(raw_transcript, max_chars=90)
    logger.info(
        f"[{request_id}] Realtime ASR: engine={language_result['asr_engine']}, "
        f"confidence={language_result['transcript_confidence']:.2f}, "
        f"text_len={len(raw_transcript)}, preview='{transcript_preview}', asr={asr_ms:.0f}ms"
    )

    # Read-only session snapshot for scoring and optional LLM gating.
    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "Session not found or expired"}
            )
        if session.status != "active":
            raise HTTPException(
                status_code=409,
                detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
            )
        previous_score_snapshot = session.risk_history[-1] if session.risk_history else None
        next_chunk_index = session.chunks_processed + 1

    # Provisional score (without LLM input) used only to decide whether the
    # optional LLM semantic check is worth invoking.
    provisional_scored = build_risk_update(
        analysis_result.features or {},
        analysis_result.classification,
        analysis_result.confidence_score,
        language_result,
        previous_score_snapshot,
    )

    llm_semantic: Optional[Dict[str, Any]] = None
    llm_invoked = should_invoke_llm_semantic(
        provisional_scored=provisional_scored,
        transcript=raw_transcript,
        transcript_confidence=float(language_result.get("transcript_confidence", 0.0)),
        next_chunk_index=next_chunk_index,
    )
    if llm_invoked:
        # Blocking provider call runs in a worker thread, outside the lock.
        llm_semantic = await asyncio.to_thread(
            analyze_semantic_with_llm,
            raw_transcript,
            chunk_language,
            settings.LLM_SEMANTIC_TIMEOUT_MS,
        )

    # Commit phase: re-validate the session (it may have expired or been
    # stopped while the lock was released), then score and mutate state.
    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "Session not found or expired"}
            )
        if session.status != "active":
            raise HTTPException(
                status_code=409,
                detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
            )

        if llm_invoked:
            session.llm_checks_performed += 1
            if llm_semantic and llm_semantic.get("available"):
                session.llm_last_engine = str(llm_semantic.get("engine", "openai-chat-completions"))
            else:
                reason = str((llm_semantic or {}).get("reason", "unavailable"))
                session.llm_last_engine = f"skipped:{reason}"

        # Session behaviour trend must be updated before final scoring so
        # this chunk's behaviour score is included in the fusion.
        behaviour_snapshot = update_session_behaviour_state(session, language_result)
        language_result.update(behaviour_snapshot)
        previous_score = session.risk_history[-1] if session.risk_history else None
        scored = build_risk_update(
            analysis_result.features or {},
            analysis_result.classification,
            analysis_result.confidence_score,
            language_result,
            previous_score,
            llm_semantic=llm_semantic,
        )

        voice_classification = normalize_voice_classification(
            analysis_result.classification,
            scored["model_uncertain"],
        )
        voice_confidence = float(max(0.0, min(1.0, analysis_result.confidence_score)))

        session.chunks_processed += 1
        session.last_update = utc_now_iso()
        session.risk_history.append(scored["risk_score"])
        # The final call label tracks the highest-risk chunk; the comparison
        # runs before max_risk_score is raised so ties also refresh the label.
        if scored["risk_score"] >= session.max_risk_score:
            session.final_call_label = scored["call_label"]
        session.max_risk_score = max(session.max_risk_score, scored["risk_score"])
        session.max_cpi = max(session.max_cpi, float(scored["cpi"]))

        # Per-classification chunk counters ("UNCERTAIN" increments neither).
        if voice_classification == "AI_GENERATED":
            session.voice_ai_chunks += 1
            session.max_voice_ai_confidence = max(session.max_voice_ai_confidence, voice_confidence)
        elif voice_classification == "HUMAN":
            session.voice_human_chunks += 1

        session.final_voice_classification = voice_classification
        session.final_voice_confidence = voice_confidence

        if scored["alert"].triggered:
            alert_obj = scored["alert"]
            alert_entry = {
                "timestamp": session.last_update,
                "risk_score": scored["risk_score"],
                "risk_level": scored["risk_level"],
                "call_label": scored["call_label"],
                "alert_type": alert_obj.alert_type or "FRAUD_RISK_HIGH",
                "severity": alert_obj.severity or scored["risk_level"].lower(),
                "reason_summary": alert_obj.reason_summary or "Fraud indicators detected.",
                "recommended_action": alert_obj.recommended_action
                or recommendation_for_level(scored["risk_level"], scored["model_uncertain"]),
            }

            # Collapse consecutive identical alerts: refresh the timestamp
            # and keep the highest score instead of appending a duplicate.
            last_alert = session.alert_history[-1] if session.alert_history else None
            duplicate_keys = ("alert_type", "severity", "reason_summary", "recommended_action", "call_label", "risk_level")
            is_duplicate = bool(
                last_alert
                and all(last_alert.get(key) == alert_entry.get(key) for key in duplicate_keys)
            )

            if is_duplicate:
                last_alert["timestamp"] = session.last_update
                last_alert["risk_score"] = max(int(last_alert.get("risk_score", 0)), scored["risk_score"])
            else:
                session.alerts_triggered += 1
                session.alert_history.append(alert_entry)
                # Bound the in-memory alert history.
                if len(session.alert_history) > 100:
                    session.alert_history = session.alert_history[-100:]

        save_session_state(session)

    return RealTimeUpdateResponse(
        status="success",
        session_id=session_id,
        timestamp=session.last_update,
        risk_score=scored["risk_score"],
        cpi=scored["cpi"],
        risk_level=scored["risk_level"],
        call_label=scored["call_label"],
        model_uncertain=scored["model_uncertain"],
        voice_classification=voice_classification,
        voice_confidence=voice_confidence,
        evidence=scored["evidence"],
        language_analysis=scored["language_analysis"],
        alert=scored["alert"],
        explainability=scored["explainability"],
        chunks_processed=session.chunks_processed,
        risk_policy_version=settings.RISK_POLICY_VERSION,
    )
|
| 1431 |
+
|
| 1432 |
+
|
| 1433 |
+
def session_to_summary(session: SessionState) -> SessionSummaryResponse:
    """Snapshot a real-time session's state into a SessionSummaryResponse.

    Floating-point aggregates are rounded for presentation (CPI to one
    decimal, confidences to two); counters are passed through unchanged.
    """
    summary = {
        "status": "success",
        "session_id": session.session_id,
        "language": session.language,
        "session_status": session.status,
        "started_at": session.started_at,
        "last_update": session.last_update,
        "chunks_processed": session.chunks_processed,
        "alerts_triggered": session.alerts_triggered,
        "max_risk_score": session.max_risk_score,
        "max_cpi": round(session.max_cpi, 1),
        "final_call_label": session.final_call_label,
        "final_voice_classification": session.final_voice_classification,
        "final_voice_confidence": round(session.final_voice_confidence, 2),
        "max_voice_ai_confidence": round(session.max_voice_ai_confidence, 2),
        "voice_ai_chunks": session.voice_ai_chunks,
        "voice_human_chunks": session.voice_human_chunks,
        "llm_checks_performed": session.llm_checks_performed,
        "risk_policy_version": settings.RISK_POLICY_VERSION,
    }
    return SessionSummaryResponse(**summary)
|
| 1455 |
+
|
| 1456 |
+
|
| 1457 |
+
# Authentication
|
| 1458 |
+
from fastapi.security import APIKeyHeader
|
| 1459 |
+
from fastapi import Security
|
| 1460 |
+
|
| 1461 |
+
api_key_header = APIKeyHeader(name="x-api-key", auto_error=False)  # auto_error=False so verify_api_key can emit a structured 401 body instead of FastAPI's default 403
|
| 1462 |
+
|
| 1463 |
+
async def verify_api_key(x_api_key: str = Security(api_key_header)) -> str:
    """FastAPI dependency that authenticates requests via the x-api-key header.

    Returns the validated key, or raises a 401 with a structured error body
    when the header is absent or does not match the configured API key.
    """

    def _unauthorized(message: str) -> HTTPException:
        # All auth failures share the same envelope shape.
        return HTTPException(
            status_code=401,
            detail={"status": "error", "message": message},
        )

    if x_api_key is None:
        logger.warning("API request without x-api-key header")
        raise _unauthorized("Missing API key. Include 'x-api-key' header.")

    if x_api_key != settings.API_KEY:
        # Only a short prefix is logged to avoid leaking the attempted key.
        logger.warning(f"API request with invalid key: {x_api_key[:8]}...")
        raise _unauthorized("Invalid API key")

    return x_api_key
|
| 1478 |
+
|
| 1479 |
+
|
| 1480 |
+
def verify_websocket_api_key(websocket: WebSocket) -> bool:
    """Check a websocket client's API key (header first, query param fallback)."""
    supplied = websocket.headers.get("x-api-key")
    if not supplied:
        # Browsers cannot set custom websocket headers, so allow ?api_key=...
        supplied = websocket.query_params.get("api_key")
    return supplied == settings.API_KEY
|
| 1484 |
+
|
| 1485 |
+
|
| 1486 |
+
# Routes
|
| 1487 |
+
@app.get("/", include_in_schema=False)
async def root():
    """Send visitors on the bare root path to the interactive API docs."""
    docs_url = "/docs"
    return RedirectResponse(url=docs_url)
|
| 1491 |
+
|
| 1492 |
+
|
| 1493 |
+
@app.get("/health")
async def health_check():
    """Liveness/readiness probe: reports whether the ML model cache is populated."""
    model_loaded = False
    try:
        # _model is the lazily-populated global cache in model.py; it is None
        # until the first successful load.
        from model import _model
        model_loaded = _model is not None
    except Exception:
        model_loaded = False

    overall = "healthy" if model_loaded else "degraded"
    return {
        "status": overall,
        "model_loaded": model_loaded,
        "session_store_backend": SESSION_STORE_BACKEND_ACTIVE,
    }
|
| 1507 |
+
|
| 1508 |
+
|
| 1509 |
+
@app.post("/v1/session/start", response_model=SessionStartResponse)
@app.post("/api/voice-detection/v1/session/start", response_model=SessionStartResponse)
async def start_realtime_session(
    session_request: SessionStartRequest,
    api_key: str = Depends(verify_api_key)
):
    """Create a new real-time fraud analysis session.

    Validates the requested language, mints a UUID session id, and persists
    a fresh SessionState under the session lock. Expired sessions are purged
    opportunistically on every create, keeping the store bounded.
    """
    # Reject unsupported languages before allocating anything.
    validate_supported_language(session_request.language)

    session_id = str(uuid.uuid4())
    started_at = utc_now_iso()

    async with SESSION_LOCK:
        # Piggy-back retention cleanup on session creation.
        purged = purge_expired_sessions()
        if purged:
            logger.info("Retention purge removed %s expired sessions", purged)

        session_state = SessionState(
            session_id=session_id,
            language=session_request.language,
            started_at=started_at
        )
        save_session_state(session_state)

    return SessionStartResponse(
        status="success",
        session_id=session_id,
        language=session_request.language,
        started_at=started_at,
        message="Session created. Send chunks using /v1/session/{session_id}/chunk or websocket stream."
    )
|
| 1540 |
+
|
| 1541 |
+
|
| 1542 |
+
@app.post("/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
@app.post("/api/voice-detection/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
async def analyze_realtime_chunk(
    request: Request,
    session_id: str,
    chunk_request: SessionChunkRequest,
    api_key: str = Depends(verify_api_key)
):
    """Analyze one audio chunk for an active real-time session.

    Session existence/status is checked under the lock, but the (slow)
    chunk analysis itself runs outside it so concurrent sessions are not
    serialized. Raises 404 for unknown/expired sessions, 409 for ended
    sessions, and 400 for payloads process_audio_chunk rejects.
    """
    # Prefer the middleware-assigned request id; fall back to a session-derived tag.
    request_id = getattr(request.state, "request_id", f"sess-{session_id[:8]}")

    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "Session not found or expired"}
            )
        if session.status != "active":
            raise HTTPException(
                status_code=409,
                detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
            )
        # Copy the language while still holding the lock.
        session_language = session.language

    try:
        return await process_audio_chunk(session_id, chunk_request, session_language, request_id)
    except ValueError as e:
        # process_audio_chunk signals client errors (bad audio, bad fields) via ValueError.
        raise HTTPException(status_code=400, detail={"status": "error", "message": str(e)}) from e
|
| 1572 |
+
|
| 1573 |
+
|
| 1574 |
+
@app.websocket("/v1/session/{session_id}/stream")
@app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
async def stream_realtime_session(websocket: WebSocket, session_id: str):
    """WebSocket endpoint for continuous chunk-based analysis.

    Authenticates the client, verifies the session is active, then loops:
    receive a JSON chunk payload, validate it, analyze it, and send back
    either the update or a structured error object. Per-chunk failures are
    reported in-band (the socket stays open); only auth/session problems
    close the connection (policy-violation code 1008).
    """
    if not verify_websocket_api_key(websocket):
        await websocket.close(code=1008, reason="Invalid API key")
        return

    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            await websocket.close(code=1008, reason="Session not found or expired")
            return
        if session.status != "active":
            await websocket.close(code=1008, reason="Session is not active")
            return
        # Captured under the lock; reused for every chunk on this socket.
        session_language = session.language

    await websocket.accept()
    request_id = f"ws-{session_id[:8]}"

    try:
        while True:
            payload = await websocket.receive_json()
            try:
                chunk_request = SessionChunkRequest.model_validate(payload)
            except ValidationError as e:
                # Malformed payload: report and keep the stream alive.
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid chunk payload",
                    "details": e.errors()
                })
                continue

            try:
                update = await process_audio_chunk(session_id, chunk_request, session_language, request_id)
                await websocket.send_json(update.model_dump())
            except HTTPException as e:
                # Reuse the HTTP error envelope when the handler raised one.
                detail = e.detail if isinstance(e.detail, dict) else {"status": "error", "message": str(e.detail)}
                await websocket.send_json(detail)
            except ValueError as e:
                await websocket.send_json({"status": "error", "message": str(e)})
    except WebSocketDisconnect:
        logger.info(f"[{request_id}] WebSocket disconnected")
|
| 1619 |
+
|
| 1620 |
+
|
| 1621 |
+
@app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
@app.get("/api/voice-detection/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
async def get_session_summary(
    session_id: str,
    api_key: str = Depends(verify_api_key)
):
    """Return the current aggregate summary for a real-time session."""
    not_found = HTTPException(
        status_code=404,
        detail={"status": "error", "message": "Session not found or expired"}
    )

    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise not_found
        # Serialize while still holding the lock so the snapshot is consistent.
        return session_to_summary(session)
|
| 1637 |
+
|
| 1638 |
+
|
| 1639 |
+
@app.get("/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
@app.get("/api/voice-detection/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
async def get_session_alerts(
    session_id: str,
    limit: int = 20,
    api_key: str = Depends(verify_api_key),
):
    """Return up to `limit` most recent alerts for a real-time session."""
    # Bound the page size before touching the session store.
    if not 1 <= limit <= 100:
        raise HTTPException(
            status_code=400,
            detail={"status": "error", "message": "limit must be between 1 and 100"},
        )

    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "Session not found or expired"},
            )

        # Newest entries live at the tail of alert_history.
        recent = session.alert_history[-limit:]
        return AlertHistoryResponse(
            status="success",
            session_id=session_id,
            total_alerts=len(session.alert_history),
            alerts=[AlertHistoryItem(**entry) for entry in recent],
        )
|
| 1669 |
+
|
| 1670 |
+
|
| 1671 |
+
@app.get("/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
@app.get("/api/voice-detection/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
async def get_retention_policy(api_key: str = Depends(verify_api_key)):
    """Expose the service's privacy defaults for raw audio and derived session data."""
    policy = {
        "status": "success",
        # Raw audio is analyzed in memory only; it is never written to storage.
        "raw_audio_storage": "not_persisted",
        "active_session_retention_seconds": settings.SESSION_ACTIVE_RETENTION_SECONDS,
        "ended_session_retention_seconds": settings.SESSION_ENDED_RETENTION_SECONDS,
        "stored_derived_fields": STORED_DERIVED_FIELDS,
    }
    return RetentionPolicyResponse(**policy)
|
| 1682 |
+
|
| 1683 |
+
|
| 1684 |
+
@app.post("/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
@app.post("/api/voice-detection/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
async def end_realtime_session(
    session_id: str,
    api_key: str = Depends(verify_api_key)
):
    """Mark a session as ended and return its final summary.

    The status flip, timestamp update, persistence, and summary
    serialization all happen under the session lock so no chunk handler
    can interleave with the shutdown. Ended sessions are later removed by
    the retention purge. Raises 404 for unknown/expired sessions.
    """
    async with SESSION_LOCK:
        purge_expired_sessions()
        session = get_session_state(session_id)
        if session is None:
            raise HTTPException(
                status_code=404,
                detail={"status": "error", "message": "Session not found or expired"}
            )
        session.status = "ended"
        session.last_update = utc_now_iso()
        save_session_state(session)
        return session_to_summary(session)
|
| 1703 |
+
|
| 1704 |
+
|
| 1705 |
+
@app.post(
    "/api/voice-detection",
    response_model=VoiceDetectionResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Bad Request"},
        401: {"model": ErrorResponse, "description": "Unauthorized"},
        429: {"model": ErrorResponse, "description": "Rate Limit Exceeded"},
        500: {"model": ErrorResponse, "description": "Internal Server Error"}
    }
)
@limiter.limit("1000/minute")  # Rate limit: 1000 requests per minute per IP
async def detect_voice(
    request: Request,  # Required for rate limiter
    voice_request: VoiceDetectionRequest,
    api_key: str = Depends(verify_api_key)  # Use dependency injection
):
    """Classify a Base64-encoded audio clip as AI-generated or human speech.

    Pipeline (each CPU-bound step is offloaded with asyncio.to_thread so the
    event loop stays responsive): Base64 decode -> audio load/resample at
    22050 Hz -> analyze_voice. Returns the classification with confidence,
    a forensic explanation, and optional per-feature metrics. When the model
    fell back to heuristic inference, the response is flagged uncertain and
    may be relabeled "UNCERTAIN" depending on configuration.

    Raises 400 on invalid language/format/audio and 500 on unexpected errors.
    """
    # Log request info for debugging
    request_id = getattr(request.state, 'request_id', 'unknown')
    audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024  # Approximate decoded size
    logger.info(f"[{request_id}] Voice detection request: language={voice_request.language}, format={voice_request.audioFormat}, size~{audio_size_kb:.1f}KB")

    validate_supported_language(voice_request.language)
    validate_supported_format(voice_request.audioFormat)

    try:
        # Step 1: Decode Base64 (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Decoding Base64...")
        decode_start = time.perf_counter()
        audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
        decode_time = (time.perf_counter() - decode_start) * 1000

        # Step 2: Load audio (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Loading audio... (decode took {decode_time:.0f}ms)")
        load_start = time.perf_counter()
        audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, voice_request.audioFormat)
        load_time = (time.perf_counter() - load_start) * 1000

        # Step 3: ML Analysis (async - runs in thread pool, CPU-bound)
        duration_sec = len(audio) / sr
        logger.info(f"[{request_id}] -> Analyzing {duration_sec:.1f}s audio... (load took {load_time:.0f}ms)")
        analyze_start = time.perf_counter()
        result = await asyncio.to_thread(analyze_voice, audio, sr, voice_request.language)
        analyze_time = (time.perf_counter() - analyze_start) * 1000

        logger.info(f"[{request_id}] -> Analysis complete: {result.classification} ({result.confidence_score:.0%}) in {analyze_time:.0f}ms")

        # Extract forensic metrics if the analyzer produced feature scores.
        metrics = None
        if result.features:
            metrics = ForensicMetrics(
                authenticity_score=result.features.get("authenticity_score", 0),
                pitch_naturalness=result.features.get("pitch_naturalness", 0),
                spectral_naturalness=result.features.get("spectral_naturalness", 0),
                temporal_naturalness=result.features.get("temporal_naturalness", 0)
            )

        # ml_fallback is set by the analyzer when heuristics replaced the model.
        model_uncertain = bool((result.features or {}).get("ml_fallback", 0.0))
        explanation = result.explanation
        recommended_action = None
        response_classification = result.classification
        if model_uncertain:
            # Fixed grammar: was "due fallback inference" in an earlier revision.
            explanation = (
                "Model uncertainty detected due to fallback inference. "
                "Treat result as cautionary and verify through trusted channels. "
                f"{result.explanation}"
            )
            recommended_action = (
                "Do not share OTP, PIN, passwords, or payment credentials. "
                "Verify caller identity through official support channels."
            )
            if settings.LEGACY_FALLBACK_RETURNS_UNCERTAIN:
                response_classification = "UNCERTAIN"

        # Return response
        return VoiceDetectionResponse(
            status="success",
            language=voice_request.language,
            classification=response_classification,
            confidenceScore=result.confidence_score,
            explanation=explanation,
            forensic_metrics=metrics,
            modelUncertain=model_uncertain,
            recommendedAction=recommended_action,
        )

    except ValueError as e:
        # Client-caused failures (bad Base64, undecodable audio) -> 400.
        logger.warning(f"[{request_id}] [VALIDATION_ERROR] {e}")
        raise HTTPException(
            status_code=400,
            detail={"status": "error", "message": str(e)}
        )
    except Exception as e:
        # Anything else is a server fault; hide details but expose the request id.
        logger.error(f"[{request_id}] [PROCESSING_ERROR] {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"status": "error", "message": f"Internal Server Error (request_id={request_id})"}
        )
|
| 1805 |
+
|
| 1806 |
+
|
| 1807 |
+
# Exception handlers
|
| 1808 |
+
from fastapi.exceptions import RequestValidationError
|
| 1809 |
+
|
| 1810 |
+
def to_json_safe(value: Any) -> Any:
    """Recursively convert *value* into JSON-serializable primitives.

    None/str/int/float/bool pass through unchanged; exceptions become their
    string message; dict keys are stringified and values converted
    recursively; list/tuple/set/frozenset become lists (fix: frozenset was
    previously missed and fell through to str()); anything else is
    stringified as a last resort.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, BaseException):
        return str(value)
    if isinstance(value, dict):
        # JSON object keys must be strings.
        return {str(k): to_json_safe(v) for k, v in value.items()}
    if isinstance(value, (list, tuple, set, frozenset)):
        return [to_json_safe(item) for item in value]
    return str(value)
|
| 1821 |
+
|
| 1822 |
+
|
| 1823 |
+
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Render 422 validation failures in the API's error envelope.

    Summarizes each failed field as "loc -> path: message" and appends a
    targeted hint for the most common client mistakes (bad audioBase64,
    unsupported language).
    """
    errors = to_json_safe(exc.errors())
    logger.warning("Validation error: %s", errors)

    # One human-readable line per failed field.
    error_messages = [
        f"{' -> '.join(str(part) for part in err.get('loc', []))}: {err.get('msg', 'Invalid value')}"
        for err in errors
    ]

    # Detect which well-known field (if any) triggered the failure.
    failing_locs = [str(err.get("loc", [])) for err in errors]
    if any("audioBase64" in loc for loc in failing_locs):
        hint = " Hint: Ensure 'audioBase64' is a valid Base64-encoded string."
    elif any("language" in loc for loc in failing_locs):
        hint = f" Hint: 'language' must be one of: {', '.join(settings.SUPPORTED_LANGUAGES)}."
    else:
        hint = ""

    return JSONResponse(
        status_code=422,
        content={
            "status": "error",
            "message": f"Request validation failed: {'; '.join(error_messages)}.{hint}",
            "details": errors
        }
    )
|
| 1855 |
+
|
| 1856 |
+
|
| 1857 |
+
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Normalize HTTPException payloads into the service's error envelope."""
    payload = exc.detail
    if not isinstance(payload, dict):
        # Plain-string details get wrapped in the standard shape.
        payload = {"status": "error", "message": str(exc.detail)}
    return JSONResponse(status_code=exc.status_code, content=payload)
|
| 1869 |
+
|
| 1870 |
+
|
| 1871 |
+
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log the traceback, return a generic 500 body."""
    logger.error(f"Unhandled error: {exc}", exc_info=True)
    body = {"status": "error", "message": "Internal Server Error"}
    return JSONResponse(status_code=500, content=body)
|
| 1879 |
+
|
| 1880 |
+
|
| 1881 |
+
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces at the
    # configured port (settings.PORT).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
|
| 1884 |
+
|
| 1885 |
+
|
| 1886 |
+
|
| 1887 |
+
|
| 1888 |
+
|
| 1889 |
+
|
| 1890 |
+
|
| 1891 |
+
|
| 1892 |
+
|
| 1893 |
+
|
| 1894 |
+
|
| 1895 |
+
|
| 1896 |
+
|
| 1897 |
+
|
| 1898 |
+
|
| 1899 |
+
|
| 1900 |
+
|
| 1901 |
+
|
| 1902 |
+
|
| 1903 |
+
|
model.py
ADDED
|
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Analysis Engine.
|
| 3 |
+
Combines Wav2Vec2 deepfake detection with signal forensics.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import numpy as np
|
| 8 |
+
from typing import Dict, Tuple, List, Optional
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
import warnings
|
| 11 |
+
|
| 12 |
+
from config import settings
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# Suppress warnings
|
| 18 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 19 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
| 20 |
+
|
| 21 |
+
# Global model cache
|
| 22 |
+
_model = None
|
| 23 |
+
_processor = None
|
| 24 |
+
_device = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class AnalysisResult:
    """Outcome of a single voice-authenticity analysis run."""
    classification: str  # "AI_GENERATED" or "HUMAN"
    confidence_score: float  # 0.0 to 1.0
    explanation: str  # human-readable forensic justification for the label
    features: Dict[str, float]  # Individual feature scores for debugging
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_device():
    """Return the cached compute device string, probing CUDA on first call."""
    global _device
    if _device is None:
        # torch import is deferred so merely importing this module stays cheap.
        import torch
        _device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {_device}")
    return _device
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def load_model():
    """
    Load the Wav2Vec2 deepfake detection model (cached in module globals).

    Load order: local fine-tuned checkpoint if VOICE_MODEL_LOCAL_PATH
    exists, otherwise VOICE_MODEL_ID from the HuggingFace Hub; on failure,
    one retry with VOICE_MODEL_BACKUP_ID. The model is moved to the best
    available device and put in eval mode. Returns (model, processor).
    Raises RuntimeError when no model can be loaded.
    """
    global _model, _processor

    if _model is None:
        # Deferred import keeps module import light until a model is needed.
        from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

        # Model priority:
        # 1. Local fine-tuned model (for development)
        # 2. HuggingFace Hub model (for production/deployment)
        # 3. Fallback to public model

        local_path = settings.VOICE_MODEL_LOCAL_PATH
        hf_model = settings.VOICE_MODEL_ID
        backup_model = settings.VOICE_MODEL_BACKUP_ID

        if os.path.exists(local_path):
            logger.info(f"Loading local fine-tuned model from: {local_path}")
            model_name = local_path
        else:
            logger.info(f"Loading model from HuggingFace Hub: {hf_model}")
            model_name = hf_model

        try:
            _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
            _model.to(get_device())
            _model.eval()  # inference only; disables dropout etc.
            logger.info(f"Model loaded successfully: {model_name}")
        except Exception as e:
            logger.error(f"Failed to load model {model_name}: {e}")
            if model_name != backup_model:
                # Second (and last) attempt with the configured backup model.
                logger.warning("Trying backup model...")
                model_name = backup_model
                try:
                    _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
                    _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
                    _model.to(get_device())
                    _model.eval()
                    logger.info(f"Backup model loaded: {model_name}")
                except Exception as e2:
                    raise RuntimeError(f"Could not load any model: {e2}")
            else:
                # Primary WAS the backup; nothing else to try.
                raise e

    return _model, _processor
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def extract_signal_features(audio: np.ndarray, sr: int, fast_mode: bool = False) -> Dict[str, float]:
    """Extract signal-based forensic features (pitch, spectral, silence, energy).

    Args:
        audio: mono waveform samples (assumed 1-D float array — confirm at callers).
        sr: sample rate in Hz.
        fast_mode: when True, uses smaller FFTs and cheap approximations
            (centroid dynamics instead of pYIN pitch, flatness-derived HNR)
            to keep the realtime path fast.

    Returns a dict of scalar features; on any extraction error a fixed
    neutral fallback dict is returned instead (never raises).
    """
    import librosa
    from scipy.stats import entropy

    features = {}

    try:
        # Use smaller FFT in fast mode for realtime throughput.
        n_fft = 512 if fast_mode else 2048
        hop_length = 256 if fast_mode else 512
        S = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))

        # Pitch analysis.
        if fast_mode:
            # Approximate pitch variability from centroid dynamics to avoid expensive pYIN on realtime path.
            spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
            centroid_mean = float(np.mean(spec_centroid) + 1e-8)
            features["pitch_stability"] = float(np.clip(np.var(spec_centroid) / (centroid_mean ** 2), 0.0, 1.5))
            features["jitter"] = float(np.clip(np.mean(np.abs(np.diff(spec_centroid))) / centroid_mean, 0.0, 0.2))
            # Energy-gated proxy for voiced frames (RMS above fixed threshold).
            voiced_flag = librosa.feature.rms(y=audio, frame_length=n_fft, hop_length=hop_length)[0] > 0.02
        else:
            # Full pYIN pitch tracking over the speech range C2..C7.
            f0, voiced_flag, _ = librosa.pyin(
                audio,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=sr
            )
            f0_voiced = f0[~np.isnan(f0)]
            if len(f0_voiced) > 10:
                pitch_mean = np.mean(f0_voiced)
                pitch_std = np.std(f0_voiced)
                # Coefficient of variation of F0 and mean frame-to-frame F0 delta.
                features["pitch_stability"] = pitch_std / pitch_mean if pitch_mean > 0 else 0
                features["jitter"] = np.mean(np.abs(np.diff(f0_voiced))) / pitch_mean if pitch_mean > 0 else 0
            else:
                # Too few voiced frames: fall back to neutral mid-range values.
                features["pitch_stability"] = 0.5
                features["jitter"] = 0.05

        # Spectral features
        spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
        features["spectral_centroid_var"] = float(np.var(spec_centroid))

        spec_flatness = librosa.feature.spectral_flatness(S=S)[0]
        features["spectral_flatness"] = float(np.mean(spec_flatness))

        # Entropy
        # Normalize each STFT frame to a probability distribution, then average per-frame entropy.
        S_norm = S / (np.sum(S, axis=0, keepdims=True) + 1e-10)
        frame_entropies = [entropy(frame + 1e-10) for frame in S_norm.T]
        features["spectral_entropy"] = float(np.mean(frame_entropies))

        # Silence detection
        silence_threshold = 1e-5
        features["silence_ratio"] = float(np.sum(np.abs(audio) < silence_threshold) / len(audio))
        # Exact zeros are suspicious: natural recordings have noise floors.
        features["perfect_silence"] = float(np.sum(audio == 0) / len(audio))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(audio)[0]
        features["zcr_variance"] = float(np.var(zcr))

        # Additional acoustic heuristics for suspicious audio artifacts.
        spec_rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)[0]
        features["spectral_rolloff_var"] = float(np.var(spec_rolloff))
        features["voiced_ratio"] = float(np.mean(voiced_flag.astype(np.float32))) if voiced_flag is not None else 0.0

        rms = librosa.feature.rms(y=audio)[0]
        features["rms_var"] = float(np.var(rms))

        if fast_mode:
            # Cheap HNR approximation from flatness and entropy for realtime throughput.
            hnr_db = float(max(0.0, 30.0 - (features["spectral_flatness"] * 120.0)))
        else:
            # Harmonic/percussive separation; HNR as the dB ratio of their RMS energies.
            harmonic, percussive = librosa.effects.hpss(audio)
            harmonic_rms = float(np.sqrt(np.mean(np.square(harmonic))) + 1e-8)
            percussive_rms = float(np.sqrt(np.mean(np.square(percussive))) + 1e-8)
            hnr_db = float(20.0 * np.log10(harmonic_rms / percussive_rms))
        features["harmonic_noise_ratio_db"] = hnr_db

    except Exception as e:
        # Best-effort by design: return neutral defaults rather than failing the analysis.
        logger.warning(f"Feature extraction error: {e}")
        features = {
            "pitch_stability": 0.5,
            "jitter": 0.05,
            "spectral_centroid_var": 1000,
            "spectral_flatness": 0.1,
            "spectral_entropy": 5.0,
            "silence_ratio": 0.0,
            "perfect_silence": 0.0,
            "zcr_variance": 0.01,
            "spectral_rolloff_var": 50000.0,
            "voiced_ratio": 0.65,
            "rms_var": 0.005,
            "harmonic_noise_ratio_db": 14.0,
        }

    return features
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def generate_explanation(
|
| 199 |
+
classification: str,
|
| 200 |
+
ml_confidence: float,
|
| 201 |
+
features: Dict[str, float]
|
| 202 |
+
) -> str:
|
| 203 |
+
"""Generate a data-driven forensic explanation for the classification."""
|
| 204 |
+
|
| 205 |
+
# Calculate acoustic anomaly scores (0-100 scale)
|
| 206 |
+
pitch_score = _calculate_pitch_score(features)
|
| 207 |
+
spectral_score = _calculate_spectral_score(features)
|
| 208 |
+
temporal_score = _calculate_temporal_score(features)
|
| 209 |
+
|
| 210 |
+
# Overall authenticity score (inverted for AI detection)
|
| 211 |
+
authenticity_score = (pitch_score + spectral_score + temporal_score) / 3
|
| 212 |
+
|
| 213 |
+
# Confidence tier affects explanation style
|
| 214 |
+
if ml_confidence >= 0.95:
|
| 215 |
+
confidence_tier = "high"
|
| 216 |
+
elif ml_confidence >= 0.75:
|
| 217 |
+
confidence_tier = "moderate"
|
| 218 |
+
else:
|
| 219 |
+
confidence_tier = "low"
|
| 220 |
+
|
| 221 |
+
if classification == "AI_GENERATED":
|
| 222 |
+
return _explain_ai_detection(
|
| 223 |
+
confidence_tier, ml_confidence, authenticity_score,
|
| 224 |
+
pitch_score, spectral_score, temporal_score, features
|
| 225 |
+
)
|
| 226 |
+
else:
|
| 227 |
+
return _explain_human_detection(
|
| 228 |
+
confidence_tier, ml_confidence, authenticity_score,
|
| 229 |
+
pitch_score, spectral_score, temporal_score, features
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _calculate_pitch_score(features: Dict[str, float]) -> float:
    """Score pitch naturalness on a 0-100 scale (higher = more human-like).

    Reference ranges:
      Typical Human: stability 0.1-0.3, jitter 0.02-0.08
      Typical AI:    stability < 0.1,   jitter < 0.02
    """
    stability = features.get("pitch_stability", 0.5)
    micro_jitter = features.get("jitter", 0.05)

    # Map each raw metric onto [0, 100] relative to the human band.
    stability_component = min(100, max(0, (stability - 0.05) / 0.25 * 100))
    jitter_component = min(100, max(0, (micro_jitter - 0.005) / 0.075 * 100))

    # Stability is weighted slightly heavier than jitter.
    return stability_component * 0.6 + jitter_component * 0.4
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _calculate_spectral_score(features: Dict[str, float]) -> float:
    """Score spectral naturalness on a 0-100 scale (higher = more human-like).

    Reference ranges:
      Typical Human: entropy 4.5-7, flatness 0.02-0.12
      Typical AI:    entropy < 4.5, flatness > 0.12
    """
    complexity = features.get("spectral_entropy", 5.0)
    flatness = features.get("spectral_flatness", 0.1)

    # Higher entropy and lower flatness both point towards natural speech.
    entropy_component = min(100, max(0, (complexity - 3.0) / 4.0 * 100))
    flatness_component = min(100, max(0, (0.2 - flatness) / 0.18 * 100))

    # Equal weighting of both spectral cues.
    return entropy_component * 0.5 + flatness_component * 0.5
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def _calculate_temporal_score(features: Dict[str, float]) -> float:
    """Calculate temporal/rhythm naturalness score (0-100). Higher = more human-like.

    Combines zero-crossing-rate variability (natural articulation varies a
    lot) with a penalty for exact-zero "digital" silence, which genuine
    microphone recordings essentially never contain.
    """
    zcr_var = features.get("zcr_variance", 0.01)
    perfect_silence = features.get("perfect_silence", 0.0)
    # NOTE: "silence_ratio" was previously read here but never contributed
    # to the score; the dead local has been removed (behavior unchanged).

    # Penalize digital silence (exact zeros) - strong AI indicator.
    digital_penalty = min(50, perfect_silence * 500)

    zcr_score = min(100, max(0, zcr_var / 0.02 * 100))

    return max(0, zcr_score - digital_penalty)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _calculate_acoustic_anomaly_score(features: Dict[str, float]) -> float:
    """
    Estimate suspicious acoustic artifact intensity (0-100).
    Higher score indicates stronger synthetic/spoof-like signal artifacts.
    """
    silence_exact = features.get("perfect_silence", 0.0)
    flatness = features.get("spectral_flatness", 0.1)
    rolloff_variance = features.get("spectral_rolloff_var", 50000.0)
    voiced = features.get("voiced_ratio", 0.65)
    hnr = features.get("harmonic_noise_ratio_db", 14.0)

    # Exact-zero samples are a strong digital-processing artifact.
    digital_component = min(100.0, silence_exact * 10000.0)
    # Excessive spectral flatness points towards vocoder-style output.
    flatness_component = min(100.0, max(0.0, (flatness - 0.13) * 500.0))
    # Unusually large rolloff variance (log scale) is also suspicious.
    rolloff_component = min(100.0, max(0.0, (np.log10(rolloff_variance + 1.0) - 3.8) * 45.0))

    # Voicing proportion far outside the plausible human band flags anomaly
    # at either extreme; inside the band it contributes nothing.
    if voiced < 0.35:
        voicing_component = min(100.0, (0.35 - voiced) * 180.0)
    elif voiced > 0.95:
        voicing_component = min(100.0, (voiced - 0.95) * 180.0)
    else:
        voicing_component = 0.0

    # Harmonic-to-noise ratio: both "too noisy" and "too clean" are odd.
    if hnr < 6.0:
        hnr_component = min(100.0, (6.0 - hnr) * 8.0)
    elif hnr > 28.0:
        hnr_component = min(100.0, (hnr - 28.0) * 4.0)
    else:
        hnr_component = 0.0

    weighted = (
        (digital_component * 0.35)
        + (flatness_component * 0.20)
        + (rolloff_component * 0.20)
        + (voicing_component * 0.15)
        + (hnr_component * 0.10)
    )
    return float(max(0.0, min(100.0, weighted)))
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def _explain_ai_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate explanation for AI-detected audio.

    Picks the weakest (most AI-like) acoustic domain as the headline
    finding and renders a forensic-style sentence around it.

    Fixes vs. previous version: removed the dead local ``primary_score``
    and replaced the full ``sorted()`` pass with ``min()`` — only the
    single weakest domain is ever used.
    """
    # The weakest domain score is the most AI-like characteristic.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = min(scores, key=scores.get)

    # Intro sentence keyed by confidence tier.
    if confidence_tier == "high":
        intro = f"Strong synthetic markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Synthetic patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Possible synthetic audio (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the lowest-scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Pitch analysis shows unusually consistent patterns (stability: {stability:.3f}, micro-variation: {jitter:.4f}) - typical of synthesized speech."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        flatness = features.get("spectral_flatness", 0)
        detail = f"Spectral fingerprint indicates synthetic generation (complexity: {entropy:.2f}, flatness: {flatness:.3f}) - lacking natural harmonic richness."
    else:
        perfect_silence = features.get("perfect_silence", 0)
        if perfect_silence > 0.005:
            detail = f"Digital artifacts detected: {perfect_silence:.1%} exact-zero samples found, indicating synthetic audio processing."
        else:
            detail = "Temporal patterns suggest algorithmic generation - rhythm lacks natural human irregularities."

    # Add authenticity score as a unique metric.
    authenticity_label = "very low" if authenticity_score < 25 else "low" if authenticity_score < 50 else "borderline"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def _explain_human_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate explanation for human-detected audio.

    Picks the strongest (most human-like) acoustic domain as the headline
    finding and renders a forensic-style sentence around it.

    Fixes vs. previous version: removed the dead local ``primary_score``
    and replaced the full reverse ``sorted()`` pass with ``max()`` — only
    the single strongest domain is ever used.
    """
    # The strongest domain score is the most human-like characteristic.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = max(scores, key=scores.get)

    # Intro sentence keyed by confidence tier.
    if confidence_tier == "high":
        intro = f"Strong human voice markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Human speech patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Likely human voice (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the highest-scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Natural pitch dynamics confirmed (variability: {stability:.3f}, micro-fluctuations: {jitter:.4f}) - consistent with biological speech production."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        detail = f"Rich harmonic structure detected (complexity score: {entropy:.2f}) - characteristic of natural vocal tract resonance."
    else:
        zcr_var = features.get("zcr_variance", 0)
        detail = f"Organic speech rhythm detected (variance: {zcr_var:.4f}) - natural breathing and articulation patterns present."

    # Add authenticity score.
    authenticity_label = "excellent" if authenticity_score > 75 else "good" if authenticity_score > 50 else "moderate"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
    """
    Classify audio using the Wav2Vec2 model.

    Args:
        audio: Mono waveform samples; peak-normalised and resampled to
            16 kHz before inference.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        Tuple of (classification, confidence) where classification is
        "AI_GENERATED" or "HUMAN" and confidence is the softmax
        probability of the predicted class in [0, 1].
    """
    import torch
    import librosa

    model, processor = load_model()
    device = get_device()

    # Normalize audio to prevent clipping issues
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
    target_sr = 16000
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Process audio into the model's expected input tensors.
    inputs = processor(
        audio,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True
    )

    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference without gradient tracking (inference only).
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

    # Get prediction: argmax class index and its softmax probability.
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    # Map class to label using the model's id2label config.
    # IMPORTANT: HuggingFace stores id2label with STRING keys ("0", "1")
    # but predicted_class from torch.argmax().item() is an int.
    # We must normalise the keys to int so .get() actually matches.
    raw_id2label = getattr(model.config, 'id2label', None) or {}
    id2label = {int(k): v for k, v in raw_id2label.items()}
    label = id2label.get(predicted_class, 'UNKNOWN')

    # Diagnostic logging so label-mapping regressions are visible in logs.
    logger.info(
        "Model id2label=%s predicted_class=%d resolved_label=%s",
        id2label, predicted_class, label,
    )

    # Normalize label: any label outside the known synthetic set
    # (including 'UNKNOWN') is treated as human.
    if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
        classification = "AI_GENERATED"
    else:
        classification = "HUMAN"

    return classification, confidence
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtime: bool = False) -> AnalysisResult:
    """
    Analyze a voice sample and classify as AI-generated or Human.

    Args:
        audio: Audio waveform as numpy array
        sr: Sample rate
        language: Language of the audio (for context)
        realtime: When True (and REALTIME_LIGHTWEIGHT_AUDIO is enabled in
            settings) the transformer model is skipped and a cheap signal
            heuristic decides the class, trading accuracy for throughput.

    Returns:
        AnalysisResult with classification, confidence, and explanation

    Raises:
        ValueError: If audio is too short for reliable analysis
    """
    # Validate minimum audio duration (at least 0.5 seconds for reliable analysis)
    min_duration = 0.5  # seconds
    duration = len(audio) / sr
    if duration < min_duration:
        raise ValueError(f"Audio too short ({duration:.2f}s). Minimum {min_duration}s required for reliable analysis.")

    fast_mode = bool(realtime and settings.REALTIME_LIGHTWEIGHT_AUDIO)

    # Get model prediction (legacy/deep path) or defer to lightweight realtime heuristic.
    ml_fallback = False
    classification = "HUMAN"
    ml_confidence = 0.5
    if not fast_mode:
        try:
            classification, ml_confidence = classify_with_model(audio, sr)
        except Exception as e:
            # Model failure degrades to a neutral verdict; the signal-level
            # scores below still populate the response.
            logger.error(f"ML model error: {e}, falling back to signal analysis")
            ml_fallback = True
            classification = "HUMAN"
            ml_confidence = 0.5

    # Extract signal features for explainability.
    features = extract_signal_features(audio, sr, fast_mode=fast_mode)

    # Calculate scores explicitly for return.
    pitch_score = _calculate_pitch_score(features)
    spectral_score = _calculate_spectral_score(features)
    temporal_score = _calculate_temporal_score(features)
    authenticity_score = (pitch_score + spectral_score + temporal_score) / 3
    acoustic_anomaly_score = _calculate_acoustic_anomaly_score(features)

    # Lightweight realtime path avoids transformer inference for throughput.
    if fast_mode:
        # AI probability is the stronger of two cues: artifact anomaly
        # intensity, or how far authenticity falls below the 52.0 pivot.
        # NOTE(review): the 52.0 pivot and 0.56 decision threshold are
        # tuned constants — presumably calibrated empirically; confirm
        # before changing.
        ai_probability = max(
            acoustic_anomaly_score / 100.0,
            max(0.0, min(1.0, (52.0 - authenticity_score) / 52.0)),
        )
        classification = "AI_GENERATED" if ai_probability >= 0.56 else "HUMAN"
        ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
        ml_confidence = float(max(0.5, min(0.99, ml_confidence)))

    # Surface pipeline metadata in the features payload.
    features["ml_confidence"] = ml_confidence
    features["ml_fallback"] = float(ml_fallback)
    features["realtime_heuristic_mode"] = float(fast_mode)

    # Add computed high-level scores to features for API response.
    features["authenticity_score"] = round(authenticity_score, 1)
    features["pitch_naturalness"] = round(pitch_score, 1)
    features["spectral_naturalness"] = round(spectral_score, 1)
    features["temporal_naturalness"] = round(temporal_score, 1)
    features["acoustic_anomaly_score"] = round(acoustic_anomaly_score, 1)

    # Generate explanation (must run after features are enriched above).
    explanation = generate_explanation(classification, ml_confidence, features)

    return AnalysisResult(
        classification=classification,
        confidence_score=round(ml_confidence, 2),
        explanation=explanation,
        features=features
    )
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
# Pre-load model at module import (optional, for faster first request)
|
| 558 |
+
def preload_model():
    """Warm the classifier by loading it eagerly so the first request is fast.

    Failures are logged but never raised: preloading is a best-effort
    optimisation and the model is loaded lazily on first use anyway.
    """
    try:
        load_model()
    except Exception as exc:
        logger.error(f"Model preload failed: {exc}")
|
privacy_utils.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Privacy helpers for masking sensitive entities in transcripts and logs.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
PHONE_PATTERN = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
|
| 10 |
+
UPI_PATTERN = re.compile(r"\b[a-zA-Z0-9._-]{2,}@[a-zA-Z]{2,}\b")
|
| 11 |
+
ACCOUNT_OR_CARD_PATTERN = re.compile(r"(?<!\d)(?:\d[ -]?){9,19}(?!\d)")
|
| 12 |
+
OTP_CONTEXT_PATTERN = re.compile(r"\b(otp|pin)\s*[:\-]?\s*(\d{4,8})\b", re.IGNORECASE)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _mask_numeric_token(token: str, preserve_tail: int = 2) -> str:
|
| 16 |
+
digits = re.sub(r"\D", "", token)
|
| 17 |
+
if len(digits) <= preserve_tail:
|
| 18 |
+
return "[REDACTED_NUM]"
|
| 19 |
+
return f"[REDACTED_NUM_XX{digits[-preserve_tail:]}]"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _mask_account_or_card(match: re.Match[str]) -> str:
|
| 23 |
+
token = match.group(0)
|
| 24 |
+
digits = re.sub(r"\D", "", token)
|
| 25 |
+
if len(digits) < 9:
|
| 26 |
+
return token
|
| 27 |
+
return _mask_numeric_token(token)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _mask_otp(match: re.Match[str]) -> str:
|
| 31 |
+
return f"{match.group(1)} [REDACTED_OTP]"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def mask_sensitive_entities(text: str) -> str:
|
| 35 |
+
"""Redact common scam-sensitive entities from plain text."""
|
| 36 |
+
if not text:
|
| 37 |
+
return ""
|
| 38 |
+
|
| 39 |
+
masked = OTP_CONTEXT_PATTERN.sub(_mask_otp, text)
|
| 40 |
+
masked = UPI_PATTERN.sub("[REDACTED_UPI]", masked)
|
| 41 |
+
masked = PHONE_PATTERN.sub("[REDACTED_PHONE]", masked)
|
| 42 |
+
masked = ACCOUNT_OR_CARD_PATTERN.sub(_mask_account_or_card, masked)
|
| 43 |
+
return masked
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def sanitize_for_logging(text: str, max_chars: int = 120) -> str:
|
| 47 |
+
"""
|
| 48 |
+
Mask and compact text for safe structured logging.
|
| 49 |
+
"""
|
| 50 |
+
masked = mask_sensitive_entities(text)
|
| 51 |
+
compact = " ".join(masked.split())
|
| 52 |
+
if len(compact) <= max_chars:
|
| 53 |
+
return compact
|
| 54 |
+
return compact[: max_chars - 3] + "..."
|
requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.100.0
|
| 2 |
+
uvicorn[standard]>=0.22.0
|
| 3 |
+
python-multipart
|
| 4 |
+
librosa>=0.10.0
|
| 5 |
+
soundfile>=0.12.0
|
| 6 |
+
numpy>=1.24.0
|
| 7 |
+
scipy>=1.10.0
|
| 8 |
+
python-dotenv
|
| 9 |
+
pydantic>=2.0.0
|
| 10 |
+
transformers>=4.30.0
|
| 11 |
+
datasets>=2.14.0
|
| 12 |
+
scikit-learn>=1.3.0
|
| 13 |
+
accelerate>=0.20.0
|
| 14 |
+
slowapi>=0.1.9
|
| 15 |
+
pydantic-settings>=2.0.0
|
| 16 |
+
httpx>=0.27.0
|
| 17 |
+
# PyTorch - install manually for your platform if not using Docker:
|
| 18 |
+
# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 19 |
+
torch>=2.0.0
|
| 20 |
+
torchaudio>=2.0.0
|
| 21 |
+
faster-whisper>=1.0.3
|
| 22 |
+
|
| 23 |
+
redis>=5.0.0
|
speech_to_text.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speech-to-text helper with optional faster-whisper backend.
|
| 3 |
+
|
| 4 |
+
The module degrades safely when ASR dependencies are unavailable.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from typing import Any, Dict, Iterable, Optional
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
from config import settings
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# Lazily-initialised faster-whisper model instance (None until first use).
_asr_model = None
# Set once a load has been tried, so a failed load is not retried per call.
_asr_load_attempted = False

# Maps the API's human-readable language names to Whisper language codes;
# languages not listed here fall through to Whisper auto-detection.
LANGUAGE_TO_WHISPER = {
    "English": "en",
    "Tamil": "ta",
    "Hindi": "hi",
    "Malayalam": "ml",
    "Telugu": "te",
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _load_asr_model():
    """Load the faster-whisper model lazily, caching both success and failure.

    Returns the cached model instance, or None when loading has already
    failed once (the failure is remembered so it is not retried per call).
    """
    global _asr_model, _asr_load_attempted

    if _asr_model is not None:
        return _asr_model
    if _asr_load_attempted:
        # A previous attempt failed — don't retry on every request.
        return None

    _asr_load_attempted = True
    try:
        from faster_whisper import WhisperModel

        model = WhisperModel(
            model_size_or_path=settings.ASR_MODEL_SIZE,
            device="cpu",
            compute_type=settings.ASR_COMPUTE_TYPE,
        )
    except Exception as exc:  # pragma: no cover - environment dependent
        logger.warning("ASR model unavailable: %s", exc)
        return None

    _asr_model = model
    logger.info(
        "ASR model loaded successfully: size=%s compute_type=%s",
        settings.ASR_MODEL_SIZE,
        settings.ASR_COMPUTE_TYPE,
    )
    return _asr_model
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _decode_segments(segments: Iterable[Any]) -> Dict[str, Any]:
    """Extract transcript and confidence proxy from whisper segments."""
    texts = []
    probs = []

    for segment in segments:
        piece = (segment.text or "").strip()
        if piece:
            texts.append(piece)
        avg_logprob = getattr(segment, "avg_logprob", None)
        if avg_logprob is not None:
            # exp(avg log-prob), clamped at 0, serves as a confidence proxy.
            probs.append(float(np.exp(min(0.0, avg_logprob))))

    transcript = " ".join(texts).strip()
    if probs:
        confidence = float(np.mean(probs))
    elif transcript:
        # Text without per-segment log-probs: report neutral confidence.
        confidence = 0.5
    else:
        confidence = 0.0
    confidence = max(0.0, min(1.0, confidence))

    return {
        "transcript": transcript,
        "confidence": confidence,
    }
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _run_transcribe(model: Any, audio: np.ndarray, language_code: Optional[str]) -> Dict[str, Any]:
    """Run one whisper transcription pass with optional language hint."""
    options = dict(
        language=language_code,
        beam_size=settings.ASR_BEAM_SIZE,
        vad_filter=True,
        condition_on_previous_text=False,
        word_timestamps=False,
    )
    segments, _info = model.transcribe(audio, **options)
    return _decode_segments(segments)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def transcribe_audio(audio: np.ndarray, sr: int, language: str) -> Dict[str, Any]:
    """
    Transcribe audio to text.

    Args:
        audio: Mono waveform samples (resampled to 16 kHz if needed).
        sr: Sample rate of ``audio`` in Hz.
        language: Human-readable language name; mapped to a Whisper code
            when known, otherwise Whisper auto-detects.

    Returns:
        {
            "transcript": str,
            "confidence": float [0..1],
            "engine": str,
            "available": bool
        }
    """
    # Feature-flag short-circuit: ASR can be disabled via configuration.
    if not settings.ASR_ENABLED:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "disabled",
            "available": False,
        }

    # Model may be None when faster-whisper is missing or a load failed.
    model = _load_asr_model()
    if model is None:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "unavailable",
            "available": False,
        }

    try:
        # Whisper expects 16 kHz float32 mono input.
        if sr != 16000:
            import librosa

            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        audio = np.asarray(audio, dtype=np.float32)
        language_code = LANGUAGE_TO_WHISPER.get(language)

        hinted = _run_transcribe(model, audio, language_code)

        # Recovery path: if language hint produced no text, retry with auto-detect.
        # This improves robustness for mixed-language/accented input.
        if not hinted["transcript"]:
            autodetect = _run_transcribe(model, audio, None)
            if autodetect["transcript"]:
                return {
                    "transcript": autodetect["transcript"],
                    "confidence": autodetect["confidence"],
                    "engine": "faster-whisper:auto",
                    "available": True,
                }

        return {
            "transcript": hinted["transcript"],
            "confidence": hinted["confidence"],
            "engine": "faster-whisper",
            "available": True,
        }
    except Exception as exc:  # pragma: no cover - runtime/audio dependent
        # Transcription errors are downgraded to an "error" engine result
        # rather than propagated — callers treat ASR as best-effort.
        logger.warning("ASR transcription failed: %s", exc)
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "error",
            "available": False,
        }
|