shivam0897-i committed on
Commit
3b6fefe
·
0 Parent(s):

fix: correct id2label key-type mismatch causing inverted classifications

Browse files

HuggingFace model.config.id2label uses string keys ('0','1') but
torch.argmax().item() returns int. The .get() always missed and fell
through to a hardcoded fallback with opposite label polarity, inverting
every single classification (human->AI, AI->human).

Fix: normalise id2label keys to int before lookup. Add diagnostic logging.

Files changed (14) hide show
  1. .env.example +79 -0
  2. .gitattributes +3 -0
  3. .gitignore +60 -0
  4. Dockerfile +40 -0
  5. README.md +93 -0
  6. audio_utils.py +182 -0
  7. config.py +185 -0
  8. fraud_language.py +191 -0
  9. llm_semantic_analyzer.py +253 -0
  10. main.py +1903 -0
  11. model.py +563 -0
  12. privacy_utils.py +54 -0
  13. requirements.txt +23 -0
  14. speech_to_text.py +158 -0
.env.example ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment Variables
2
+ # Copy this file to .env and update values
3
+
4
+ # API Key for authentication (Must be set!)
5
+ API_KEY=your_secure_api_key_here
6
+
7
+ # Server port (Hugging Face uses 7860)
8
+ PORT=7860
9
+
10
+ # Optional CORS origins
11
+ # Example: ALLOWED_ORIGINS=https://your-ui.vercel.app,http://localhost:5173
12
+ ALLOWED_ORIGINS=*
13
+
14
+ # Realtime ASR settings
15
+ ASR_ENABLED=true
16
+ ASR_MODEL_SIZE=tiny
17
+ ASR_COMPUTE_TYPE=int8
18
+ ASR_BEAM_SIZE=1
19
+ ASR_TIMEOUT_MS=1200
20
+ ASR_MAX_INFLIGHT_TASKS=1
21
+ ASR_WARMUP_ENABLED=true
22
+ AUDIO_PIPELINE_WARMUP_ENABLED=true
23
+ VOICE_WARMUP_ENABLED=true
24
+
25
+ # Voice model settings
26
+ VOICE_MODEL_ID=shivam-2211/voice-detection-model
27
+ VOICE_MODEL_BACKUP_ID=mo-thecreator/Deepfake-audio-detection
28
+ VOICE_MODEL_LOCAL_PATH=./fine_tuned_model
29
+ REALTIME_LIGHTWEIGHT_AUDIO=true
30
+ LEGACY_FALLBACK_RETURNS_UNCERTAIN=true
31
+
32
+ # Privacy and retention defaults
33
+ MASK_TRANSCRIPT_OUTPUT=true
34
+ SESSION_ACTIVE_RETENTION_SECONDS=1800
35
+ SESSION_ENDED_RETENTION_SECONDS=300
36
+
37
+ # Realtime risk policy tuning
38
+ RISK_POLICY_VERSION=v1.2
39
+ RISK_WEIGHT_AUDIO=0.45
40
+ RISK_WEIGHT_KEYWORD=0.20
41
+ RISK_WEIGHT_SEMANTIC=0.15
42
+ RISK_WEIGHT_BEHAVIOUR=0.20
43
+ RISK_DELTA_BOOST_FACTOR=0.30
44
+
45
+ # Optional LLM semantic verifier (second-layer, disabled by default)
46
+ LLM_SEMANTIC_ENABLED=false
47
+ LLM_PROVIDER=gemini
48
+ # Optional override (openai example: gpt-4o-mini, gemini example: gemini-1.5-flash)
49
+ LLM_SEMANTIC_MODEL=
50
+ LLM_SEMANTIC_TIMEOUT_MS=900
51
+ LLM_SEMANTIC_MIN_ASR_CONFIDENCE=0.35
52
+ LLM_SEMANTIC_CHUNK_INTERVAL=2
53
+ LLM_SEMANTIC_BLEND_WEIGHT=0.20
54
+ OPENAI_API_KEY=
55
+
56
+ # Gemini provider key (used when LLM_PROVIDER=gemini)
57
+ GEMINI_API_KEY=
58
+
59
+ # Session store backend
60
+ # memory = current single-instance behavior
61
+ # redis = required for multi-worker / restart-safe sessions
62
+ SESSION_STORE_BACKEND=memory
63
+ REDIS_URL=
64
+ REDIS_PREFIX=ai_call_shield
65
+ REDIS_CONNECT_TIMEOUT_MS=2000
66
+ REDIS_IO_TIMEOUT_MS=2000
67
+
68
+ # Deep-lane async verification (future-ready toggles)
69
+ DEEP_LANE_ENABLED=false
70
+ DEEP_LANE_QUEUE_BACKEND=memory
71
+ DEEP_LANE_MAX_WORKERS=2
72
+ DEEP_LANE_MAX_RETRIES=1
73
+ DEEP_LANE_RETRY_BACKOFF_MS=500
74
+ DEEP_LANE_TARGET_LATENCY_MS=3000
75
+
76
+ # Performance budgets for harness and CI gates
77
+ PERF_CHUNK_P95_TARGET_MS=1200
78
+ PERF_ALERT_P95_TARGET_MS=2500
79
+
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fine_tuned_model/model.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ .venv/
3
+ venv/
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ *.pyd
8
+ .pytest_cache/
9
+ .coverage
10
+ .coverage.*
11
+ htmlcov/
12
+
13
+ # Environment and secrets
14
+ .env
15
+ .env.*
16
+ !.env.example
17
+
18
+ # Local AI/tooling folders
19
+ .agent/
20
+ .agents/
21
+ .codex/
22
+ .claude/
23
+ .gemini/
24
+ .trae/
25
+ .windsurf/
26
+
27
+ # OS / editor
28
+ .DS_Store
29
+ Thumbs.db
30
+ *.log
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+
35
+ # Large artifacts
36
+ *.mp4
37
+ *.wav
38
+ *.mp3
39
+ fine_tuned_model/
40
+ training/
41
+
42
+ # === Non-production files (keep out of HF Space) ===
43
+
44
+ # Tests
45
+ tests/
46
+ pytest.ini
47
+
48
+ # Docs and reports
49
+ docs/
50
+
51
+ # Dev/validation scripts
52
+ scripts/
53
+ scenario_validation_cases.py
54
+
55
+ # Test request fixtures
56
+ test_request.json
57
+ test_valid.json
58
+
59
+ # Helper/patch scripts
60
+ _fix_*.py
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies for librosa and audio processing
6
+ RUN apt-get update && apt-get install -y \
7
+ libsndfile1 \
8
+ ffmpeg \
9
+ git \
10
+ git-lfs \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Initialize git lfs
14
+ RUN git lfs install
15
+
16
+ # Copy requirements first for better caching
17
+ COPY requirements.txt .
18
+
19
+ # Install CPU-only PyTorch first (smaller size)
20
+ RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
21
+
22
+ # Install other dependencies
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Copy application code and model
26
+ COPY . .
27
+
28
+ # Create a non-root user for HF Spaces
29
+ RUN useradd -m -u 1000 user
30
+ USER user
31
+ ENV HOME=/home/user \
32
+ PATH=/home/user/.local/bin:$PATH
33
+
34
+ WORKDIR /app
35
+
36
+ # Hugging Face Spaces uses port 7860
37
+ EXPOSE 7860
38
+
39
+ # Run the application
40
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Voice Detection API
3
+ emoji: 🎤
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ app_port: 7860
10
+ ---
11
+
12
+ # AI Voice Detection API
13
+
14
+ Detects whether a voice sample is AI-generated or spoken by a real human using a fine-tuned Wav2Vec2 model.
15
+
16
+ ## API Endpoint
17
+
18
+ `POST /api/voice-detection`
19
+
20
+ ### Headers
21
+ - `x-api-key`: Your API key (set via environment variable `API_KEY`)
22
+
23
+ ### Request Body
24
+ ```json
25
+ {
26
+ "language": "English",
27
+ "audioFormat": "mp3",
28
+ "audioBase64": "<base64-encoded-audio>"
29
+ }
30
+ ```
31
+
32
+ ### Response
33
+ ```json
34
+ {
35
+ "status": "success",
36
+ "language": "English",
37
+ "classification": "AI_GENERATED" | "HUMAN",
38
+ "confidenceScore": 0.95,
39
+ "explanation": "AI voice indicators: ..."
40
+ }
41
+ ```
42
+
43
+ ## Supported Languages
44
+ - English
45
+ - Tamil
46
+ - Hindi
47
+ - Malayalam
48
+ - Telugu
49
+
50
+
51
+
52
+ ## Realtime Session APIs
53
+
54
+ The backend also supports session-based realtime analysis:
55
+
56
+ - `POST /v1/session/start`
57
+ - `POST /v1/session/{session_id}/chunk`
58
+ - `GET /v1/session/{session_id}/summary`
59
+ - `GET /v1/session/{session_id}/alerts`
60
+ - `POST /v1/session/{session_id}/end`
61
+
62
+ Compatibility aliases are available under `/api/voice-detection/v1/...`.
63
+
64
+ ## Optional LLM Semantic Verifier
65
+
66
+ A second-layer semantic verifier can be enabled to improve ambiguous chunk scoring:
67
+
68
+ - `LLM_SEMANTIC_ENABLED=true`
69
+ - `LLM_PROVIDER=openai` with `OPENAI_API_KEY=<your_key>`, or
70
+ - `LLM_PROVIDER=gemini` with `GEMINI_API_KEY=<your_key>`
71
+ - Tune with `LLM_SEMANTIC_*` env variables in `.env.example`.
72
+
73
+ If `LLM_SEMANTIC_MODEL` is empty, provider defaults are used (`gpt-4o-mini` for OpenAI, `gemini-1.5-flash` for Gemini).
74
+
75
+ The LLM layer is optional and the API continues to work when disabled.
76
+
77
+
78
+ ## Session Store Backend
79
+
80
+ Realtime sessions support two backends:
81
+
82
+ - `memory` (default): single-instance, volatile
83
+ - `redis`: multi-worker and restart-safe (recommended for finals)
84
+
85
+ Backend env settings:
86
+
87
+ - `SESSION_STORE_BACKEND=redis`
88
+ - `REDIS_URL=redis://...` (or `rediss://...`)
89
+ - `REDIS_PREFIX=ai_call_shield`
90
+
91
+ `GET /health` now includes `session_store_backend` so you can verify active backend.
92
+
93
+ See `docs/architecture/redis-credentials-guide.md` for credential formats and setup steps.
audio_utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio utility functions for Base64 decoding and audio loading.
3
+ """
4
+ import base64
5
+ import io
6
+ import tempfile
7
+ import os
8
+ import logging
9
+ from typing import Tuple, Optional
10
+ import numpy as np
11
+
12
+ # Configure logging
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Magic bytes for common audio formats
16
+ AUDIO_MAGIC_BYTES = {
17
+ b'\xff\xfb': 'mp3', # MP3 (MPEG Audio Layer 3)
18
+ b'\xff\xfa': 'mp3', # MP3 variant
19
+ b'\xff\xf3': 'mp3', # MP3 variant
20
+ b'\xff\xf2': 'mp3', # MP3 variant
21
+ b'ID3': 'mp3', # MP3 with ID3 tag
22
+ b'RIFF': 'wav', # WAV
23
+ b'fLaC': 'flac', # FLAC
24
+ b'OggS': 'ogg', # OGG
25
+ b'\x00\x00\x00': 'm4a', # M4A/MP4 (ftyp box)
26
+ }
27
+
28
+
29
+ def validate_audio_content(audio_bytes: bytes) -> Tuple[bool, str]:
30
+ """
31
+ Validate that the bytes actually contain audio data.
32
+
33
+ Args:
34
+ audio_bytes: Raw bytes to validate
35
+
36
+ Returns:
37
+ Tuple of (is_valid, detected_format_or_error_message)
38
+ """
39
+ if len(audio_bytes) < 12:
40
+ return False, "Audio data too small to be valid"
41
+
42
+ # Check for text content (common mistake: uploading CSV/JSON as audio)
43
+ # ASCII printable range check on first 100 bytes
44
+ sample = audio_bytes[:100]
45
+ printable_ratio = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13)) / len(sample)
46
+ if printable_ratio > 0.9:
47
+ # Likely text content
48
+ preview = sample[:50].decode('utf-8', errors='replace')
49
+ return False, f"File appears to be text, not audio. Preview: {preview[:30]}..."
50
+
51
+ # Check magic bytes
52
+ for magic, fmt in AUDIO_MAGIC_BYTES.items():
53
+ if audio_bytes.startswith(magic):
54
+ return True, fmt
55
+
56
+ # Check for M4A/MP4 (ftyp at offset 4)
57
+ if len(audio_bytes) > 8 and audio_bytes[4:8] == b'ftyp':
58
+ return True, "m4a"
59
+
60
+ # Unknown format but not text - allow it and let librosa try
61
+ logger.warning("Unknown audio format, attempting to load anyway")
62
+ return True, "unknown"
63
+
64
+
65
def decode_base64_audio(base64_string: str) -> bytes:
    """
    Decode a Base64-encoded audio payload into raw bytes.

    Accepts plain Base64 as well as full data URIs
    ("data:audio/...;base64,<payload>").

    Args:
        base64_string: Base64-encoded audio data

    Returns:
        Raw audio bytes

    Raises:
        ValueError: If the Base64 string is invalid
    """
    try:
        # Data URIs carry the payload after the first comma; keep only that part.
        head, sep, tail = base64_string.partition(",")
        payload = tail if sep else head
        return base64.b64decode(payload.strip())
    except Exception as e:
        raise ValueError(f"Invalid Base64 encoding: {str(e)}")
89
+
90
+
91
def load_audio_from_bytes(audio_bytes: bytes, target_sr: int = 22050, audio_format: str = "mp3") -> Tuple[np.ndarray, int]:
    """
    Decode raw audio file bytes into a mono waveform using librosa.

    Args:
        audio_bytes: Raw audio file bytes
        target_sr: Target sample rate (default 22050 Hz)
        audio_format: Audio format extension (mp3, wav, flac, ogg, m4a, mp4)

    Returns:
        Tuple of (audio waveform as numpy array, sample rate)

    Raises:
        ValueError: If audio cannot be loaded or is invalid
    """
    # Cheap structural validation before spending time on decoding.
    ok, detail = validate_audio_content(audio_bytes)
    if not ok:
        raise ValueError(f"Invalid audio file: {detail}")

    logger.info(f"Audio validation passed. Detected format hint: {detail}")

    temp_path = None
    try:
        # Imported lazily so module import stays cheap; importing soundfile
        # also verifies the backend librosa relies on is available.
        import librosa
        import soundfile as sf  # noqa: F401

        # Normalize the extension hint ("MP3", ".mp3" -> "mp3").
        fmt = audio_format.lower().strip()
        fmt = fmt.removeprefix(".")

        # Guard against path injection via the extension used below in the
        # temp-file suffix (security).
        if not fmt.isalnum() or len(fmt) > 5:
            raise ValueError(f"Invalid audio format: {fmt}")

        # librosa wants a real file path, so stage the bytes in a temp file.
        with tempfile.NamedTemporaryFile(suffix=f".{fmt}", delete=False) as handle:
            handle.write(audio_bytes)
            temp_path = handle.name

        waveform, sample_rate = librosa.load(temp_path, sr=target_sr, mono=True)

        if len(waveform) == 0:
            raise ValueError("Audio file is empty or could not be decoded")

        logger.info(f"Audio loaded successfully: {len(waveform) / sample_rate:.2f}s at {sample_rate}Hz")
        return waveform, sample_rate

    except Exception as e:
        raise ValueError(f"Failed to load audio: {str(e)}")
    finally:
        # Best-effort temp-file cleanup regardless of success or failure.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
153
+
154
+
155
def get_audio_duration(audio: np.ndarray, sr: int) -> float:
    """
    Return the duration of the waveform in seconds.

    Args:
        audio: Audio waveform samples
        sr: Sample rate in Hz

    Returns:
        Duration in seconds (sample count divided by sample rate)
    """
    sample_count = len(audio)
    return sample_count / sr
167
+
168
+
169
def normalize_audio(audio: np.ndarray) -> np.ndarray:
    """
    Peak-normalize the waveform so its maximum absolute amplitude is 1.0.

    Silent (all-zero) input is returned unchanged to avoid division by zero.

    Args:
        audio: Audio waveform

    Returns:
        Normalized audio
    """
    peak = np.abs(audio).max()
    return audio / peak if peak > 0 else audio
config.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management using Pydantic Settings.
3
+ """
4
+ from pydantic_settings import BaseSettings
5
+ from typing import List
6
+ from pydantic import Field
7
+
8
+
9
class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    # --- Core API settings ---
    API_KEY: str = Field(..., description="API Key for authentication")
    PORT: int = Field(7860, description="Server port")
    WEBSITE_URL: str = Field(
        default="https://voice-detection-nu.vercel.app/",
        description="Project or Portfolio URL"
    )

    # --- CORS settings ---
    # Stored as a raw string (aliased to ALLOWED_ORIGINS) so pydantic does not
    # attempt to JSON-parse the env var; the property below handles parsing.
    ALLOWED_ORIGINS_RAW: str = Field(default="*", alias="ALLOWED_ORIGINS")

    @property
    def ALLOWED_ORIGINS(self) -> List[str]:
        """Parse the raw CORS origins string into a list."""
        raw = self.ALLOWED_ORIGINS_RAW
        if raw.strip().startswith("["):
            # JSON-array form, e.g. '["https://a.example", "https://b.example"]'
            import json
            try:
                return json.loads(raw)
            except json.JSONDecodeError:
                pass  # fall back to comma-separated parsing
        return [origin.strip() for origin in raw.split(",") if origin.strip()]

    # --- Audio constraints ---
    MAX_AUDIO_SIZE_MB: int = 10
    SUPPORTED_LANGUAGES: List[str] = [
        "Tamil", "English", "Hindi", "Malayalam", "Telugu"
    ]
    SUPPORTED_FORMATS: List[str] = [
        "mp3", "wav", "flac", "ogg", "m4a", "mp4"
    ]

    # --- ASR settings ---
    ASR_ENABLED: bool = Field(default=True, description="Enable speech-to-text analysis for realtime sessions")
    ASR_MODEL_SIZE: str = Field(default="tiny", description="faster-whisper model size")
    ASR_COMPUTE_TYPE: str = Field(default="int8", description="faster-whisper compute type")
    ASR_BEAM_SIZE: int = Field(default=1, description="Beam size for ASR decoding")
    ASR_TIMEOUT_MS: int = Field(
        default=2500,
        ge=200,
        le=15000,
        description="Max realtime ASR duration per chunk before timeout fallback"
    )
    ASR_MAX_INFLIGHT_TASKS: int = Field(
        default=1,
        ge=1,
        le=8,
        description="Maximum concurrent ASR background tasks allowed to prevent thread pileups"
    )
    ASR_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm faster-whisper model during startup to avoid first-chunk latency spike"
    )
    AUDIO_PIPELINE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm audio decoding/resampling pipeline during startup"
    )
    VOICE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Run one startup inference through voice analyzer to avoid first-chunk latency spikes"
    )

    # --- Voice classification model settings ---
    VOICE_MODEL_ID: str = Field(
        default="shivam-2211/voice-detection-model",
        description="Primary Hugging Face model id for AI voice detection"
    )
    VOICE_MODEL_BACKUP_ID: str = Field(
        default="mo-thecreator/Deepfake-audio-detection",
        description="Backup model id if primary model load fails"
    )
    VOICE_MODEL_LOCAL_PATH: str = Field(
        default="./fine_tuned_model",
        description="Optional local model path that takes priority when present"
    )
    REALTIME_LIGHTWEIGHT_AUDIO: bool = Field(
        default=False,
        description="Use lightweight audio analysis path for realtime chunk processing (set true for throughput-first mode)"
    )
    LEGACY_FALLBACK_RETURNS_UNCERTAIN: bool = Field(
        default=True,
        description="Return UNCERTAIN classification on legacy endpoint when ML fallback occurs"
    )

    # --- Risk policy (versioned + configurable weights) ---
    RISK_POLICY_VERSION: str = Field(default="v1.2", description="Version tag for realtime risk policy")
    RISK_WEIGHT_AUDIO: float = Field(default=0.45, ge=0.0, le=1.0)
    RISK_WEIGHT_KEYWORD: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_WEIGHT_SEMANTIC: float = Field(default=0.15, ge=0.0, le=1.0)
    RISK_WEIGHT_BEHAVIOUR: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_DELTA_BOOST_FACTOR: float = Field(
        default=0.30,
        ge=0.0,
        le=1.0,
        description="How strongly risk increases when per-chunk delta is positive"
    )

    # --- Optional LLM semantic verifier (second-layer, not primary classifier) ---
    LLM_SEMANTIC_ENABLED: bool = Field(default=False)
    LLM_PROVIDER: str = Field(default="openai", description="LLM provider: openai or gemini")
    LLM_SEMANTIC_MODEL: str = Field(default="", description="Model name for selected LLM provider (optional)")
    LLM_SEMANTIC_TIMEOUT_MS: int = Field(default=900, ge=100, le=5000)
    LLM_SEMANTIC_MIN_ASR_CONFIDENCE: float = Field(default=0.35, ge=0.0, le=1.0)
    LLM_SEMANTIC_CHUNK_INTERVAL: int = Field(default=2, ge=1, le=20)
    LLM_SEMANTIC_BLEND_WEIGHT: float = Field(
        default=0.20,
        ge=0.0,
        le=1.0,
        description="Weight assigned to LLM semantic score in fused semantic score"
    )
    OPENAI_API_KEY: str | None = Field(default=None, description="Optional OpenAI API key for LLM semantic verifier")
    GEMINI_API_KEY: str | None = Field(default=None, description="Optional Gemini API key for LLM semantic verifier")

    # --- Session store backend ---
    SESSION_STORE_BACKEND: str = Field(
        default="memory",
        description="Session store backend: memory or redis"
    )
    REDIS_URL: str | None = Field(
        default=None,
        description="Redis URL for session state and queue (required when SESSION_STORE_BACKEND=redis)"
    )
    REDIS_PREFIX: str = Field(
        default="ai_call_shield",
        description="Redis key prefix namespace"
    )
    REDIS_CONNECT_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)
    REDIS_IO_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)

    # --- Deep-lane async verification controls ---
    DEEP_LANE_ENABLED: bool = Field(
        default=False,
        description="Enable asynchronous deep-lane verification after fast-lane decision"
    )
    DEEP_LANE_QUEUE_BACKEND: str = Field(
        default="memory",
        description="Queue backend: memory or redis"
    )
    DEEP_LANE_MAX_WORKERS: int = Field(default=2, ge=1, le=16)
    DEEP_LANE_MAX_RETRIES: int = Field(default=1, ge=0, le=10)
    DEEP_LANE_RETRY_BACKOFF_MS: int = Field(default=500, ge=0, le=60000)
    DEEP_LANE_TARGET_LATENCY_MS: int = Field(default=3000, ge=200, le=10000)

    # --- Performance targets (for harness/reporting and CI gates) ---
    PERF_CHUNK_P95_TARGET_MS: int = Field(default=1200, ge=100, le=10000)
    PERF_ALERT_P95_TARGET_MS: int = Field(default=2500, ge=100, le=10000)

    # --- Session retention and privacy controls ---
    SESSION_ACTIVE_RETENTION_SECONDS: int = Field(
        default=1800,
        description="Retention TTL for active sessions with no updates"
    )
    SESSION_ENDED_RETENTION_SECONDS: int = Field(
        default=300,
        description="Retention TTL for ended sessions before purge"
    )
    MASK_TRANSCRIPT_OUTPUT: bool = Field(
        default=True,
        description="Mask sensitive entities from transcript before returning response"
    )

    # --- Environment specific ---
    SPACE_ID: str | None = Field(default=None, description="Hugging Face Space ID if running in Spaces")

    model_config = {
        "env_file": ".env",
        "case_sensitive": True,
        "extra": "ignore"
    }
182
+
183
+
184
# Shared module-level settings singleton.
settings = Settings()
fraud_language.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword and semantic fraud signal extraction from transcripts.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ import string
8
+ from typing import Any, Dict, List, Set
9
+
10
+ # Baseline keywords that are language-agnostic or commonly spoken in English/Hinglish.
11
+ COMMON_FRAUD_KEYWORDS: Dict[str, Set[str]] = {
12
+ "financial": {
13
+ "bank account", "account", "credit card", "debit card", "loan", "khata",
14
+ },
15
+ "payment": {
16
+ "upi", "upi id", "gpay", "google pay", "phonepe", "paytm", "neft", "rtgs",
17
+ "send money", "transfer money", "payment",
18
+ },
19
+ "authentication": {
20
+ "otp", "pin", "password", "cvv", "verification code", "passcode",
21
+ },
22
+ "urgency": {
23
+ "urgent", "immediately", "right now", "now", "last chance", "today only",
24
+ "abhi", "turant", "jaldi",
25
+ },
26
+ "threat": {
27
+ "blocked", "suspended", "legal action", "police", "arrest", "freeze",
28
+ },
29
+ "impersonation": {
30
+ "rbi", "bank manager", "government", "income tax", "customs", "official",
31
+ },
32
+ "offer_lure": {
33
+ "lottery", "prize", "winner", "cashback", "free", "reward",
34
+ },
35
+ }
36
+
37
+ # Language-specific script and phrase variants to improve 5-language support.
38
+ LANGUAGE_FRAUD_KEYWORDS: Dict[str, Dict[str, Set[str]]] = {
39
+ "Hindi": {
40
+ "financial": {"बैंक", "खाता", "अकाउंट", "लोन"},
41
+ "payment": {"यूपीआई", "युपीआई", "भुगतान", "पैसे भेजो", "ट्रांसफर", "गूगल पे", "फोनपे", "पेटीएम"},
42
+ "authentication": {"ओटीपी", "पिन", "पासवर्ड", "सत्यापन कोड"},
43
+ "urgency": {"अभी", "तुरंत", "जल्दी", "फौरन", "अंतिम मौका"},
44
+ "threat": {"ब्लॉक", "निलंबित", "कानूनी कार्रवाई", "गिरफ्तार", "फ्रीज"},
45
+ "impersonation": {"आरबीआई", "सरकारी अधिकारी", "बैंक मैनेजर", "इनकम टैक्स"},
46
+ "offer_lure": {"लॉटरी", "इनाम", "कैशबैक", "फ्री", "रिवॉर्ड"},
47
+ },
48
+ "Tamil": {
49
+ "financial": {"வங்கி", "கணக்கு", "அக்கவுண்ட்", "கடன்"},
50
+ "payment": {"யுபிஐ", "கூகுள் பே", "போன்பே", "பேடிஎம்", "பணம் அனுப்பு", "பணம் பரிமாற்றம்", "கட்டணம்"},
51
+ "authentication": {"ஓடிபி", "பின்", "கடவுச்சொல்", "சரிபார்ப்பு குறியீடு"},
52
+ "urgency": {"உடனே", "இப்போதே", "விரைவாக", "இப்போது", "அவசரம்"},
53
+ "threat": {"முடக்கப்படும்", "தடைசெய்யப்படும்", "சட்ட நடவடிக்கை", "காவல்", "உறையவைக்கப்படும்"},
54
+ "impersonation": {"ஆர்பிஐ", "அரசு அதிகாரி", "வங்கி மேலாளர்", "வருமானவரி"},
55
+ "offer_lure": {"லாட்டரி", "பரிசு", "கேஷ்பேக்", "இலவசம்", "வெற்றி"},
56
+ },
57
+ "Malayalam": {
58
+ "financial": {"ബാങ്ക്", "അക്കൗണ്ട്", "ഖാത", "ലോൺ"},
59
+ "payment": {"യുപിഐ", "ഗൂഗിൾ പേ", "ഫോൺപേ", "പേടിഎം", "പണം അയക്കൂ", "പേയ്മെന്റ്", "ട്രാൻസ്ഫർ"},
60
+ "authentication": {"ഒടിപി", "പിൻ", "പാസ്‌വേഡ്", "സ്ഥിരീകരണ കോഡ്"},
61
+ "urgency": {"ഉടൻ", "ഇപ്പോള്", "തൽക്ഷണം", "വേഗം", "അവസരം"},
62
+ "threat": {"ബ്ലോക്ക്", "സസ്പെൻഡ്", "നിയമ നടപടി", "അറസ്റ്റ്", "ഫ്രീസ്"},
63
+ "impersonation": {"ആർബിഐ", "സർക്കാർ ഓഫീസർ", "ബാങ്ക് മാനേജർ", "ഇൻകം ടാക്സ്"},
64
+ "offer_lure": {"ലോട്ടറി", "സമ്മാനം", "കാഷ്ബാക്ക്", "ഫ്രീ", "റിവാർഡ്"},
65
+ },
66
+ "Telugu": {
67
+ "financial": {"బ్యాంక్", "ఖాతా", "అకౌంట్", "లోన్"},
68
+ "payment": {"యూపీఐ", "గూగుల్ పే", "ఫోన్‌పే", "పేటిఎం", "డబ్బు పంపండి", "చెల్లింపు", "ట్రాన్స్‌ఫర్"},
69
+ "authentication": {"ఓటిపి", "పిన్", "పాస్‌వర్డ్", "ధృవీకరణ కోడ్"},
70
+ "urgency": {"వెంటనే", "ఇ��్పుడే", "తక్షణం", "త్వరగా", "చివరి అవకాశం"},
71
+ "threat": {"బ్లాక్", "సస్పెండ్", "చట్టపరమైన చర్య", "అరెస్ట్", "ఫ్రీజ్"},
72
+ "impersonation": {"ఆర్బిఐ", "ప్రభుత్వ అధికారి", "బ్యాంక్ మేనేజర్", "ఇన్కమ్ ట్యాక్స్"},
73
+ "offer_lure": {"లాటరీ", "బహుమతి", "క్యాష్‌బ్యాక్", "ఉచితం", "రివార్డు"},
74
+ },
75
+ }
76
+
77
+ PUNCT_TRANSLATION = str.maketrans({ch: " " for ch in (string.punctuation + "“”‘’…–—।॥،؛")})
78
+
79
+
80
def _normalize_text(text: str) -> str:
    """
    Casefold, strip punctuation, and collapse whitespace.

    Avoids ASCII-only regex stripping so Indic-script keywords remain
    searchable in the normalized output.
    """
    cleaned = text.casefold().translate(PUNCT_TRANSLATION)
    return re.sub(r"\s+", " ", cleaned).strip()
89
+
90
+
91
def _combined_keyword_catalog(language: str | None) -> Dict[str, Set[str]]:
    """
    Merge the common keyword catalog with language-specific variants.

    When the language is unknown or unsupported, every language map is merged
    so mixed-language transcripts are still covered.
    """
    catalog: Dict[str, Set[str]] = {
        category: set(words) for category, words in COMMON_FRAUD_KEYWORDS.items()
    }

    if language and language in LANGUAGE_FRAUD_KEYWORDS:
        selected_maps = [LANGUAGE_FRAUD_KEYWORDS[language]]
    else:
        selected_maps = list(LANGUAGE_FRAUD_KEYWORDS.values())

    for lang_map in selected_maps:
        for category, words in lang_map.items():
            catalog.setdefault(category, set()).update(words)

    return catalog
106
+
107
+
108
def _contains_keyword(normalized_text: str, token_set: Set[str], keyword: str) -> bool:
    """Check one keyword: phrases via substring scan, single words via exact token."""
    key = _normalize_text(keyword)
    if not key:
        return False
    # Multi-word phrases need a substring search; single words must match a
    # whole token to avoid partial-word false positives.
    return key in normalized_text if " " in key else key in token_set
115
+
116
+
117
def _match_keywords(normalized_text: str, catalog: Dict[str, Set[str]]) -> Dict[str, List[str]]:
    """Return {category: sorted keyword hits} for every category with a match."""
    tokens = set(normalized_text.split())
    matches: Dict[str, List[str]] = {}

    for category, keywords in catalog.items():
        found = sorted(kw for kw in keywords if _contains_keyword(normalized_text, tokens, kw))
        if found:
            matches[category] = found

    return matches
126
+
127
+
128
def analyze_transcript(transcript: str, language: str | None = None) -> Dict[str, Any]:
    """
    Extract keyword, semantic, and behavioural fraud signals from a transcript.

    Returns a dict with keyword hits/categories/score, semantic flags/score,
    and deduplicated behaviour signals; scores are capped at 100.
    """
    if not transcript:
        # No text: return an all-empty/zero signal payload.
        return {
            "keyword_hits": [],
            "keyword_categories": [],
            "keyword_score": 0,
            "semantic_flags": [],
            "semantic_score": 0,
            "behaviour_signals": [],
        }

    normalized = _normalize_text(transcript)
    found = _match_keywords(normalized, _combined_keyword_catalog(language))

    keyword_hits: List[str] = [
        f"{category}:{hit}"
        for category, hits in sorted(found.items())
        for hit in hits
    ]
    categories = sorted(found)
    # 7 points per individual hit plus 12 per distinct category, capped at 100.
    keyword_score = min(100, len(keyword_hits) * 7 + len(categories) * 12)

    flags: List[str] = []
    behaviours: List[str] = []

    if "urgency" in found:
        flags.append("urgency_language")
        behaviours.append("urgency_escalation")
    if "impersonation" in found:
        flags.append("authority_impersonation")
    if "authentication" in found:
        flags.append("credential_request")
    if "payment" in found:
        flags.append("payment_redirection")
    if "threat" in found:
        flags.append("coercive_threat_language")
    if "offer_lure" in found:
        flags.append("incentive_lure")

    semantic_score = min(100, len(flags) * 14)

    # Signal combinations indicate stronger fraud patterns: boost the score
    # and record the combined behaviour.
    if "impersonation" in found and "authentication" in found:
        semantic_score = min(100, semantic_score + 18)
        behaviours.append("authority_with_credential_request")
    if "payment" in found and "urgency" in found:
        semantic_score = min(100, semantic_score + 14)
        behaviours.append("urgent_payment_pressure")
    if "threat" in found and "urgency" in found:
        semantic_score = min(100, semantic_score + 10)
        behaviours.append("threat_plus_urgency")

    return {
        "keyword_hits": keyword_hits,
        "keyword_categories": categories,
        "keyword_score": keyword_score,
        "semantic_flags": flags,
        "semantic_score": semantic_score,
        "behaviour_signals": sorted(set(behaviours)),
    }
llm_semantic_analyzer.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Optional LLM semantic verifier for realtime transcript analysis.
3
+
4
+ This is a second-layer signal meant for ambiguous/uncertain chunks.
5
+ It must never block realtime flow.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ import re
12
+ from typing import Any, Dict, Optional
13
+
14
+ import httpx
15
+
16
+ from config import settings
17
+ from privacy_utils import mask_sensitive_entities
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def _clamp_int(value: Any, lo: int = 0, hi: int = 100) -> int:
23
+ try:
24
+ parsed = int(round(float(value)))
25
+ except (TypeError, ValueError):
26
+ return lo
27
+ return max(lo, min(hi, parsed))
28
+
29
+
30
+ def _clamp_float(value: Any, lo: float = 0.0, hi: float = 1.0) -> float:
31
+ try:
32
+ parsed = float(value)
33
+ except (TypeError, ValueError):
34
+ return lo
35
+ return max(lo, min(hi, parsed))
36
+
37
+
38
+ def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
39
+ if not text:
40
+ return None
41
+
42
+ text = text.strip()
43
+ try:
44
+ parsed = json.loads(text)
45
+ if isinstance(parsed, dict):
46
+ return parsed
47
+ except json.JSONDecodeError:
48
+ pass
49
+
50
+ match = re.search(r"\{[\s\S]*\}", text)
51
+ if not match:
52
+ return None
53
+
54
+ try:
55
+ parsed = json.loads(match.group(0))
56
+ return parsed if isinstance(parsed, dict) else None
57
+ except json.JSONDecodeError:
58
+ return None
59
+
60
+
61
def _resolve_provider() -> str:
    """Normalise the configured LLM provider name to 'openai' or 'gemini'."""
    raw = getattr(settings, "LLM_PROVIDER", "openai") or "openai"
    name = str(raw).strip().lower()
    # "google" is accepted as an alias for the Gemini backend.
    return "gemini" if name in {"gemini", "google"} else "openai"
66
+
67
+
68
def _resolve_model(provider: str) -> str:
    """Pick the model id: an explicit setting wins, else a per-provider default."""
    override = str(getattr(settings, "LLM_SEMANTIC_MODEL", "") or "").strip()
    if override:
        return override
    return "gemini-1.5-flash" if provider == "gemini" else "gpt-4o-mini"
75
+
76
+
77
def _provider_api_key(provider: str) -> Optional[str]:
    """Return the API key configured for *provider* (None when unset)."""
    attr = "GEMINI_API_KEY" if provider == "gemini" else "OPENAI_API_KEY"
    return getattr(settings, attr, None)
81
+
82
+
83
def is_llm_semantic_provider_ready() -> bool:
    """Return True when selected provider has required credentials."""
    return bool(_provider_api_key(_resolve_provider()))
87
+
88
+
89
def _normalized_response(data: Dict[str, Any], model_name: str, engine_name: str) -> Dict[str, Any]:
    """Coerce a parsed LLM payload into the stable response schema.

    Scores are clamped to their valid ranges, non-list collections are
    replaced with empty lists, and every list entry is stringified (falsy
    entries dropped) so callers never see raw model-produced types.
    """

    def _string_list(value: Any) -> List[str]:
        if not isinstance(value, list):
            return []
        return [str(item) for item in value if item]

    return {
        "available": True,
        "semantic_score": _clamp_int(data.get("semantic_score", 0)),
        "confidence": _clamp_float(data.get("confidence", 0.0)),
        "semantic_flags": _string_list(data.get("semantic_flags")),
        "behaviour_signals": _string_list(data.get("behaviour_signals")),
        "keyword_hints": _string_list(data.get("keyword_hints")),
        "model": model_name,
        "engine": engine_name,
    }
111
+
112
+
113
+ def _build_prompts(language: str, safe_transcript: str) -> tuple[str, str]:
114
+ system_prompt = (
115
+ "You are a telecom fraud intent classifier. "
116
+ "Return ONLY strict JSON with keys: "
117
+ "semantic_score (0-100), confidence (0-1), semantic_flags (string[]), "
118
+ "behaviour_signals (string[]), keyword_hints (string[])."
119
+ )
120
+
121
+ user_prompt = (
122
+ f"Language: {language}\n"
123
+ "Task: detect coercion, impersonation, credential request, and payment pressure.\n"
124
+ f"Transcript: {safe_transcript}"
125
+ )
126
+ return system_prompt, user_prompt
127
+
128
+
129
def _call_openai_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """POST to the OpenAI chat-completions endpoint and normalise the verdict.

    Raises httpx.HTTPStatusError on non-2xx responses; the caller converts
    any exception into a "request_failed" result.
    """
    request_body = {
        "model": model_name,
        "temperature": 0,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    response = client.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=request_body,
    )
    response.raise_for_status()

    body = response.json()
    # Defensive drill-down: a missing choice/message degrades to empty text.
    message_text = body.get("choices", [{}])[0].get("message", {}).get("content", "")
    verdict = _extract_json_object(message_text)
    if verdict is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(verdict, model_name=model_name, engine_name="openai-chat-completions")
165
+
166
+
167
def _call_gemini_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """Call the Gemini generateContent endpoint and normalise the verdict.

    Raises httpx.HTTPStatusError on non-2xx responses; the caller converts
    any exception into a "request_failed" result.
    """
    # Gemini has no separate system role here: both prompts are fused into a
    # single user part, and strict JSON output is requested via responseMimeType.
    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": f"{system_prompt}\n\n{user_prompt}"},
                ],
            }
        ],
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
        },
    }

    # The Generative Language REST API takes the key as a query parameter.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"
    response = client.post(url, params={"key": api_key}, json=payload)
    response.raise_for_status()
    data = response.json()

    # Defensive drill-down: missing candidates/content/parts degrade to "".
    content = (
        data.get("candidates", [{}])[0]
        .get("content", {})
        .get("parts", [{}])[0]
        .get("text", "")
    )
    parsed = _extract_json_object(content)
    if parsed is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(parsed, model_name=model_name, engine_name="gemini-generate-content")
204
+
205
+
206
def analyze_semantic_with_llm(transcript: str, language: str, timeout_ms: Optional[int] = None) -> Dict[str, Any]:
    """
    Analyze transcript semantics via an optional LLM.

    Returns a normalized dict with `available` bool and semantic fields.

    Every failure path (disabled, missing key, masked-away text, network or
    parse error) returns {"available": False, "reason": ...} rather than
    raising, so the realtime pipeline is never blocked by this verifier.
    `timeout_ms`, when given, overrides settings.LLM_SEMANTIC_TIMEOUT_MS.
    """
    if not settings.LLM_SEMANTIC_ENABLED:
        return {"available": False, "reason": "disabled"}

    # Very short transcripts produce noisy verdicts; skip them outright.
    if not transcript or len(transcript.strip()) < 8:
        return {"available": False, "reason": "insufficient_transcript"}

    provider = _resolve_provider()
    api_key = _provider_api_key(provider)
    if not api_key:
        return {"available": False, "reason": f"missing_{provider}_api_key"}

    # Privacy: PII is masked before the transcript leaves the process.
    safe_transcript = mask_sensitive_entities(transcript).strip()
    if not safe_transcript:
        return {"available": False, "reason": "empty_after_masking"}

    # Floor of 100ms guards against zero/negative configured timeouts.
    # NOTE(review): timeout_ms=0 falls through to the configured default.
    timeout_seconds = max(0.1, (timeout_ms or settings.LLM_SEMANTIC_TIMEOUT_MS) / 1000.0)
    model_name = _resolve_model(provider)
    system_prompt, user_prompt = _build_prompts(language, safe_transcript)

    try:
        with httpx.Client(timeout=timeout_seconds) as client:
            if provider == "openai":
                return _call_openai_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )
            if provider == "gemini":
                return _call_gemini_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )

            # _resolve_provider only emits the two handled names, so this is
            # a defensive dead-end kept for future provider additions.
            return {"available": False, "reason": "unsupported_provider"}
    except Exception as exc:  # pragma: no cover - network/runtime dependent
        logger.warning("LLM semantic verifier unavailable (%s): %s", provider, exc)
        return {"available": False, "reason": "request_failed"}
main.py ADDED
@@ -0,0 +1,1903 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for AI-Generated Voice Detection.
3
+
4
+ Endpoint: POST /api/voice-detection
5
+ - Accepts Base64-encoded MP3 audio
6
+ - Returns classification (AI_GENERATED or HUMAN) with confidence score
7
+ """
8
+ import logging
9
+ import asyncio
10
+ import uuid
11
+ import time
12
+ import json
13
+ import io
14
+ from dataclasses import dataclass, field, asdict
15
+ from datetime import datetime, timezone
16
+ from typing import Optional, Any, Dict, List
17
+ from contextlib import asynccontextmanager
18
+ import numpy as np
19
+ from fastapi import FastAPI, HTTPException, Request, Depends, WebSocket, WebSocketDisconnect
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import JSONResponse
22
+ from pydantic import BaseModel, Field, field_validator, ValidationError
23
+ from slowapi import Limiter, _rate_limit_exceeded_handler
24
+ from slowapi.util import get_remote_address
25
+ from slowapi.errors import RateLimitExceeded
26
+
27
+ # Configure logging
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
31
+ )
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Rate limiting
35
+ limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
36
+
37
+ from audio_utils import decode_base64_audio, load_audio_from_bytes
38
+ from model import analyze_voice, AnalysisResult
39
+ from speech_to_text import transcribe_audio
40
+ from fraud_language import analyze_transcript
41
+ from llm_semantic_analyzer import analyze_semantic_with_llm, is_llm_semantic_provider_ready
42
+ from privacy_utils import mask_sensitive_entities, sanitize_for_logging
43
+ from config import settings
44
+
45
+ try:
46
+ import redis # type: ignore
47
+ except Exception: # pragma: no cover - optional dependency
48
+ redis = None
49
+
50
+ # Computed constraints
51
+ MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
52
+
53
+
54
@dataclass
class SessionState:
    """In-memory state for a real-time analysis session (derived data only)."""
    # Identity / lifecycle
    session_id: str
    language: str
    started_at: str          # ISO timestamp set at creation
    status: str = "active"
    # Rolling counters and maxima across processed chunks
    chunks_processed: int = 0
    alerts_triggered: int = 0
    max_risk_score: int = 0
    max_cpi: float = 0.0     # Conversational Pressure Index peak
    # Latest fused verdicts
    final_call_label: str = "UNCERTAIN"
    final_voice_classification: str = "UNCERTAIN"
    final_voice_confidence: float = 0.0
    max_voice_ai_confidence: float = 0.0
    voice_ai_chunks: int = 0
    voice_human_chunks: int = 0
    llm_checks_performed: int = 0
    risk_policy_version: str = settings.RISK_POLICY_VERSION
    # Per-chunk history and aggregated language signals
    risk_history: List[int] = field(default_factory=list)
    transcript_counts: Dict[str, int] = field(default_factory=dict)
    semantic_flag_counts: Dict[str, int] = field(default_factory=dict)
    keyword_category_counts: Dict[str, int] = field(default_factory=dict)
    behaviour_score: int = 0
    session_behaviour_signals: List[str] = field(default_factory=list)
    last_transcript: str = ""
    last_update: Optional[str] = None
    alert_history: List[Dict[str, Any]] = field(default_factory=list)
    llm_last_engine: Optional[str] = None
83
+
84
+
85
+ SESSION_STORE: Dict[str, SessionState] = {}
86
+ SESSION_LOCK = asyncio.Lock()
87
+ SESSION_STORE_BACKEND_ACTIVE = "memory"
88
+ REDIS_CLIENT: Any = None
89
+ ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
90
+ ASR_INFLIGHT_LOCK = asyncio.Lock()
91
+
92
+
93
+
94
def use_redis_session_store() -> bool:
    """Return whether redis-backed session store is active."""
    if REDIS_CLIENT is None:
        return False
    return SESSION_STORE_BACKEND_ACTIVE == "redis"
97
+
98
+
99
def initialize_session_store_backend() -> None:
    """Initialize configured session backend with safe fallback to memory.

    Mutates the module globals SESSION_STORE_BACKEND_ACTIVE and REDIS_CLIENT.
    Any failure along the way (missing package, missing URL, unreachable
    server) degrades to the in-memory store with a warning rather than
    raising, so startup never fails because of the session backend.
    """
    global SESSION_STORE_BACKEND_ACTIVE, REDIS_CLIENT

    configured = str(getattr(settings, "SESSION_STORE_BACKEND", "memory") or "memory").strip().lower()
    if configured != "redis":
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        logger.info("Session store backend: memory")
        return

    # redis is an optional dependency; import failure was swallowed at module top.
    if redis is None:
        logger.warning("Redis backend requested but redis package is not installed. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    redis_url = getattr(settings, "REDIS_URL", None)
    if not redis_url:
        logger.warning("Redis backend requested but REDIS_URL is empty. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    try:
        REDIS_CLIENT = redis.Redis.from_url(
            redis_url,
            decode_responses=True,  # store/fetch JSON strings, not bytes
            socket_connect_timeout=max(0.1, float(settings.REDIS_CONNECT_TIMEOUT_MS) / 1000.0),
            socket_timeout=max(0.1, float(settings.REDIS_IO_TIMEOUT_MS) / 1000.0),
        )
        REDIS_CLIENT.ping()  # fail fast if the server is unreachable
        SESSION_STORE_BACKEND_ACTIVE = "redis"
        logger.info("Session store backend: redis")
    except Exception as exc:
        logger.warning("Failed to initialize redis session store (%s). Falling back to memory store.", exc)
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
137
+
138
+
139
def _session_redis_key(session_id: str) -> str:
    """Build the namespaced redis key under which one session is stored."""
    prefix = settings.REDIS_PREFIX
    return f"{prefix}:session:{session_id}"
141
+
142
+
143
def _serialize_session(session: SessionState) -> str:
    """Serialize a session to compact JSON for redis storage."""
    payload = asdict(session)
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
145
+
146
+
147
def _deserialize_session(raw: Optional[str]) -> Optional[SessionState]:
    """Rebuild a SessionState from its JSON form; any defect yields None."""
    if not raw:
        return None
    try:
        decoded = json.loads(raw)
        # Non-object payloads (and unknown keys, via TypeError below) are rejected.
        return SessionState(**decoded) if isinstance(decoded, dict) else None
    except Exception as exc:
        logger.warning("Failed to deserialize session payload: %s", exc)
        return None
158
+
159
+
160
def get_session_state(session_id: str) -> Optional[SessionState]:
    """Fetch session state from active backend."""
    if not use_redis_session_store():
        return SESSION_STORE.get(session_id)
    stored = REDIS_CLIENT.get(_session_redis_key(session_id))
    return _deserialize_session(stored)
166
+
167
+
168
def save_session_state(session: SessionState) -> None:
    """Persist session state to active backend."""
    if not use_redis_session_store():
        SESSION_STORE[session.session_id] = session
        return
    # Redis entries expire with the session's retention window (min 1s).
    ttl_seconds = max(1, int(session_retention_seconds(session)))
    REDIS_CLIENT.set(
        _session_redis_key(session.session_id),
        _serialize_session(session),
        ex=ttl_seconds,
    )
175
+
176
+
177
def delete_session_state(session_id: str) -> None:
    """Delete session from active backend."""
    if not use_redis_session_store():
        SESSION_STORE.pop(session_id, None)
        return
    REDIS_CLIENT.delete(_session_redis_key(session_id))
183
+
184
+
185
+ def _asr_fallback_result(engine: str) -> Dict[str, Any]:
186
+ return {
187
+ "transcript": "",
188
+ "confidence": 0.0,
189
+ "engine": engine,
190
+ "available": False,
191
+ }
192
+
193
+
194
def _discard_asr_task(task: asyncio.Task) -> None:
    """Done-callback: drop a finished ASR task from the in-flight registry."""
    ASR_INFLIGHT_TASKS.discard(task)
196
+
197
+
198
async def transcribe_audio_guarded(
    audio: np.ndarray,
    sr: int,
    language: str,
    timeout_seconds: float,
    request_id: str,
) -> Dict[str, Any]:
    """Run ASR with timeout and bounded in-flight tasks to avoid thread pileups.

    Returns the transcribe_audio() result dict, or a fallback payload with
    engine "busy"/"timeout"/"error" so callers never block on ASR.
    """
    max_inflight = max(1, int(getattr(settings, "ASR_MAX_INFLIGHT_TASKS", 1)))

    async with ASR_INFLIGHT_LOCK:
        # Prune tasks whose done-callback has not fired yet.
        stale_tasks = [task for task in ASR_INFLIGHT_TASKS if task.done()]
        for stale in stale_tasks:
            ASR_INFLIGHT_TASKS.discard(stale)

        # Backpressure: drop this chunk's ASR rather than queueing more threads.
        if len(ASR_INFLIGHT_TASKS) >= max_inflight:
            logger.warning(
                "[%s] Realtime ASR skipped (inflight=%s, max=%s); continuing without transcript",
                request_id,
                len(ASR_INFLIGHT_TASKS),
                max_inflight,
            )
            return _asr_fallback_result("busy")

    # transcribe_audio is blocking; run it on a worker thread.
    asr_task = asyncio.create_task(asyncio.to_thread(transcribe_audio, audio, sr, language))
    ASR_INFLIGHT_TASKS.add(asr_task)
    asr_task.add_done_callback(_discard_asr_task)

    try:
        # shield(): on timeout the underlying task keeps running to completion
        # (and is removed via its done-callback) instead of being cancelled
        # mid-inference on the worker thread.
        return await asyncio.wait_for(asyncio.shield(asr_task), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        logger.warning(
            "[%s] Realtime ASR timed out after %.0fms; continuing without transcript",
            request_id,
            timeout_seconds * 1000,
        )
        return _asr_fallback_result("timeout")
    except Exception as exc:
        logger.warning("[%s] Realtime ASR path failed: %s; continuing without transcript", request_id, exc)
        return _asr_fallback_result("error")
238
+
239
+
240
def warmup_audio_pipeline() -> None:
    """Warm audio decoding stack to reduce first-request latency spikes."""
    if not settings.AUDIO_PIPELINE_WARMUP_ENABLED:
        return
    try:
        import soundfile as sf

        # One second of silence at 16kHz, round-tripped through the WAV
        # encoder and the project decoder, primes the decode path.
        silence = np.zeros(16000, dtype=np.float32)
        wav_bytes = io.BytesIO()
        sf.write(wav_bytes, silence, 16000, format="WAV", subtype="PCM_16")
        load_audio_from_bytes(wav_bytes.getvalue(), 22050, "wav")
        logger.info("Audio pipeline warm-up complete")
    except Exception as exc:
        logger.warning("Audio pipeline warm-up skipped: %s", exc)
254
+
255
+
256
def warmup_asr_pipeline() -> None:
    """Warm ASR model and transcription path on startup."""
    if not (settings.ASR_ENABLED and settings.ASR_WARMUP_ENABLED):
        return
    try:
        # A second of silence is enough to trigger model load + first decode.
        silence = np.zeros(16000, dtype=np.float32)
        transcribe_audio(silence, 16000, "English")
        logger.info("ASR warm-up complete")
    except Exception as exc:
        logger.warning("ASR warm-up skipped: %s", exc)
266
+
267
+
268
def warmup_voice_pipeline() -> None:
    """Run one inference pass to avoid first realtime-model cold latency spike."""
    if not settings.VOICE_WARMUP_ENABLED:
        return
    try:
        sample_rate = 16000
        duration = 1.0
        total_samples = max(1, int(sample_rate * duration))
        timeline = np.linspace(0.0, duration, total_samples, endpoint=False, dtype=np.float32)
        # Non-silent tone avoids edge-case feature paths and mirrors short speech chunks.
        tone = (0.08 * np.sin(2 * np.pi * 220 * timeline)).astype(np.float32)
        analyze_voice(tone, sample_rate, "English", True)
        logger.info("Voice model warm-up complete")
    except Exception as exc:
        logger.warning("Voice model warm-up skipped: %s", exc)
283
+
284
+
285
def run_startup_warmups() -> None:
    """Run non-critical startup warm-ups for latency-sensitive paths."""
    # Order matters: decode stack first, then models that consume decoded audio.
    for warmup in (warmup_audio_pipeline, warmup_voice_pipeline, warmup_asr_pipeline):
        warmup()
290
+
291
+
292
+ # Detect environment
293
+ if settings.SPACE_ID:
294
+ logger.info(f"Running on HuggingFace Spaces: {settings.SPACE_ID}")
295
+
296
+
297
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifespan events.

    Startup: pick the session backend, preload the ML model, then run the
    warm-up passes off the event loop. Every step is best-effort — a failure
    is logged but never prevents the server from serving requests.
    """
    logger.info("Starting up - preloading ML model...")
    initialize_session_store_backend()
    try:
        from model import preload_model
        preload_model()
        logger.info("ML model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to preload model: {e}")

    try:
        # Warm-ups are blocking; run them on a worker thread so startup
        # doesn't stall the event loop.
        await asyncio.to_thread(run_startup_warmups)
    except Exception as exc:
        logger.warning("Startup warm-ups encountered an issue: %s", exc)

    yield
    # Shutdown
    logger.info("Shutting down...")
317
+
318
+
319
+ from fastapi.responses import RedirectResponse
320
+
321
+ # Initialize FastAPI app with lifespan
322
+ app = FastAPI(
323
+ title="AI Voice Detection API",
324
+ description="Detects whether a voice sample is AI-generated or spoken by a real human",
325
+ version="1.0.0",
326
+ contact={
327
+ "name": "Shivam",
328
+ "url": settings.WEBSITE_URL,
329
+ },
330
+ docs_url="/docs",
331
+ redoc_url="/redoc",
332
+ lifespan=lifespan
333
+ )
334
+
335
+ # Add rate limiter to app state
336
+ app.state.limiter = limiter
337
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
338
+
339
+ # Middleware configuration
340
+ # CORS
341
+ # Note: Set ALLOWED_ORIGINS env var in production
342
+ app.add_middleware(
343
+ CORSMiddleware,
344
+ allow_origins=settings.ALLOWED_ORIGINS,
345
+ allow_credentials=True,
346
+ allow_methods=["GET", "POST", "OPTIONS"],
347
+ allow_headers=["Content-Type", "x-api-key", "Authorization"],
348
+ )
349
+
350
+ # Request Logging & Timing Middleware
351
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Per-request middleware: request-ID tagging, timing logs, security headers.

    Only POST requests are logged start/end (GETs like /docs stay quiet).
    Adds X-Request-ID / X-Response-Time plus HSTS, nosniff and a CSP that
    permits the CDN assets used by Swagger/ReDoc.
    """
    # Generate request ID and start timer
    request_id = str(uuid.uuid4())[:8]
    request.state.request_id = request_id
    start_time = time.perf_counter()

    # Log request start
    method = request.method
    path = request.url.path
    if method == "POST":
        logger.info(f"[{request_id}] [START] {method} {path}")

    # Process request (async)
    response = await call_next(request)

    # Calculate duration
    duration_ms = (time.perf_counter() - start_time) * 1000
    status_code = response.status_code

    # Log request completion with timing
    if method == "POST":
        status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
        logger.info(f"[{request_id}] {status_label} END {method} {path} -> {status_code} ({duration_ms:.0f}ms)")

    # Add headers
    response.headers["X-Request-ID"] = request_id
    response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
    response.headers["X-Content-Type-Options"] = "nosniff"
    # Allow embedding in Hugging Face iframe
    # response.headers["X-Frame-Options"] = "DENY"
    response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    # Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
    response.headers["Content-Security-Policy"] = (
        "default-src 'self'; "
        "script-src 'self' 'unsafe-inline' 'unsafe-eval' https://cdn.jsdelivr.net; "
        "style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net https://fonts.googleapis.com; "
        "font-src 'self' https://fonts.gstatic.com; "
        "img-src 'self' data: https://fastapi.tiangolo.com;"
    )
    return response
392
+
393
+
394
+ # Request/Response Models
395
class VoiceDetectionRequest(BaseModel):
    """Request body for voice detection."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
    audioFormat: str = Field(default="mp3", description="Audio format (must be mp3)")
    audioBase64: str = Field(..., description="Base64-encoded MP3 audio")

    @field_validator('audioBase64')
    @classmethod
    def validate_audio_size(cls, v: str) -> str:
        """Validate audio data is not too small or too large.

        Bounds are checked on the base64 string length; the upper bound is
        the precomputed MAX_AUDIO_BASE64_LENGTH derived from MAX_AUDIO_SIZE_MB.
        """
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
410
+
411
+
412
class ForensicMetrics(BaseModel):
    """Detailed forensic analysis metrics (all scores on a 0-100 scale)."""
    authenticity_score: float = Field(..., description="Overall voice naturalness score (0-100)")
    pitch_naturalness: float = Field(..., description="Pitch stability and jitter score (0-100)")
    spectral_naturalness: float = Field(..., description="Spectral entropy and flatness score (0-100)")
    temporal_naturalness: float = Field(..., description="Rhythm and silence score (0-100)")
418
+
419
+
420
class VoiceDetectionResponse(BaseModel):
    """Successful response from voice detection."""
    status: str = "success"
    language: str
    classification: str  # AI_GENERATED or HUMAN
    confidenceScore: float = Field(..., ge=0.0, le=1.0)
    explanation: str  # human-readable rationale for the classification
    forensic_metrics: Optional[ForensicMetrics] = None
    modelUncertain: bool = False  # set when confidence is too low to trust
    recommendedAction: Optional[str] = None
430
+
431
+
432
class ErrorResponse(BaseModel):
    """Error response envelope returned by failure paths."""
    status: str = "error"
    message: str  # human-readable failure description
436
+
437
+
438
class SessionStartRequest(BaseModel):
    """Request body for creating a real-time analysis session."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
441
+
442
+
443
class SessionStartResponse(BaseModel):
    """Response body after creating a session."""
    status: str = "success"
    session_id: str  # opaque id used by all subsequent chunk/summary calls
    language: str
    started_at: str  # ISO timestamp of session creation
    message: str
450
+
451
+
452
class SessionChunkRequest(BaseModel):
    """Audio chunk request for real-time analysis."""
    audioFormat: str = Field(default="mp3", description="Audio format (must be one of supported formats)")
    audioBase64: str = Field(..., description="Base64-encoded audio chunk")
    language: Optional[str] = Field(default=None, description="Optional override. Defaults to session language")

    @field_validator("audioBase64")
    @classmethod
    def validate_chunk_size(cls, v: str) -> str:
        """Reject chunks outside the accepted base64-length bounds (same limits
        as the one-shot endpoint's VoiceDetectionRequest)."""
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
466
+
467
+
468
class RiskEvidence(BaseModel):
    """Model evidence used to produce risk score (one list per signal family)."""
    audio_patterns: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    behaviour: List[str] = Field(default_factory=list)
473
+
474
+
475
class RealTimeLanguageAnalysis(BaseModel):
    """Transcript and language risk signals for the current chunk."""
    transcript: str = ""
    transcript_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    asr_engine: str = "unavailable"  # "unavailable"/"busy"/"timeout" when ASR skipped
    keyword_hits: List[str] = Field(default_factory=list)       # "category:keyword" entries
    keyword_categories: List[str] = Field(default_factory=list)
    semantic_flags: List[str] = Field(default_factory=list)
    keyword_score: int = Field(default=0, ge=0, le=100)
    semantic_score: int = Field(default=0, ge=0, le=100)
    behaviour_score: int = Field(default=0, ge=0, le=100)
    session_behaviour_signals: List[str] = Field(default_factory=list)
    # Populated only when the optional LLM semantic verifier ran for this chunk.
    llm_semantic_used: bool = False
    llm_semantic_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    llm_semantic_model: Optional[str] = None
490
+
491
+
492
class RealTimeAlert(BaseModel):
    """Alert details emitted by the risk engine.

    When `triggered` is False the remaining fields stay None.
    """
    triggered: bool
    alert_type: Optional[str] = None
    severity: Optional[str] = None
    reason_summary: Optional[str] = None
    recommended_action: Optional[str] = None
499
+
500
+
501
class ExplainabilitySignal(BaseModel):
    """Per-signal contribution to fused risk score (weighted_score = raw * weight)."""
    signal: str
    raw_score: int = Field(..., ge=0, le=100)
    weight: float = Field(..., ge=0.0, le=1.0)
    weighted_score: float = Field(..., ge=0.0, le=100.0)
507
+
508
+
509
class RealTimeExplainability(BaseModel):
    """Human-readable explainability block for chunk risk output."""
    summary: str
    top_indicators: List[str] = Field(default_factory=list)
    signal_contributions: List[ExplainabilitySignal] = Field(default_factory=list)
    uncertainty_note: Optional[str] = None  # present when the model flagged uncertainty
515
+
516
+
517
class RealTimeUpdateResponse(BaseModel):
    """Chunk-by-chunk update response."""
    status: str = "success"
    session_id: str
    # UTC ISO-8601 timestamp of this update.
    timestamp: str
    # Fused risk score for the chunk, 0-100.
    risk_score: int = Field(..., ge=0, le=100)
    cpi: float = Field(..., ge=0.0, le=100.0, description="Conversational Pressure Index")
    # LOW / MEDIUM / HIGH / CRITICAL.
    risk_level: str
    # User-facing label: SAFE / SPAM / FRAUD / UNCERTAIN.
    call_label: str
    # True when model confidence is too low to trust the classification.
    model_uncertain: bool = False
    # Voice-authenticity verdict: AI_GENERATED / HUMAN / UNCERTAIN.
    voice_classification: str = "UNCERTAIN"
    voice_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Supporting evidence bundles.
    evidence: RiskEvidence
    language_analysis: RealTimeLanguageAnalysis
    alert: RealTimeAlert
    explainability: RealTimeExplainability
    # Number of chunks processed so far in this session (>= 1).
    chunks_processed: int = Field(..., ge=1)
    # Version of the risk policy that produced this result.
    risk_policy_version: str = settings.RISK_POLICY_VERSION
535
+
536
+
537
class SessionSummaryResponse(BaseModel):
    """Summary response for a completed or active session."""
    status: str = "success"
    session_id: str
    # Language the session was analyzed in.
    language: str
    # Session lifecycle state (e.g. "active" or "ended").
    session_status: str
    # ISO timestamps for session lifecycle.
    started_at: str
    last_update: Optional[str] = None
    # Aggregate counters across all processed chunks.
    chunks_processed: int = 0
    alerts_triggered: int = 0
    # Worst-case scores observed during the session.
    max_risk_score: int = 0
    max_cpi: float = 0.0
    # Final verdicts (label at peak risk; voice result of last chunk).
    final_call_label: str = "UNCERTAIN"
    final_voice_classification: str = "UNCERTAIN"
    final_voice_confidence: float = 0.0
    # Highest AI-voice confidence seen and per-class chunk tallies.
    max_voice_ai_confidence: float = 0.0
    voice_ai_chunks: int = 0
    voice_human_chunks: int = 0
    # Number of optional LLM semantic checks invoked for this session.
    llm_checks_performed: int = 0
    risk_policy_version: str = settings.RISK_POLICY_VERSION
557
+
558
+
559
class AlertHistoryItem(BaseModel):
    """One alert event emitted during session analysis."""
    # UTC ISO timestamp of the chunk that fired the alert.
    timestamp: str
    # Risk snapshot at alert time.
    risk_score: int = Field(..., ge=0, le=100)
    risk_level: str
    call_label: str
    # Alert classification and its lowercased severity.
    alert_type: str
    severity: str
    # Human-readable reason and recommended user action.
    reason_summary: str
    recommended_action: str
569
+
570
+
571
class AlertHistoryResponse(BaseModel):
    """Paginated alert history for one session."""
    status: str = "success"
    session_id: str
    # Total alerts recorded for the session (may exceed len(alerts) when paginated).
    total_alerts: int
    # The alert events returned for this page.
    alerts: List[AlertHistoryItem] = Field(default_factory=list)
577
+
578
+
579
class RetentionPolicyResponse(BaseModel):
    """Explicit privacy and retention behavior for session processing."""
    status: str = "success"
    # Raw audio is never written to storage; only derived fields persist.
    raw_audio_storage: str = "not_persisted"
    # TTLs applied to session state depending on lifecycle status.
    active_session_retention_seconds: int
    ended_session_retention_seconds: int
    # Names of the derived (non-audio) fields retained per session.
    stored_derived_fields: List[str]
586
+
587
+
588
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string ending in 'Z'."""
    stamp = datetime.now(timezone.utc).isoformat()
    # Present the offset as 'Z' rather than '+00:00' for compact UTC notation.
    return stamp.replace("+00:00", "Z")
591
+
592
+
593
# Derived, non-audio fields that may persist in session state. Raw audio is
# never stored (see RetentionPolicyResponse.raw_audio_storage). This list is
# surfaced verbatim through the retention-policy endpoint.
STORED_DERIVED_FIELDS = [
    "risk_history",
    "behaviour_score",
    "session_behaviour_signals",
    "transcript_counts",
    "semantic_flag_counts",
    "keyword_category_counts",
    "chunks_processed",
    "alerts_triggered",
    "max_risk_score",
    "final_call_label",
    "voice_ai_chunks",
    "voice_human_chunks",
    "max_voice_ai_confidence",
    "final_voice_classification",
    "llm_checks_performed",
]
610
+
611
+
612
def parse_iso_timestamp(value: Optional[str]) -> Optional[float]:
    """Convert an ISO-8601 timestamp to epoch seconds; None on missing/invalid input."""
    if value is None:
        return None
    # fromisoformat does not accept the 'Z' suffix on older Pythons; normalize it.
    normalized = value.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
    except ValueError:
        return None
    return parsed.timestamp()
620
+
621
+
622
def session_reference_timestamp(session: SessionState) -> Optional[float]:
    """Return the best available timestamp for retention checks.

    Prefers last_update; falls back to started_at when last_update is
    missing or unparseable (or parses to a falsy epoch value).
    """
    primary = parse_iso_timestamp(session.last_update)
    if primary:
        return primary
    return parse_iso_timestamp(session.started_at)
625
+
626
+
627
def session_retention_seconds(session: SessionState) -> int:
    """Resolve the retention TTL (seconds) from the session's lifecycle status."""
    # Ended sessions use a (typically shorter) dedicated retention window.
    has_ended = session.status == "ended"
    return (
        settings.SESSION_ENDED_RETENTION_SECONDS
        if has_ended
        else settings.SESSION_ACTIVE_RETENTION_SECONDS
    )
632
+
633
+
634
def is_session_expired(session: SessionState, now_ts: Optional[float] = None) -> bool:
    """Check whether a session exceeded its status-specific retention TTL.

    Sessions with no parseable reference timestamp are treated as not expired.
    """
    anchor = session_reference_timestamp(session)
    if anchor is None:
        return False
    moment = time.time() if now_ts is None else now_ts
    age = moment - anchor
    return age > session_retention_seconds(session)
641
+
642
+
643
def purge_expired_sessions(now_ts: Optional[float] = None) -> int:
    """Best-effort retention purge for stale sessions (memory backend).

    Returns the number of sessions removed; 0 when Redis is the backend,
    since Redis keys self-expire via TTL and need no in-process purge.
    """
    if use_redis_session_store():
        return 0

    moment = time.time() if now_ts is None else now_ts
    # Collect first, then delete, to avoid mutating the store mid-iteration.
    stale_ids = [
        session_id
        for session_id, state in SESSION_STORE.items()
        if is_session_expired(state, moment)
    ]
    for session_id in stale_ids:
        delete_session_state(session_id)
    return len(stale_ids)
654
+
655
+
656
def validate_supported_language(language: str) -> None:
    """Raise HTTP 400 when the language is not in settings.SUPPORTED_LANGUAGES."""
    if language in settings.SUPPORTED_LANGUAGES:
        return
    supported = ', '.join(settings.SUPPORTED_LANGUAGES)
    raise HTTPException(
        status_code=400,
        detail={
            "status": "error",
            "message": f"Unsupported language. Must be one of: {supported}"
        }
    )
666
+
667
+
668
def validate_supported_format(audio_format: str) -> None:
    """Raise HTTP 400 when the (case-insensitive) audio format is unsupported."""
    if audio_format.lower() in settings.SUPPORTED_FORMATS:
        return
    supported = ', '.join(settings.SUPPORTED_FORMATS)
    raise HTTPException(
        status_code=400,
        detail={
            "status": "error",
            "message": f"Unsupported audio format. Must be one of: {supported}"
        }
    )
679
+
680
+
681
def normalize_transcript_for_behavior(transcript: str) -> str:
    """Normalize a transcript for repetition and trend analysis.

    Lowercases, replaces non-alphanumeric characters with spaces, and
    collapses consecutive whitespace to single spaces.
    """
    pieces = []
    for ch in transcript.lower():
        pieces.append(ch if ch.isalnum() or ch.isspace() else " ")
    return " ".join("".join(pieces).split())
686
+
687
+
688
def token_overlap_ratio(text_a: str, text_b: str) -> float:
    """Compute the Jaccard overlap between the two texts' token sets.

    Returns 0.0 whenever either side has no tokens.
    """
    left = set(text_a.split())
    right = set(text_b.split())
    if not left or not right:
        return 0.0
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)
695
+
696
+
697
def dedupe_preserve_order(items: List[str]) -> List[str]:
    """Return unique string items while preserving first-seen order.

    Relies on dict preserving insertion order (guaranteed since Python 3.7).
    """
    return list(dict.fromkeys(items))
707
+
708
def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Update session-level behaviour score from transcript and semantic trends.

    Mutates ``session`` (transcript/flag/category counters, behaviour_score,
    session_behaviour_signals) and returns a snapshot dict with the new
    ``behaviour_score`` and ``session_behaviour_signals``.
    """
    # Prefer the unmasked transcript when available so repetition detection
    # sees the true content; fall back to the (possibly masked) transcript.
    transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
    transcript = normalize_transcript_for_behavior(transcript_source)
    semantic_flags = list(language_analysis.get("semantic_flags", []))
    keyword_categories = list(language_analysis.get("keyword_categories", []))

    # Accumulate per-session occurrence counts for semantic flags and keyword categories.
    for flag in semantic_flags:
        session.semantic_flag_counts[flag] = session.semantic_flag_counts.get(flag, 0) + 1
    for category in keyword_categories:
        session.keyword_category_counts[category] = session.keyword_category_counts.get(category, 0) + 1

    behavior_signals: List[str] = []

    if transcript:
        # Exact-repeat detection: the same normalized transcript seen twice or more.
        count = session.transcript_counts.get(transcript, 0) + 1
        session.transcript_counts[transcript] = count
        if count >= 2:
            behavior_signals.append("repetition_loop")
        # Near-repeat detection: high token overlap with the previous chunk,
        # only for transcripts of at least 4 tokens to avoid trivial matches.
        if session.last_transcript:
            overlap = token_overlap_ratio(transcript, session.last_transcript)
            if overlap >= 0.75 and len(transcript.split()) >= 4:
                behavior_signals.append("repetition_loop")
        session.last_transcript = transcript

    # Urgency language repeated across the session indicates sustained pressure.
    urgency_count = session.semantic_flag_counts.get("urgency_language", 0)
    if urgency_count >= 2:
        behavior_signals.append("sustained_urgency")

    has_impersonation = session.semantic_flag_counts.get("authority_impersonation", 0) > 0
    has_credentials = session.semantic_flag_counts.get("credential_request", 0) > 0
    has_payment = session.semantic_flag_counts.get("payment_redirection", 0) > 0
    has_threat = session.semantic_flag_counts.get("coercive_threat_language", 0) > 0
    has_urgency = urgency_count > 0

    # Combination signals: pairs of flags that together indicate a scam pattern.
    if has_impersonation and has_credentials:
        behavior_signals.append("impersonation_plus_credential_request")
    if has_payment and has_urgency:
        behavior_signals.append("persistent_payment_pressure")
    if has_threat and has_urgency:
        behavior_signals.append("repeated_threat_urgency")

    # Two or more keyword categories each repeated twice counts as a pattern.
    repeated_categories = sum(1 for count in session.keyword_category_counts.values() if count >= 2)
    if repeated_categories >= 2:
        behavior_signals.append("repeated_fraud_categories")

    behavior_signals = sorted(set(behavior_signals))

    # Fixed per-signal weights, with small bonuses scaling with repetition depth.
    score = 0
    if "repetition_loop" in behavior_signals:
        max_repetition = max(session.transcript_counts.values()) if session.transcript_counts else 2
        score += 25 + min(15, (max_repetition - 2) * 5)
    if "sustained_urgency" in behavior_signals:
        score += 15 + min(10, (urgency_count - 2) * 5)
    if "impersonation_plus_credential_request" in behavior_signals:
        score += 30
    if "persistent_payment_pressure" in behavior_signals:
        score += 20
    if "repeated_threat_urgency" in behavior_signals:
        score += 15
    if "repeated_fraud_categories" in behavior_signals:
        score += 10

    # Clamp to the 0-100 scale used by the rest of the risk engine.
    session.behaviour_score = max(0, min(100, score))
    session.session_behaviour_signals = behavior_signals

    return {
        "behaviour_score": session.behaviour_score,
        "session_behaviour_signals": session.session_behaviour_signals,
    }
778
+
779
+
780
def map_score_to_level(score: int) -> str:
    """Map a numeric 0-100 risk score to a risk level band.

    Bands: <35 LOW, 35-59 MEDIUM, 60-79 HIGH, >=80 CRITICAL.
    """
    bands = ((35, "LOW"), (60, "MEDIUM"), (80, "HIGH"))
    for upper_bound, level in bands:
        if score < upper_bound:
            return level
    return "CRITICAL"
789
+
790
+
791
def map_level_to_label(risk_level: str, model_uncertain: bool) -> str:
    """Map a risk level to a user-friendly label.

    Model uncertainty overrides everything with "UNCERTAIN"; otherwise
    LOW -> SAFE, MEDIUM -> SPAM, and any higher level -> FRAUD.
    """
    if model_uncertain:
        return "UNCERTAIN"
    friendly = {"LOW": "SAFE", "MEDIUM": "SPAM"}
    return friendly.get(risk_level, "FRAUD")
800
+
801
+
802
def recommendation_for_level(risk_level: str, model_uncertain: bool) -> str:
    """Return a user action recommendation based on severity.

    The uncertainty message takes precedence over all level-specific advice.
    """
    if model_uncertain:
        return "Model uncertainty detected. Avoid sharing OTP/PIN and verify caller via official channel."
    advice_by_level = {
        "CRITICAL": "High fraud risk. End the call and verify through an official support number.",
        "HIGH": "Fraud indicators detected. Do not share OTP, PIN, passwords, or UPI credentials.",
        "MEDIUM": "Suspicious call behavior detected. Verify caller identity before taking action.",
    }
    return advice_by_level.get(risk_level, "No high-risk fraud indicators detected in current chunk.")
813
+
814
+
815
+
816
def should_invoke_llm_semantic(
    provisional_scored: Dict[str, Any],
    transcript: str,
    transcript_confidence: float,
    next_chunk_index: int,
) -> bool:
    """Gate optional LLM semantic calls for ambiguous/uncertain chunks.

    Requires the feature enabled, a ready provider, a meaningful transcript
    with adequate ASR confidence, the configured chunk cadence, and either
    a mid-band (35-79) risk score or explicit model uncertainty.
    """
    text = transcript.strip()
    # Cheap eligibility guards, evaluated in the same order as before.
    if not settings.LLM_SEMANTIC_ENABLED:
        return False
    if not is_llm_semantic_provider_ready():
        return False
    if not text or len(text) < 8:
        return False
    if transcript_confidence < settings.LLM_SEMANTIC_MIN_ASR_CONFIDENCE:
        return False

    # Respect the configured cadence; chunk 1 is always eligible.
    interval = max(1, settings.LLM_SEMANTIC_CHUNK_INTERVAL)
    if next_chunk_index > 1 and next_chunk_index % interval != 0:
        return False

    score = int(provisional_scored.get("risk_score", 0))
    uncertain = bool(provisional_scored.get("model_uncertain", False))
    in_ambiguous_band = 35 <= score < 80
    return in_ambiguous_band or uncertain
843
+
844
+
845
def normalize_voice_classification(classification: str, model_uncertain: bool) -> str:
    """Normalize a realtime voice-authenticity classification.

    Uncertain chunks always report "UNCERTAIN"; otherwise any value other
    than "AI_GENERATED"/"HUMAN" (case-insensitive) collapses to "HUMAN".
    """
    if model_uncertain:
        return "UNCERTAIN"
    label = str(classification or "HUMAN").upper()
    return label if label in {"AI_GENERATED", "HUMAN"} else "HUMAN"
853
+
854
def build_explainability_payload(
    risk_level: str,
    call_label: str,
    model_uncertain: bool,
    cpi: float,
    audio_score: int,
    keyword_score: int,
    semantic_score: int,
    behaviour_score: int,
    has_language_signals: bool,
    behaviour_signals: List[str],
    keyword_hits: List[str],
    acoustic_anomaly: float,
) -> RealTimeExplainability:
    """Build explicit explainability signals and a concise summary.

    Mirrors the fusion performed by ``build_risk_update``: when language
    signals exist the four signals are weighted; otherwise the assessment
    is audio-only (audio weight 1.0).
    """
    # NOTE(review): these weights are fixed here, while build_risk_update
    # reads its weights from settings — confirm they are kept in sync.
    if has_language_signals:
        weights = {
            "audio": 0.45,
            "keywords": 0.20,
            "semantic": 0.15,
            "behaviour": 0.20,
        }
    else:
        weights = {
            "audio": 1.00,
            "keywords": 0.00,
            "semantic": 0.00,
            "behaviour": 0.00,
        }

    # One ExplainabilitySignal per fused input, with its weighted contribution.
    contributions = [
        ExplainabilitySignal(
            signal="audio",
            raw_score=audio_score,
            weight=weights["audio"],
            weighted_score=round(audio_score * weights["audio"], 2),
        ),
        ExplainabilitySignal(
            signal="keywords",
            raw_score=keyword_score,
            weight=weights["keywords"],
            weighted_score=round(keyword_score * weights["keywords"], 2),
        ),
        ExplainabilitySignal(
            signal="semantic_intent",
            raw_score=semantic_score,
            weight=weights["semantic"],
            weighted_score=round(semantic_score * weights["semantic"], 2),
        ),
        ExplainabilitySignal(
            signal="behaviour",
            raw_score=behaviour_score,
            weight=weights["behaviour"],
            weighted_score=round(behaviour_score * weights["behaviour"], 2),
        ),
    ]

    # Top indicators: acoustic anomaly (if elevated), behaviour signals,
    # then up to three keyword hits; deduped, capped at six.
    indicators: List[str] = []
    if acoustic_anomaly >= 60:
        indicators.append("acoustic_anomaly_detected")
    indicators.extend(behaviour_signals)
    indicators.extend(keyword_hits[:3])
    deduped_indicators = list(dict.fromkeys(indicators))[:6]

    # Assemble a human-readable sentence per contributing factor.
    summary_parts: List[str] = [
        f"{risk_level.title()} risk classified as {call_label}."
    ]
    summary_parts.append(f"CPI at {cpi:.1f}/100.")
    if acoustic_anomaly >= 60:
        summary_parts.append("Audio anomalies are materially elevated.")
    if keyword_score >= 45:
        summary_parts.append("Fraud-related keywords contribute to the score.")
    if semantic_score >= 45:
        summary_parts.append("Semantic coercion patterns were detected.")
    if behaviour_score >= 40:
        summary_parts.append("Session behavior trend increases risk.")
    if cpi >= 70:
        summary_parts.append("Pressure escalation velocity is high; early warning triggered.")
    if not has_language_signals:
        summary_parts.append("Assessment is currently audio-dominant.")

    # Only attach the caveat when the model flagged itself as uncertain.
    uncertainty_note = None
    if model_uncertain:
        uncertainty_note = (
            "Model confidence is limited for this chunk. Treat this result conservatively and verify through trusted channels."
        )

    return RealTimeExplainability(
        summary=" ".join(summary_parts),
        top_indicators=deduped_indicators,
        signal_contributions=contributions,
        uncertainty_note=uncertainty_note,
    )
947
+
948
+
949
def build_risk_update(
    result_features: Dict[str, float],
    classification: str,
    confidence: float,
    language_analysis: Dict[str, Any],
    previous_score: Optional[int],
    llm_semantic: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build risk score, evidence and alert from model outputs and session trend.

    Fuses audio, keyword, semantic, and behaviour sub-scores into a 0-100
    risk score plus CPI, level/label, evidence, alert, and explainability.
    """
    audio_features: Dict[str, float] = result_features
    authenticity = float(audio_features.get("authenticity_score", 50.0))
    acoustic_anomaly = float(audio_features.get("acoustic_anomaly_score", 0.0))
    ml_fallback = bool(audio_features.get("ml_fallback", 0.0))
    realtime_heuristic_mode = bool(audio_features.get("realtime_heuristic_mode", 0.0))
    normalized_classification = str(classification or "").upper()
    # Uncertain when a non-AI verdict is low-confidence and no language
    # signals corroborate anything either way.
    low_confidence_uncertain = bool(
        normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.65
        and int(language_analysis.get("keyword_score", 0)) == 0
        and int(language_analysis.get("semantic_score", 0)) == 0
        and int(language_analysis.get("behaviour_score", 0)) == 0
    )
    # Heuristic-only mode needs a higher confidence bar to be trusted.
    heuristic_uncertain = bool(
        realtime_heuristic_mode
        and normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.90
    )
    model_uncertain = ml_fallback or low_confidence_uncertain or heuristic_uncertain
    keyword_score = int(language_analysis.get("keyword_score", 0))
    semantic_score = int(language_analysis.get("semantic_score", 0))
    behaviour_score = int(language_analysis.get("behaviour_score", 0))
    keyword_hits = dedupe_preserve_order(list(language_analysis.get("keyword_hits", [])))
    behavior_from_language = dedupe_preserve_order(list(language_analysis.get("behaviour_signals", [])))
    behavior_from_session = dedupe_preserve_order(list(language_analysis.get("session_behaviour_signals", [])))
    keyword_categories = dedupe_preserve_order(list(language_analysis.get("keyword_categories", [])))
    semantic_flags = dedupe_preserve_order(list(language_analysis.get("semantic_flags", [])))
    transcript = str(language_analysis.get("transcript", "")).strip()

    # Optionally blend the LLM semantic verdict into the rule-based scores.
    llm_semantic_used = False
    llm_semantic_confidence = 0.0
    llm_semantic_model: Optional[str] = None
    if llm_semantic and llm_semantic.get("available"):
        blend_weight = max(0.0, min(1.0, settings.LLM_SEMANTIC_BLEND_WEIGHT))
        llm_score = int(max(0, min(100, llm_semantic.get("semantic_score", semantic_score))))
        # Weighted average of rule-based and LLM semantic scores.
        semantic_score = int(round((semantic_score * (1.0 - blend_weight)) + (llm_score * blend_weight)))
        llm_semantic_confidence = float(max(0.0, min(1.0, llm_semantic.get("confidence", 0.0))))
        llm_semantic_model = str(llm_semantic.get("model") or settings.LLM_SEMANTIC_MODEL)
        llm_semantic_used = True

        # LLM keyword hints add to hits with a capped score bonus (max +18).
        keyword_hints = dedupe_preserve_order([str(x) for x in llm_semantic.get("keyword_hints", [])])
        if keyword_hints:
            keyword_hits = dedupe_preserve_order(keyword_hits + keyword_hints)
            keyword_score = min(100, keyword_score + min(18, len(keyword_hints) * 6))

        llm_flags = dedupe_preserve_order([str(x) for x in llm_semantic.get("semantic_flags", [])])
        if llm_flags:
            semantic_flags = dedupe_preserve_order(semantic_flags + llm_flags)

        llm_behaviour = dedupe_preserve_order([str(x) for x in llm_semantic.get("behaviour_signals", [])])
        if llm_behaviour:
            behavior_from_language = dedupe_preserve_order(behavior_from_language + llm_behaviour)

    # Audio signal risk.
    # NOTE(review): this branch compares the raw `classification`, while the
    # uncertainty logic above uses `normalized_classification` — a lowercase
    # "ai_generated" would take the else branch here; confirm intended.
    if classification == "AI_GENERATED":
        audio_score = max(
            int(round(confidence * 100)),
            int(max(0.0, min(100.0, acoustic_anomaly * 0.85))),
        )
    else:
        # Low authenticity and/or high anomaly each raise audio risk.
        authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
        anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * 0.90)))
        audio_score = max(authenticity_audio_score, anomaly_audio_score)

    has_language_signals = bool(transcript) or keyword_score > 0 or semantic_score > 0 or behaviour_score > 0
    if has_language_signals:
        # Fuse all four signals with configured weights, normalized to sum 1.
        raw_weights = {
            "audio": settings.RISK_WEIGHT_AUDIO,
            "keywords": settings.RISK_WEIGHT_KEYWORD,
            "semantic": settings.RISK_WEIGHT_SEMANTIC,
            "behaviour": settings.RISK_WEIGHT_BEHAVIOUR,
        }
        total_weight = sum(raw_weights.values())
        if total_weight <= 0:
            # Guard against a degenerate all-zero weight configuration.
            raw_weights = {"audio": 0.45, "keywords": 0.20, "semantic": 0.15, "behaviour": 0.20}
            total_weight = 1.0
        normalized = {k: v / total_weight for k, v in raw_weights.items()}

        base_score = int(
            round(
                (audio_score * normalized["audio"])
                + (keyword_score * normalized["keywords"])
                + (semantic_score * normalized["semantic"])
                + (behaviour_score * normalized["behaviour"])
            )
        )
    else:
        # No transcript or language evidence: the audio verdict stands alone.
        base_score = audio_score

    if ml_fallback:
        # Model fallback is itself suspicious; enforce a conservative floor.
        base_score = max(base_score, 55)

    risk_score = max(0, min(100, base_score))
    behaviour_signals: List[str] = list(behavior_from_language) + list(behavior_from_session)

    # Threshold-derived signals for evidence and alerting.
    if keyword_score >= 60:
        behaviour_signals.append("keyword_cluster_detected")
    if semantic_score >= 60:
        behaviour_signals.append("semantic_coercion_detected")
    if behaviour_score >= 40:
        behaviour_signals.append("behaviour_risk_elevated")
    if acoustic_anomaly >= 60:
        behaviour_signals.append("acoustic_anomaly_detected")

    # Trend signals relative to the previous chunk's score.
    if previous_score is not None:
        delta = risk_score - previous_score
        if delta >= 15:
            behaviour_signals.append("rapid_risk_escalation")
        if risk_score >= 70 and previous_score >= 70:
            behaviour_signals.append("sustained_high_risk")
    else:
        delta = 0

    if delta > 0:
        # Positive momentum nudges the score upward by a configured factor.
        risk_score = min(100, risk_score + int(delta * settings.RISK_DELTA_BOOST_FACTOR))

    # Conversational Pressure Index: escalation velocity plus behaviour/semantic pressure.
    if previous_score is None:
        cpi = min(100.0, max(0.0, (behaviour_score * 0.35) + (semantic_score * 0.20)))
    else:
        cpi = min(
            100.0,
            max(
                0.0,
                (max(0, delta) * 3.2)
                + (behaviour_score * 0.35)
                + (semantic_score * 0.15),
            ),
        )
    if cpi >= 70:
        behaviour_signals.append("cpi_spike_detected")

    behaviour_signals = dedupe_preserve_order(behaviour_signals)
    risk_level = map_score_to_level(risk_score)
    call_label = map_level_to_label(risk_level, model_uncertain)

    # Audio evidence as "key:value" strings for the RiskEvidence payload.
    audio_patterns = [
        f"classification:{classification.lower()}",
        f"model_confidence:{confidence:.2f}",
        f"authenticity_score:{authenticity:.1f}",
        f"acoustic_anomaly_score:{acoustic_anomaly:.1f}",
        f"audio_score:{audio_score}",
    ]
    if ml_fallback:
        audio_patterns.append("model_fallback:true")
    audio_patterns = dedupe_preserve_order(audio_patterns)

    # Signals strong enough to trigger an alert regardless of level band.
    strong_intent = {
        "authority_with_credential_request",
        "urgent_payment_pressure",
        "threat_plus_urgency",
        "impersonation_plus_credential_request",
        "persistent_payment_pressure",
        "repeated_threat_urgency",
    }
    alert_triggered = (
        risk_level in {"HIGH", "CRITICAL"}
        or "rapid_risk_escalation" in behaviour_signals
        or cpi >= 70
        or any(signal in behaviour_signals for signal in strong_intent)
    )
    alert_type = None
    severity = None
    reason_summary = None
    recommended_action = None

    if alert_triggered:
        # Alert type precedence: CRITICAL level > CPI spike > escalation > HIGH.
        if risk_level == "CRITICAL":
            alert_type = "FRAUD_RISK_CRITICAL"
        elif cpi >= 70:
            alert_type = "EARLY_PRESSURE_WARNING"
        elif "rapid_risk_escalation" in behaviour_signals:
            alert_type = "RISK_ESCALATION"
        else:
            alert_type = "FRAUD_RISK_HIGH"
        severity = risk_level.lower()
        reasons: List[str] = []
        if keyword_hits:
            reasons.append("fraud keywords detected")
        if semantic_score >= 45:
            reasons.append("coercive intent patterns detected")
        if behaviour_score >= 40:
            reasons.append("session behavior risk elevated")
        if "repetition_loop" in behaviour_signals:
            reasons.append("repetition loop detected")
        if "rapid_risk_escalation" in behaviour_signals:
            reasons.append("risk escalated rapidly across chunks")
        if cpi >= 70:
            reasons.append("conversational pressure index spiked")
        if not reasons:
            reasons.append("high-risk audio pattern detected")
        reason_summary = ". ".join(reasons).capitalize() + "."
        recommended_action = recommendation_for_level(risk_level, model_uncertain)

    explainability = build_explainability_payload(
        risk_level=risk_level,
        call_label=call_label,
        model_uncertain=model_uncertain,
        cpi=cpi,
        audio_score=audio_score,
        keyword_score=keyword_score,
        semantic_score=semantic_score,
        behaviour_score=behaviour_score,
        has_language_signals=has_language_signals,
        behaviour_signals=behaviour_signals,
        keyword_hits=keyword_hits,
        acoustic_anomaly=acoustic_anomaly,
    )

    return {
        "risk_score": risk_score,
        "cpi": round(cpi, 1),
        "risk_level": risk_level,
        "call_label": call_label,
        "model_uncertain": model_uncertain,
        "evidence": RiskEvidence(
            audio_patterns=audio_patterns,
            keywords=keyword_hits,
            behaviour=behaviour_signals
        ),
        "language_analysis": RealTimeLanguageAnalysis(
            transcript=transcript,
            transcript_confidence=float(language_analysis.get("transcript_confidence", 0.0)),
            asr_engine=str(language_analysis.get("asr_engine", "unavailable")),
            keyword_hits=keyword_hits,
            keyword_categories=keyword_categories,
            semantic_flags=semantic_flags,
            keyword_score=keyword_score,
            semantic_score=semantic_score,
            behaviour_score=behaviour_score,
            session_behaviour_signals=behavior_from_session,
            llm_semantic_used=llm_semantic_used,
            llm_semantic_confidence=llm_semantic_confidence,
            llm_semantic_model=llm_semantic_model,
        ),
        "alert": RealTimeAlert(
            triggered=alert_triggered,
            alert_type=alert_type,
            severity=severity,
            reason_summary=reason_summary,
            recommended_action=recommended_action
        ),
        "explainability": explainability,
    }
1200
+
1201
+
1202
+ async def process_audio_chunk(
1203
+ session_id: str,
1204
+ chunk_request: SessionChunkRequest,
1205
+ default_language: str,
1206
+ request_id: str
1207
+ ) -> RealTimeUpdateResponse:
1208
+ """Decode, analyze and score a real-time audio chunk."""
1209
+ chunk_language = chunk_request.language or default_language
1210
+ validate_supported_language(chunk_language)
1211
+ validate_supported_format(chunk_request.audioFormat)
1212
+
1213
+ audio_size_kb = len(chunk_request.audioBase64) * 3 / 4 / 1024
1214
+ logger.info(
1215
+ f"[{request_id}] Realtime chunk: session={session_id}, language={chunk_language}, "
1216
+ f"format={chunk_request.audioFormat}, size~{audio_size_kb:.1f}KB"
1217
+ )
1218
+
1219
+ decode_start = time.perf_counter()
1220
+ audio_bytes = await asyncio.to_thread(decode_base64_audio, chunk_request.audioBase64)
1221
+ decode_ms = (time.perf_counter() - decode_start) * 1000
1222
+
1223
+ load_start = time.perf_counter()
1224
+ audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, chunk_request.audioFormat)
1225
+ load_ms = (time.perf_counter() - load_start) * 1000
1226
+
1227
+ duration_sec = len(audio) / sr
1228
+ logger.info(
1229
+ f"[{request_id}] Realtime analyze {duration_sec:.2f}s (decode {decode_ms:.0f}ms, load {load_ms:.0f}ms)"
1230
+ )
1231
+
1232
+ analyze_start = time.perf_counter()
1233
+ try:
1234
+ analysis_result = await asyncio.to_thread(analyze_voice, audio, sr, chunk_language, True)
1235
+ except Exception as exc:
1236
+ logger.warning("[%s] Realtime model path failed: %s; using conservative fallback", request_id, exc)
1237
+ analysis_result = AnalysisResult(
1238
+ classification="HUMAN",
1239
+ confidence_score=0.5,
1240
+ explanation="Realtime model path unavailable; conservative fallback applied.",
1241
+ features={
1242
+ "ml_fallback": 1.0,
1243
+ "authenticity_score": 50.0,
1244
+ "pitch_naturalness": 50.0,
1245
+ "spectral_naturalness": 50.0,
1246
+ "temporal_naturalness": 50.0,
1247
+ "acoustic_anomaly_score": 50.0,
1248
+ },
1249
+ )
1250
+ analyze_ms = (time.perf_counter() - analyze_start) * 1000
1251
+ logger.info(
1252
+ f"[{request_id}] Realtime result: {analysis_result.classification} "
1253
+ f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
1254
+ )
1255
+
1256
+ asr_start = time.perf_counter()
1257
+ asr_timeout_seconds = max(0.1, float(settings.ASR_TIMEOUT_MS) / 1000.0)
1258
+ asr_result = await transcribe_audio_guarded(
1259
+ audio=audio,
1260
+ sr=sr,
1261
+ language=chunk_language,
1262
+ timeout_seconds=asr_timeout_seconds,
1263
+ request_id=request_id,
1264
+ )
1265
+ asr_ms = (time.perf_counter() - asr_start) * 1000
1266
+ raw_transcript = str(asr_result.get("transcript", ""))
1267
+ response_transcript = (
1268
+ mask_sensitive_entities(raw_transcript)
1269
+ if settings.MASK_TRANSCRIPT_OUTPUT
1270
+ else raw_transcript
1271
+ )
1272
+ language_result = analyze_transcript(raw_transcript, chunk_language)
1273
+ language_result["transcript_raw"] = raw_transcript
1274
+ language_result["transcript"] = response_transcript
1275
+ language_result["transcript_confidence"] = asr_result.get("confidence", 0.0)
1276
+ language_result["asr_engine"] = asr_result.get("engine", "unavailable")
1277
+ transcript_preview = sanitize_for_logging(raw_transcript, max_chars=90)
1278
+ logger.info(
1279
+ f"[{request_id}] Realtime ASR: engine={language_result['asr_engine']}, "
1280
+ f"confidence={language_result['transcript_confidence']:.2f}, "
1281
+ f"text_len={len(raw_transcript)}, preview='{transcript_preview}', asr={asr_ms:.0f}ms"
1282
+ )
1283
+
1284
+ # Read-only session snapshot for scoring and optional LLM gating.
1285
+ async with SESSION_LOCK:
1286
+ purge_expired_sessions()
1287
+ session = get_session_state(session_id)
1288
+ if session is None:
1289
+ raise HTTPException(
1290
+ status_code=404,
1291
+ detail={"status": "error", "message": "Session not found or expired"}
1292
+ )
1293
+ if session.status != "active":
1294
+ raise HTTPException(
1295
+ status_code=409,
1296
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1297
+ )
1298
+ previous_score_snapshot = session.risk_history[-1] if session.risk_history else None
1299
+ next_chunk_index = session.chunks_processed + 1
1300
+
1301
+ provisional_scored = build_risk_update(
1302
+ analysis_result.features or {},
1303
+ analysis_result.classification,
1304
+ analysis_result.confidence_score,
1305
+ language_result,
1306
+ previous_score_snapshot,
1307
+ )
1308
+
1309
+ llm_semantic: Optional[Dict[str, Any]] = None
1310
+ llm_invoked = should_invoke_llm_semantic(
1311
+ provisional_scored=provisional_scored,
1312
+ transcript=raw_transcript,
1313
+ transcript_confidence=float(language_result.get("transcript_confidence", 0.0)),
1314
+ next_chunk_index=next_chunk_index,
1315
+ )
1316
+ if llm_invoked:
1317
+ llm_semantic = await asyncio.to_thread(
1318
+ analyze_semantic_with_llm,
1319
+ raw_transcript,
1320
+ chunk_language,
1321
+ settings.LLM_SEMANTIC_TIMEOUT_MS,
1322
+ )
1323
+
1324
+ async with SESSION_LOCK:
1325
+ purge_expired_sessions()
1326
+ session = get_session_state(session_id)
1327
+ if session is None:
1328
+ raise HTTPException(
1329
+ status_code=404,
1330
+ detail={"status": "error", "message": "Session not found or expired"}
1331
+ )
1332
+ if session.status != "active":
1333
+ raise HTTPException(
1334
+ status_code=409,
1335
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1336
+ )
1337
+
1338
+ if llm_invoked:
1339
+ session.llm_checks_performed += 1
1340
+ if llm_semantic and llm_semantic.get("available"):
1341
+ session.llm_last_engine = str(llm_semantic.get("engine", "openai-chat-completions"))
1342
+ else:
1343
+ reason = str((llm_semantic or {}).get("reason", "unavailable"))
1344
+ session.llm_last_engine = f"skipped:{reason}"
1345
+
1346
+ behaviour_snapshot = update_session_behaviour_state(session, language_result)
1347
+ language_result.update(behaviour_snapshot)
1348
+ previous_score = session.risk_history[-1] if session.risk_history else None
1349
+ scored = build_risk_update(
1350
+ analysis_result.features or {},
1351
+ analysis_result.classification,
1352
+ analysis_result.confidence_score,
1353
+ language_result,
1354
+ previous_score,
1355
+ llm_semantic=llm_semantic,
1356
+ )
1357
+
1358
+ voice_classification = normalize_voice_classification(
1359
+ analysis_result.classification,
1360
+ scored["model_uncertain"],
1361
+ )
1362
+ voice_confidence = float(max(0.0, min(1.0, analysis_result.confidence_score)))
1363
+
1364
+ session.chunks_processed += 1
1365
+ session.last_update = utc_now_iso()
1366
+ session.risk_history.append(scored["risk_score"])
1367
+ if scored["risk_score"] >= session.max_risk_score:
1368
+ session.final_call_label = scored["call_label"]
1369
+ session.max_risk_score = max(session.max_risk_score, scored["risk_score"])
1370
+ session.max_cpi = max(session.max_cpi, float(scored["cpi"]))
1371
+
1372
+ if voice_classification == "AI_GENERATED":
1373
+ session.voice_ai_chunks += 1
1374
+ session.max_voice_ai_confidence = max(session.max_voice_ai_confidence, voice_confidence)
1375
+ elif voice_classification == "HUMAN":
1376
+ session.voice_human_chunks += 1
1377
+
1378
+ session.final_voice_classification = voice_classification
1379
+ session.final_voice_confidence = voice_confidence
1380
+
1381
+ if scored["alert"].triggered:
1382
+ alert_obj = scored["alert"]
1383
+ alert_entry = {
1384
+ "timestamp": session.last_update,
1385
+ "risk_score": scored["risk_score"],
1386
+ "risk_level": scored["risk_level"],
1387
+ "call_label": scored["call_label"],
1388
+ "alert_type": alert_obj.alert_type or "FRAUD_RISK_HIGH",
1389
+ "severity": alert_obj.severity or scored["risk_level"].lower(),
1390
+ "reason_summary": alert_obj.reason_summary or "Fraud indicators detected.",
1391
+ "recommended_action": alert_obj.recommended_action
1392
+ or recommendation_for_level(scored["risk_level"], scored["model_uncertain"]),
1393
+ }
1394
+
1395
+ last_alert = session.alert_history[-1] if session.alert_history else None
1396
+ duplicate_keys = ("alert_type", "severity", "reason_summary", "recommended_action", "call_label", "risk_level")
1397
+ is_duplicate = bool(
1398
+ last_alert
1399
+ and all(last_alert.get(key) == alert_entry.get(key) for key in duplicate_keys)
1400
+ )
1401
+
1402
+ if is_duplicate:
1403
+ last_alert["timestamp"] = session.last_update
1404
+ last_alert["risk_score"] = max(int(last_alert.get("risk_score", 0)), scored["risk_score"])
1405
+ else:
1406
+ session.alerts_triggered += 1
1407
+ session.alert_history.append(alert_entry)
1408
+ if len(session.alert_history) > 100:
1409
+ session.alert_history = session.alert_history[-100:]
1410
+
1411
+ save_session_state(session)
1412
+
1413
+ return RealTimeUpdateResponse(
1414
+ status="success",
1415
+ session_id=session_id,
1416
+ timestamp=session.last_update,
1417
+ risk_score=scored["risk_score"],
1418
+ cpi=scored["cpi"],
1419
+ risk_level=scored["risk_level"],
1420
+ call_label=scored["call_label"],
1421
+ model_uncertain=scored["model_uncertain"],
1422
+ voice_classification=voice_classification,
1423
+ voice_confidence=voice_confidence,
1424
+ evidence=scored["evidence"],
1425
+ language_analysis=scored["language_analysis"],
1426
+ alert=scored["alert"],
1427
+ explainability=scored["explainability"],
1428
+ chunks_processed=session.chunks_processed,
1429
+ risk_policy_version=settings.RISK_POLICY_VERSION,
1430
+ )
1431
+
1432
+
1433
def session_to_summary(session: SessionState) -> SessionSummaryResponse:
    """Convert session state to response model.

    Pure read-only mapping from the in-memory SessionState to the API
    response schema; float aggregates are rounded for presentation only.
    Caller is expected to hold SESSION_LOCK if concurrent mutation is
    possible (all visible call sites do).
    """
    return SessionSummaryResponse(
        status="success",
        session_id=session.session_id,
        language=session.language,
        session_status=session.status,
        started_at=session.started_at,
        last_update=session.last_update,
        chunks_processed=session.chunks_processed,
        alerts_triggered=session.alerts_triggered,
        max_risk_score=session.max_risk_score,
        max_cpi=round(session.max_cpi, 1),
        final_call_label=session.final_call_label,
        final_voice_classification=session.final_voice_classification,
        final_voice_confidence=round(session.final_voice_confidence, 2),
        max_voice_ai_confidence=round(session.max_voice_ai_confidence, 2),
        voice_ai_chunks=session.voice_ai_chunks,
        voice_human_chunks=session.voice_human_chunks,
        llm_checks_performed=session.llm_checks_performed,
        risk_policy_version=settings.RISK_POLICY_VERSION,
    )
1455
+
1456
+
1457
# Authentication
from fastapi.security import APIKeyHeader
from fastapi import Security

# auto_error=False so verify_api_key can return our own 401 JSON payload
# instead of FastAPI's default auto-generated error response.
api_key_header = APIKeyHeader(name="x-api-key", auto_error=False)  # Changed to False for better error messages
1462
+
1463
async def verify_api_key(x_api_key: str = Security(api_key_header)) -> str:
    """Dependency to verify API key. Raises 401 if invalid or missing.

    Returns the validated key so routes can depend on it directly.
    """
    if x_api_key is None:
        logger.warning("API request without x-api-key header")
        raise HTTPException(
            status_code=401,
            detail={"status": "error", "message": "Missing API key. Include 'x-api-key' header."}
        )
    if x_api_key != settings.API_KEY:
        # Log only an 8-char prefix to avoid leaking candidate secrets into logs.
        logger.warning(f"API request with invalid key: {x_api_key[:8]}...")
        raise HTTPException(
            status_code=401,
            detail={"status": "error", "message": "Invalid API key"}
        )
    return x_api_key
1478
+
1479
+
1480
def verify_websocket_api_key(websocket: WebSocket) -> bool:
    """Validate API key for websocket connections.

    Accepts the key either as an 'x-api-key' header or, when the header is
    absent/empty, as an 'api_key' query parameter.
    """
    header_key = websocket.headers.get("x-api-key")
    candidate = header_key if header_key else websocket.query_params.get("api_key")
    return candidate == settings.API_KEY
1484
+
1485
+
1486
+ # Routes
1487
+ @app.get("/", include_in_schema=False)
1488
+ async def root():
1489
+ """Redirect to API documentation."""
1490
+ return RedirectResponse(url="/docs")
1491
+
1492
+
1493
+ @app.get("/health")
1494
+ async def health_check():
1495
+ """Health check for monitoring - verifies ML model is loaded."""
1496
+ try:
1497
+ from model import _model
1498
+ model_loaded = _model is not None
1499
+ except Exception:
1500
+ model_loaded = False
1501
+
1502
+ return {
1503
+ "status": "healthy" if model_loaded else "degraded",
1504
+ "model_loaded": model_loaded,
1505
+ "session_store_backend": SESSION_STORE_BACKEND_ACTIVE,
1506
+ }
1507
+
1508
+
1509
+ @app.post("/v1/session/start", response_model=SessionStartResponse)
1510
+ @app.post("/api/voice-detection/v1/session/start", response_model=SessionStartResponse)
1511
+ async def start_realtime_session(
1512
+ session_request: SessionStartRequest,
1513
+ api_key: str = Depends(verify_api_key)
1514
+ ):
1515
+ """Create a new real-time fraud analysis session."""
1516
+ validate_supported_language(session_request.language)
1517
+
1518
+ session_id = str(uuid.uuid4())
1519
+ started_at = utc_now_iso()
1520
+
1521
+ async with SESSION_LOCK:
1522
+ purged = purge_expired_sessions()
1523
+ if purged:
1524
+ logger.info("Retention purge removed %s expired sessions", purged)
1525
+
1526
+ session_state = SessionState(
1527
+ session_id=session_id,
1528
+ language=session_request.language,
1529
+ started_at=started_at
1530
+ )
1531
+ save_session_state(session_state)
1532
+
1533
+ return SessionStartResponse(
1534
+ status="success",
1535
+ session_id=session_id,
1536
+ language=session_request.language,
1537
+ started_at=started_at,
1538
+ message="Session created. Send chunks using /v1/session/{session_id}/chunk or websocket stream."
1539
+ )
1540
+
1541
+
1542
+ @app.post("/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
1543
+ @app.post("/api/voice-detection/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
1544
+ async def analyze_realtime_chunk(
1545
+ request: Request,
1546
+ session_id: str,
1547
+ chunk_request: SessionChunkRequest,
1548
+ api_key: str = Depends(verify_api_key)
1549
+ ):
1550
+ """Analyze one chunk for an active real-time session."""
1551
+ request_id = getattr(request.state, "request_id", f"sess-{session_id[:8]}")
1552
+
1553
+ async with SESSION_LOCK:
1554
+ purge_expired_sessions()
1555
+ session = get_session_state(session_id)
1556
+ if session is None:
1557
+ raise HTTPException(
1558
+ status_code=404,
1559
+ detail={"status": "error", "message": "Session not found or expired"}
1560
+ )
1561
+ if session.status != "active":
1562
+ raise HTTPException(
1563
+ status_code=409,
1564
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1565
+ )
1566
+ session_language = session.language
1567
+
1568
+ try:
1569
+ return await process_audio_chunk(session_id, chunk_request, session_language, request_id)
1570
+ except ValueError as e:
1571
+ raise HTTPException(status_code=400, detail={"status": "error", "message": str(e)}) from e
1572
+
1573
+
1574
+ @app.websocket("/v1/session/{session_id}/stream")
1575
+ @app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
1576
+ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1577
+ """WebSocket endpoint for continuous chunk-based analysis."""
1578
+ if not verify_websocket_api_key(websocket):
1579
+ await websocket.close(code=1008, reason="Invalid API key")
1580
+ return
1581
+
1582
+ async with SESSION_LOCK:
1583
+ purge_expired_sessions()
1584
+ session = get_session_state(session_id)
1585
+ if session is None:
1586
+ await websocket.close(code=1008, reason="Session not found or expired")
1587
+ return
1588
+ if session.status != "active":
1589
+ await websocket.close(code=1008, reason="Session is not active")
1590
+ return
1591
+ session_language = session.language
1592
+
1593
+ await websocket.accept()
1594
+ request_id = f"ws-{session_id[:8]}"
1595
+
1596
+ try:
1597
+ while True:
1598
+ payload = await websocket.receive_json()
1599
+ try:
1600
+ chunk_request = SessionChunkRequest.model_validate(payload)
1601
+ except ValidationError as e:
1602
+ await websocket.send_json({
1603
+ "status": "error",
1604
+ "message": "Invalid chunk payload",
1605
+ "details": e.errors()
1606
+ })
1607
+ continue
1608
+
1609
+ try:
1610
+ update = await process_audio_chunk(session_id, chunk_request, session_language, request_id)
1611
+ await websocket.send_json(update.model_dump())
1612
+ except HTTPException as e:
1613
+ detail = e.detail if isinstance(e.detail, dict) else {"status": "error", "message": str(e.detail)}
1614
+ await websocket.send_json(detail)
1615
+ except ValueError as e:
1616
+ await websocket.send_json({"status": "error", "message": str(e)})
1617
+ except WebSocketDisconnect:
1618
+ logger.info(f"[{request_id}] WebSocket disconnected")
1619
+
1620
+
1621
+ @app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
1622
+ @app.get("/api/voice-detection/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
1623
+ async def get_session_summary(
1624
+ session_id: str,
1625
+ api_key: str = Depends(verify_api_key)
1626
+ ):
1627
+ """Return current summary for a real-time session."""
1628
+ async with SESSION_LOCK:
1629
+ purge_expired_sessions()
1630
+ session = get_session_state(session_id)
1631
+ if session is None:
1632
+ raise HTTPException(
1633
+ status_code=404,
1634
+ detail={"status": "error", "message": "Session not found or expired"}
1635
+ )
1636
+ return session_to_summary(session)
1637
+
1638
+
1639
+ @app.get("/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
1640
+ @app.get("/api/voice-detection/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
1641
+ async def get_session_alerts(
1642
+ session_id: str,
1643
+ limit: int = 20,
1644
+ api_key: str = Depends(verify_api_key),
1645
+ ):
1646
+ """Return recent alert history for a real-time session."""
1647
+ if limit < 1 or limit > 100:
1648
+ raise HTTPException(
1649
+ status_code=400,
1650
+ detail={"status": "error", "message": "limit must be between 1 and 100"},
1651
+ )
1652
+
1653
+ async with SESSION_LOCK:
1654
+ purge_expired_sessions()
1655
+ session = get_session_state(session_id)
1656
+ if session is None:
1657
+ raise HTTPException(
1658
+ status_code=404,
1659
+ detail={"status": "error", "message": "Session not found or expired"},
1660
+ )
1661
+
1662
+ alerts = [AlertHistoryItem(**item) for item in session.alert_history[-limit:]]
1663
+ return AlertHistoryResponse(
1664
+ status="success",
1665
+ session_id=session_id,
1666
+ total_alerts=len(session.alert_history),
1667
+ alerts=alerts,
1668
+ )
1669
+
1670
+
1671
+ @app.get("/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
1672
+ @app.get("/api/voice-detection/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
1673
+ async def get_retention_policy(api_key: str = Depends(verify_api_key)):
1674
+ """Return explicit privacy defaults for raw audio and session-derived data."""
1675
+ return RetentionPolicyResponse(
1676
+ status="success",
1677
+ raw_audio_storage="not_persisted",
1678
+ active_session_retention_seconds=settings.SESSION_ACTIVE_RETENTION_SECONDS,
1679
+ ended_session_retention_seconds=settings.SESSION_ENDED_RETENTION_SECONDS,
1680
+ stored_derived_fields=STORED_DERIVED_FIELDS,
1681
+ )
1682
+
1683
+
1684
+ @app.post("/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
1685
+ @app.post("/api/voice-detection/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
1686
+ async def end_realtime_session(
1687
+ session_id: str,
1688
+ api_key: str = Depends(verify_api_key)
1689
+ ):
1690
+ """Mark a session as ended and return final summary."""
1691
+ async with SESSION_LOCK:
1692
+ purge_expired_sessions()
1693
+ session = get_session_state(session_id)
1694
+ if session is None:
1695
+ raise HTTPException(
1696
+ status_code=404,
1697
+ detail={"status": "error", "message": "Session not found or expired"}
1698
+ )
1699
+ session.status = "ended"
1700
+ session.last_update = utc_now_iso()
1701
+ save_session_state(session)
1702
+ return session_to_summary(session)
1703
+
1704
+
1705
@app.post(
    "/api/voice-detection",
    response_model=VoiceDetectionResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Bad Request"},
        401: {"model": ErrorResponse, "description": "Unauthorized"},
        429: {"model": ErrorResponse, "description": "Rate Limit Exceeded"},
        500: {"model": ErrorResponse, "description": "Internal Server Error"}
    }
)
@limiter.limit("1000/minute")  # Rate limit: 1000 requests per minute per IP
async def detect_voice(
    request: Request,  # Required for rate limiter
    voice_request: VoiceDetectionRequest,
    api_key: str = Depends(verify_api_key)  # Use dependency injection
):
    """
    Classify one Base64-encoded audio clip as AI-generated or human.

    Pipeline (each stage off-loaded to a worker thread so the event loop
    stays responsive): decode Base64 -> load/resample audio at 22050 Hz ->
    run ML + forensic analysis. Returns classification result with
    confidence score and explanation. ValueError from any stage maps to
    HTTP 400; everything else to HTTP 500.
    """
    # Log request info for debugging
    request_id = getattr(request.state, 'request_id', 'unknown')
    # Base64 inflates payloads by ~4/3, so decoded size ≈ encoded_len * 3/4.
    audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024  # Approximate decoded size
    logger.info(f"[{request_id}] Voice detection request: language={voice_request.language}, format={voice_request.audioFormat}, size~{audio_size_kb:.1f}KB")

    validate_supported_language(voice_request.language)
    validate_supported_format(voice_request.audioFormat)

    try:
        # Step 1: Decode Base64 (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Decoding Base64...")
        decode_start = time.perf_counter()
        audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
        decode_time = (time.perf_counter() - decode_start) * 1000

        # Step 2: Load audio (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Loading audio... (decode took {decode_time:.0f}ms)")
        load_start = time.perf_counter()
        audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, voice_request.audioFormat)
        load_time = (time.perf_counter() - load_start) * 1000

        # Step 3: ML Analysis (async - runs in thread pool, CPU-bound)
        duration_sec = len(audio) / sr
        logger.info(f"[{request_id}] -> Analyzing {duration_sec:.1f}s audio... (load took {load_time:.0f}ms)")
        analyze_start = time.perf_counter()
        result = await asyncio.to_thread(analyze_voice, audio, sr, voice_request.language)
        analyze_time = (time.perf_counter() - analyze_start) * 1000

        logger.info(f"[{request_id}] -> Analysis complete: {result.classification} ({result.confidence_score:.0%}) in {analyze_time:.0f}ms")

        # Extract forensic metrics if the analyzer produced feature scores.
        metrics = None
        if result.features:
            metrics = ForensicMetrics(
                authenticity_score=result.features.get("authenticity_score", 0),
                pitch_naturalness=result.features.get("pitch_naturalness", 0),
                spectral_naturalness=result.features.get("spectral_naturalness", 0),
                temporal_naturalness=result.features.get("temporal_naturalness", 0)
            )

        # "ml_fallback" flags that the primary model path was not used, so
        # the caller should treat the classification with caution.
        model_uncertain = bool((result.features or {}).get("ml_fallback", 0.0))
        explanation = result.explanation
        recommended_action = None
        response_classification = result.classification
        if model_uncertain:
            # FIX: grammar in the user-facing message ("due fallback" -> "due to fallback").
            explanation = (
                "Model uncertainty detected due to fallback inference. "
                "Treat result as cautionary and verify through trusted channels. "
                f"{result.explanation}"
            )
            recommended_action = (
                "Do not share OTP, PIN, passwords, or payment credentials. "
                "Verify caller identity through official support channels."
            )
            if settings.LEGACY_FALLBACK_RETURNS_UNCERTAIN:
                response_classification = "UNCERTAIN"

        # Return response
        return VoiceDetectionResponse(
            status="success",
            language=voice_request.language,
            classification=response_classification,
            confidenceScore=result.confidence_score,
            explanation=explanation,
            forensic_metrics=metrics,
            modelUncertain=model_uncertain,
            recommendedAction=recommended_action,
        )

    except ValueError as e:
        logger.warning(f"[{request_id}] [VALIDATION_ERROR] {e}")
        raise HTTPException(
            status_code=400,
            detail={"status": "error", "message": str(e)}
        )
    except Exception as e:
        logger.error(f"[{request_id}] [PROCESSING_ERROR] {e}", exc_info=True)
        # Include the request id so users can correlate with server logs
        # without leaking internal details.
        raise HTTPException(
            status_code=500,
            detail={"status": "error", "message": f"Internal Server Error (request_id={request_id})"}
        )
1805
+
1806
+
1807
+ # Exception handlers
1808
+ from fastapi.exceptions import RequestValidationError
1809
+
1810
def to_json_safe(value: Any) -> Any:
    """Recursively coerce *value* into JSON-serializable primitives.

    - None/str/int/float/bool pass through unchanged.
    - Exceptions become their message text.
    - dict keys are stringified; values converted recursively.
    - list/tuple/set all become lists (JSON has no tuple/set).
    - Anything else falls back to str().
    """
    if value is None:
        return value
    if isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, BaseException):
        return str(value)
    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            converted[str(key)] = to_json_safe(item)
        return converted
    if isinstance(value, (list, tuple, set)):
        return [to_json_safe(element) for element in value]
    return str(value)
1821
+
1822
+
1823
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """
    Custom handler for 422 Validation Errors.
    Provides clearer error messages for common issues.
    """
    errors = to_json_safe(exc.errors())
    logger.warning("Validation error: %s", errors)

    # Human-readable "location: message" summaries for each failure.
    error_messages = [
        f"{' -> '.join(str(part) for part in err.get('loc', []))}: {err.get('msg', 'Invalid value')}"
        for err in errors
    ]

    # Attach a targeted hint for the two most common client mistakes.
    if any("audioBase64" in str(err.get("loc", [])) for err in errors):
        hint = " Hint: Ensure 'audioBase64' is a valid Base64-encoded string."
    elif any("language" in str(err.get("loc", [])) for err in errors):
        hint = f" Hint: 'language' must be one of: {', '.join(settings.SUPPORTED_LANGUAGES)}."
    else:
        hint = ""

    return JSONResponse(
        status_code=422,
        content={
            "status": "error",
            "message": f"Request validation failed: {'; '.join(error_messages)}.{hint}",
            "details": errors
        }
    )
1855
+
1856
+
1857
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Custom exception handler to ensure consistent error format.

    Dict details are assumed to already carry the {"status", "message"}
    shape and are passed through; anything else is wrapped in it.
    """
    detail = exc.detail
    if isinstance(detail, dict):
        payload = detail
    else:
        payload = {"status": "error", "message": str(detail)}
    return JSONResponse(status_code=exc.status_code, content=payload)
1869
+
1870
+
1871
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global handler to catch unhandled exceptions and prevent stack traces."""
    # Full traceback goes to the server log only; clients get a generic 500.
    logger.error(f"Unhandled error: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"status": "error", "message": "Internal Server Error"}
    )
1879
+
1880
+
1881
+ if __name__ == "__main__":
1882
+ import uvicorn
1883
+ uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
model.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Analysis Engine.
3
+ Combines Wav2Vec2 deepfake detection with signal forensics.
4
+ """
5
+ import logging
6
+ import os
7
+ import numpy as np
8
+ from typing import Dict, Tuple, List, Optional
9
+ from dataclasses import dataclass
10
+ import warnings
11
+
12
+ from config import settings
13
+
14
+ # Configure logging
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Suppress warnings
18
+ warnings.filterwarnings("ignore", category=FutureWarning)
19
+ warnings.filterwarnings("ignore", category=UserWarning)
20
+
21
+ # Global model cache
22
+ _model = None
23
+ _processor = None
24
+ _device = None
25
+
26
+
27
@dataclass
class AnalysisResult:
    """Result of voice analysis.

    Plain data carrier returned by the analysis pipeline; `features`
    additionally drives fallback/uncertainty handling downstream (e.g. the
    "ml_fallback" flag read by the API layer).
    """
    classification: str  # "AI_GENERATED" or "HUMAN"
    confidence_score: float  # 0.0 to 1.0
    explanation: str  # Human-readable forensic justification
    features: Dict[str, float]  # Individual feature scores for debugging
34
+
35
+
36
def get_device():
    """Return the torch device string ("cuda" or "cpu"), caching the choice.

    The decision (and its log line) happens only on the first call; later
    calls return the cached value.
    """
    global _device
    if _device is not None:
        return _device
    import torch
    _device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {_device}")
    return _device
47
+
48
+
49
def load_model():
    """
    Load the Wav2Vec2 deepfake detection model.
    Prioritizes HuggingFace Hub model, with local fallback.

    Populates the module-level _model/_processor cache on first call and
    returns (model, processor). Raises RuntimeError if neither the primary
    nor the backup model can be loaded.
    """
    global _model, _processor

    if _model is None:
        from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

        # Model priority:
        # 1. Local fine-tuned model (for development)
        # 2. HuggingFace Hub model (for production/deployment)
        # 3. Fallback to public model
        local_path = settings.VOICE_MODEL_LOCAL_PATH
        hf_model = settings.VOICE_MODEL_ID
        backup_model = settings.VOICE_MODEL_BACKUP_ID

        if os.path.exists(local_path):
            logger.info(f"Loading local fine-tuned model from: {local_path}")
            model_name = local_path
        else:
            logger.info(f"Loading model from HuggingFace Hub: {hf_model}")
            model_name = hf_model

        try:
            _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
            _model.to(get_device())
            _model.eval()  # inference mode: disables dropout etc.
            logger.info(f"Model loaded successfully: {model_name}")
        except Exception as e:
            logger.error(f"Failed to load model {model_name}: {e}")
            # Retry once with the configured backup, unless we already tried it.
            if model_name != backup_model:
                logger.warning("Trying backup model...")
                model_name = backup_model
                try:
                    _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
                    _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
                    _model.to(get_device())
                    _model.eval()
                    logger.info(f"Backup model loaded: {model_name}")
                except Exception as e2:
                    raise RuntimeError(f"Could not load any model: {e2}")
            else:
                raise e

    return _model, _processor
98
+
99
+
100
+
101
def extract_signal_features(audio: np.ndarray, sr: int, fast_mode: bool = False) -> Dict[str, float]:
    """Extract signal-based features (pitch, entropy, silence).

    Args:
        audio: mono waveform samples (1-D float array — assumed normalized
            to roughly [-1, 1]; TODO confirm against the loader).
        sr: sample rate in Hz.
        fast_mode: when True, trades accuracy for speed on the realtime
            path (smaller FFT, centroid-based pitch proxy instead of pYIN,
            flatness-based HNR approximation instead of HPSS).

    Returns:
        Dict of scalar features. On any extraction error, a fixed dict of
        neutral defaults is returned instead of raising.
    """
    import librosa
    from scipy.stats import entropy

    features = {}

    try:
        # Use smaller FFT in fast mode for realtime throughput.
        n_fft = 512 if fast_mode else 2048
        hop_length = 256 if fast_mode else 512
        S = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))

        # Pitch analysis.
        if fast_mode:
            # Approximate pitch variability from centroid dynamics to avoid expensive pYIN on realtime path.
            spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
            centroid_mean = float(np.mean(spec_centroid) + 1e-8)
            features["pitch_stability"] = float(np.clip(np.var(spec_centroid) / (centroid_mean ** 2), 0.0, 1.5))
            features["jitter"] = float(np.clip(np.mean(np.abs(np.diff(spec_centroid))) / centroid_mean, 0.0, 0.2))
            # Energy-gated "voiced" proxy (RMS > 0.02) used later for voiced_ratio.
            voiced_flag = librosa.feature.rms(y=audio, frame_length=n_fft, hop_length=hop_length)[0] > 0.02
        else:
            # pYIN fundamental-frequency tracking over roughly C2..C7.
            f0, voiced_flag, _ = librosa.pyin(
                audio,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=sr
            )
            f0_voiced = f0[~np.isnan(f0)]
            # Require a minimum of voiced frames before trusting the statistics.
            if len(f0_voiced) > 10:
                pitch_mean = np.mean(f0_voiced)
                pitch_std = np.std(f0_voiced)
                features["pitch_stability"] = pitch_std / pitch_mean if pitch_mean > 0 else 0
                features["jitter"] = np.mean(np.abs(np.diff(f0_voiced))) / pitch_mean if pitch_mean > 0 else 0
            else:
                # Neutral mid-range defaults when pitch tracking is inconclusive.
                features["pitch_stability"] = 0.5
                features["jitter"] = 0.05

        # Spectral features
        spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
        features["spectral_centroid_var"] = float(np.var(spec_centroid))

        spec_flatness = librosa.feature.spectral_flatness(S=S)[0]
        features["spectral_flatness"] = float(np.mean(spec_flatness))

        # Entropy: mean Shannon entropy of the column-normalized spectrogram.
        S_norm = S / (np.sum(S, axis=0, keepdims=True) + 1e-10)
        frame_entropies = [entropy(frame + 1e-10) for frame in S_norm.T]
        features["spectral_entropy"] = float(np.mean(frame_entropies))

        # Silence detection: near-silence vs exact digital zeros (the
        # latter is treated downstream as a synthesis artifact).
        silence_threshold = 1e-5
        features["silence_ratio"] = float(np.sum(np.abs(audio) < silence_threshold) / len(audio))
        features["perfect_silence"] = float(np.sum(audio == 0) / len(audio))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(audio)[0]
        features["zcr_variance"] = float(np.var(zcr))

        # Additional acoustic heuristics for suspicious audio artifacts.
        spec_rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)[0]
        features["spectral_rolloff_var"] = float(np.var(spec_rolloff))
        # voiced_flag comes from either branch above (bool array); pYIN may
        # also yield NaNs in f0 but voiced_flag itself is boolean.
        features["voiced_ratio"] = float(np.mean(voiced_flag.astype(np.float32))) if voiced_flag is not None else 0.0

        rms = librosa.feature.rms(y=audio)[0]
        features["rms_var"] = float(np.var(rms))

        if fast_mode:
            # Cheap HNR approximation from flatness and entropy for realtime throughput.
            hnr_db = float(max(0.0, 30.0 - (features["spectral_flatness"] * 120.0)))
        else:
            # Harmonic/percussive separation; ratio of RMS energies in dB.
            harmonic, percussive = librosa.effects.hpss(audio)
            harmonic_rms = float(np.sqrt(np.mean(np.square(harmonic))) + 1e-8)
            percussive_rms = float(np.sqrt(np.mean(np.square(percussive))) + 1e-8)
            hnr_db = float(20.0 * np.log10(harmonic_rms / percussive_rms))
        features["harmonic_noise_ratio_db"] = hnr_db

    except Exception as e:
        # Best-effort by design: fall back to neutral values so the caller
        # always gets a complete feature dict.
        logger.warning(f"Feature extraction error: {e}")
        features = {
            "pitch_stability": 0.5,
            "jitter": 0.05,
            "spectral_centroid_var": 1000,
            "spectral_flatness": 0.1,
            "spectral_entropy": 5.0,
            "silence_ratio": 0.0,
            "perfect_silence": 0.0,
            "zcr_variance": 0.01,
            "spectral_rolloff_var": 50000.0,
            "voiced_ratio": 0.65,
            "rms_var": 0.005,
            "harmonic_noise_ratio_db": 14.0,
        }

    return features
196
+
197
+
198
def generate_explanation(
    classification: str,
    ml_confidence: float,
    features: Dict[str, float]
) -> str:
    """Generate a data-driven forensic explanation for the classification.

    Combines per-dimension naturalness scores (0-100, higher = human-like)
    into an overall authenticity score and delegates wording to the
    AI/human explanation helper matching `classification`.
    """
    # Acoustic anomaly scores on a 0-100 scale.
    pitch_score = _calculate_pitch_score(features)
    spectral_score = _calculate_spectral_score(features)
    temporal_score = _calculate_temporal_score(features)

    # Overall authenticity is the plain mean of the three dimensions.
    authenticity_score = (pitch_score + spectral_score + temporal_score) / 3

    # ML confidence selects the explanation tier (tone/assertiveness).
    if ml_confidence >= 0.95:
        confidence_tier = "high"
    elif ml_confidence >= 0.75:
        confidence_tier = "moderate"
    else:
        confidence_tier = "low"

    explainer = (
        _explain_ai_detection
        if classification == "AI_GENERATED"
        else _explain_human_detection
    )
    return explainer(
        confidence_tier, ml_confidence, authenticity_score,
        pitch_score, spectral_score, temporal_score, features
    )
231
+
232
+
233
def _calculate_pitch_score(features: Dict[str, float]) -> float:
    """Score pitch naturalness on 0-100; higher means more human-like.

    Reference ranges: human voices typically show pitch stability 0.1-0.3
    and jitter 0.02-0.08, while synthetic voices fall below both ranges.
    """
    stability = features.get("pitch_stability", 0.5)
    jitter = features.get("jitter", 0.05)

    def clamp_pct(value: float) -> float:
        # Keep the component inside the 0-100 band.
        return min(100, max(0, value))

    stability_component = clamp_pct((stability - 0.05) / 0.25 * 100)
    jitter_component = clamp_pct((jitter - 0.005) / 0.075 * 100)

    # Macro-stability is weighted more heavily than micro-jitter.
    return stability_component * 0.6 + jitter_component * 0.4
245
+
246
+
247
def _calculate_spectral_score(features: Dict[str, float]) -> float:
    """Score spectral naturalness on 0-100; higher means more human-like.

    Reference ranges: human voices typically show spectral entropy 4.5-7
    and flatness 0.02-0.12; synthetic audio tends toward lower entropy
    and higher flatness.
    """
    entropy = features.get("spectral_entropy", 5.0)
    flatness = features.get("spectral_flatness", 0.1)

    # Map entropy upward from 3.0 and flatness downward from 0.2 onto 0-100.
    entropy_component = min(100, max(0, (entropy - 3.0) / 4.0 * 100))
    flatness_component = min(100, max(0, (0.2 - flatness) / 0.18 * 100))

    # Equal weighting between the two spectral cues.
    return entropy_component * 0.5 + flatness_component * 0.5
259
+
260
+
261
def _calculate_temporal_score(features: Dict[str, float]) -> float:
    """Calculate temporal/rhythm naturalness score (0-100). Higher = more human-like.

    Rewards natural zero-crossing-rate variability and penalises runs of
    mathematically perfect silence (exact-zero samples), which this module
    treats as a strong indicator of digitally generated audio.

    Args:
        features: Signal feature dict; reads "zcr_variance" and
            "perfect_silence" with the same defaults used elsewhere.

    Returns:
        Score in [0, 100].
    """
    # NOTE: the previous version also fetched "silence_ratio" but never
    # used it; the dead local has been removed (behavior unchanged).
    zcr_var = features.get("zcr_variance", 0.01)
    perfect_silence = features.get("perfect_silence", 0.0)

    # Exact-zero samples essentially never occur in real microphone
    # captures; cap the penalty at 50 so noisy-but-real audio survives.
    digital_penalty = min(50, perfect_silence * 500)

    # Full marks once ZCR variance reaches 0.02.
    zcr_score = min(100, max(0, zcr_var / 0.02 * 100))

    return max(0, zcr_score - digital_penalty)
273
+
274
+
275
def _calculate_acoustic_anomaly_score(features: Dict[str, float]) -> float:
    """
    Estimate suspicious acoustic artifact intensity (0-100).
    Higher score indicates stronger synthetic/spoof-like signal artifacts.
    """
    perfect_silence = features.get("perfect_silence", 0.0)
    spectral_flatness = features.get("spectral_flatness", 0.1)
    rolloff_var = features.get("spectral_rolloff_var", 50000.0)
    voiced_ratio = features.get("voiced_ratio", 0.65)
    hnr_db = features.get("harmonic_noise_ratio_db", 14.0)

    def cap(value: float) -> float:
        # Every component saturates at 100.
        return min(100.0, value)

    # Exact-zero samples are a digital-processing fingerprint.
    digital = cap(perfect_silence * 10000.0)
    # Spectral flatness above ~0.13 suggests vocoder-like output.
    flatness = cap(max(0.0, (spectral_flatness - 0.13) * 500.0))
    # Log-scaled rolloff variance above ~10^3.8 is unusually erratic.
    rolloff = cap(max(0.0, (np.log10(rolloff_var + 1.0) - 3.8) * 45.0))

    # Voiced-frame ratio outside the 0.35-0.95 band scores on either side.
    if voiced_ratio < 0.35:
        voiced = cap((0.35 - voiced_ratio) * 180.0)
    elif voiced_ratio > 0.95:
        voiced = cap((voiced_ratio - 0.95) * 180.0)
    else:
        voiced = 0.0

    # Harmonic-to-noise ratio: both "too noisy" and "too clean" are suspect.
    if hnr_db < 6.0:
        hnr = cap((6.0 - hnr_db) * 8.0)
    elif hnr_db > 28.0:
        hnr = cap((hnr_db - 28.0) * 4.0)
    else:
        hnr = 0.0

    weighted = (
        (digital * 0.35)
        + (flatness * 0.20)
        + (rolloff * 0.20)
        + (voiced * 0.15)
        + (hnr * 0.10)
    )
    return float(max(0.0, min(100.0, weighted)))
312
+
313
+
314
def _explain_ai_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate the forensic-style explanation for AI-classified audio.

    Args:
        confidence_tier: "high" / "moderate" / "low" wording tier.
        ml_confidence: Classifier (or heuristic) confidence in [0, 1].
        authenticity_score: Mean of the three naturalness scores (0-100).
        pitch_score, spectral_score, temporal_score: Domain scores (0-100).
        features: Raw signal features supplying concrete numbers in the text.

    Returns:
        A short explanation string: intro + detail + authenticity summary.
    """
    # The weakest (lowest) domain is the most AI-like characteristic and
    # selects the detail sentence. min() replaces the previous full sort,
    # whose second tuple element (primary_score) was never used.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = min(scores, key=scores.get)

    if confidence_tier == "high":
        intro = f"Strong synthetic markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Synthetic patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Possible synthetic audio (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the lowest scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Pitch analysis shows unusually consistent patterns (stability: {stability:.3f}, micro-variation: {jitter:.4f}) - typical of synthesized speech."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        flatness = features.get("spectral_flatness", 0)
        detail = f"Spectral fingerprint indicates synthetic generation (complexity: {entropy:.2f}, flatness: {flatness:.3f}) - lacking natural harmonic richness."
    else:
        perfect_silence = features.get("perfect_silence", 0)
        if perfect_silence > 0.005:
            detail = f"Digital artifacts detected: {perfect_silence:.1%} exact-zero samples found, indicating synthetic audio processing."
        else:
            # Plain string (previous version used an f-string with no placeholders).
            detail = "Temporal patterns suggest algorithmic generation - rhythm lacks natural human irregularities."

    # Authenticity score gives the reader a single 0-100 summary metric.
    authenticity_label = "very low" if authenticity_score < 25 else "low" if authenticity_score < 50 else "borderline"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
364
+
365
+
366
def _explain_human_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate the forensic-style explanation for human-classified audio.

    Args:
        confidence_tier: "high" / "moderate" / "low" wording tier.
        ml_confidence: Classifier (or heuristic) confidence in [0, 1].
        authenticity_score: Mean of the three naturalness scores (0-100).
        pitch_score, spectral_score, temporal_score: Domain scores (0-100).
        features: Raw signal features supplying concrete numbers in the text.

    Returns:
        A short explanation string: intro + detail + authenticity summary.
    """
    # The strongest (highest) domain is the most human-like characteristic
    # and selects the detail sentence. max() replaces the previous reverse
    # sort, whose second tuple element (primary_score) was never used.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = max(scores, key=scores.get)

    if confidence_tier == "high":
        intro = f"Strong human voice markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Human speech patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Likely human voice (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the highest scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Natural pitch dynamics confirmed (variability: {stability:.3f}, micro-fluctuations: {jitter:.4f}) - consistent with biological speech production."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        detail = f"Rich harmonic structure detected (complexity score: {entropy:.2f}) - characteristic of natural vocal tract resonance."
    else:
        zcr_var = features.get("zcr_variance", 0)
        detail = f"Organic speech rhythm detected (variance: {zcr_var:.4f}) - natural breathing and articulation patterns present."

    # Authenticity score gives the reader a single 0-100 summary metric.
    authenticity_label = "excellent" if authenticity_score > 75 else "good" if authenticity_score > 50 else "moderate"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
411
+
412
+
413
def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
    """
    Classify audio using the Wav2Vec2 model.

    Args:
        audio: Mono waveform samples (any sample rate; resampled to 16 kHz
            below because Wav2Vec2 expects 16 kHz input).
        sr: Sample rate of `audio` in Hz.

    Returns:
        Tuple of (classification, confidence) where classification is
        "AI_GENERATED" or "HUMAN" and confidence is the softmax
        probability of the predicted class in [0, 1].

    Raises:
        Propagates any exception from model loading or inference; the
        caller (analyze_voice) catches these and falls back to signal
        analysis.
    """
    import torch
    import librosa

    model, processor = load_model()
    device = get_device()

    # Normalize audio peak to 1.0 to prevent clipping issues; skip when the
    # buffer is all zeros to avoid division by zero.
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
    target_sr = 16000
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Tokenize/featurize the waveform for the model.
    inputs = processor(
        audio,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True
    )

    # Move every input tensor to the inference device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

    # Predicted class index and its softmax probability.
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    # Map class to label using the model's id2label config.
    # IMPORTANT: HuggingFace stores id2label with STRING keys ("0", "1")
    # but predicted_class from torch.argmax().item() is an int.
    # We must normalise the keys to int so .get() actually matches.
    # (This was the root cause of the previously inverted classifications.)
    raw_id2label = getattr(model.config, 'id2label', None) or {}
    id2label = {int(k): v for k, v in raw_id2label.items()}
    label = id2label.get(predicted_class, 'UNKNOWN')

    # Diagnostic logging so label-mapping problems are visible in logs.
    logger.info(
        "Model id2label=%s predicted_class=%d resolved_label=%s",
        id2label, predicted_class, label,
    )

    # Normalize label: known synthetic label names map to AI_GENERATED;
    # everything else (including the 'UNKNOWN' fallback) is treated as HUMAN.
    if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
        classification = "AI_GENERATED"
    else:
        classification = "HUMAN"

    return classification, confidence
477
+
478
+
479
def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtime: bool = False) -> AnalysisResult:
    """
    Analyze a voice sample and classify as AI-generated or Human.

    Args:
        audio: Audio waveform as numpy array
        sr: Sample rate
        language: Language of the audio (for context)
        realtime: When True (and settings.REALTIME_LIGHTWEIGHT_AUDIO is
            enabled) skip transformer inference and classify from the
            acoustic heuristic scores instead.

    Returns:
        AnalysisResult with classification, confidence, and explanation

    Raises:
        ValueError: If audio is too short for reliable analysis
    """
    # Validate minimum audio duration (at least 0.5 seconds for reliable analysis)
    min_duration = 0.5  # seconds
    duration = len(audio) / sr
    if duration < min_duration:
        raise ValueError(f"Audio too short ({duration:.2f}s). Minimum {min_duration}s required for reliable analysis.")

    # Fast mode only activates when the caller asks for realtime AND the
    # deployment has opted into the lightweight path.
    fast_mode = bool(realtime and settings.REALTIME_LIGHTWEIGHT_AUDIO)

    # Get model prediction (legacy/deep path) or defer to lightweight realtime heuristic.
    ml_fallback = False
    classification = "HUMAN"
    ml_confidence = 0.5
    if not fast_mode:
        try:
            classification, ml_confidence = classify_with_model(audio, sr)
        except Exception as e:
            # Deep model failed (load error, OOM, bad input) — degrade to
            # a neutral 0.5-confidence HUMAN default and record the fallback.
            logger.error(f"ML model error: {e}, falling back to signal analysis")
            ml_fallback = True
            classification = "HUMAN"
            ml_confidence = 0.5

    # Extract signal features for explainability.
    features = extract_signal_features(audio, sr, fast_mode=fast_mode)

    # Calculate scores explicitly for return.
    pitch_score = _calculate_pitch_score(features)
    spectral_score = _calculate_spectral_score(features)
    temporal_score = _calculate_temporal_score(features)
    authenticity_score = (pitch_score + spectral_score + temporal_score) / 3
    acoustic_anomaly_score = _calculate_acoustic_anomaly_score(features)

    # Lightweight realtime path avoids transformer inference for throughput.
    if fast_mode:
        # AI probability is the worse of two signals: raw anomaly score,
        # or how far authenticity falls below 52/100.
        # NOTE(review): the 0.56 decision threshold and the 52.0 pivot look
        # empirically tuned — confirm against evaluation data before changing.
        ai_probability = max(
            acoustic_anomaly_score / 100.0,
            max(0.0, min(1.0, (52.0 - authenticity_score) / 52.0)),
        )
        classification = "AI_GENERATED" if ai_probability >= 0.56 else "HUMAN"
        ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
        # Clamp heuristic confidence into [0.5, 0.99].
        ml_confidence = float(max(0.5, min(0.99, ml_confidence)))

    # Surface bookkeeping flags to API consumers (floats for JSON uniformity).
    features["ml_confidence"] = ml_confidence
    features["ml_fallback"] = float(ml_fallback)
    features["realtime_heuristic_mode"] = float(fast_mode)

    # Add computed high-level scores to features for API response.
    features["authenticity_score"] = round(authenticity_score, 1)
    features["pitch_naturalness"] = round(pitch_score, 1)
    features["spectral_naturalness"] = round(spectral_score, 1)
    features["temporal_naturalness"] = round(temporal_score, 1)
    features["acoustic_anomaly_score"] = round(acoustic_anomaly_score, 1)

    # Generate the human-readable forensic explanation.
    explanation = generate_explanation(classification, ml_confidence, features)

    return AnalysisResult(
        classification=classification,
        confidence_score=round(ml_confidence, 2),
        explanation=explanation,
        features=features
    )
555
+
556
+
557
# Pre-load model at module import (optional, for faster first request)
def preload_model():
    """Warm the classifier cache so the first real request avoids load latency."""
    try:
        load_model()
    except Exception as exc:
        logger.error(f"Model preload failed: {exc}")
privacy_utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Privacy helpers for masking sensitive entities in transcripts and logs.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+
8
+
9
# Indian mobile numbers: optional +91 prefix, then 10 digits starting 6-9,
# guarded so the run is not embedded inside a longer digit sequence.
PHONE_PATTERN = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
# UPI/VPA handles like name@bank (also matches simple email-like tokens).
UPI_PATTERN = re.compile(r"\b[a-zA-Z0-9._-]{2,}@[a-zA-Z]{2,}\b")
# Runs of 9-19 digits with optional space/hyphen separators (accounts, cards).
ACCOUNT_OR_CARD_PATTERN = re.compile(r"(?<!\d)(?:\d[ -]?){9,19}(?!\d)")
# "OTP: 123456" / "PIN - 1234" style disclosures; keyword kept, digits masked.
OTP_CONTEXT_PATTERN = re.compile(r"\b(otp|pin)\s*[:\-]?\s*(\d{4,8})\b", re.IGNORECASE)
13
+
14
+
15
def _mask_numeric_token(token: str, preserve_tail: int = 2) -> str:
    """Redact a numeric token, keeping only the trailing `preserve_tail` digits."""
    digits_only = re.sub(r"\D", "", token)
    if len(digits_only) <= preserve_tail:
        # Too few digits to keep a tail without leaking most of the value.
        return "[REDACTED_NUM]"
    tail = digits_only[-preserve_tail:]
    return f"[REDACTED_NUM_XX{tail}]"
20
+
21
+
22
def _mask_account_or_card(match: re.Match[str]) -> str:
    """Mask long digit runs (bank accounts / cards); leave short runs untouched."""
    token = match.group(0)
    # Defensive guard: only redact when there are at least 9 real digits.
    if len(re.sub(r"\D", "", token)) < 9:
        return token
    return _mask_numeric_token(token)
28
+
29
+
30
def _mask_otp(match: re.Match[str]) -> str:
    """Preserve the OTP/PIN keyword while redacting the digits that follow it."""
    keyword = match.group(1)
    return f"{keyword} [REDACTED_OTP]"
32
+
33
+
34
def mask_sensitive_entities(text: str) -> str:
    """Redact common scam-sensitive entities from plain text."""
    if not text:
        return ""

    # Order matters: context-aware OTP redaction runs first, then UPI
    # handles, then phone numbers, and finally generic long digit runs.
    redacted = OTP_CONTEXT_PATTERN.sub(_mask_otp, text)
    redacted = UPI_PATTERN.sub("[REDACTED_UPI]", redacted)
    redacted = PHONE_PATTERN.sub("[REDACTED_PHONE]", redacted)
    return ACCOUNT_OR_CARD_PATTERN.sub(_mask_account_or_card, redacted)
+ return masked
44
+
45
+
46
def sanitize_for_logging(text: str, max_chars: int = 120) -> str:
    """
    Mask sensitive entities and collapse whitespace for safe structured logging.
    """
    compacted = " ".join(mask_sensitive_entities(text).split())
    if len(compacted) > max_chars:
        # Reserve three characters for the ellipsis.
        return compacted[: max_chars - 3] + "..."
    return compacted
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.100.0
2
+ uvicorn[standard]>=0.22.0
3
+ python-multipart
4
+ librosa>=0.10.0
5
+ soundfile>=0.12.0
6
+ numpy>=1.24.0
7
+ scipy>=1.10.0
8
+ python-dotenv
9
+ pydantic>=2.0.0
10
+ transformers>=4.30.0
11
+ datasets>=2.14.0
12
+ scikit-learn>=1.3.0
13
+ accelerate>=0.20.0
14
+ slowapi>=0.1.9
15
+ pydantic-settings>=2.0.0
16
+ httpx>=0.27.0
17
+ # PyTorch - install manually for your platform if not using Docker:
18
+ # pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
19
+ torch>=2.0.0
20
+ torchaudio>=2.0.0
21
+ faster-whisper>=1.0.3
22
+
23
+ redis>=5.0.0
speech_to_text.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-text helper with optional faster-whisper backend.
3
+
4
+ The module degrades safely when ASR dependencies are unavailable.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from typing import Any, Dict, Iterable, Optional
10
+
11
+ import numpy as np
12
+
13
+ from config import settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ _asr_model = None
18
+ _asr_load_attempted = False
19
+
20
# Maps the API's human-readable language names to whisper language codes.
# Languages not listed resolve to None via .get() in transcribe_audio(),
# which makes whisper auto-detect the language.
LANGUAGE_TO_WHISPER = {
    "English": "en",
    "Tamil": "ta",
    "Hindi": "hi",
    "Malayalam": "ml",
    "Telugu": "te",
}
27
+
28
+
29
def _load_asr_model():
    """Load faster-whisper model lazily.

    Returns the cached model instance on success, or None when ASR
    dependencies are unavailable. A failed attempt is sticky: the
    _asr_load_attempted flag ensures we do not retry the expensive
    import/download on every request.
    """
    global _asr_model, _asr_load_attempted
    if _asr_model is not None:
        return _asr_model
    if _asr_load_attempted:
        # A previous attempt already failed; skip straight to the
        # degraded "ASR unavailable" path.
        return None

    _asr_load_attempted = True
    try:
        from faster_whisper import WhisperModel

        # CPU-only inference; model size and compute type come from settings.
        _asr_model = WhisperModel(
            model_size_or_path=settings.ASR_MODEL_SIZE,
            device="cpu",
            compute_type=settings.ASR_COMPUTE_TYPE,
        )
        logger.info(
            "ASR model loaded successfully: size=%s compute_type=%s",
            settings.ASR_MODEL_SIZE,
            settings.ASR_COMPUTE_TYPE,
        )
        return _asr_model
    except Exception as exc:  # pragma: no cover - environment dependent
        logger.warning("ASR model unavailable: %s", exc)
        return None
55
+
56
+
57
def _decode_segments(segments: Iterable[Any]) -> Dict[str, Any]:
    """Collapse whisper segments into a transcript plus a confidence proxy."""
    texts = []
    probs = []

    for segment in segments:
        stripped = (segment.text or "").strip()
        if stripped:
            texts.append(stripped)
        logprob = getattr(segment, "avg_logprob", None)
        if logprob is not None:
            # exp(avg_logprob) approximates a per-segment probability; the
            # exponent is clamped at 0 so the value never exceeds 1.
            probs.append(float(np.exp(min(0.0, logprob))))

    transcript = " ".join(texts).strip()
    if probs:
        confidence = float(np.mean(probs))
    else:
        # No per-segment stats: 0.5 when we still got text, else 0.0.
        confidence = 0.5 if transcript else 0.0
    confidence = min(1.0, max(0.0, confidence))

    return {
        "transcript": transcript,
        "confidence": confidence,
    }
78
+
79
+
80
def _run_transcribe(model: Any, audio: np.ndarray, language_code: Optional[str]) -> Dict[str, Any]:
    """Execute one whisper pass; language_code=None lets whisper auto-detect."""
    decode_options = dict(
        language=language_code,
        beam_size=settings.ASR_BEAM_SIZE,
        vad_filter=True,
        condition_on_previous_text=False,
        word_timestamps=False,
    )
    segments, _info = model.transcribe(audio, **decode_options)
    return _decode_segments(segments)
91
+
92
+
93
def transcribe_audio(audio: np.ndarray, sr: int, language: str) -> Dict[str, Any]:
    """
    Transcribe audio to text.

    Args:
        audio: Mono waveform as a numpy array (resampled to 16 kHz below).
        sr: Sample rate of `audio` in Hz.
        language: Human-readable language name looked up in
            LANGUAGE_TO_WHISPER; unknown names fall back to auto-detect.

    Returns:
        {
            "transcript": str,
            "confidence": float [0..1],
            "engine": str,
            "available": bool
        }
        Never raises: every failure mode degrades to an empty transcript
        with available=False and a descriptive "engine" value.
    """
    # Feature flag: deployments can disable ASR entirely.
    if not settings.ASR_ENABLED:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "disabled",
            "available": False,
        }

    # Lazy-loaded model; None means dependencies/model are unavailable.
    model = _load_asr_model()
    if model is None:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "unavailable",
            "available": False,
        }

    try:
        # Whisper expects 16 kHz float32 input.
        if sr != 16000:
            import librosa

            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        audio = np.asarray(audio, dtype=np.float32)
        language_code = LANGUAGE_TO_WHISPER.get(language)

        # First pass with the caller-provided language hint (or auto when
        # the language is not in the mapping).
        hinted = _run_transcribe(model, audio, language_code)

        # Recovery path: if language hint produced no text, retry with auto-detect.
        # This improves robustness for mixed-language/accented input.
        if not hinted["transcript"]:
            autodetect = _run_transcribe(model, audio, None)
            if autodetect["transcript"]:
                return {
                    "transcript": autodetect["transcript"],
                    "confidence": autodetect["confidence"],
                    "engine": "faster-whisper:auto",
                    "available": True,
                }

        return {
            "transcript": hinted["transcript"],
            "confidence": hinted["confidence"],
            "engine": "faster-whisper",
            "available": True,
        }
    except Exception as exc:  # pragma: no cover - runtime/audio dependent
        # Deliberate best-effort: transcription errors must not break the
        # surrounding analysis pipeline.
        logger.warning("ASR transcription failed: %s", exc)
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "error",
            "available": False,
        }