shivam0897-i committed on
Commit
3b6fefe
·
0 Parent(s):

fix: correct id2label key-type mismatch causing inverted classifications

Browse files

HuggingFace model.config.id2label uses string keys ('0','1') but
torch.argmax().item() returns int. The .get() always missed and fell
through to a hardcoded fallback with opposite label polarity, inverting
every single classification (human->AI, AI->human).

Fix: normalise id2label keys to int before lookup. Add diagnostic logging.

Files changed (14) hide show
  1. .env.example +79 -0
  2. .gitattributes +3 -0
  3. .gitignore +60 -0
  4. Dockerfile +40 -0
  5. README.md +93 -0
  6. audio_utils.py +182 -0
  7. config.py +185 -0
  8. fraud_language.py +191 -0
  9. llm_semantic_analyzer.py +253 -0
  10. main.py +1903 -0
  11. model.py +563 -0
  12. privacy_utils.py +54 -0
  13. requirements.txt +23 -0
  14. speech_to_text.py +158 -0
.env.example ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment Variables
2
+ # Copy this file to .env and update values
3
+
4
+ # API Key for authentication (Must be set!)
5
+ API_KEY=your_secure_api_key_here
6
+
7
+ # Server port (Hugging Face uses 7860)
8
+ PORT=7860
9
+
10
+ # Optional CORS origins
11
+ # Example: ALLOWED_ORIGINS=https://your-ui.vercel.app,http://localhost:5173
12
+ ALLOWED_ORIGINS=*
13
+
14
+ # Realtime ASR settings
15
+ ASR_ENABLED=true
16
+ ASR_MODEL_SIZE=tiny
17
+ ASR_COMPUTE_TYPE=int8
18
+ ASR_BEAM_SIZE=1
19
+ ASR_TIMEOUT_MS=1200
20
+ ASR_MAX_INFLIGHT_TASKS=1
21
+ ASR_WARMUP_ENABLED=true
22
+ AUDIO_PIPELINE_WARMUP_ENABLED=true
23
+ VOICE_WARMUP_ENABLED=true
24
+
25
+ # Voice model settings
26
+ VOICE_MODEL_ID=shivam-2211/voice-detection-model
27
+ VOICE_MODEL_BACKUP_ID=mo-thecreator/Deepfake-audio-detection
28
+ VOICE_MODEL_LOCAL_PATH=./fine_tuned_model
29
+ REALTIME_LIGHTWEIGHT_AUDIO=true
30
+ LEGACY_FALLBACK_RETURNS_UNCERTAIN=true
31
+
32
+ # Privacy and retention defaults
33
+ MASK_TRANSCRIPT_OUTPUT=true
34
+ SESSION_ACTIVE_RETENTION_SECONDS=1800
35
+ SESSION_ENDED_RETENTION_SECONDS=300
36
+
37
+ # Realtime risk policy tuning
38
+ RISK_POLICY_VERSION=v1.2
39
+ RISK_WEIGHT_AUDIO=0.45
40
+ RISK_WEIGHT_KEYWORD=0.20
41
+ RISK_WEIGHT_SEMANTIC=0.15
42
+ RISK_WEIGHT_BEHAVIOUR=0.20
43
+ RISK_DELTA_BOOST_FACTOR=0.30
44
+
45
+ # Optional LLM semantic verifier (second-layer, disabled by default)
46
+ LLM_SEMANTIC_ENABLED=false
47
+ LLM_PROVIDER=gemini
48
+ # Optional override (openai example: gpt-4o-mini, gemini example: gemini-1.5-flash)
49
+ LLM_SEMANTIC_MODEL=
50
+ LLM_SEMANTIC_TIMEOUT_MS=900
51
+ LLM_SEMANTIC_MIN_ASR_CONFIDENCE=0.35
52
+ LLM_SEMANTIC_CHUNK_INTERVAL=2
53
+ LLM_SEMANTIC_BLEND_WEIGHT=0.20
54
+ OPENAI_API_KEY=
55
+
56
+ # Gemini provider key (used when LLM_PROVIDER=gemini)
57
+ GEMINI_API_KEY=
58
+
59
+ # Session store backend
60
+ # memory = current single-instance behavior
61
+ # redis = required for multi-worker / restart-safe sessions
62
+ SESSION_STORE_BACKEND=memory
63
+ REDIS_URL=
64
+ REDIS_PREFIX=ai_call_shield
65
+ REDIS_CONNECT_TIMEOUT_MS=2000
66
+ REDIS_IO_TIMEOUT_MS=2000
67
+
68
+ # Deep-lane async verification (future-ready toggles)
69
+ DEEP_LANE_ENABLED=false
70
+ DEEP_LANE_QUEUE_BACKEND=memory
71
+ DEEP_LANE_MAX_WORKERS=2
72
+ DEEP_LANE_MAX_RETRIES=1
73
+ DEEP_LANE_RETRY_BACKOFF_MS=500
74
+ DEEP_LANE_TARGET_LATENCY_MS=3000
75
+
76
+ # Performance budgets for harness and CI gates
77
+ PERF_CHUNK_P95_TARGET_MS=1200
78
+ PERF_ALERT_P95_TARGET_MS=2500
79
+
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fine_tuned_model/model.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ .venv/
3
+ venv/
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ *.pyd
8
+ .pytest_cache/
9
+ .coverage
10
+ .coverage.*
11
+ htmlcov/
12
+
13
+ # Environment and secrets
14
+ .env
15
+ .env.*
16
+ !.env.example
17
+
18
+ # Local AI/tooling folders
19
+ .agent/
20
+ .agents/
21
+ .codex/
22
+ .claude/
23
+ .gemini/
24
+ .trae/
25
+ .windsurf/
26
+
27
+ # OS / editor
28
+ .DS_Store
29
+ Thumbs.db
30
+ *.log
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+
35
+ # Large artifacts
36
+ *.mp4
37
+ *.wav
38
+ *.mp3
39
+ fine_tuned_model/
40
+ training/
41
+
42
+ # === Non-production files (keep out of HF Space) ===
43
+
44
+ # Tests
45
+ tests/
46
+ pytest.ini
47
+
48
+ # Docs and reports
49
+ docs/
50
+
51
+ # Dev/validation scripts
52
+ scripts/
53
+ scenario_validation_cases.py
54
+
55
+ # Test request fixtures
56
+ test_request.json
57
+ test_valid.json
58
+
59
+ # Helper/patch scripts
60
+ _fix_*.py
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies for librosa and audio processing
6
+ RUN apt-get update && apt-get install -y \
7
+ libsndfile1 \
8
+ ffmpeg \
9
+ git \
10
+ git-lfs \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Initialize git lfs
14
+ RUN git lfs install
15
+
16
+ # Copy requirements first for better caching
17
+ COPY requirements.txt .
18
+
19
+ # Install CPU-only PyTorch first (smaller size)
20
+ RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
21
+
22
+ # Install other dependencies
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Copy application code and model
26
+ COPY . .
27
+
28
+ # Create a non-root user for HF Spaces
29
+ RUN useradd -m -u 1000 user
30
+ USER user
31
+ ENV HOME=/home/user \
32
+ PATH=/home/user/.local/bin:$PATH
33
+
34
+ WORKDIR /app
35
+
36
+ # Hugging Face Spaces uses port 7860
37
+ EXPOSE 7860
38
+
39
+ # Run the application
40
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Voice Detection API
3
+ emoji: 🎤
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ app_port: 7860
10
+ ---
11
+
12
+ # AI Voice Detection API
13
+
14
+ Detects whether a voice sample is AI-generated or spoken by a real human using a fine-tuned Wav2Vec2 model.
15
+
16
+ ## API Endpoint
17
+
18
+ `POST /api/voice-detection`
19
+
20
+ ### Headers
21
+ - `x-api-key`: Your API key (set via environment variable `API_KEY`)
22
+
23
+ ### Request Body
24
+ ```json
25
+ {
26
+ "language": "English",
27
+ "audioFormat": "mp3",
28
+ "audioBase64": "<base64-encoded-audio>"
29
+ }
30
+ ```
31
+
32
+ ### Response
33
+ ```json
34
+ {
35
+ "status": "success",
36
+ "language": "English",
37
+ "classification": "AI_GENERATED" | "HUMAN",
38
+ "confidenceScore": 0.95,
39
+ "explanation": "AI voice indicators: ..."
40
+ }
41
+ ```
42
+
43
+ ## Supported Languages
44
+ - English
45
+ - Tamil
46
+ - Hindi
47
+ - Malayalam
48
+ - Telugu
49
+
50
+
51
+
52
+ ## Realtime Session APIs
53
+
54
+ The backend also supports session-based realtime analysis:
55
+
56
+ - `POST /v1/session/start`
57
+ - `POST /v1/session/{session_id}/chunk`
58
+ - `GET /v1/session/{session_id}/summary`
59
+ - `GET /v1/session/{session_id}/alerts`
60
+ - `POST /v1/session/{session_id}/end`
61
+
62
+ Compatibility aliases are available under `/api/voice-detection/v1/...`.
63
+
64
+ ## Optional LLM Semantic Verifier
65
+
66
+ A second-layer semantic verifier can be enabled to improve ambiguous chunk scoring:
67
+
68
+ - `LLM_SEMANTIC_ENABLED=true`
69
+ - `LLM_PROVIDER=openai` with `OPENAI_API_KEY=<your_key>`, or
70
+ - `LLM_PROVIDER=gemini` with `GEMINI_API_KEY=<your_key>`
71
+ - Tune with `LLM_SEMANTIC_*` env variables in `.env.example`.
72
+
73
+ If `LLM_SEMANTIC_MODEL` is empty, provider defaults are used (`gpt-4o-mini` for OpenAI, `gemini-1.5-flash` for Gemini).
74
+
75
+ The LLM layer is optional and the API continues to work when disabled.
76
+
77
+
78
+ ## Session Store Backend
79
+
80
+ Realtime sessions support two backends:
81
+
82
+ - `memory` (default): single-instance, volatile
83
+ - `redis`: multi-worker and restart-safe (recommended for finals)
84
+
85
+ Backend env settings:
86
+
87
+ - `SESSION_STORE_BACKEND=redis`
88
+ - `REDIS_URL=redis://...` (or `rediss://...`)
89
+ - `REDIS_PREFIX=ai_call_shield`
90
+
91
+ `GET /health` now includes `session_store_backend` so you can verify active backend.
92
+
93
+ See `docs/architecture/redis-credentials-guide.md` for credential formats and setup steps.
audio_utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio utility functions for Base64 decoding and audio loading.
3
+ """
4
+ import base64
5
+ import io
6
+ import tempfile
7
+ import os
8
+ import logging
9
+ from typing import Tuple, Optional
10
+ import numpy as np
11
+
12
+ # Configure logging
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Magic bytes for common audio formats
16
+ AUDIO_MAGIC_BYTES = {
17
+ b'\xff\xfb': 'mp3', # MP3 (MPEG Audio Layer 3)
18
+ b'\xff\xfa': 'mp3', # MP3 variant
19
+ b'\xff\xf3': 'mp3', # MP3 variant
20
+ b'\xff\xf2': 'mp3', # MP3 variant
21
+ b'ID3': 'mp3', # MP3 with ID3 tag
22
+ b'RIFF': 'wav', # WAV
23
+ b'fLaC': 'flac', # FLAC
24
+ b'OggS': 'ogg', # OGG
25
+ b'\x00\x00\x00': 'm4a', # M4A/MP4 (ftyp box)
26
+ }
27
+
28
+
29
+ def validate_audio_content(audio_bytes: bytes) -> Tuple[bool, str]:
30
+ """
31
+ Validate that the bytes actually contain audio data.
32
+
33
+ Args:
34
+ audio_bytes: Raw bytes to validate
35
+
36
+ Returns:
37
+ Tuple of (is_valid, detected_format_or_error_message)
38
+ """
39
+ if len(audio_bytes) < 12:
40
+ return False, "Audio data too small to be valid"
41
+
42
+ # Check for text content (common mistake: uploading CSV/JSON as audio)
43
+ # ASCII printable range check on first 100 bytes
44
+ sample = audio_bytes[:100]
45
+ printable_ratio = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13)) / len(sample)
46
+ if printable_ratio > 0.9:
47
+ # Likely text content
48
+ preview = sample[:50].decode('utf-8', errors='replace')
49
+ return False, f"File appears to be text, not audio. Preview: {preview[:30]}..."
50
+
51
+ # Check magic bytes
52
+ for magic, fmt in AUDIO_MAGIC_BYTES.items():
53
+ if audio_bytes.startswith(magic):
54
+ return True, fmt
55
+
56
+ # Check for M4A/MP4 (ftyp at offset 4)
57
+ if len(audio_bytes) > 8 and audio_bytes[4:8] == b'ftyp':
58
+ return True, "m4a"
59
+
60
+ # Unknown format but not text - allow it and let librosa try
61
+ logger.warning("Unknown audio format, attempting to load anyway")
62
+ return True, "unknown"
63
+
64
+
65
def decode_base64_audio(base64_string: str) -> bytes:
    """
    Decode a Base64-encoded audio payload into raw bytes.

    Accepts plain Base64 as well as full data URIs
    ("data:audio/...;base64,<payload>").

    Args:
        base64_string: Base64-encoded audio data

    Returns:
        Raw audio bytes

    Raises:
        ValueError: If the Base64 string is invalid
    """
    try:
        # Data URIs carry the payload after the first comma; keep only that part.
        head, sep, tail = base64_string.partition(",")
        payload = tail if sep else head
        return base64.b64decode(payload.strip())
    except Exception as e:
        raise ValueError(f"Invalid Base64 encoding: {str(e)}")
89
+
90
+
91
def load_audio_from_bytes(audio_bytes: bytes, target_sr: int = 22050, audio_format: str = "mp3") -> Tuple[np.ndarray, int]:
    """
    Decode raw audio file bytes into a mono waveform using librosa.

    Args:
        audio_bytes: Raw audio file bytes
        target_sr: Target sample rate (default 22050 Hz)
        audio_format: Audio format extension (mp3, wav, flac, ogg, m4a, mp4)

    Returns:
        Tuple of (audio waveform as numpy array, sample rate)

    Raises:
        ValueError: If audio cannot be loaded or is invalid
    """
    # Cheap structural validation before spending time on decoding.
    ok, detail = validate_audio_content(audio_bytes)
    if not ok:
        raise ValueError(f"Invalid audio file: {detail}")

    logger.info(f"Audio validation passed. Detected format hint: {detail}")

    temp_path = None
    try:
        # Imported lazily so module import stays cheap; importing soundfile
        # also verifies the backend librosa relies on is available.
        import librosa
        import soundfile as sf  # noqa: F401

        # Normalize the extension hint ("MP3", ".mp3" -> "mp3").
        fmt = audio_format.lower().strip()
        fmt = fmt.removeprefix(".")

        # Guard against path injection via the extension used below in the
        # temp-file suffix (security).
        if not fmt.isalnum() or len(fmt) > 5:
            raise ValueError(f"Invalid audio format: {fmt}")

        # librosa wants a real file path, so stage the bytes in a temp file.
        with tempfile.NamedTemporaryFile(suffix=f".{fmt}", delete=False) as handle:
            handle.write(audio_bytes)
            temp_path = handle.name

        waveform, sample_rate = librosa.load(temp_path, sr=target_sr, mono=True)

        if len(waveform) == 0:
            raise ValueError("Audio file is empty or could not be decoded")

        logger.info(f"Audio loaded successfully: {len(waveform) / sample_rate:.2f}s at {sample_rate}Hz")
        return waveform, sample_rate

    except Exception as e:
        raise ValueError(f"Failed to load audio: {str(e)}")
    finally:
        # Best-effort temp-file cleanup regardless of success or failure.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
153
+
154
+
155
def get_audio_duration(audio: np.ndarray, sr: int) -> float:
    """
    Return the duration of the waveform in seconds.

    Args:
        audio: Audio waveform samples
        sr: Sample rate in Hz

    Returns:
        Duration in seconds (sample count divided by sample rate)
    """
    sample_count = len(audio)
    return sample_count / sr
167
+
168
+
169
def normalize_audio(audio: np.ndarray) -> np.ndarray:
    """
    Peak-normalize the waveform so its maximum absolute amplitude is 1.0.

    Silent (all-zero) input is returned unchanged to avoid division by zero.

    Args:
        audio: Audio waveform

    Returns:
        Normalized audio
    """
    peak = np.abs(audio).max()
    return audio / peak if peak > 0 else audio
config.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management using Pydantic Settings.
3
+ """
4
+ from pydantic_settings import BaseSettings
5
+ from typing import List
6
+ from pydantic import Field
7
+
8
+
9
class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    # --- Core API settings ---
    API_KEY: str = Field(..., description="API Key for authentication")
    PORT: int = Field(7860, description="Server port")
    WEBSITE_URL: str = Field(
        default="https://voice-detection-nu.vercel.app/",
        description="Project or Portfolio URL"
    )

    # --- CORS settings ---
    # Stored as a raw string (aliased to ALLOWED_ORIGINS) so pydantic does not
    # attempt to JSON-parse the env var; the property below handles parsing.
    ALLOWED_ORIGINS_RAW: str = Field(default="*", alias="ALLOWED_ORIGINS")

    @property
    def ALLOWED_ORIGINS(self) -> List[str]:
        """Parse the raw CORS origins string into a list."""
        raw = self.ALLOWED_ORIGINS_RAW
        if raw.strip().startswith("["):
            # JSON-array form, e.g. '["https://a.example", "https://b.example"]'
            import json
            try:
                return json.loads(raw)
            except json.JSONDecodeError:
                pass  # fall back to comma-separated parsing
        return [origin.strip() for origin in raw.split(",") if origin.strip()]

    # --- Audio constraints ---
    MAX_AUDIO_SIZE_MB: int = 10
    SUPPORTED_LANGUAGES: List[str] = [
        "Tamil", "English", "Hindi", "Malayalam", "Telugu"
    ]
    SUPPORTED_FORMATS: List[str] = [
        "mp3", "wav", "flac", "ogg", "m4a", "mp4"
    ]

    # --- ASR settings ---
    ASR_ENABLED: bool = Field(default=True, description="Enable speech-to-text analysis for realtime sessions")
    ASR_MODEL_SIZE: str = Field(default="tiny", description="faster-whisper model size")
    ASR_COMPUTE_TYPE: str = Field(default="int8", description="faster-whisper compute type")
    ASR_BEAM_SIZE: int = Field(default=1, description="Beam size for ASR decoding")
    ASR_TIMEOUT_MS: int = Field(
        default=2500,
        ge=200,
        le=15000,
        description="Max realtime ASR duration per chunk before timeout fallback"
    )
    ASR_MAX_INFLIGHT_TASKS: int = Field(
        default=1,
        ge=1,
        le=8,
        description="Maximum concurrent ASR background tasks allowed to prevent thread pileups"
    )
    ASR_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm faster-whisper model during startup to avoid first-chunk latency spike"
    )
    AUDIO_PIPELINE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Warm audio decoding/resampling pipeline during startup"
    )
    VOICE_WARMUP_ENABLED: bool = Field(
        default=True,
        description="Run one startup inference through voice analyzer to avoid first-chunk latency spikes"
    )

    # --- Voice classification model settings ---
    VOICE_MODEL_ID: str = Field(
        default="shivam-2211/voice-detection-model",
        description="Primary Hugging Face model id for AI voice detection"
    )
    VOICE_MODEL_BACKUP_ID: str = Field(
        default="mo-thecreator/Deepfake-audio-detection",
        description="Backup model id if primary model load fails"
    )
    VOICE_MODEL_LOCAL_PATH: str = Field(
        default="./fine_tuned_model",
        description="Optional local model path that takes priority when present"
    )
    REALTIME_LIGHTWEIGHT_AUDIO: bool = Field(
        default=False,
        description="Use lightweight audio analysis path for realtime chunk processing (set true for throughput-first mode)"
    )
    LEGACY_FALLBACK_RETURNS_UNCERTAIN: bool = Field(
        default=True,
        description="Return UNCERTAIN classification on legacy endpoint when ML fallback occurs"
    )

    # --- Risk policy (versioned + configurable weights) ---
    RISK_POLICY_VERSION: str = Field(default="v1.2", description="Version tag for realtime risk policy")
    RISK_WEIGHT_AUDIO: float = Field(default=0.45, ge=0.0, le=1.0)
    RISK_WEIGHT_KEYWORD: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_WEIGHT_SEMANTIC: float = Field(default=0.15, ge=0.0, le=1.0)
    RISK_WEIGHT_BEHAVIOUR: float = Field(default=0.20, ge=0.0, le=1.0)
    RISK_DELTA_BOOST_FACTOR: float = Field(
        default=0.30,
        ge=0.0,
        le=1.0,
        description="How strongly risk increases when per-chunk delta is positive"
    )

    # --- Optional LLM semantic verifier (second-layer, not primary classifier) ---
    LLM_SEMANTIC_ENABLED: bool = Field(default=False)
    LLM_PROVIDER: str = Field(default="openai", description="LLM provider: openai or gemini")
    LLM_SEMANTIC_MODEL: str = Field(default="", description="Model name for selected LLM provider (optional)")
    LLM_SEMANTIC_TIMEOUT_MS: int = Field(default=900, ge=100, le=5000)
    LLM_SEMANTIC_MIN_ASR_CONFIDENCE: float = Field(default=0.35, ge=0.0, le=1.0)
    LLM_SEMANTIC_CHUNK_INTERVAL: int = Field(default=2, ge=1, le=20)
    LLM_SEMANTIC_BLEND_WEIGHT: float = Field(
        default=0.20,
        ge=0.0,
        le=1.0,
        description="Weight assigned to LLM semantic score in fused semantic score"
    )
    OPENAI_API_KEY: str | None = Field(default=None, description="Optional OpenAI API key for LLM semantic verifier")
    GEMINI_API_KEY: str | None = Field(default=None, description="Optional Gemini API key for LLM semantic verifier")

    # --- Session store backend ---
    SESSION_STORE_BACKEND: str = Field(
        default="memory",
        description="Session store backend: memory or redis"
    )
    REDIS_URL: str | None = Field(
        default=None,
        description="Redis URL for session state and queue (required when SESSION_STORE_BACKEND=redis)"
    )
    REDIS_PREFIX: str = Field(
        default="ai_call_shield",
        description="Redis key prefix namespace"
    )
    REDIS_CONNECT_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)
    REDIS_IO_TIMEOUT_MS: int = Field(default=2000, ge=100, le=30000)

    # --- Deep-lane async verification controls ---
    DEEP_LANE_ENABLED: bool = Field(
        default=False,
        description="Enable asynchronous deep-lane verification after fast-lane decision"
    )
    DEEP_LANE_QUEUE_BACKEND: str = Field(
        default="memory",
        description="Queue backend: memory or redis"
    )
    DEEP_LANE_MAX_WORKERS: int = Field(default=2, ge=1, le=16)
    DEEP_LANE_MAX_RETRIES: int = Field(default=1, ge=0, le=10)
    DEEP_LANE_RETRY_BACKOFF_MS: int = Field(default=500, ge=0, le=60000)
    DEEP_LANE_TARGET_LATENCY_MS: int = Field(default=3000, ge=200, le=10000)

    # --- Performance targets (for harness/reporting and CI gates) ---
    PERF_CHUNK_P95_TARGET_MS: int = Field(default=1200, ge=100, le=10000)
    PERF_ALERT_P95_TARGET_MS: int = Field(default=2500, ge=100, le=10000)

    # --- Session retention and privacy controls ---
    SESSION_ACTIVE_RETENTION_SECONDS: int = Field(
        default=1800,
        description="Retention TTL for active sessions with no updates"
    )
    SESSION_ENDED_RETENTION_SECONDS: int = Field(
        default=300,
        description="Retention TTL for ended sessions before purge"
    )
    MASK_TRANSCRIPT_OUTPUT: bool = Field(
        default=True,
        description="Mask sensitive entities from transcript before returning response"
    )

    # --- Environment specific ---
    SPACE_ID: str | None = Field(default=None, description="Hugging Face Space ID if running in Spaces")

    model_config = {
        "env_file": ".env",
        "case_sensitive": True,
        "extra": "ignore"
    }
182
+
183
+
184
# Shared module-level settings singleton.
settings = Settings()
fraud_language.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword and semantic fraud signal extraction from transcripts.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ import string
8
+ from typing import Any, Dict, List, Set
9
+
10
+ # Baseline keywords that are language-agnostic or commonly spoken in English/Hinglish.
11
+ COMMON_FRAUD_KEYWORDS: Dict[str, Set[str]] = {
12
+ "financial": {
13
+ "bank account", "account", "credit card", "debit card", "loan", "khata",
14
+ },
15
+ "payment": {
16
+ "upi", "upi id", "gpay", "google pay", "phonepe", "paytm", "neft", "rtgs",
17
+ "send money", "transfer money", "payment",
18
+ },
19
+ "authentication": {
20
+ "otp", "pin", "password", "cvv", "verification code", "passcode",
21
+ },
22
+ "urgency": {
23
+ "urgent", "immediately", "right now", "now", "last chance", "today only",
24
+ "abhi", "turant", "jaldi",
25
+ },
26
+ "threat": {
27
+ "blocked", "suspended", "legal action", "police", "arrest", "freeze",
28
+ },
29
+ "impersonation": {
30
+ "rbi", "bank manager", "government", "income tax", "customs", "official",
31
+ },
32
+ "offer_lure": {
33
+ "lottery", "prize", "winner", "cashback", "free", "reward",
34
+ },
35
+ }
36
+
37
+ # Language-specific script and phrase variants to improve 5-language support.
38
+ LANGUAGE_FRAUD_KEYWORDS: Dict[str, Dict[str, Set[str]]] = {
39
+ "Hindi": {
40
+ "financial": {"बैंक", "खाता", "अकाउंट", "लोन"},
41
+ "payment": {"यूपीआई", "युपीआई", "भुगतान", "पैसे भेजो", "ट्रांसफर", "गूगल पे", "फोनपे", "पेटीएम"},
42
+ "authentication": {"ओटीपी", "पिन", "पासवर्ड", "सत्यापन कोड"},
43
+ "urgency": {"अभी", "तुरंत", "जल्दी", "फौरन", "अंतिम मौका"},
44
+ "threat": {"ब्लॉक", "निलंबित", "कानूनी कार्रवाई", "गिरफ्तार", "फ्रीज"},
45
+ "impersonation": {"आरबीआई", "सरकारी अधिकारी", "बैंक मैनेजर", "इनकम टैक्स"},
46
+ "offer_lure": {"लॉटरी", "इनाम", "कैशबैक", "फ्री", "रिवॉर्ड"},
47
+ },
48
+ "Tamil": {
49
+ "financial": {"வங்கி", "கணக்கு", "அக்கவுண்ட்", "கடன்"},
50
+ "payment": {"யுபிஐ", "கூகுள் பே", "போன்பே", "பேடிஎம்", "பணம் அனுப்பு", "பணம் பரிமாற்றம்", "கட்டணம்"},
51
+ "authentication": {"ஓடிபி", "பின்", "கடவுச்சொல்", "சரிபார்ப்பு குறியீடு"},
52
+ "urgency": {"உடனே", "இப்போதே", "விரைவாக", "இப்போது", "அவசரம்"},
53
+ "threat": {"முடக்கப்படும்", "தடைசெய்யப்படும்", "சட்ட நடவடிக்கை", "காவல்", "உறையவைக்கப்படும்"},
54
+ "impersonation": {"ஆர்பிஐ", "அரசு அதிகாரி", "வங்கி மேலாளர்", "வருமானவரி"},
55
+ "offer_lure": {"லாட்டரி", "பரிசு", "கேஷ்பேக்", "இலவசம்", "வெற்றி"},
56
+ },
57
+ "Malayalam": {
58
+ "financial": {"ബാങ്ക്", "അക്കൗണ്ട്", "ഖാത", "ലോൺ"},
59
+ "payment": {"യുപിഐ", "ഗൂഗിൾ പേ", "ഫോൺപേ", "പേടിഎം", "പണം അയക്കൂ", "പേയ്മെന്റ്", "ട്രാൻസ്ഫർ"},
60
+ "authentication": {"ഒടിപി", "പിൻ", "പാസ്‌വേഡ്", "സ്ഥിരീകരണ കോഡ്"},
61
+ "urgency": {"ഉടൻ", "ഇപ്പോള്", "തൽക്ഷണം", "വേഗം", "അവസരം"},
62
+ "threat": {"ബ്ലോക്ക്", "സസ്പെൻഡ്", "നിയമ നടപടി", "അറസ്റ്റ്", "ഫ്രീസ്"},
63
+ "impersonation": {"ആർബിഐ", "സർക്കാർ ഓഫീസർ", "ബാങ്ക് മാനേജർ", "ഇൻകം ടാക്സ്"},
64
+ "offer_lure": {"ലോട്ടറി", "സമ്മാനം", "കാഷ്ബാക്ക്", "ഫ്രീ", "റിവാർഡ്"},
65
+ },
66
+ "Telugu": {
67
+ "financial": {"బ్యాంక్", "ఖాతా", "అకౌంట్", "లోన్"},
68
+ "payment": {"యూపీఐ", "గూగుల్ పే", "ఫోన్‌పే", "పేటిఎం", "డబ్బు పంపండి", "చెల్లింపు", "ట్రాన్స్‌ఫర్"},
69
+ "authentication": {"ఓటిపి", "పిన్", "పాస్‌వర్డ్", "ధృవీకరణ కోడ్"},
70
+ "urgency": {"వెంటనే", "ఇ��్పుడే", "తక్షణం", "త్వరగా", "చివరి అవకాశం"},
71
+ "threat": {"బ్లాక్", "సస్పెండ్", "చట్టపరమైన చర్య", "అరెస్ట్", "ఫ్రీజ్"},
72
+ "impersonation": {"ఆర్బిఐ", "ప్రభుత్వ అధికారి", "బ్యాంక్ మేనేజర్", "ఇన్కమ్ ట్యాక్స్"},
73
+ "offer_lure": {"లాటరీ", "బహుమతి", "క్యాష్‌బ్యాక్", "ఉచితం", "రివార్డు"},
74
+ },
75
+ }
76
+
77
+ PUNCT_TRANSLATION = str.maketrans({ch: " " for ch in (string.punctuation + "“”‘’…–—।॥،؛")})
78
+
79
+
80
def _normalize_text(text: str) -> str:
    """
    Casefold, strip punctuation, and collapse whitespace.

    Avoids ASCII-only regex stripping so Indic-script keywords remain
    searchable in the normalized output.
    """
    cleaned = text.casefold().translate(PUNCT_TRANSLATION)
    return re.sub(r"\s+", " ", cleaned).strip()
89
+
90
+
91
def _combined_keyword_catalog(language: str | None) -> Dict[str, Set[str]]:
    """
    Merge the common keyword catalog with language-specific variants.

    When the language is unknown or unsupported, every language map is merged
    so mixed-language transcripts are still covered.
    """
    catalog: Dict[str, Set[str]] = {
        category: set(words) for category, words in COMMON_FRAUD_KEYWORDS.items()
    }

    if language and language in LANGUAGE_FRAUD_KEYWORDS:
        selected_maps = [LANGUAGE_FRAUD_KEYWORDS[language]]
    else:
        selected_maps = list(LANGUAGE_FRAUD_KEYWORDS.values())

    for lang_map in selected_maps:
        for category, words in lang_map.items():
            catalog.setdefault(category, set()).update(words)

    return catalog
106
+
107
+
108
def _contains_keyword(normalized_text: str, token_set: Set[str], keyword: str) -> bool:
    """Check one keyword: phrases via substring scan, single words via exact token."""
    key = _normalize_text(keyword)
    if not key:
        return False
    # Multi-word phrases need a substring search; single words must match a
    # whole token to avoid partial-word false positives.
    return key in normalized_text if " " in key else key in token_set
115
+
116
+
117
def _match_keywords(normalized_text: str, catalog: Dict[str, Set[str]]) -> Dict[str, List[str]]:
    """Return {category: sorted keyword hits} for every category with a match."""
    tokens = set(normalized_text.split())
    matches: Dict[str, List[str]] = {}

    for category, keywords in catalog.items():
        found = sorted(kw for kw in keywords if _contains_keyword(normalized_text, tokens, kw))
        if found:
            matches[category] = found

    return matches
126
+
127
+
128
def analyze_transcript(transcript: str, language: str | None = None) -> Dict[str, Any]:
    """
    Extract keyword, semantic, and behavioural fraud signals from a transcript.

    Returns a dict with keyword hits/categories/score, semantic flags/score,
    and deduplicated behaviour signals; scores are capped at 100.
    """
    if not transcript:
        # No text: return an all-empty/zero signal payload.
        return {
            "keyword_hits": [],
            "keyword_categories": [],
            "keyword_score": 0,
            "semantic_flags": [],
            "semantic_score": 0,
            "behaviour_signals": [],
        }

    normalized = _normalize_text(transcript)
    found = _match_keywords(normalized, _combined_keyword_catalog(language))

    keyword_hits: List[str] = [
        f"{category}:{hit}"
        for category, hits in sorted(found.items())
        for hit in hits
    ]
    categories = sorted(found)
    # 7 points per individual hit plus 12 per distinct category, capped at 100.
    keyword_score = min(100, len(keyword_hits) * 7 + len(categories) * 12)

    flags: List[str] = []
    behaviours: List[str] = []

    if "urgency" in found:
        flags.append("urgency_language")
        behaviours.append("urgency_escalation")
    if "impersonation" in found:
        flags.append("authority_impersonation")
    if "authentication" in found:
        flags.append("credential_request")
    if "payment" in found:
        flags.append("payment_redirection")
    if "threat" in found:
        flags.append("coercive_threat_language")
    if "offer_lure" in found:
        flags.append("incentive_lure")

    semantic_score = min(100, len(flags) * 14)

    # Signal combinations indicate stronger fraud patterns: boost the score
    # and record the combined behaviour.
    if "impersonation" in found and "authentication" in found:
        semantic_score = min(100, semantic_score + 18)
        behaviours.append("authority_with_credential_request")
    if "payment" in found and "urgency" in found:
        semantic_score = min(100, semantic_score + 14)
        behaviours.append("urgent_payment_pressure")
    if "threat" in found and "urgency" in found:
        semantic_score = min(100, semantic_score + 10)
        behaviours.append("threat_plus_urgency")

    return {
        "keyword_hits": keyword_hits,
        "keyword_categories": categories,
        "keyword_score": keyword_score,
        "semantic_flags": flags,
        "semantic_score": semantic_score,
        "behaviour_signals": sorted(set(behaviours)),
    }
llm_semantic_analyzer.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Optional LLM semantic verifier for realtime transcript analysis.
3
+
4
+ This is a second-layer signal meant for ambiguous/uncertain chunks.
5
+ It must never block realtime flow.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ import re
12
+ from typing import Any, Dict, Optional
13
+
14
+ import httpx
15
+
16
+ from config import settings
17
+ from privacy_utils import mask_sensitive_entities
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def _clamp_int(value: Any, lo: int = 0, hi: int = 100) -> int:
23
+ try:
24
+ parsed = int(round(float(value)))
25
+ except (TypeError, ValueError):
26
+ return lo
27
+ return max(lo, min(hi, parsed))
28
+
29
+
30
+ def _clamp_float(value: Any, lo: float = 0.0, hi: float = 1.0) -> float:
31
+ try:
32
+ parsed = float(value)
33
+ except (TypeError, ValueError):
34
+ return lo
35
+ return max(lo, min(hi, parsed))
36
+
37
+
38
+ def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
39
+ if not text:
40
+ return None
41
+
42
+ text = text.strip()
43
+ try:
44
+ parsed = json.loads(text)
45
+ if isinstance(parsed, dict):
46
+ return parsed
47
+ except json.JSONDecodeError:
48
+ pass
49
+
50
+ match = re.search(r"\{[\s\S]*\}", text)
51
+ if not match:
52
+ return None
53
+
54
+ try:
55
+ parsed = json.loads(match.group(0))
56
+ return parsed if isinstance(parsed, dict) else None
57
+ except json.JSONDecodeError:
58
+ return None
59
+
60
+
61
def _resolve_provider() -> str:
    """Normalise the configured LLM provider name to 'openai' or 'gemini'."""
    raw = getattr(settings, "LLM_PROVIDER", "openai") or "openai"
    name = str(raw).strip().lower()
    # "google" is accepted as an alias for the Gemini backend.
    return "gemini" if name in {"gemini", "google"} else "openai"
66
+
67
+
68
def _resolve_model(provider: str) -> str:
    """Pick the model id: an explicit setting wins, else a per-provider default."""
    override = str(getattr(settings, "LLM_SEMANTIC_MODEL", "") or "").strip()
    if override:
        return override
    return "gemini-1.5-flash" if provider == "gemini" else "gpt-4o-mini"
75
+
76
+
77
def _provider_api_key(provider: str) -> Optional[str]:
    """Return the API key configured for *provider* (None when unset)."""
    attr = "GEMINI_API_KEY" if provider == "gemini" else "OPENAI_API_KEY"
    return getattr(settings, attr, None)
81
+
82
+
83
def is_llm_semantic_provider_ready() -> bool:
    """Return True when selected provider has required credentials."""
    return bool(_provider_api_key(_resolve_provider()))
87
+
88
+
89
def _normalized_response(data: Dict[str, Any], model_name: str, engine_name: str) -> Dict[str, Any]:
    """Coerce a parsed LLM payload into the stable response schema.

    Scores are clamped to their valid ranges, non-list collections are
    replaced with empty lists, and every list entry is stringified (falsy
    entries dropped) so callers never see raw model-produced types.
    """

    def _string_list(value: Any) -> List[str]:
        if not isinstance(value, list):
            return []
        return [str(item) for item in value if item]

    return {
        "available": True,
        "semantic_score": _clamp_int(data.get("semantic_score", 0)),
        "confidence": _clamp_float(data.get("confidence", 0.0)),
        "semantic_flags": _string_list(data.get("semantic_flags")),
        "behaviour_signals": _string_list(data.get("behaviour_signals")),
        "keyword_hints": _string_list(data.get("keyword_hints")),
        "model": model_name,
        "engine": engine_name,
    }
111
+
112
+
113
+ def _build_prompts(language: str, safe_transcript: str) -> tuple[str, str]:
114
+ system_prompt = (
115
+ "You are a telecom fraud intent classifier. "
116
+ "Return ONLY strict JSON with keys: "
117
+ "semantic_score (0-100), confidence (0-1), semantic_flags (string[]), "
118
+ "behaviour_signals (string[]), keyword_hints (string[])."
119
+ )
120
+
121
+ user_prompt = (
122
+ f"Language: {language}\n"
123
+ "Task: detect coercion, impersonation, credential request, and payment pressure.\n"
124
+ f"Transcript: {safe_transcript}"
125
+ )
126
+ return system_prompt, user_prompt
127
+
128
+
129
def _call_openai_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """POST to the OpenAI chat-completions endpoint and normalise the verdict.

    Raises httpx.HTTPStatusError on non-2xx responses; the caller converts
    any exception into a "request_failed" result.
    """
    request_body = {
        "model": model_name,
        "temperature": 0,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    response = client.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=request_body,
    )
    response.raise_for_status()

    body = response.json()
    # Defensive drill-down: a missing choice/message degrades to empty text.
    message_text = body.get("choices", [{}])[0].get("message", {}).get("content", "")
    verdict = _extract_json_object(message_text)
    if verdict is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(verdict, model_name=model_name, engine_name="openai-chat-completions")
165
+
166
+
167
def _call_gemini_semantic(
    client: httpx.Client,
    model_name: str,
    api_key: str,
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """Call the Gemini generateContent endpoint and normalise the verdict.

    Raises httpx.HTTPStatusError on non-2xx responses; the caller converts
    any exception into a "request_failed" result.
    """
    # Gemini has no separate system role here: both prompts are fused into a
    # single user part, and strict JSON output is requested via responseMimeType.
    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": f"{system_prompt}\n\n{user_prompt}"},
                ],
            }
        ],
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
        },
    }

    # The Generative Language REST API takes the key as a query parameter.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"
    response = client.post(url, params={"key": api_key}, json=payload)
    response.raise_for_status()
    data = response.json()

    # Defensive drill-down: missing candidates/content/parts degrade to "".
    content = (
        data.get("candidates", [{}])[0]
        .get("content", {})
        .get("parts", [{}])[0]
        .get("text", "")
    )
    parsed = _extract_json_object(content)
    if parsed is None:
        return {"available": False, "reason": "invalid_json"}
    return _normalized_response(parsed, model_name=model_name, engine_name="gemini-generate-content")
204
+
205
+
206
def analyze_semantic_with_llm(transcript: str, language: str, timeout_ms: Optional[int] = None) -> Dict[str, Any]:
    """
    Analyze transcript semantics via an optional LLM.

    Returns a normalized dict with `available` bool and semantic fields.

    Every failure path (disabled, missing key, masked-away text, network or
    parse error) returns {"available": False, "reason": ...} rather than
    raising, so the realtime pipeline is never blocked by this verifier.
    `timeout_ms`, when given, overrides settings.LLM_SEMANTIC_TIMEOUT_MS.
    """
    if not settings.LLM_SEMANTIC_ENABLED:
        return {"available": False, "reason": "disabled"}

    # Very short transcripts produce noisy verdicts; skip them outright.
    if not transcript or len(transcript.strip()) < 8:
        return {"available": False, "reason": "insufficient_transcript"}

    provider = _resolve_provider()
    api_key = _provider_api_key(provider)
    if not api_key:
        return {"available": False, "reason": f"missing_{provider}_api_key"}

    # Privacy: PII is masked before the transcript leaves the process.
    safe_transcript = mask_sensitive_entities(transcript).strip()
    if not safe_transcript:
        return {"available": False, "reason": "empty_after_masking"}

    # Floor of 100ms guards against zero/negative configured timeouts.
    # NOTE(review): timeout_ms=0 falls through to the configured default.
    timeout_seconds = max(0.1, (timeout_ms or settings.LLM_SEMANTIC_TIMEOUT_MS) / 1000.0)
    model_name = _resolve_model(provider)
    system_prompt, user_prompt = _build_prompts(language, safe_transcript)

    try:
        with httpx.Client(timeout=timeout_seconds) as client:
            if provider == "openai":
                return _call_openai_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )
            if provider == "gemini":
                return _call_gemini_semantic(
                    client=client,
                    model_name=model_name,
                    api_key=api_key,
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                )

            # _resolve_provider only emits the two handled names, so this is
            # a defensive dead-end kept for future provider additions.
            return {"available": False, "reason": "unsupported_provider"}
    except Exception as exc:  # pragma: no cover - network/runtime dependent
        logger.warning("LLM semantic verifier unavailable (%s): %s", provider, exc)
        return {"available": False, "reason": "request_failed"}
main.py ADDED
@@ -0,0 +1,1903 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for AI-Generated Voice Detection.
3
+
4
+ Endpoint: POST /api/voice-detection
5
+ - Accepts Base64-encoded MP3 audio
6
+ - Returns classification (AI_GENERATED or HUMAN) with confidence score
7
+ """
8
+ import logging
9
+ import asyncio
10
+ import uuid
11
+ import time
12
+ import json
13
+ import io
14
+ from dataclasses import dataclass, field, asdict
15
+ from datetime import datetime, timezone
16
+ from typing import Optional, Any, Dict, List
17
+ from contextlib import asynccontextmanager
18
+ import numpy as np
19
+ from fastapi import FastAPI, HTTPException, Request, Depends, WebSocket, WebSocketDisconnect
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import JSONResponse
22
+ from pydantic import BaseModel, Field, field_validator, ValidationError
23
+ from slowapi import Limiter, _rate_limit_exceeded_handler
24
+ from slowapi.util import get_remote_address
25
+ from slowapi.errors import RateLimitExceeded
26
+
27
+ # Configure logging
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
31
+ )
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Rate limiting
35
+ limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
36
+
37
+ from audio_utils import decode_base64_audio, load_audio_from_bytes
38
+ from model import analyze_voice, AnalysisResult
39
+ from speech_to_text import transcribe_audio
40
+ from fraud_language import analyze_transcript
41
+ from llm_semantic_analyzer import analyze_semantic_with_llm, is_llm_semantic_provider_ready
42
+ from privacy_utils import mask_sensitive_entities, sanitize_for_logging
43
+ from config import settings
44
+
45
+ try:
46
+ import redis # type: ignore
47
+ except Exception: # pragma: no cover - optional dependency
48
+ redis = None
49
+
50
+ # Computed constraints
51
+ MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
52
+
53
+
54
@dataclass
class SessionState:
    """In-memory state for a real-time analysis session (derived data only)."""
    # Identity / lifecycle
    session_id: str
    language: str
    started_at: str          # ISO timestamp set at creation
    status: str = "active"
    # Rolling counters and maxima across processed chunks
    chunks_processed: int = 0
    alerts_triggered: int = 0
    max_risk_score: int = 0
    max_cpi: float = 0.0     # Conversational Pressure Index peak
    # Latest fused verdicts
    final_call_label: str = "UNCERTAIN"
    final_voice_classification: str = "UNCERTAIN"
    final_voice_confidence: float = 0.0
    max_voice_ai_confidence: float = 0.0
    voice_ai_chunks: int = 0
    voice_human_chunks: int = 0
    llm_checks_performed: int = 0
    risk_policy_version: str = settings.RISK_POLICY_VERSION
    # Per-chunk history and aggregated language signals
    risk_history: List[int] = field(default_factory=list)
    transcript_counts: Dict[str, int] = field(default_factory=dict)
    semantic_flag_counts: Dict[str, int] = field(default_factory=dict)
    keyword_category_counts: Dict[str, int] = field(default_factory=dict)
    behaviour_score: int = 0
    session_behaviour_signals: List[str] = field(default_factory=list)
    last_transcript: str = ""
    last_update: Optional[str] = None
    alert_history: List[Dict[str, Any]] = field(default_factory=list)
    llm_last_engine: Optional[str] = None
83
+
84
+
85
+ SESSION_STORE: Dict[str, SessionState] = {}
86
+ SESSION_LOCK = asyncio.Lock()
87
+ SESSION_STORE_BACKEND_ACTIVE = "memory"
88
+ REDIS_CLIENT: Any = None
89
+ ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
90
+ ASR_INFLIGHT_LOCK = asyncio.Lock()
91
+
92
+
93
+
94
def use_redis_session_store() -> bool:
    """Return whether redis-backed session store is active."""
    if REDIS_CLIENT is None:
        return False
    return SESSION_STORE_BACKEND_ACTIVE == "redis"
97
+
98
+
99
def initialize_session_store_backend() -> None:
    """Initialize configured session backend with safe fallback to memory.

    Mutates the module globals SESSION_STORE_BACKEND_ACTIVE and REDIS_CLIENT.
    Any failure along the way (missing package, missing URL, unreachable
    server) degrades to the in-memory store with a warning rather than
    raising, so startup never fails because of the session backend.
    """
    global SESSION_STORE_BACKEND_ACTIVE, REDIS_CLIENT

    configured = str(getattr(settings, "SESSION_STORE_BACKEND", "memory") or "memory").strip().lower()
    if configured != "redis":
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        logger.info("Session store backend: memory")
        return

    # redis is an optional dependency; import failure was swallowed at module top.
    if redis is None:
        logger.warning("Redis backend requested but redis package is not installed. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    redis_url = getattr(settings, "REDIS_URL", None)
    if not redis_url:
        logger.warning("Redis backend requested but REDIS_URL is empty. Falling back to memory store.")
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
        return

    try:
        REDIS_CLIENT = redis.Redis.from_url(
            redis_url,
            decode_responses=True,  # store/fetch JSON strings, not bytes
            socket_connect_timeout=max(0.1, float(settings.REDIS_CONNECT_TIMEOUT_MS) / 1000.0),
            socket_timeout=max(0.1, float(settings.REDIS_IO_TIMEOUT_MS) / 1000.0),
        )
        REDIS_CLIENT.ping()  # fail fast if the server is unreachable
        SESSION_STORE_BACKEND_ACTIVE = "redis"
        logger.info("Session store backend: redis")
    except Exception as exc:
        logger.warning("Failed to initialize redis session store (%s). Falling back to memory store.", exc)
        SESSION_STORE_BACKEND_ACTIVE = "memory"
        REDIS_CLIENT = None
137
+
138
+
139
def _session_redis_key(session_id: str) -> str:
    """Build the namespaced redis key under which one session is stored."""
    prefix = settings.REDIS_PREFIX
    return f"{prefix}:session:{session_id}"
141
+
142
+
143
def _serialize_session(session: SessionState) -> str:
    """Serialize a session to compact JSON for redis storage."""
    payload = asdict(session)
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
145
+
146
+
147
def _deserialize_session(raw: Optional[str]) -> Optional[SessionState]:
    """Rebuild a SessionState from its JSON form; any defect yields None."""
    if not raw:
        return None
    try:
        decoded = json.loads(raw)
        # Non-object payloads (and unknown keys, via TypeError below) are rejected.
        return SessionState(**decoded) if isinstance(decoded, dict) else None
    except Exception as exc:
        logger.warning("Failed to deserialize session payload: %s", exc)
        return None
158
+
159
+
160
def get_session_state(session_id: str) -> Optional[SessionState]:
    """Fetch session state from active backend."""
    if not use_redis_session_store():
        return SESSION_STORE.get(session_id)
    stored = REDIS_CLIENT.get(_session_redis_key(session_id))
    return _deserialize_session(stored)
166
+
167
+
168
def save_session_state(session: SessionState) -> None:
    """Persist session state to active backend."""
    if not use_redis_session_store():
        SESSION_STORE[session.session_id] = session
        return
    # Redis entries expire with the session's retention window (min 1s).
    ttl_seconds = max(1, int(session_retention_seconds(session)))
    REDIS_CLIENT.set(
        _session_redis_key(session.session_id),
        _serialize_session(session),
        ex=ttl_seconds,
    )
175
+
176
+
177
def delete_session_state(session_id: str) -> None:
    """Delete session from active backend."""
    if not use_redis_session_store():
        SESSION_STORE.pop(session_id, None)
        return
    REDIS_CLIENT.delete(_session_redis_key(session_id))
183
+
184
+
185
+ def _asr_fallback_result(engine: str) -> Dict[str, Any]:
186
+ return {
187
+ "transcript": "",
188
+ "confidence": 0.0,
189
+ "engine": engine,
190
+ "available": False,
191
+ }
192
+
193
+
194
def _discard_asr_task(task: asyncio.Task) -> None:
    """Done-callback: drop a finished ASR task from the in-flight registry."""
    ASR_INFLIGHT_TASKS.discard(task)
196
+
197
+
198
async def transcribe_audio_guarded(
    audio: np.ndarray,
    sr: int,
    language: str,
    timeout_seconds: float,
    request_id: str,
) -> Dict[str, Any]:
    """Run ASR with timeout and bounded in-flight tasks to avoid thread pileups.

    Returns the transcribe_audio() result dict, or a fallback payload with
    engine "busy"/"timeout"/"error" so callers never block on ASR.
    """
    max_inflight = max(1, int(getattr(settings, "ASR_MAX_INFLIGHT_TASKS", 1)))

    async with ASR_INFLIGHT_LOCK:
        # Prune tasks whose done-callback has not fired yet.
        stale_tasks = [task for task in ASR_INFLIGHT_TASKS if task.done()]
        for stale in stale_tasks:
            ASR_INFLIGHT_TASKS.discard(stale)

        # Backpressure: drop this chunk's ASR rather than queueing more threads.
        if len(ASR_INFLIGHT_TASKS) >= max_inflight:
            logger.warning(
                "[%s] Realtime ASR skipped (inflight=%s, max=%s); continuing without transcript",
                request_id,
                len(ASR_INFLIGHT_TASKS),
                max_inflight,
            )
            return _asr_fallback_result("busy")

    # transcribe_audio is blocking; run it on a worker thread.
    asr_task = asyncio.create_task(asyncio.to_thread(transcribe_audio, audio, sr, language))
    ASR_INFLIGHT_TASKS.add(asr_task)
    asr_task.add_done_callback(_discard_asr_task)

    try:
        # shield(): on timeout the underlying task keeps running to completion
        # (and is removed via its done-callback) instead of being cancelled
        # mid-inference on the worker thread.
        return await asyncio.wait_for(asyncio.shield(asr_task), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        logger.warning(
            "[%s] Realtime ASR timed out after %.0fms; continuing without transcript",
            request_id,
            timeout_seconds * 1000,
        )
        return _asr_fallback_result("timeout")
    except Exception as exc:
        logger.warning("[%s] Realtime ASR path failed: %s; continuing without transcript", request_id, exc)
        return _asr_fallback_result("error")
238
+
239
+
240
def warmup_audio_pipeline() -> None:
    """Warm audio decoding stack to reduce first-request latency spikes."""
    if not settings.AUDIO_PIPELINE_WARMUP_ENABLED:
        return
    try:
        import soundfile as sf

        # One second of silence at 16kHz, round-tripped through the WAV
        # encoder and the project decoder, primes the decode path.
        silence = np.zeros(16000, dtype=np.float32)
        wav_bytes = io.BytesIO()
        sf.write(wav_bytes, silence, 16000, format="WAV", subtype="PCM_16")
        load_audio_from_bytes(wav_bytes.getvalue(), 22050, "wav")
        logger.info("Audio pipeline warm-up complete")
    except Exception as exc:
        logger.warning("Audio pipeline warm-up skipped: %s", exc)
254
+
255
+
256
def warmup_asr_pipeline() -> None:
    """Warm ASR model and transcription path on startup."""
    if not (settings.ASR_ENABLED and settings.ASR_WARMUP_ENABLED):
        return
    try:
        # A second of silence is enough to trigger model load + first decode.
        silence = np.zeros(16000, dtype=np.float32)
        transcribe_audio(silence, 16000, "English")
        logger.info("ASR warm-up complete")
    except Exception as exc:
        logger.warning("ASR warm-up skipped: %s", exc)
266
+
267
+
268
def warmup_voice_pipeline() -> None:
    """Run one inference pass to avoid first realtime-model cold latency spike."""
    if not settings.VOICE_WARMUP_ENABLED:
        return
    try:
        sample_rate = 16000
        duration = 1.0
        total_samples = max(1, int(sample_rate * duration))
        timeline = np.linspace(0.0, duration, total_samples, endpoint=False, dtype=np.float32)
        # Non-silent tone avoids edge-case feature paths and mirrors short speech chunks.
        tone = (0.08 * np.sin(2 * np.pi * 220 * timeline)).astype(np.float32)
        analyze_voice(tone, sample_rate, "English", True)
        logger.info("Voice model warm-up complete")
    except Exception as exc:
        logger.warning("Voice model warm-up skipped: %s", exc)
283
+
284
+
285
def run_startup_warmups() -> None:
    """Run non-critical startup warm-ups for latency-sensitive paths."""
    # Order matters: decode stack first, then models that consume decoded audio.
    for warmup in (warmup_audio_pipeline, warmup_voice_pipeline, warmup_asr_pipeline):
        warmup()
290
+
291
+
292
+ # Detect environment
293
+ if settings.SPACE_ID:
294
+ logger.info(f"Running on HuggingFace Spaces: {settings.SPACE_ID}")
295
+
296
+
297
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifespan events.

    Startup: pick the session backend, preload the ML model, then run the
    warm-up passes off the event loop. Every step is best-effort — a failure
    is logged but never prevents the server from serving requests.
    """
    logger.info("Starting up - preloading ML model...")
    initialize_session_store_backend()
    try:
        from model import preload_model
        preload_model()
        logger.info("ML model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to preload model: {e}")

    try:
        # Warm-ups are blocking; run them on a worker thread so startup
        # doesn't stall the event loop.
        await asyncio.to_thread(run_startup_warmups)
    except Exception as exc:
        logger.warning("Startup warm-ups encountered an issue: %s", exc)

    yield
    # Shutdown
    logger.info("Shutting down...")
317
+
318
+
319
+ from fastapi.responses import RedirectResponse
320
+
321
+ # Initialize FastAPI app with lifespan
322
+ app = FastAPI(
323
+ title="AI Voice Detection API",
324
+ description="Detects whether a voice sample is AI-generated or spoken by a real human",
325
+ version="1.0.0",
326
+ contact={
327
+ "name": "Shivam",
328
+ "url": settings.WEBSITE_URL,
329
+ },
330
+ docs_url="/docs",
331
+ redoc_url="/redoc",
332
+ lifespan=lifespan
333
+ )
334
+
335
+ # Add rate limiter to app state
336
+ app.state.limiter = limiter
337
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
338
+
339
+ # Middleware configuration
340
+ # CORS
341
+ # Note: Set ALLOWED_ORIGINS env var in production
342
+ app.add_middleware(
343
+ CORSMiddleware,
344
+ allow_origins=settings.ALLOWED_ORIGINS,
345
+ allow_credentials=True,
346
+ allow_methods=["GET", "POST", "OPTIONS"],
347
+ allow_headers=["Content-Type", "x-api-key", "Authorization"],
348
+ )
349
+
350
+ # Request Logging & Timing Middleware
351
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Per-request middleware: request-ID tagging, timing logs, security headers.

    Only POST requests are logged start/end (GETs like /docs stay quiet).
    Adds X-Request-ID / X-Response-Time plus HSTS, nosniff and a CSP that
    permits the CDN assets used by Swagger/ReDoc.
    """
    # Generate request ID and start timer
    request_id = str(uuid.uuid4())[:8]
    request.state.request_id = request_id
    start_time = time.perf_counter()

    # Log request start
    method = request.method
    path = request.url.path
    if method == "POST":
        logger.info(f"[{request_id}] [START] {method} {path}")

    # Process request (async)
    response = await call_next(request)

    # Calculate duration
    duration_ms = (time.perf_counter() - start_time) * 1000
    status_code = response.status_code

    # Log request completion with timing
    if method == "POST":
        status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
        logger.info(f"[{request_id}] {status_label} END {method} {path} -> {status_code} ({duration_ms:.0f}ms)")

    # Add headers
    response.headers["X-Request-ID"] = request_id
    response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
    response.headers["X-Content-Type-Options"] = "nosniff"
    # Allow embedding in Hugging Face iframe
    # response.headers["X-Frame-Options"] = "DENY"
    response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    # Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
    response.headers["Content-Security-Policy"] = (
        "default-src 'self'; "
        "script-src 'self' 'unsafe-inline' 'unsafe-eval' https://cdn.jsdelivr.net; "
        "style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net https://fonts.googleapis.com; "
        "font-src 'self' https://fonts.gstatic.com; "
        "img-src 'self' data: https://fastapi.tiangolo.com;"
    )
    return response
392
+
393
+
394
+ # Request/Response Models
395
class VoiceDetectionRequest(BaseModel):
    """Request body for voice detection."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
    audioFormat: str = Field(default="mp3", description="Audio format (must be mp3)")
    audioBase64: str = Field(..., description="Base64-encoded MP3 audio")

    @field_validator('audioBase64')
    @classmethod
    def validate_audio_size(cls, v: str) -> str:
        """Validate audio data is not too small or too large.

        Bounds are checked on the base64 string length; the upper bound is
        the precomputed MAX_AUDIO_BASE64_LENGTH derived from MAX_AUDIO_SIZE_MB.
        """
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
410
+
411
+
412
class ForensicMetrics(BaseModel):
    """Detailed forensic analysis metrics (all scores on a 0-100 scale)."""
    authenticity_score: float = Field(..., description="Overall voice naturalness score (0-100)")
    pitch_naturalness: float = Field(..., description="Pitch stability and jitter score (0-100)")
    spectral_naturalness: float = Field(..., description="Spectral entropy and flatness score (0-100)")
    temporal_naturalness: float = Field(..., description="Rhythm and silence score (0-100)")
418
+
419
+
420
class VoiceDetectionResponse(BaseModel):
    """Successful response from voice detection."""
    status: str = "success"
    language: str
    classification: str  # AI_GENERATED or HUMAN
    confidenceScore: float = Field(..., ge=0.0, le=1.0)
    explanation: str  # human-readable rationale for the classification
    forensic_metrics: Optional[ForensicMetrics] = None
    modelUncertain: bool = False  # set when confidence is too low to trust
    recommendedAction: Optional[str] = None
430
+
431
+
432
class ErrorResponse(BaseModel):
    """Error response envelope returned by failure paths."""
    status: str = "error"
    message: str  # human-readable failure description
436
+
437
+
438
class SessionStartRequest(BaseModel):
    """Request body for creating a real-time analysis session."""
    language: str = Field(..., description="Language: Tamil, English, Hindi, Malayalam, or Telugu")
441
+
442
+
443
class SessionStartResponse(BaseModel):
    """Response body after creating a session."""
    status: str = "success"
    session_id: str  # opaque id used by all subsequent chunk/summary calls
    language: str
    started_at: str  # ISO timestamp of session creation
    message: str
450
+
451
+
452
class SessionChunkRequest(BaseModel):
    """Audio chunk request for real-time analysis."""
    audioFormat: str = Field(default="mp3", description="Audio format (must be one of supported formats)")
    audioBase64: str = Field(..., description="Base64-encoded audio chunk")
    language: Optional[str] = Field(default=None, description="Optional override. Defaults to session language")

    @field_validator("audioBase64")
    @classmethod
    def validate_chunk_size(cls, v: str) -> str:
        """Reject chunks outside the accepted base64-length bounds (same limits
        as the one-shot endpoint's VoiceDetectionRequest)."""
        if len(v) < 100:
            raise ValueError("Audio data too small - provide valid audio content")
        if len(v) > MAX_AUDIO_BASE64_LENGTH:
            raise ValueError(f"Audio data too large - maximum {settings.MAX_AUDIO_SIZE_MB}MB allowed")
        return v
466
+
467
+
468
class RiskEvidence(BaseModel):
    """Model evidence used to produce risk score (one list per signal family)."""
    audio_patterns: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    behaviour: List[str] = Field(default_factory=list)
473
+
474
+
475
class RealTimeLanguageAnalysis(BaseModel):
    """Transcript and language risk signals for the current chunk."""
    transcript: str = ""
    transcript_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    asr_engine: str = "unavailable"  # "unavailable"/"busy"/"timeout" when ASR skipped
    keyword_hits: List[str] = Field(default_factory=list)       # "category:keyword" entries
    keyword_categories: List[str] = Field(default_factory=list)
    semantic_flags: List[str] = Field(default_factory=list)
    keyword_score: int = Field(default=0, ge=0, le=100)
    semantic_score: int = Field(default=0, ge=0, le=100)
    behaviour_score: int = Field(default=0, ge=0, le=100)
    session_behaviour_signals: List[str] = Field(default_factory=list)
    # Populated only when the optional LLM semantic verifier ran for this chunk.
    llm_semantic_used: bool = False
    llm_semantic_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    llm_semantic_model: Optional[str] = None
490
+
491
+
492
class RealTimeAlert(BaseModel):
    """Alert details emitted by the risk engine.

    When `triggered` is False the remaining fields stay None.
    """
    triggered: bool
    alert_type: Optional[str] = None
    severity: Optional[str] = None
    reason_summary: Optional[str] = None
    recommended_action: Optional[str] = None
499
+
500
+
501
class ExplainabilitySignal(BaseModel):
    """Per-signal contribution to fused risk score (weighted_score = raw * weight)."""
    signal: str
    raw_score: int = Field(..., ge=0, le=100)
    weight: float = Field(..., ge=0.0, le=1.0)
    weighted_score: float = Field(..., ge=0.0, le=100.0)
507
+
508
+
509
class RealTimeExplainability(BaseModel):
    """Human-readable explainability block for chunk risk output."""
    summary: str
    top_indicators: List[str] = Field(default_factory=list)
    signal_contributions: List[ExplainabilitySignal] = Field(default_factory=list)
    uncertainty_note: Optional[str] = None  # present when the model flagged uncertainty
515
+
516
+
517
class RealTimeUpdateResponse(BaseModel):
    """Chunk-by-chunk update response."""
    status: str = "success"
    session_id: str
    # UTC ISO-8601 timestamp of this update.
    timestamp: str
    # Fused risk score for the chunk, 0-100.
    risk_score: int = Field(..., ge=0, le=100)
    cpi: float = Field(..., ge=0.0, le=100.0, description="Conversational Pressure Index")
    # LOW / MEDIUM / HIGH / CRITICAL.
    risk_level: str
    # User-facing label: SAFE / SPAM / FRAUD / UNCERTAIN.
    call_label: str
    # True when model confidence is too low to trust the classification.
    model_uncertain: bool = False
    # Voice-authenticity verdict: AI_GENERATED / HUMAN / UNCERTAIN.
    voice_classification: str = "UNCERTAIN"
    voice_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Supporting evidence bundles.
    evidence: RiskEvidence
    language_analysis: RealTimeLanguageAnalysis
    alert: RealTimeAlert
    explainability: RealTimeExplainability
    # Number of chunks processed so far in this session (>= 1).
    chunks_processed: int = Field(..., ge=1)
    # Version of the risk policy that produced this result.
    risk_policy_version: str = settings.RISK_POLICY_VERSION
535
+
536
+
537
class SessionSummaryResponse(BaseModel):
    """Summary response for a completed or active session."""
    status: str = "success"
    session_id: str
    # Language the session was analyzed in.
    language: str
    # Session lifecycle state (e.g. "active" or "ended").
    session_status: str
    # ISO timestamps for session lifecycle.
    started_at: str
    last_update: Optional[str] = None
    # Aggregate counters across all processed chunks.
    chunks_processed: int = 0
    alerts_triggered: int = 0
    # Worst-case scores observed during the session.
    max_risk_score: int = 0
    max_cpi: float = 0.0
    # Final verdicts (label at peak risk; voice result of last chunk).
    final_call_label: str = "UNCERTAIN"
    final_voice_classification: str = "UNCERTAIN"
    final_voice_confidence: float = 0.0
    # Highest AI-voice confidence seen and per-class chunk tallies.
    max_voice_ai_confidence: float = 0.0
    voice_ai_chunks: int = 0
    voice_human_chunks: int = 0
    # Number of optional LLM semantic checks invoked for this session.
    llm_checks_performed: int = 0
    risk_policy_version: str = settings.RISK_POLICY_VERSION
557
+
558
+
559
class AlertHistoryItem(BaseModel):
    """One alert event emitted during session analysis."""
    # UTC ISO timestamp of the chunk that fired the alert.
    timestamp: str
    # Risk snapshot at alert time.
    risk_score: int = Field(..., ge=0, le=100)
    risk_level: str
    call_label: str
    # Alert classification and its lowercased severity.
    alert_type: str
    severity: str
    # Human-readable reason and recommended user action.
    reason_summary: str
    recommended_action: str
569
+
570
+
571
class AlertHistoryResponse(BaseModel):
    """Paginated alert history for one session."""
    status: str = "success"
    session_id: str
    # Total alerts recorded for the session (may exceed len(alerts) when paginated).
    total_alerts: int
    # The alert events returned for this page.
    alerts: List[AlertHistoryItem] = Field(default_factory=list)
577
+
578
+
579
class RetentionPolicyResponse(BaseModel):
    """Explicit privacy and retention behavior for session processing."""
    status: str = "success"
    # Raw audio is never written to storage; only derived fields persist.
    raw_audio_storage: str = "not_persisted"
    # TTLs applied to session state depending on lifecycle status.
    active_session_retention_seconds: int
    ended_session_retention_seconds: int
    # Names of the derived (non-audio) fields retained per session.
    stored_derived_fields: List[str]
586
+
587
+
588
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string ending in 'Z'."""
    stamp = datetime.now(timezone.utc).isoformat()
    # Present the offset as 'Z' rather than '+00:00' for compact UTC notation.
    return stamp.replace("+00:00", "Z")
591
+
592
+
593
# Derived, non-audio fields that may persist in session state. Raw audio is
# never stored (see RetentionPolicyResponse.raw_audio_storage). This list is
# surfaced verbatim through the retention-policy endpoint.
STORED_DERIVED_FIELDS = [
    "risk_history",
    "behaviour_score",
    "session_behaviour_signals",
    "transcript_counts",
    "semantic_flag_counts",
    "keyword_category_counts",
    "chunks_processed",
    "alerts_triggered",
    "max_risk_score",
    "final_call_label",
    "voice_ai_chunks",
    "voice_human_chunks",
    "max_voice_ai_confidence",
    "final_voice_classification",
    "llm_checks_performed",
]
610
+
611
+
612
def parse_iso_timestamp(value: Optional[str]) -> Optional[float]:
    """Convert an ISO-8601 timestamp to epoch seconds; None on missing/invalid input."""
    if value is None:
        return None
    # fromisoformat does not accept the 'Z' suffix on older Pythons; normalize it.
    normalized = value.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
    except ValueError:
        return None
    return parsed.timestamp()
620
+
621
+
622
def session_reference_timestamp(session: SessionState) -> Optional[float]:
    """Return the best available timestamp for retention checks.

    Prefers last_update; falls back to started_at when last_update is
    missing or unparseable (or parses to a falsy epoch value).
    """
    primary = parse_iso_timestamp(session.last_update)
    if primary:
        return primary
    return parse_iso_timestamp(session.started_at)
625
+
626
+
627
def session_retention_seconds(session: SessionState) -> int:
    """Resolve the retention TTL (seconds) from the session's lifecycle status."""
    # Ended sessions use a (typically shorter) dedicated retention window.
    has_ended = session.status == "ended"
    return (
        settings.SESSION_ENDED_RETENTION_SECONDS
        if has_ended
        else settings.SESSION_ACTIVE_RETENTION_SECONDS
    )
632
+
633
+
634
def is_session_expired(session: SessionState, now_ts: Optional[float] = None) -> bool:
    """Check whether a session exceeded its status-specific retention TTL.

    Sessions with no parseable reference timestamp are treated as not expired.
    """
    anchor = session_reference_timestamp(session)
    if anchor is None:
        return False
    moment = time.time() if now_ts is None else now_ts
    age = moment - anchor
    return age > session_retention_seconds(session)
641
+
642
+
643
def purge_expired_sessions(now_ts: Optional[float] = None) -> int:
    """Best-effort retention purge for stale sessions (memory backend).

    Returns the number of sessions removed; 0 when Redis is the backend,
    since Redis keys self-expire via TTL and need no in-process purge.
    """
    if use_redis_session_store():
        return 0

    moment = time.time() if now_ts is None else now_ts
    # Collect first, then delete, to avoid mutating the store mid-iteration.
    stale_ids = [
        session_id
        for session_id, state in SESSION_STORE.items()
        if is_session_expired(state, moment)
    ]
    for session_id in stale_ids:
        delete_session_state(session_id)
    return len(stale_ids)
654
+
655
+
656
def validate_supported_language(language: str) -> None:
    """Raise HTTP 400 when the language is not in settings.SUPPORTED_LANGUAGES."""
    if language in settings.SUPPORTED_LANGUAGES:
        return
    supported = ', '.join(settings.SUPPORTED_LANGUAGES)
    raise HTTPException(
        status_code=400,
        detail={
            "status": "error",
            "message": f"Unsupported language. Must be one of: {supported}"
        }
    )
666
+
667
+
668
def validate_supported_format(audio_format: str) -> None:
    """Raise HTTP 400 when the (case-insensitive) audio format is unsupported."""
    if audio_format.lower() in settings.SUPPORTED_FORMATS:
        return
    supported = ', '.join(settings.SUPPORTED_FORMATS)
    raise HTTPException(
        status_code=400,
        detail={
            "status": "error",
            "message": f"Unsupported audio format. Must be one of: {supported}"
        }
    )
679
+
680
+
681
def normalize_transcript_for_behavior(transcript: str) -> str:
    """Normalize a transcript for repetition and trend analysis.

    Lowercases, replaces non-alphanumeric characters with spaces, and
    collapses consecutive whitespace to single spaces.
    """
    pieces = []
    for ch in transcript.lower():
        pieces.append(ch if ch.isalnum() or ch.isspace() else " ")
    return " ".join("".join(pieces).split())
686
+
687
+
688
def token_overlap_ratio(text_a: str, text_b: str) -> float:
    """Compute the Jaccard overlap between the two texts' token sets.

    Returns 0.0 whenever either side has no tokens.
    """
    left = set(text_a.split())
    right = set(text_b.split())
    if not left or not right:
        return 0.0
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)
695
+
696
+
697
def dedupe_preserve_order(items: List[str]) -> List[str]:
    """Return unique string items while preserving first-seen order.

    Relies on dict preserving insertion order (guaranteed since Python 3.7).
    """
    return list(dict.fromkeys(items))
707
+
708
def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Update session-level behaviour score from transcript and semantic trends.

    Mutates ``session`` (transcript/flag/category counters, behaviour_score,
    session_behaviour_signals) and returns a snapshot dict with the new
    ``behaviour_score`` and ``session_behaviour_signals``.
    """
    # Prefer the unmasked transcript when available so repetition detection
    # sees the true content; fall back to the (possibly masked) transcript.
    transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
    transcript = normalize_transcript_for_behavior(transcript_source)
    semantic_flags = list(language_analysis.get("semantic_flags", []))
    keyword_categories = list(language_analysis.get("keyword_categories", []))

    # Accumulate per-session occurrence counts for semantic flags and keyword categories.
    for flag in semantic_flags:
        session.semantic_flag_counts[flag] = session.semantic_flag_counts.get(flag, 0) + 1
    for category in keyword_categories:
        session.keyword_category_counts[category] = session.keyword_category_counts.get(category, 0) + 1

    behavior_signals: List[str] = []

    if transcript:
        # Exact-repeat detection: the same normalized transcript seen twice or more.
        count = session.transcript_counts.get(transcript, 0) + 1
        session.transcript_counts[transcript] = count
        if count >= 2:
            behavior_signals.append("repetition_loop")
        # Near-repeat detection: high token overlap with the previous chunk,
        # only for transcripts of at least 4 tokens to avoid trivial matches.
        if session.last_transcript:
            overlap = token_overlap_ratio(transcript, session.last_transcript)
            if overlap >= 0.75 and len(transcript.split()) >= 4:
                behavior_signals.append("repetition_loop")
        session.last_transcript = transcript

    # Urgency language repeated across the session indicates sustained pressure.
    urgency_count = session.semantic_flag_counts.get("urgency_language", 0)
    if urgency_count >= 2:
        behavior_signals.append("sustained_urgency")

    has_impersonation = session.semantic_flag_counts.get("authority_impersonation", 0) > 0
    has_credentials = session.semantic_flag_counts.get("credential_request", 0) > 0
    has_payment = session.semantic_flag_counts.get("payment_redirection", 0) > 0
    has_threat = session.semantic_flag_counts.get("coercive_threat_language", 0) > 0
    has_urgency = urgency_count > 0

    # Combination signals: pairs of flags that together indicate a scam pattern.
    if has_impersonation and has_credentials:
        behavior_signals.append("impersonation_plus_credential_request")
    if has_payment and has_urgency:
        behavior_signals.append("persistent_payment_pressure")
    if has_threat and has_urgency:
        behavior_signals.append("repeated_threat_urgency")

    # Two or more keyword categories each repeated twice counts as a pattern.
    repeated_categories = sum(1 for count in session.keyword_category_counts.values() if count >= 2)
    if repeated_categories >= 2:
        behavior_signals.append("repeated_fraud_categories")

    behavior_signals = sorted(set(behavior_signals))

    # Fixed per-signal weights, with small bonuses scaling with repetition depth.
    score = 0
    if "repetition_loop" in behavior_signals:
        max_repetition = max(session.transcript_counts.values()) if session.transcript_counts else 2
        score += 25 + min(15, (max_repetition - 2) * 5)
    if "sustained_urgency" in behavior_signals:
        score += 15 + min(10, (urgency_count - 2) * 5)
    if "impersonation_plus_credential_request" in behavior_signals:
        score += 30
    if "persistent_payment_pressure" in behavior_signals:
        score += 20
    if "repeated_threat_urgency" in behavior_signals:
        score += 15
    if "repeated_fraud_categories" in behavior_signals:
        score += 10

    # Clamp to the 0-100 scale used by the rest of the risk engine.
    session.behaviour_score = max(0, min(100, score))
    session.session_behaviour_signals = behavior_signals

    return {
        "behaviour_score": session.behaviour_score,
        "session_behaviour_signals": session.session_behaviour_signals,
    }
778
+
779
+
780
def map_score_to_level(score: int) -> str:
    """Map a numeric 0-100 risk score to a risk level band.

    Bands: <35 LOW, 35-59 MEDIUM, 60-79 HIGH, >=80 CRITICAL.
    """
    bands = ((35, "LOW"), (60, "MEDIUM"), (80, "HIGH"))
    for upper_bound, level in bands:
        if score < upper_bound:
            return level
    return "CRITICAL"
789
+
790
+
791
def map_level_to_label(risk_level: str, model_uncertain: bool) -> str:
    """Map a risk level to a user-friendly label.

    Model uncertainty overrides everything with "UNCERTAIN"; otherwise
    LOW -> SAFE, MEDIUM -> SPAM, and any higher level -> FRAUD.
    """
    if model_uncertain:
        return "UNCERTAIN"
    friendly = {"LOW": "SAFE", "MEDIUM": "SPAM"}
    return friendly.get(risk_level, "FRAUD")
800
+
801
+
802
def recommendation_for_level(risk_level: str, model_uncertain: bool) -> str:
    """Return a user action recommendation based on severity.

    The uncertainty message takes precedence over all level-specific advice.
    """
    if model_uncertain:
        return "Model uncertainty detected. Avoid sharing OTP/PIN and verify caller via official channel."
    advice_by_level = {
        "CRITICAL": "High fraud risk. End the call and verify through an official support number.",
        "HIGH": "Fraud indicators detected. Do not share OTP, PIN, passwords, or UPI credentials.",
        "MEDIUM": "Suspicious call behavior detected. Verify caller identity before taking action.",
    }
    return advice_by_level.get(risk_level, "No high-risk fraud indicators detected in current chunk.")
813
+
814
+
815
+
816
def should_invoke_llm_semantic(
    provisional_scored: Dict[str, Any],
    transcript: str,
    transcript_confidence: float,
    next_chunk_index: int,
) -> bool:
    """Gate optional LLM semantic calls for ambiguous/uncertain chunks.

    Requires the feature enabled, a ready provider, a meaningful transcript
    with adequate ASR confidence, the configured chunk cadence, and either
    a mid-band (35-79) risk score or explicit model uncertainty.
    """
    text = transcript.strip()
    # Cheap eligibility guards, evaluated in the same order as before.
    if not settings.LLM_SEMANTIC_ENABLED:
        return False
    if not is_llm_semantic_provider_ready():
        return False
    if not text or len(text) < 8:
        return False
    if transcript_confidence < settings.LLM_SEMANTIC_MIN_ASR_CONFIDENCE:
        return False

    # Respect the configured cadence; chunk 1 is always eligible.
    interval = max(1, settings.LLM_SEMANTIC_CHUNK_INTERVAL)
    if next_chunk_index > 1 and next_chunk_index % interval != 0:
        return False

    score = int(provisional_scored.get("risk_score", 0))
    uncertain = bool(provisional_scored.get("model_uncertain", False))
    in_ambiguous_band = 35 <= score < 80
    return in_ambiguous_band or uncertain
843
+
844
+
845
def normalize_voice_classification(classification: str, model_uncertain: bool) -> str:
    """Normalize a realtime voice-authenticity classification.

    Uncertain chunks always report "UNCERTAIN"; otherwise any value other
    than "AI_GENERATED"/"HUMAN" (case-insensitive) collapses to "HUMAN".
    """
    if model_uncertain:
        return "UNCERTAIN"
    label = str(classification or "HUMAN").upper()
    return label if label in {"AI_GENERATED", "HUMAN"} else "HUMAN"
853
+
854
def build_explainability_payload(
    risk_level: str,
    call_label: str,
    model_uncertain: bool,
    cpi: float,
    audio_score: int,
    keyword_score: int,
    semantic_score: int,
    behaviour_score: int,
    has_language_signals: bool,
    behaviour_signals: List[str],
    keyword_hits: List[str],
    acoustic_anomaly: float,
) -> RealTimeExplainability:
    """Build explicit explainability signals and a concise summary.

    Mirrors the fusion performed by ``build_risk_update``: when language
    signals exist the four signals are weighted; otherwise the assessment
    is audio-only (audio weight 1.0).
    """
    # NOTE(review): these weights are fixed here, while build_risk_update
    # reads its weights from settings — confirm they are kept in sync.
    if has_language_signals:
        weights = {
            "audio": 0.45,
            "keywords": 0.20,
            "semantic": 0.15,
            "behaviour": 0.20,
        }
    else:
        weights = {
            "audio": 1.00,
            "keywords": 0.00,
            "semantic": 0.00,
            "behaviour": 0.00,
        }

    # One ExplainabilitySignal per fused input, with its weighted contribution.
    contributions = [
        ExplainabilitySignal(
            signal="audio",
            raw_score=audio_score,
            weight=weights["audio"],
            weighted_score=round(audio_score * weights["audio"], 2),
        ),
        ExplainabilitySignal(
            signal="keywords",
            raw_score=keyword_score,
            weight=weights["keywords"],
            weighted_score=round(keyword_score * weights["keywords"], 2),
        ),
        ExplainabilitySignal(
            signal="semantic_intent",
            raw_score=semantic_score,
            weight=weights["semantic"],
            weighted_score=round(semantic_score * weights["semantic"], 2),
        ),
        ExplainabilitySignal(
            signal="behaviour",
            raw_score=behaviour_score,
            weight=weights["behaviour"],
            weighted_score=round(behaviour_score * weights["behaviour"], 2),
        ),
    ]

    # Top indicators: acoustic anomaly (if elevated), behaviour signals,
    # then up to three keyword hits; deduped, capped at six.
    indicators: List[str] = []
    if acoustic_anomaly >= 60:
        indicators.append("acoustic_anomaly_detected")
    indicators.extend(behaviour_signals)
    indicators.extend(keyword_hits[:3])
    deduped_indicators = list(dict.fromkeys(indicators))[:6]

    # Assemble a human-readable sentence per contributing factor.
    summary_parts: List[str] = [
        f"{risk_level.title()} risk classified as {call_label}."
    ]
    summary_parts.append(f"CPI at {cpi:.1f}/100.")
    if acoustic_anomaly >= 60:
        summary_parts.append("Audio anomalies are materially elevated.")
    if keyword_score >= 45:
        summary_parts.append("Fraud-related keywords contribute to the score.")
    if semantic_score >= 45:
        summary_parts.append("Semantic coercion patterns were detected.")
    if behaviour_score >= 40:
        summary_parts.append("Session behavior trend increases risk.")
    if cpi >= 70:
        summary_parts.append("Pressure escalation velocity is high; early warning triggered.")
    if not has_language_signals:
        summary_parts.append("Assessment is currently audio-dominant.")

    # Only attach the caveat when the model flagged itself as uncertain.
    uncertainty_note = None
    if model_uncertain:
        uncertainty_note = (
            "Model confidence is limited for this chunk. Treat this result conservatively and verify through trusted channels."
        )

    return RealTimeExplainability(
        summary=" ".join(summary_parts),
        top_indicators=deduped_indicators,
        signal_contributions=contributions,
        uncertainty_note=uncertainty_note,
    )
947
+
948
+
949
def build_risk_update(
    result_features: Dict[str, float],
    classification: str,
    confidence: float,
    language_analysis: Dict[str, Any],
    previous_score: Optional[int],
    llm_semantic: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build risk score, evidence and alert from model outputs and session trend.

    Fuses audio, keyword, semantic, and behaviour sub-scores into a 0-100
    risk score plus CPI, level/label, evidence, alert, and explainability.
    """
    audio_features: Dict[str, float] = result_features
    authenticity = float(audio_features.get("authenticity_score", 50.0))
    acoustic_anomaly = float(audio_features.get("acoustic_anomaly_score", 0.0))
    ml_fallback = bool(audio_features.get("ml_fallback", 0.0))
    realtime_heuristic_mode = bool(audio_features.get("realtime_heuristic_mode", 0.0))
    normalized_classification = str(classification or "").upper()
    # Uncertain when a non-AI verdict is low-confidence and no language
    # signals corroborate anything either way.
    low_confidence_uncertain = bool(
        normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.65
        and int(language_analysis.get("keyword_score", 0)) == 0
        and int(language_analysis.get("semantic_score", 0)) == 0
        and int(language_analysis.get("behaviour_score", 0)) == 0
    )
    # Heuristic-only mode needs a higher confidence bar to be trusted.
    heuristic_uncertain = bool(
        realtime_heuristic_mode
        and normalized_classification != "AI_GENERATED"
        and float(confidence) < 0.90
    )
    model_uncertain = ml_fallback or low_confidence_uncertain or heuristic_uncertain
    keyword_score = int(language_analysis.get("keyword_score", 0))
    semantic_score = int(language_analysis.get("semantic_score", 0))
    behaviour_score = int(language_analysis.get("behaviour_score", 0))
    keyword_hits = dedupe_preserve_order(list(language_analysis.get("keyword_hits", [])))
    behavior_from_language = dedupe_preserve_order(list(language_analysis.get("behaviour_signals", [])))
    behavior_from_session = dedupe_preserve_order(list(language_analysis.get("session_behaviour_signals", [])))
    keyword_categories = dedupe_preserve_order(list(language_analysis.get("keyword_categories", [])))
    semantic_flags = dedupe_preserve_order(list(language_analysis.get("semantic_flags", [])))
    transcript = str(language_analysis.get("transcript", "")).strip()

    # Optionally blend the LLM semantic verdict into the rule-based scores.
    llm_semantic_used = False
    llm_semantic_confidence = 0.0
    llm_semantic_model: Optional[str] = None
    if llm_semantic and llm_semantic.get("available"):
        blend_weight = max(0.0, min(1.0, settings.LLM_SEMANTIC_BLEND_WEIGHT))
        llm_score = int(max(0, min(100, llm_semantic.get("semantic_score", semantic_score))))
        # Weighted average of rule-based and LLM semantic scores.
        semantic_score = int(round((semantic_score * (1.0 - blend_weight)) + (llm_score * blend_weight)))
        llm_semantic_confidence = float(max(0.0, min(1.0, llm_semantic.get("confidence", 0.0))))
        llm_semantic_model = str(llm_semantic.get("model") or settings.LLM_SEMANTIC_MODEL)
        llm_semantic_used = True

        # LLM keyword hints add to hits with a capped score bonus (max +18).
        keyword_hints = dedupe_preserve_order([str(x) for x in llm_semantic.get("keyword_hints", [])])
        if keyword_hints:
            keyword_hits = dedupe_preserve_order(keyword_hits + keyword_hints)
            keyword_score = min(100, keyword_score + min(18, len(keyword_hints) * 6))

        llm_flags = dedupe_preserve_order([str(x) for x in llm_semantic.get("semantic_flags", [])])
        if llm_flags:
            semantic_flags = dedupe_preserve_order(semantic_flags + llm_flags)

        llm_behaviour = dedupe_preserve_order([str(x) for x in llm_semantic.get("behaviour_signals", [])])
        if llm_behaviour:
            behavior_from_language = dedupe_preserve_order(behavior_from_language + llm_behaviour)

    # Audio signal risk.
    # NOTE(review): this branch compares the raw `classification`, while the
    # uncertainty logic above uses `normalized_classification` — a lowercase
    # "ai_generated" would take the else branch here; confirm intended.
    if classification == "AI_GENERATED":
        audio_score = max(
            int(round(confidence * 100)),
            int(max(0.0, min(100.0, acoustic_anomaly * 0.85))),
        )
    else:
        # Low authenticity and/or high anomaly each raise audio risk.
        authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
        anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * 0.90)))
        audio_score = max(authenticity_audio_score, anomaly_audio_score)

    has_language_signals = bool(transcript) or keyword_score > 0 or semantic_score > 0 or behaviour_score > 0
    if has_language_signals:
        # Fuse all four signals with configured weights, normalized to sum 1.
        raw_weights = {
            "audio": settings.RISK_WEIGHT_AUDIO,
            "keywords": settings.RISK_WEIGHT_KEYWORD,
            "semantic": settings.RISK_WEIGHT_SEMANTIC,
            "behaviour": settings.RISK_WEIGHT_BEHAVIOUR,
        }
        total_weight = sum(raw_weights.values())
        if total_weight <= 0:
            # Guard against a degenerate all-zero weight configuration.
            raw_weights = {"audio": 0.45, "keywords": 0.20, "semantic": 0.15, "behaviour": 0.20}
            total_weight = 1.0
        normalized = {k: v / total_weight for k, v in raw_weights.items()}

        base_score = int(
            round(
                (audio_score * normalized["audio"])
                + (keyword_score * normalized["keywords"])
                + (semantic_score * normalized["semantic"])
                + (behaviour_score * normalized["behaviour"])
            )
        )
    else:
        # No transcript or language evidence: the audio verdict stands alone.
        base_score = audio_score

    if ml_fallback:
        # Model fallback is itself suspicious; enforce a conservative floor.
        base_score = max(base_score, 55)

    risk_score = max(0, min(100, base_score))
    behaviour_signals: List[str] = list(behavior_from_language) + list(behavior_from_session)

    # Threshold-derived signals for evidence and alerting.
    if keyword_score >= 60:
        behaviour_signals.append("keyword_cluster_detected")
    if semantic_score >= 60:
        behaviour_signals.append("semantic_coercion_detected")
    if behaviour_score >= 40:
        behaviour_signals.append("behaviour_risk_elevated")
    if acoustic_anomaly >= 60:
        behaviour_signals.append("acoustic_anomaly_detected")

    # Trend signals relative to the previous chunk's score.
    if previous_score is not None:
        delta = risk_score - previous_score
        if delta >= 15:
            behaviour_signals.append("rapid_risk_escalation")
        if risk_score >= 70 and previous_score >= 70:
            behaviour_signals.append("sustained_high_risk")
    else:
        delta = 0

    if delta > 0:
        # Positive momentum nudges the score upward by a configured factor.
        risk_score = min(100, risk_score + int(delta * settings.RISK_DELTA_BOOST_FACTOR))

    # Conversational Pressure Index: escalation velocity plus behaviour/semantic pressure.
    if previous_score is None:
        cpi = min(100.0, max(0.0, (behaviour_score * 0.35) + (semantic_score * 0.20)))
    else:
        cpi = min(
            100.0,
            max(
                0.0,
                (max(0, delta) * 3.2)
                + (behaviour_score * 0.35)
                + (semantic_score * 0.15),
            ),
        )
    if cpi >= 70:
        behaviour_signals.append("cpi_spike_detected")

    behaviour_signals = dedupe_preserve_order(behaviour_signals)
    risk_level = map_score_to_level(risk_score)
    call_label = map_level_to_label(risk_level, model_uncertain)

    # Audio evidence as "key:value" strings for the RiskEvidence payload.
    audio_patterns = [
        f"classification:{classification.lower()}",
        f"model_confidence:{confidence:.2f}",
        f"authenticity_score:{authenticity:.1f}",
        f"acoustic_anomaly_score:{acoustic_anomaly:.1f}",
        f"audio_score:{audio_score}",
    ]
    if ml_fallback:
        audio_patterns.append("model_fallback:true")
    audio_patterns = dedupe_preserve_order(audio_patterns)

    # Signals strong enough to trigger an alert regardless of level band.
    strong_intent = {
        "authority_with_credential_request",
        "urgent_payment_pressure",
        "threat_plus_urgency",
        "impersonation_plus_credential_request",
        "persistent_payment_pressure",
        "repeated_threat_urgency",
    }
    alert_triggered = (
        risk_level in {"HIGH", "CRITICAL"}
        or "rapid_risk_escalation" in behaviour_signals
        or cpi >= 70
        or any(signal in behaviour_signals for signal in strong_intent)
    )
    alert_type = None
    severity = None
    reason_summary = None
    recommended_action = None

    if alert_triggered:
        # Alert type precedence: CRITICAL level > CPI spike > escalation > HIGH.
        if risk_level == "CRITICAL":
            alert_type = "FRAUD_RISK_CRITICAL"
        elif cpi >= 70:
            alert_type = "EARLY_PRESSURE_WARNING"
        elif "rapid_risk_escalation" in behaviour_signals:
            alert_type = "RISK_ESCALATION"
        else:
            alert_type = "FRAUD_RISK_HIGH"
        severity = risk_level.lower()
        reasons: List[str] = []
        if keyword_hits:
            reasons.append("fraud keywords detected")
        if semantic_score >= 45:
            reasons.append("coercive intent patterns detected")
        if behaviour_score >= 40:
            reasons.append("session behavior risk elevated")
        if "repetition_loop" in behaviour_signals:
            reasons.append("repetition loop detected")
        if "rapid_risk_escalation" in behaviour_signals:
            reasons.append("risk escalated rapidly across chunks")
        if cpi >= 70:
            reasons.append("conversational pressure index spiked")
        if not reasons:
            reasons.append("high-risk audio pattern detected")
        reason_summary = ". ".join(reasons).capitalize() + "."
        recommended_action = recommendation_for_level(risk_level, model_uncertain)

    explainability = build_explainability_payload(
        risk_level=risk_level,
        call_label=call_label,
        model_uncertain=model_uncertain,
        cpi=cpi,
        audio_score=audio_score,
        keyword_score=keyword_score,
        semantic_score=semantic_score,
        behaviour_score=behaviour_score,
        has_language_signals=has_language_signals,
        behaviour_signals=behaviour_signals,
        keyword_hits=keyword_hits,
        acoustic_anomaly=acoustic_anomaly,
    )

    return {
        "risk_score": risk_score,
        "cpi": round(cpi, 1),
        "risk_level": risk_level,
        "call_label": call_label,
        "model_uncertain": model_uncertain,
        "evidence": RiskEvidence(
            audio_patterns=audio_patterns,
            keywords=keyword_hits,
            behaviour=behaviour_signals
        ),
        "language_analysis": RealTimeLanguageAnalysis(
            transcript=transcript,
            transcript_confidence=float(language_analysis.get("transcript_confidence", 0.0)),
            asr_engine=str(language_analysis.get("asr_engine", "unavailable")),
            keyword_hits=keyword_hits,
            keyword_categories=keyword_categories,
            semantic_flags=semantic_flags,
            keyword_score=keyword_score,
            semantic_score=semantic_score,
            behaviour_score=behaviour_score,
            session_behaviour_signals=behavior_from_session,
            llm_semantic_used=llm_semantic_used,
            llm_semantic_confidence=llm_semantic_confidence,
            llm_semantic_model=llm_semantic_model,
        ),
        "alert": RealTimeAlert(
            triggered=alert_triggered,
            alert_type=alert_type,
            severity=severity,
            reason_summary=reason_summary,
            recommended_action=recommended_action
        ),
        "explainability": explainability,
    }
1200
+
1201
+
1202
+ async def process_audio_chunk(
1203
+ session_id: str,
1204
+ chunk_request: SessionChunkRequest,
1205
+ default_language: str,
1206
+ request_id: str
1207
+ ) -> RealTimeUpdateResponse:
1208
+ """Decode, analyze and score a real-time audio chunk."""
1209
+ chunk_language = chunk_request.language or default_language
1210
+ validate_supported_language(chunk_language)
1211
+ validate_supported_format(chunk_request.audioFormat)
1212
+
1213
+ audio_size_kb = len(chunk_request.audioBase64) * 3 / 4 / 1024
1214
+ logger.info(
1215
+ f"[{request_id}] Realtime chunk: session={session_id}, language={chunk_language}, "
1216
+ f"format={chunk_request.audioFormat}, size~{audio_size_kb:.1f}KB"
1217
+ )
1218
+
1219
+ decode_start = time.perf_counter()
1220
+ audio_bytes = await asyncio.to_thread(decode_base64_audio, chunk_request.audioBase64)
1221
+ decode_ms = (time.perf_counter() - decode_start) * 1000
1222
+
1223
+ load_start = time.perf_counter()
1224
+ audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, chunk_request.audioFormat)
1225
+ load_ms = (time.perf_counter() - load_start) * 1000
1226
+
1227
+ duration_sec = len(audio) / sr
1228
+ logger.info(
1229
+ f"[{request_id}] Realtime analyze {duration_sec:.2f}s (decode {decode_ms:.0f}ms, load {load_ms:.0f}ms)"
1230
+ )
1231
+
1232
+ analyze_start = time.perf_counter()
1233
+ try:
1234
+ analysis_result = await asyncio.to_thread(analyze_voice, audio, sr, chunk_language, True)
1235
+ except Exception as exc:
1236
+ logger.warning("[%s] Realtime model path failed: %s; using conservative fallback", request_id, exc)
1237
+ analysis_result = AnalysisResult(
1238
+ classification="HUMAN",
1239
+ confidence_score=0.5,
1240
+ explanation="Realtime model path unavailable; conservative fallback applied.",
1241
+ features={
1242
+ "ml_fallback": 1.0,
1243
+ "authenticity_score": 50.0,
1244
+ "pitch_naturalness": 50.0,
1245
+ "spectral_naturalness": 50.0,
1246
+ "temporal_naturalness": 50.0,
1247
+ "acoustic_anomaly_score": 50.0,
1248
+ },
1249
+ )
1250
+ analyze_ms = (time.perf_counter() - analyze_start) * 1000
1251
+ logger.info(
1252
+ f"[{request_id}] Realtime result: {analysis_result.classification} "
1253
+ f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
1254
+ )
1255
+
1256
+ asr_start = time.perf_counter()
1257
+ asr_timeout_seconds = max(0.1, float(settings.ASR_TIMEOUT_MS) / 1000.0)
1258
+ asr_result = await transcribe_audio_guarded(
1259
+ audio=audio,
1260
+ sr=sr,
1261
+ language=chunk_language,
1262
+ timeout_seconds=asr_timeout_seconds,
1263
+ request_id=request_id,
1264
+ )
1265
+ asr_ms = (time.perf_counter() - asr_start) * 1000
1266
+ raw_transcript = str(asr_result.get("transcript", ""))
1267
+ response_transcript = (
1268
+ mask_sensitive_entities(raw_transcript)
1269
+ if settings.MASK_TRANSCRIPT_OUTPUT
1270
+ else raw_transcript
1271
+ )
1272
+ language_result = analyze_transcript(raw_transcript, chunk_language)
1273
+ language_result["transcript_raw"] = raw_transcript
1274
+ language_result["transcript"] = response_transcript
1275
+ language_result["transcript_confidence"] = asr_result.get("confidence", 0.0)
1276
+ language_result["asr_engine"] = asr_result.get("engine", "unavailable")
1277
+ transcript_preview = sanitize_for_logging(raw_transcript, max_chars=90)
1278
+ logger.info(
1279
+ f"[{request_id}] Realtime ASR: engine={language_result['asr_engine']}, "
1280
+ f"confidence={language_result['transcript_confidence']:.2f}, "
1281
+ f"text_len={len(raw_transcript)}, preview='{transcript_preview}', asr={asr_ms:.0f}ms"
1282
+ )
1283
+
1284
+ # Read-only session snapshot for scoring and optional LLM gating.
1285
+ async with SESSION_LOCK:
1286
+ purge_expired_sessions()
1287
+ session = get_session_state(session_id)
1288
+ if session is None:
1289
+ raise HTTPException(
1290
+ status_code=404,
1291
+ detail={"status": "error", "message": "Session not found or expired"}
1292
+ )
1293
+ if session.status != "active":
1294
+ raise HTTPException(
1295
+ status_code=409,
1296
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1297
+ )
1298
+ previous_score_snapshot = session.risk_history[-1] if session.risk_history else None
1299
+ next_chunk_index = session.chunks_processed + 1
1300
+
1301
+ provisional_scored = build_risk_update(
1302
+ analysis_result.features or {},
1303
+ analysis_result.classification,
1304
+ analysis_result.confidence_score,
1305
+ language_result,
1306
+ previous_score_snapshot,
1307
+ )
1308
+
1309
+ llm_semantic: Optional[Dict[str, Any]] = None
1310
+ llm_invoked = should_invoke_llm_semantic(
1311
+ provisional_scored=provisional_scored,
1312
+ transcript=raw_transcript,
1313
+ transcript_confidence=float(language_result.get("transcript_confidence", 0.0)),
1314
+ next_chunk_index=next_chunk_index,
1315
+ )
1316
+ if llm_invoked:
1317
+ llm_semantic = await asyncio.to_thread(
1318
+ analyze_semantic_with_llm,
1319
+ raw_transcript,
1320
+ chunk_language,
1321
+ settings.LLM_SEMANTIC_TIMEOUT_MS,
1322
+ )
1323
+
1324
+ async with SESSION_LOCK:
1325
+ purge_expired_sessions()
1326
+ session = get_session_state(session_id)
1327
+ if session is None:
1328
+ raise HTTPException(
1329
+ status_code=404,
1330
+ detail={"status": "error", "message": "Session not found or expired"}
1331
+ )
1332
+ if session.status != "active":
1333
+ raise HTTPException(
1334
+ status_code=409,
1335
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1336
+ )
1337
+
1338
+ if llm_invoked:
1339
+ session.llm_checks_performed += 1
1340
+ if llm_semantic and llm_semantic.get("available"):
1341
+ session.llm_last_engine = str(llm_semantic.get("engine", "openai-chat-completions"))
1342
+ else:
1343
+ reason = str((llm_semantic or {}).get("reason", "unavailable"))
1344
+ session.llm_last_engine = f"skipped:{reason}"
1345
+
1346
+ behaviour_snapshot = update_session_behaviour_state(session, language_result)
1347
+ language_result.update(behaviour_snapshot)
1348
+ previous_score = session.risk_history[-1] if session.risk_history else None
1349
+ scored = build_risk_update(
1350
+ analysis_result.features or {},
1351
+ analysis_result.classification,
1352
+ analysis_result.confidence_score,
1353
+ language_result,
1354
+ previous_score,
1355
+ llm_semantic=llm_semantic,
1356
+ )
1357
+
1358
+ voice_classification = normalize_voice_classification(
1359
+ analysis_result.classification,
1360
+ scored["model_uncertain"],
1361
+ )
1362
+ voice_confidence = float(max(0.0, min(1.0, analysis_result.confidence_score)))
1363
+
1364
+ session.chunks_processed += 1
1365
+ session.last_update = utc_now_iso()
1366
+ session.risk_history.append(scored["risk_score"])
1367
+ if scored["risk_score"] >= session.max_risk_score:
1368
+ session.final_call_label = scored["call_label"]
1369
+ session.max_risk_score = max(session.max_risk_score, scored["risk_score"])
1370
+ session.max_cpi = max(session.max_cpi, float(scored["cpi"]))
1371
+
1372
+ if voice_classification == "AI_GENERATED":
1373
+ session.voice_ai_chunks += 1
1374
+ session.max_voice_ai_confidence = max(session.max_voice_ai_confidence, voice_confidence)
1375
+ elif voice_classification == "HUMAN":
1376
+ session.voice_human_chunks += 1
1377
+
1378
+ session.final_voice_classification = voice_classification
1379
+ session.final_voice_confidence = voice_confidence
1380
+
1381
+ if scored["alert"].triggered:
1382
+ alert_obj = scored["alert"]
1383
+ alert_entry = {
1384
+ "timestamp": session.last_update,
1385
+ "risk_score": scored["risk_score"],
1386
+ "risk_level": scored["risk_level"],
1387
+ "call_label": scored["call_label"],
1388
+ "alert_type": alert_obj.alert_type or "FRAUD_RISK_HIGH",
1389
+ "severity": alert_obj.severity or scored["risk_level"].lower(),
1390
+ "reason_summary": alert_obj.reason_summary or "Fraud indicators detected.",
1391
+ "recommended_action": alert_obj.recommended_action
1392
+ or recommendation_for_level(scored["risk_level"], scored["model_uncertain"]),
1393
+ }
1394
+
1395
+ last_alert = session.alert_history[-1] if session.alert_history else None
1396
+ duplicate_keys = ("alert_type", "severity", "reason_summary", "recommended_action", "call_label", "risk_level")
1397
+ is_duplicate = bool(
1398
+ last_alert
1399
+ and all(last_alert.get(key) == alert_entry.get(key) for key in duplicate_keys)
1400
+ )
1401
+
1402
+ if is_duplicate:
1403
+ last_alert["timestamp"] = session.last_update
1404
+ last_alert["risk_score"] = max(int(last_alert.get("risk_score", 0)), scored["risk_score"])
1405
+ else:
1406
+ session.alerts_triggered += 1
1407
+ session.alert_history.append(alert_entry)
1408
+ if len(session.alert_history) > 100:
1409
+ session.alert_history = session.alert_history[-100:]
1410
+
1411
+ save_session_state(session)
1412
+
1413
+ return RealTimeUpdateResponse(
1414
+ status="success",
1415
+ session_id=session_id,
1416
+ timestamp=session.last_update,
1417
+ risk_score=scored["risk_score"],
1418
+ cpi=scored["cpi"],
1419
+ risk_level=scored["risk_level"],
1420
+ call_label=scored["call_label"],
1421
+ model_uncertain=scored["model_uncertain"],
1422
+ voice_classification=voice_classification,
1423
+ voice_confidence=voice_confidence,
1424
+ evidence=scored["evidence"],
1425
+ language_analysis=scored["language_analysis"],
1426
+ alert=scored["alert"],
1427
+ explainability=scored["explainability"],
1428
+ chunks_processed=session.chunks_processed,
1429
+ risk_policy_version=settings.RISK_POLICY_VERSION,
1430
+ )
1431
+
1432
+
1433
def session_to_summary(session: SessionState) -> SessionSummaryResponse:
    """Convert session state to response model.

    Pure read-only mapping from the in-memory SessionState to the API
    response schema; float aggregates are rounded for presentation only.
    Caller is expected to hold SESSION_LOCK if concurrent mutation is
    possible (all visible call sites do).
    """
    return SessionSummaryResponse(
        status="success",
        session_id=session.session_id,
        language=session.language,
        session_status=session.status,
        started_at=session.started_at,
        last_update=session.last_update,
        chunks_processed=session.chunks_processed,
        alerts_triggered=session.alerts_triggered,
        max_risk_score=session.max_risk_score,
        max_cpi=round(session.max_cpi, 1),
        final_call_label=session.final_call_label,
        final_voice_classification=session.final_voice_classification,
        final_voice_confidence=round(session.final_voice_confidence, 2),
        max_voice_ai_confidence=round(session.max_voice_ai_confidence, 2),
        voice_ai_chunks=session.voice_ai_chunks,
        voice_human_chunks=session.voice_human_chunks,
        llm_checks_performed=session.llm_checks_performed,
        risk_policy_version=settings.RISK_POLICY_VERSION,
    )
1455
+
1456
+
1457
# Authentication
from fastapi.security import APIKeyHeader
from fastapi import Security

# auto_error=False so verify_api_key can return our own 401 JSON payload
# instead of FastAPI's default auto-generated error response.
api_key_header = APIKeyHeader(name="x-api-key", auto_error=False)  # Changed to False for better error messages
1462
+
1463
async def verify_api_key(x_api_key: str = Security(api_key_header)) -> str:
    """Dependency to verify API key. Raises 401 if invalid or missing.

    Returns the validated key so routes can depend on it directly.
    """
    if x_api_key is None:
        logger.warning("API request without x-api-key header")
        raise HTTPException(
            status_code=401,
            detail={"status": "error", "message": "Missing API key. Include 'x-api-key' header."}
        )
    if x_api_key != settings.API_KEY:
        # Log only an 8-char prefix to avoid leaking candidate secrets into logs.
        logger.warning(f"API request with invalid key: {x_api_key[:8]}...")
        raise HTTPException(
            status_code=401,
            detail={"status": "error", "message": "Invalid API key"}
        )
    return x_api_key
1478
+
1479
+
1480
def verify_websocket_api_key(websocket: WebSocket) -> bool:
    """Validate API key for websocket connections.

    Accepts the key either as an 'x-api-key' header or, when the header is
    absent/empty, as an 'api_key' query parameter.
    """
    header_key = websocket.headers.get("x-api-key")
    candidate = header_key if header_key else websocket.query_params.get("api_key")
    return candidate == settings.API_KEY
1484
+
1485
+
1486
+ # Routes
1487
+ @app.get("/", include_in_schema=False)
1488
+ async def root():
1489
+ """Redirect to API documentation."""
1490
+ return RedirectResponse(url="/docs")
1491
+
1492
+
1493
+ @app.get("/health")
1494
+ async def health_check():
1495
+ """Health check for monitoring - verifies ML model is loaded."""
1496
+ try:
1497
+ from model import _model
1498
+ model_loaded = _model is not None
1499
+ except Exception:
1500
+ model_loaded = False
1501
+
1502
+ return {
1503
+ "status": "healthy" if model_loaded else "degraded",
1504
+ "model_loaded": model_loaded,
1505
+ "session_store_backend": SESSION_STORE_BACKEND_ACTIVE,
1506
+ }
1507
+
1508
+
1509
+ @app.post("/v1/session/start", response_model=SessionStartResponse)
1510
+ @app.post("/api/voice-detection/v1/session/start", response_model=SessionStartResponse)
1511
+ async def start_realtime_session(
1512
+ session_request: SessionStartRequest,
1513
+ api_key: str = Depends(verify_api_key)
1514
+ ):
1515
+ """Create a new real-time fraud analysis session."""
1516
+ validate_supported_language(session_request.language)
1517
+
1518
+ session_id = str(uuid.uuid4())
1519
+ started_at = utc_now_iso()
1520
+
1521
+ async with SESSION_LOCK:
1522
+ purged = purge_expired_sessions()
1523
+ if purged:
1524
+ logger.info("Retention purge removed %s expired sessions", purged)
1525
+
1526
+ session_state = SessionState(
1527
+ session_id=session_id,
1528
+ language=session_request.language,
1529
+ started_at=started_at
1530
+ )
1531
+ save_session_state(session_state)
1532
+
1533
+ return SessionStartResponse(
1534
+ status="success",
1535
+ session_id=session_id,
1536
+ language=session_request.language,
1537
+ started_at=started_at,
1538
+ message="Session created. Send chunks using /v1/session/{session_id}/chunk or websocket stream."
1539
+ )
1540
+
1541
+
1542
+ @app.post("/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
1543
+ @app.post("/api/voice-detection/v1/session/{session_id}/chunk", response_model=RealTimeUpdateResponse)
1544
+ async def analyze_realtime_chunk(
1545
+ request: Request,
1546
+ session_id: str,
1547
+ chunk_request: SessionChunkRequest,
1548
+ api_key: str = Depends(verify_api_key)
1549
+ ):
1550
+ """Analyze one chunk for an active real-time session."""
1551
+ request_id = getattr(request.state, "request_id", f"sess-{session_id[:8]}")
1552
+
1553
+ async with SESSION_LOCK:
1554
+ purge_expired_sessions()
1555
+ session = get_session_state(session_id)
1556
+ if session is None:
1557
+ raise HTTPException(
1558
+ status_code=404,
1559
+ detail={"status": "error", "message": "Session not found or expired"}
1560
+ )
1561
+ if session.status != "active":
1562
+ raise HTTPException(
1563
+ status_code=409,
1564
+ detail={"status": "error", "message": "Session is not active. Start a new session to continue."}
1565
+ )
1566
+ session_language = session.language
1567
+
1568
+ try:
1569
+ return await process_audio_chunk(session_id, chunk_request, session_language, request_id)
1570
+ except ValueError as e:
1571
+ raise HTTPException(status_code=400, detail={"status": "error", "message": str(e)}) from e
1572
+
1573
+
1574
+ @app.websocket("/v1/session/{session_id}/stream")
1575
+ @app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
1576
+ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1577
+ """WebSocket endpoint for continuous chunk-based analysis."""
1578
+ if not verify_websocket_api_key(websocket):
1579
+ await websocket.close(code=1008, reason="Invalid API key")
1580
+ return
1581
+
1582
+ async with SESSION_LOCK:
1583
+ purge_expired_sessions()
1584
+ session = get_session_state(session_id)
1585
+ if session is None:
1586
+ await websocket.close(code=1008, reason="Session not found or expired")
1587
+ return
1588
+ if session.status != "active":
1589
+ await websocket.close(code=1008, reason="Session is not active")
1590
+ return
1591
+ session_language = session.language
1592
+
1593
+ await websocket.accept()
1594
+ request_id = f"ws-{session_id[:8]}"
1595
+
1596
+ try:
1597
+ while True:
1598
+ payload = await websocket.receive_json()
1599
+ try:
1600
+ chunk_request = SessionChunkRequest.model_validate(payload)
1601
+ except ValidationError as e:
1602
+ await websocket.send_json({
1603
+ "status": "error",
1604
+ "message": "Invalid chunk payload",
1605
+ "details": e.errors()
1606
+ })
1607
+ continue
1608
+
1609
+ try:
1610
+ update = await process_audio_chunk(session_id, chunk_request, session_language, request_id)
1611
+ await websocket.send_json(update.model_dump())
1612
+ except HTTPException as e:
1613
+ detail = e.detail if isinstance(e.detail, dict) else {"status": "error", "message": str(e.detail)}
1614
+ await websocket.send_json(detail)
1615
+ except ValueError as e:
1616
+ await websocket.send_json({"status": "error", "message": str(e)})
1617
+ except WebSocketDisconnect:
1618
+ logger.info(f"[{request_id}] WebSocket disconnected")
1619
+
1620
+
1621
+ @app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
1622
+ @app.get("/api/voice-detection/v1/session/{session_id}/summary", response_model=SessionSummaryResponse)
1623
+ async def get_session_summary(
1624
+ session_id: str,
1625
+ api_key: str = Depends(verify_api_key)
1626
+ ):
1627
+ """Return current summary for a real-time session."""
1628
+ async with SESSION_LOCK:
1629
+ purge_expired_sessions()
1630
+ session = get_session_state(session_id)
1631
+ if session is None:
1632
+ raise HTTPException(
1633
+ status_code=404,
1634
+ detail={"status": "error", "message": "Session not found or expired"}
1635
+ )
1636
+ return session_to_summary(session)
1637
+
1638
+
1639
+ @app.get("/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
1640
+ @app.get("/api/voice-detection/v1/session/{session_id}/alerts", response_model=AlertHistoryResponse)
1641
+ async def get_session_alerts(
1642
+ session_id: str,
1643
+ limit: int = 20,
1644
+ api_key: str = Depends(verify_api_key),
1645
+ ):
1646
+ """Return recent alert history for a real-time session."""
1647
+ if limit < 1 or limit > 100:
1648
+ raise HTTPException(
1649
+ status_code=400,
1650
+ detail={"status": "error", "message": "limit must be between 1 and 100"},
1651
+ )
1652
+
1653
+ async with SESSION_LOCK:
1654
+ purge_expired_sessions()
1655
+ session = get_session_state(session_id)
1656
+ if session is None:
1657
+ raise HTTPException(
1658
+ status_code=404,
1659
+ detail={"status": "error", "message": "Session not found or expired"},
1660
+ )
1661
+
1662
+ alerts = [AlertHistoryItem(**item) for item in session.alert_history[-limit:]]
1663
+ return AlertHistoryResponse(
1664
+ status="success",
1665
+ session_id=session_id,
1666
+ total_alerts=len(session.alert_history),
1667
+ alerts=alerts,
1668
+ )
1669
+
1670
+
1671
+ @app.get("/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
1672
+ @app.get("/api/voice-detection/v1/privacy/retention-policy", response_model=RetentionPolicyResponse)
1673
+ async def get_retention_policy(api_key: str = Depends(verify_api_key)):
1674
+ """Return explicit privacy defaults for raw audio and session-derived data."""
1675
+ return RetentionPolicyResponse(
1676
+ status="success",
1677
+ raw_audio_storage="not_persisted",
1678
+ active_session_retention_seconds=settings.SESSION_ACTIVE_RETENTION_SECONDS,
1679
+ ended_session_retention_seconds=settings.SESSION_ENDED_RETENTION_SECONDS,
1680
+ stored_derived_fields=STORED_DERIVED_FIELDS,
1681
+ )
1682
+
1683
+
1684
+ @app.post("/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
1685
+ @app.post("/api/voice-detection/v1/session/{session_id}/end", response_model=SessionSummaryResponse)
1686
+ async def end_realtime_session(
1687
+ session_id: str,
1688
+ api_key: str = Depends(verify_api_key)
1689
+ ):
1690
+ """Mark a session as ended and return final summary."""
1691
+ async with SESSION_LOCK:
1692
+ purge_expired_sessions()
1693
+ session = get_session_state(session_id)
1694
+ if session is None:
1695
+ raise HTTPException(
1696
+ status_code=404,
1697
+ detail={"status": "error", "message": "Session not found or expired"}
1698
+ )
1699
+ session.status = "ended"
1700
+ session.last_update = utc_now_iso()
1701
+ save_session_state(session)
1702
+ return session_to_summary(session)
1703
+
1704
+
1705
@app.post(
    "/api/voice-detection",
    response_model=VoiceDetectionResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Bad Request"},
        401: {"model": ErrorResponse, "description": "Unauthorized"},
        429: {"model": ErrorResponse, "description": "Rate Limit Exceeded"},
        500: {"model": ErrorResponse, "description": "Internal Server Error"}
    }
)
@limiter.limit("1000/minute")  # Rate limit: 1000 requests per minute per IP
async def detect_voice(
    request: Request,  # Required for rate limiter
    voice_request: VoiceDetectionRequest,
    api_key: str = Depends(verify_api_key)  # Use dependency injection
):
    """
    Classify one Base64-encoded audio clip as AI-generated or human.

    Pipeline (each stage off-loaded to a worker thread so the event loop
    stays responsive): decode Base64 -> load/resample audio at 22050 Hz ->
    run ML + forensic analysis. Returns classification result with
    confidence score and explanation. ValueError from any stage maps to
    HTTP 400; everything else to HTTP 500.
    """
    # Log request info for debugging
    request_id = getattr(request.state, 'request_id', 'unknown')
    # Base64 inflates payloads by ~4/3, so decoded size ≈ encoded_len * 3/4.
    audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024  # Approximate decoded size
    logger.info(f"[{request_id}] Voice detection request: language={voice_request.language}, format={voice_request.audioFormat}, size~{audio_size_kb:.1f}KB")

    validate_supported_language(voice_request.language)
    validate_supported_format(voice_request.audioFormat)

    try:
        # Step 1: Decode Base64 (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Decoding Base64...")
        decode_start = time.perf_counter()
        audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
        decode_time = (time.perf_counter() - decode_start) * 1000

        # Step 2: Load audio (async - runs in thread pool)
        logger.info(f"[{request_id}] -> Loading audio... (decode took {decode_time:.0f}ms)")
        load_start = time.perf_counter()
        audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 22050, voice_request.audioFormat)
        load_time = (time.perf_counter() - load_start) * 1000

        # Step 3: ML Analysis (async - runs in thread pool, CPU-bound)
        duration_sec = len(audio) / sr
        logger.info(f"[{request_id}] -> Analyzing {duration_sec:.1f}s audio... (load took {load_time:.0f}ms)")
        analyze_start = time.perf_counter()
        result = await asyncio.to_thread(analyze_voice, audio, sr, voice_request.language)
        analyze_time = (time.perf_counter() - analyze_start) * 1000

        logger.info(f"[{request_id}] -> Analysis complete: {result.classification} ({result.confidence_score:.0%}) in {analyze_time:.0f}ms")

        # Extract forensic metrics if the analyzer produced feature scores.
        metrics = None
        if result.features:
            metrics = ForensicMetrics(
                authenticity_score=result.features.get("authenticity_score", 0),
                pitch_naturalness=result.features.get("pitch_naturalness", 0),
                spectral_naturalness=result.features.get("spectral_naturalness", 0),
                temporal_naturalness=result.features.get("temporal_naturalness", 0)
            )

        # "ml_fallback" flags that the primary model path was not used, so
        # the caller should treat the classification with caution.
        model_uncertain = bool((result.features or {}).get("ml_fallback", 0.0))
        explanation = result.explanation
        recommended_action = None
        response_classification = result.classification
        if model_uncertain:
            # FIX: grammar in the user-facing message ("due fallback" -> "due to fallback").
            explanation = (
                "Model uncertainty detected due to fallback inference. "
                "Treat result as cautionary and verify through trusted channels. "
                f"{result.explanation}"
            )
            recommended_action = (
                "Do not share OTP, PIN, passwords, or payment credentials. "
                "Verify caller identity through official support channels."
            )
            if settings.LEGACY_FALLBACK_RETURNS_UNCERTAIN:
                response_classification = "UNCERTAIN"

        # Return response
        return VoiceDetectionResponse(
            status="success",
            language=voice_request.language,
            classification=response_classification,
            confidenceScore=result.confidence_score,
            explanation=explanation,
            forensic_metrics=metrics,
            modelUncertain=model_uncertain,
            recommendedAction=recommended_action,
        )

    except ValueError as e:
        logger.warning(f"[{request_id}] [VALIDATION_ERROR] {e}")
        raise HTTPException(
            status_code=400,
            detail={"status": "error", "message": str(e)}
        )
    except Exception as e:
        logger.error(f"[{request_id}] [PROCESSING_ERROR] {e}", exc_info=True)
        # Include the request id so users can correlate with server logs
        # without leaking internal details.
        raise HTTPException(
            status_code=500,
            detail={"status": "error", "message": f"Internal Server Error (request_id={request_id})"}
        )
1805
+
1806
+
1807
+ # Exception handlers
1808
+ from fastapi.exceptions import RequestValidationError
1809
+
1810
def to_json_safe(value: Any) -> Any:
    """Recursively coerce *value* into JSON-serializable primitives.

    - None/str/int/float/bool pass through unchanged.
    - Exceptions become their message text.
    - dict keys are stringified; values converted recursively.
    - list/tuple/set all become lists (JSON has no tuple/set).
    - Anything else falls back to str().
    """
    if value is None:
        return value
    if isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, BaseException):
        return str(value)
    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            converted[str(key)] = to_json_safe(item)
        return converted
    if isinstance(value, (list, tuple, set)):
        return [to_json_safe(element) for element in value]
    return str(value)
1821
+
1822
+
1823
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """
    Custom handler for 422 Validation Errors.
    Provides clearer error messages for common issues.
    """
    errors = to_json_safe(exc.errors())
    logger.warning("Validation error: %s", errors)

    # Human-readable "location: message" summaries for each failure.
    error_messages = [
        f"{' -> '.join(str(part) for part in err.get('loc', []))}: {err.get('msg', 'Invalid value')}"
        for err in errors
    ]

    # Attach a targeted hint for the two most common client mistakes.
    if any("audioBase64" in str(err.get("loc", [])) for err in errors):
        hint = " Hint: Ensure 'audioBase64' is a valid Base64-encoded string."
    elif any("language" in str(err.get("loc", [])) for err in errors):
        hint = f" Hint: 'language' must be one of: {', '.join(settings.SUPPORTED_LANGUAGES)}."
    else:
        hint = ""

    return JSONResponse(
        status_code=422,
        content={
            "status": "error",
            "message": f"Request validation failed: {'; '.join(error_messages)}.{hint}",
            "details": errors
        }
    )
1855
+
1856
+
1857
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Custom exception handler to ensure consistent error format.

    Dict details are assumed to already carry the {"status", "message"}
    shape and are passed through; anything else is wrapped in it.
    """
    detail = exc.detail
    if isinstance(detail, dict):
        payload = detail
    else:
        payload = {"status": "error", "message": str(detail)}
    return JSONResponse(status_code=exc.status_code, content=payload)
1869
+
1870
+
1871
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global handler to catch unhandled exceptions and prevent stack traces."""
    # Full traceback goes to the server log only; clients get a generic 500.
    logger.error(f"Unhandled error: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"status": "error", "message": "Internal Server Error"}
    )
1879
+
1880
+
1881
+ if __name__ == "__main__":
1882
+ import uvicorn
1883
+ uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
model.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Analysis Engine.
3
+ Combines Wav2Vec2 deepfake detection with signal forensics.
4
+ """
5
+ import logging
6
+ import os
7
+ import numpy as np
8
+ from typing import Dict, Tuple, List, Optional
9
+ from dataclasses import dataclass
10
+ import warnings
11
+
12
+ from config import settings
13
+
14
+ # Configure logging
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Suppress warnings
18
+ warnings.filterwarnings("ignore", category=FutureWarning)
19
+ warnings.filterwarnings("ignore", category=UserWarning)
20
+
21
+ # Global model cache
22
+ _model = None
23
+ _processor = None
24
+ _device = None
25
+
26
+
27
@dataclass
class AnalysisResult:
    """Result of voice analysis.

    Plain data carrier returned by the analysis pipeline; `features`
    additionally drives fallback/uncertainty handling downstream (e.g. the
    "ml_fallback" flag read by the API layer).
    """
    classification: str  # "AI_GENERATED" or "HUMAN"
    confidence_score: float  # 0.0 to 1.0
    explanation: str  # Human-readable forensic justification
    features: Dict[str, float]  # Individual feature scores for debugging
34
+
35
+
36
def get_device():
    """Return the torch device string ("cuda" or "cpu"), caching the choice.

    The decision (and its log line) happens only on the first call; later
    calls return the cached value.
    """
    global _device
    if _device is not None:
        return _device
    import torch
    _device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {_device}")
    return _device
47
+
48
+
49
def load_model():
    """
    Load the Wav2Vec2 deepfake detection model.
    Prioritizes HuggingFace Hub model, with local fallback.

    Populates the module-level _model/_processor cache on first call and
    returns (model, processor). Raises RuntimeError if neither the primary
    nor the backup model can be loaded.
    """
    global _model, _processor

    if _model is None:
        from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

        # Model priority:
        # 1. Local fine-tuned model (for development)
        # 2. HuggingFace Hub model (for production/deployment)
        # 3. Fallback to public model
        local_path = settings.VOICE_MODEL_LOCAL_PATH
        hf_model = settings.VOICE_MODEL_ID
        backup_model = settings.VOICE_MODEL_BACKUP_ID

        if os.path.exists(local_path):
            logger.info(f"Loading local fine-tuned model from: {local_path}")
            model_name = local_path
        else:
            logger.info(f"Loading model from HuggingFace Hub: {hf_model}")
            model_name = hf_model

        try:
            _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
            _model.to(get_device())
            _model.eval()  # inference mode: disables dropout etc.
            logger.info(f"Model loaded successfully: {model_name}")
        except Exception as e:
            logger.error(f"Failed to load model {model_name}: {e}")
            # Retry once with the configured backup, unless we already tried it.
            if model_name != backup_model:
                logger.warning("Trying backup model...")
                model_name = backup_model
                try:
                    _processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
                    _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
                    _model.to(get_device())
                    _model.eval()
                    logger.info(f"Backup model loaded: {model_name}")
                except Exception as e2:
                    raise RuntimeError(f"Could not load any model: {e2}")
            else:
                raise e

    return _model, _processor
98
+
99
+
100
+
101
def extract_signal_features(audio: np.ndarray, sr: int, fast_mode: bool = False) -> Dict[str, float]:
    """Extract signal-based features (pitch, entropy, silence).

    Args:
        audio: mono waveform samples (1-D float array — assumed normalized
            to roughly [-1, 1]; TODO confirm against the loader).
        sr: sample rate in Hz.
        fast_mode: when True, trades accuracy for speed on the realtime
            path (smaller FFT, centroid-based pitch proxy instead of pYIN,
            flatness-based HNR approximation instead of HPSS).

    Returns:
        Dict of scalar features. On any extraction error, a fixed dict of
        neutral defaults is returned instead of raising.
    """
    import librosa
    from scipy.stats import entropy

    features = {}

    try:
        # Use smaller FFT in fast mode for realtime throughput.
        n_fft = 512 if fast_mode else 2048
        hop_length = 256 if fast_mode else 512
        S = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))

        # Pitch analysis.
        if fast_mode:
            # Approximate pitch variability from centroid dynamics to avoid expensive pYIN on realtime path.
            spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
            centroid_mean = float(np.mean(spec_centroid) + 1e-8)
            features["pitch_stability"] = float(np.clip(np.var(spec_centroid) / (centroid_mean ** 2), 0.0, 1.5))
            features["jitter"] = float(np.clip(np.mean(np.abs(np.diff(spec_centroid))) / centroid_mean, 0.0, 0.2))
            # Energy-gated "voiced" proxy (RMS > 0.02) used later for voiced_ratio.
            voiced_flag = librosa.feature.rms(y=audio, frame_length=n_fft, hop_length=hop_length)[0] > 0.02
        else:
            # pYIN fundamental-frequency tracking over roughly C2..C7.
            f0, voiced_flag, _ = librosa.pyin(
                audio,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=sr
            )
            f0_voiced = f0[~np.isnan(f0)]
            # Require a minimum of voiced frames before trusting the statistics.
            if len(f0_voiced) > 10:
                pitch_mean = np.mean(f0_voiced)
                pitch_std = np.std(f0_voiced)
                features["pitch_stability"] = pitch_std / pitch_mean if pitch_mean > 0 else 0
                features["jitter"] = np.mean(np.abs(np.diff(f0_voiced))) / pitch_mean if pitch_mean > 0 else 0
            else:
                # Neutral mid-range defaults when pitch tracking is inconclusive.
                features["pitch_stability"] = 0.5
                features["jitter"] = 0.05

        # Spectral features
        spec_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0]
        features["spectral_centroid_var"] = float(np.var(spec_centroid))

        spec_flatness = librosa.feature.spectral_flatness(S=S)[0]
        features["spectral_flatness"] = float(np.mean(spec_flatness))

        # Entropy: mean Shannon entropy of the column-normalized spectrogram.
        S_norm = S / (np.sum(S, axis=0, keepdims=True) + 1e-10)
        frame_entropies = [entropy(frame + 1e-10) for frame in S_norm.T]
        features["spectral_entropy"] = float(np.mean(frame_entropies))

        # Silence detection: near-silence vs exact digital zeros (the
        # latter is treated downstream as a synthesis artifact).
        silence_threshold = 1e-5
        features["silence_ratio"] = float(np.sum(np.abs(audio) < silence_threshold) / len(audio))
        features["perfect_silence"] = float(np.sum(audio == 0) / len(audio))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(audio)[0]
        features["zcr_variance"] = float(np.var(zcr))

        # Additional acoustic heuristics for suspicious audio artifacts.
        spec_rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)[0]
        features["spectral_rolloff_var"] = float(np.var(spec_rolloff))
        # voiced_flag comes from either branch above (bool array); pYIN may
        # also yield NaNs in f0 but voiced_flag itself is boolean.
        features["voiced_ratio"] = float(np.mean(voiced_flag.astype(np.float32))) if voiced_flag is not None else 0.0

        rms = librosa.feature.rms(y=audio)[0]
        features["rms_var"] = float(np.var(rms))

        if fast_mode:
            # Cheap HNR approximation from flatness and entropy for realtime throughput.
            hnr_db = float(max(0.0, 30.0 - (features["spectral_flatness"] * 120.0)))
        else:
            # Harmonic/percussive separation; ratio of RMS energies in dB.
            harmonic, percussive = librosa.effects.hpss(audio)
            harmonic_rms = float(np.sqrt(np.mean(np.square(harmonic))) + 1e-8)
            percussive_rms = float(np.sqrt(np.mean(np.square(percussive))) + 1e-8)
            hnr_db = float(20.0 * np.log10(harmonic_rms / percussive_rms))
        features["harmonic_noise_ratio_db"] = hnr_db

    except Exception as e:
        # Best-effort by design: fall back to neutral values so the caller
        # always gets a complete feature dict.
        logger.warning(f"Feature extraction error: {e}")
        features = {
            "pitch_stability": 0.5,
            "jitter": 0.05,
            "spectral_centroid_var": 1000,
            "spectral_flatness": 0.1,
            "spectral_entropy": 5.0,
            "silence_ratio": 0.0,
            "perfect_silence": 0.0,
            "zcr_variance": 0.01,
            "spectral_rolloff_var": 50000.0,
            "voiced_ratio": 0.65,
            "rms_var": 0.005,
            "harmonic_noise_ratio_db": 14.0,
        }

    return features
196
+
197
+
198
def generate_explanation(
    classification: str,
    ml_confidence: float,
    features: Dict[str, float]
) -> str:
    """Generate a data-driven forensic explanation for the classification.

    Combines per-dimension naturalness scores (0-100, higher = human-like)
    into an overall authenticity score and delegates wording to the
    AI/human explanation helper matching `classification`.
    """
    # Acoustic anomaly scores on a 0-100 scale.
    pitch_score = _calculate_pitch_score(features)
    spectral_score = _calculate_spectral_score(features)
    temporal_score = _calculate_temporal_score(features)

    # Overall authenticity is the plain mean of the three dimensions.
    authenticity_score = (pitch_score + spectral_score + temporal_score) / 3

    # ML confidence selects the explanation tier (tone/assertiveness).
    if ml_confidence >= 0.95:
        confidence_tier = "high"
    elif ml_confidence >= 0.75:
        confidence_tier = "moderate"
    else:
        confidence_tier = "low"

    explainer = (
        _explain_ai_detection
        if classification == "AI_GENERATED"
        else _explain_human_detection
    )
    return explainer(
        confidence_tier, ml_confidence, authenticity_score,
        pitch_score, spectral_score, temporal_score, features
    )
231
+
232
+
233
def _calculate_pitch_score(features: Dict[str, float]) -> float:
    """Score pitch naturalness on 0-100; higher means more human-like.

    Reference ranges: human voices typically show pitch stability 0.1-0.3
    and jitter 0.02-0.08, while synthetic voices fall below both ranges.
    """
    stability = features.get("pitch_stability", 0.5)
    jitter = features.get("jitter", 0.05)

    def clamp_pct(value: float) -> float:
        # Keep the component inside the 0-100 band.
        return min(100, max(0, value))

    stability_component = clamp_pct((stability - 0.05) / 0.25 * 100)
    jitter_component = clamp_pct((jitter - 0.005) / 0.075 * 100)

    # Macro-stability is weighted more heavily than micro-jitter.
    return stability_component * 0.6 + jitter_component * 0.4
245
+
246
+
247
def _calculate_spectral_score(features: Dict[str, float]) -> float:
    """Score spectral naturalness on 0-100; higher means more human-like.

    Reference ranges: human voices typically show spectral entropy 4.5-7
    and flatness 0.02-0.12; synthetic audio tends toward lower entropy
    and higher flatness.
    """
    entropy = features.get("spectral_entropy", 5.0)
    flatness = features.get("spectral_flatness", 0.1)

    # Map entropy upward from 3.0 and flatness downward from 0.2 onto 0-100.
    entropy_component = min(100, max(0, (entropy - 3.0) / 4.0 * 100))
    flatness_component = min(100, max(0, (0.2 - flatness) / 0.18 * 100))

    # Equal weighting between the two spectral cues.
    return entropy_component * 0.5 + flatness_component * 0.5
259
+
260
+
261
def _calculate_temporal_score(features: Dict[str, float]) -> float:
    """Calculate temporal/rhythm naturalness score (0-100). Higher = more human-like.

    Rewards natural zero-crossing-rate variability and penalises runs of
    mathematically perfect silence (exact-zero samples), which this module
    treats as a strong indicator of digitally generated audio.

    Args:
        features: Signal feature dict; reads "zcr_variance" and
            "perfect_silence" with the same defaults used elsewhere.

    Returns:
        Score in [0, 100].
    """
    # NOTE: the previous version also fetched "silence_ratio" but never
    # used it; the dead local has been removed (behavior unchanged).
    zcr_var = features.get("zcr_variance", 0.01)
    perfect_silence = features.get("perfect_silence", 0.0)

    # Exact-zero samples essentially never occur in real microphone
    # captures; cap the penalty at 50 so noisy-but-real audio survives.
    digital_penalty = min(50, perfect_silence * 500)

    # Full marks once ZCR variance reaches 0.02.
    zcr_score = min(100, max(0, zcr_var / 0.02 * 100))

    return max(0, zcr_score - digital_penalty)
273
+
274
+
275
def _calculate_acoustic_anomaly_score(features: Dict[str, float]) -> float:
    """
    Estimate suspicious acoustic artifact intensity (0-100).
    Higher score indicates stronger synthetic/spoof-like signal artifacts.
    """
    perfect_silence = features.get("perfect_silence", 0.0)
    spectral_flatness = features.get("spectral_flatness", 0.1)
    rolloff_var = features.get("spectral_rolloff_var", 50000.0)
    voiced_ratio = features.get("voiced_ratio", 0.65)
    hnr_db = features.get("harmonic_noise_ratio_db", 14.0)

    def cap(value: float) -> float:
        # Every component saturates at 100.
        return min(100.0, value)

    # Exact-zero samples are a digital-processing fingerprint.
    digital = cap(perfect_silence * 10000.0)
    # Spectral flatness above ~0.13 suggests vocoder-like output.
    flatness = cap(max(0.0, (spectral_flatness - 0.13) * 500.0))
    # Log-scaled rolloff variance above ~10^3.8 is unusually erratic.
    rolloff = cap(max(0.0, (np.log10(rolloff_var + 1.0) - 3.8) * 45.0))

    # Voiced-frame ratio outside the 0.35-0.95 band scores on either side.
    if voiced_ratio < 0.35:
        voiced = cap((0.35 - voiced_ratio) * 180.0)
    elif voiced_ratio > 0.95:
        voiced = cap((voiced_ratio - 0.95) * 180.0)
    else:
        voiced = 0.0

    # Harmonic-to-noise ratio: both "too noisy" and "too clean" are suspect.
    if hnr_db < 6.0:
        hnr = cap((6.0 - hnr_db) * 8.0)
    elif hnr_db > 28.0:
        hnr = cap((hnr_db - 28.0) * 4.0)
    else:
        hnr = 0.0

    weighted = (
        (digital * 0.35)
        + (flatness * 0.20)
        + (rolloff * 0.20)
        + (voiced * 0.15)
        + (hnr * 0.10)
    )
    return float(max(0.0, min(100.0, weighted)))
312
+
313
+
314
def _explain_ai_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate the forensic-style explanation for AI-classified audio.

    Args:
        confidence_tier: "high" / "moderate" / "low" wording tier.
        ml_confidence: Classifier (or heuristic) confidence in [0, 1].
        authenticity_score: Mean of the three naturalness scores (0-100).
        pitch_score, spectral_score, temporal_score: Domain scores (0-100).
        features: Raw signal features supplying concrete numbers in the text.

    Returns:
        A short explanation string: intro + detail + authenticity summary.
    """
    # The weakest (lowest) domain is the most AI-like characteristic and
    # selects the detail sentence. min() replaces the previous full sort,
    # whose second tuple element (primary_score) was never used.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = min(scores, key=scores.get)

    if confidence_tier == "high":
        intro = f"Strong synthetic markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Synthetic patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Possible synthetic audio (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the lowest scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Pitch analysis shows unusually consistent patterns (stability: {stability:.3f}, micro-variation: {jitter:.4f}) - typical of synthesized speech."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        flatness = features.get("spectral_flatness", 0)
        detail = f"Spectral fingerprint indicates synthetic generation (complexity: {entropy:.2f}, flatness: {flatness:.3f}) - lacking natural harmonic richness."
    else:
        perfect_silence = features.get("perfect_silence", 0)
        if perfect_silence > 0.005:
            detail = f"Digital artifacts detected: {perfect_silence:.1%} exact-zero samples found, indicating synthetic audio processing."
        else:
            # Plain string (previous version used an f-string with no placeholders).
            detail = "Temporal patterns suggest algorithmic generation - rhythm lacks natural human irregularities."

    # Authenticity score gives the reader a single 0-100 summary metric.
    authenticity_label = "very low" if authenticity_score < 25 else "low" if authenticity_score < 50 else "borderline"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
364
+
365
+
366
def _explain_human_detection(
    confidence_tier: str,
    ml_confidence: float,
    authenticity_score: float,
    pitch_score: float,
    spectral_score: float,
    temporal_score: float,
    features: Dict[str, float]
) -> str:
    """Generate the forensic-style explanation for human-classified audio.

    Args:
        confidence_tier: "high" / "moderate" / "low" wording tier.
        ml_confidence: Classifier (or heuristic) confidence in [0, 1].
        authenticity_score: Mean of the three naturalness scores (0-100).
        pitch_score, spectral_score, temporal_score: Domain scores (0-100).
        features: Raw signal features supplying concrete numbers in the text.

    Returns:
        A short explanation string: intro + detail + authenticity summary.
    """
    # The strongest (highest) domain is the most human-like characteristic
    # and selects the detail sentence. max() replaces the previous reverse
    # sort, whose second tuple element (primary_score) was never used.
    scores = {
        "vocal pitch patterns": pitch_score,
        "spectral characteristics": spectral_score,
        "temporal dynamics": temporal_score
    }
    primary_indicator = max(scores, key=scores.get)

    if confidence_tier == "high":
        intro = f"Strong human voice markers detected (confidence: {ml_confidence:.0%}). "
    elif confidence_tier == "moderate":
        intro = f"Human speech patterns identified (confidence: {ml_confidence:.0%}). "
    else:
        intro = f"Likely human voice (confidence: {ml_confidence:.0%}). "

    # Specific findings based on the highest scoring area.
    if primary_indicator == "vocal pitch patterns":
        jitter = features.get("jitter", 0)
        stability = features.get("pitch_stability", 0)
        detail = f"Natural pitch dynamics confirmed (variability: {stability:.3f}, micro-fluctuations: {jitter:.4f}) - consistent with biological speech production."
    elif primary_indicator == "spectral characteristics":
        entropy = features.get("spectral_entropy", 0)
        detail = f"Rich harmonic structure detected (complexity score: {entropy:.2f}) - characteristic of natural vocal tract resonance."
    else:
        zcr_var = features.get("zcr_variance", 0)
        detail = f"Organic speech rhythm detected (variance: {zcr_var:.4f}) - natural breathing and articulation patterns present."

    # Authenticity score gives the reader a single 0-100 summary metric.
    authenticity_label = "excellent" if authenticity_score > 75 else "good" if authenticity_score > 50 else "moderate"

    return f"{intro}{detail} Authenticity score: {authenticity_score:.0f}/100 ({authenticity_label})."
411
+
412
+
413
def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
    """
    Classify audio using the Wav2Vec2 model.

    Args:
        audio: Mono waveform samples (any sample rate; resampled to 16 kHz
            below because Wav2Vec2 expects 16 kHz input).
        sr: Sample rate of `audio` in Hz.

    Returns:
        Tuple of (classification, confidence) where classification is
        "AI_GENERATED" or "HUMAN" and confidence is the softmax
        probability of the predicted class in [0, 1].

    Raises:
        Propagates any exception from model loading or inference; the
        caller (analyze_voice) catches these and falls back to signal
        analysis.
    """
    import torch
    import librosa

    model, processor = load_model()
    device = get_device()

    # Normalize audio peak to 1.0 to prevent clipping issues; skip when the
    # buffer is all zeros to avoid division by zero.
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
    target_sr = 16000
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Tokenize/featurize the waveform for the model.
    inputs = processor(
        audio,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True
    )

    # Move every input tensor to the inference device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

    # Predicted class index and its softmax probability.
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    # Map class to label using the model's id2label config.
    # IMPORTANT: HuggingFace stores id2label with STRING keys ("0", "1")
    # but predicted_class from torch.argmax().item() is an int.
    # We must normalise the keys to int so .get() actually matches.
    # (This was the root cause of the previously inverted classifications.)
    raw_id2label = getattr(model.config, 'id2label', None) or {}
    id2label = {int(k): v for k, v in raw_id2label.items()}
    label = id2label.get(predicted_class, 'UNKNOWN')

    # Diagnostic logging so label-mapping problems are visible in logs.
    logger.info(
        "Model id2label=%s predicted_class=%d resolved_label=%s",
        id2label, predicted_class, label,
    )

    # Normalize label: known synthetic label names map to AI_GENERATED;
    # everything else (including the 'UNKNOWN' fallback) is treated as HUMAN.
    if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
        classification = "AI_GENERATED"
    else:
        classification = "HUMAN"

    return classification, confidence
477
+
478
+
479
def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtime: bool = False) -> AnalysisResult:
    """
    Analyze a voice sample and classify as AI-generated or Human.

    Args:
        audio: Audio waveform as numpy array
        sr: Sample rate
        language: Language of the audio (for context)
        realtime: When True (and settings.REALTIME_LIGHTWEIGHT_AUDIO is
            enabled) skip transformer inference and classify from the
            acoustic heuristic scores instead.

    Returns:
        AnalysisResult with classification, confidence, and explanation

    Raises:
        ValueError: If audio is too short for reliable analysis
    """
    # Validate minimum audio duration (at least 0.5 seconds for reliable analysis)
    min_duration = 0.5  # seconds
    duration = len(audio) / sr
    if duration < min_duration:
        raise ValueError(f"Audio too short ({duration:.2f}s). Minimum {min_duration}s required for reliable analysis.")

    # Fast mode only activates when the caller asks for realtime AND the
    # deployment has opted into the lightweight path.
    fast_mode = bool(realtime and settings.REALTIME_LIGHTWEIGHT_AUDIO)

    # Get model prediction (legacy/deep path) or defer to lightweight realtime heuristic.
    ml_fallback = False
    classification = "HUMAN"
    ml_confidence = 0.5
    if not fast_mode:
        try:
            classification, ml_confidence = classify_with_model(audio, sr)
        except Exception as e:
            # Deep model failed (load error, OOM, bad input) — degrade to
            # a neutral 0.5-confidence HUMAN default and record the fallback.
            logger.error(f"ML model error: {e}, falling back to signal analysis")
            ml_fallback = True
            classification = "HUMAN"
            ml_confidence = 0.5

    # Extract signal features for explainability.
    features = extract_signal_features(audio, sr, fast_mode=fast_mode)

    # Calculate scores explicitly for return.
    pitch_score = _calculate_pitch_score(features)
    spectral_score = _calculate_spectral_score(features)
    temporal_score = _calculate_temporal_score(features)
    authenticity_score = (pitch_score + spectral_score + temporal_score) / 3
    acoustic_anomaly_score = _calculate_acoustic_anomaly_score(features)

    # Lightweight realtime path avoids transformer inference for throughput.
    if fast_mode:
        # AI probability is the worse of two signals: raw anomaly score,
        # or how far authenticity falls below 52/100.
        # NOTE(review): the 0.56 decision threshold and the 52.0 pivot look
        # empirically tuned — confirm against evaluation data before changing.
        ai_probability = max(
            acoustic_anomaly_score / 100.0,
            max(0.0, min(1.0, (52.0 - authenticity_score) / 52.0)),
        )
        classification = "AI_GENERATED" if ai_probability >= 0.56 else "HUMAN"
        ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
        # Clamp heuristic confidence into [0.5, 0.99].
        ml_confidence = float(max(0.5, min(0.99, ml_confidence)))

    # Surface bookkeeping flags to API consumers (floats for JSON uniformity).
    features["ml_confidence"] = ml_confidence
    features["ml_fallback"] = float(ml_fallback)
    features["realtime_heuristic_mode"] = float(fast_mode)

    # Add computed high-level scores to features for API response.
    features["authenticity_score"] = round(authenticity_score, 1)
    features["pitch_naturalness"] = round(pitch_score, 1)
    features["spectral_naturalness"] = round(spectral_score, 1)
    features["temporal_naturalness"] = round(temporal_score, 1)
    features["acoustic_anomaly_score"] = round(acoustic_anomaly_score, 1)

    # Generate the human-readable forensic explanation.
    explanation = generate_explanation(classification, ml_confidence, features)

    return AnalysisResult(
        classification=classification,
        confidence_score=round(ml_confidence, 2),
        explanation=explanation,
        features=features
    )
555
+
556
+
557
# Pre-load model at module import (optional, for faster first request)
def preload_model():
    """Warm the classifier cache so the first real request avoids load latency."""
    try:
        load_model()
    except Exception as exc:
        logger.error(f"Model preload failed: {exc}")
privacy_utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Privacy helpers for masking sensitive entities in transcripts and logs.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import re
7
+
8
+
9
# Indian mobile numbers: optional +91 prefix, then 10 digits starting 6-9,
# guarded so the run is not embedded inside a longer digit sequence.
PHONE_PATTERN = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
# UPI/VPA handles like name@bank (also matches simple email-like tokens).
UPI_PATTERN = re.compile(r"\b[a-zA-Z0-9._-]{2,}@[a-zA-Z]{2,}\b")
# Runs of 9-19 digits with optional space/hyphen separators (accounts, cards).
ACCOUNT_OR_CARD_PATTERN = re.compile(r"(?<!\d)(?:\d[ -]?){9,19}(?!\d)")
# "OTP: 123456" / "PIN - 1234" style disclosures; keyword kept, digits masked.
OTP_CONTEXT_PATTERN = re.compile(r"\b(otp|pin)\s*[:\-]?\s*(\d{4,8})\b", re.IGNORECASE)
13
+
14
+
15
def _mask_numeric_token(token: str, preserve_tail: int = 2) -> str:
    """Redact a numeric token, keeping only the trailing `preserve_tail` digits."""
    digits_only = re.sub(r"\D", "", token)
    if len(digits_only) <= preserve_tail:
        # Too few digits to keep a tail without leaking most of the value.
        return "[REDACTED_NUM]"
    tail = digits_only[-preserve_tail:]
    return f"[REDACTED_NUM_XX{tail}]"
20
+
21
+
22
def _mask_account_or_card(match: re.Match[str]) -> str:
    """Mask long digit runs (bank accounts / cards); leave short runs untouched."""
    token = match.group(0)
    # Defensive guard: only redact when there are at least 9 real digits.
    if len(re.sub(r"\D", "", token)) < 9:
        return token
    return _mask_numeric_token(token)
28
+
29
+
30
def _mask_otp(match: re.Match[str]) -> str:
    """Preserve the OTP/PIN keyword while redacting the digits that follow it."""
    keyword = match.group(1)
    return f"{keyword} [REDACTED_OTP]"
32
+
33
+
34
def mask_sensitive_entities(text: str) -> str:
    """Redact common scam-sensitive entities from plain text."""
    if not text:
        return ""

    # Order matters: context-aware OTP redaction runs first, then UPI
    # handles, then phone numbers, and finally generic long digit runs.
    redacted = OTP_CONTEXT_PATTERN.sub(_mask_otp, text)
    redacted = UPI_PATTERN.sub("[REDACTED_UPI]", redacted)
    redacted = PHONE_PATTERN.sub("[REDACTED_PHONE]", redacted)
    return ACCOUNT_OR_CARD_PATTERN.sub(_mask_account_or_card, redacted)
+ return masked
44
+
45
+
46
def sanitize_for_logging(text: str, max_chars: int = 120) -> str:
    """
    Mask sensitive entities and collapse whitespace for safe structured logging.
    """
    compacted = " ".join(mask_sensitive_entities(text).split())
    if len(compacted) > max_chars:
        # Reserve three characters for the ellipsis.
        return compacted[: max_chars - 3] + "..."
    return compacted
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.100.0
2
+ uvicorn[standard]>=0.22.0
3
+ python-multipart
4
+ librosa>=0.10.0
5
+ soundfile>=0.12.0
6
+ numpy>=1.24.0
7
+ scipy>=1.10.0
8
+ python-dotenv
9
+ pydantic>=2.0.0
10
+ transformers>=4.30.0
11
+ datasets>=2.14.0
12
+ scikit-learn>=1.3.0
13
+ accelerate>=0.20.0
14
+ slowapi>=0.1.9
15
+ pydantic-settings>=2.0.0
16
+ httpx>=0.27.0
17
+ # PyTorch - install manually for your platform if not using Docker:
18
+ # pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
19
+ torch>=2.0.0
20
+ torchaudio>=2.0.0
21
+ faster-whisper>=1.0.3
22
+
23
+ redis>=5.0.0
speech_to_text.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-text helper with optional faster-whisper backend.
3
+
4
+ The module degrades safely when ASR dependencies are unavailable.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from typing import Any, Dict, Iterable, Optional
10
+
11
+ import numpy as np
12
+
13
+ from config import settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ _asr_model = None
18
+ _asr_load_attempted = False
19
+
20
# Maps the API's human-readable language names to whisper language codes.
# Languages not listed resolve to None via .get() in transcribe_audio(),
# which makes whisper auto-detect the language.
LANGUAGE_TO_WHISPER = {
    "English": "en",
    "Tamil": "ta",
    "Hindi": "hi",
    "Malayalam": "ml",
    "Telugu": "te",
}
27
+
28
+
29
def _load_asr_model():
    """Load faster-whisper model lazily.

    Returns the cached model instance on success, or None when ASR
    dependencies are unavailable. A failed attempt is sticky: the
    _asr_load_attempted flag ensures we do not retry the expensive
    import/download on every request.
    """
    global _asr_model, _asr_load_attempted
    if _asr_model is not None:
        return _asr_model
    if _asr_load_attempted:
        # A previous attempt already failed; skip straight to the
        # degraded "ASR unavailable" path.
        return None

    _asr_load_attempted = True
    try:
        from faster_whisper import WhisperModel

        # CPU-only inference; model size and compute type come from settings.
        _asr_model = WhisperModel(
            model_size_or_path=settings.ASR_MODEL_SIZE,
            device="cpu",
            compute_type=settings.ASR_COMPUTE_TYPE,
        )
        logger.info(
            "ASR model loaded successfully: size=%s compute_type=%s",
            settings.ASR_MODEL_SIZE,
            settings.ASR_COMPUTE_TYPE,
        )
        return _asr_model
    except Exception as exc:  # pragma: no cover - environment dependent
        logger.warning("ASR model unavailable: %s", exc)
        return None
55
+
56
+
57
def _decode_segments(segments: Iterable[Any]) -> Dict[str, Any]:
    """Collapse whisper segments into a transcript plus a confidence proxy."""
    texts = []
    probs = []

    for segment in segments:
        stripped = (segment.text or "").strip()
        if stripped:
            texts.append(stripped)
        logprob = getattr(segment, "avg_logprob", None)
        if logprob is not None:
            # exp(avg_logprob) approximates a per-segment probability; the
            # exponent is clamped at 0 so the value never exceeds 1.
            probs.append(float(np.exp(min(0.0, logprob))))

    transcript = " ".join(texts).strip()
    if probs:
        confidence = float(np.mean(probs))
    else:
        # No per-segment stats: 0.5 when we still got text, else 0.0.
        confidence = 0.5 if transcript else 0.0
    confidence = min(1.0, max(0.0, confidence))

    return {
        "transcript": transcript,
        "confidence": confidence,
    }
78
+
79
+
80
def _run_transcribe(model: Any, audio: np.ndarray, language_code: Optional[str]) -> Dict[str, Any]:
    """Execute one whisper pass; language_code=None lets whisper auto-detect."""
    decode_options = dict(
        language=language_code,
        beam_size=settings.ASR_BEAM_SIZE,
        vad_filter=True,
        condition_on_previous_text=False,
        word_timestamps=False,
    )
    segments, _info = model.transcribe(audio, **decode_options)
    return _decode_segments(segments)
91
+
92
+
93
def transcribe_audio(audio: np.ndarray, sr: int, language: str) -> Dict[str, Any]:
    """
    Transcribe audio to text.

    Args:
        audio: Mono waveform as a numpy array (resampled to 16 kHz below).
        sr: Sample rate of `audio` in Hz.
        language: Human-readable language name looked up in
            LANGUAGE_TO_WHISPER; unknown names fall back to auto-detect.

    Returns:
        {
            "transcript": str,
            "confidence": float [0..1],
            "engine": str,
            "available": bool
        }
        Never raises: every failure mode degrades to an empty transcript
        with available=False and a descriptive "engine" value.
    """
    # Feature flag: deployments can disable ASR entirely.
    if not settings.ASR_ENABLED:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "disabled",
            "available": False,
        }

    # Lazy-loaded model; None means dependencies/model are unavailable.
    model = _load_asr_model()
    if model is None:
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "unavailable",
            "available": False,
        }

    try:
        # Whisper expects 16 kHz float32 input.
        if sr != 16000:
            import librosa

            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        audio = np.asarray(audio, dtype=np.float32)
        language_code = LANGUAGE_TO_WHISPER.get(language)

        # First pass with the caller-provided language hint (or auto when
        # the language is not in the mapping).
        hinted = _run_transcribe(model, audio, language_code)

        # Recovery path: if language hint produced no text, retry with auto-detect.
        # This improves robustness for mixed-language/accented input.
        if not hinted["transcript"]:
            autodetect = _run_transcribe(model, audio, None)
            if autodetect["transcript"]:
                return {
                    "transcript": autodetect["transcript"],
                    "confidence": autodetect["confidence"],
                    "engine": "faster-whisper:auto",
                    "available": True,
                }

        return {
            "transcript": hinted["transcript"],
            "confidence": hinted["confidence"],
            "engine": "faster-whisper",
            "available": True,
        }
    except Exception as exc:  # pragma: no cover - runtime/audio dependent
        # Deliberate best-effort: transcription errors must not break the
        # surrounding analysis pipeline.
        logger.warning("ASR transcription failed: %s", exc)
        return {
            "transcript": "",
            "confidence": 0.0,
            "engine": "error",
            "available": False,
        }