shivam0897-i committed on
Commit
4bfc577
·
1 Parent(s): 8a6ab53

chore: clean codebase for production

Browse files

- Remove fix-tag comments and verbose inline comments
- Convert f-string loggers to lazy %s formatting
- Clean up trailing whitespace and blank lines
- Update .gitignore to exclude non-production files
- Remove tracked test artifacts (evaluation_results, test_my_api)

Files changed (7) hide show
  1. .gitignore +18 -12
  2. audio_utils.py +3 -9
  3. config.py +1 -1
  4. evaluation_results.json +0 -50
  5. main.py +53 -154
  6. model.py +28 -70
  7. test_my_api.py +0 -171
.gitignore CHANGED
@@ -39,18 +39,10 @@ Thumbs.db
39
  fine_tuned_model/
40
  training/
41
 
42
- # === Non-production files (keep out of HF Space) ===
43
-
44
- # Tests
45
- tests/
46
- pytest.ini
47
-
48
- # Docs and reports
49
- docs/
50
-
51
- # Dev/validation scripts
52
- scripts/
53
- scenario_validation_cases.py
54
 
55
  # Test request fixtures
56
  test_request.json
@@ -59,3 +51,17 @@ test_valid.json
59
  # Helper/patch scripts
60
  _fix_*.py
61
  _test_*.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  fine_tuned_model/
40
  training/
41
 
42
+ # Test artifacts (generated output)
43
+ evaluation_results.json
44
+ test_my_api.py
45
+ run_final_tests.py
 
 
 
 
 
 
 
 
46
 
47
  # Test request fixtures
48
  test_request.json
 
51
  # Helper/patch scripts
52
  _fix_*.py
53
  _test_*.py
54
+
55
+ # Download folders
56
+ drive-download-*/
57
+
58
+ # Local docs / tests / scripts (not deployed)
59
+ docs/
60
+ tests/
61
+ scripts/
62
+ pytest.ini
63
+ scenario_validation_cases.py
64
+ realtime-analysis-*.json
65
+
66
+ # Python project metadata (not needed for deployment)
67
+ pyproject.toml
audio_utils.py CHANGED
@@ -11,7 +11,6 @@ import numpy as np
11
  import librosa
12
  import soundfile as sf
13
 
14
- # Configure logging
15
  logger = logging.getLogger(__name__)
16
 
17
  # Magic bytes for common audio formats
@@ -106,38 +105,33 @@ def load_audio_from_bytes(audio_bytes: bytes, target_sr: int = 22050, audio_form
106
  Raises:
107
  ValueError: If audio cannot be loaded or is invalid
108
  """
109
- # Validate audio content BEFORE attempting to decode
110
  is_valid, validation_result = validate_audio_content(audio_bytes)
111
  if not is_valid:
112
  raise ValueError(f"Invalid audio file: {validation_result}")
113
 
114
- logger.info(f"Audio validation passed. Detected format hint: {validation_result}")
115
 
116
  tmp_path = None
117
  try:
118
- # Normalize format
119
  audio_format = audio_format.lower().strip()
120
  if audio_format.startswith("."):
121
  audio_format = audio_format[1:]
122
 
123
- # Validate format (security)
124
  if not audio_format.isalnum() or len(audio_format) > 5:
125
  raise ValueError(f"Invalid audio format: {audio_format}")
126
 
127
- # Write to temp file for librosa
128
  with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False) as tmp_file:
129
  tmp_file.write(audio_bytes)
130
  tmp_path = tmp_file.name
131
 
132
- # Load audio with librosa
133
  audio, sr = librosa.load(tmp_path, sr=target_sr, mono=True)
134
 
135
- # Validate loaded audio
136
  if len(audio) == 0:
137
  raise ValueError("Audio file is empty or could not be decoded")
138
 
139
  duration = len(audio) / sr
140
- logger.info(f"Audio loaded successfully: {duration:.2f}s at {sr}Hz")
141
 
142
  return audio, sr
143
 
 
11
  import librosa
12
  import soundfile as sf
13
 
 
14
  logger = logging.getLogger(__name__)
15
 
16
  # Magic bytes for common audio formats
 
105
  Raises:
106
  ValueError: If audio cannot be loaded or is invalid
107
  """
 
108
  is_valid, validation_result = validate_audio_content(audio_bytes)
109
  if not is_valid:
110
  raise ValueError(f"Invalid audio file: {validation_result}")
111
 
112
+ logger.info("Audio validation passed. Detected format hint: %s", validation_result)
113
 
114
  tmp_path = None
115
  try:
 
116
  audio_format = audio_format.lower().strip()
117
  if audio_format.startswith("."):
118
  audio_format = audio_format[1:]
119
 
120
+ # Reject suspicious format strings
121
  if not audio_format.isalnum() or len(audio_format) > 5:
122
  raise ValueError(f"Invalid audio format: {audio_format}")
123
 
 
124
  with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False) as tmp_file:
125
  tmp_file.write(audio_bytes)
126
  tmp_path = tmp_file.name
127
 
 
128
  audio, sr = librosa.load(tmp_path, sr=target_sr, mono=True)
129
 
 
130
  if len(audio) == 0:
131
  raise ValueError("Audio file is empty or could not be decoded")
132
 
133
  duration = len(audio) / sr
134
+ logger.info("Audio loaded: %.2fs at %dHz", duration, sr)
135
 
136
  return audio, sr
137
 
config.py CHANGED
@@ -178,7 +178,7 @@ class Settings(BaseSettings):
178
  description="Mask sensitive entities from transcript before returning response"
179
  )
180
 
181
- # WebSocket limits (M8 fix)
182
  WS_MAX_DURATION_SECONDS: int = Field(
183
  default=1800,
184
  description="Maximum WebSocket connection duration in seconds (30 min)"
 
178
  description="Mask sensitive entities from transcript before returning response"
179
  )
180
 
181
+ # WebSocket limits
182
  WS_MAX_DURATION_SECONDS: int = Field(
183
  default=1800,
184
  description="Maximum WebSocket connection duration in seconds (30 min)"
evaluation_results.json DELETED
@@ -1,50 +0,0 @@
1
- {
2
- "finalScore": 100,
3
- "totalFiles": 5,
4
- "scorePerFile": 20.0,
5
- "successfulClassifications": 5,
6
- "wrongClassifications": 0,
7
- "failedTests": 0,
8
- "fileResults": [
9
- {
10
- "fileIndex": 0,
11
- "status": "success",
12
- "matched": true,
13
- "score": 20.0,
14
- "actualClassification": "AI_GENERATED",
15
- "confidenceScore": 0.99
16
- },
17
- {
18
- "fileIndex": 1,
19
- "status": "success",
20
- "matched": true,
21
- "score": 20.0,
22
- "actualClassification": "HUMAN",
23
- "confidenceScore": 0.99
24
- },
25
- {
26
- "fileIndex": 2,
27
- "status": "success",
28
- "matched": true,
29
- "score": 20.0,
30
- "actualClassification": "AI_GENERATED",
31
- "confidenceScore": 0.99
32
- },
33
- {
34
- "fileIndex": 3,
35
- "status": "success",
36
- "matched": true,
37
- "score": 20.0,
38
- "actualClassification": "HUMAN",
39
- "confidenceScore": 0.99
40
- },
41
- {
42
- "fileIndex": 4,
43
- "status": "success",
44
- "matched": true,
45
- "score": 20.0,
46
- "actualClassification": "AI_GENERATED",
47
- "confidenceScore": 0.99
48
- }
49
- ]
50
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -27,14 +27,12 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
27
  from slowapi.util import get_remote_address
28
  from slowapi.errors import RateLimitExceeded
29
 
30
- # Configure logging
31
  logging.basicConfig(
32
  level=logging.INFO,
33
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
34
  )
35
  logger = logging.getLogger(__name__)
36
 
37
- # Rate limiting
38
  limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
39
 
40
  from audio_utils import decode_base64_audio, load_audio_from_bytes
@@ -50,7 +48,6 @@ try:
50
  except Exception: # pragma: no cover - optional dependency
51
  redis = None
52
 
53
- # Computed constraints
54
  MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
55
 
56
 
@@ -88,14 +85,13 @@ class SessionState:
88
 
89
  SESSION_STORE: Dict[str, SessionState] = {}
90
  SESSION_LOCK = asyncio.Lock()
91
- SESSION_LOCKS: Dict[str, asyncio.Lock] = {} # Per-session locks (M1)
92
  SESSION_STORE_BACKEND_ACTIVE = "memory"
93
  REDIS_CLIENT: Any = None
94
  ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
95
  ASR_INFLIGHT_LOCK = asyncio.Lock()
96
 
97
 
98
-
99
  def use_redis_session_store() -> bool:
100
  """Return whether redis-backed session store is active."""
101
  return SESSION_STORE_BACKEND_ACTIVE == "redis" and REDIS_CLIENT is not None
@@ -296,24 +292,23 @@ def run_startup_warmups() -> None:
296
 
297
  # Detect environment
298
  if settings.SPACE_ID:
299
- logger.info(f"Running on HuggingFace Spaces: {settings.SPACE_ID}")
300
 
301
 
302
  def get_session_lock(session_id: str) -> asyncio.Lock:
303
- """Return a per-session lock, creating one if needed (M1 fix)."""
304
  if session_id not in SESSION_LOCKS:
305
  SESSION_LOCKS[session_id] = asyncio.Lock()
306
  return SESSION_LOCKS[session_id]
307
 
308
 
309
  async def _periodic_session_purge(interval: int = 60) -> None:
310
- """Background task: purge expired sessions every *interval* seconds (M2 fix)."""
311
  while True:
312
  try:
313
  await asyncio.sleep(interval)
314
  async with SESSION_LOCK:
315
  removed = purge_expired_sessions()
316
- # Also clean up per-session locks for removed sessions
317
  stale_lock_keys = [k for k in SESSION_LOCKS if k not in SESSION_STORE]
318
  for k in stale_lock_keys:
319
  del SESSION_LOCKS[k]
@@ -335,14 +330,13 @@ async def lifespan(app: FastAPI):
335
  preload_model()
336
  logger.info("ML model loaded successfully")
337
  except Exception as e:
338
- logger.error(f"Failed to preload model: {e}")
339
 
340
  try:
341
  await asyncio.to_thread(run_startup_warmups)
342
  except Exception as exc:
343
  logger.warning("Startup warm-ups encountered an issue: %s", exc)
344
 
345
- # Background periodic purge task (M2 fix: avoid purging on every request)
346
  purge_task = asyncio.create_task(_periodic_session_purge())
347
 
348
  yield
@@ -355,7 +349,6 @@ async def lifespan(app: FastAPI):
355
  logger.info("Shutting down...")
356
 
357
 
358
- # Initialize FastAPI app with lifespan
359
  app = FastAPI(
360
  title="AI Voice Detection API",
361
  description="Detects whether a voice sample is AI-generated or spoken by a real human",
@@ -369,14 +362,9 @@ app = FastAPI(
369
  lifespan=lifespan
370
  )
371
 
372
- # Add rate limiter to app state
373
  app.state.limiter = limiter
374
  app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
375
 
376
- # Middleware configuration
377
- # CORS
378
- # Note: Set ALLOWED_ORIGINS env var in production
379
- # L2 fix: disable credentials for wildcard origins (browser ignores Set-Cookie anyway)
380
  _cors_origins = settings.ALLOWED_ORIGINS
381
  _cors_credentials = "*" not in _cors_origins
382
  if not _cors_credentials:
@@ -389,38 +377,29 @@ app.add_middleware(
389
  allow_headers=["Content-Type", "x-api-key", "Authorization"],
390
  )
391
 
392
- # Request Logging & Timing Middleware
393
  @app.middleware("http")
394
  async def log_requests(request: Request, call_next):
395
- # Generate request ID and start timer
396
  request_id = str(uuid.uuid4())[:8]
397
  request.state.request_id = request_id
398
  start_time = time.perf_counter()
399
 
400
- # Log request start
401
  method = request.method
402
  path = request.url.path
403
  if method == "POST":
404
- logger.info(f"[{request_id}] [START] {method} {path}")
405
-
406
- # Process request (async)
407
  response = await call_next(request)
408
-
409
- # Calculate duration
410
  duration_ms = (time.perf_counter() - start_time) * 1000
411
  status_code = response.status_code
412
-
413
- # Log request completion with timing
414
  if method == "POST":
415
  status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
416
- logger.info(f"[{request_id}] {status_label} END {method} {path} -> {status_code} ({duration_ms:.0f}ms)")
417
-
418
- # Add headers
419
  response.headers["X-Request-ID"] = request_id
420
  response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
421
  response.headers["X-Content-Type-Options"] = "nosniff"
422
- # Allow embedding in Hugging Face iframe
423
- # response.headers["X-Frame-Options"] = "DENY"
424
  response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
425
  # Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
426
  response.headers["Content-Security-Policy"] = (
@@ -433,7 +412,6 @@ async def log_requests(request: Request, call_next):
433
  return response
434
 
435
 
436
- # Request/Response Models
437
  class VoiceDetectionRequest(BaseModel):
438
  """Request body for voice detection."""
439
  language: str = Field(default="Auto", description="Language hint (Auto, English, Hindi, Hinglish, Tamil, Malayalam, Telugu). Defaults to auto-detect.")
@@ -703,7 +681,7 @@ def validate_supported_language(language: str) -> str:
703
  """Validate supported language. Falls back to 'Auto' for unknown languages so the
704
  evaluator never gets a 400 for an unexpected language hint."""
705
  if language not in settings.SUPPORTED_LANGUAGES:
706
- logger.warning(f"Unsupported language '{language}' — falling back to 'Auto'")
707
  return "Auto"
708
  return language
709
 
@@ -751,6 +729,7 @@ def dedupe_preserve_order(items: List[str]) -> List[str]:
751
  def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
752
  """Update session-level behaviour score from transcript and semantic trends."""
753
  transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
 
754
  transcript = normalize_transcript_for_behavior(transcript_source)
755
  semantic_flags = list(language_analysis.get("semantic_flags", []))
756
  keyword_categories = list(language_analysis.get("keyword_categories", []))
@@ -1087,24 +1066,14 @@ def build_risk_update(
1087
  confidence_audio = int(round(confidence * 100))
1088
  anomaly_audio = int(max(0.0, min(100.0, acoustic_anomaly * 0.85)))
1089
  audio_score = max(confidence_audio, anomaly_audio)
1090
- # When authenticity (signal forensics) contradicts AI classification,
1091
- # dampen the audio_score. Browser-mic audio typically has
1092
- # authenticity 34-60, so the threshold starts low.
1093
- # IMPORTANT: Only dampen for mic source — file uploads should trust
1094
- # the model classification.
1095
  if authenticity > 35 and _audio_source == "mic":
1096
- # Scale factor: authenticity 35 → 1.0 (no change),
1097
- # authenticity 55 → 0.80,
1098
- # authenticity 80 → 0.55
1099
  auth_dampen = max(0.50, 1.0 - (authenticity - 35) / 100.0)
1100
  audio_score = int(round(audio_score * auth_dampen))
1101
  else:
1102
  authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
1103
- # Browser mic naturally has higher spectral anomaly (40-78) due to
1104
- # noise floor and frequency response. Use 0.55 multiplier for mic
1105
- # (was 0.70) so anomaly 60 → score 33 instead of 42, keeping
1106
- # HUMAN chunks in LOW risk range where they belong.
1107
- # File uploads use standard 0.90 multiplier.
1108
  _anomaly_mult = 0.55 if _audio_source == "mic" else 0.90
1109
  anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * _anomaly_mult)))
1110
  audio_score = max(authenticity_audio_score, anomaly_audio_score)
@@ -1163,21 +1132,14 @@ def build_risk_update(
1163
  delta_boost = int(delta * settings.RISK_DELTA_BOOST_FACTOR)
1164
  risk_score = min(100, risk_score + delta_boost)
1165
 
1166
- # ── P2a: Risk dampening prevent single-chunk LOW→CRITICAL ────
1167
- # If previous score was below 60 (LOW/MEDIUM) and new score jumps
1168
- # to CRITICAL (>=80), cap at 79 unless 2+ recent HIGH scores in
1169
- # the session history support the escalation.
1170
  if previous_score is not None and previous_score < 60 and risk_score >= 80:
1171
  recent_high = sum(1 for s in _risk_history[-5:] if s >= 60)
1172
  if recent_high < 2:
1173
  risk_score = min(risk_score, 79)
1174
  behaviour_signals.append("risk_dampened_no_prior_high")
1175
 
1176
- # ── L3 fix: First-chunk guard ──────────────────────────────────────
1177
- # The very first chunk often contains connection noise / silence.
1178
- # Cap its risk at 60 (MEDIUM) so one noisy handshake doesn't set
1179
- # the session trajectory high — UNLESS there's a strong positive
1180
- # signal (AI voice, high acoustic anomaly, or fraud keywords).
1181
  if _chunks_processed == 0 and risk_score > 60:
1182
  has_strong_signal = (
1183
  (classification == "AI_GENERATED" and confidence >= 0.80)
@@ -1189,10 +1151,7 @@ def build_risk_update(
1189
  risk_score = 60
1190
  behaviour_signals.append("first_chunk_capped")
1191
 
1192
- # ── M4 fix: Cumulative risk escalation for sustained moderate signals ──
1193
- # If 3+ of the last 5 chunks scored ≥40 AND the current chunk also
1194
- # scores ≥40, apply a cumulative boost (3 pts per recent moderate chunk,
1195
- # max +15). This ensures sustained low-grade fraud eventually triggers alerts.
1196
  if len(_risk_history) >= 3 and risk_score >= 40:
1197
  recent_moderate = sum(1 for s in _risk_history[-5:] if s >= 40)
1198
  if recent_moderate >= 3:
@@ -1200,23 +1159,15 @@ def build_risk_update(
1200
  risk_score = min(100, risk_score + cumulative_boost)
1201
  behaviour_signals.append("sustained_moderate_risk")
1202
 
1203
- # ── P2b: Sustained AI voice escalation ───────────────────────────
1204
- # Instead of a flat floor at 70, escalate based on how many chunks
1205
- # have been classified as AI. floor = 70 + min(20, ai_chunks * 5)
1206
- # This means: 1 AI chunk → 75, 2 → 80, 3 → 85, 4+ → 90.
1207
- # Raised confidence threshold to 0.92 (was 0.85) because with
1208
- # temperature scaling T=4.0, the softened model outputs 0.67-0.84
1209
- # for browser mic audio. Only truly confident AI predictions should
1210
- # trigger this floor escalation.
1211
  if classification == "AI_GENERATED" and confidence >= 0.92:
1212
  ai_floor = 70 + min(20, _voice_ai_chunks * 5)
1213
  risk_score = max(risk_score, ai_floor)
1214
  if _voice_ai_chunks >= 2:
1215
  behaviour_signals.append("sustained_ai_voice")
1216
 
1217
- # ── P1: AI-voice-aware CPI ───────────────────────────────────────
1218
- # Add an AI-voice ratio component so CPI doesn't stay at 0 when
1219
- # the only signal is the model detecting synthetic voice.
1220
  _ai_ratio = (_voice_ai_chunks / max(1, _chunks_processed)) if _chunks_processed > 0 else 0.0
1221
  if previous_score is None:
1222
  cpi = min(100.0, max(0.0,
@@ -1264,11 +1215,7 @@ def build_risk_update(
1264
  or any(signal in behaviour_signals for signal in strong_intent)
1265
  )
1266
 
1267
- # ── P5: First-chunk alert guard ──────────────────────────────────
1268
- # On the very first chunk (_chunks_processed == 0), suppress the
1269
- # alert unless CRITICAL (risk >= 80) or strong semantic intent.
1270
- # This prevents a single false-positive chunk from triggering an
1271
- # alert that will persist in the session history.
1272
  if alert_triggered and _chunks_processed == 0:
1273
  has_strong_intent = any(s in behaviour_signals for s in strong_intent)
1274
  if risk_level != "CRITICAL" and not has_strong_intent:
@@ -1424,14 +1371,9 @@ async def process_audio_chunk(
1424
  f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
1425
  )
1426
 
1427
- # ── Short-chunk guard (bidirectional) ────────────────────────────
1428
- # Audio segments shorter than 2 s give the classifier insufficient
1429
- # spectral context, leading to unreliable predictions in both
1430
- # directions (e.g. a 1.6 s human tail flipping to AI 100%, or a
1431
- # short synthetic tail flipping to HUMAN 99%). When the session
1432
- # already has a clear majority classification, we carry that
1433
- # forward instead of trusting a sub-2-second segment.
1434
- MIN_RELIABLE_DURATION = 2.0 # seconds
1435
  if duration_sec < MIN_RELIABLE_DURATION:
1436
  async with SESSION_LOCK:
1437
  _sess = get_session_state(session_id)
@@ -1625,11 +1567,7 @@ async def process_audio_chunk(
1625
  session.final_voice_classification = voice_classification
1626
  session.final_voice_confidence = voice_confidence
1627
 
1628
- # ── P4: Reconcile final_call_label with majority vote ────────
1629
- # If the majority vote says HUMAN but the watermark-based label
1630
- # is FRAUD, downgrade. Use average risk (not max) to decide
1631
- # between SPAM and SAFE — a single spike shouldn't override an
1632
- # otherwise clean session.
1633
  if session.final_voice_classification == "HUMAN" and session.final_call_label == "FRAUD":
1634
  avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
1635
  session.final_call_label = "SPAM" if avg_risk >= 30 else "SAFE"
@@ -1638,19 +1576,14 @@ async def process_audio_chunk(
1638
  elif session.final_voice_classification == "AI_GENERATED" and session.final_call_label == "SAFE":
1639
  session.final_call_label = "SPAM"
1640
 
1641
- # ── P5: Average risk sanity check ────────────────────────────
1642
- # When the average risk across all chunks is LOW (< 35) but the
1643
- # label is FRAUD (because one or two spikes hit max_risk_score),
1644
- # downgrade to SPAM. A session where 80%+ of chunks are SAFE
1645
- # should not be labelled FRAUD — the spikes were likely
1646
- # misclassifications from browser mic audio artifacts.
1647
  if session.final_call_label == "FRAUD" and session.chunks_processed >= 5:
1648
  avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
1649
  if avg_risk < 35:
1650
  session.final_call_label = "SPAM"
1651
  logger.info(
1652
- f"P5 sanity: downgraded FRAUD SPAM (avg_risk={avg_risk:.1f}, "
1653
- f"chunks={session.chunks_processed})"
1654
  )
1655
 
1656
  if scored["alert"].triggered:
@@ -1852,7 +1785,7 @@ async def analyze_realtime_chunk(
1852
  @app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
1853
  async def stream_realtime_session(websocket: WebSocket, session_id: str):
1854
  """WebSocket endpoint for continuous chunk-based analysis."""
1855
- # L4 fix: accept auth via query param (legacy) OR first-message auth
1856
  has_query_key = verify_websocket_api_key(websocket)
1857
  if not has_query_key:
1858
  # No query-param key — accept connection and require first-message auth
@@ -1872,7 +1805,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1872
  request_id = f"ws-{session_id[:8]}"
1873
  ws_start = time.time()
1874
 
1875
- # L4 fix: if no query-param key, require first-message auth
1876
  if not has_query_key:
1877
  try:
1878
  auth_msg = await asyncio.wait_for(websocket.receive_json(), timeout=10.0)
@@ -1890,7 +1823,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1890
 
1891
  try:
1892
  while True:
1893
- # M8 fix: enforce max connection duration
1894
  elapsed = time.time() - ws_start
1895
  if elapsed >= settings.WS_MAX_DURATION_SECONDS:
1896
  await websocket.send_json({
@@ -1900,7 +1833,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1900
  await websocket.close(code=1000, reason="Max duration exceeded")
1901
  break
1902
 
1903
- # M8 fix: enforce idle timeout
1904
  try:
1905
  payload = await asyncio.wait_for(
1906
  websocket.receive_json(),
@@ -1932,7 +1865,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
1932
  except ValueError as e:
1933
  await websocket.send_json({"status": "error", "message": str(e)})
1934
  except WebSocketDisconnect:
1935
- logger.info(f"[{request_id}] WebSocket disconnected")
1936
 
1937
 
1938
  @app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse, include_in_schema=False)
@@ -2035,40 +1968,28 @@ async def detect_voice(
2035
  """
2036
  Returns classification result with confidence score and explanation.
2037
  """
2038
- # Log request info for debugging
2039
  request_id = getattr(request.state, 'request_id', 'unknown')
2040
- audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024 # Approximate decoded size
2041
- logger.info(f"[{request_id}] Voice detection request: language={voice_request.language}, format={voice_request.audioFormat}, size~{audio_size_kb:.1f}KB")
2042
-
 
2043
  voice_request.language = validate_supported_language(voice_request.language)
2044
  validate_supported_format(voice_request.audioFormat)
2045
 
2046
- # Hard timeout guard: evaluator kills requests at 30s — bail at 20s with a safe fallback
2047
  LEGACY_TIMEOUT_SECONDS = 20
2048
-
2049
  try:
2050
- # Step 1: Decode Base64 (async - runs in thread pool)
2051
- logger.info(f"[{request_id}] -> Decoding Base64...")
2052
  decode_start = time.perf_counter()
2053
  audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
2054
- decode_time = (time.perf_counter() - decode_start) * 1000
2055
-
2056
- # Step 2: Load audio (async - runs in thread pool)
2057
- logger.info(f"[{request_id}] -> Loading audio... (decode took {decode_time:.0f}ms)")
2058
- load_start = time.perf_counter()
2059
  audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 16000, voice_request.audioFormat)
2060
- load_time = (time.perf_counter() - load_start) * 1000
2061
 
2062
- # Truncate long audio to avoid timeout (keep first 20s max — plenty for classification)
2063
  max_samples = sr * 20
2064
  if len(audio) > max_samples:
2065
- logger.warning(f"[{request_id}] -> Truncating audio from {len(audio)/sr:.1f}s to 20s for timeout safety")
2066
  audio = audio[:max_samples]
2067
-
2068
- # Step 3: ML Analysis (async - runs in thread pool, CPU-bound) with timeout guard
2069
  duration_sec = len(audio) / sr
2070
- logger.info(f"[{request_id}] -> Analyzing {duration_sec:.1f}s audio... (load took {load_time:.0f}ms)")
2071
- analyze_start = time.perf_counter()
2072
  remaining_budget = LEGACY_TIMEOUT_SECONDS - (time.perf_counter() - decode_start)
2073
  if remaining_budget < 2:
2074
  raise asyncio.TimeoutError("Insufficient time budget for analysis")
@@ -2076,11 +1997,11 @@ async def detect_voice(
2076
  asyncio.to_thread(analyze_voice, audio, sr, voice_request.language),
2077
  timeout=max(2.0, remaining_budget)
2078
  )
2079
- analyze_time = (time.perf_counter() - analyze_start) * 1000
2080
-
2081
- logger.info(f"[{request_id}] -> Analysis complete: {result.classification} ({result.confidence_score:.0%}) in {analyze_time:.0f}ms")
2082
-
2083
- # Extract metrics if available
2084
  metrics = None
2085
  if result.features:
2086
  metrics = ForensicMetrics(
@@ -2094,7 +2015,6 @@ async def detect_voice(
2094
  explanation = result.explanation
2095
  recommended_action = None
2096
  response_classification = result.classification
2097
- # Never return UNCERTAIN on legacy endpoint — evaluator only accepts HUMAN / AI_GENERATED
2098
  if model_uncertain:
2099
  explanation = (
2100
  "Model uncertainty detected due fallback inference. "
@@ -2111,7 +2031,6 @@ async def detect_voice(
2111
  "credentials. Verify caller identity through official channels."
2112
  )
2113
 
2114
- # Return response
2115
  return VoiceDetectionResponse(
2116
  status="success",
2117
  language=voice_request.language,
@@ -2124,13 +2043,13 @@ async def detect_voice(
2124
  )
2125
 
2126
  except ValueError as e:
2127
- logger.warning(f"[{request_id}] [VALIDATION_ERROR] {e}")
2128
  raise HTTPException(
2129
  status_code=400,
2130
  detail={"status": "error", "message": str(e)}
2131
  )
2132
  except asyncio.TimeoutError:
2133
- logger.warning(f"[{request_id}] [TIMEOUT] Legacy endpoint exceeded {LEGACY_TIMEOUT_SECONDS}s budget returning safe fallback")
2134
  return VoiceDetectionResponse(
2135
  status="success",
2136
  language=voice_request.language,
@@ -2142,10 +2061,10 @@ async def detect_voice(
2142
  recommendedAction="Analysis took too long. Verify caller identity through official channels.",
2143
  )
2144
  except Exception as e:
2145
- logger.error(f"[{request_id}] [PROCESSING_ERROR] {e}", exc_info=True)
2146
  raise HTTPException(
2147
  status_code=500,
2148
- detail={"status": "error", "message": f"Internal Server Error (request_id={request_id})"}
2149
  )
2150
 
2151
 
@@ -2215,7 +2134,7 @@ async def http_exception_handler(request: Request, exc: HTTPException):
2215
  @app.exception_handler(Exception)
2216
  async def global_exception_handler(request: Request, exc: Exception):
2217
  """Global handler to catch unhandled exceptions and prevent stack traces."""
2218
- logger.error(f"Unhandled error: {exc}", exc_info=True)
2219
  return JSONResponse(
2220
  status_code=500,
2221
  content={"status": "error", "message": "Internal Server Error"}
@@ -2225,23 +2144,3 @@ async def global_exception_handler(request: Request, exc: Exception):
2225
  if __name__ == "__main__":
2226
  import uvicorn
2227
  uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
2228
-
2229
-
2230
-
2231
-
2232
-
2233
-
2234
-
2235
-
2236
-
2237
-
2238
-
2239
-
2240
-
2241
-
2242
-
2243
-
2244
-
2245
-
2246
-
2247
-
 
27
  from slowapi.util import get_remote_address
28
  from slowapi.errors import RateLimitExceeded
29
 
 
30
  logging.basicConfig(
31
  level=logging.INFO,
32
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
  )
34
  logger = logging.getLogger(__name__)
35
 
 
36
  limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
37
 
38
  from audio_utils import decode_base64_audio, load_audio_from_bytes
 
48
  except Exception: # pragma: no cover - optional dependency
49
  redis = None
50
 
 
51
  MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
52
 
53
 
 
85
 
86
  SESSION_STORE: Dict[str, SessionState] = {}
87
  SESSION_LOCK = asyncio.Lock()
88
+ SESSION_LOCKS: Dict[str, asyncio.Lock] = {}
89
  SESSION_STORE_BACKEND_ACTIVE = "memory"
90
  REDIS_CLIENT: Any = None
91
  ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
92
  ASR_INFLIGHT_LOCK = asyncio.Lock()
93
 
94
 
 
95
  def use_redis_session_store() -> bool:
96
  """Return whether redis-backed session store is active."""
97
  return SESSION_STORE_BACKEND_ACTIVE == "redis" and REDIS_CLIENT is not None
 
292
 
293
  # Detect environment
294
  if settings.SPACE_ID:
295
+ logger.info("Running on HuggingFace Spaces: %s", settings.SPACE_ID)
296
 
297
 
298
  def get_session_lock(session_id: str) -> asyncio.Lock:
299
+ """Return a per-session lock, creating one if needed."""
300
  if session_id not in SESSION_LOCKS:
301
  SESSION_LOCKS[session_id] = asyncio.Lock()
302
  return SESSION_LOCKS[session_id]
303
 
304
 
305
  async def _periodic_session_purge(interval: int = 60) -> None:
306
+ """Background task: purge expired sessions every *interval* seconds."""
307
  while True:
308
  try:
309
  await asyncio.sleep(interval)
310
  async with SESSION_LOCK:
311
  removed = purge_expired_sessions()
 
312
  stale_lock_keys = [k for k in SESSION_LOCKS if k not in SESSION_STORE]
313
  for k in stale_lock_keys:
314
  del SESSION_LOCKS[k]
 
330
  preload_model()
331
  logger.info("ML model loaded successfully")
332
  except Exception as e:
333
+ logger.error("Failed to preload model: %s", e)
334
 
335
  try:
336
  await asyncio.to_thread(run_startup_warmups)
337
  except Exception as exc:
338
  logger.warning("Startup warm-ups encountered an issue: %s", exc)
339
 
 
340
  purge_task = asyncio.create_task(_periodic_session_purge())
341
 
342
  yield
 
349
  logger.info("Shutting down...")
350
 
351
 
 
352
  app = FastAPI(
353
  title="AI Voice Detection API",
354
  description="Detects whether a voice sample is AI-generated or spoken by a real human",
 
362
  lifespan=lifespan
363
  )
364
 
 
365
  app.state.limiter = limiter
366
  app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
367
 
 
 
 
 
368
  _cors_origins = settings.ALLOWED_ORIGINS
369
  _cors_credentials = "*" not in _cors_origins
370
  if not _cors_credentials:
 
377
  allow_headers=["Content-Type", "x-api-key", "Authorization"],
378
  )
379
 
 
380
  @app.middleware("http")
381
  async def log_requests(request: Request, call_next):
 
382
  request_id = str(uuid.uuid4())[:8]
383
  request.state.request_id = request_id
384
  start_time = time.perf_counter()
385
 
 
386
  method = request.method
387
  path = request.url.path
388
  if method == "POST":
389
+ logger.info("[%s] [START] %s %s", request_id, method, path)
390
+
 
391
  response = await call_next(request)
392
+
 
393
  duration_ms = (time.perf_counter() - start_time) * 1000
394
  status_code = response.status_code
395
+
 
396
  if method == "POST":
397
  status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
398
+ logger.info("[%s] %s END %s %s -> %s (%0.fms)", request_id, status_label, method, path, status_code, duration_ms)
399
+
 
400
  response.headers["X-Request-ID"] = request_id
401
  response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
402
  response.headers["X-Content-Type-Options"] = "nosniff"
 
 
403
  response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
404
  # Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
405
  response.headers["Content-Security-Policy"] = (
 
412
  return response
413
 
414
 
 
415
  class VoiceDetectionRequest(BaseModel):
416
  """Request body for voice detection."""
417
  language: str = Field(default="Auto", description="Language hint (Auto, English, Hindi, Hinglish, Tamil, Malayalam, Telugu). Defaults to auto-detect.")
 
681
  """Validate supported language. Falls back to 'Auto' for unknown languages so the
682
  evaluator never gets a 400 for an unexpected language hint."""
683
  if language not in settings.SUPPORTED_LANGUAGES:
684
+ logger.warning("Unsupported language '%s' — falling back to 'Auto'", language)
685
  return "Auto"
686
  return language
687
 
 
729
  def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
730
  """Update session-level behaviour score from transcript and semantic trends."""
731
  transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
732
+
733
  transcript = normalize_transcript_for_behavior(transcript_source)
734
  semantic_flags = list(language_analysis.get("semantic_flags", []))
735
  keyword_categories = list(language_analysis.get("keyword_categories", []))
 
1066
  confidence_audio = int(round(confidence * 100))
1067
  anomaly_audio = int(max(0.0, min(100.0, acoustic_anomaly * 0.85)))
1068
  audio_score = max(confidence_audio, anomaly_audio)
1069
+ # Dampen audio_score when signal forensics contradict AI classification
1070
+ # for mic-source audio (browser mic has authenticity 34-60 naturally).
 
 
 
1071
  if authenticity > 35 and _audio_source == "mic":
 
 
 
1072
  auth_dampen = max(0.50, 1.0 - (authenticity - 35) / 100.0)
1073
  audio_score = int(round(audio_score * auth_dampen))
1074
  else:
1075
  authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
1076
+ # Mic audio has higher spectral anomaly (40-78); use lower multiplier.
 
 
 
 
1077
  _anomaly_mult = 0.55 if _audio_source == "mic" else 0.90
1078
  anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * _anomaly_mult)))
1079
  audio_score = max(authenticity_audio_score, anomaly_audio_score)
 
1132
  delta_boost = int(delta * settings.RISK_DELTA_BOOST_FACTOR)
1133
  risk_score = min(100, risk_score + delta_boost)
1134
 
1135
+ # Risk dampening: prevent single-chunk LOW→CRITICAL jumps.
 
 
 
1136
  if previous_score is not None and previous_score < 60 and risk_score >= 80:
1137
  recent_high = sum(1 for s in _risk_history[-5:] if s >= 60)
1138
  if recent_high < 2:
1139
  risk_score = min(risk_score, 79)
1140
  behaviour_signals.append("risk_dampened_no_prior_high")
1141
 
1142
+ # First-chunk guard: cap noise-only first chunks at MEDIUM.
 
 
 
 
1143
  if _chunks_processed == 0 and risk_score > 60:
1144
  has_strong_signal = (
1145
  (classification == "AI_GENERATED" and confidence >= 0.80)
 
1151
  risk_score = 60
1152
  behaviour_signals.append("first_chunk_capped")
1153
 
1154
+ # Cumulative escalation for sustained moderate signals.
 
 
 
1155
  if len(_risk_history) >= 3 and risk_score >= 40:
1156
  recent_moderate = sum(1 for s in _risk_history[-5:] if s >= 40)
1157
  if recent_moderate >= 3:
 
1159
  risk_score = min(100, risk_score + cumulative_boost)
1160
  behaviour_signals.append("sustained_moderate_risk")
1161
 
1162
+ # Sustained AI voice floor escalation.
1163
+ # floor = 70 + min(20, ai_chunks * 5)
 
 
 
 
 
 
1164
  if classification == "AI_GENERATED" and confidence >= 0.92:
1165
  ai_floor = 70 + min(20, _voice_ai_chunks * 5)
1166
  risk_score = max(risk_score, ai_floor)
1167
  if _voice_ai_chunks >= 2:
1168
  behaviour_signals.append("sustained_ai_voice")
1169
 
1170
+ # AI-voice-aware CPI includes synthetic voice ratio.
 
 
1171
  _ai_ratio = (_voice_ai_chunks / max(1, _chunks_processed)) if _chunks_processed > 0 else 0.0
1172
  if previous_score is None:
1173
  cpi = min(100.0, max(0.0,
 
1215
  or any(signal in behaviour_signals for signal in strong_intent)
1216
  )
1217
 
1218
+ # First-chunk alert guard: suppress unless CRITICAL or strong intent.
 
 
 
 
1219
  if alert_triggered and _chunks_processed == 0:
1220
  has_strong_intent = any(s in behaviour_signals for s in strong_intent)
1221
  if risk_level != "CRITICAL" and not has_strong_intent:
 
1371
  f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
1372
  )
1373
 
1374
+ # Short-chunk guard: sub-2s segments are unreliable; carry forward
1375
+ # the session's majority classification instead.
1376
+ MIN_RELIABLE_DURATION = 2.0
 
 
 
 
 
1377
  if duration_sec < MIN_RELIABLE_DURATION:
1378
  async with SESSION_LOCK:
1379
  _sess = get_session_state(session_id)
 
1567
  session.final_voice_classification = voice_classification
1568
  session.final_voice_confidence = voice_confidence
1569
 
1570
+ # Reconcile final_call_label with majority vote.
 
 
 
 
1571
  if session.final_voice_classification == "HUMAN" and session.final_call_label == "FRAUD":
1572
  avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
1573
  session.final_call_label = "SPAM" if avg_risk >= 30 else "SAFE"
 
1576
  elif session.final_voice_classification == "AI_GENERATED" and session.final_call_label == "SAFE":
1577
  session.final_call_label = "SPAM"
1578
 
1579
+ # Average risk sanity check: downgrade FRAUD when most chunks are LOW.
 
 
 
 
 
1580
  if session.final_call_label == "FRAUD" and session.chunks_processed >= 5:
1581
  avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
1582
  if avg_risk < 35:
1583
  session.final_call_label = "SPAM"
1584
  logger.info(
1585
+ "Sanity: downgraded FRAUD -> SPAM (avg_risk=%.1f, chunks=%d)",
1586
+ avg_risk, session.chunks_processed,
1587
  )
1588
 
1589
  if scored["alert"].triggered:
 
1785
  @app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
1786
  async def stream_realtime_session(websocket: WebSocket, session_id: str):
1787
  """WebSocket endpoint for continuous chunk-based analysis."""
1788
+ # Accept auth via query-param or first-message token
1789
  has_query_key = verify_websocket_api_key(websocket)
1790
  if not has_query_key:
1791
  # No query-param key — accept connection and require first-message auth
 
1805
  request_id = f"ws-{session_id[:8]}"
1806
  ws_start = time.time()
1807
 
1808
+ # Fall back to first-message authentication
1809
  if not has_query_key:
1810
  try:
1811
  auth_msg = await asyncio.wait_for(websocket.receive_json(), timeout=10.0)
 
1823
 
1824
  try:
1825
  while True:
1826
+ # Enforce max connection duration
1827
  elapsed = time.time() - ws_start
1828
  if elapsed >= settings.WS_MAX_DURATION_SECONDS:
1829
  await websocket.send_json({
 
1833
  await websocket.close(code=1000, reason="Max duration exceeded")
1834
  break
1835
 
1836
+ # Enforce idle timeout
1837
  try:
1838
  payload = await asyncio.wait_for(
1839
  websocket.receive_json(),
 
1865
  except ValueError as e:
1866
  await websocket.send_json({"status": "error", "message": str(e)})
1867
  except WebSocketDisconnect:
1868
+ logger.info("[%s] WebSocket disconnected", request_id)
1869
 
1870
 
1871
  @app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse, include_in_schema=False)
 
1968
  """
1969
  Returns classification result with confidence score and explanation.
1970
  """
 
1971
  request_id = getattr(request.state, 'request_id', 'unknown')
1972
+ audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024
1973
+ logger.info("[%s] Voice detection: lang=%s, fmt=%s, size~%.1fKB",
1974
+ request_id, voice_request.language, voice_request.audioFormat, audio_size_kb)
1975
+
1976
  voice_request.language = validate_supported_language(voice_request.language)
1977
  validate_supported_format(voice_request.audioFormat)
1978
 
 
1979
  LEGACY_TIMEOUT_SECONDS = 20
1980
+
1981
  try:
 
 
1982
  decode_start = time.perf_counter()
1983
  audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
1984
+
 
 
 
 
1985
  audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 16000, voice_request.audioFormat)
 
1986
 
 
1987
  max_samples = sr * 20
1988
  if len(audio) > max_samples:
1989
+ logger.warning("[%s] Truncating audio from %.1fs to 20s", request_id, len(audio) / sr)
1990
  audio = audio[:max_samples]
1991
+
 
1992
  duration_sec = len(audio) / sr
 
 
1993
  remaining_budget = LEGACY_TIMEOUT_SECONDS - (time.perf_counter() - decode_start)
1994
  if remaining_budget < 2:
1995
  raise asyncio.TimeoutError("Insufficient time budget for analysis")
 
1997
  asyncio.to_thread(analyze_voice, audio, sr, voice_request.language),
1998
  timeout=max(2.0, remaining_budget)
1999
  )
2000
+ analyze_time = (time.perf_counter() - decode_start) * 1000
2001
+
2002
+ logger.info("[%s] Analysis complete: %s (%.0f%%) in %.0fms",
2003
+ request_id, result.classification, result.confidence_score * 100, analyze_time)
2004
+
2005
  metrics = None
2006
  if result.features:
2007
  metrics = ForensicMetrics(
 
2015
  explanation = result.explanation
2016
  recommended_action = None
2017
  response_classification = result.classification
 
2018
  if model_uncertain:
2019
  explanation = (
2020
  "Model uncertainty detected due fallback inference. "
 
2031
  "credentials. Verify caller identity through official channels."
2032
  )
2033
 
 
2034
  return VoiceDetectionResponse(
2035
  status="success",
2036
  language=voice_request.language,
 
2043
  )
2044
 
2045
  except ValueError as e:
2046
+ logger.warning("[%s] Validation error: %s", request_id, e)
2047
  raise HTTPException(
2048
  status_code=400,
2049
  detail={"status": "error", "message": str(e)}
2050
  )
2051
  except asyncio.TimeoutError:
2052
+ logger.warning("[%s] Legacy endpoint exceeded %ds budget", request_id, LEGACY_TIMEOUT_SECONDS)
2053
  return VoiceDetectionResponse(
2054
  status="success",
2055
  language=voice_request.language,
 
2061
  recommendedAction="Analysis took too long. Verify caller identity through official channels.",
2062
  )
2063
  except Exception as e:
2064
+ logger.error("[%s] Processing error: %s", request_id, e, exc_info=True)
2065
  raise HTTPException(
2066
  status_code=500,
2067
+ detail={"status": "error", "message": "Internal Server Error"}
2068
  )
2069
 
2070
 
 
2134
  @app.exception_handler(Exception)
2135
  async def global_exception_handler(request: Request, exc: Exception):
2136
  """Global handler to catch unhandled exceptions and prevent stack traces."""
2137
+ logger.error("Unhandled error: %s", exc, exc_info=True)
2138
  return JSONResponse(
2139
  status_code=500,
2140
  content={"status": "error", "message": "Internal Server Error"}
 
2144
  if __name__ == "__main__":
2145
  import uvicorn
2146
  uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.py CHANGED
@@ -14,14 +14,12 @@ import warnings
14
 
15
  from config import settings
16
 
17
- # Configure logging
18
  logger = logging.getLogger(__name__)
19
 
20
- # Suppress warnings
21
  warnings.filterwarnings("ignore", category=FutureWarning)
22
  warnings.filterwarnings("ignore", category=UserWarning)
23
 
24
- # ── Heuristic thresholds (M7 fix: centralised for easy tuning) ──────
25
  HEURISTIC_THRESHOLDS = {
26
  # Pitch scoring
27
  "pitch_optimal_stability": float(os.getenv("PITCH_OPTIMAL_STABILITY", "0.20")),
@@ -46,7 +44,6 @@ _model = None
46
  _processor = None
47
  _device = None
48
 
49
-
50
  @dataclass
51
  class AnalysisResult:
52
  """Result of voice analysis."""
@@ -64,7 +61,7 @@ def get_device():
64
  _device = "cuda"
65
  else:
66
  _device = "cpu"
67
- logger.info(f"Using device: {_device}")
68
  return _device
69
 
70
 
@@ -75,11 +72,9 @@ def _detect_label_inversion(model):
75
  """Check once at load time whether this model needs label flipping."""
76
  global _invert_labels
77
  name = getattr(model.config, '_name_or_path', '').lower()
78
- if 'shivam-2211' in name or 'voice-detection-model' in name:
79
- _invert_labels = True
80
- logger.info("Model has inverted training labels label flip enabled (logged once)")
81
- else:
82
- _invert_labels = False
83
 
84
 
85
  def load_model():
@@ -102,10 +97,10 @@ def load_model():
102
  backup_model = settings.VOICE_MODEL_BACKUP_ID
103
 
104
  if os.path.exists(local_path):
105
- logger.info(f"Loading local fine-tuned model from: {local_path}")
106
  model_name = local_path
107
  else:
108
- logger.info(f"Loading model from HuggingFace Hub: {hf_model}")
109
  model_name = hf_model
110
 
111
  try:
@@ -113,10 +108,10 @@ def load_model():
113
  _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
114
  _model.to(get_device())
115
  _model.eval()
116
- logger.info(f"Model loaded successfully: {model_name}")
117
  _detect_label_inversion(_model)
118
  except Exception as e:
119
- logger.error(f"Failed to load model {model_name}: {e}")
120
  if model_name != backup_model:
121
  logger.warning("Trying backup model...")
122
  model_name = backup_model
@@ -125,7 +120,7 @@ def load_model():
125
  _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
126
  _model.to(get_device())
127
  _model.eval()
128
- logger.info(f"Backup model loaded: {model_name}")
129
  _detect_label_inversion(_model)
130
  except Exception as e2:
131
  raise RuntimeError(f"Could not load any model: {e2}")
@@ -212,7 +207,7 @@ def extract_signal_features(audio: np.ndarray, sr: int, fast_mode: bool = False)
212
  features["harmonic_noise_ratio_db"] = hnr_db
213
 
214
  except Exception as e:
215
- logger.warning(f"Feature extraction error: {e}")
216
  features = {
217
  "pitch_stability": 0.5,
218
  "jitter": 0.05,
@@ -478,12 +473,12 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
478
  model, processor = load_model()
479
  device = get_device()
480
 
481
- # Normalize audio to prevent clipping issues
482
  max_val = np.max(np.abs(audio))
483
  if max_val > 0:
484
  audio = audio / max_val
485
 
486
- # Resample to 16kHz if needed (Wav2Vec2 expects 16kHz)
487
  target_sr = 16000
488
  if sr != target_sr:
489
  audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
@@ -495,33 +490,25 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
495
  return_tensors="pt",
496
  padding=True
497
  )
498
-
499
- # Move to device
500
  inputs = {k: v.to(device) for k, v in inputs.items()}
501
-
502
- # Run inference
503
  with torch.no_grad():
504
  outputs = model(**inputs)
505
  logits = outputs.logits
506
 
507
- # Temperature scaling: divide logits by T > 1 to soften the
508
- # probability distribution. The model routinely saturates at
509
- # 1.00 confidence for browser-mic audio, leaving zero room for
510
- # the heuristic cross-check to correct a wrong classification.
511
  temperature = float(settings.MODEL_LOGIT_TEMPERATURE)
512
  if temperature > 1.0:
513
  logits = logits / temperature
514
 
515
  probabilities = torch.softmax(logits, dim=-1)
516
-
517
  # Get prediction
518
  predicted_class = torch.argmax(probabilities, dim=-1).item()
519
  confidence = probabilities[0][predicted_class].item()
520
 
521
- # Map class to label using the model's id2label config.
522
- # IMPORTANT: HuggingFace stores id2label with STRING keys ("0", "1")
523
- # but predicted_class from torch.argmax().item() is an int.
524
- # We must normalise the keys to int so .get() actually matches.
525
  raw_id2label = getattr(model.config, 'id2label', None) or {}
526
  id2label = {int(k): v for k, v in raw_id2label.items()}
527
  label = id2label.get(predicted_class, 'UNKNOWN')
@@ -532,21 +519,10 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
532
  [f"{p:.4f}" for p in probabilities[0].cpu().tolist()],
533
  )
534
 
535
- # ── Label interpretation ──
536
- # The primary model (shivam-2211/voice-detection-model) was trained with
537
- # inverted label semantics: its class-0 output actually corresponds to
538
- # REAL/human audio and class-1 to FAKE/AI-generated, despite the config
539
- # claiming 0=FAKE and 1=REAL. Detected once at load time via
540
- # _detect_label_inversion().
541
  if _invert_labels:
542
- # Flip: treat model class-0 as REAL, class-1 as FAKE
543
- if predicted_class == 0:
544
- classification = "HUMAN"
545
- else:
546
- classification = "AI_GENERATED"
547
- # confidence stays the same (model's own softmax output)
548
  else:
549
- # Standard mapping: use labels from config
550
  if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
551
  classification = "AI_GENERATED"
552
  else:
@@ -586,7 +562,7 @@ def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtim
586
  try:
587
  classification, ml_confidence = classify_with_model(audio, sr)
588
  except Exception as e:
589
- logger.error(f"ML model error: {e}, falling back to signal analysis")
590
  ml_fallback = True
591
  classification = "HUMAN"
592
  ml_confidence = 0.5
@@ -611,39 +587,21 @@ def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtim
611
  ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
612
  ml_confidence = float(max(0.5, min(0.99, ml_confidence)))
613
 
614
- # ── Authenticity cross-check (REALTIME ONLY) ─────────────────────
615
- # When the model says AI_GENERATED but the signal forensics indicate
616
- # human-like audio (high authenticity), moderate the confidence.
617
- # This prevents a poorly-calibrated model from steamrolling the
618
- # heuristic evidence. The model was fine-tuned on curated datasets
619
- # and can misclassify real browser-mic audio as synthetic.
620
- #
621
- # IMPORTANT: This override is ONLY for realtime browser-mic sessions.
622
- # File uploads use clean audio paths and the model's classification
623
- # should be trusted. Applying the override to file uploads would
624
- # cause real AI-generated audio to be misclassified as HUMAN.
625
- #
626
- # Browser-mic audio typically has authenticity 34-60 and anomaly 40-78
627
- # (naturally higher noise floor and spectral irregularity). The
628
- # thresholds must reflect these real-world ranges.
629
  if realtime and source == "mic" and classification == "AI_GENERATED" and authenticity_score > 35:
630
- # The higher the authenticity, the more we moderate.
631
- # authenticity 35 → no change. authenticity 60 → cap at ~0.75
632
- # authenticity 80 → cap at ~0.55
633
  moderation_factor = max(0.50, 1.0 - (authenticity_score - 35) / 100.0)
634
  if ml_confidence > moderation_factor:
635
  logger.info(
636
- "Authenticity cross-check: moderated AI confidence %.2f %.2f "
637
  "(authenticity=%.1f, anomaly=%.1f)",
638
  ml_confidence, moderation_factor,
639
  authenticity_score, acoustic_anomaly_score,
640
  )
641
  ml_confidence = moderation_factor
642
- # If authenticity indicates human-like features (>40) and anomaly
643
- # is not extreme (<65), override the classification — the signal
644
- # evidence strongly contradicts the model. Browser mic audio
645
- # naturally has anomaly 22-64 and authenticity 34-68, so the
646
- # thresholds must cover these real-world ranges.
647
  if authenticity_score > 40 and acoustic_anomaly_score < 65:
648
  logger.info(
649
  "Authenticity override: flipping AI_GENERATED → HUMAN "
@@ -682,4 +640,4 @@ def preload_model():
682
  try:
683
  load_model()
684
  except Exception as e:
685
- logger.error(f"Model preload failed: {e}")
 
14
 
15
  from config import settings
16
 
 
17
  logger = logging.getLogger(__name__)
18
 
 
19
  warnings.filterwarnings("ignore", category=FutureWarning)
20
  warnings.filterwarnings("ignore", category=UserWarning)
21
 
22
+ # Heuristic thresholds (env-configurable for tuning)
23
  HEURISTIC_THRESHOLDS = {
24
  # Pitch scoring
25
  "pitch_optimal_stability": float(os.getenv("PITCH_OPTIMAL_STABILITY", "0.20")),
 
44
  _processor = None
45
  _device = None
46
 
 
47
  @dataclass
48
  class AnalysisResult:
49
  """Result of voice analysis."""
 
61
  _device = "cuda"
62
  else:
63
  _device = "cpu"
64
+ logger.info("Using device: %s", _device)
65
  return _device
66
 
67
 
 
72
  """Check once at load time whether this model needs label flipping."""
73
  global _invert_labels
74
  name = getattr(model.config, '_name_or_path', '').lower()
75
+ _invert_labels = 'shivam-2211' in name or 'voice-detection-model' in name
76
+ if _invert_labels:
77
+ logger.info("Label inversion enabled for model: %s", name)
 
 
78
 
79
 
80
  def load_model():
 
97
  backup_model = settings.VOICE_MODEL_BACKUP_ID
98
 
99
  if os.path.exists(local_path):
100
+ logger.info("Loading local model from: %s", local_path)
101
  model_name = local_path
102
  else:
103
+ logger.info("Loading model from HuggingFace Hub: %s", hf_model)
104
  model_name = hf_model
105
 
106
  try:
 
108
  _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
109
  _model.to(get_device())
110
  _model.eval()
111
+ logger.info("Model loaded: %s", model_name)
112
  _detect_label_inversion(_model)
113
  except Exception as e:
114
+ logger.error("Failed to load model %s: %s", model_name, e)
115
  if model_name != backup_model:
116
  logger.warning("Trying backup model...")
117
  model_name = backup_model
 
120
  _model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
121
  _model.to(get_device())
122
  _model.eval()
123
+ logger.info("Backup model loaded: %s", model_name)
124
  _detect_label_inversion(_model)
125
  except Exception as e2:
126
  raise RuntimeError(f"Could not load any model: {e2}")
 
207
  features["harmonic_noise_ratio_db"] = hnr_db
208
 
209
  except Exception as e:
210
+ logger.warning("Feature extraction error: %s", e)
211
  features = {
212
  "pitch_stability": 0.5,
213
  "jitter": 0.05,
 
473
  model, processor = load_model()
474
  device = get_device()
475
 
476
+ # Normalize audio
477
  max_val = np.max(np.abs(audio))
478
  if max_val > 0:
479
  audio = audio / max_val
480
 
481
+ # Resample to 16kHz if needed
482
  target_sr = 16000
483
  if sr != target_sr:
484
  audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
 
490
  return_tensors="pt",
491
  padding=True
492
  )
493
+
 
494
  inputs = {k: v.to(device) for k, v in inputs.items()}
495
+
 
496
  with torch.no_grad():
497
  outputs = model(**inputs)
498
  logits = outputs.logits
499
 
500
+ # Temperature scaling: soften probability distribution so the
501
+ # heuristic cross-check can still correct misclassifications.
 
 
502
  temperature = float(settings.MODEL_LOGIT_TEMPERATURE)
503
  if temperature > 1.0:
504
  logits = logits / temperature
505
 
506
  probabilities = torch.softmax(logits, dim=-1)
 
507
  # Get prediction
508
  predicted_class = torch.argmax(probabilities, dim=-1).item()
509
  confidence = probabilities[0][predicted_class].item()
510
 
511
+ # Normalise id2label keys from str to int (HF convention mismatch).
 
 
 
512
  raw_id2label = getattr(model.config, 'id2label', None) or {}
513
  id2label = {int(k): v for k, v in raw_id2label.items()}
514
  label = id2label.get(predicted_class, 'UNKNOWN')
 
519
  [f"{p:.4f}" for p in probabilities[0].cpu().tolist()],
520
  )
521
 
522
+ # Label interpretation — see _detect_label_inversion() for rationale.
 
 
 
 
 
523
  if _invert_labels:
524
+ classification = "HUMAN" if predicted_class == 0 else "AI_GENERATED"
 
 
 
 
 
525
  else:
 
526
  if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
527
  classification = "AI_GENERATED"
528
  else:
 
562
  try:
563
  classification, ml_confidence = classify_with_model(audio, sr)
564
  except Exception as e:
565
+ logger.error("ML model error: %s, falling back to signal analysis", e)
566
  ml_fallback = True
567
  classification = "HUMAN"
568
  ml_confidence = 0.5
 
587
  ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
588
  ml_confidence = float(max(0.5, min(0.99, ml_confidence)))
589
 
590
+ # Authenticity cross-check (realtime mic only): when the model says
591
+ # AI_GENERATED but signal forensics show human-like audio, moderate
592
+ # the confidence or flip the classification. Not applied to file
593
+ # uploads where the model should be trusted.
 
 
 
 
 
 
 
 
 
 
 
594
  if realtime and source == "mic" and classification == "AI_GENERATED" and authenticity_score > 35:
 
 
 
595
  moderation_factor = max(0.50, 1.0 - (authenticity_score - 35) / 100.0)
596
  if ml_confidence > moderation_factor:
597
  logger.info(
598
+ "Authenticity cross-check: moderated AI confidence %.2f -> %.2f "
599
  "(authenticity=%.1f, anomaly=%.1f)",
600
  ml_confidence, moderation_factor,
601
  authenticity_score, acoustic_anomaly_score,
602
  )
603
  ml_confidence = moderation_factor
604
+ # Override when signal evidence strongly contradicts the model.
 
 
 
 
605
  if authenticity_score > 40 and acoustic_anomaly_score < 65:
606
  logger.info(
607
  "Authenticity override: flipping AI_GENERATED → HUMAN "
 
640
  try:
641
  load_model()
642
  except Exception as e:
643
+ logger.error("Model preload failed: %s", e)
test_my_api.py DELETED
@@ -1,171 +0,0 @@
1
- """
2
- Official evaluation script from the hackathon guide, configured with our 5 test files.
3
- This mirrors EXACTLY what the evaluator will run.
4
- """
5
- import requests
6
- import base64
7
- import json
8
-
9
- def evaluate_voice_detection_api(endpoint_url, api_key, test_files):
10
- if not endpoint_url:
11
- print("Error: Endpoint URL is required")
12
- return False
13
- if not test_files or len(test_files) == 0:
14
- print("Error: No test files provided")
15
- return False
16
-
17
- total_files = len(test_files)
18
- score_per_file = 100 / total_files
19
- total_score = 0
20
- file_results = []
21
-
22
- print(f"\n{'='*60}")
23
- print(f"Starting Evaluation")
24
- print(f"{'='*60}")
25
- print(f"Endpoint: {endpoint_url}")
26
- print(f"Total Test Files: {total_files}")
27
- print(f"Score per File: {score_per_file:.2f}")
28
- print(f"{'='*60}\n")
29
-
30
- for idx, file_data in enumerate(test_files):
31
- language = file_data.get('language', 'English')
32
- file_path = file_data.get('file_path', '')
33
- expected_classification = file_data.get('expected_classification', '')
34
-
35
- print(f"Test {idx + 1}/{total_files}: {file_path}")
36
-
37
- if not file_path or not expected_classification:
38
- file_results.append({'fileIndex': idx, 'status': 'skipped', 'score': 0})
39
- print(f" Skipped: Missing file path or expected classification\n")
40
- continue
41
-
42
- try:
43
- with open(file_path, 'rb') as audio_file:
44
- audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')
45
- except Exception as e:
46
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': f'Failed to read: {e}', 'score': 0})
47
- print(f" Failed to read file: {e}\n")
48
- continue
49
-
50
- headers = {'Content-Type': 'application/json', 'x-api-key': api_key}
51
- request_body = {'language': language, 'audioFormat': 'mp3', 'audioBase64': audio_base64}
52
-
53
- try:
54
- response = requests.post(endpoint_url, headers=headers, json=request_body, timeout=30)
55
-
56
- if response.status_code != 200:
57
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': f'HTTP {response.status_code}', 'score': 0})
58
- print(f" HTTP Status: {response.status_code}")
59
- print(f" Response: {response.text[:200]}\n")
60
- continue
61
-
62
- response_data = response.json()
63
-
64
- if not isinstance(response_data, dict):
65
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': 'Not a JSON object', 'score': 0})
66
- print(f" Invalid response type\n")
67
- continue
68
-
69
- response_status = response_data.get('status', '')
70
- response_classification = response_data.get('classification', '')
71
- confidence_score = response_data.get('confidenceScore', None)
72
-
73
- if not response_status or not response_classification or confidence_score is None:
74
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': 'Missing required fields', 'score': 0})
75
- print(f" Missing required fields")
76
- print(f" Response: {json.dumps(response_data, indent=2)[:200]}\n")
77
- continue
78
-
79
- if response_status != 'success':
80
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': f'Status: {response_status}', 'score': 0})
81
- print(f" Status not 'success': {response_status}\n")
82
- continue
83
-
84
- if not isinstance(confidence_score, (int, float)) or confidence_score < 0 or confidence_score > 1:
85
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': f'Invalid confidence: {confidence_score}', 'score': 0})
86
- print(f" Invalid confidence score: {confidence_score}\n")
87
- continue
88
-
89
- valid_classifications = ['HUMAN', 'AI_GENERATED']
90
- if response_classification not in valid_classifications:
91
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': f'Invalid classification: {response_classification}', 'score': 0})
92
- print(f" Invalid classification: {response_classification}\n")
93
- continue
94
-
95
- # Score calculation
96
- file_score = 0
97
- if response_classification == expected_classification:
98
- if confidence_score >= 0.8:
99
- file_score = score_per_file
100
- confidence_tier = "100%"
101
- elif confidence_score >= 0.6:
102
- file_score = score_per_file * 0.75
103
- confidence_tier = "75%"
104
- elif confidence_score >= 0.4:
105
- file_score = score_per_file * 0.5
106
- confidence_tier = "50%"
107
- else:
108
- file_score = score_per_file * 0.25
109
- confidence_tier = "25%"
110
- total_score += file_score
111
- file_results.append({'fileIndex': idx, 'status': 'success', 'matched': True, 'score': round(file_score, 2),
112
- 'actualClassification': response_classification, 'confidenceScore': confidence_score})
113
- print(f" CORRECT: {response_classification}")
114
- print(f" Confidence: {confidence_score:.2f} -> {confidence_tier} of points")
115
- print(f" Score: {file_score:.2f}/{score_per_file:.2f}\n")
116
- else:
117
- file_results.append({'fileIndex': idx, 'status': 'success', 'matched': False, 'score': 0,
118
- 'actualClassification': response_classification, 'confidenceScore': confidence_score})
119
- print(f" WRONG: {response_classification} (Expected: {expected_classification})")
120
- print(f" Score: 0/{score_per_file:.2f}\n")
121
-
122
- except requests.exceptions.Timeout:
123
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': 'Timeout (>30s)', 'score': 0})
124
- print(f" TIMEOUT: Request took longer than 30 seconds\n")
125
- except requests.exceptions.ConnectionError:
126
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': 'Connection error', 'score': 0})
127
- print(f" CONNECTION ERROR\n")
128
- except Exception as e:
129
- file_results.append({'fileIndex': idx, 'status': 'failed', 'message': str(e), 'score': 0})
130
- print(f" ERROR: {e}\n")
131
-
132
- final_score = round(total_score)
133
-
134
- print(f"{'='*60}")
135
- print(f"EVALUATION SUMMARY")
136
- print(f"{'='*60}")
137
- print(f"Total Files Tested: {total_files}")
138
- print(f"Final Score: {final_score}/100")
139
- print(f"{'='*60}\n")
140
-
141
- successful = sum(1 for r in file_results if r.get('matched', False))
142
- failed = sum(1 for r in file_results if r['status'] == 'failed')
143
- wrong = sum(1 for r in file_results if r['status'] == 'success' and not r.get('matched', False))
144
-
145
- print(f"Correct Classifications: {successful}/{total_files}")
146
- print(f"Wrong Classifications: {wrong}/{total_files}")
147
- print(f"Failed/Errors: {failed}/{total_files}\n")
148
-
149
- with open('evaluation_results.json', 'w') as f:
150
- json.dump({'finalScore': final_score, 'totalFiles': total_files, 'scorePerFile': round(score_per_file, 2),
151
- 'successfulClassifications': successful, 'wrongClassifications': wrong, 'failedTests': failed,
152
- 'fileResults': file_results}, f, indent=2)
153
- print(f"Detailed results saved to: evaluation_results.json\n")
154
- return True
155
-
156
-
157
- if __name__ == '__main__':
158
- ENDPOINT_URL = 'https://shivam-2211-voice-detection-api.hf.space/api/voice-detection'
159
- API_KEY = 'sk_test_voice_detection_2026'
160
-
161
- DIR = r'c:\Users\shiva\OneDrive\Desktop\Voice Project\voice-detection-api\drive-download-20260216T053632Z-1-001'
162
-
163
- TEST_FILES = [
164
- {'language': 'English', 'file_path': f'{DIR}\\English_voice_AI_GENERATED.mp3', 'expected_classification': 'AI_GENERATED'},
165
- {'language': 'Hindi', 'file_path': f'{DIR}\\Hindi_Voice_HUMAN.mp3', 'expected_classification': 'HUMAN'},
166
- {'language': 'Malayalam','file_path': f'{DIR}\\Malayalam_AI_GENERATED.mp3', 'expected_classification': 'AI_GENERATED'},
167
- {'language': 'Tamil', 'file_path': f'{DIR}\\TAMIL_VOICE__HUMAN.mp3', 'expected_classification': 'HUMAN'},
168
- {'language': 'Telugu', 'file_path': f'{DIR}\\Telugu_Voice_AI_GENERATED.mp3', 'expected_classification': 'AI_GENERATED'},
169
- ]
170
-
171
- evaluate_voice_detection_api(ENDPOINT_URL, API_KEY, TEST_FILES)