Spaces:
Sleeping
Sleeping
shivam0897-i committed on
Commit ·
4bfc577
1
Parent(s): 8a6ab53
chore: clean codebase for production
Browse files
- Remove fix-tag comments and verbose inline comments
- Convert f-string loggers to lazy %s formatting
- Clean up trailing whitespace and blank lines
- Update .gitignore to exclude non-production files
- Remove tracked test artifacts (evaluation_results, test_my_api)
- .gitignore +18 -12
- audio_utils.py +3 -9
- config.py +1 -1
- evaluation_results.json +0 -50
- main.py +53 -154
- model.py +28 -70
- test_my_api.py +0 -171
.gitignore
CHANGED
|
@@ -39,18 +39,10 @@ Thumbs.db
|
|
| 39 |
fine_tuned_model/
|
| 40 |
training/
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
pytest.ini
|
| 47 |
-
|
| 48 |
-
# Docs and reports
|
| 49 |
-
docs/
|
| 50 |
-
|
| 51 |
-
# Dev/validation scripts
|
| 52 |
-
scripts/
|
| 53 |
-
scenario_validation_cases.py
|
| 54 |
|
| 55 |
# Test request fixtures
|
| 56 |
test_request.json
|
|
@@ -59,3 +51,17 @@ test_valid.json
|
|
| 59 |
# Helper/patch scripts
|
| 60 |
_fix_*.py
|
| 61 |
_test_*.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
fine_tuned_model/
|
| 40 |
training/
|
| 41 |
|
| 42 |
+
# Test artifacts (generated output)
|
| 43 |
+
evaluation_results.json
|
| 44 |
+
test_my_api.py
|
| 45 |
+
run_final_tests.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Test request fixtures
|
| 48 |
test_request.json
|
|
|
|
| 51 |
# Helper/patch scripts
|
| 52 |
_fix_*.py
|
| 53 |
_test_*.py
|
| 54 |
+
|
| 55 |
+
# Download folders
|
| 56 |
+
drive-download-*/
|
| 57 |
+
|
| 58 |
+
# Local docs / tests / scripts (not deployed)
|
| 59 |
+
docs/
|
| 60 |
+
tests/
|
| 61 |
+
scripts/
|
| 62 |
+
pytest.ini
|
| 63 |
+
scenario_validation_cases.py
|
| 64 |
+
realtime-analysis-*.json
|
| 65 |
+
|
| 66 |
+
# Python project metadata (not needed for deployment)
|
| 67 |
+
pyproject.toml
|
audio_utils.py
CHANGED
|
@@ -11,7 +11,6 @@ import numpy as np
|
|
| 11 |
import librosa
|
| 12 |
import soundfile as sf
|
| 13 |
|
| 14 |
-
# Configure logging
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
# Magic bytes for common audio formats
|
|
@@ -106,38 +105,33 @@ def load_audio_from_bytes(audio_bytes: bytes, target_sr: int = 22050, audio_form
|
|
| 106 |
Raises:
|
| 107 |
ValueError: If audio cannot be loaded or is invalid
|
| 108 |
"""
|
| 109 |
-
# Validate audio content BEFORE attempting to decode
|
| 110 |
is_valid, validation_result = validate_audio_content(audio_bytes)
|
| 111 |
if not is_valid:
|
| 112 |
raise ValueError(f"Invalid audio file: {validation_result}")
|
| 113 |
|
| 114 |
-
logger.info(
|
| 115 |
|
| 116 |
tmp_path = None
|
| 117 |
try:
|
| 118 |
-
# Normalize format
|
| 119 |
audio_format = audio_format.lower().strip()
|
| 120 |
if audio_format.startswith("."):
|
| 121 |
audio_format = audio_format[1:]
|
| 122 |
|
| 123 |
-
#
|
| 124 |
if not audio_format.isalnum() or len(audio_format) > 5:
|
| 125 |
raise ValueError(f"Invalid audio format: {audio_format}")
|
| 126 |
|
| 127 |
-
# Write to temp file for librosa
|
| 128 |
with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False) as tmp_file:
|
| 129 |
tmp_file.write(audio_bytes)
|
| 130 |
tmp_path = tmp_file.name
|
| 131 |
|
| 132 |
-
# Load audio with librosa
|
| 133 |
audio, sr = librosa.load(tmp_path, sr=target_sr, mono=True)
|
| 134 |
|
| 135 |
-
# Validate loaded audio
|
| 136 |
if len(audio) == 0:
|
| 137 |
raise ValueError("Audio file is empty or could not be decoded")
|
| 138 |
|
| 139 |
duration = len(audio) / sr
|
| 140 |
-
logger.info(
|
| 141 |
|
| 142 |
return audio, sr
|
| 143 |
|
|
|
|
| 11 |
import librosa
|
| 12 |
import soundfile as sf
|
| 13 |
|
|
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
# Magic bytes for common audio formats
|
|
|
|
| 105 |
Raises:
|
| 106 |
ValueError: If audio cannot be loaded or is invalid
|
| 107 |
"""
|
|
|
|
| 108 |
is_valid, validation_result = validate_audio_content(audio_bytes)
|
| 109 |
if not is_valid:
|
| 110 |
raise ValueError(f"Invalid audio file: {validation_result}")
|
| 111 |
|
| 112 |
+
logger.info("Audio validation passed. Detected format hint: %s", validation_result)
|
| 113 |
|
| 114 |
tmp_path = None
|
| 115 |
try:
|
|
|
|
| 116 |
audio_format = audio_format.lower().strip()
|
| 117 |
if audio_format.startswith("."):
|
| 118 |
audio_format = audio_format[1:]
|
| 119 |
|
| 120 |
+
# Reject suspicious format strings
|
| 121 |
if not audio_format.isalnum() or len(audio_format) > 5:
|
| 122 |
raise ValueError(f"Invalid audio format: {audio_format}")
|
| 123 |
|
|
|
|
| 124 |
with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False) as tmp_file:
|
| 125 |
tmp_file.write(audio_bytes)
|
| 126 |
tmp_path = tmp_file.name
|
| 127 |
|
|
|
|
| 128 |
audio, sr = librosa.load(tmp_path, sr=target_sr, mono=True)
|
| 129 |
|
|
|
|
| 130 |
if len(audio) == 0:
|
| 131 |
raise ValueError("Audio file is empty or could not be decoded")
|
| 132 |
|
| 133 |
duration = len(audio) / sr
|
| 134 |
+
logger.info("Audio loaded: %.2fs at %dHz", duration, sr)
|
| 135 |
|
| 136 |
return audio, sr
|
| 137 |
|
config.py
CHANGED
|
@@ -178,7 +178,7 @@ class Settings(BaseSettings):
|
|
| 178 |
description="Mask sensitive entities from transcript before returning response"
|
| 179 |
)
|
| 180 |
|
| 181 |
-
# WebSocket limits
|
| 182 |
WS_MAX_DURATION_SECONDS: int = Field(
|
| 183 |
default=1800,
|
| 184 |
description="Maximum WebSocket connection duration in seconds (30 min)"
|
|
|
|
| 178 |
description="Mask sensitive entities from transcript before returning response"
|
| 179 |
)
|
| 180 |
|
| 181 |
+
# WebSocket limits
|
| 182 |
WS_MAX_DURATION_SECONDS: int = Field(
|
| 183 |
default=1800,
|
| 184 |
description="Maximum WebSocket connection duration in seconds (30 min)"
|
evaluation_results.json
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"finalScore": 100,
|
| 3 |
-
"totalFiles": 5,
|
| 4 |
-
"scorePerFile": 20.0,
|
| 5 |
-
"successfulClassifications": 5,
|
| 6 |
-
"wrongClassifications": 0,
|
| 7 |
-
"failedTests": 0,
|
| 8 |
-
"fileResults": [
|
| 9 |
-
{
|
| 10 |
-
"fileIndex": 0,
|
| 11 |
-
"status": "success",
|
| 12 |
-
"matched": true,
|
| 13 |
-
"score": 20.0,
|
| 14 |
-
"actualClassification": "AI_GENERATED",
|
| 15 |
-
"confidenceScore": 0.99
|
| 16 |
-
},
|
| 17 |
-
{
|
| 18 |
-
"fileIndex": 1,
|
| 19 |
-
"status": "success",
|
| 20 |
-
"matched": true,
|
| 21 |
-
"score": 20.0,
|
| 22 |
-
"actualClassification": "HUMAN",
|
| 23 |
-
"confidenceScore": 0.99
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"fileIndex": 2,
|
| 27 |
-
"status": "success",
|
| 28 |
-
"matched": true,
|
| 29 |
-
"score": 20.0,
|
| 30 |
-
"actualClassification": "AI_GENERATED",
|
| 31 |
-
"confidenceScore": 0.99
|
| 32 |
-
},
|
| 33 |
-
{
|
| 34 |
-
"fileIndex": 3,
|
| 35 |
-
"status": "success",
|
| 36 |
-
"matched": true,
|
| 37 |
-
"score": 20.0,
|
| 38 |
-
"actualClassification": "HUMAN",
|
| 39 |
-
"confidenceScore": 0.99
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"fileIndex": 4,
|
| 43 |
-
"status": "success",
|
| 44 |
-
"matched": true,
|
| 45 |
-
"score": 20.0,
|
| 46 |
-
"actualClassification": "AI_GENERATED",
|
| 47 |
-
"confidenceScore": 0.99
|
| 48 |
-
}
|
| 49 |
-
]
|
| 50 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
|
@@ -27,14 +27,12 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
|
|
| 27 |
from slowapi.util import get_remote_address
|
| 28 |
from slowapi.errors import RateLimitExceeded
|
| 29 |
|
| 30 |
-
# Configure logging
|
| 31 |
logging.basicConfig(
|
| 32 |
level=logging.INFO,
|
| 33 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 34 |
)
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
-
# Rate limiting
|
| 38 |
limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
|
| 39 |
|
| 40 |
from audio_utils import decode_base64_audio, load_audio_from_bytes
|
|
@@ -50,7 +48,6 @@ try:
|
|
| 50 |
except Exception: # pragma: no cover - optional dependency
|
| 51 |
redis = None
|
| 52 |
|
| 53 |
-
# Computed constraints
|
| 54 |
MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
|
| 55 |
|
| 56 |
|
|
@@ -88,14 +85,13 @@ class SessionState:
|
|
| 88 |
|
| 89 |
SESSION_STORE: Dict[str, SessionState] = {}
|
| 90 |
SESSION_LOCK = asyncio.Lock()
|
| 91 |
-
SESSION_LOCKS: Dict[str, asyncio.Lock] = {}
|
| 92 |
SESSION_STORE_BACKEND_ACTIVE = "memory"
|
| 93 |
REDIS_CLIENT: Any = None
|
| 94 |
ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
|
| 95 |
ASR_INFLIGHT_LOCK = asyncio.Lock()
|
| 96 |
|
| 97 |
|
| 98 |
-
|
| 99 |
def use_redis_session_store() -> bool:
|
| 100 |
"""Return whether redis-backed session store is active."""
|
| 101 |
return SESSION_STORE_BACKEND_ACTIVE == "redis" and REDIS_CLIENT is not None
|
|
@@ -296,24 +292,23 @@ def run_startup_warmups() -> None:
|
|
| 296 |
|
| 297 |
# Detect environment
|
| 298 |
if settings.SPACE_ID:
|
| 299 |
-
logger.info(
|
| 300 |
|
| 301 |
|
| 302 |
def get_session_lock(session_id: str) -> asyncio.Lock:
|
| 303 |
-
"""Return a per-session lock, creating one if needed
|
| 304 |
if session_id not in SESSION_LOCKS:
|
| 305 |
SESSION_LOCKS[session_id] = asyncio.Lock()
|
| 306 |
return SESSION_LOCKS[session_id]
|
| 307 |
|
| 308 |
|
| 309 |
async def _periodic_session_purge(interval: int = 60) -> None:
|
| 310 |
-
"""Background task: purge expired sessions every *interval* seconds
|
| 311 |
while True:
|
| 312 |
try:
|
| 313 |
await asyncio.sleep(interval)
|
| 314 |
async with SESSION_LOCK:
|
| 315 |
removed = purge_expired_sessions()
|
| 316 |
-
# Also clean up per-session locks for removed sessions
|
| 317 |
stale_lock_keys = [k for k in SESSION_LOCKS if k not in SESSION_STORE]
|
| 318 |
for k in stale_lock_keys:
|
| 319 |
del SESSION_LOCKS[k]
|
|
@@ -335,14 +330,13 @@ async def lifespan(app: FastAPI):
|
|
| 335 |
preload_model()
|
| 336 |
logger.info("ML model loaded successfully")
|
| 337 |
except Exception as e:
|
| 338 |
-
logger.error(
|
| 339 |
|
| 340 |
try:
|
| 341 |
await asyncio.to_thread(run_startup_warmups)
|
| 342 |
except Exception as exc:
|
| 343 |
logger.warning("Startup warm-ups encountered an issue: %s", exc)
|
| 344 |
|
| 345 |
-
# Background periodic purge task (M2 fix: avoid purging on every request)
|
| 346 |
purge_task = asyncio.create_task(_periodic_session_purge())
|
| 347 |
|
| 348 |
yield
|
|
@@ -355,7 +349,6 @@ async def lifespan(app: FastAPI):
|
|
| 355 |
logger.info("Shutting down...")
|
| 356 |
|
| 357 |
|
| 358 |
-
# Initialize FastAPI app with lifespan
|
| 359 |
app = FastAPI(
|
| 360 |
title="AI Voice Detection API",
|
| 361 |
description="Detects whether a voice sample is AI-generated or spoken by a real human",
|
|
@@ -369,14 +362,9 @@ app = FastAPI(
|
|
| 369 |
lifespan=lifespan
|
| 370 |
)
|
| 371 |
|
| 372 |
-
# Add rate limiter to app state
|
| 373 |
app.state.limiter = limiter
|
| 374 |
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
|
| 375 |
|
| 376 |
-
# Middleware configuration
|
| 377 |
-
# CORS
|
| 378 |
-
# Note: Set ALLOWED_ORIGINS env var in production
|
| 379 |
-
# L2 fix: disable credentials for wildcard origins (browser ignores Set-Cookie anyway)
|
| 380 |
_cors_origins = settings.ALLOWED_ORIGINS
|
| 381 |
_cors_credentials = "*" not in _cors_origins
|
| 382 |
if not _cors_credentials:
|
|
@@ -389,38 +377,29 @@ app.add_middleware(
|
|
| 389 |
allow_headers=["Content-Type", "x-api-key", "Authorization"],
|
| 390 |
)
|
| 391 |
|
| 392 |
-
# Request Logging & Timing Middleware
|
| 393 |
@app.middleware("http")
|
| 394 |
async def log_requests(request: Request, call_next):
|
| 395 |
-
# Generate request ID and start timer
|
| 396 |
request_id = str(uuid.uuid4())[:8]
|
| 397 |
request.state.request_id = request_id
|
| 398 |
start_time = time.perf_counter()
|
| 399 |
|
| 400 |
-
# Log request start
|
| 401 |
method = request.method
|
| 402 |
path = request.url.path
|
| 403 |
if method == "POST":
|
| 404 |
-
logger.info(
|
| 405 |
-
|
| 406 |
-
# Process request (async)
|
| 407 |
response = await call_next(request)
|
| 408 |
-
|
| 409 |
-
# Calculate duration
|
| 410 |
duration_ms = (time.perf_counter() - start_time) * 1000
|
| 411 |
status_code = response.status_code
|
| 412 |
-
|
| 413 |
-
# Log request completion with timing
|
| 414 |
if method == "POST":
|
| 415 |
status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
|
| 416 |
-
logger.info(
|
| 417 |
-
|
| 418 |
-
# Add headers
|
| 419 |
response.headers["X-Request-ID"] = request_id
|
| 420 |
response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
|
| 421 |
response.headers["X-Content-Type-Options"] = "nosniff"
|
| 422 |
-
# Allow embedding in Hugging Face iframe
|
| 423 |
-
# response.headers["X-Frame-Options"] = "DENY"
|
| 424 |
response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
|
| 425 |
# Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
|
| 426 |
response.headers["Content-Security-Policy"] = (
|
|
@@ -433,7 +412,6 @@ async def log_requests(request: Request, call_next):
|
|
| 433 |
return response
|
| 434 |
|
| 435 |
|
| 436 |
-
# Request/Response Models
|
| 437 |
class VoiceDetectionRequest(BaseModel):
|
| 438 |
"""Request body for voice detection."""
|
| 439 |
language: str = Field(default="Auto", description="Language hint (Auto, English, Hindi, Hinglish, Tamil, Malayalam, Telugu). Defaults to auto-detect.")
|
|
@@ -703,7 +681,7 @@ def validate_supported_language(language: str) -> str:
|
|
| 703 |
"""Validate supported language. Falls back to 'Auto' for unknown languages so the
|
| 704 |
evaluator never gets a 400 for an unexpected language hint."""
|
| 705 |
if language not in settings.SUPPORTED_LANGUAGES:
|
| 706 |
-
logger.warning(
|
| 707 |
return "Auto"
|
| 708 |
return language
|
| 709 |
|
|
@@ -751,6 +729,7 @@ def dedupe_preserve_order(items: List[str]) -> List[str]:
|
|
| 751 |
def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
| 752 |
"""Update session-level behaviour score from transcript and semantic trends."""
|
| 753 |
transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
|
|
|
|
| 754 |
transcript = normalize_transcript_for_behavior(transcript_source)
|
| 755 |
semantic_flags = list(language_analysis.get("semantic_flags", []))
|
| 756 |
keyword_categories = list(language_analysis.get("keyword_categories", []))
|
|
@@ -1087,24 +1066,14 @@ def build_risk_update(
|
|
| 1087 |
confidence_audio = int(round(confidence * 100))
|
| 1088 |
anomaly_audio = int(max(0.0, min(100.0, acoustic_anomaly * 0.85)))
|
| 1089 |
audio_score = max(confidence_audio, anomaly_audio)
|
| 1090 |
-
#
|
| 1091 |
-
#
|
| 1092 |
-
# authenticity 34-60, so the threshold starts low.
|
| 1093 |
-
# IMPORTANT: Only dampen for mic source — file uploads should trust
|
| 1094 |
-
# the model classification.
|
| 1095 |
if authenticity > 35 and _audio_source == "mic":
|
| 1096 |
-
# Scale factor: authenticity 35 → 1.0 (no change),
|
| 1097 |
-
# authenticity 55 → 0.80,
|
| 1098 |
-
# authenticity 80 → 0.55
|
| 1099 |
auth_dampen = max(0.50, 1.0 - (authenticity - 35) / 100.0)
|
| 1100 |
audio_score = int(round(audio_score * auth_dampen))
|
| 1101 |
else:
|
| 1102 |
authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
|
| 1103 |
-
#
|
| 1104 |
-
# noise floor and frequency response. Use 0.55 multiplier for mic
|
| 1105 |
-
# (was 0.70) so anomaly 60 → score 33 instead of 42, keeping
|
| 1106 |
-
# HUMAN chunks in LOW risk range where they belong.
|
| 1107 |
-
# File uploads use standard 0.90 multiplier.
|
| 1108 |
_anomaly_mult = 0.55 if _audio_source == "mic" else 0.90
|
| 1109 |
anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * _anomaly_mult)))
|
| 1110 |
audio_score = max(authenticity_audio_score, anomaly_audio_score)
|
|
@@ -1163,21 +1132,14 @@ def build_risk_update(
|
|
| 1163 |
delta_boost = int(delta * settings.RISK_DELTA_BOOST_FACTOR)
|
| 1164 |
risk_score = min(100, risk_score + delta_boost)
|
| 1165 |
|
| 1166 |
-
#
|
| 1167 |
-
# If previous score was below 60 (LOW/MEDIUM) and new score jumps
|
| 1168 |
-
# to CRITICAL (>=80), cap at 79 unless 2+ recent HIGH scores in
|
| 1169 |
-
# the session history support the escalation.
|
| 1170 |
if previous_score is not None and previous_score < 60 and risk_score >= 80:
|
| 1171 |
recent_high = sum(1 for s in _risk_history[-5:] if s >= 60)
|
| 1172 |
if recent_high < 2:
|
| 1173 |
risk_score = min(risk_score, 79)
|
| 1174 |
behaviour_signals.append("risk_dampened_no_prior_high")
|
| 1175 |
|
| 1176 |
-
#
|
| 1177 |
-
# The very first chunk often contains connection noise / silence.
|
| 1178 |
-
# Cap its risk at 60 (MEDIUM) so one noisy handshake doesn't set
|
| 1179 |
-
# the session trajectory high — UNLESS there's a strong positive
|
| 1180 |
-
# signal (AI voice, high acoustic anomaly, or fraud keywords).
|
| 1181 |
if _chunks_processed == 0 and risk_score > 60:
|
| 1182 |
has_strong_signal = (
|
| 1183 |
(classification == "AI_GENERATED" and confidence >= 0.80)
|
|
@@ -1189,10 +1151,7 @@ def build_risk_update(
|
|
| 1189 |
risk_score = 60
|
| 1190 |
behaviour_signals.append("first_chunk_capped")
|
| 1191 |
|
| 1192 |
-
#
|
| 1193 |
-
# If 3+ of the last 5 chunks scored ≥40 AND the current chunk also
|
| 1194 |
-
# scores ≥40, apply a cumulative boost (3 pts per recent moderate chunk,
|
| 1195 |
-
# max +15). This ensures sustained low-grade fraud eventually triggers alerts.
|
| 1196 |
if len(_risk_history) >= 3 and risk_score >= 40:
|
| 1197 |
recent_moderate = sum(1 for s in _risk_history[-5:] if s >= 40)
|
| 1198 |
if recent_moderate >= 3:
|
|
@@ -1200,23 +1159,15 @@ def build_risk_update(
|
|
| 1200 |
risk_score = min(100, risk_score + cumulative_boost)
|
| 1201 |
behaviour_signals.append("sustained_moderate_risk")
|
| 1202 |
|
| 1203 |
-
#
|
| 1204 |
-
#
|
| 1205 |
-
# have been classified as AI. floor = 70 + min(20, ai_chunks * 5)
|
| 1206 |
-
# This means: 1 AI chunk → 75, 2 → 80, 3 → 85, 4+ → 90.
|
| 1207 |
-
# Raised confidence threshold to 0.92 (was 0.85) because with
|
| 1208 |
-
# temperature scaling T=4.0, the softened model outputs 0.67-0.84
|
| 1209 |
-
# for browser mic audio. Only truly confident AI predictions should
|
| 1210 |
-
# trigger this floor escalation.
|
| 1211 |
if classification == "AI_GENERATED" and confidence >= 0.92:
|
| 1212 |
ai_floor = 70 + min(20, _voice_ai_chunks * 5)
|
| 1213 |
risk_score = max(risk_score, ai_floor)
|
| 1214 |
if _voice_ai_chunks >= 2:
|
| 1215 |
behaviour_signals.append("sustained_ai_voice")
|
| 1216 |
|
| 1217 |
-
#
|
| 1218 |
-
# Add an AI-voice ratio component so CPI doesn't stay at 0 when
|
| 1219 |
-
# the only signal is the model detecting synthetic voice.
|
| 1220 |
_ai_ratio = (_voice_ai_chunks / max(1, _chunks_processed)) if _chunks_processed > 0 else 0.0
|
| 1221 |
if previous_score is None:
|
| 1222 |
cpi = min(100.0, max(0.0,
|
|
@@ -1264,11 +1215,7 @@ def build_risk_update(
|
|
| 1264 |
or any(signal in behaviour_signals for signal in strong_intent)
|
| 1265 |
)
|
| 1266 |
|
| 1267 |
-
#
|
| 1268 |
-
# On the very first chunk (_chunks_processed == 0), suppress the
|
| 1269 |
-
# alert unless CRITICAL (risk >= 80) or strong semantic intent.
|
| 1270 |
-
# This prevents a single false-positive chunk from triggering an
|
| 1271 |
-
# alert that will persist in the session history.
|
| 1272 |
if alert_triggered and _chunks_processed == 0:
|
| 1273 |
has_strong_intent = any(s in behaviour_signals for s in strong_intent)
|
| 1274 |
if risk_level != "CRITICAL" and not has_strong_intent:
|
|
@@ -1424,14 +1371,9 @@ async def process_audio_chunk(
|
|
| 1424 |
f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
|
| 1425 |
)
|
| 1426 |
|
| 1427 |
-
#
|
| 1428 |
-
#
|
| 1429 |
-
|
| 1430 |
-
# directions (e.g. a 1.6 s human tail flipping to AI 100%, or a
|
| 1431 |
-
# short synthetic tail flipping to HUMAN 99%). When the session
|
| 1432 |
-
# already has a clear majority classification, we carry that
|
| 1433 |
-
# forward instead of trusting a sub-2-second segment.
|
| 1434 |
-
MIN_RELIABLE_DURATION = 2.0 # seconds
|
| 1435 |
if duration_sec < MIN_RELIABLE_DURATION:
|
| 1436 |
async with SESSION_LOCK:
|
| 1437 |
_sess = get_session_state(session_id)
|
|
@@ -1625,11 +1567,7 @@ async def process_audio_chunk(
|
|
| 1625 |
session.final_voice_classification = voice_classification
|
| 1626 |
session.final_voice_confidence = voice_confidence
|
| 1627 |
|
| 1628 |
-
#
|
| 1629 |
-
# If the majority vote says HUMAN but the watermark-based label
|
| 1630 |
-
# is FRAUD, downgrade. Use average risk (not max) to decide
|
| 1631 |
-
# between SPAM and SAFE — a single spike shouldn't override an
|
| 1632 |
-
# otherwise clean session.
|
| 1633 |
if session.final_voice_classification == "HUMAN" and session.final_call_label == "FRAUD":
|
| 1634 |
avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
|
| 1635 |
session.final_call_label = "SPAM" if avg_risk >= 30 else "SAFE"
|
|
@@ -1638,19 +1576,14 @@ async def process_audio_chunk(
|
|
| 1638 |
elif session.final_voice_classification == "AI_GENERATED" and session.final_call_label == "SAFE":
|
| 1639 |
session.final_call_label = "SPAM"
|
| 1640 |
|
| 1641 |
-
#
|
| 1642 |
-
# When the average risk across all chunks is LOW (< 35) but the
|
| 1643 |
-
# label is FRAUD (because one or two spikes hit max_risk_score),
|
| 1644 |
-
# downgrade to SPAM. A session where 80%+ of chunks are SAFE
|
| 1645 |
-
# should not be labelled FRAUD — the spikes were likely
|
| 1646 |
-
# misclassifications from browser mic audio artifacts.
|
| 1647 |
if session.final_call_label == "FRAUD" and session.chunks_processed >= 5:
|
| 1648 |
avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
|
| 1649 |
if avg_risk < 35:
|
| 1650 |
session.final_call_label = "SPAM"
|
| 1651 |
logger.info(
|
| 1652 |
-
|
| 1653 |
-
|
| 1654 |
)
|
| 1655 |
|
| 1656 |
if scored["alert"].triggered:
|
|
@@ -1852,7 +1785,7 @@ async def analyze_realtime_chunk(
|
|
| 1852 |
@app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
|
| 1853 |
async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
| 1854 |
"""WebSocket endpoint for continuous chunk-based analysis."""
|
| 1855 |
-
#
|
| 1856 |
has_query_key = verify_websocket_api_key(websocket)
|
| 1857 |
if not has_query_key:
|
| 1858 |
# No query-param key — accept connection and require first-message auth
|
|
@@ -1872,7 +1805,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
|
| 1872 |
request_id = f"ws-{session_id[:8]}"
|
| 1873 |
ws_start = time.time()
|
| 1874 |
|
| 1875 |
-
#
|
| 1876 |
if not has_query_key:
|
| 1877 |
try:
|
| 1878 |
auth_msg = await asyncio.wait_for(websocket.receive_json(), timeout=10.0)
|
|
@@ -1890,7 +1823,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
|
| 1890 |
|
| 1891 |
try:
|
| 1892 |
while True:
|
| 1893 |
-
#
|
| 1894 |
elapsed = time.time() - ws_start
|
| 1895 |
if elapsed >= settings.WS_MAX_DURATION_SECONDS:
|
| 1896 |
await websocket.send_json({
|
|
@@ -1900,7 +1833,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
|
| 1900 |
await websocket.close(code=1000, reason="Max duration exceeded")
|
| 1901 |
break
|
| 1902 |
|
| 1903 |
-
#
|
| 1904 |
try:
|
| 1905 |
payload = await asyncio.wait_for(
|
| 1906 |
websocket.receive_json(),
|
|
@@ -1932,7 +1865,7 @@ async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
|
| 1932 |
except ValueError as e:
|
| 1933 |
await websocket.send_json({"status": "error", "message": str(e)})
|
| 1934 |
except WebSocketDisconnect:
|
| 1935 |
-
logger.info(
|
| 1936 |
|
| 1937 |
|
| 1938 |
@app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse, include_in_schema=False)
|
|
@@ -2035,40 +1968,28 @@ async def detect_voice(
|
|
| 2035 |
"""
|
| 2036 |
Returns classification result with confidence score and explanation.
|
| 2037 |
"""
|
| 2038 |
-
# Log request info for debugging
|
| 2039 |
request_id = getattr(request.state, 'request_id', 'unknown')
|
| 2040 |
-
audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024
|
| 2041 |
-
logger.info(
|
| 2042 |
-
|
|
|
|
| 2043 |
voice_request.language = validate_supported_language(voice_request.language)
|
| 2044 |
validate_supported_format(voice_request.audioFormat)
|
| 2045 |
|
| 2046 |
-
# Hard timeout guard: evaluator kills requests at 30s — bail at 20s with a safe fallback
|
| 2047 |
LEGACY_TIMEOUT_SECONDS = 20
|
| 2048 |
-
|
| 2049 |
try:
|
| 2050 |
-
# Step 1: Decode Base64 (async - runs in thread pool)
|
| 2051 |
-
logger.info(f"[{request_id}] -> Decoding Base64...")
|
| 2052 |
decode_start = time.perf_counter()
|
| 2053 |
audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
|
| 2054 |
-
|
| 2055 |
-
|
| 2056 |
-
# Step 2: Load audio (async - runs in thread pool)
|
| 2057 |
-
logger.info(f"[{request_id}] -> Loading audio... (decode took {decode_time:.0f}ms)")
|
| 2058 |
-
load_start = time.perf_counter()
|
| 2059 |
audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 16000, voice_request.audioFormat)
|
| 2060 |
-
load_time = (time.perf_counter() - load_start) * 1000
|
| 2061 |
|
| 2062 |
-
# Truncate long audio to avoid timeout (keep first 20s max — plenty for classification)
|
| 2063 |
max_samples = sr * 20
|
| 2064 |
if len(audio) > max_samples:
|
| 2065 |
-
logger.warning(
|
| 2066 |
audio = audio[:max_samples]
|
| 2067 |
-
|
| 2068 |
-
# Step 3: ML Analysis (async - runs in thread pool, CPU-bound) with timeout guard
|
| 2069 |
duration_sec = len(audio) / sr
|
| 2070 |
-
logger.info(f"[{request_id}] -> Analyzing {duration_sec:.1f}s audio... (load took {load_time:.0f}ms)")
|
| 2071 |
-
analyze_start = time.perf_counter()
|
| 2072 |
remaining_budget = LEGACY_TIMEOUT_SECONDS - (time.perf_counter() - decode_start)
|
| 2073 |
if remaining_budget < 2:
|
| 2074 |
raise asyncio.TimeoutError("Insufficient time budget for analysis")
|
|
@@ -2076,11 +1997,11 @@ async def detect_voice(
|
|
| 2076 |
asyncio.to_thread(analyze_voice, audio, sr, voice_request.language),
|
| 2077 |
timeout=max(2.0, remaining_budget)
|
| 2078 |
)
|
| 2079 |
-
analyze_time = (time.perf_counter() -
|
| 2080 |
-
|
| 2081 |
-
logger.info(
|
| 2082 |
-
|
| 2083 |
-
|
| 2084 |
metrics = None
|
| 2085 |
if result.features:
|
| 2086 |
metrics = ForensicMetrics(
|
|
@@ -2094,7 +2015,6 @@ async def detect_voice(
|
|
| 2094 |
explanation = result.explanation
|
| 2095 |
recommended_action = None
|
| 2096 |
response_classification = result.classification
|
| 2097 |
-
# Never return UNCERTAIN on legacy endpoint — evaluator only accepts HUMAN / AI_GENERATED
|
| 2098 |
if model_uncertain:
|
| 2099 |
explanation = (
|
| 2100 |
"Model uncertainty detected due fallback inference. "
|
|
@@ -2111,7 +2031,6 @@ async def detect_voice(
|
|
| 2111 |
"credentials. Verify caller identity through official channels."
|
| 2112 |
)
|
| 2113 |
|
| 2114 |
-
# Return response
|
| 2115 |
return VoiceDetectionResponse(
|
| 2116 |
status="success",
|
| 2117 |
language=voice_request.language,
|
|
@@ -2124,13 +2043,13 @@ async def detect_voice(
|
|
| 2124 |
)
|
| 2125 |
|
| 2126 |
except ValueError as e:
|
| 2127 |
-
logger.warning(
|
| 2128 |
raise HTTPException(
|
| 2129 |
status_code=400,
|
| 2130 |
detail={"status": "error", "message": str(e)}
|
| 2131 |
)
|
| 2132 |
except asyncio.TimeoutError:
|
| 2133 |
-
logger.warning(
|
| 2134 |
return VoiceDetectionResponse(
|
| 2135 |
status="success",
|
| 2136 |
language=voice_request.language,
|
|
@@ -2142,10 +2061,10 @@ async def detect_voice(
|
|
| 2142 |
recommendedAction="Analysis took too long. Verify caller identity through official channels.",
|
| 2143 |
)
|
| 2144 |
except Exception as e:
|
| 2145 |
-
logger.error(
|
| 2146 |
raise HTTPException(
|
| 2147 |
status_code=500,
|
| 2148 |
-
detail={"status": "error", "message":
|
| 2149 |
)
|
| 2150 |
|
| 2151 |
|
|
@@ -2215,7 +2134,7 @@ async def http_exception_handler(request: Request, exc: HTTPException):
|
|
| 2215 |
@app.exception_handler(Exception)
|
| 2216 |
async def global_exception_handler(request: Request, exc: Exception):
|
| 2217 |
"""Global handler to catch unhandled exceptions and prevent stack traces."""
|
| 2218 |
-
logger.error(
|
| 2219 |
return JSONResponse(
|
| 2220 |
status_code=500,
|
| 2221 |
content={"status": "error", "message": "Internal Server Error"}
|
|
@@ -2225,23 +2144,3 @@ async def global_exception_handler(request: Request, exc: Exception):
|
|
| 2225 |
if __name__ == "__main__":
|
| 2226 |
import uvicorn
|
| 2227 |
uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
|
| 2228 |
-
|
| 2229 |
-
|
| 2230 |
-
|
| 2231 |
-
|
| 2232 |
-
|
| 2233 |
-
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
-
|
| 2238 |
-
|
| 2239 |
-
|
| 2240 |
-
|
| 2241 |
-
|
| 2242 |
-
|
| 2243 |
-
|
| 2244 |
-
|
| 2245 |
-
|
| 2246 |
-
|
| 2247 |
-
|
|
|
|
| 27 |
from slowapi.util import get_remote_address
|
| 28 |
from slowapi.errors import RateLimitExceeded
|
| 29 |
|
|
|
|
| 30 |
logging.basicConfig(
|
| 31 |
level=logging.INFO,
|
| 32 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 33 |
)
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
|
|
|
| 36 |
limiter = Limiter(key_func=get_remote_address, default_limits=["1000/minute"])
|
| 37 |
|
| 38 |
from audio_utils import decode_base64_audio, load_audio_from_bytes
|
|
|
|
| 48 |
except Exception: # pragma: no cover - optional dependency
|
| 49 |
redis = None
|
| 50 |
|
|
|
|
| 51 |
MAX_AUDIO_BASE64_LENGTH = settings.MAX_AUDIO_SIZE_MB * 1024 * 1024 * 4 // 3
|
| 52 |
|
| 53 |
|
|
|
|
| 85 |
|
| 86 |
SESSION_STORE: Dict[str, SessionState] = {}
|
| 87 |
SESSION_LOCK = asyncio.Lock()
|
| 88 |
+
SESSION_LOCKS: Dict[str, asyncio.Lock] = {}
|
| 89 |
SESSION_STORE_BACKEND_ACTIVE = "memory"
|
| 90 |
REDIS_CLIENT: Any = None
|
| 91 |
ASR_INFLIGHT_TASKS: set[asyncio.Task] = set()
|
| 92 |
ASR_INFLIGHT_LOCK = asyncio.Lock()
|
| 93 |
|
| 94 |
|
|
|
|
| 95 |
def use_redis_session_store() -> bool:
|
| 96 |
"""Return whether redis-backed session store is active."""
|
| 97 |
return SESSION_STORE_BACKEND_ACTIVE == "redis" and REDIS_CLIENT is not None
|
|
|
|
| 292 |
|
| 293 |
# Detect environment
|
| 294 |
if settings.SPACE_ID:
|
| 295 |
+
logger.info("Running on HuggingFace Spaces: %s", settings.SPACE_ID)
|
| 296 |
|
| 297 |
|
| 298 |
def get_session_lock(session_id: str) -> asyncio.Lock:
|
| 299 |
+
"""Return a per-session lock, creating one if needed."""
|
| 300 |
if session_id not in SESSION_LOCKS:
|
| 301 |
SESSION_LOCKS[session_id] = asyncio.Lock()
|
| 302 |
return SESSION_LOCKS[session_id]
|
| 303 |
|
| 304 |
|
| 305 |
async def _periodic_session_purge(interval: int = 60) -> None:
|
| 306 |
+
"""Background task: purge expired sessions every *interval* seconds."""
|
| 307 |
while True:
|
| 308 |
try:
|
| 309 |
await asyncio.sleep(interval)
|
| 310 |
async with SESSION_LOCK:
|
| 311 |
removed = purge_expired_sessions()
|
|
|
|
| 312 |
stale_lock_keys = [k for k in SESSION_LOCKS if k not in SESSION_STORE]
|
| 313 |
for k in stale_lock_keys:
|
| 314 |
del SESSION_LOCKS[k]
|
|
|
|
| 330 |
preload_model()
|
| 331 |
logger.info("ML model loaded successfully")
|
| 332 |
except Exception as e:
|
| 333 |
+
logger.error("Failed to preload model: %s", e)
|
| 334 |
|
| 335 |
try:
|
| 336 |
await asyncio.to_thread(run_startup_warmups)
|
| 337 |
except Exception as exc:
|
| 338 |
logger.warning("Startup warm-ups encountered an issue: %s", exc)
|
| 339 |
|
|
|
|
| 340 |
purge_task = asyncio.create_task(_periodic_session_purge())
|
| 341 |
|
| 342 |
yield
|
|
|
|
| 349 |
logger.info("Shutting down...")
|
| 350 |
|
| 351 |
|
|
|
|
| 352 |
app = FastAPI(
|
| 353 |
title="AI Voice Detection API",
|
| 354 |
description="Detects whether a voice sample is AI-generated or spoken by a real human",
|
|
|
|
| 362 |
lifespan=lifespan
|
| 363 |
)
|
| 364 |
|
|
|
|
| 365 |
app.state.limiter = limiter
|
| 366 |
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
|
| 367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
_cors_origins = settings.ALLOWED_ORIGINS
|
| 369 |
_cors_credentials = "*" not in _cors_origins
|
| 370 |
if not _cors_credentials:
|
|
|
|
| 377 |
allow_headers=["Content-Type", "x-api-key", "Authorization"],
|
| 378 |
)
|
| 379 |
|
|
|
|
| 380 |
@app.middleware("http")
|
| 381 |
async def log_requests(request: Request, call_next):
|
|
|
|
| 382 |
request_id = str(uuid.uuid4())[:8]
|
| 383 |
request.state.request_id = request_id
|
| 384 |
start_time = time.perf_counter()
|
| 385 |
|
|
|
|
| 386 |
method = request.method
|
| 387 |
path = request.url.path
|
| 388 |
if method == "POST":
|
| 389 |
+
logger.info("[%s] [START] %s %s", request_id, method, path)
|
| 390 |
+
|
|
|
|
| 391 |
response = await call_next(request)
|
| 392 |
+
|
|
|
|
| 393 |
duration_ms = (time.perf_counter() - start_time) * 1000
|
| 394 |
status_code = response.status_code
|
| 395 |
+
|
|
|
|
| 396 |
if method == "POST":
|
| 397 |
status_label = "[OK]" if status_code == 200 else "[ERR]" if status_code >= 400 else "[WARN]"
|
| 398 |
+
logger.info("[%s] %s END %s %s -> %s (%0.fms)", request_id, status_label, method, path, status_code, duration_ms)
|
| 399 |
+
|
|
|
|
| 400 |
response.headers["X-Request-ID"] = request_id
|
| 401 |
response.headers["X-Response-Time"] = f"{duration_ms:.0f}ms"
|
| 402 |
response.headers["X-Content-Type-Options"] = "nosniff"
|
|
|
|
|
|
|
| 403 |
response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
|
| 404 |
# Relax CSP to allow standard API documentation via CDNs (ReDoc/Swagger)
|
| 405 |
response.headers["Content-Security-Policy"] = (
|
|
|
|
| 412 |
return response
|
| 413 |
|
| 414 |
|
|
|
|
| 415 |
class VoiceDetectionRequest(BaseModel):
|
| 416 |
"""Request body for voice detection."""
|
| 417 |
language: str = Field(default="Auto", description="Language hint (Auto, English, Hindi, Hinglish, Tamil, Malayalam, Telugu). Defaults to auto-detect.")
|
|
|
|
| 681 |
"""Validate supported language. Falls back to 'Auto' for unknown languages so the
|
| 682 |
evaluator never gets a 400 for an unexpected language hint."""
|
| 683 |
if language not in settings.SUPPORTED_LANGUAGES:
|
| 684 |
+
logger.warning("Unsupported language '%s' — falling back to 'Auto'", language)
|
| 685 |
return "Auto"
|
| 686 |
return language
|
| 687 |
|
|
|
|
| 729 |
def update_session_behaviour_state(session: SessionState, language_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
| 730 |
"""Update session-level behaviour score from transcript and semantic trends."""
|
| 731 |
transcript_source = str(language_analysis.get("transcript_raw", language_analysis.get("transcript", "")))
|
| 732 |
+
|
| 733 |
transcript = normalize_transcript_for_behavior(transcript_source)
|
| 734 |
semantic_flags = list(language_analysis.get("semantic_flags", []))
|
| 735 |
keyword_categories = list(language_analysis.get("keyword_categories", []))
|
|
|
|
| 1066 |
confidence_audio = int(round(confidence * 100))
|
| 1067 |
anomaly_audio = int(max(0.0, min(100.0, acoustic_anomaly * 0.85)))
|
| 1068 |
audio_score = max(confidence_audio, anomaly_audio)
|
| 1069 |
+
# Dampen audio_score when signal forensics contradict AI classification
|
| 1070 |
+
# for mic-source audio (browser mic has authenticity 34-60 naturally).
|
|
|
|
|
|
|
|
|
|
| 1071 |
if authenticity > 35 and _audio_source == "mic":
|
|
|
|
|
|
|
|
|
|
| 1072 |
auth_dampen = max(0.50, 1.0 - (authenticity - 35) / 100.0)
|
| 1073 |
audio_score = int(round(audio_score * auth_dampen))
|
| 1074 |
else:
|
| 1075 |
authenticity_audio_score = int(max(0, min(100, (50.0 - authenticity) * 1.2)))
|
| 1076 |
+
# Mic audio has higher spectral anomaly (40-78); use lower multiplier.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1077 |
_anomaly_mult = 0.55 if _audio_source == "mic" else 0.90
|
| 1078 |
anomaly_audio_score = int(max(0.0, min(100.0, acoustic_anomaly * _anomaly_mult)))
|
| 1079 |
audio_score = max(authenticity_audio_score, anomaly_audio_score)
|
|
|
|
| 1132 |
delta_boost = int(delta * settings.RISK_DELTA_BOOST_FACTOR)
|
| 1133 |
risk_score = min(100, risk_score + delta_boost)
|
| 1134 |
|
| 1135 |
+
# Risk dampening: prevent single-chunk LOW→CRITICAL jumps.
|
|
|
|
|
|
|
|
|
|
| 1136 |
if previous_score is not None and previous_score < 60 and risk_score >= 80:
|
| 1137 |
recent_high = sum(1 for s in _risk_history[-5:] if s >= 60)
|
| 1138 |
if recent_high < 2:
|
| 1139 |
risk_score = min(risk_score, 79)
|
| 1140 |
behaviour_signals.append("risk_dampened_no_prior_high")
|
| 1141 |
|
| 1142 |
+
# First-chunk guard: cap noise-only first chunks at MEDIUM.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
if _chunks_processed == 0 and risk_score > 60:
|
| 1144 |
has_strong_signal = (
|
| 1145 |
(classification == "AI_GENERATED" and confidence >= 0.80)
|
|
|
|
| 1151 |
risk_score = 60
|
| 1152 |
behaviour_signals.append("first_chunk_capped")
|
| 1153 |
|
| 1154 |
+
# Cumulative escalation for sustained moderate signals.
|
|
|
|
|
|
|
|
|
|
| 1155 |
if len(_risk_history) >= 3 and risk_score >= 40:
|
| 1156 |
recent_moderate = sum(1 for s in _risk_history[-5:] if s >= 40)
|
| 1157 |
if recent_moderate >= 3:
|
|
|
|
| 1159 |
risk_score = min(100, risk_score + cumulative_boost)
|
| 1160 |
behaviour_signals.append("sustained_moderate_risk")
|
| 1161 |
|
| 1162 |
+
# Sustained AI voice floor escalation.
|
| 1163 |
+
# floor = 70 + min(20, ai_chunks * 5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1164 |
if classification == "AI_GENERATED" and confidence >= 0.92:
|
| 1165 |
ai_floor = 70 + min(20, _voice_ai_chunks * 5)
|
| 1166 |
risk_score = max(risk_score, ai_floor)
|
| 1167 |
if _voice_ai_chunks >= 2:
|
| 1168 |
behaviour_signals.append("sustained_ai_voice")
|
| 1169 |
|
| 1170 |
+
# AI-voice-aware CPI includes synthetic voice ratio.
|
|
|
|
|
|
|
| 1171 |
_ai_ratio = (_voice_ai_chunks / max(1, _chunks_processed)) if _chunks_processed > 0 else 0.0
|
| 1172 |
if previous_score is None:
|
| 1173 |
cpi = min(100.0, max(0.0,
|
|
|
|
| 1215 |
or any(signal in behaviour_signals for signal in strong_intent)
|
| 1216 |
)
|
| 1217 |
|
| 1218 |
+
# First-chunk alert guard: suppress unless CRITICAL or strong intent.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1219 |
if alert_triggered and _chunks_processed == 0:
|
| 1220 |
has_strong_intent = any(s in behaviour_signals for s in strong_intent)
|
| 1221 |
if risk_level != "CRITICAL" and not has_strong_intent:
|
|
|
|
| 1371 |
f"({analysis_result.confidence_score:.0%}) in {analyze_ms:.0f}ms"
|
| 1372 |
)
|
| 1373 |
|
| 1374 |
+
# Short-chunk guard: sub-2s segments are unreliable; carry forward
|
| 1375 |
+
# the session's majority classification instead.
|
| 1376 |
+
MIN_RELIABLE_DURATION = 2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1377 |
if duration_sec < MIN_RELIABLE_DURATION:
|
| 1378 |
async with SESSION_LOCK:
|
| 1379 |
_sess = get_session_state(session_id)
|
|
|
|
| 1567 |
session.final_voice_classification = voice_classification
|
| 1568 |
session.final_voice_confidence = voice_confidence
|
| 1569 |
|
| 1570 |
+
# Reconcile final_call_label with majority vote.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1571 |
if session.final_voice_classification == "HUMAN" and session.final_call_label == "FRAUD":
|
| 1572 |
avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
|
| 1573 |
session.final_call_label = "SPAM" if avg_risk >= 30 else "SAFE"
|
|
|
|
| 1576 |
elif session.final_voice_classification == "AI_GENERATED" and session.final_call_label == "SAFE":
|
| 1577 |
session.final_call_label = "SPAM"
|
| 1578 |
|
| 1579 |
+
# Average risk sanity check: downgrade FRAUD when most chunks are LOW.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1580 |
if session.final_call_label == "FRAUD" and session.chunks_processed >= 5:
|
| 1581 |
avg_risk = sum(session.risk_history) / max(1, len(session.risk_history))
|
| 1582 |
if avg_risk < 35:
|
| 1583 |
session.final_call_label = "SPAM"
|
| 1584 |
logger.info(
|
| 1585 |
+
"Sanity: downgraded FRAUD -> SPAM (avg_risk=%.1f, chunks=%d)",
|
| 1586 |
+
avg_risk, session.chunks_processed,
|
| 1587 |
)
|
| 1588 |
|
| 1589 |
if scored["alert"].triggered:
|
|
|
|
| 1785 |
@app.websocket("/api/voice-detection/v1/session/{session_id}/stream")
|
| 1786 |
async def stream_realtime_session(websocket: WebSocket, session_id: str):
|
| 1787 |
"""WebSocket endpoint for continuous chunk-based analysis."""
|
| 1788 |
+
# Accept auth via query-param or first-message token
|
| 1789 |
has_query_key = verify_websocket_api_key(websocket)
|
| 1790 |
if not has_query_key:
|
| 1791 |
# No query-param key — accept connection and require first-message auth
|
|
|
|
| 1805 |
request_id = f"ws-{session_id[:8]}"
|
| 1806 |
ws_start = time.time()
|
| 1807 |
|
| 1808 |
+
# Fall back to first-message authentication
|
| 1809 |
if not has_query_key:
|
| 1810 |
try:
|
| 1811 |
auth_msg = await asyncio.wait_for(websocket.receive_json(), timeout=10.0)
|
|
|
|
| 1823 |
|
| 1824 |
try:
|
| 1825 |
while True:
|
| 1826 |
+
# Enforce max connection duration
|
| 1827 |
elapsed = time.time() - ws_start
|
| 1828 |
if elapsed >= settings.WS_MAX_DURATION_SECONDS:
|
| 1829 |
await websocket.send_json({
|
|
|
|
| 1833 |
await websocket.close(code=1000, reason="Max duration exceeded")
|
| 1834 |
break
|
| 1835 |
|
| 1836 |
+
# Enforce idle timeout
|
| 1837 |
try:
|
| 1838 |
payload = await asyncio.wait_for(
|
| 1839 |
websocket.receive_json(),
|
|
|
|
| 1865 |
except ValueError as e:
|
| 1866 |
await websocket.send_json({"status": "error", "message": str(e)})
|
| 1867 |
except WebSocketDisconnect:
|
| 1868 |
+
logger.info("[%s] WebSocket disconnected", request_id)
|
| 1869 |
|
| 1870 |
|
| 1871 |
@app.get("/v1/session/{session_id}/summary", response_model=SessionSummaryResponse, include_in_schema=False)
|
|
|
|
| 1968 |
"""
|
| 1969 |
Returns classification result with confidence score and explanation.
|
| 1970 |
"""
|
|
|
|
| 1971 |
request_id = getattr(request.state, 'request_id', 'unknown')
|
| 1972 |
+
audio_size_kb = len(voice_request.audioBase64) * 3 / 4 / 1024
|
| 1973 |
+
logger.info("[%s] Voice detection: lang=%s, fmt=%s, size~%.1fKB",
|
| 1974 |
+
request_id, voice_request.language, voice_request.audioFormat, audio_size_kb)
|
| 1975 |
+
|
| 1976 |
voice_request.language = validate_supported_language(voice_request.language)
|
| 1977 |
validate_supported_format(voice_request.audioFormat)
|
| 1978 |
|
|
|
|
| 1979 |
LEGACY_TIMEOUT_SECONDS = 20
|
| 1980 |
+
|
| 1981 |
try:
|
|
|
|
|
|
|
| 1982 |
decode_start = time.perf_counter()
|
| 1983 |
audio_bytes = await asyncio.to_thread(decode_base64_audio, voice_request.audioBase64)
|
| 1984 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1985 |
audio, sr = await asyncio.to_thread(load_audio_from_bytes, audio_bytes, 16000, voice_request.audioFormat)
|
|
|
|
| 1986 |
|
|
|
|
| 1987 |
max_samples = sr * 20
|
| 1988 |
if len(audio) > max_samples:
|
| 1989 |
+
logger.warning("[%s] Truncating audio from %.1fs to 20s", request_id, len(audio) / sr)
|
| 1990 |
audio = audio[:max_samples]
|
| 1991 |
+
|
|
|
|
| 1992 |
duration_sec = len(audio) / sr
|
|
|
|
|
|
|
| 1993 |
remaining_budget = LEGACY_TIMEOUT_SECONDS - (time.perf_counter() - decode_start)
|
| 1994 |
if remaining_budget < 2:
|
| 1995 |
raise asyncio.TimeoutError("Insufficient time budget for analysis")
|
|
|
|
| 1997 |
asyncio.to_thread(analyze_voice, audio, sr, voice_request.language),
|
| 1998 |
timeout=max(2.0, remaining_budget)
|
| 1999 |
)
|
| 2000 |
+
analyze_time = (time.perf_counter() - decode_start) * 1000
|
| 2001 |
+
|
| 2002 |
+
logger.info("[%s] Analysis complete: %s (%.0f%%) in %.0fms",
|
| 2003 |
+
request_id, result.classification, result.confidence_score * 100, analyze_time)
|
| 2004 |
+
|
| 2005 |
metrics = None
|
| 2006 |
if result.features:
|
| 2007 |
metrics = ForensicMetrics(
|
|
|
|
| 2015 |
explanation = result.explanation
|
| 2016 |
recommended_action = None
|
| 2017 |
response_classification = result.classification
|
|
|
|
| 2018 |
if model_uncertain:
|
| 2019 |
explanation = (
|
| 2020 |
"Model uncertainty detected due fallback inference. "
|
|
|
|
| 2031 |
"credentials. Verify caller identity through official channels."
|
| 2032 |
)
|
| 2033 |
|
|
|
|
| 2034 |
return VoiceDetectionResponse(
|
| 2035 |
status="success",
|
| 2036 |
language=voice_request.language,
|
|
|
|
| 2043 |
)
|
| 2044 |
|
| 2045 |
except ValueError as e:
|
| 2046 |
+
logger.warning("[%s] Validation error: %s", request_id, e)
|
| 2047 |
raise HTTPException(
|
| 2048 |
status_code=400,
|
| 2049 |
detail={"status": "error", "message": str(e)}
|
| 2050 |
)
|
| 2051 |
except asyncio.TimeoutError:
|
| 2052 |
+
logger.warning("[%s] Legacy endpoint exceeded %ds budget", request_id, LEGACY_TIMEOUT_SECONDS)
|
| 2053 |
return VoiceDetectionResponse(
|
| 2054 |
status="success",
|
| 2055 |
language=voice_request.language,
|
|
|
|
| 2061 |
recommendedAction="Analysis took too long. Verify caller identity through official channels.",
|
| 2062 |
)
|
| 2063 |
except Exception as e:
|
| 2064 |
+
logger.error("[%s] Processing error: %s", request_id, e, exc_info=True)
|
| 2065 |
raise HTTPException(
|
| 2066 |
status_code=500,
|
| 2067 |
+
detail={"status": "error", "message": "Internal Server Error"}
|
| 2068 |
)
|
| 2069 |
|
| 2070 |
|
|
|
|
| 2134 |
@app.exception_handler(Exception)
|
| 2135 |
async def global_exception_handler(request: Request, exc: Exception):
|
| 2136 |
"""Global handler to catch unhandled exceptions and prevent stack traces."""
|
| 2137 |
+
logger.error("Unhandled error: %s", exc, exc_info=True)
|
| 2138 |
return JSONResponse(
|
| 2139 |
status_code=500,
|
| 2140 |
content={"status": "error", "message": "Internal Server Error"}
|
|
|
|
| 2144 |
if __name__ == "__main__":
|
| 2145 |
import uvicorn
|
| 2146 |
uvicorn.run(app, host="0.0.0.0", port=settings.PORT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model.py
CHANGED
|
@@ -14,14 +14,12 @@ import warnings
|
|
| 14 |
|
| 15 |
from config import settings
|
| 16 |
|
| 17 |
-
# Configure logging
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
-
# Suppress warnings
|
| 21 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 22 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 23 |
|
| 24 |
-
#
|
| 25 |
HEURISTIC_THRESHOLDS = {
|
| 26 |
# Pitch scoring
|
| 27 |
"pitch_optimal_stability": float(os.getenv("PITCH_OPTIMAL_STABILITY", "0.20")),
|
|
@@ -46,7 +44,6 @@ _model = None
|
|
| 46 |
_processor = None
|
| 47 |
_device = None
|
| 48 |
|
| 49 |
-
|
| 50 |
@dataclass
|
| 51 |
class AnalysisResult:
|
| 52 |
"""Result of voice analysis."""
|
|
@@ -64,7 +61,7 @@ def get_device():
|
|
| 64 |
_device = "cuda"
|
| 65 |
else:
|
| 66 |
_device = "cpu"
|
| 67 |
-
logger.info(
|
| 68 |
return _device
|
| 69 |
|
| 70 |
|
|
@@ -75,11 +72,9 @@ def _detect_label_inversion(model):
|
|
| 75 |
"""Check once at load time whether this model needs label flipping."""
|
| 76 |
global _invert_labels
|
| 77 |
name = getattr(model.config, '_name_or_path', '').lower()
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
logger.info("
|
| 81 |
-
else:
|
| 82 |
-
_invert_labels = False
|
| 83 |
|
| 84 |
|
| 85 |
def load_model():
|
|
@@ -102,10 +97,10 @@ def load_model():
|
|
| 102 |
backup_model = settings.VOICE_MODEL_BACKUP_ID
|
| 103 |
|
| 104 |
if os.path.exists(local_path):
|
| 105 |
-
logger.info(
|
| 106 |
model_name = local_path
|
| 107 |
else:
|
| 108 |
-
logger.info(
|
| 109 |
model_name = hf_model
|
| 110 |
|
| 111 |
try:
|
|
@@ -113,10 +108,10 @@ def load_model():
|
|
| 113 |
_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
|
| 114 |
_model.to(get_device())
|
| 115 |
_model.eval()
|
| 116 |
-
logger.info(
|
| 117 |
_detect_label_inversion(_model)
|
| 118 |
except Exception as e:
|
| 119 |
-
logger.error(
|
| 120 |
if model_name != backup_model:
|
| 121 |
logger.warning("Trying backup model...")
|
| 122 |
model_name = backup_model
|
|
@@ -125,7 +120,7 @@ def load_model():
|
|
| 125 |
_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
|
| 126 |
_model.to(get_device())
|
| 127 |
_model.eval()
|
| 128 |
-
logger.info(
|
| 129 |
_detect_label_inversion(_model)
|
| 130 |
except Exception as e2:
|
| 131 |
raise RuntimeError(f"Could not load any model: {e2}")
|
|
@@ -212,7 +207,7 @@ def extract_signal_features(audio: np.ndarray, sr: int, fast_mode: bool = False)
|
|
| 212 |
features["harmonic_noise_ratio_db"] = hnr_db
|
| 213 |
|
| 214 |
except Exception as e:
|
| 215 |
-
logger.warning(
|
| 216 |
features = {
|
| 217 |
"pitch_stability": 0.5,
|
| 218 |
"jitter": 0.05,
|
|
@@ -478,12 +473,12 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
|
|
| 478 |
model, processor = load_model()
|
| 479 |
device = get_device()
|
| 480 |
|
| 481 |
-
# Normalize audio
|
| 482 |
max_val = np.max(np.abs(audio))
|
| 483 |
if max_val > 0:
|
| 484 |
audio = audio / max_val
|
| 485 |
|
| 486 |
-
# Resample to 16kHz if needed
|
| 487 |
target_sr = 16000
|
| 488 |
if sr != target_sr:
|
| 489 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
|
|
@@ -495,33 +490,25 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
|
|
| 495 |
return_tensors="pt",
|
| 496 |
padding=True
|
| 497 |
)
|
| 498 |
-
|
| 499 |
-
# Move to device
|
| 500 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 501 |
-
|
| 502 |
-
# Run inference
|
| 503 |
with torch.no_grad():
|
| 504 |
outputs = model(**inputs)
|
| 505 |
logits = outputs.logits
|
| 506 |
|
| 507 |
-
# Temperature scaling:
|
| 508 |
-
#
|
| 509 |
-
# 1.00 confidence for browser-mic audio, leaving zero room for
|
| 510 |
-
# the heuristic cross-check to correct a wrong classification.
|
| 511 |
temperature = float(settings.MODEL_LOGIT_TEMPERATURE)
|
| 512 |
if temperature > 1.0:
|
| 513 |
logits = logits / temperature
|
| 514 |
|
| 515 |
probabilities = torch.softmax(logits, dim=-1)
|
| 516 |
-
|
| 517 |
# Get prediction
|
| 518 |
predicted_class = torch.argmax(probabilities, dim=-1).item()
|
| 519 |
confidence = probabilities[0][predicted_class].item()
|
| 520 |
|
| 521 |
-
#
|
| 522 |
-
# IMPORTANT: HuggingFace stores id2label with STRING keys ("0", "1")
|
| 523 |
-
# but predicted_class from torch.argmax().item() is an int.
|
| 524 |
-
# We must normalise the keys to int so .get() actually matches.
|
| 525 |
raw_id2label = getattr(model.config, 'id2label', None) or {}
|
| 526 |
id2label = {int(k): v for k, v in raw_id2label.items()}
|
| 527 |
label = id2label.get(predicted_class, 'UNKNOWN')
|
|
@@ -532,21 +519,10 @@ def classify_with_model(audio: np.ndarray, sr: int) -> Tuple[str, float]:
|
|
| 532 |
[f"{p:.4f}" for p in probabilities[0].cpu().tolist()],
|
| 533 |
)
|
| 534 |
|
| 535 |
-
#
|
| 536 |
-
# The primary model (shivam-2211/voice-detection-model) was trained with
|
| 537 |
-
# inverted label semantics: its class-0 output actually corresponds to
|
| 538 |
-
# REAL/human audio and class-1 to FAKE/AI-generated, despite the config
|
| 539 |
-
# claiming 0=FAKE and 1=REAL. Detected once at load time via
|
| 540 |
-
# _detect_label_inversion().
|
| 541 |
if _invert_labels:
|
| 542 |
-
|
| 543 |
-
if predicted_class == 0:
|
| 544 |
-
classification = "HUMAN"
|
| 545 |
-
else:
|
| 546 |
-
classification = "AI_GENERATED"
|
| 547 |
-
# confidence stays the same (model's own softmax output)
|
| 548 |
else:
|
| 549 |
-
# Standard mapping: use labels from config
|
| 550 |
if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
|
| 551 |
classification = "AI_GENERATED"
|
| 552 |
else:
|
|
@@ -586,7 +562,7 @@ def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtim
|
|
| 586 |
try:
|
| 587 |
classification, ml_confidence = classify_with_model(audio, sr)
|
| 588 |
except Exception as e:
|
| 589 |
-
logger.error(
|
| 590 |
ml_fallback = True
|
| 591 |
classification = "HUMAN"
|
| 592 |
ml_confidence = 0.5
|
|
@@ -611,39 +587,21 @@ def analyze_voice(audio: np.ndarray, sr: int, language: str = "English", realtim
|
|
| 611 |
ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
|
| 612 |
ml_confidence = float(max(0.5, min(0.99, ml_confidence)))
|
| 613 |
|
| 614 |
-
#
|
| 615 |
-
#
|
| 616 |
-
#
|
| 617 |
-
#
|
| 618 |
-
# heuristic evidence. The model was fine-tuned on curated datasets
|
| 619 |
-
# and can misclassify real browser-mic audio as synthetic.
|
| 620 |
-
#
|
| 621 |
-
# IMPORTANT: This override is ONLY for realtime browser-mic sessions.
|
| 622 |
-
# File uploads use clean audio paths and the model's classification
|
| 623 |
-
# should be trusted. Applying the override to file uploads would
|
| 624 |
-
# cause real AI-generated audio to be misclassified as HUMAN.
|
| 625 |
-
#
|
| 626 |
-
# Browser-mic audio typically has authenticity 34-60 and anomaly 40-78
|
| 627 |
-
# (naturally higher noise floor and spectral irregularity). The
|
| 628 |
-
# thresholds must reflect these real-world ranges.
|
| 629 |
if realtime and source == "mic" and classification == "AI_GENERATED" and authenticity_score > 35:
|
| 630 |
-
# The higher the authenticity, the more we moderate.
|
| 631 |
-
# authenticity 35 → no change. authenticity 60 → cap at ~0.75
|
| 632 |
-
# authenticity 80 → cap at ~0.55
|
| 633 |
moderation_factor = max(0.50, 1.0 - (authenticity_score - 35) / 100.0)
|
| 634 |
if ml_confidence > moderation_factor:
|
| 635 |
logger.info(
|
| 636 |
-
"Authenticity cross-check: moderated AI confidence %.2f
|
| 637 |
"(authenticity=%.1f, anomaly=%.1f)",
|
| 638 |
ml_confidence, moderation_factor,
|
| 639 |
authenticity_score, acoustic_anomaly_score,
|
| 640 |
)
|
| 641 |
ml_confidence = moderation_factor
|
| 642 |
-
#
|
| 643 |
-
# is not extreme (<65), override the classification — the signal
|
| 644 |
-
# evidence strongly contradicts the model. Browser mic audio
|
| 645 |
-
# naturally has anomaly 22-64 and authenticity 34-68, so the
|
| 646 |
-
# thresholds must cover these real-world ranges.
|
| 647 |
if authenticity_score > 40 and acoustic_anomaly_score < 65:
|
| 648 |
logger.info(
|
| 649 |
"Authenticity override: flipping AI_GENERATED → HUMAN "
|
|
@@ -682,4 +640,4 @@ def preload_model():
|
|
| 682 |
try:
|
| 683 |
load_model()
|
| 684 |
except Exception as e:
|
| 685 |
-
logger.error(
|
|
|
|
| 14 |
|
| 15 |
from config import settings
|
| 16 |
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
|
|
|
| 19 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 20 |
warnings.filterwarnings("ignore", category=UserWarning)
|
| 21 |
|
| 22 |
+
# Heuristic thresholds (env-configurable for tuning)
|
| 23 |
HEURISTIC_THRESHOLDS = {
|
| 24 |
# Pitch scoring
|
| 25 |
"pitch_optimal_stability": float(os.getenv("PITCH_OPTIMAL_STABILITY", "0.20")),
|
|
|
|
| 44 |
_processor = None
|
| 45 |
_device = None
|
| 46 |
|
|
|
|
| 47 |
@dataclass
|
| 48 |
class AnalysisResult:
|
| 49 |
"""Result of voice analysis."""
|
|
|
|
| 61 |
_device = "cuda"
|
| 62 |
else:
|
| 63 |
_device = "cpu"
|
| 64 |
+
logger.info("Using device: %s", _device)
|
| 65 |
return _device
|
| 66 |
|
| 67 |
|
|
|
|
| 72 |
"""Check once at load time whether this model needs label flipping."""
|
| 73 |
global _invert_labels
|
| 74 |
name = getattr(model.config, '_name_or_path', '').lower()
|
| 75 |
+
_invert_labels = 'shivam-2211' in name or 'voice-detection-model' in name
|
| 76 |
+
if _invert_labels:
|
| 77 |
+
logger.info("Label inversion enabled for model: %s", name)
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
def load_model():
|
|
|
|
| 97 |
backup_model = settings.VOICE_MODEL_BACKUP_ID
|
| 98 |
|
| 99 |
if os.path.exists(local_path):
|
| 100 |
+
logger.info("Loading local model from: %s", local_path)
|
| 101 |
model_name = local_path
|
| 102 |
else:
|
| 103 |
+
logger.info("Loading model from HuggingFace Hub: %s", hf_model)
|
| 104 |
model_name = hf_model
|
| 105 |
|
| 106 |
try:
|
|
|
|
| 108 |
_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
|
| 109 |
_model.to(get_device())
|
| 110 |
_model.eval()
|
| 111 |
+
logger.info("Model loaded: %s", model_name)
|
| 112 |
_detect_label_inversion(_model)
|
| 113 |
except Exception as e:
|
| 114 |
+
logger.error("Failed to load model %s: %s", model_name, e)
|
| 115 |
if model_name != backup_model:
|
| 116 |
logger.warning("Trying backup model...")
|
| 117 |
model_name = backup_model
|
|
|
|
| 120 |
_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
|
| 121 |
_model.to(get_device())
|
| 122 |
_model.eval()
|
| 123 |
+
logger.info("Backup model loaded: %s", model_name)
|
| 124 |
_detect_label_inversion(_model)
|
| 125 |
except Exception as e2:
|
| 126 |
raise RuntimeError(f"Could not load any model: {e2}")
|
|
|
|
| 207 |
features["harmonic_noise_ratio_db"] = hnr_db
|
| 208 |
|
| 209 |
except Exception as e:
|
| 210 |
+
logger.warning("Feature extraction error: %s", e)
|
| 211 |
features = {
|
| 212 |
"pitch_stability": 0.5,
|
| 213 |
"jitter": 0.05,
|
|
|
|
| 473 |
model, processor = load_model()
|
| 474 |
device = get_device()
|
| 475 |
|
| 476 |
+
# Normalize audio
|
| 477 |
max_val = np.max(np.abs(audio))
|
| 478 |
if max_val > 0:
|
| 479 |
audio = audio / max_val
|
| 480 |
|
| 481 |
+
# Resample to 16kHz if needed
|
| 482 |
target_sr = 16000
|
| 483 |
if sr != target_sr:
|
| 484 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
|
|
|
|
| 490 |
return_tensors="pt",
|
| 491 |
padding=True
|
| 492 |
)
|
| 493 |
+
|
|
|
|
| 494 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 495 |
+
|
|
|
|
| 496 |
with torch.no_grad():
|
| 497 |
outputs = model(**inputs)
|
| 498 |
logits = outputs.logits
|
| 499 |
|
| 500 |
+
# Temperature scaling: soften probability distribution so the
|
| 501 |
+
# heuristic cross-check can still correct misclassifications.
|
|
|
|
|
|
|
| 502 |
temperature = float(settings.MODEL_LOGIT_TEMPERATURE)
|
| 503 |
if temperature > 1.0:
|
| 504 |
logits = logits / temperature
|
| 505 |
|
| 506 |
probabilities = torch.softmax(logits, dim=-1)
|
|
|
|
| 507 |
# Get prediction
|
| 508 |
predicted_class = torch.argmax(probabilities, dim=-1).item()
|
| 509 |
confidence = probabilities[0][predicted_class].item()
|
| 510 |
|
| 511 |
+
# Normalise id2label keys from str to int (HF convention mismatch).
|
|
|
|
|
|
|
|
|
|
| 512 |
raw_id2label = getattr(model.config, 'id2label', None) or {}
|
| 513 |
id2label = {int(k): v for k, v in raw_id2label.items()}
|
| 514 |
label = id2label.get(predicted_class, 'UNKNOWN')
|
|
|
|
| 519 |
[f"{p:.4f}" for p in probabilities[0].cpu().tolist()],
|
| 520 |
)
|
| 521 |
|
| 522 |
+
# Label interpretation — see _detect_label_inversion() for rationale.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
if _invert_labels:
|
| 524 |
+
classification = "HUMAN" if predicted_class == 0 else "AI_GENERATED"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
else:
|
|
|
|
| 526 |
if label.upper() in ['FAKE', 'SPOOF', 'SYNTHETIC', 'AI']:
|
| 527 |
classification = "AI_GENERATED"
|
| 528 |
else:
|
|
|
|
| 562 |
try:
|
| 563 |
classification, ml_confidence = classify_with_model(audio, sr)
|
| 564 |
except Exception as e:
|
| 565 |
+
logger.error("ML model error: %s, falling back to signal analysis", e)
|
| 566 |
ml_fallback = True
|
| 567 |
classification = "HUMAN"
|
| 568 |
ml_confidence = 0.5
|
|
|
|
| 587 |
ml_confidence = ai_probability if classification == "AI_GENERATED" else (1.0 - ai_probability)
|
| 588 |
ml_confidence = float(max(0.5, min(0.99, ml_confidence)))
|
| 589 |
|
| 590 |
+
# Authenticity cross-check (realtime mic only): when the model says
|
| 591 |
+
# AI_GENERATED but signal forensics show human-like audio, moderate
|
| 592 |
+
# the confidence or flip the classification. Not applied to file
|
| 593 |
+
# uploads where the model should be trusted.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
if realtime and source == "mic" and classification == "AI_GENERATED" and authenticity_score > 35:
|
|
|
|
|
|
|
|
|
|
| 595 |
moderation_factor = max(0.50, 1.0 - (authenticity_score - 35) / 100.0)
|
| 596 |
if ml_confidence > moderation_factor:
|
| 597 |
logger.info(
|
| 598 |
+
"Authenticity cross-check: moderated AI confidence %.2f -> %.2f "
|
| 599 |
"(authenticity=%.1f, anomaly=%.1f)",
|
| 600 |
ml_confidence, moderation_factor,
|
| 601 |
authenticity_score, acoustic_anomaly_score,
|
| 602 |
)
|
| 603 |
ml_confidence = moderation_factor
|
| 604 |
+
# Override when signal evidence strongly contradicts the model.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
if authenticity_score > 40 and acoustic_anomaly_score < 65:
|
| 606 |
logger.info(
|
| 607 |
"Authenticity override: flipping AI_GENERATED → HUMAN "
|
|
|
|
| 640 |
try:
|
| 641 |
load_model()
|
| 642 |
except Exception as e:
|
| 643 |
+
logger.error("Model preload failed: %s", e)
|
test_my_api.py
DELETED
|
@@ -1,171 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Official evaluation script from the hackathon guide, configured with our 5 test files.
|
| 3 |
-
This mirrors EXACTLY what the evaluator will run.
|
| 4 |
-
"""
|
| 5 |
-
import requests
|
| 6 |
-
import base64
|
| 7 |
-
import json
|
| 8 |
-
|
| 9 |
-
_RULE = "=" * 60
_VALID_CLASSIFICATIONS = ("HUMAN", "AI_GENERATED")
# (minimum confidence, share of the per-file points awarded, label printed).
# Anything below the last threshold earns 25% of the points.
_CONFIDENCE_TIERS = (
    (0.8, 1.0, "100%"),
    (0.6, 0.75, "75%"),
    (0.4, 0.5, "50%"),
)


def _failed(idx, message):
    """Build the result record for a test file that could not be scored."""
    return {'fileIndex': idx, 'status': 'failed', 'message': message, 'score': 0}


def _confidence_tier(confidence_score):
    """Return ``(share, label)`` of the per-file points for a correct answer."""
    for minimum, share, label in _CONFIDENCE_TIERS:
        if confidence_score >= minimum:
            return share, label
    return 0.25, "25%"


def _response_problem(response_data):
    """Return ``(message, console_lines)`` for an invalid payload, else ``None``."""
    if not isinstance(response_data, dict):
        return 'Not a JSON object', [" Invalid response type\n"]

    response_status = response_data.get('status', '')
    response_classification = response_data.get('classification', '')
    confidence_score = response_data.get('confidenceScore', None)

    if not response_status or not response_classification or confidence_score is None:
        return 'Missing required fields', [
            " Missing required fields",
            f" Response: {json.dumps(response_data, indent=2)[:200]}\n",
        ]

    if response_status != 'success':
        return (f'Status: {response_status}',
                [f" Status not 'success': {response_status}\n"])

    if not isinstance(confidence_score, (int, float)) or confidence_score < 0 or confidence_score > 1:
        return (f'Invalid confidence: {confidence_score}',
                [f" Invalid confidence score: {confidence_score}\n"])

    if response_classification not in _VALID_CLASSIFICATIONS:
        return (f'Invalid classification: {response_classification}',
                [f" Invalid classification: {response_classification}\n"])

    return None


def evaluate_voice_detection_api(endpoint_url, api_key, test_files, *,
                                 results_path='evaluation_results.json',
                                 timeout=30, audio_format='mp3'):
    """Score a voice-detection endpoint against a list of labelled audio files.

    Each file is base64-encoded and POSTed as JSON to ``endpoint_url``. A
    correct classification earns a share of ``100 / len(test_files)`` points
    depending on the reported confidence (>=0.8: 100%, >=0.6: 75%,
    >=0.4: 50%, otherwise 25%); a wrong or invalid response earns nothing.
    Progress is printed and a JSON report is written to ``results_path``.

    Args:
        endpoint_url: URL that receives the POST requests.
        api_key: Value sent in the ``x-api-key`` header.
        test_files: Sequence of dicts with ``file_path``,
            ``expected_classification`` and optionally ``language``
            (default ``'English'``). Entries that are not dicts, or that
            lack a path or expected label, are recorded as skipped.
        results_path: Where the JSON report is written.
        timeout: Per-request timeout passed to ``requests.post``.
        audio_format: Value sent as ``audioFormat`` in the request body.

    Returns:
        ``False`` when ``endpoint_url`` or ``test_files`` is empty (nothing
        is run or written), ``True`` once the report has been written.
    """
    if not endpoint_url:
        print("Error: Endpoint URL is required")
        return False
    if not test_files:
        print("Error: No test files provided")
        return False

    total_files = len(test_files)
    score_per_file = 100 / total_files
    total_score = 0
    file_results = []

    print(f"\n{_RULE}")
    print("Starting Evaluation")
    print(_RULE)
    print(f"Endpoint: {endpoint_url}")
    print(f"Total Test Files: {total_files}")
    print(f"Score per File: {score_per_file:.2f}")
    print(f"{_RULE}\n")

    for idx, file_data in enumerate(test_files):
        # A malformed entry must not abort the whole run; treat it as empty so
        # it falls into the "skipped" branch below.
        entry = file_data if isinstance(file_data, dict) else {}
        language = entry.get('language', 'English')
        file_path = entry.get('file_path', '')
        expected_classification = entry.get('expected_classification', '')

        print(f"Test {idx + 1}/{total_files}: {file_path}")

        if not file_path or not expected_classification:
            file_results.append({'fileIndex': idx, 'status': 'skipped', 'score': 0})
            print(" Skipped: Missing file path or expected classification\n")
            continue

        try:
            with open(file_path, 'rb') as audio_file:
                audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')
        except Exception as e:  # best effort: one unreadable file only fails itself
            file_results.append(_failed(idx, f'Failed to read: {e}'))
            print(f" Failed to read file: {e}\n")
            continue

        headers = {'Content-Type': 'application/json', 'x-api-key': api_key}
        request_body = {'language': language, 'audioFormat': audio_format,
                        'audioBase64': audio_base64}

        try:
            response = requests.post(endpoint_url, headers=headers,
                                     json=request_body, timeout=timeout)

            if response.status_code != 200:
                file_results.append(_failed(idx, f'HTTP {response.status_code}'))
                print(f" HTTP Status: {response.status_code}")
                print(f" Response: {response.text[:200]}\n")
                continue

            response_data = response.json()
            problem = _response_problem(response_data)
            if problem is not None:
                message, console_lines = problem
                file_results.append(_failed(idx, message))
                for line in console_lines:
                    print(line)
                continue

            response_classification = response_data['classification']
            confidence_score = response_data['confidenceScore']

            if response_classification == expected_classification:
                share, confidence_tier = _confidence_tier(confidence_score)
                file_score = score_per_file * share
                total_score += file_score
                file_results.append({
                    'fileIndex': idx, 'status': 'success', 'matched': True,
                    'score': round(file_score, 2),
                    'actualClassification': response_classification,
                    'confidenceScore': confidence_score,
                })
                print(f" CORRECT: {response_classification}")
                print(f" Confidence: {confidence_score:.2f} -> {confidence_tier} of points")
                print(f" Score: {file_score:.2f}/{score_per_file:.2f}\n")
            else:
                file_results.append({
                    'fileIndex': idx, 'status': 'success', 'matched': False,
                    'score': 0,
                    'actualClassification': response_classification,
                    'confidenceScore': confidence_score,
                })
                print(f" WRONG: {response_classification} (Expected: {expected_classification})")
                print(f" Score: 0/{score_per_file:.2f}\n")

        except requests.exceptions.Timeout:
            file_results.append(_failed(idx, f'Timeout (>{timeout}s)'))
            print(f" TIMEOUT: Request took longer than {timeout} seconds\n")
        except requests.exceptions.ConnectionError:
            file_results.append(_failed(idx, 'Connection error'))
            print(" CONNECTION ERROR\n")
        except Exception as e:  # e.g. a body that is not valid JSON
            file_results.append(_failed(idx, str(e)))
            print(f" ERROR: {e}\n")

    final_score = round(total_score)

    print(_RULE)
    print("EVALUATION SUMMARY")
    print(_RULE)
    print(f"Total Files Tested: {total_files}")
    print(f"Final Score: {final_score}/100")
    print(f"{_RULE}\n")

    successful = sum(1 for r in file_results if r.get('matched', False))
    failed = sum(1 for r in file_results if r['status'] == 'failed')
    wrong = sum(1 for r in file_results
                if r['status'] == 'success' and not r.get('matched', False))

    print(f"Correct Classifications: {successful}/{total_files}")
    print(f"Wrong Classifications: {wrong}/{total_files}")
    print(f"Failed/Errors: {failed}/{total_files}\n")

    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump({
            'finalScore': final_score,
            'totalFiles': total_files,
            'scorePerFile': round(score_per_file, 2),
            'successfulClassifications': successful,
            'wrongClassifications': wrong,
            'failedTests': failed,
            'fileResults': file_results,
        }, f, indent=2)
    print(f"Detailed results saved to: {results_path}\n")
    return True
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
if __name__ == '__main__':
    import os

    # Endpoint, key and fixture directory can be overridden through the
    # environment so the script runs on machines other than the author's;
    # the fallbacks are the values originally hard-coded here.
    ENDPOINT_URL = os.environ.get(
        'VOICE_API_ENDPOINT',
        'https://shivam-2211-voice-detection-api.hf.space/api/voice-detection',
    )
    # NOTE(review): the fallback is a key committed to source control; prefer
    # setting VOICE_API_KEY and consider rotating this key.
    API_KEY = os.environ.get('VOICE_API_KEY', 'sk_test_voice_detection_2026')

    DIR = os.environ.get(
        'VOICE_TEST_DIR',
        r'c:\Users\shiva\OneDrive\Desktop\Voice Project\voice-detection-api\drive-download-20260216T053632Z-1-001',
    )

    # (language, file name inside DIR, expected classification)
    TEST_FILES = [
        {
            'language': language,
            'file_path': os.path.join(DIR, file_name),
            'expected_classification': expected,
        }
        for language, file_name, expected in (
            ('English', 'English_voice_AI_GENERATED.mp3', 'AI_GENERATED'),
            ('Hindi', 'Hindi_Voice_HUMAN.mp3', 'HUMAN'),
            ('Malayalam', 'Malayalam_AI_GENERATED.mp3', 'AI_GENERATED'),
            ('Tamil', 'TAMIL_VOICE__HUMAN.mp3', 'HUMAN'),
            ('Telugu', 'Telugu_Voice_AI_GENERATED.mp3', 'AI_GENERATED'),
        )
    ]

    evaluate_voice_detection_api(ENDPOINT_URL, API_KEY, TEST_FILES)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|