Spaces:
Sleeping
Sleeping
anish
Upgrade ML pipeline: 160+ signal features, new predict engine, feature-aware explanations
fc46ab2 | import os | |
| import uuid | |
| import base64 | |
| from fastapi import FastAPI, Header, Body | |
| from fastapi.responses import JSONResponse | |
| from pydantic import BaseModel | |
| import librosa | |
| from pydub import AudioSegment | |
| # ML inference bridge | |
| from predict import predict | |
# -------------------- CONFIGURATION --------------------
# ASGI application object for this service; the title is passed to FastAPI as
# the application's display name.
app = FastAPI(title="AI Voice Detection API")
# Shared secret that detect_voice compares against its `x_api_key` header
# parameter. Read from the API_KEY environment variable when it is set.
# NOTE(review): the fallback below is a fixed, source-visible default; make
# sure API_KEY is set in any real deployment.
API_KEY = os.getenv("API_KEY", "hackathon-secret")

# Values accepted for the request's `language` field. detect_voice does an
# exact (case-sensitive) membership test and echoes this list in its error
# message, so it is kept as a list in this order.
SUPPORTED_LANGUAGES = [
    "Tamil",
    "English",
    "Hindi",
    "Malayalam",
    "Telugu",
]
# -------------------- REQUEST MODEL --------------------
class VoiceRequest(BaseModel):
    """Request body consumed by ``detect_voice``."""

    # Language label for the clip; detect_voice rejects values that are not
    # in SUPPORTED_LANGUAGES.
    language: str
    # Declared audio format; detect_voice accepts only "mp3" (compared
    # case-insensitively).
    audioFormat: str
    # The audio payload, Base64-encoded; detect_voice decodes it with strict
    # validation.
    audioBase64: str
# -------------------- HELPER FUNCTIONS --------------------
def generate_explanation(classification: str, confidence: float, language: str, features: dict = None) -> str:
    """
    Build a one-sentence explanation for a classification result.

    The sentence has the shape ``"<prefix>: <reason>[; <reason>]."``. The
    prefix is chosen from the confidence tier (>= 0.85, >= 0.7, otherwise)
    and mentions ``language``; the reasons come from threshold checks on
    the supplied acoustic features, in a fixed priority order, and at most
    the first two that fire are reported. When no check fires a generic
    reason is used instead.

    Args:
        classification: ``"AI_GENERATED"`` selects the synthetic-speech
            wording; any other value selects the human-speech wording.
        confidence: Score used only to pick the prefix tier. ``None`` is
            treated as the lowest tier.
        language: Language name interpolated into the prefix.
        features: Optional mapping of feature name to numeric value. Keys
            read: ``jitter``, ``shimmer``, ``pitch_cv``, ``silence_ratio``,
            ``rms_cv``, ``hf_smooth``, ``hnr``. Missing keys and ``None``
            values are treated as "not measured".

    Returns:
        The explanation sentence.
    """
    if features is None:
        features = {}

    def reading(name):
        # -1 is the "not measured" sentinel: it fails every threshold check
        # below. A key that is present but None is mapped to the same
        # sentinel so the comparisons cannot raise TypeError.
        value = features.get(name, -1)
        return -1 if value is None else value

    # An absent confidence gets the most cautious wording rather than
    # raising on the tier comparisons.
    score = 0.0 if confidence is None else confidence

    jitter = reading('jitter')
    pitch_cv = reading('pitch_cv')
    rms_cv = reading('rms_cv')
    silence_ratio = reading('silence_ratio')

    reasons = []

    if classification == "AI_GENERATED":
        shimmer = reading('shimmer')
        hf_smooth = reading('hf_smooth')
        hnr = reading('hnr')

        # Checks are ordered by reporting priority; the lower bound of 0
        # keeps the -1 sentinel from counting as a "low" value.
        if 0 <= jitter < 0.02:
            reasons.append(f"unusually low pitch micro-variations (jitter={jitter:.4f}) suggesting synthetic vocal generation")
        if 0 <= shimmer < 0.2:
            reasons.append("abnormally consistent amplitude patterns not typical of natural speech")
        if 0 <= pitch_cv < 0.15:
            reasons.append("limited pitch variation indicating machine-generated monotone characteristics")
        if 0 <= rms_cv < 0.4:
            reasons.append("uniform energy distribution lacking natural human speech dynamics")
        if 0 <= silence_ratio < 0.05:
            reasons.append("absence of natural breathing pauses between speech segments")
        if 0 <= hf_smooth < 0.001:
            reasons.append("smooth high-frequency spectrum consistent with neural vocoder artifacts")
        if hnr > 15:
            reasons.append("abnormally high harmonic-to-noise ratio indicating synthesized audio clarity")

        fallback = "combination of spectral and temporal patterns consistent with AI-generated speech"

        if score >= 0.85:
            prefix = f"High-confidence AI detection in the {language} sample"
        elif score >= 0.7:
            prefix = f"Moderate indicators of synthetic generation in the {language} sample"
        else:
            prefix = f"Subtle synthetic patterns detected in the {language} sample"
    else:
        if jitter > 0.02:
            reasons.append("natural pitch micro-variations (jitter) consistent with human vocal cord vibration")
        if pitch_cv > 0.15:
            reasons.append("healthy pitch variation reflecting natural prosody and emotional expression")
        if rms_cv > 0.5:
            reasons.append("dynamic energy patterns showing natural speech rhythm and emphasis")
        if silence_ratio > 0.05:
            reasons.append("natural breathing pauses and organic speech timing detected")

        fallback = "overall acoustic signature consistent with natural human speech production"

        if score >= 0.85:
            prefix = f"Strong indicators of natural human speech in the {language} sample"
        elif score >= 0.7:
            prefix = f"Speech patterns in the {language} sample align with human vocal characteristics"
        else:
            prefix = f"The {language} sample shows characteristics generally associated with human speech"

    if not reasons:
        reasons.append(fallback)

    # Report only the two highest-priority reasons.
    return f"{prefix}: {'; '.join(reasons[:2])}."
# -------------------- HEALTH CHECK --------------------
def health_check() -> dict:
    """
    Report that the service is running.

    Returns:
        A dict with a fixed ``"status"`` of ``"ok"`` and a human-readable
        ``"message"``.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file; confirm it is registered with the app.
    return {
        "status": "ok",
        "message": "AI Voice Detection API is running",
    }
# -------------------- MAIN API --------------------
def detect_voice(
    request: VoiceRequest = Body(...),
    x_api_key: str = Header(...)
):
    """
    Classify a Base64-encoded MP3 clip as AI-generated or human speech.

    Steps: validate the API key, language and audio format; decode the
    payload; write it to a scratch file; truncate it to its first 30
    seconds when longer; run ``predict`` on the file; and build an
    explanation from the returned features. Scratch files are removed
    before returning, whatever the outcome.

    Args:
        request: Body carrying ``language``, ``audioFormat`` and
            ``audioBase64``.
        x_api_key: API key supplied by the caller as a header parameter.

    Returns:
        On success, a dict with ``status``, ``language``,
        ``classification``, ``confidenceScore`` and ``explanation``.
        On failure, a ``JSONResponse`` with ``status`` ``"error"`` and a
        ``message``: 401 for a wrong key, 400 for invalid input, 500 for an
        unexpected model result or any processing exception.
    """
    # Imported here so the block does not depend on edits elsewhere in the file.
    import hmac
    import tempfile

    def failure(status_code, message):
        # Single place that shapes every error response.
        return JSONResponse(
            status_code=status_code,
            content={"status": "error", "message": message}
        )

    # 1. API key validation. compare_digest takes time independent of where
    #    the inputs differ; both sides are encoded to bytes because the str
    #    form of compare_digest only accepts ASCII.
    if not hmac.compare_digest(x_api_key.encode("utf-8"), API_KEY.encode("utf-8")):
        return failure(401, "Invalid API key")

    # 2. Language validation
    if request.language not in SUPPORTED_LANGUAGES:
        return failure(400, f"Unsupported language. Allowed values: {SUPPORTED_LANGUAGES}")

    # 3. Audio format validation
    if request.audioFormat.lower() != "mp3":
        return failure(400, "Only mp3 audio format is supported")

    # 4. Base64 decode. validate=True rejects characters outside the Base64
    #    alphabet instead of silently discarding them.
    try:
        audio_bytes = base64.b64decode(request.audioBase64, validate=True)
    except (ValueError, TypeError):
        return failure(400, "Invalid Base64 audio string")

    # Reject empty or fake audio
    if len(audio_bytes) < 1000:
        return failure(400, "Audio data is too small or empty")

    # Both scratch paths are fixed up front so cleanup can always target
    # them, even if trimming fails halfway through writing its output.
    stem = os.path.join(tempfile.gettempdir(), f"temp_{uuid.uuid4()}")
    source_mp3 = f"{stem}.mp3"
    trimmed_mp3 = f"{stem}_trimmed.mp3"

    try:
        # 5. Save the decoded bytes as an MP3 file
        with open(source_mp3, "wb") as f:
            f.write(audio_bytes)

        # 5.5 Check duration and trim to at most 30 seconds
        y, sr = librosa.load(source_mp3, sr=None)
        model_input = source_mp3
        if len(y) / sr > 30:
            audio = AudioSegment.from_file(source_mp3, format="mp3")
            # pydub slices are indexed in milliseconds
            audio[:30000].export(trimmed_mp3, format="mp3")
            model_input = trimmed_mp3

        # 6. ML inference
        result = predict(model_input)
        classification = result.get("classification")
        confidence = result.get("confidenceScore")

        if classification not in ["AI_GENERATED", "HUMAN"]:
            return failure(500, "Invalid classification returned by ML model")

        # 7. Feature-aware explanation
        features = result.get("features", {})
        explanation = generate_explanation(classification, confidence, request.language, features)

        # 8. Success response (strict format)
        return {
            "status": "success",
            "language": request.language,
            "classification": classification,
            "confidenceScore": confidence,
            "explanation": explanation
        }
    except Exception as e:
        # Catch-all boundary for unexpected failures.
        # NOTE(review): the exception text is returned to the caller and may
        # expose internal details; consider logging it and returning a
        # generic message instead.
        return failure(500, f"Processing error: {str(e)}")
    finally:
        # 9. Clean up scratch files. Removal is best-effort: a file that was
        #    never created, or is already gone, is not an error.
        for path in (source_mp3, trimmed_mp3):
            try:
                os.remove(path)
            except OSError:
                pass