Spaces:

AnishKumbhar
/

gunashree_hackathon

Sleeping

anish

Upgrade ML pipeline: 160+ signal features, new predict engine, feature-aware explanations

fc46ab2 2 months ago

8.03 kB

	import os
	import uuid
	import base64
	from fastapi import FastAPI, Header, Body
	from fastapi.responses import JSONResponse
	from pydantic import BaseModel
	import librosa
	from pydub import AudioSegment

	# ML inference bridge
	from predict import predict


	# -------------------- CONFIGURATION --------------------

	# FastAPI application instance; the route handlers in this file register
	# themselves on it through the @app.get / @app.post decorators.
	app = FastAPI(title="AI Voice Detection API")

	# Shared secret that detect_voice compares against its `x_api_key` header
	# parameter. Read from the API_KEY environment variable, falling back to a
	# hard-coded default when the variable is unset.
	# NOTE(review): the fallback value is visible in source control; set API_KEY
	# explicitly in any real deployment.
	API_KEY = os.getenv("API_KEY", "hackathon-secret")

	# Language names accepted in the request's `language` field. The endpoint
	# does an exact (case-sensitive) membership test against this list and
	# echoes the list in its "Unsupported language" error message.
	SUPPORTED_LANGUAGES = [
	"Tamil",
	"English",
	"Hindi",
	"Malayalam",
	"Telugu"
	]


	# -------------------- REQUEST MODEL --------------------

	# JSON request body accepted by the /api/voice-detection endpoint.
	# (Documented with comments rather than a docstring so the schema that
	# pydantic generates for this model is left untouched.)
	class VoiceRequest(BaseModel):
	# Language label of the sample; the endpoint rejects values that are not
	# in SUPPORTED_LANGUAGES.
	language: str
	# Audio container format; the endpoint accepts only "mp3" (compared
	# case-insensitively).
	audioFormat: str
	# Base64-encoded bytes of the audio file.
	audioBase64: str


	# -------------------- HELPER FUNCTIONS --------------------

	# Maximum number of reasons quoted in a single explanation sentence.
	_MAX_REASONS = 2

	# Ordered (feature key, predicate, reason template) rules for samples
	# classified as AI-generated. Order matters: only the first _MAX_REASONS
	# matching reasons are reported. Templates are rendered with
	# str.format(value=<feature value>).
	_AI_REASON_RULES = (
	    ("jitter", lambda v: 0 <= v < 0.02,
	     "unusually low pitch micro-variations (jitter={value:.4f}) suggesting synthetic vocal generation"),
	    ("shimmer", lambda v: 0 <= v < 0.2,
	     "abnormally consistent amplitude patterns not typical of natural speech"),
	    ("pitch_cv", lambda v: 0 <= v < 0.15,
	     "limited pitch variation indicating machine-generated monotone characteristics"),
	    ("rms_cv", lambda v: 0 <= v < 0.4,
	     "uniform energy distribution lacking natural human speech dynamics"),
	    ("silence_ratio", lambda v: 0 <= v < 0.05,
	     "absence of natural breathing pauses between speech segments"),
	    ("hf_smooth", lambda v: 0 <= v < 0.001,
	     "smooth high-frequency spectrum consistent with neural vocoder artifacts"),
	    ("hnr", lambda v: v > 15,
	     "abnormally high harmonic-to-noise ratio indicating synthesized audio clarity"),
	)

	# Same structure as _AI_REASON_RULES, for samples classified as human.
	_HUMAN_REASON_RULES = (
	    ("jitter", lambda v: v > 0.02,
	     "natural pitch micro-variations (jitter) consistent with human vocal cord vibration"),
	    ("pitch_cv", lambda v: v > 0.15,
	     "healthy pitch variation reflecting natural prosody and emotional expression"),
	    ("rms_cv", lambda v: v > 0.5,
	     "dynamic energy patterns showing natural speech rhythm and emphasis"),
	    ("silence_ratio", lambda v: v > 0.05,
	     "natural breathing pauses and organic speech timing detected"),
	)


	def _collect_reasons(features, rules):
	    """Return the rendered reason strings of every rule whose feature matches."""
	    reasons = []
	    for key, predicate, template in rules:
	        value = features.get(key)
	        if value is None:
	            # A missing or null feature never supports a reason.
	            continue
	        try:
	            matched = bool(predicate(value))
	        except (TypeError, ValueError):
	            # Value cannot be compared with the numeric threshold; skip it
	            # instead of failing the whole explanation.
	            continue
	        if matched:
	            reasons.append(template.format(value=value))
	    return reasons


	def _confidence_tier(confidence):
	    """Map a confidence score to 0 (>= 0.85), 1 (>= 0.7) or 2 (anything else)."""
	    try:
	        if confidence >= 0.85:
	            return 0
	        if confidence >= 0.7:
	            return 1
	    except TypeError:
	        # Missing / non-numeric confidence is reported with the weakest wording.
	        pass
	    return 2


	def generate_explanation(classification: str, confidence: float, language: str, features: dict = None) -> str:
	    """
	    Build a one-sentence explanation for a classification result.

	    The sentence is "<prefix>: <reason>[; <reason>]." where the prefix is
	    chosen by the confidence tier and the reasons are derived from threshold
	    checks on the supplied acoustic features.

	    Args:
	        classification: "AI_GENERATED" selects the synthetic-speech wording;
	            any other value selects the human-speech wording.
	        confidence: Score used to pick the prefix (>= 0.85, >= 0.7, otherwise).
	            A missing or non-numeric score falls into the lowest tier.
	        language: Language label interpolated into the prefix.
	        features: Optional mapping of feature name to value. Keys read are
	            jitter, shimmer, pitch_cv, silence_ratio, rms_cv, hf_smooth and
	            hnr; absent, None or non-comparable values are ignored.

	    Returns:
	        The explanation sentence, quoting at most two reasons. When no
	        feature rule matches, a generic reason is used instead.
	    """
	    if features is None:
	        features = {}

	    if classification == "AI_GENERATED":
	        rules = _AI_REASON_RULES
	        fallback = "combination of spectral and temporal patterns consistent with AI-generated speech"
	        prefixes = (
	            f"High-confidence AI detection in the {language} sample",
	            f"Moderate indicators of synthetic generation in the {language} sample",
	            f"Subtle synthetic patterns detected in the {language} sample",
	        )
	    else:
	        rules = _HUMAN_REASON_RULES
	        fallback = "overall acoustic signature consistent with natural human speech production"
	        prefixes = (
	            f"Strong indicators of natural human speech in the {language} sample",
	            f"Speech patterns in the {language} sample align with human vocal characteristics",
	            f"The {language} sample shows characteristics generally associated with human speech",
	        )

	    reasons = _collect_reasons(features, rules) or [fallback]
	    prefix = prefixes[_confidence_tier(confidence)]
	    joined = "; ".join(reasons[:_MAX_REASONS])
	    return f"{prefix}: {joined}."

	# -------------------- HEALTH CHECK --------------------

	# Liveness endpoint: always answers with a fixed "ok" payload.
	# Documented with comments instead of a docstring so the route's generated
	# API description stays exactly as it was.
	@app.get("/health")
	def health_check():
	    payload = {
	        "status": "ok",
	        "message": "AI Voice Detection API is running",
	    }
	    return payload


	# -------------------- MAIN API --------------------

	@app.post("/api/voice-detection")
	def detect_voice(
	    request: VoiceRequest = Body(...),
	    x_api_key: str = Header(...)
	):
	    """
	    Classify a Base64-encoded MP3 sample as AI-generated or human speech.

	    The request is validated (API key, language, audio format, Base64
	    payload, minimum size), written to a temporary MP3 file, trimmed to the
	    first 30 seconds when longer, and passed to `predict`.

	    Returns:
	        On success, a dict with status, language, classification,
	        confidenceScore and explanation. On failure, a JSONResponse with
	        status "error" and HTTP 401 (bad API key), 400 (invalid input) or
	        500 (unexpected classification or processing error).
	    """
	    # Function-scope import keeps this block self-contained.
	    import hmac

	    max_duration_s = 30

	    # Step 1: API key validation. compare_digest avoids leaking, through
	    # response timing, how much of the supplied key matched. Both sides are
	    # encoded because compare_digest rejects non-ASCII str arguments.
	    if not hmac.compare_digest(x_api_key.encode("utf-8"), API_KEY.encode("utf-8")):
	        return JSONResponse(
	            status_code=401,
	            content={"status": "error", "message": "Invalid API key"}
	        )

	    # Step 2: language validation (exact match against the allowed list).
	    if request.language not in SUPPORTED_LANGUAGES:
	        return JSONResponse(
	            status_code=400,
	            content={"status": "error", "message": f"Unsupported language. Allowed values: {SUPPORTED_LANGUAGES}"}
	        )

	    # Step 3: audio format validation.
	    if request.audioFormat.lower() != "mp3":
	        return JSONResponse(
	            status_code=400,
	            content={"status": "error", "message": "Only mp3 audio format is supported"}
	        )

	    # Every temp path is registered here *before* the file is created, so the
	    # finally block removes it even if creation fails half-way.
	    temp_mp3 = f"temp_{uuid.uuid4()}.mp3"
	    temp_files = [temp_mp3]

	    try:
	        # Step 4: Base64 decode. Malformed input raises binascii.Error (a
	        # ValueError subclass) or TypeError.
	        try:
	            audio_bytes = base64.b64decode(
	                request.audioBase64,
	                validate=True
	            )
	        except (ValueError, TypeError):
	            return JSONResponse(
	                status_code=400,
	                content={"status": "error", "message": "Invalid Base64 audio string"}
	            )

	        # Reject empty or implausibly small payloads.
	        if len(audio_bytes) < 1000:
	            return JSONResponse(
	                status_code=400,
	                content={"status": "error", "message": "Audio data is too small or empty"}
	            )

	        # Step 5: persist the MP3 so the audio libraries can read it by path.
	        with open(temp_mp3, "wb") as f:
	            f.write(audio_bytes)

	        # Step 5.5: measure duration and keep only the first 30 seconds.
	        y, sr = librosa.load(temp_mp3, sr=None)
	        duration = len(y) / sr
	        if duration > max_duration_s:
	            audio = AudioSegment.from_file(temp_mp3, format="mp3")
	            trimmed_mp3 = temp_mp3.replace(".mp3", "_trimmed.mp3")
	            temp_files.append(trimmed_mp3)
	            # pydub slices are expressed in milliseconds.
	            audio[:max_duration_s * 1000].export(trimmed_mp3, format="mp3")
	            temp_mp3 = trimmed_mp3  # run inference on the trimmed file

	        # Step 6: ML inference.
	        result = predict(temp_mp3)

	        classification = result.get("classification")
	        confidence = result.get("confidenceScore")

	        if classification not in ["AI_GENERATED", "HUMAN"]:
	            return JSONResponse(
	                status_code=500,
	                content={"status": "error", "message": "Invalid classification returned by ML model"}
	            )

	        # Step 7: human-readable explanation from the returned features.
	        features = result.get("features", {})
	        explanation = generate_explanation(classification, confidence, request.language, features)

	        # Step 8: success response (strict format).
	        return {
	            "status": "success",
	            "language": request.language,
	            "classification": classification,
	            "confidenceScore": confidence,
	            "explanation": explanation
	        }

	    except Exception as e:
	        # Top-level boundary: turn any unexpected failure into a 500 response.
	        # NOTE(review): the exception text is returned to the client; consider
	        # logging it server-side and returning a generic message instead.
	        return JSONResponse(
	            status_code=500,
	            content={"status": "error", "message": f"Processing error: {str(e)}"}
	        )

	    finally:
	        # Step 9: best-effort cleanup of every temp file this request created.
	        for path in temp_files:
	            try:
	                os.remove(path)
	            except OSError:
	                # Already gone or not removable; cleanup must never mask the
	                # response being returned.
	                pass