Spaces:

AnishKumbhar
/

gunashree_hackathon

Sleeping

App Files Files Community

anish committed on Feb 5

Commit

7015e1f

1 Parent(s): 06a727d

Done

Browse files

Files changed (7) hide show

app/__init__.py +2 -0
app/main.py +186 -0
ml/__init__.py +2 -0
ml/inference/__init__.py +5 -0
ml/inference/predict.py +124 -0
ml/models/signal_scaler.joblib +0 -0
ml/models/voice_classifier.joblib +0 -0

app/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # App module
2	+

app/main.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import base64
+import hmac
+import os
+import tempfile
+import uuid
+import librosa
+from fastapi import FastAPI, Header, Body
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from pydub import AudioSegment
+# ML inference bridge
+from ml.inference.predict import predict
+# -------------------- CONFIGURATION --------------------
+# FastAPI application object; the route decorators in this module register on it.
+app = FastAPI(title="AI Voice Detection API")
+# API Key (use ENV in production)
+# Read from the API_KEY environment variable; when that variable is unset the
+# hard-coded fallback "hackathon-secret" is used instead.
+API_KEY = os.getenv("API_KEY", "hackathon-secret")
+# Language names accepted in the request's `language` field. The check is an
+# exact (case-sensitive) membership test, and this list is echoed verbatim in
+# the 400 error message returned for unsupported languages.
+SUPPORTED_LANGUAGES = [
+    "Tamil",
+    "English",
+    "Hindi",
+    "Malayalam",
+    "Telugu"
+]
+# -------------------- REQUEST MODEL --------------------
+class VoiceRequest(BaseModel):
+    # JSON body accepted by POST /api/voice-detection.
+    # Language of the clip; must be one of SUPPORTED_LANGUAGES (exact match).
+    language: str
+    # Declared audio format; only "mp3" (compared case-insensitively) is accepted.
+    audioFormat: str
+    # Audio file contents as a Base64 string (decoded with strict validation).
+    audioBase64: str
+# -------------------- HELPER FUNCTIONS --------------------
+def generate_explanation(classification: str, confidence: float, language: str) -> str:
+    """
+    Generates a human-readable explanation for the result.
+    """
+    if classification == "AI_GENERATED":
+        if confidence >= 0.8:
+            return (
+                f"High-confidence detection of synthetic voice patterns, "
+                f"including unnatural pitch consistency in the {language} sample."
+            )
+        else:
+            return (
+                f"Minor digital artifacts detected in the {language} speech, "
+                f"suggesting possible AI generation."
+            )
+    else:
+        if confidence >= 0.8:
+            return (
+                f"Natural prosody, breathing patterns, and organic speech flow "
+                f"detected, consistent with a human {language} speaker."
+            )
+        else:
+            return (
+                f"Speech characteristics align with human vocal patterns "
+                f"for the {language} language."
+            )
+# -------------------- HEALTH CHECK --------------------
+@app.get("/health")
+def health_check():
+    return {
+        "status": "ok",
+        "message": "AI Voice Detection API is running"
+    }
+# -------------------- MAIN API --------------------
+@app.post("/api/voice-detection")
+def detect_voice(
+    request: VoiceRequest = Body(...),
+    x_api_key: str = Header(...)
+):
+    # 1️⃣ API KEY VALIDATION
+    if x_api_key != API_KEY:
+        return JSONResponse(
+            status_code=401,
+            content={"status": "error", "message": "Invalid API key"}
+        )
+    # 2️⃣ LANGUAGE VALIDATION
+    if request.language not in SUPPORTED_LANGUAGES:
+        return JSONResponse(
+            status_code=400,
+            content={"status": "error", "message": f"Unsupported language. Allowed values: {SUPPORTED_LANGUAGES}"}
+        )
+    # 3️⃣ AUDIO FORMAT VALIDATION
+    if request.audioFormat.lower() != "mp3":
+        return JSONResponse(
+            status_code=400,
+            content={"status": "error", "message": "Only mp3 audio format is supported"}
+        )
+    # Temporary file names
+    temp_mp3 = f"temp_{uuid.uuid4()}.mp3"
+    original_temp_mp3 = temp_mp3
+    try:
+        # 4️⃣ BASE64 DECODE
+        try:
+            audio_bytes = base64.b64decode(
+                request.audioBase64,
+                validate=True
+            )
+        except Exception:
+            return JSONResponse(
+                status_code=400,
+                content={"status": "error", "message": "Invalid Base64 audio string"}
+            )
+        # Reject empty or fake audio
+        if len(audio_bytes) < 1000:
+            return JSONResponse(
+                status_code=400,
+                content={"status": "error", "message": "Audio data is too small or empty"}
+            )
+        # 5️⃣ SAVE MP3 FILE
+        with open(temp_mp3, "wb") as f:
+            f.write(audio_bytes)
+        # 5.5️⃣ CHECK AND TRIM AUDIO DURATION (max 60 seconds)
+        y, sr = librosa.load(temp_mp3, sr=None)
+        duration = len(y) / sr
+        if duration > 30:
+            # Trim to first 60 seconds
+            audio = AudioSegment.from_file(temp_mp3, format="mp3")
+            trimmed_audio = audio[:30000] # 60 seconds in milliseconds
+            trimmed_mp3 = temp_mp3.replace(".mp3", "_trimmed.mp3")
+            trimmed_audio.export(trimmed_mp3, format="mp3")
+            temp_mp3 = trimmed_mp3  # Use trimmed file
+        # 6️⃣ ML INFERENCE (Member-1 implementation)
+        result = predict(temp_mp3)
+        classification = result.get("classification")
+        confidence = result.get("confidenceScore")
+        if classification not in ["AI_GENERATED", "HUMAN"]:
+            return JSONResponse(
+                status_code=500,
+                content={"status": "error", "message": "Invalid classification returned by ML model"}
+            )
+        explanation = generate_explanation(
+            classification,
+            confidence,
+            request.language
+        )
+        # 8️⃣ SUCCESS RESPONSE (STRICT FORMAT)
+        return {
+            "status": "success",
+            "language": request.language,
+            "classification": classification,
+            "confidenceScore": confidence,
+            "explanation": explanation
+        }
+    except Exception as e:
+        # Catch-all for unexpected failures
+        return JSONResponse(
+            status_code=500,
+            content={"status": "error", "message": f"Processing error: {str(e)}"}
+        )
+    finally:
+        # 9️⃣ CLEANUP TEMP FILES
+        for path in [original_temp_mp3, temp_mp3]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except:
+                    pass

ml/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # ML module for AI Voice Detection
2	+

ml/inference/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# ML inference module
+from .predict import predict
+__all__ = ["predict"]

ml/inference/predict.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import os
+import librosa
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2Model
+from joblib import load
+# Sample rate (Hz) passed to librosa.load for every clip in this module.
+TARGET_SR = 16000
+# Get the directory where this script is located
+_current_dir = os.path.dirname(os.path.abspath(__file__))
+# Try multiple paths for model location (works in different deployment scenarios)
+# Candidates are probed in order; relative entries are resolved against the
+# process's current working directory by os.path.abspath below.
+_model_paths = [
+    os.path.join(_current_dir, "..", "models"),  # Relative from inference/
+    os.path.join(os.path.dirname(_current_dir), "models"),  # From ml/models
+    "ml/models",  # From root directory
+    os.path.join(os.getcwd(), "ml", "models"),  # From current working directory
+]
+# Select the first candidate directory that contains voice_classifier.joblib.
+# NOTE: only the classifier file is probed; signal_scaler.joblib is assumed
+# to sit in the same directory and is not checked here.
+_model_dir = None
+for path in _model_paths:
+    abs_path = os.path.abspath(path)
+    if os.path.exists(abs_path) and os.path.exists(os.path.join(abs_path, "voice_classifier.joblib")):
+        _model_dir = abs_path
+        break
+# Fail at import time, with the attempted paths in the message, if nothing matched.
+if _model_dir is None:
+    raise FileNotFoundError(
+        f"Could not find model files. Tried paths: {_model_paths}. "
+        f"Current directory: {os.getcwd()}, Script directory: {_current_dir}"
+    )
+# Load model + scaler
+# NOTE(review): joblib files are pickle-based, so these artifacts must come
+# from a trusted source.
+clf = load(os.path.join(_model_dir, "voice_classifier.joblib"))
+scaler = load(os.path.join(_model_dir, "signal_scaler.joblib"))
+# Load wav2vec
+# Both objects are created once at import time and reused by every call.
+# NOTE(review): presumably fetched from the Hugging Face hub on first use - confirm
+# the deployment has network access or a warm cache.
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
+model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
+# Put the network in evaluation mode; it is only used for inference here.
+model.eval()
+# Use GPU if available
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+def extract_embedding(audio_path):
+    y, _ = librosa.load(audio_path, sr=TARGET_SR, mono=True)
+    inputs = processor(
+        y,
+        sampling_rate=TARGET_SR,
+        return_tensors="pt",
+        padding=True
+    )
+    # Move to device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
+def extract_signal_features(audio_path):
+    y, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
+    f0 = librosa.yin(y, fmin=50, fmax=300)
+    pitch_var = np.var(f0)
+    spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+    spec_mean = np.mean(spec_centroid)
+    spec_var = np.var(spec_centroid)
+    zcr = librosa.feature.zero_crossing_rate(y)
+    zcr_mean = np.mean(zcr)
+    return np.array([pitch_var, spec_mean, spec_var, zcr_mean])
+def generate_explanation(sig_feats, is_ai):
+    pitch_var, spec_mean, spec_var, zcr = sig_feats
+    if is_ai:
+        reasons = []
+        if pitch_var < 3000:
+            reasons.append("unnaturally stable pitch")
+        if spec_var < 8e5:
+            reasons.append("overly smooth spectral profile")
+        if zcr < 0.1:
+            reasons.append("robotic waveform structure")
+        if reasons:
+            return " and ".join(reasons).capitalize() + " detected"
+        else:
+            return "Acoustic patterns consistent with synthetic speech detected"
+    else:
+        return "Natural pitch variation and spectral dynamics detected"
+def predict(audio_path):
+    emb = extract_embedding(audio_path)
+    sig = extract_signal_features(audio_path)
+    sig_scaled = scaler.transform(sig.reshape(1, -1))
+    X = np.concatenate([emb.reshape(1, -1), sig_scaled], axis=1)
+    prob_ai = clf.predict_proba(X)[0][1]
+    is_ai = prob_ai >= 0.5
+    label = "AI_GENERATED" if is_ai else "HUMAN"
+    explanation = generate_explanation(sig, is_ai)
+    # Confidence in the predicted class
+    confidence = prob_ai if is_ai else (1 - prob_ai)
+    return {
+        "classification": label,
+        "confidenceScore": round(float(confidence), 3),
+        "explanation": explanation
+    }

ml/models/signal_scaler.joblib ADDED Viewed

Binary file (711 Bytes). View file

ml/models/voice_classifier.joblib ADDED Viewed

Binary file (7.06 kB). View file