Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- Dockerfile +20 -0
- detector.py +107 -0
- main.py +62 -0
- requirements.txt +10 -0
- utils.py +23 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.9
FROM python:3.9

# Set working directory
WORKDIR /code

# Copy requirements first so the dependency layer is cached across code changes
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application code
COPY . /code

# Create a writable cache directory for the AI model
# (Hugging Face Spaces runs the container as a non-root user, so the
# default ~/.cache location is not writable)
RUN mkdir -p /code/cache && chmod 777 /code/cache
ENV XDG_CACHE_HOME=/code/cache
# Point the Hugging Face libraries at the writable cache explicitly as well:
# newer huggingface_hub releases consult HF_HOME, not XDG_CACHE_HOME, and a
# read-only default cache is a common cause of Space "Runtime error" crashes.
ENV HF_HOME=/code/cache/huggingface

# Start the server on port 7860 (Hugging Face default)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
detector.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import io
|
| 5 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
|
| 6 |
+
|
| 7 |
+
class VoiceDetector:
    """Deepfake-audio classifier wrapping a pre-trained Wav2Vec2 model.

    The model is loaded once at construction time (expensive); `analyze`
    then classifies in-memory audio buffers as human or AI-generated.
    """

    def __init__(self):
        print("Loading AI Detection Model... (this may take a moment)")
        # We use a pre-trained model specifically fine-tuned for Deepfake detection
        # Source: https://huggingface.co/MelodyMachine/Deepfake-audio-detection-V2
        # Alternative robust model: "padmalcom/wav2vec2-large-fake-voice-detection-v2"
        self.model_name = "MelodyMachine/Deepfake-audio-detection-V2"

        try:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name)
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
            self.model.eval()  # inference only: disables dropout/batch-norm updates
        except Exception as e:
            print(f"CRITICAL ERROR: Failed to load AI model. {e}")
            raise e

    def preprocess_audio(self, audio_buffer: io.BytesIO, target_sr: int = 16000) -> np.ndarray:
        """
        Loads audio from bytes and resamples it to 16kHz (required by Wav2Vec2).

        Pads clips shorter than one second with trailing silence and
        peak-normalizes the waveform before returning it.
        """
        audio_buffer.seek(0)
        # Load with librosa (automatically handles MP3/WAV)
        y, sr = librosa.load(audio_buffer, sr=target_sr)

        # Ensure we have enough audio for the model (pad if too short)
        if len(y) < target_sr:  # Less than 1 second
            padding = target_sr - len(y)
            y = np.pad(y, (0, padding), 'constant')

        # Normalize audio volume
        y = librosa.util.normalize(y)
        return y

    def analyze(self, audio_buffer: io.BytesIO, language: str) -> dict:
        """
        Analyzes audio using the Deep Learning model.
        Returns classification, confidence, and explanation.

        NOTE(review): `language` is currently unused by the model — kept
        for interface compatibility with the API layer.
        """
        try:
            # 1. Preprocess Audio
            audio_input = self.preprocess_audio(audio_buffer)

            # 2. Prepare inputs for the model
            inputs = self.feature_extractor(
                audio_input,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )

            # 3. Inference (Prediction)
            with torch.no_grad():
                logits = self.model(**inputs).logits

            # 4. Convert logits to probabilities (Softmax)
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

            # Resolve the predicted class via the model's own label map
            # (labels might be "real"/"fake" or "bonafide"/"spoof").
            id2label = self.model.config.id2label
            predicted_id = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_id].item()
            label = id2label[predicted_id].lower()  # lowercase once, not per check

            # Map model output to API requirements
            if any(tag in label for tag in ("fake", "spoof", "ai")):
                is_ai = True
            elif any(tag in label for tag in ("real", "bonafide", "human")):
                is_ai = False
            else:
                # Fallback based on index (usually 1 is fake)
                is_ai = (predicted_id == 1)

            # 5. Construct Response
            if is_ai:
                classification = "AI_GENERATED"
                explanation = "Deep learning model detected synthetic vocal artifacts and unnatural spectral patterns."
            else:
                classification = "HUMAN"
                explanation = "Deep learning model verified natural micro-prosody and human vocal characteristics."

            return {
                "classification": classification,
                "confidenceScore": round(confidence, 2),
                "explanation": explanation
            }

        except Exception as e:
            # Fail-safe default: report HUMAN with zero confidence so the API
            # never crashes on bad audio. NOTE(review): this also masks real
            # failures as "human" — consider surfacing an error status instead.
            print(f"Analysis Error: {e}")
            return {
                "classification": "HUMAN",  # Fail-safe default
                "confidenceScore": 0.0,
                "explanation": f"Error during analysis: {str(e)}"
            }
|
main.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Header, HTTPException, Request
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
import uvicorn
|
| 4 |
+
import os
|
| 5 |
+
import traceback
|
| 6 |
+
from utils import decode_base64_audio, convert_mp3_to_wav
|
| 7 |
+
from detector import VoiceDetector
|
| 8 |
+
|
| 9 |
+
# Application setup: the detector is constructed once at import time because
# model loading is expensive and must not happen per-request.
app = FastAPI(title="AI Voice Detection API")
detector = VoiceDetector()

# Configuration
# NOTE(review): the hard-coded fallback key is a development convenience —
# ensure production deployments set the API_KEY environment variable.
API_KEY = os.getenv("API_KEY", "sk_test_123456789")
# Languages accepted by the /api/voice-detection endpoint.
SUPPORTED_LANGUAGES = ["Tamil", "English", "Hindi", "Malayalam", "Telugu"]
|
| 15 |
+
|
| 16 |
+
class DetectionRequest(BaseModel):
    """Request body for POST /api/voice-detection."""
    # Spoken language of the sample; must be one of SUPPORTED_LANGUAGES.
    language: str
    # Container format of the audio; only "mp3" is accepted by the endpoint.
    audioFormat: str
    # The audio file content, base64-encoded.
    audioBase64: str
|
| 20 |
+
|
| 21 |
+
@app.post("/api/voice-detection")
async def detect_voice(
    request: DetectionRequest,
    x_api_key: str = Header(None)
):
    """Classify a base64-encoded MP3 clip as HUMAN or AI_GENERATED.

    Auth: shared secret in the X-API-Key header (401 on mismatch).
    Validation: language must be supported and audioFormat must be mp3 (400).
    On processing failure, returns HTTP 200 with a {"status": "error"} payload.
    """
    import hmac  # local stdlib import: constant-time secret comparison only

    # 1. Authentication — compare_digest avoids leaking key bytes through
    # response-timing differences; guard None since Header(None) is optional.
    if x_api_key is None or not hmac.compare_digest(x_api_key, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key or malformed request")

    # 2. Validation
    if request.language not in SUPPORTED_LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Language {request.language} not supported")

    if request.audioFormat.lower() != "mp3":
        raise HTTPException(status_code=400, detail="Only MP3 format is supported")

    try:
        # 3. Process Audio: decode base64, then transcode MP3 -> WAV
        mp3_buffer = decode_base64_audio(request.audioBase64)
        wav_buffer = convert_mp3_to_wav(mp3_buffer)

        # 4. Analyze
        result = detector.analyze(wav_buffer, request.language)

        # 5. Return Response
        return {
            "status": "success",
            "language": request.language,
            "classification": result["classification"],
            "confidenceScore": result["confidenceScore"],
            "explanation": result["explanation"]
        }

    except Exception as e:
        # NOTE(review): errors are returned as HTTP 200 with an error payload
        # rather than a 5xx — kept as-is since clients may depend on it.
        traceback.print_exc()
        return {
            "status": "error",
            "message": str(e)
        }
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
    # Local development entry point; in the Docker image, uvicorn is started
    # by the CMD on port 7860 instead.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework and server
fastapi
uvicorn
python-multipart
requests
# ML inference stack
torch
transformers
# Audio decoding / signal processing
librosa
numpy
scipy
pydub
|
utils.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import io
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
import tempfile
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
def decode_base64_audio(base64_string: str) -> io.BytesIO:
    """Decode a base64 string into a seekable bytes buffer.

    Accepts either a bare base64 payload or a data URI
    ("data:audio/mpeg;base64,...") — the prefix, if present, is stripped,
    so payloads captured directly from web clients work unchanged.

    Raises:
        binascii.Error: if the payload is not valid base64.
    """
    # Tolerate data-URI-style input: keep only the part after the first comma.
    if base64_string.startswith("data:") and "," in base64_string:
        base64_string = base64_string.split(",", 1)[1]
    audio_data = base64.b64decode(base64_string)
    return io.BytesIO(audio_data)
|
| 11 |
+
|
| 12 |
+
def convert_mp3_to_wav(mp3_buffer: io.BytesIO) -> io.BytesIO:
    """Transcode an in-memory MP3 buffer into an in-memory WAV buffer.

    Best-effort: if decoding or export fails for any reason (e.g. ffmpeg
    missing), the original MP3 buffer is rewound and returned unchanged —
    librosa downstream can read MP3 directly.
    """
    try:
        segment = AudioSegment.from_mp3(mp3_buffer)
        out = io.BytesIO()
        segment.export(out, format="wav")
        out.seek(0)
        return out
    except Exception:
        # Fallback: return the buffer as-is, librosa can handle MP3
        mp3_buffer.seek(0)
        return mp3_buffer
|