Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- Dockerfile +20 -0
- detector.py +107 -0
- main.py +62 -0
- requirements.txt +10 -0
- utils.py +23 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.9
FROM python:3.9

# Set working directory
WORKDIR /code

# Copy requirements first so the dependency layer is cached across code changes
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application code
COPY . /code

# Create a writable cache directory for the AI model
# (Hugging Face Spaces runs the container as a non-root user, so the
# default ~/.cache location is not writable)
RUN mkdir -p /code/cache && chmod 777 /code/cache
ENV XDG_CACHE_HOME=/code/cache
# Point the Hugging Face libraries at the writable cache explicitly as well:
# newer huggingface_hub releases consult HF_HOME, not XDG_CACHE_HOME, and a
# read-only default cache is a common cause of Space "Runtime error" crashes.
ENV HF_HOME=/code/cache/huggingface

# Start the server on port 7860 (Hugging Face default)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
detector.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import io
|
| 5 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
|
| 6 |
+
|
| 7 |
+
class VoiceDetector:
    """Deepfake-audio classifier wrapping a pre-trained Wav2Vec2 model.

    The model is loaded once at construction time (expensive); `analyze`
    then classifies in-memory audio buffers as human or AI-generated.
    """

    def __init__(self):
        print("Loading AI Detection Model... (this may take a moment)")
        # We use a pre-trained model specifically fine-tuned for Deepfake detection
        # Source: https://huggingface.co/MelodyMachine/Deepfake-audio-detection-V2
        # Alternative robust model: "padmalcom/wav2vec2-large-fake-voice-detection-v2"
        self.model_name = "MelodyMachine/Deepfake-audio-detection-V2"

        try:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name)
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
            self.model.eval()  # inference only: disables dropout/batch-norm updates
        except Exception as e:
            print(f"CRITICAL ERROR: Failed to load AI model. {e}")
            raise e

    def preprocess_audio(self, audio_buffer: io.BytesIO, target_sr: int = 16000) -> np.ndarray:
        """
        Loads audio from bytes and resamples it to 16kHz (required by Wav2Vec2).

        Pads clips shorter than one second with trailing silence and
        peak-normalizes the waveform before returning it.
        """
        audio_buffer.seek(0)
        # Load with librosa (automatically handles MP3/WAV)
        y, sr = librosa.load(audio_buffer, sr=target_sr)

        # Ensure we have enough audio for the model (pad if too short)
        if len(y) < target_sr:  # Less than 1 second
            padding = target_sr - len(y)
            y = np.pad(y, (0, padding), 'constant')

        # Normalize audio volume
        y = librosa.util.normalize(y)
        return y

    def analyze(self, audio_buffer: io.BytesIO, language: str) -> dict:
        """
        Analyzes audio using the Deep Learning model.
        Returns classification, confidence, and explanation.

        NOTE(review): `language` is currently unused by the model — kept
        for interface compatibility with the API layer.
        """
        try:
            # 1. Preprocess Audio
            audio_input = self.preprocess_audio(audio_buffer)

            # 2. Prepare inputs for the model
            inputs = self.feature_extractor(
                audio_input,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )

            # 3. Inference (Prediction)
            with torch.no_grad():
                logits = self.model(**inputs).logits

            # 4. Convert logits to probabilities (Softmax)
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

            # Resolve the predicted class via the model's own label map
            # (labels might be "real"/"fake" or "bonafide"/"spoof").
            id2label = self.model.config.id2label
            predicted_id = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_id].item()
            label = id2label[predicted_id].lower()  # lowercase once, not per check

            # Map model output to API requirements
            if any(tag in label for tag in ("fake", "spoof", "ai")):
                is_ai = True
            elif any(tag in label for tag in ("real", "bonafide", "human")):
                is_ai = False
            else:
                # Fallback based on index (usually 1 is fake)
                is_ai = (predicted_id == 1)

            # 5. Construct Response
            if is_ai:
                classification = "AI_GENERATED"
                explanation = "Deep learning model detected synthetic vocal artifacts and unnatural spectral patterns."
            else:
                classification = "HUMAN"
                explanation = "Deep learning model verified natural micro-prosody and human vocal characteristics."

            return {
                "classification": classification,
                "confidenceScore": round(confidence, 2),
                "explanation": explanation
            }

        except Exception as e:
            # Fail-safe default: report HUMAN with zero confidence so the API
            # never crashes on bad audio. NOTE(review): this also masks real
            # failures as "human" — consider surfacing an error status instead.
            print(f"Analysis Error: {e}")
            return {
                "classification": "HUMAN",  # Fail-safe default
                "confidenceScore": 0.0,
                "explanation": f"Error during analysis: {str(e)}"
            }
|
main.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Header, HTTPException, Request
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
import uvicorn
|
| 4 |
+
import os
|
| 5 |
+
import traceback
|
| 6 |
+
from utils import decode_base64_audio, convert_mp3_to_wav
|
| 7 |
+
from detector import VoiceDetector
|
| 8 |
+
|
| 9 |
+
# Application setup: the detector is constructed once at import time because
# model loading is expensive and must not happen per-request.
app = FastAPI(title="AI Voice Detection API")
detector = VoiceDetector()

# Configuration
# NOTE(review): the hard-coded fallback key is a development convenience —
# ensure production deployments set the API_KEY environment variable.
API_KEY = os.getenv("API_KEY", "sk_test_123456789")
# Languages accepted by the /api/voice-detection endpoint.
SUPPORTED_LANGUAGES = ["Tamil", "English", "Hindi", "Malayalam", "Telugu"]
|
| 15 |
+
|
| 16 |
+
class DetectionRequest(BaseModel):
    """Request body for POST /api/voice-detection."""
    # Spoken language of the sample; must be one of SUPPORTED_LANGUAGES.
    language: str
    # Container format of the audio; only "mp3" is accepted by the endpoint.
    audioFormat: str
    # The audio file content, base64-encoded.
    audioBase64: str
|
| 20 |
+
|
| 21 |
+
@app.post("/api/voice-detection")
async def detect_voice(
    request: DetectionRequest,
    x_api_key: str = Header(None)
):
    """Classify a base64-encoded MP3 clip as HUMAN or AI_GENERATED.

    Auth: shared secret in the X-API-Key header (401 on mismatch).
    Validation: language must be supported and audioFormat must be mp3 (400).
    On processing failure, returns HTTP 200 with a {"status": "error"} payload.
    """
    import hmac  # local stdlib import: constant-time secret comparison only

    # 1. Authentication — compare_digest avoids leaking key bytes through
    # response-timing differences; guard None since Header(None) is optional.
    if x_api_key is None or not hmac.compare_digest(x_api_key, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API key or malformed request")

    # 2. Validation
    if request.language not in SUPPORTED_LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Language {request.language} not supported")

    if request.audioFormat.lower() != "mp3":
        raise HTTPException(status_code=400, detail="Only MP3 format is supported")

    try:
        # 3. Process Audio: decode base64, then transcode MP3 -> WAV
        mp3_buffer = decode_base64_audio(request.audioBase64)
        wav_buffer = convert_mp3_to_wav(mp3_buffer)

        # 4. Analyze
        result = detector.analyze(wav_buffer, request.language)

        # 5. Return Response
        return {
            "status": "success",
            "language": request.language,
            "classification": result["classification"],
            "confidenceScore": result["confidenceScore"],
            "explanation": result["explanation"]
        }

    except Exception as e:
        # NOTE(review): errors are returned as HTTP 200 with an error payload
        # rather than a 5xx — kept as-is since clients may depend on it.
        traceback.print_exc()
        return {
            "status": "error",
            "message": str(e)
        }
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
    # Local development entry point; in the Docker image, uvicorn is started
    # by the CMD on port 7860 instead.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework and server
fastapi
uvicorn
python-multipart
requests
# ML inference stack
torch
transformers
# Audio decoding / signal processing
librosa
numpy
scipy
pydub
|
utils.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import io
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
import tempfile
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
def decode_base64_audio(base64_string: str) -> io.BytesIO:
    """Decode a base64 string into a seekable bytes buffer.

    Accepts either a bare base64 payload or a data URI
    ("data:audio/mpeg;base64,...") — the prefix, if present, is stripped,
    so payloads captured directly from web clients work unchanged.

    Raises:
        binascii.Error: if the payload is not valid base64.
    """
    # Tolerate data-URI-style input: keep only the part after the first comma.
    if base64_string.startswith("data:") and "," in base64_string:
        base64_string = base64_string.split(",", 1)[1]
    audio_data = base64.b64decode(base64_string)
    return io.BytesIO(audio_data)
|
| 11 |
+
|
| 12 |
+
def convert_mp3_to_wav(mp3_buffer: io.BytesIO) -> io.BytesIO:
    """Transcode an in-memory MP3 buffer into an in-memory WAV buffer.

    Best-effort: if decoding or export fails for any reason (e.g. ffmpeg
    missing), the original MP3 buffer is rewound and returned unchanged —
    librosa downstream can read MP3 directly.
    """
    try:
        segment = AudioSegment.from_mp3(mp3_buffer)
        out = io.BytesIO()
        segment.export(out, format="wav")
        out.seek(0)
        return out
    except Exception:
        # Fallback: return the buffer as-is, librosa can handle MP3
        mp3_buffer.seek(0)
        return mp3_buffer
|