# voice-detection-v2 / detector.py
# (Hugging Face upload by testing-ak — commit "Create detector.py", 53bc43f verified)
import torch
import librosa
import numpy as np
import io
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
class VoiceDetector:
    """Classifies audio clips as AI-generated or human speech.

    Wraps the Hugging Face ``MelodyMachine/Deepfake-audio-detection-V2``
    audio-classification model together with its feature extractor.
    """

    def __init__(self):
        """Download/load the detection model and put it in eval mode.

        Raises:
            Exception: re-raised unchanged if model loading fails.
        """
        print("⏳ Loading Robust AI Detection Model...")
        # FIX: Using the verified MelodyMachine model
        self.model_name = "MelodyMachine/Deepfake-audio-detection-V2"
        try:
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            # Inference mode: disables dropout / batch-norm updates.
            self.model.eval()
            # Print labels to debug (so we know what index 0 and 1 mean).
            print(f"βœ… Model Labels: {self.model.config.id2label}")
        except Exception as e:
            print(f"❌ CRITICAL ERROR: Failed to load AI model. {e}")
            raise e

    def preprocess_audio(self, audio_buffer: io.BytesIO, target_sr: int = 16000) -> np.ndarray:
        """Decode, resample, normalize, and duration-fix an audio clip.

        Args:
            audio_buffer: In-memory encoded audio (any format librosa can read).
            target_sr: Sample rate to resample to (model expects 16 kHz).

        Returns:
            1-D float waveform, peak-normalized, between 1.5 s and 5 s long.

        Raises:
            ValueError: if the buffer decodes to zero samples.
        """
        audio_buffer.seek(0)
        y, sr = librosa.load(audio_buffer, sr=target_sr)
        # FIX: guard empty audio — without this, the tile computation below
        # divides by len(y) == 0 and raises ZeroDivisionError.
        if len(y) == 0:
            raise ValueError("Audio buffer decoded to zero samples.")
        # 1. Normalize volume (crucial for quiet clips).
        y = librosa.util.normalize(y)
        # 2. Fix duration: if too short (< 1.5 s), loop the clip until it
        #    reaches the minimum length the model handles well.
        min_len = target_sr * 1.5
        if len(y) < min_len:
            # FIX: plain int() — no need for the module-level int_ shim.
            tile_factor = int(np.ceil(min_len / len(y)))
            y = np.tile(y, tile_factor)
        # 3. Limit duration: if > 5 s, keep the middle 5 s.
        #    Long files confuse the model logic if not chunked.
        max_len = target_sr * 5
        if len(y) > max_len:
            start = (len(y) - max_len) // 2
            y = y[start : start + max_len]
        return y

    def analyze(self, audio_buffer: io.BytesIO, language: str) -> dict:
        """Run the detector on a clip and return a classification dict.

        Args:
            audio_buffer: In-memory encoded audio clip.
            language: Unused by the model; kept for interface compatibility.

        Returns:
            Dict with keys ``classification`` ("AI_GENERATED" or "HUMAN"),
            ``confidenceScore`` (0.0-1.0, rounded to 2 dp), and
            ``explanation``. On any error, fails open as "HUMAN" with
            score 0.0 and the error text in ``explanation``.
        """
        try:
            # 1. Preprocess to a fixed-rate, bounded-length waveform.
            audio_input = self.preprocess_audio(audio_buffer)
            # 2. Prepare model input.
            inputs = self.feature_extractor(
                audio_input,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True,
            )
            # 3. Inference (no autograd needed).
            with torch.no_grad():
                logits = self.model(**inputs).logits
            # 4. Convert logits to probabilities.
            probs = F.softmax(logits, dim=-1)

            # 5. Map scores by label name; this survives either label order.
            id2label = self.model.config.id2label
            fake_score = 0.0
            real_score = 0.0
            found_fake = False
            found_real = False
            for idx, label in id2label.items():
                label_lower = str(label).lower()
                if "fake" in label_lower or "spoof" in label_lower:
                    fake_score = probs[0][idx].item()
                    found_fake = True
                elif "real" in label_lower or "bonafide" in label_lower:
                    real_score = probs[0][idx].item()
                    found_real = True
            if not (found_fake or found_real):
                # FIX: fallback now keys on "no label name matched" rather than
                # on both scores being exactly 0.0, which could misfire.
                # Generic labels ("LABEL_0"/"LABEL_1"): use the conventional
                # MelodyMachine ordering — index 0 = real, index 1 = fake.
                real_score = probs[0][0].item()
                fake_score = probs[0][1].item()
            print(f"πŸ” DEBUG: Real Score: {real_score:.4f} | Fake Score: {fake_score:.4f}")

            # 6. Decision: whichever class has the higher probability wins.
            if fake_score > real_score:
                classification = "AI_GENERATED"
                confidence = fake_score
                explanation = f"Detected synthetic artifacts with {int(fake_score*100)}% confidence."
            else:
                classification = "HUMAN"
                confidence = real_score
                explanation = f"Verified human vocal characteristics with {int(real_score*100)}% confidence."
            return {
                "classification": classification,
                "confidenceScore": round(confidence, 2),
                "explanation": explanation,
            }
        except Exception as e:
            # NOTE(review): errors fail open as "HUMAN" with score 0.0 —
            # preserved from the original contract; callers can detect
            # failure via the 0.0 score / "Error:" explanation prefix.
            print(f"Analysis Error: {e}")
            return {
                "classification": "HUMAN",
                "confidenceScore": 0.0,
                "explanation": f"Error: {str(e)}",
            }
def int_(val):
    """Coerce *val* (e.g. the numpy scalar returned by ``np.ceil``) to a builtin int."""
    return int(val)