# voice_detection/real_detector.py
# Commit 4f7e968 (ranar110): update response format to match the
# Multi-Language problem statement (adds the "classification" key).
import os
import warnings
# Silence noisy library warnings (transformers/librosa emit many deprecation notices).
warnings.filterwarnings("ignore")
# Process-wide cache: holds "model" and "feature_extractor" after the first load.
MODEL_CACHE = {}
MODEL_NAME = "MelodyMachine/Deepfake-audio-detection" # Hugging Face audio-deepfake classifier
def load_model():
    """Return (model, feature_extractor), loading and caching them on first use.

    On any failure (missing dependency, no network, out of memory) a
    message is printed and (None, None) is returned so callers can fall
    back gracefully.
    """
    if MODEL_CACHE.get("model") is not None:
        # Already loaded in this process — reuse the cached pair.
        return MODEL_CACHE["model"], MODEL_CACHE["feature_extractor"]

    print(f"Loading model: {MODEL_NAME}...")
    try:
        # Imported lazily so importing this module stays fast
        # (avoids startup timeout).
        from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

        extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
        # low_cpu_mem_usage keeps peak RAM down while weights stream in.
        classifier = AutoModelForAudioClassification.from_pretrained(
            MODEL_NAME,
            low_cpu_mem_usage=True
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

    MODEL_CACHE["feature_extractor"] = extractor
    MODEL_CACHE["model"] = classifier
    print("Model loaded successfully.")
    return MODEL_CACHE["model"], MODEL_CACHE["feature_extractor"]
def preprocess_audio(file_path, max_duration=10):
    """Decode *file_path* into a mono 16 kHz waveform, capped at *max_duration* s.

    Returns (samples, sample_rate) on success, or (None, None) when the
    file cannot be decoded (or librosa is unavailable).
    """
    try:
        # Imported lazily to keep module import lightweight.
        import librosa
        import numpy as np

        # 16 kHz mono is what Wav2Vec2-style feature extractors expect.
        samples, rate = librosa.load(file_path, sr=16000, duration=max_duration)
    except Exception as exc:
        print(f"Error preprocessing audio: {exc}")
        return None, None
    return samples, rate
def analyze_audio_real(metadata):
    """Classify the audio referenced by *metadata* as human or AI-generated.

    Runs real model inference (Wav2Vec2-style audio classifier) instead of
    mock logic.

    Parameters
    ----------
    metadata : dict
        Must contain 'file_path'; may contain 'duration_seconds', used to
        bound the reported segment end time.

    Returns
    -------
    dict
        On success: is_human, classification ("HUMAN"/"AI_GENERATED"),
        confidence, detected_language, model_used, raw_label, segments.
        On failure: error, is_human=None, confidence=0.0.
    """
    # Lazy import so importing this module never pulls torch eagerly.
    import torch

    file_path = metadata.get('file_path')
    if not file_path or not os.path.exists(file_path):
        return {
            "error": "File not found",
            "is_human": None,
            "confidence": 0.0
        }

    model, feature_extractor = load_model()
    if not model or not feature_extractor:
        # Fallback if the model fails to load (e.g. no internet/memory).
        return {
            "error": "Model failed to load",
            "is_human": None,
            "confidence": 0.0
        }

    try:
        audio, sr = preprocess_audio(file_path)
        if audio is None:
            # Bug fix: include "confidence" so every error response shares
            # the same shape as the other failure paths above.
            return {"error": "Invalid audio file", "is_human": None, "confidence": 0.0}

        inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

        # Inference without gradient tracking (saves memory and time).
        with torch.no_grad():
            logits = model(**inputs).logits

        # Softmax turns logits into class probabilities.
        probs = torch.nn.functional.softmax(logits, dim=-1)
        predicted_id = torch.argmax(logits, dim=-1).item()
        confidence = probs[0][predicted_id].item()

        predicted_label = model.config.id2label[predicted_id]
        # Labels containing "real" or "bona(-fide)" denote genuine human speech.
        is_human = "real" in predicted_label.lower() or "bona" in predicted_label.lower()

        return {
            "is_human": is_human,
            "classification": "HUMAN" if is_human else "AI_GENERATED",
            "confidence": round(confidence, 4),
            "detected_language": "analyzed",
            "model_used": MODEL_NAME,
            "raw_label": predicted_label,
            "segments": [
                # Only the first 10 s of audio is analyzed (preprocess cap).
                {"start": 0.0, "end": min(metadata.get('duration_seconds', 0), 10.0), "label": predicted_label}
            ]
        }
    except Exception as e:
        print(f"Inference error: {e}")
        return {
            "error": str(e),
            "is_human": None,
            "confidence": 0.0
        }