# models.py — deepfake audio detection (uploaded by Devved11, commit 877ea93)
from transformers import pipeline
import librosa
import numpy as np
# Lazily-instantiated singleton so the (expensive) pipeline is built only once.
classifier = None


def load_model():
    """Return the shared audio-classification pipeline, creating it on first use.

    Loads Hemgg/Deepfake-audio-detection and pins it to CPU (device=-1).
    """
    global classifier
    if classifier is None:
        classifier = pipeline(
            "audio-classification",
            model="Hemgg/Deepfake-audio-detection",
            device=-1,  # -1 == CPU
        )
    return classifier
def detect_audio(y: np.ndarray) -> tuple[str, float, str]:
    """Classify an audio signal as AI-generated or human speech.

    Args:
        y: Mono waveform samples. Assumed to be sampled at 16 kHz (the rate
           advertised to the classifier) — confirm against callers.

    Returns:
        (classification, confidence, explanation) where classification is
        "AI_GENERATED" or "HUMAN", confidence is a 0-1 float rounded to
        3 decimals, and explanation is a human-readable sentence.

    Never raises: any failure is reported as ("HUMAN", 0.50, <error text>).
    """
    SAMPLE_RATE = 16000
    try:
        # Ensure the (lazily created) model is available.
        model = load_model()

        # The HF audio-classification pipeline accepts a dict carrying the
        # raw samples plus their sampling rate.
        result = model({"array": y, "sampling_rate": SAMPLE_RATE})
        if not result:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        # Predictions come sorted by score; take the top one.
        top = result[0]
        label_lower = top["label"].lower()

        # Map the model's label vocabulary onto our two classes.
        # (The former "aivoice" keyword was redundant: "ai" already matches it.)
        if any(word in label_lower for word in ("ai", "fake", "synthetic")):
            classification = "AI_GENERATED"
        else:
            classification = "HUMAN"
        confidence = round(float(top["score"]), 3)

        # Signal-level cues to corroborate (or contradict) the model.
        flatness = librosa.feature.spectral_flatness(y=y).mean()
        # BUG FIX: pass sr explicitly — librosa.yin defaults to sr=22050,
        # which mis-scales pitch estimates for 16 kHz audio.
        pitch = librosa.yin(y, fmin=75, fmax=300, sr=SAMPLE_RATE)
        pitch_std = float(np.std(pitch)) if pitch.size > 0 else 0.0

        cues = []
        if flatness > 0.5:
            cues.append("unnatural high spectral flatness (robotic)")
        else:
            cues.append("natural spectral variation")
        if pitch_std < 10:
            cues.append("unnatural pitch consistency")
        else:
            cues.append("natural pitch variation")

        # Feature-based vote: AI only when BOTH cues look synthetic.
        feature_vote = (
            "AI_GENERATED" if (flatness > 0.5 and pitch_std < 10) else "HUMAN"
        )

        cues_text = " and ".join(cues)
        if feature_vote == classification:
            explanation = (
                f"{cues_text}, which aligns with the model prediction "
                f"of {classification.lower()} voice."
            )
        else:
            explanation = (
                f"{cues_text}. However, the deep learning model detected "
                f"patterns consistent with {classification.lower()} voice."
            )
        # BUG FIX: str.capitalize() lowercases everything after the first
        # character (mangling "However", etc.); upper-case only the first
        # character instead. cues_text is always non-empty, so [0] is safe.
        explanation = explanation[0].upper() + explanation[1:]

        return classification, confidence, explanation
    except Exception as e:
        # Boundary handler: analysis must never crash the caller; default to
        # a neutral "HUMAN" verdict with the error embedded in the text.
        return "HUMAN", 0.50, f"Analysis error: {str(e)}. Treated as human."