|
|
from transformers import pipeline |
|
|
import librosa |
|
|
import numpy as np |
|
|
|
|
|
# Module-level cache for the audio-classification pipeline. It stays None
# until load_model() is called for the first time.
classifier = None


def load_model():
    """Return the shared audio-classification pipeline, creating it on first use.

    The pipeline is stored in the module-level ``classifier`` variable, so
    only the first call pays the construction cost; later calls return the
    cached object unchanged.

    Returns:
        The object produced by ``transformers.pipeline`` for the
        "audio-classification" task with the
        "Hemgg/Deepfake-audio-detection" model.
    """
    global classifier

    # Fast path: something has already been cached, hand it back as-is.
    if classifier is not None:
        return classifier

    classifier = pipeline(
        "audio-classification",
        model="Hemgg/Deepfake-audio-detection",
        device=-1,  # -1 is the transformers convention for running on CPU
    )
    return classifier
|
|
|
|
|
|
|
|
def _build_explanation(classification: str, flatness: float, pitch_std: float) -> str:
    """Describe the acoustic cues and relate them to the model's verdict.

    Args:
        classification: The model's verdict, "AI_GENERATED" or "HUMAN".
        flatness: Mean spectral flatness of the clip.
        pitch_std: Standard deviation of the estimated pitch track.

    Returns:
        A human-readable explanation with its first letter upper-cased.
    """
    flat_is_robotic = flatness > 0.5
    pitch_is_monotone = pitch_std < 10

    cues = [
        "unnatural high spectral flatness (robotic)"
        if flat_is_robotic
        else "natural spectral variation",
        "unnatural pitch consistency"
        if pitch_is_monotone
        else "natural pitch variation",
    ]
    cues_text = " and ".join(cues)

    # The hand-crafted features only vote "AI" when both cues look synthetic.
    feature_vote = (
        "AI_GENERATED" if (flat_is_robotic and pitch_is_monotone) else "HUMAN"
    )

    if feature_vote == classification:
        explanation = (
            f"{cues_text}, which aligns with the model prediction "
            f"of {classification.lower()} voice."
        )
    else:
        explanation = (
            f"{cues_text}. However, the deep learning model detected "
            f"patterns consistent with {classification.lower()} voice."
        )

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest and turn the second sentence into "however, ...".
    return explanation[:1].upper() + explanation[1:]


def detect_audio(y: np.ndarray, sr: int = 16000) -> tuple[str, float, str]:
    """Classify an audio clip as AI_GENERATED or HUMAN.

    The verdict and confidence come from the model returned by
    ``load_model()``. Spectral flatness and pitch variability are computed
    with librosa only to build the textual explanation; they never change
    the classification.

    Args:
        y: Audio samples. Assumed to be a 1-D mono float waveform, as
            expected by the pipeline and librosa -- confirm against callers.
        sr: Sampling rate of ``y`` in Hz. Defaults to 16000, the value this
            function previously hard-coded.

    Returns:
        A ``(classification, confidence, explanation)`` tuple where
        ``classification`` is "AI_GENERATED" or "HUMAN", ``confidence`` is
        the top label's score rounded to 3 decimals, and ``explanation`` is
        a short sentence. Empty input or an empty model result yields
        ``("HUMAN", 0.5, "Insufficient audio features detected.")``. Any
        exception is reported in the explanation and treated as
        ``("HUMAN", 0.5, ...)`` rather than raised.
    """
    try:
        # Nothing to analyse: answer directly instead of letting the model
        # or librosa fail on an empty buffer.
        if y is None or np.size(y) == 0:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        model = load_model()
        result = model({"array": y, "sampling_rate": sr})

        if not result:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        # The first entry is taken as the model's top prediction.
        top = result[0]
        label_lower = top["label"].lower()

        is_ai = any(
            word in label_lower
            for word in ("ai", "fake", "synthetic", "aivoice")
        )
        classification = "AI_GENERATED" if is_ai else "HUMAN"
        confidence = round(float(top["score"]), 3)

        flatness = float(librosa.feature.spectral_flatness(y=y).mean())

        # Pass the real sampling rate: librosa.yin defaults to sr=22050,
        # which would skew every pitch estimate for audio at another rate.
        pitch = librosa.yin(y, fmin=75, fmax=300, sr=sr)
        pitch_std = float(np.std(pitch)) if len(pitch) > 0 else 0.0

        explanation = _build_explanation(classification, flatness, pitch_std)
        return classification, confidence, explanation

    except Exception as e:
        # Deliberate best-effort boundary: a failed analysis must not crash
        # the caller, so report the error and fall back to HUMAN.
        return (
            "HUMAN",
            0.50,
            f"Analysis error: {e}. Treated as human.",
        )
|
|
|