from transformers import pipeline
import librosa
import numpy as np

# Lazily-created Hugging Face pipeline; populated by load_model() on first use.
classifier = None

# Substrings that mark a model label as "AI generated" (matched case-insensitively).
_AI_LABEL_KEYWORDS = ("ai", "fake", "synthetic", "aivoice")

# Heuristic thresholds used for the human-readable explanation.
_FLATNESS_THRESHOLD = 0.5   # mean spectral flatness above this is called "robotic"
_PITCH_STD_THRESHOLD = 10   # pitch standard deviation (Hz) below this is "too consistent"


def load_model():
    """Return the shared audio-classification pipeline, creating it on first use.

    The pipeline is cached in the module-level ``classifier`` global so it is
    constructed at most once per process (no locking is performed).
    """
    global classifier
    if classifier is None:
        classifier = pipeline(
            "audio-classification",
            model="Hemgg/Deepfake-audio-detection",
            device=-1,  # -1 selects the CPU in the transformers pipeline API
        )
    return classifier


def detect_audio(
    y: np.ndarray, sampling_rate: int = 16000
) -> tuple[str, float, str]:
    """Classify a waveform as ``"AI_GENERATED"`` or ``"HUMAN"``.

    The classification and confidence come from the deep-learning model's
    first (top) prediction. Spectral flatness and pitch variability are
    computed separately and are only used to word the explanation; they never
    override the model's label.

    Args:
        y: Audio samples, passed unchanged to both the model and librosa.
        sampling_rate: Sampling rate of ``y`` in Hz. Defaults to 16000, the
            value this function previously hard-coded.

    Returns:
        A ``(classification, confidence, explanation)`` tuple where
        ``confidence`` is the top prediction's score rounded to 3 decimals.
        If the model returns no predictions, or any step raises, the result
        is ``("HUMAN", 0.50, <message>)`` instead of propagating the error.
    """
    try:
        model = load_model()
        result = model({"array": y, "sampling_rate": sampling_rate})

        if not result:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        # The first entry is treated as the top prediction.
        top = result[0]
        label_lower = top["label"].lower()
        if any(word in label_lower for word in _AI_LABEL_KEYWORDS):
            classification = "AI_GENERATED"
        else:
            classification = "HUMAN"
        confidence = round(float(top["score"]), 3)

        # Hand-crafted acoustic features for the explanation text.
        flatness = librosa.feature.spectral_flatness(y=y).mean()
        # Pass the real sampling rate: librosa.yin otherwise assumes 22050 Hz,
        # which would mis-scale the pitch track relative to fmin/fmax and the
        # threshold below.
        pitch = librosa.yin(y, fmin=75, fmax=300, sr=sampling_rate)
        pitch_std = np.std(pitch) if len(pitch) > 0 else 0.0

        is_flat = flatness > _FLATNESS_THRESHOLD
        is_monotone = pitch_std < _PITCH_STD_THRESHOLD

        cues = [
            "unnatural high spectral flatness (robotic)"
            if is_flat
            else "natural spectral variation",
            "unnatural pitch consistency"
            if is_monotone
            else "natural pitch variation",
        ]
        cues_text = " and ".join(cues)

        # The features "vote" AI only when both cues look synthetic.
        feature_vote = "AI_GENERATED" if (is_flat and is_monotone) else "HUMAN"

        if feature_vote == classification:
            explanation = (
                f"{cues_text}, which aligns with the model prediction "
                f"of {classification.lower()} voice."
            )
        else:
            explanation = (
                f"{cues_text}. However, the deep learning model detected "
                f"patterns consistent with {classification.lower()} voice."
            )

        # Upper-case only the first character. str.capitalize() would also
        # lower-case the rest of the text (turning "However" into "however").
        explanation = explanation[:1].upper() + explanation[1:]

        return classification, confidence, explanation

    except Exception as e:
        # Deliberate best-effort fallback: the failure is reported to the
        # caller through the explanation string rather than raised.
        return (
            "HUMAN",
            0.50,
            f"Analysis error: {str(e)}. Treated as human.",
        )