File size: 2,672 Bytes
b7efc93
 
 
 
 
 
877ea93
b7efc93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877ea93
b7efc93
877ea93
 
 
 
 
 
 
 
 
 
 
b7efc93
 
 
 
 
877ea93
 
b7efc93
877ea93
 
b7efc93
 
 
 
877ea93
 
 
b7efc93
877ea93
b7efc93
 
 
 
877ea93
b7efc93
 
 
 
877ea93
b7efc93
 
 
 
 
877ea93
 
 
 
 
 
b7efc93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877ea93
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from transformers import pipeline
import librosa
import numpy as np

# Module-level cache for the Hugging Face pipeline; populated lazily by
# load_model() so that importing this module stays cheap.
classifier = None


def load_model():
    """Return the shared audio-classification pipeline, creating it on first use.

    The pipeline is built lazily and cached in the module-level ``classifier``
    variable, so the (slow) model load happens at most once per process.
    """
    global classifier
    if classifier is not None:
        return classifier
    classifier = pipeline(
        "audio-classification",
        model="Hemgg/Deepfake-audio-detection",
        device=-1,  # -1 selects CPU inference
    )
    return classifier


def detect_audio(y: np.ndarray, sampling_rate: int = 16000) -> tuple[str, float, str]:
    """
    Detect whether an audio signal is AI_GENERATED or HUMAN.

    Args:
        y: Mono audio samples as a 1-D float array.
        sampling_rate: Sample rate of ``y`` in Hz. Defaults to 16000,
            which the previous hard-coded value assumed.

    Returns:
        Tuple of (classification, confidence score in [0, 1], explanation).
        On any failure this degrades gracefully to ("HUMAN", 0.50,
        <error explanation>) rather than raising.
    """
    # Guard: an empty or missing signal would otherwise fail deep inside
    # librosa/the model with an unhelpful error message.
    if y is None or len(y) == 0:
        return "HUMAN", 0.50, "Empty audio signal; treated as human."

    try:
        # Ensure the lazily-loaded model is available.
        model = load_model()

        # The pipeline accepts a dict of raw samples plus their sample rate.
        result = model({"array": y, "sampling_rate": sampling_rate})

        if not result:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        # Predictions are returned best-first; take the top one.
        top = result[0]
        label_lower = top["label"].lower()

        # Map the model's free-form label onto our two classes.
        # "aivoice" was removed from the keyword list: "ai" already
        # substring-matches it, so the extra entry was dead weight.
        # NOTE(review): substring matching on "ai" is broad (it would also
        # match e.g. "plain"); fine for this model's labels, but verify if
        # the model is ever swapped.
        if any(word in label_lower for word in ("ai", "fake", "synthetic")):
            classification = "AI_GENERATED"
        else:
            classification = "HUMAN"

        confidence = round(float(top["score"]), 3)

        # Secondary signal-level cues used only for the explanation text.
        flatness = librosa.feature.spectral_flatness(y=y).mean()
        pitch = librosa.yin(y, fmin=75, fmax=300)
        pitch_std = np.std(pitch) if len(pitch) > 0 else 0.0

        cues = []
        if flatness > 0.5:
            cues.append("unnatural high spectral flatness (robotic)")
        else:
            cues.append("natural spectral variation")

        if pitch_std < 10:
            cues.append("unnatural pitch consistency")
        else:
            cues.append("natural pitch variation")

        # Heuristic verdict from the signal features, independent of the
        # deep-learning model; used to phrase agreement vs. disagreement.
        feature_vote = (
            "AI_GENERATED"
            if (flatness > 0.5 and pitch_std < 10)
            else "HUMAN"
        )

        cues_text = " and ".join(cues)

        if feature_vote == classification:
            explanation = (
                f"{cues_text}, which aligns with the model prediction "
                f"of {classification.lower()} voice."
            )
        else:
            explanation = (
                f"{cues_text}. However, the deep learning model detected "
                f"patterns consistent with {classification.lower()} voice."
            )

        # BUG FIX: str.capitalize() lowercases everything after the first
        # character (turning "However" into "however" in the branch above).
        # Uppercase only the first character instead.
        explanation = explanation[0].upper() + explanation[1:]

        return classification, confidence, explanation

    except Exception as e:
        # Deliberate best-effort boundary: any failure in the model or the
        # feature extraction degrades to a neutral "HUMAN" verdict.
        return (
            "HUMAN",
            0.50,
            f"Analysis error: {str(e)}. Treated as human."
        )