File size: 2,672 Bytes
b7efc93
 
 
 
 
 
877ea93
b7efc93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877ea93
b7efc93
877ea93
 
 
 
 
 
 
 
 
 
 
b7efc93
 
 
 
 
877ea93
 
b7efc93
877ea93
 
b7efc93
 
 
 
877ea93
 
 
b7efc93
877ea93
b7efc93
 
 
 
877ea93
b7efc93
 
 
 
877ea93
b7efc93
 
 
 
 
877ea93
 
 
 
 
 
b7efc93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877ea93
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from transformers import pipeline
import librosa
import numpy as np

# Module-level cache for the Hugging Face pipeline; populated lazily by
# load_model() so that importing this module stays cheap.
classifier = None


def load_model():
    """Return the shared audio-classification pipeline, creating it on first use.

    The pipeline is built lazily and cached in the module-level ``classifier``
    variable, so the (slow) model load happens at most once per process.
    """
    global classifier
    if classifier is not None:
        return classifier
    classifier = pipeline(
        "audio-classification",
        model="Hemgg/Deepfake-audio-detection",
        device=-1,  # -1 selects CPU inference
    )
    return classifier


def detect_audio(y: np.ndarray, sampling_rate: int = 16000) -> tuple[str, float, str]:
    """
    Detect whether an audio signal is AI_GENERATED or HUMAN.

    Args:
        y: Mono audio samples as a 1-D float array.
        sampling_rate: Sample rate of ``y`` in Hz. Defaults to 16000,
            which the previous hard-coded value assumed.

    Returns:
        Tuple of (classification, confidence score in [0, 1], explanation).
        On any failure this degrades gracefully to ("HUMAN", 0.50,
        <error explanation>) rather than raising.
    """
    # Guard: an empty or missing signal would otherwise fail deep inside
    # librosa/the model with an unhelpful error message.
    if y is None or len(y) == 0:
        return "HUMAN", 0.50, "Empty audio signal; treated as human."

    try:
        # Ensure the lazily-loaded model is available.
        model = load_model()

        # The pipeline accepts a dict of raw samples plus their sample rate.
        result = model({"array": y, "sampling_rate": sampling_rate})

        if not result:
            return "HUMAN", 0.50, "Insufficient audio features detected."

        # Predictions are returned best-first; take the top one.
        top = result[0]
        label_lower = top["label"].lower()

        # Map the model's free-form label onto our two classes.
        # "aivoice" was removed from the keyword list: "ai" already
        # substring-matches it, so the extra entry was dead weight.
        # NOTE(review): substring matching on "ai" is broad (it would also
        # match e.g. "plain"); fine for this model's labels, but verify if
        # the model is ever swapped.
        if any(word in label_lower for word in ("ai", "fake", "synthetic")):
            classification = "AI_GENERATED"
        else:
            classification = "HUMAN"

        confidence = round(float(top["score"]), 3)

        # Secondary signal-level cues used only for the explanation text.
        flatness = librosa.feature.spectral_flatness(y=y).mean()
        pitch = librosa.yin(y, fmin=75, fmax=300)
        pitch_std = np.std(pitch) if len(pitch) > 0 else 0.0

        cues = []
        if flatness > 0.5:
            cues.append("unnatural high spectral flatness (robotic)")
        else:
            cues.append("natural spectral variation")

        if pitch_std < 10:
            cues.append("unnatural pitch consistency")
        else:
            cues.append("natural pitch variation")

        # Heuristic verdict from the signal features, independent of the
        # deep-learning model; used to phrase agreement vs. disagreement.
        feature_vote = (
            "AI_GENERATED"
            if (flatness > 0.5 and pitch_std < 10)
            else "HUMAN"
        )

        cues_text = " and ".join(cues)

        if feature_vote == classification:
            explanation = (
                f"{cues_text}, which aligns with the model prediction "
                f"of {classification.lower()} voice."
            )
        else:
            explanation = (
                f"{cues_text}. However, the deep learning model detected "
                f"patterns consistent with {classification.lower()} voice."
            )

        # BUG FIX: str.capitalize() lowercases everything after the first
        # character (turning "However" into "however" in the branch above).
        # Uppercase only the first character instead.
        explanation = explanation[0].upper() + explanation[1:]

        return classification, confidence, explanation

    except Exception as e:
        # Deliberate best-effort boundary: any failure in the model or the
        # feature extraction degrades to a neutral "HUMAN" verdict.
        return (
            "HUMAN",
            0.50,
            f"Analysis error: {str(e)}. Treated as human."
        )