Update models.py
Browse files
models.py
CHANGED
|
@@ -4,6 +4,7 @@ import numpy as np
|
|
| 4 |
|
| 5 |
classifier = None
|
| 6 |
|
|
|
|
| 7 |
def load_model():
|
| 8 |
global classifier
|
| 9 |
if classifier is None:
|
|
@@ -20,41 +21,59 @@ def detect_audio(y: np.ndarray) -> tuple[str, float, str]:
|
|
| 20 |
Detect if audio is AI_GENERATED or HUMAN.
|
| 21 |
Returns: classification, confidenceScore (0-1), explanation
|
| 22 |
"""
|
|
|
|
| 23 |
try:
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
if not result:
|
| 26 |
return "HUMAN", 0.50, "Insufficient audio features detected."
|
| 27 |
|
| 28 |
# Take top prediction
|
| 29 |
top = result[0]
|
| 30 |
-
label_lower = top[
|
| 31 |
-
top_score = top[
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
if any(word in label_lower for word in [
|
| 35 |
classification = "AI_GENERATED"
|
| 36 |
-
confidence = round(top_score, 3)
|
| 37 |
else:
|
| 38 |
classification = "HUMAN"
|
| 39 |
-
confidence = round(top_score, 3)
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
flatness = librosa.feature.spectral_flatness(y=y).mean()
|
|
|
|
| 43 |
pitch = librosa.yin(y, fmin=75, fmax=300)
|
| 44 |
pitch_std = np.std(pitch) if len(pitch) > 0 else 0.0
|
| 45 |
|
| 46 |
cues = []
|
|
|
|
| 47 |
if flatness > 0.5:
|
| 48 |
cues.append("unnatural high spectral flatness (robotic)")
|
| 49 |
else:
|
| 50 |
cues.append("natural spectral variation")
|
|
|
|
| 51 |
if pitch_std < 10:
|
| 52 |
cues.append("unnatural pitch consistency")
|
| 53 |
else:
|
| 54 |
cues.append("natural pitch variation")
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
feature_vote =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
cues_text = " and ".join(cues)
|
| 60 |
|
|
@@ -74,5 +93,9 @@ def detect_audio(y: np.ndarray) -> tuple[str, float, str]:
|
|
| 74 |
return classification, confidence, explanation
|
| 75 |
|
| 76 |
except Exception as e:
|
| 77 |
-
|
| 78 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
classifier = None
|
| 6 |
|
| 7 |
+
|
| 8 |
def load_model():
|
| 9 |
global classifier
|
| 10 |
if classifier is None:
|
|
|
|
| 21 |
Detect if audio is AI_GENERATED or HUMAN.
|
| 22 |
Returns: classification, confidenceScore (0-1), explanation
|
| 23 |
"""
|
| 24 |
+
|
| 25 |
try:
|
| 26 |
+
# ✅ Always ensure model is loaded
|
| 27 |
+
model = load_model()
|
| 28 |
+
|
| 29 |
+
# ✅ Pass correct input format
|
| 30 |
+
result = model(
|
| 31 |
+
{
|
| 32 |
+
"array": y,
|
| 33 |
+
"sampling_rate": 16000
|
| 34 |
+
}
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
if not result:
|
| 38 |
return "HUMAN", 0.50, "Insufficient audio features detected."
|
| 39 |
|
| 40 |
# Take top prediction
|
| 41 |
top = result[0]
|
| 42 |
+
label_lower = top["label"].lower()
|
| 43 |
+
top_score = top["score"]
|
| 44 |
|
| 45 |
+
# Label mapping
|
| 46 |
+
if any(word in label_lower for word in ["ai", "fake", "synthetic", "aivoice"]):
|
| 47 |
classification = "AI_GENERATED"
|
|
|
|
| 48 |
else:
|
| 49 |
classification = "HUMAN"
|
|
|
|
| 50 |
|
| 51 |
+
confidence = round(float(top_score), 3)
|
| 52 |
+
|
| 53 |
+
# Feature analysis
|
| 54 |
flatness = librosa.feature.spectral_flatness(y=y).mean()
|
| 55 |
+
|
| 56 |
pitch = librosa.yin(y, fmin=75, fmax=300)
|
| 57 |
pitch_std = np.std(pitch) if len(pitch) > 0 else 0.0
|
| 58 |
|
| 59 |
cues = []
|
| 60 |
+
|
| 61 |
if flatness > 0.5:
|
| 62 |
cues.append("unnatural high spectral flatness (robotic)")
|
| 63 |
else:
|
| 64 |
cues.append("natural spectral variation")
|
| 65 |
+
|
| 66 |
if pitch_std < 10:
|
| 67 |
cues.append("unnatural pitch consistency")
|
| 68 |
else:
|
| 69 |
cues.append("natural pitch variation")
|
| 70 |
|
| 71 |
+
# Feature vote
|
| 72 |
+
feature_vote = (
|
| 73 |
+
"AI_GENERATED"
|
| 74 |
+
if (flatness > 0.5 and pitch_std < 10)
|
| 75 |
+
else "HUMAN"
|
| 76 |
+
)
|
| 77 |
|
| 78 |
cues_text = " and ".join(cues)
|
| 79 |
|
|
|
|
| 93 |
return classification, confidence, explanation
|
| 94 |
|
| 95 |
except Exception as e:
|
| 96 |
+
|
| 97 |
+
return (
|
| 98 |
+
"HUMAN",
|
| 99 |
+
0.50,
|
| 100 |
+
f"Analysis error: {str(e)}. Treated as human."
|
| 101 |
+
)
|