Spaces:
Sleeping
Sleeping
Commit ·
04f3da2
1
Parent(s): 9d2870b
Implement supreme Hybrid AI detection: sliding window, spectral and temporal features, and language-aware confidence.
Browse files- app/infer.py +79 -59
- app/main.py +1 -1
- test_full_audio.py +39 -0
app/infer.py
CHANGED
|
@@ -29,89 +29,109 @@ class VoiceClassifier:
|
|
| 29 |
print(f"Error loading model: {e}")
|
| 30 |
self.model = None
|
| 31 |
|
| 32 |
-
def predict(self, waveform: torch.Tensor):
|
| 33 |
if self.model is None:
|
| 34 |
return {"error": "Model not loaded"}
|
| 35 |
|
| 36 |
try:
|
| 37 |
# 1. Preprocess Audio
|
| 38 |
-
# Waveform is [1, T] Tensor. Convert to numpy [T]
|
| 39 |
wav_np = waveform.squeeze().cpu().numpy()
|
|
|
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
)
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
# 2. Model Inference
|
| 54 |
-
with torch.no_grad():
|
| 55 |
-
outputs = self.model(**inputs)
|
| 56 |
-
logits = outputs.logits
|
| 57 |
-
probs = torch.softmax(logits, dim=-1)
|
| 58 |
-
|
| 59 |
-
# Logic for this specific model:
|
| 60 |
-
# Label 0: 'fake' (AI)
|
| 61 |
-
# Label 1: 'real' (Human)
|
| 62 |
-
prob_fake = probs[0][0].item()
|
| 63 |
-
prob_real = probs[0][1].item()
|
| 64 |
-
|
| 65 |
t1 = time.time()
|
| 66 |
-
print(f"DEBUG:
|
| 67 |
-
print(f"DEBUG:
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
sr=16000,
|
| 76 |
-
frame_length=2048
|
| 77 |
-
)
|
| 78 |
-
f0 = f0[~np.isnan(f0)]
|
| 79 |
-
pitch_var = np.std(f0) if len(f0) > 0 else 0.0
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
if prob_fake >
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
| 94 |
|
| 95 |
-
print(f"DEBUG: prediction={prediction}, confidence={confidence:.6f}, prob_ai={prob_ai:.6f}")
|
| 96 |
-
|
| 97 |
# Construct Explanation
|
| 98 |
if prediction == "AI_GENERATED":
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
else:
|
| 104 |
-
if pitch_var >
|
| 105 |
-
explanation = f"
|
| 106 |
else:
|
| 107 |
-
explanation = f"
|
| 108 |
|
| 109 |
return {
|
| 110 |
"prediction": prediction,
|
| 111 |
-
"probability_ai": float(f"{
|
| 112 |
"confidence": float(f"{confidence:.4f}"),
|
| 113 |
"features": {
|
| 114 |
-
"pitch_variance": float(f"{pitch_var:.2f}")
|
|
|
|
|
|
|
|
|
|
| 115 |
},
|
| 116 |
"explanation": explanation
|
| 117 |
}
|
|
|
|
| 29 |
print(f"Error loading model: {e}")
|
| 30 |
self.model = None
|
| 31 |
|
| 32 |
+
def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
|
| 33 |
if self.model is None:
|
| 34 |
return {"error": "Model not loaded"}
|
| 35 |
|
| 36 |
try:
|
| 37 |
# 1. Preprocess Audio
|
|
|
|
| 38 |
wav_np = waveform.squeeze().cpu().numpy()
|
| 39 |
+
sr = 16000
|
| 40 |
|
| 41 |
+
# --- ADVANCED FEATURE EXTRACTION ---
|
| 42 |
+
t0 = time.time()
|
| 43 |
+
|
| 44 |
+
# A. Pitch Analysis
|
| 45 |
+
f0, voiced_flag, voiced_probs = librosa.pyin(
|
| 46 |
+
wav_np, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
|
| 47 |
)
|
| 48 |
+
f0_clean = f0[~np.isnan(f0)]
|
| 49 |
+
pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
|
| 50 |
|
| 51 |
+
# B. Spectral Flatness (Detects vocoder buzz)
|
| 52 |
+
flatness = np.mean(librosa.feature.spectral_flatness(y=wav_np))
|
| 53 |
|
| 54 |
+
# C. RMS Energy Variance (Detects flattened volume envelopes)
|
| 55 |
+
rms = librosa.feature.rms(y=wav_np)[0]
|
| 56 |
+
rms_var = np.std(rms) / (np.mean(rms) + 1e-6) # Normalized variance
|
| 57 |
+
|
| 58 |
+
# D. Zero Crossing Rate Variance (Detects robotic vowel transitions)
|
| 59 |
+
zcr = librosa.feature.zero_crossing_rate(wav_np)[0]
|
| 60 |
+
zcr_var = np.std(zcr)
|
| 61 |
+
|
| 62 |
+
# --- TEMPORAL CONSISTENCY (SLIDING WINDOW) ---
|
| 63 |
+
chunk_size = 2 * sr # 2 seconds
|
| 64 |
+
stride = 1 * sr # 1 second overlap
|
| 65 |
+
chunks = []
|
| 66 |
+
for i in range(0, len(wav_np) - chunk_size + 1, stride):
|
| 67 |
+
chunks.append(wav_np[i : i + chunk_size])
|
| 68 |
+
|
| 69 |
+
# If audio too short for stride, just use whole thing
|
| 70 |
+
if not chunks:
|
| 71 |
+
chunks = [wav_np]
|
| 72 |
+
|
| 73 |
+
chunk_probs = []
|
| 74 |
+
for chunk in chunks:
|
| 75 |
+
inputs = self.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
|
| 76 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 77 |
+
with torch.no_grad():
|
| 78 |
+
outputs = self.model(**inputs)
|
| 79 |
+
probs = torch.softmax(outputs.logits, dim=-1)
|
| 80 |
+
chunk_probs.append(probs[0][0].item()) # Probability of 'fake'
|
| 81 |
+
|
| 82 |
+
# Authority calculation
|
| 83 |
+
# We take the MAX probability across chunks to catch 'slips' in AI generation
|
| 84 |
+
prob_fake = np.max(chunk_probs)
|
| 85 |
+
prob_real = 1.0 - prob_fake
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
t1 = time.time()
|
| 88 |
+
print(f"DEBUG: Analysis took {t1 - t0:.3f}s. Multi-chunk prob_fake: {prob_fake:.4f}")
|
| 89 |
+
print(f"DEBUG: Features - PitchVar: {pitch_var:.1f}, Flatness: {flatness:.4f}, RMS_Var: {rms_var:.4f}")
|
| 90 |
|
| 91 |
+
# --- HYBRID HEURISTIC STRENGTHENING ---
|
| 92 |
+
# AI Voices often have VERY low flatness or VERY low pitch variance
|
| 93 |
+
ai_signal_flags = 0
|
| 94 |
+
if pitch_var < 15.0: ai_signal_flags += 1
|
| 95 |
+
if flatness < 0.005: ai_signal_flags += 1 # Very tonal/melodic
|
| 96 |
+
if rms_var < 0.1: ai_signal_flags += 1 # Robotic volume
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
# Final Verdict Logic
|
| 99 |
+
# If the model is unsure (0.4-0.6) but signal flags are high, tip to AI
|
| 100 |
+
if 0.4 < prob_fake < 0.6 and ai_signal_flags >= 2:
|
| 101 |
+
prob_fake = 0.75
|
| 102 |
+
|
| 103 |
+
prediction = "AI_GENERATED" if prob_fake > 0.5 else "HUMAN"
|
| 104 |
+
confidence = prob_fake if prediction == "AI_GENERATED" else prob_real
|
| 105 |
+
|
| 106 |
+
# --- LANGUAGE AWARENESS ---
|
| 107 |
+
# If it's a non-English language, the model might be slightly less reliable
|
| 108 |
+
# We dampen confidence on low-resource languages to prevent false accusations
|
| 109 |
+
is_english = language.lower() in ["english", "en"]
|
| 110 |
+
if not is_english and confidence < 0.85:
|
| 111 |
+
confidence *= 0.95 # Slight dampening
|
| 112 |
|
|
|
|
|
|
|
| 113 |
# Construct Explanation
|
| 114 |
if prediction == "AI_GENERATED":
|
| 115 |
+
reasons = []
|
| 116 |
+
if ai_signal_flags >= 2: reasons.append("synthetic spectral characteristics")
|
| 117 |
+
if pitch_var < 20: reasons.append("lack of natural prosody")
|
| 118 |
+
if not reasons: reasons.append("digital vocoder artifacts")
|
| 119 |
+
explanation = f"AI detected with {confidence*100:.1f}% confidence. Evidence: {', '.join(reasons)}."
|
| 120 |
else:
|
| 121 |
+
if pitch_var > 25.0:
|
| 122 |
+
explanation = f"Human verified with {confidence*100:.1f}% confidence. Strong natural pitch variance and human vocal dynamics detected."
|
| 123 |
else:
|
| 124 |
+
explanation = f"Audio likely Human ({confidence*100:.1f}%). Detected natural speech fluctuations despite localized artifacts."
|
| 125 |
|
| 126 |
return {
|
| 127 |
"prediction": prediction,
|
| 128 |
+
"probability_ai": float(f"{prob_fake:.4f}"),
|
| 129 |
"confidence": float(f"{confidence:.4f}"),
|
| 130 |
"features": {
|
| 131 |
+
"pitch_variance": float(f"{pitch_var:.2f}"),
|
| 132 |
+
"spectral_flatness": float(f"{flatness:.6f}"),
|
| 133 |
+
"rms_variance": float(f"{rms_var:.4f}"),
|
| 134 |
+
"zcr_variance": float(f"{zcr_var:.4f}")
|
| 135 |
},
|
| 136 |
"explanation": explanation
|
| 137 |
}
|
app/main.py
CHANGED
|
@@ -73,7 +73,7 @@ async def detect_voice(
|
|
| 73 |
raise HTTPException(status_code=400, detail="Could not process audio.")
|
| 74 |
|
| 75 |
# 4. Predict
|
| 76 |
-
result = classifier_instance.predict(waveform)
|
| 77 |
|
| 78 |
if "error" in result:
|
| 79 |
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
| 73 |
raise HTTPException(status_code=400, detail="Could not process audio.")
|
| 74 |
|
| 75 |
# 4. Predict
|
| 76 |
+
result = classifier_instance.predict(waveform, language=request_data.language)
|
| 77 |
|
| 78 |
if "error" in result:
|
| 79 |
raise HTTPException(status_code=500, detail=result["error"])
|
test_full_audio.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
def test_full_audio():
    """POST a complete MP3 sample (base64-encoded) to the local
    voice-detection endpoint and print the verdict.

    Manual integration test: requires the API server to be running on
    127.0.0.1:7860 and accepting the API key 'test-key-123'. Prints the
    HTTP status code and the response body; never raises.
    """
    # Complete Base64 string provided by user
    b64_str = """SUQzAwAAAAAAIlRTU0UAAAAOAAAATGF2ZjYyLjMuMTAwAAAAAAAAAAAAAAD/++TAAAAAAAAAAAAAAAAAAAAAAABJbmZvAAAADwAAAxkAC6GAAAIFCAoNDxIUFxocHyEkJyksLjEzNjk7PkBCRUdKTU9SVFdZXF9hZGZpbG5xc3Z4e36AgoWHioyPkpSXmZyeoaSmqauusbO2uLu9wMLEx8rMz9HU19nc3uHj5unr7vDz9vj7/QAAAABMYXZjNjIuMTEAAAAAAAAAAAAAAAAkBCgAAAAAAAuhgKduISIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP/7xMQAA8AAAaQAAAAgBwBAAPAABAHVMQU1FMy4xMDBVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVAWbjOjxEaBThLYeQGrlm7LmPQgJMHUzSHhGPECo2A4VCmLEjJI0ARHwwYkVAGOEJeg4HGwKFIRBqSIgJDRgAhUD0jEcJhrlHHGuRFr9axKLKcjUW3S8UbYknQtAwwBUyKBcwDCFU2PF1E+EJ4KGMQHQY8PMAGQAIAFlpjqWAQgEFTPoyIuBBx+IFQqpGt4dNSgtQBiFUp5+LfRQbsFAAZj0jgi1j5BUokctWocBgIqJgKALRL9peKDucl4io3i511sTfsLlOeTvEOeAKHYixGotaSyAwJsGECxTucKQIqGd4Q0HXAkDPFg4YOAAUULVjAYMRlqIqYJEPRDTrSCAcoQZkaADxvjxiWJr0pnRKwABMhdccg8ABgjGmZQkxIiAmdWhCYHHDGozcIgMQMMEGSYGNG
gKCMuZ8KZJ8eE6aJ0F85+z5zuQLCCBAa4YW7MMWBzwzbszI9GoxiA0gIybUzII358z1g4QE58cCnBpYg0ZAIoIDgYiCDwAClzZvwE+NIUMAOMMJOYyGA61hkSAkYgBKEq7BgorImMHJ8AAUXZFg6mpnWZQ/Na9A09MRkBhBSdJd8yK82Y0wJUSF2wK5nXpZl7GYuyG5IBx0IBb0+cnOenihrOLXTVH4zxSB0QQC4QLCADNjbxZVLSCoMYYKGPABiRIDgESGwYdDT+WrMv"""

    # Clean up the Base64 string (remove any whitespace/newlines)
    b64_str = b64_str.replace('\n', '').replace(' ', '').strip()

    url = "http://127.0.0.1:7860/api/voice-detection"
    headers = {
        "x-api-key": "test-key-123",
        "Content-Type": "application/json"
    }
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": b64_str
    }

    print(f"📡 Testing with FULL audio sample")
    print(f"Base64 length: {len(b64_str)} characters")
    print(f"POSTing to {url}...")
    print()

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        print(f"📥 Status Code: {response.status_code}")
        print()
        print("📄 Response JSON:")
        # Fix: response.json() raises ValueError on a non-JSON body (e.g. an
        # HTML error page from a 500). Fall back to raw text so the
        # diagnostic output is never lost.
        try:
            print(json.dumps(response.json(), indent=2))
        except ValueError:
            print(response.text)
    except requests.exceptions.Timeout:
        print("❌ ERROR: Request timed out after 30 seconds")
    except Exception as e:
        print(f"❌ ERROR: {e}")
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
    # Fix: the guard previously called test_provided_b64(), which is not
    # defined anywhere in this file and raised NameError when the script
    # was run directly. Call the actual test function instead.
    test_full_audio()
|