S-Vetrivel commited on
Commit
04f3da2
·
1 Parent(s): 9d2870b

Implement hybrid AI-voice detection: sliding-window inference, spectral and temporal features, and language-aware confidence.

Browse files
Files changed (3) hide show
  1. app/infer.py +79 -59
  2. app/main.py +1 -1
  3. test_full_audio.py +39 -0
app/infer.py CHANGED
@@ -29,89 +29,109 @@ class VoiceClassifier:
29
  print(f"Error loading model: {e}")
30
  self.model = None
31
 
32
- def predict(self, waveform: torch.Tensor):
33
  if self.model is None:
34
  return {"error": "Model not loaded"}
35
 
36
  try:
37
  # 1. Preprocess Audio
38
- # Waveform is [1, T] Tensor. Convert to numpy [T]
39
  wav_np = waveform.squeeze().cpu().numpy()
 
40
 
41
- # Ensure we send it as a list or numpy array to the extractor
42
- inputs = self.feature_extractor(
43
- wav_np,
44
- sampling_rate=16000,
45
- return_tensors="pt",
46
- padding=True
47
  )
 
 
48
 
49
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
50
 
51
- t0 = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- # 2. Model Inference
54
- with torch.no_grad():
55
- outputs = self.model(**inputs)
56
- logits = outputs.logits
57
- probs = torch.softmax(logits, dim=-1)
58
-
59
- # Logic for this specific model:
60
- # Label 0: 'fake' (AI)
61
- # Label 1: 'real' (Human)
62
- prob_fake = probs[0][0].item()
63
- prob_real = probs[0][1].item()
64
-
65
  t1 = time.time()
66
- print(f"DEBUG: Inference took {t1 - t0:.3f}s. probs: {probs}")
67
- print(f"DEBUG: prob_fake={prob_fake:.6f}, prob_real={prob_real:.6f}")
68
 
69
- # 3. Pitch Analysis (for explanation)
70
- # Use librosa for pitch tracking (fast approximation)
71
- f0, voiced_flag, voiced_probs = librosa.pyin(
72
- wav_np,
73
- fmin=librosa.note_to_hz('C2'),
74
- fmax=librosa.note_to_hz('C7'),
75
- sr=16000,
76
- frame_length=2048
77
- )
78
- f0 = f0[~np.isnan(f0)]
79
- pitch_var = np.std(f0) if len(f0) > 0 else 0.0
80
 
81
- t2 = time.time()
82
- print(f"DEBUG: Pitch Detection took {t2 - t1:.3f}s. Variance: {pitch_var:.2f}")
83
-
84
- # 4. Final Classification Logic
85
- # Deepfake model is the authority
86
- if prob_fake > prob_real:
87
- prediction = "AI_GENERATED"
88
- confidence = prob_fake
89
- prob_ai = prob_fake
90
- else:
91
- prediction = "HUMAN"
92
- confidence = prob_real
93
- prob_ai = prob_fake
 
94
 
95
- print(f"DEBUG: prediction={prediction}, confidence={confidence:.6f}, prob_ai={prob_ai:.6f}")
96
-
97
  # Construct Explanation
98
  if prediction == "AI_GENERATED":
99
- if pitch_var < 20.0:
100
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Detected unnatural pitch consistency (Variance: {pitch_var:.1f})."
101
- else:
102
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Detected digital artifacts characteristic of AI synthesis."
 
103
  else:
104
- if pitch_var > 20.0:
105
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Natural prosody and high pitch variance detected."
106
  else:
107
- explanation = f"Deepfake model reported {confidence*100:.1f}% confidence. Audio classified as human despite low pitch variance."
108
 
109
  return {
110
  "prediction": prediction,
111
- "probability_ai": float(f"{prob_ai:.4f}"),
112
  "confidence": float(f"{confidence:.4f}"),
113
  "features": {
114
- "pitch_variance": float(f"{pitch_var:.2f}")
 
 
 
115
  },
116
  "explanation": explanation
117
  }
 
29
  print(f"Error loading model: {e}")
30
  self.model = None
31
 
32
+ def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
33
  if self.model is None:
34
  return {"error": "Model not loaded"}
35
 
36
  try:
37
  # 1. Preprocess Audio
 
38
  wav_np = waveform.squeeze().cpu().numpy()
39
+ sr = 16000
40
 
41
+ # --- ADVANCED FEATURE EXTRACTION ---
42
+ t0 = time.time()
43
+
44
+ # A. Pitch Analysis
45
+ f0, voiced_flag, voiced_probs = librosa.pyin(
46
+ wav_np, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
47
  )
48
+ f0_clean = f0[~np.isnan(f0)]
49
+ pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
50
 
51
+ # B. Spectral Flatness (Detects vocoder buzz)
52
+ flatness = np.mean(librosa.feature.spectral_flatness(y=wav_np))
53
 
54
+ # C. RMS Energy Variance (Detects flattened volume envelopes)
55
+ rms = librosa.feature.rms(y=wav_np)[0]
56
+ rms_var = np.std(rms) / (np.mean(rms) + 1e-6) # Normalized variance
57
+
58
+ # D. Zero Crossing Rate Variance (Detects robotic vowel transitions)
59
+ zcr = librosa.feature.zero_crossing_rate(wav_np)[0]
60
+ zcr_var = np.std(zcr)
61
+
62
+ # --- TEMPORAL CONSISTENCY (SLIDING WINDOW) ---
63
+ chunk_size = 2 * sr # 2 seconds
64
+ stride = 1 * sr # 1 second overlap
65
+ chunks = []
66
+ for i in range(0, len(wav_np) - chunk_size + 1, stride):
67
+ chunks.append(wav_np[i : i + chunk_size])
68
+
69
+ # If audio too short for stride, just use whole thing
70
+ if not chunks:
71
+ chunks = [wav_np]
72
+
73
+ chunk_probs = []
74
+ for chunk in chunks:
75
+ inputs = self.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
76
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
77
+ with torch.no_grad():
78
+ outputs = self.model(**inputs)
79
+ probs = torch.softmax(outputs.logits, dim=-1)
80
+ chunk_probs.append(probs[0][0].item()) # Probability of 'fake'
81
+
82
+ # Authority calculation
83
+ # We take the MAX probability across chunks to catch 'slips' in AI generation
84
+ prob_fake = np.max(chunk_probs)
85
+ prob_real = 1.0 - prob_fake
86
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  t1 = time.time()
88
+ print(f"DEBUG: Analysis took {t1 - t0:.3f}s. Multi-chunk prob_fake: {prob_fake:.4f}")
89
+ print(f"DEBUG: Features - PitchVar: {pitch_var:.1f}, Flatness: {flatness:.4f}, RMS_Var: {rms_var:.4f}")
90
 
91
+ # --- HYBRID HEURISTIC STRENGTHENING ---
92
+ # AI Voices often have VERY low flatness or VERY low pitch variance
93
+ ai_signal_flags = 0
94
+ if pitch_var < 15.0: ai_signal_flags += 1
95
+ if flatness < 0.005: ai_signal_flags += 1 # Very tonal/melodic
96
+ if rms_var < 0.1: ai_signal_flags += 1 # Robotic volume
 
 
 
 
 
97
 
98
+ # Final Verdict Logic
99
+ # If the model is unsure (0.4-0.6) but signal flags are high, tip to AI
100
+ if 0.4 < prob_fake < 0.6 and ai_signal_flags >= 2:
101
+ prob_fake = 0.75
102
+
103
+ prediction = "AI_GENERATED" if prob_fake > 0.5 else "HUMAN"
104
+ confidence = prob_fake if prediction == "AI_GENERATED" else prob_real
105
+
106
+ # --- LANGUAGE AWARENESS ---
107
+ # If it's a non-English language, the model might be slightly less reliable
108
+ # We dampen confidence on low-resource languages to prevent false accusations
109
+ is_english = language.lower() in ["english", "en"]
110
+ if not is_english and confidence < 0.85:
111
+ confidence *= 0.95 # Slight dampening
112
 
 
 
113
  # Construct Explanation
114
  if prediction == "AI_GENERATED":
115
+ reasons = []
116
+ if ai_signal_flags >= 2: reasons.append("synthetic spectral characteristics")
117
+ if pitch_var < 20: reasons.append("lack of natural prosody")
118
+ if not reasons: reasons.append("digital vocoder artifacts")
119
+ explanation = f"AI detected with {confidence*100:.1f}% confidence. Evidence: {', '.join(reasons)}."
120
  else:
121
+ if pitch_var > 25.0:
122
+ explanation = f"Human verified with {confidence*100:.1f}% confidence. Strong natural pitch variance and human vocal dynamics detected."
123
  else:
124
+ explanation = f"Audio likely Human ({confidence*100:.1f}%). Detected natural speech fluctuations despite localized artifacts."
125
 
126
  return {
127
  "prediction": prediction,
128
+ "probability_ai": float(f"{prob_fake:.4f}"),
129
  "confidence": float(f"{confidence:.4f}"),
130
  "features": {
131
+ "pitch_variance": float(f"{pitch_var:.2f}"),
132
+ "spectral_flatness": float(f"{flatness:.6f}"),
133
+ "rms_variance": float(f"{rms_var:.4f}"),
134
+ "zcr_variance": float(f"{zcr_var:.4f}")
135
  },
136
  "explanation": explanation
137
  }
app/main.py CHANGED
@@ -73,7 +73,7 @@ async def detect_voice(
73
  raise HTTPException(status_code=400, detail="Could not process audio.")
74
 
75
  # 4. Predict
76
- result = classifier_instance.predict(waveform)
77
 
78
  if "error" in result:
79
  raise HTTPException(status_code=500, detail=result["error"])
 
73
  raise HTTPException(status_code=400, detail="Could not process audio.")
74
 
75
  # 4. Predict
76
+ result = classifier_instance.predict(waveform, language=request_data.language)
77
 
78
  if "error" in result:
79
  raise HTTPException(status_code=500, detail=result["error"])
test_full_audio.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
def test_full_audio(url: str = "http://127.0.0.1:7860/api/voice-detection",
                    api_key: str = "test-key-123") -> None:
    """POST the bundled base64 MP3 sample to the voice-detection API and print the result.

    Args:
        url: Endpoint to POST to. Defaults to the local dev server; made a
            parameter so the same smoke test can target a deployed instance.
        api_key: Value sent in the ``x-api-key`` header.

    Side effects:
        Prints the HTTP status code and the pretty-printed JSON response
        (or an error message on timeout/failure). Returns nothing.
    """
    # Complete Base64 string provided by user
    b64_str = """SUQzAwAAAAAAIlRTU0UAAAAOAAAATGF2ZjYyLjMuMTAwAAAAAAAAAAAAAAD/++TAAAAAAAAAAAAAAAAAAAAAAABJbmZvAAAADwAAAxkAC6GAAAIFCAoNDxIUFxocHyEkJyksLjEzNjk7PkBCRUdKTU9SVFdZXF9hZGZpbG5xc3Z4e36AgoWHioyPkpSXmZyeoaSmqauusbO2uLu9wMLEx8rMz9HU19nc3uHj5unr7vDz9vj7/QAAAABMYXZjNjIuMTEAAAAAAAAAAAAAAAAkBCgAAAAAAAuhgKduISIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP/7xMQAA8AAAaQAAAAgBwBAAPAABAHVMQU1FMy4xMDBVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVAWbjOjxEaBThLYeQGrlm7LmPQgJMHUzSHhGPECo2A4VCmLEjJI0ARHwwYkVAGOEJeg4HGwKFIRBqSIgJDRgAhUD0jEcJhrlHHGuRFr9axKLKcjUW3S8UbYknQtAwwBUyKBcwDCFU2PF1E+EJ4KGMQHQY8PMAGQAIAFlpjqWAQgEFTPoyIuBBx+IFQqpGt4dNSgtQBiFUp5+LfRQbsFAAZj0jgi1j5BUokctWocBgIqJgKALRL9peKDucl4io3i511sTfsLlOeTvEOeAKHYixGotaSyAwJsGECxTucKQIqGd4Q0HXAkDPFg4YOAAUULVjAYMRlqIqYJEPRDTrSCAcoQZkaADxvjxiWJr0pnRKwABMhdccg8ABgjGmZQkxIiAmdWhCYHHDGozcIgMQMMEGSYG
NGgKCMuZ8KZJ8eE6aJ0F85+z5zuQLCCBAa4YW7MMWBzwzbszI9GoxiA0gIybUzII358z1g4QE58cCnBpYg0ZAIoIDgYiCDwAClzZvwE+NIUMAOMMJOYyGA61hkSAkYgBKEq7BgorImMHJ8AAUXZFg6mpnWZQ/Na9A09MRkBhBSdJd8yK82Y0wJUSF2wK5nXpZl7GYuyG5IBx0IBb0+cnOenihrOLXTVH4zxSB0QQC4QLCADNjbxZVLSCoMYYKGPABiRIDgESGwYdDT+WrMv"""

    # Clean up the Base64 string (remove any whitespace/newlines); the
    # triple-quoted literal above contains an embedded newline.
    b64_str = b64_str.replace('\n', '').replace(' ', '').strip()

    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }
    payload = {
        "language": "English",
        "audioFormat": "mp3",
        "audioBase64": b64_str
    }

    print(f"📡 Testing with FULL audio sample")
    print(f"Base64 length: {len(b64_str)} characters")
    print(f"POSTing to {url}...")
    print()

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        print(f"📥 Status Code: {response.status_code}")
        print()
        print("📄 Response JSON:")
        # NOTE(review): raises if the server returns non-JSON; caught by the
        # generic handler below, which is acceptable for a manual smoke test.
        print(json.dumps(response.json(), indent=2))
    except requests.exceptions.Timeout:
        print("❌ ERROR: Request timed out after 30 seconds")
    except Exception as e:
        print(f"❌ ERROR: {e}")
37
+
38
+ if __name__ == "__main__":
39
+ test_provided_b64()