pavankumarvk committed on
Commit
0318cca
·
verified ·
1 Parent(s): 6f2c247

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +109 -46
pipeline.py CHANGED
@@ -28,26 +28,48 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
28
  )
29
 
30
  # ─────────────────────────────────────────────────────────────────────────────
31
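- # Audio Model: Gustking/wav2vec2-large-xlsr-deepfake-audio-classification (the ID assigned to AUDIO_MODEL_ID below)
32
- # Wav2Vec2 fine-tuned for real vs fake audio classification
33
- # Accuracy: 98.82% on evaluation set (figure quoted for mo-thecreator/Deepfake-audio-detection; no figure is given here for the loaded model)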
 
 
 
 
 
 
 
 
 
 
 
 
34
  # ─────────────────────────────────────────────────────────────────────────────
35
# Hugging Face model ID of the single audio classifier loaded at import time.
AUDIO_MODEL_ID = "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification"
AUDIO_SAMPLE_RATE = 16000

print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
audio_model = AutoModelForAudioClassification.from_pretrained(AUDIO_MODEL_ID)
audio_model.eval()  # inference mode
print("Audio model loaded successfully!")
print(f"[Audio] Label map: {audio_model.config.id2label}")

# ─── Thresholds ───────────────────────────────────────────────────────────────
# REAL_THRESHOLD = 0.50 → anything above 50% real probability = Real
# FAKE_THRESHOLD = 0.90 → needs very high confidence to be called Fake
# Between 50-90% fake   → AI Synthesized / Voice Cloned
REAL_THRESHOLD, FAKE_THRESHOLD = 0.50, 0.90
 
 
 
 
 
 
 
 
51
 
52
 
53
  def convert_to_mp4(input_path):
@@ -177,13 +199,11 @@ def deepfakes_image_predict(input_image):
177
 
178
  def get_real_fake_probs(probs, id2label: dict):
179
  """
180
- Robustly map model label indices β†’ real/fake probabilities.
181
- Always prints the raw label map to Logs for debugging.
182
  """
183
  real_prob, fake_prob = None, None
184
 
185
- print(f"[Audio] id2label: {id2label}")
186
-
187
  for idx, prob in enumerate(probs):
188
  label = id2label[idx].lower().strip()
189
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
@@ -191,35 +211,55 @@ def get_real_fake_probs(probs, id2label: dict):
191
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
192
  fake_prob = float(prob)
193
 
194
- # Fallback if label names are unrecognised
195
  if real_prob is None or fake_prob is None:
196
- print("[Audio] Warning: unrecognised labels β€” falling back to probs[0]=fake, probs[1]=real")
197
  fake_prob = float(probs[0])
198
  real_prob = float(probs[1])
199
 
200
  return real_prob, fake_prob
201
 
202
 
203
def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
    """
    Turn a real/fake probability pair into one of three user-facing labels.

    Decision order (first match wins):
        real_prob >= REAL_THRESHOLD → Real Human Voice
        fake_prob >= FAKE_THRESHOLD → Fake / Manipulated Audio
        otherwise                   → AI Synthesized / Voice Cloned

    Args:
        real_prob: Probability assigned to the "real" class.
        fake_prob: Probability assigned to the "fake" class.

    Returns:
        The label string shown to the user.
    """
    print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")

    # The "real" check runs first, so it wins even when fake_prob is also high.
    if real_prob >= REAL_THRESHOLD:
        return "✅ Real Human Voice"
    elif fake_prob >= FAKE_THRESHOLD:
        return "🚨 Fake / Manipulated Audio"
    else:
        return "🤖 AI Synthesized / Voice Cloned"
 
 
 
218
 
219
 
220
  def deepfakes_audio_predict(input_audio):
221
  """
222
- Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
 
 
223
  Gradio gr.Audio() returns (sample_rate, numpy_array).
224
  """
225
  sr, x = input_audio
@@ -240,19 +280,42 @@ def deepfakes_audio_predict(input_audio):
240
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
241
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
242
 
243
- # Step 4 β€” inference
244
- inputs = audio_feature_extractor(
245
- x,
246
- sampling_rate=AUDIO_SAMPLE_RATE,
247
- return_tensors="pt",
248
- padding=True
249
- )
250
-
251
- with torch.no_grad():
252
- logits = audio_model(**inputs).logits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- probs = torch.softmax(logits, dim=-1)[0]
255
- real_prob, fake_prob = get_real_fake_probs(probs, audio_model.config.id2label)
256
 
257
- # Step 5 β€” 3-class decision
258
- return classify_audio_3class(real_prob, fake_prob)
 
 
 
 
 
28
  )
29
 
30
  # ─────────────────────────────────────────────────────────────────────────────
31
+ # Audio Ensemble: 3 models vote — majority wins
32
+ #
33
+ # Model 1: mo-thecreator/Deepfake-audio-detection
34
+ # Wav2Vec2-base, trained on real/fake speech, 98.82% accuracy
35
+ #
36
+ # Model 2: MelodyMachine/Deepfake-audio-detection-V2
37
+ # Fine-tuned from mo-thecreator, 99.73% accuracy on evaluation
38
+ #
39
+ # Model 3: Gustking/wav2vec2-large-xlsr-deepfake-audio-classification
40
+ # Wav2Vec2-large-xlsr, bigger multilingual model, more robust
41
+ #
42
+ # Voting logic:
43
+ # Each model casts a vote: "real", "ai_synth", or "fake"
44
+ # Final result = whichever label gets the most votes (majority)
45
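+ # Ties are broken in priority order real > ai_synth > fake, so any tie that
+ # includes "real" resolves to Real; AI Synthesized (the middle ground) wins only an ai_synth vs fake tie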
46
  # ─────────────────────────────────────────────────────────────────────────────
47
# Hugging Face model IDs that make up the audio ensemble; each one casts a vote.
AUDIO_MODELS = [
    "mo-thecreator/Deepfake-audio-detection",
    "MelodyMachine/Deepfake-audio-detection-V2",
    "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
]
# Sample rate (Hz) used as the resampling target and passed to the feature extractors.
AUDIO_SAMPLE_RATE = 16000

# ─── Thresholds (applied per model before voting) ────────────────────────────
REAL_THRESHOLD = 0.50  # real_prob >= 0.50 → vote "real"
FAKE_THRESHOLD = 0.90  # fake_prob >= 0.90 → vote "fake"
# anything between → vote "ai_synth"

print("Loading audio ensemble models ...")
# Each entry is {"id": model ID, "extractor": feature extractor, "model": classifier}.
ensemble = []
for model_id in AUDIO_MODELS:
    print(f" Loading {model_id} ...")
    try:
        fe = AutoFeatureExtractor.from_pretrained(model_id)
        m = AutoModelForAudioClassification.from_pretrained(model_id)
        m.eval()
        ensemble.append({"id": model_id, "extractor": fe, "model": m})
        print(f" ✅ Loaded: {model_id} | labels: {m.config.id2label}")
    except Exception as e:
        # Deliberate best effort: a model that fails to load is skipped so the
        # remaining models can still vote.
        print(f" ⚠️ Skipped {model_id}: {e}")

print(f"Ensemble ready with {len(ensemble)} models.")
73
 
74
 
75
  def convert_to_mp4(input_path):
 
199
 
200
  def get_real_fake_probs(probs, id2label: dict):
201
  """
202
+ Map model output probabilities β†’ real/fake floats.
203
+ Handles all known label naming conventions.
204
  """
205
  real_prob, fake_prob = None, None
206
 
 
 
207
  for idx, prob in enumerate(probs):
208
  label = id2label[idx].lower().strip()
209
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
 
211
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
212
  fake_prob = float(prob)
213
 
214
+ # Fallback: 0=fake, 1=real
215
  if real_prob is None or fake_prob is None:
216
+ print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
217
  fake_prob = float(probs[0])
218
  real_prob = float(probs[1])
219
 
220
  return real_prob, fake_prob
221
 
222
 
223
def single_model_vote(x, entry):
    """
    Run one ensemble model on the audio and return its vote.

    Args:
        x: Audio samples handed to the model's feature extractor
           (presumably a 1-D waveform at AUDIO_SAMPLE_RATE — confirm against the caller).
        entry: Ensemble entry dict with the keys "id", "extractor" and "model".

    Returns:
        Tuple (vote, real_prob, fake_prob), where vote is 'real', 'ai_synth'
        or 'fake' and the probabilities are those returned by get_real_fake_probs.
    """
    model_id = entry["id"]
    fe = entry["extractor"]
    m = entry["model"]

    inputs = fe(
        x,
        sampling_rate=AUDIO_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = m(**inputs).logits

    # First row of the softmax output (one clip is passed per call).
    probs = torch.softmax(logits, dim=-1)[0]
    real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)

    print(f"[Audio] {model_id} → real={real_prob:.4f} fake={fake_prob:.4f}")

    # The "real" check runs first, so it wins even when fake_prob is also high.
    if real_prob >= REAL_THRESHOLD:
        vote = "real"
    elif fake_prob >= FAKE_THRESHOLD:
        vote = "fake"
    else:
        vote = "ai_synth"

    print(f"[Audio] {model_id} → vote: {vote}")
    return vote, real_prob, fake_prob
256
 
257
 
258
  def deepfakes_audio_predict(input_audio):
259
  """
260
+ Ensemble audio deepfake detection.
261
+ All loaded models vote β€” majority wins.
262
+
263
  Gradio gr.Audio() returns (sample_rate, numpy_array).
264
  """
265
  sr, x = input_audio
 
280
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
281
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
282
 
283
+ # Step 4 β€” each model votes
284
+ votes = {"real": 0, "ai_synth": 0, "fake": 0}
285
+ all_real_probs = []
286
+ all_fake_probs = []
287
+
288
+ for entry in ensemble:
289
+ try:
290
+ vote, real_prob, fake_prob = single_model_vote(x, entry)
291
+ votes[vote] += 1
292
+ all_real_probs.append(real_prob)
293
+ all_fake_probs.append(fake_prob)
294
+ except Exception as e:
295
+ print(f"[Audio] Model {entry['id']} failed during inference: {e}")
296
+
297
+ print(f"[Audio] Vote tally: {votes}")
298
+
299
+ if len(all_real_probs) == 0:
300
+ return "⚠️ All models failed. Please try again."
301
+
302
+ # Step 5 β€” majority vote decision
303
+ max_votes = max(votes.values())
304
+ winners = [label for label, count in votes.items() if count == max_votes]
305
+
306
+ # Tie-break: real > ai_synth > fake (bias toward safety)
307
+ if "real" in winners:
308
+ final = "real"
309
+ elif "ai_synth" in winners:
310
+ final = "ai_synth"
311
+ else:
312
+ final = "fake"
313
 
314
+ print(f"[Audio] Final decision: {final}")
 
315
 
316
+ if final == "real":
317
+ return "βœ… Real Human Voice"
318
+ elif final == "ai_synth":
319
+ return "πŸ€– AI Synthesized / Voice Cloned"
320
+ else:
321
+ return "🚨 Fake / Manipulated Audio"