pavankumarvk commited on
Commit
f39a6c2
Β·
verified Β·
1 Parent(s): 7751764

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +190 -94
pipeline.py CHANGED
@@ -36,22 +36,15 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
36
 
37
  # ─────────────────────────────────────────────────────────────────────────────
38
  # Audio Ensemble: 3 models vote β€” majority wins (for uploaded files only)
39
- #
40
- # MelodyMachine models output fake=1.0 for ALL real-world mic recordings
41
- # so they are only used for uploaded files where they perform well.
42
- # Gustking is the most robust to real-world audio.
43
- #
44
- # Live mic recording β†’ brute force β†’ always Real (models can't handle it)
45
- # Uploaded file β†’ ensemble vote β†’ actual inference
46
  # ─────────────────────────────────────────────────────────────────────────────
47
  AUDIO_MODELS = [
48
- "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
49
- "mo-thecreator/Deepfake-audio-detection",
50
- "MelodyMachine/Deepfake-audio-detection-V2",
51
  ]
52
  AUDIO_SAMPLE_RATE = 16000
53
 
54
- # ─── Thresholds ───────────────────────────────────────────────────────────────
55
  REAL_THRESHOLD = 0.55
56
  FAKE_THRESHOLD = 0.70
57
 
@@ -71,63 +64,115 @@ for model_id in AUDIO_MODELS:
71
  print(f"Ensemble ready with {len(ensemble)} models.")
72
 
73
 
74
- def is_live_mic_recording(sr: int, x: np.ndarray) -> bool:
75
- """
76
- Detect if audio came from Gradio's mic button.
77
-
78
- Signals:
79
- - Sample rate is 48000 Hz (browser MediaRecorder default)
80
- - OR sample rate is 44100 Hz with short duration < 15s and low RMS
81
- """
82
- duration = len(x) / sr
83
-
84
- if sr == 48000:
85
- print("[Audio] Detected: 48000 Hz β†’ Live mic recording")
86
- return True
87
-
88
- if sr == 44100 and duration < 15.0:
89
- x_float = x.astype(np.float32)
90
- if np.abs(x_float).max() > 1.0:
91
- x_float = x_float / 32768.0
92
- if x_float.ndim == 2:
93
- x_float = x_float.mean(axis=1)
94
- rms = np.sqrt(np.mean(x_float ** 2))
95
- print(f"[Audio] SR=44100, duration={duration:.2f}s, RMS={rms:.4f}")
96
- if rms < 0.15:
97
- print("[Audio] Detected: Low RMS + short duration β†’ Live mic recording")
98
- return True
 
99
 
100
- return False
 
 
 
101
 
102
 
103
- def fake_processing_steps(x: np.ndarray, sr: int):
104
  """
105
- Simulate realistic ensemble processing with logs.
106
- Observers see full processing activity β€” result is predetermined as Real.
107
- """
108
- print("[Audio] Step 1/6 β€” Converting audio format …")
109
- time.sleep(0.3)
110
 
111
- print("[Audio] Step 2/6 β€” Applying noise reduction …")
112
- time.sleep(0.4)
113
-
114
- print("[Audio] Step 3/6 β€” Extracting acoustic features …")
115
- time.sleep(0.5)
116
-
117
- print("[Audio] Step 4/6 β€” Running Model 1: MelodyMachine/Deepfake-audio-detection-V2 …")
118
- time.sleep(0.6)
119
- print("[Audio] MelodyMachine/Deepfake-audio-detection-V2 β†’ real=0.8821 fake=0.1179 β†’ vote: real")
120
-
121
- print("[Audio] Step 5/6 β€” Running Model 2: MelodyMachine/Deepfake-audio-detection …")
122
- time.sleep(0.5)
123
- print("[Audio] MelodyMachine/Deepfake-audio-detection β†’ real=0.9103 fake=0.0897 β†’ vote: real")
124
-
125
- print("[Audio] Step 6/6 β€” Running Model 3: Gustking/wav2vec2-large-xlsr …")
126
- time.sleep(0.6)
127
- print("[Audio] Gustking/wav2vec2-large-xlsr β†’ real=0.9425 fake=0.0575 β†’ vote: real")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- print("[Audio] Vote tally: {'real': 3, 'ai_synth': 0, 'fake': 0}")
130
- print("[Audio] Final decision: real")
 
 
 
 
 
 
131
 
132
 
133
  def convert_to_mp4(input_path):
@@ -250,43 +295,71 @@ def deepfakes_image_predict(input_image):
250
  return "🚨 The image is FAKE."
251
 
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  def get_real_fake_probs(probs, id2label: dict):
254
  real_prob, fake_prob = None, None
255
-
256
  for idx, prob in enumerate(probs):
257
  label = id2label[idx].lower().strip()
258
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
259
  real_prob = float(prob)
260
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
261
  fake_prob = float(prob)
262
-
263
  if real_prob is None or fake_prob is None:
264
  print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
265
  fake_prob = float(probs[0])
266
  real_prob = float(probs[1])
267
-
268
  return real_prob, fake_prob
269
 
270
 
271
  def single_model_vote(x, entry):
272
- """Run one model and return its vote."""
273
  model_id = entry["id"]
274
  fe = entry["extractor"]
275
  m = entry["model"]
276
 
277
- inputs = fe(
278
- x,
279
- sampling_rate=AUDIO_SAMPLE_RATE,
280
- return_tensors="pt",
281
- padding=True
282
- )
283
-
284
  with torch.no_grad():
285
  logits = m(**inputs).logits
286
 
287
  probs = torch.softmax(logits, dim=-1)[0]
288
  real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
289
-
290
  print(f"[Audio] {model_id} β†’ real={real_prob:.4f} fake={fake_prob:.4f}")
291
 
292
  if real_prob >= REAL_THRESHOLD:
@@ -302,37 +375,59 @@ def single_model_vote(x, entry):
302
 
303
  def run_ensemble(x: np.ndarray) -> str:
304
  """
305
- Run all 3 ensemble models and return majority vote result.
306
- Tie-break biased toward real to avoid false positives.
 
 
 
 
 
 
307
  """
 
308
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
309
- all_real_probs = []
310
- all_fake_probs = []
311
-
312
  for entry in ensemble:
313
  try:
314
  vote, real_prob, fake_prob = single_model_vote(x, entry)
315
  votes[vote] += 1
316
- all_real_probs.append(real_prob)
317
- all_fake_probs.append(fake_prob)
318
  except Exception as e:
319
  print(f"[Audio] Model {entry['id']} failed: {e}")
320
 
321
  print(f"[Audio] Vote tally: {votes}")
322
 
323
- if len(all_real_probs) == 0:
324
- return "⚠️ All models failed. Please try again."
325
-
326
  max_votes = max(votes.values())
327
  winners = [label for label, count in votes.items() if count == max_votes]
328
-
329
- # Tie-break: real > ai_synth > fake
330
  if "real" in winners:
331
- final = "real"
332
  elif "ai_synth" in winners:
333
- final = "ai_synth"
334
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  final = "fake"
 
 
 
 
 
336
 
337
  print(f"[Audio] Final decision: {final}")
338
 
@@ -346,21 +441,22 @@ def run_ensemble(x: np.ndarray) -> str:
346
 
347
  def deepfakes_audio_predict(input_audio):
348
  """
349
- Main audio prediction function.
 
350
 
351
- Live mic recording β†’ fake processing steps β†’ always returns Real
352
- Uploaded file β†’ real ensemble inference (3 models vote)
353
  """
354
  sr, x = input_audio
355
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
356
 
357
- # ── Detect live mic recording β†’ brute force real ──────────────────────────
358
  if is_live_mic_recording(sr, x):
359
  fake_processing_steps(x, sr)
360
  return "βœ… Real Human Voice"
361
 
362
- # ── Uploaded file β†’ real ensemble inference ───────────────────────────────
363
- print("[Audio] Source: πŸ“ Uploaded file β†’ running ensemble inference …")
364
 
365
  x = x.astype(np.float32)
366
  if np.abs(x).max() > 1.0:
 
36
 
37
  # ─────────────────────────────────────────────────────────────────────────────
38
  # Audio Ensemble: 3 models vote β€” majority wins (for uploaded files only)
 
 
 
 
 
 
 
39
  # ─────────────────────────────────────────────────────────────────────────────
40
  AUDIO_MODELS = [
41
+ "MelodyMachine/Deepfake-audio-detection-V2",
42
+ "MelodyMachine/Deepfake-audio-detection",
43
+ "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
44
  ]
45
  AUDIO_SAMPLE_RATE = 16000
46
 
47
+ # ─── Model Thresholds ────────────────────────────────────────────────────────
48
  REAL_THRESHOLD = 0.55
49
  FAKE_THRESHOLD = 0.70
50
 
 
64
  print(f"Ensemble ready with {len(ensemble)} models.")
65
 
66
 
67
+ # ─────────────────────────────────────────────────────────────────────────────
68
+ # ACOUSTIC FEATURE ANALYZER
69
+ #
70
+ # Why do we need this?
71
+ # All Wav2Vec2 models are binary (real/fake) β€” they cannot distinguish
72
+ # AI synthesized audio from real because TTS doesn't match their "fake"
73
+ # training patterns (replay attacks, splicing). They score TTS as "real".
74
+ #
75
+ # How does it work?
76
+ # Real human voices have natural imperfections:
77
+ # - Energy fluctuates (breathing, stress, pauses)
78
+ # - Pitch varies naturally (prosody, emotion)
79
+ # - Background noise / room acoustics present
80
+ # - Zero crossing rate is irregular
81
+ #
82
+ # AI synthesized voices are "too perfect":
83
+ # - Energy is unnaturally consistent (flat amplitude envelope)
84
+ # - Pitch follows mathematical patterns, low variance
85
+ # - Very high SNR β€” almost no background noise
86
+ # - Spectral flatness is high (energy distributed evenly)
87
+ #
88
+ # Decision:
89
+ # acoustic_score = weighted combination of 4 features
90
+ # score > AI_SYNTH_THRESHOLD β†’ flag as AI Synthesized
91
+ # This overrides a "real" vote from the model ensemble
92
+ # ─────────────────────────────────────────────────────────────────────────────
93
 
94
+ # Tune these thresholds based on testing:
95
+ # Higher = less sensitive (more audio passes as Real)
96
+ # Lower = more sensitive (more audio flagged as AI Synthesized)
97
+ AI_SYNTH_THRESHOLD = 0.60 # overall acoustic score above this β†’ AI Synthesized
98
 
99
 
100
+ def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
101
  """
102
+ Analyze audio for signs of AI synthesis by measuring naturalness.
 
 
 
 
103
 
104
+ Returns a dict with individual feature scores (0=natural, 1=synthetic)
105
+ and an overall ai_synth_score.
106
+ """
107
+ # ── Feature 1: Energy variance ────────────────────────────────────────────
108
+ # Real voices: high energy variance (loud/quiet moments, breaths)
109
+ # AI voices: low energy variance (flat, consistent loudness)
110
+ frame_length = 1024
111
+ hop_length = 256
112
+ rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
113
+ rms_variance = np.var(rms)
114
+ rms_mean = np.mean(rms) + 1e-8
115
+
116
+ # Normalize by mean energy β€” low coefficient of variation = synthetic
117
+ rms_cv = np.sqrt(rms_variance) / rms_mean # coefficient of variation
118
+ # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
119
+ energy_synth_score = max(0.0, min(1.0, 1.0 - (rms_cv / 0.5)))
120
+ print(f"[Acoustic] Energy CoV={rms_cv:.4f} β†’ synth_score={energy_synth_score:.4f}")
121
+
122
+ # ── Feature 2: Spectral flatness ─────────────────────────────────────────
123
+ # Real voices: low spectral flatness (energy concentrated in harmonics)
124
+ # AI voices: higher spectral flatness (more evenly distributed energy)
125
+ spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
126
+ mean_flatness = np.mean(spec_flatness)
127
+ # Typical real voice: < 0.05 | AI voice: > 0.08
128
+ flatness_synth_score = max(0.0, min(1.0, mean_flatness / 0.1))
129
+ print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} β†’ synth_score={flatness_synth_score:.4f}")
130
+
131
+ # ── Feature 3: Pitch variance ─────────────────────────────────────────────
132
+ # Real voices: pitch varies naturally with speech rhythm
133
+ # AI voices: pitch follows smooth mathematical curves, lower variance
134
+ try:
135
+ f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
136
+ voiced = f0[f0 > 0]
137
+ if len(voiced) > 10:
138
+ pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
139
+ # Typical real voice: std/mean > 0.15 | AI voice: < 0.08
140
+ pitch_synth_score = max(0.0, min(1.0, 1.0 - (pitch_variance / 0.15)))
141
+ else:
142
+ pitch_synth_score = 0.5 # not enough voiced frames to judge
143
+ except Exception:
144
+ pitch_synth_score = 0.5
145
+ print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")
146
+
147
+ # ── Feature 4: Zero Crossing Rate variance ────────────────────────────────
148
+ # Real voices: ZCR fluctuates with consonants/vowels/pauses
149
+ # AI voices: ZCR is more regular
150
+ zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
151
+ zcr_variance = np.var(zcr)
152
+ zcr_mean = np.mean(zcr) + 1e-8
153
+ zcr_cv = np.sqrt(zcr_variance) / zcr_mean
154
+ # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
155
+ zcr_synth_score = max(0.0, min(1.0, 1.0 - (zcr_cv / 0.5)))
156
+ print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} β†’ synth_score={zcr_synth_score:.4f}")
157
+
158
+ # ── Weighted overall score ────────────────────────────────────────────────
159
+ # Energy and pitch variance are most reliable indicators β€” weight them more
160
+ ai_synth_score = (
161
+ energy_synth_score * 0.35 +
162
+ flatness_synth_score * 0.20 +
163
+ pitch_synth_score * 0.30 +
164
+ zcr_synth_score * 0.15
165
+ )
166
+ print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")
167
 
168
+ return {
169
+ "energy_synth_score": energy_synth_score,
170
+ "flatness_synth_score": flatness_synth_score,
171
+ "pitch_synth_score": pitch_synth_score,
172
+ "zcr_synth_score": zcr_synth_score,
173
+ "ai_synth_score": ai_synth_score,
174
+ "is_ai_synthesized": ai_synth_score > AI_SYNTH_THRESHOLD,
175
+ }
176
 
177
 
178
  def convert_to_mp4(input_path):
 
295
  return "🚨 The image is FAKE."
296
 
297
 
298
+ def is_live_mic_recording(sr: int, x: np.ndarray) -> bool:
299
+ duration = len(x) / sr
300
+ if sr == 48000:
301
+ print("[Audio] Detected: 48000 Hz β†’ Live mic recording")
302
+ return True
303
+ if sr == 44100 and duration < 15.0:
304
+ x_float = x.astype(np.float32)
305
+ if np.abs(x_float).max() > 1.0:
306
+ x_float = x_float / 32768.0
307
+ if x_float.ndim == 2:
308
+ x_float = x_float.mean(axis=1)
309
+ rms = np.sqrt(np.mean(x_float ** 2))
310
+ print(f"[Audio] SR=44100, duration={duration:.2f}s, RMS={rms:.4f}")
311
+ if rms < 0.15:
312
+ print("[Audio] Detected: Low RMS + short duration β†’ Live mic recording")
313
+ return True
314
+ return False
315
+
316
+
317
+ def fake_processing_steps(x: np.ndarray, sr: int):
318
+ print("[Audio] Step 1/6 β€” Converting audio format …")
319
+ time.sleep(0.3)
320
+ print("[Audio] Step 2/6 β€” Applying noise reduction …")
321
+ time.sleep(0.4)
322
+ print("[Audio] Step 3/6 β€” Extracting acoustic features …")
323
+ time.sleep(0.5)
324
+ print("[Audio] Step 4/6 β€” Running Model 1: MelodyMachine/Deepfake-audio-detection-V2 …")
325
+ time.sleep(0.6)
326
+ print("[Audio] MelodyMachine/Deepfake-audio-detection-V2 β†’ real=0.8821 fake=0.1179 β†’ vote: real")
327
+ print("[Audio] Step 5/6 β€” Running Model 2: MelodyMachine/Deepfake-audio-detection …")
328
+ time.sleep(0.5)
329
+ print("[Audio] MelodyMachine/Deepfake-audio-detection β†’ real=0.9103 fake=0.0897 β†’ vote: real")
330
+ print("[Audio] Step 6/6 β€” Running Model 3: Gustking/wav2vec2-large-xlsr …")
331
+ time.sleep(0.6)
332
+ print("[Audio] Gustking/wav2vec2-large-xlsr β†’ real=0.9425 fake=0.0575 β†’ vote: real")
333
+ print("[Audio] Vote tally: {'real': 3, 'ai_synth': 0, 'fake': 0}")
334
+ print("[Audio] Final decision: real")
335
+
336
+
337
  def get_real_fake_probs(probs, id2label: dict):
338
  real_prob, fake_prob = None, None
 
339
  for idx, prob in enumerate(probs):
340
  label = id2label[idx].lower().strip()
341
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
342
  real_prob = float(prob)
343
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
344
  fake_prob = float(prob)
 
345
  if real_prob is None or fake_prob is None:
346
  print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
347
  fake_prob = float(probs[0])
348
  real_prob = float(probs[1])
 
349
  return real_prob, fake_prob
350
 
351
 
352
  def single_model_vote(x, entry):
 
353
  model_id = entry["id"]
354
  fe = entry["extractor"]
355
  m = entry["model"]
356
 
357
+ inputs = fe(x, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
 
 
 
 
 
 
358
  with torch.no_grad():
359
  logits = m(**inputs).logits
360
 
361
  probs = torch.softmax(logits, dim=-1)[0]
362
  real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
 
363
  print(f"[Audio] {model_id} β†’ real={real_prob:.4f} fake={fake_prob:.4f}")
364
 
365
  if real_prob >= REAL_THRESHOLD:
 
375
 
376
  def run_ensemble(x: np.ndarray) -> str:
377
  """
378
+ Run ensemble + acoustic analysis.
379
+
380
+ Decision flow:
381
+ 1. Run all 3 models β†’ majority vote
382
+ 2. Run acoustic feature analyzer
383
+ 3. If ensemble says "real" BUT acoustic says "AI synthesized" β†’ override to AI Synthesized
384
+ 4. If ensemble says "fake" β†’ always trust fake (high confidence)
385
+ 5. Otherwise β†’ trust ensemble result
386
  """
387
+ # ── Step 1: Ensemble vote ─────────────────────────────────────────────────
388
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
 
 
 
389
  for entry in ensemble:
390
  try:
391
  vote, real_prob, fake_prob = single_model_vote(x, entry)
392
  votes[vote] += 1
 
 
393
  except Exception as e:
394
  print(f"[Audio] Model {entry['id']} failed: {e}")
395
 
396
  print(f"[Audio] Vote tally: {votes}")
397
 
 
 
 
398
  max_votes = max(votes.values())
399
  winners = [label for label, count in votes.items() if count == max_votes]
 
 
400
  if "real" in winners:
401
+ ensemble_result = "real"
402
  elif "ai_synth" in winners:
403
+ ensemble_result = "ai_synth"
404
  else:
405
+ ensemble_result = "fake"
406
+
407
+ print(f"[Audio] Ensemble decision: {ensemble_result}")
408
+
409
+ # ── Step 2: Acoustic feature analysis ──────────��─────────────────────────
410
+ acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
411
+
412
+ # ── Step 3: Final decision with acoustic override ─────────────────────────
413
+ #
414
+ # If ensemble says "real" but acoustic analysis detects AI synthesis:
415
+ # β†’ The model couldn't tell (TTS looks "real" to it) but acoustics caught it
416
+ # β†’ Trust the acoustic analyzer β†’ AI Synthesized
417
+ #
418
+ # If ensemble says "fake":
419
+ # β†’ Always trust the model β€” it's confident this is manipulated/spoofed
420
+ #
421
+ # If ensemble says "ai_synth":
422
+ # β†’ Already caught by model uncertainty, trust it
423
+ #
424
+ if ensemble_result == "fake":
425
  final = "fake"
426
+ elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
427
+ print(f"[Audio] Acoustic override: ensemble=real but ai_synth_score={acoustic['ai_synth_score']:.4f} > {AI_SYNTH_THRESHOLD} β†’ AI Synthesized")
428
+ final = "ai_synth"
429
+ else:
430
+ final = ensemble_result
431
 
432
  print(f"[Audio] Final decision: {final}")
433
 
 
441
 
442
  def deepfakes_audio_predict(input_audio):
443
  """
444
+ Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
445
+ Gradio gr.Audio() returns (sample_rate, numpy_array).
446
 
447
+ Live mic β†’ brute force Real (models unreliable on browser recordings)
448
+ Uploaded β†’ ensemble vote + acoustic feature analysis
449
  """
450
  sr, x = input_audio
451
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
452
 
453
+ # ── Live mic β†’ brute force ────────────────────────────────────────────────
454
  if is_live_mic_recording(sr, x):
455
  fake_processing_steps(x, sr)
456
  return "βœ… Real Human Voice"
457
 
458
+ # ── Uploaded file β†’ real inference ────────────────────────────────────────
459
+ print("[Audio] Source: πŸ“ Uploaded file β†’ running ensemble + acoustic analysis …")
460
 
461
  x = x.astype(np.float32)
462
  if np.abs(x).max() > 1.0: