pavankumarvk committed on
Commit
2b9cb11
·
verified ·
1 Parent(s): 7fe6676

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +95 -50
pipeline.py CHANGED
@@ -9,6 +9,14 @@ import numpy as np
9
  import tensorflow as tf
10
  from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
11
 
 
 
 
 
 
 
 
 
12
  # Set random seed for reproducibility.
13
  tf.random.set_seed(42)
14
 
@@ -29,32 +37,17 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
29
 
30
  # ─────────────────────────────────────────────────────────────────────────────
31
  # Audio Ensemble: 3 models vote — majority wins
32
- #
33
- # Model 1: mo-thecreator/Deepfake-audio-detection
34
- # Wav2Vec2-base, trained on real/fake speech, 98.82% accuracy
35
- #
36
- # Model 2: MelodyMachine/Deepfake-audio-detection-V2
37
- # Fine-tuned from mo-thecreator, 99.73% accuracy on evaluation
38
- #
39
- # Model 3: Gustking/wav2vec2-large-xlsr-deepfake-audio-classification
40
- # Wav2Vec2-large-xlsr, bigger multilingual model, more robust
41
- #
42
- # Voting logic:
43
- # Each model casts a vote: "real", "ai_synth", or "fake"
44
- # Final result = whichever label gets the most votes (majority)
45
- # Tie on real vs fake → AI Synthesized (safest middle ground)
46
  # ─────────────────────────────────────────────────────────────────────────────
47
  AUDIO_MODELS = [
48
- "MelodyMachine/Deepfake-audio-detection-V2", # Highest accuracy
49
- "MelodyMachine/Deepfake-audio-detection", # Second best
50
- "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification", # Most robust to real-world audio
51
  ]
52
  AUDIO_SAMPLE_RATE = 16000
53
 
54
- # ─── Thresholds (applied per model before voting) ────────────────────────────
55
- REAL_THRESHOLD = 0.50 # real_prob >= 0.50 → vote "real"
56
- FAKE_THRESHOLD = 0.90 # fake_prob >= 0.90 → vote "fake"
57
- # anything between → vote "ai_synth"
58
 
59
  print("Loading audio ensemble models ...")
60
  ensemble = []
@@ -73,7 +66,6 @@ print(f"Ensemble ready with {len(ensemble)} models.")
73
 
74
 
75
  def convert_to_mp4(input_path):
76
- """Convert any video to .mp4 using ffmpeg (handles webcam .webm, etc.)"""
77
  ext = os.path.splitext(input_path)[-1].lower()
78
  if ext == ".mp4":
79
  cap = cv2.VideoCapture(input_path)
@@ -99,8 +91,6 @@ def convert_to_mp4(input_path):
99
 
100
 
101
  class DetectionPipeline:
102
- """Pipeline for detecting faces in video frames or processing images."""
103
-
104
  def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
105
  self.n_frames = n_frames
106
  self.batch_size = batch_size
@@ -197,13 +187,76 @@ def deepfakes_image_predict(input_image):
197
  return "🚨 The image is FAKE."
198
 
199
 
200
- def get_real_fake_probs(probs, id2label: dict):
201
  """
202
- Map model output probabilities β†’ real/fake floats.
203
- Handles all known label naming conventions.
 
 
 
 
 
204
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  real_prob, fake_prob = None, None
206
 
 
 
207
  for idx, prob in enumerate(probs):
208
  label = id2label[idx].lower().strip()
209
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
@@ -211,7 +264,6 @@ def get_real_fake_probs(probs, id2label: dict):
211
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
212
  fake_prob = float(prob)
213
 
214
- # Fallback: 0=fake, 1=real
215
  if real_prob is None or fake_prob is None:
216
  print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
217
  fake_prob = float(probs[0])
@@ -221,10 +273,6 @@ def get_real_fake_probs(probs, id2label: dict):
221
 
222
 
223
  def single_model_vote(x, entry):
224
- """
225
- Run one model and return its vote: 'real', 'ai_synth', or 'fake'
226
- along with the real/fake probabilities.
227
- """
228
  model_id = entry["id"]
229
  fe = entry["extractor"]
230
  m = entry["model"]
@@ -257,30 +305,27 @@ def single_model_vote(x, entry):
257
 
258
  def deepfakes_audio_predict(input_audio):
259
  """
260
- Ensemble audio deepfake detection.
261
- All loaded models vote — majority wins.
262
 
263
  Gradio gr.Audio() returns (sample_rate, numpy_array).
 
 
264
  """
265
  sr, x = input_audio
266
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
267
 
268
- # Step 1 — float32 + normalise
269
- x = x.astype(np.float32)
270
- if np.abs(x).max() > 1.0:
271
- x = x / 32768.0
 
 
 
272
 
273
- # Step 2 — stereo → mono
274
- if x.ndim == 2:
275
- x = x.mean(axis=1)
276
-
277
- # Step 3 — resample to 16 kHz
278
- if sr != AUDIO_SAMPLE_RATE:
279
- print(f"[Audio] Resampling {sr} Hz β†’ {AUDIO_SAMPLE_RATE} Hz …")
280
- x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
281
- print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
282
 
283
- # Step 4 — each model votes
284
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
285
  all_real_probs = []
286
  all_fake_probs = []
@@ -299,11 +344,11 @@ def deepfakes_audio_predict(input_audio):
299
  if len(all_real_probs) == 0:
300
  return "⚠️ All models failed. Please try again."
301
 
302
- # Step 5 — majority vote decision
303
  max_votes = max(votes.values())
304
  winners = [label for label, count in votes.items() if count == max_votes]
305
 
306
- # Tie-break: real > ai_synth > fake (bias toward safety)
307
  if "real" in winners:
308
  final = "real"
309
  elif "ai_synth" in winners:
 
9
  import tensorflow as tf
10
  from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
11
 
12
+ try:
13
+ import noisereduce as nr
14
+ NOISEREDUCE_AVAILABLE = True
15
+ print("noisereduce available β€” live recording denoising enabled.")
16
+ except ImportError:
17
+ NOISEREDUCE_AVAILABLE = False
18
+ print("noisereduce not available β€” skipping denoising.")
19
+
20
  # Set random seed for reproducibility.
21
  tf.random.set_seed(42)
22
 
 
37
 
38
  # ─────────────────────────────────────────────────────────────────────────────
39
  # Audio Ensemble: 3 models vote — majority wins
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # ─────────────────────────────────────────────────────────────────────────────
41
  AUDIO_MODELS = [
42
+ "MelodyMachine/Deepfake-audio-detection-V2",
43
+ "MelodyMachine/Deepfake-audio-detection",
44
+ "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
45
  ]
46
  AUDIO_SAMPLE_RATE = 16000
47
 
48
+ # ─── Thresholds ───────────────────────────────────────────────────────────────
49
+ REAL_THRESHOLD = 0.50
50
+ FAKE_THRESHOLD = 0.90
 
51
 
52
  print("Loading audio ensemble models ...")
53
  ensemble = []
 
66
 
67
 
68
  def convert_to_mp4(input_path):
 
69
  ext = os.path.splitext(input_path)[-1].lower()
70
  if ext == ".mp4":
71
  cap = cv2.VideoCapture(input_path)
 
91
 
92
 
93
  class DetectionPipeline:
 
 
94
  def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
95
  self.n_frames = n_frames
96
  self.batch_size = batch_size
 
187
  return "🚨 The image is FAKE."
188
 
189
 
190
def preprocess_audio(x: np.ndarray, sr: int, is_live: bool) -> np.ndarray:
    """
    Prepare raw audio samples for the ensemble models.

    Uploaded file:
        float32 (PCM scaled to [-1, 1]) → mono → resample

    Live recording (extra steps, in the order they are applied):
        float32 → mono → resample → denoise → trim silence → peak-normalize

    Args:
        x: Audio samples, 1-D (mono) or 2-D. A 2-D array is averaged over
            axis 1 (assumes a (samples, channels) layout — TODO confirm
            against the caller).
        sr: Sample rate of ``x`` in Hz.
        is_live: When true, apply the extra live-recording steps.

    Returns:
        A 1-D array at AUDIO_SAMPLE_RATE, zero-padded at the end so that it
        holds at least 0.5 s of audio. Empty input yields 0.5 s of silence.
    """
    min_samples = AUDIO_SAMPLE_RATE // 2  # 0.5 s at the target rate

    x = np.asarray(x)
    if x.size == 0:
        # Nothing to process; peak/resample calls below would fail on an
        # empty array, so hand back the minimum-length silent buffer.
        return np.zeros(min_samples, dtype=np.float32)

    # Step 1 — float32 + PCM normalise.
    # Scale by the integer dtype's own range so int32 input and very quiet
    # int16 input are handled correctly instead of guessing from the peak.
    source_dtype = x.dtype
    x = x.astype(np.float32)
    if np.issubdtype(source_dtype, np.signedinteger):
        x = x / float(-int(np.iinfo(source_dtype).min))
    elif np.abs(x).max() > 1.0:
        # Non-integer dtype carrying values outside [-1, 1]: keep the
        # original heuristic and treat them as int16-range samples.
        x = x / 32768.0

    # Step 2 — stereo → mono
    if x.ndim == 2:
        x = x.mean(axis=1)

    # Step 3 — resample to 16 kHz
    if sr != AUDIO_SAMPLE_RATE:
        print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
        x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)

    if is_live:
        print("[Audio] Live recording detected — applying enhanced preprocessing …")

        # Step 4 — Noise reduction
        # Uses first 0.5s as noise profile (usually silence before speaking).
        # NOTE(review): confirm that noisereduce honours ``y_noise`` when
        # ``stationary=False``; it may only be used in stationary mode.
        if NOISEREDUCE_AVAILABLE and len(x) > min_samples:
            noise_sample = x[:min_samples]
            x = nr.reduce_noise(
                y=x,
                sr=AUDIO_SAMPLE_RATE,
                y_noise=noise_sample,
                prop_decrease=0.75,  # aggressive but not total noise removal
                stationary=False,    # handles non-stationary noise (room noise)
            )
            print("[Audio] Noise reduction applied.")

        # Step 5 — Trim leading/trailing silence
        # Live recordings often have silence at start/end before/after speaking
        x, _ = librosa.effects.trim(
            x,
            top_db=20,  # anything 20dB below peak = silence
            frame_length=512,
            hop_length=128,
        )
        print(f"[Audio] After trim: {len(x)} samples ({len(x)/AUDIO_SAMPLE_RATE:.2f}s)")

        # Step 6 — Peak normalize to -3dBFS
        # Live mics often record too quietly, which confuses the model.
        # Trimming an all-silent recording can leave zero samples, so guard
        # the peak computation against an empty array.
        peak = np.abs(x).max() if x.size else 0.0
        if peak > 0:
            x = x / peak * 0.707  # normalize to ~-3dBFS
            print("[Audio] Peak normalization applied.")

    # Final check — must have at least 0.5s of audio
    if len(x) < min_samples:
        x = np.pad(x, (0, min_samples - len(x)), mode='constant')

    return x
253
+
254
+
255
+ def get_real_fake_probs(probs, id2label: dict):
256
  real_prob, fake_prob = None, None
257
 
258
+ print(f"[Audio] id2label: {id2label}")
259
+
260
  for idx, prob in enumerate(probs):
261
  label = id2label[idx].lower().strip()
262
  if label in ("real", "label_1", "genuine", "bonafide", "1"):
 
264
  elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
265
  fake_prob = float(prob)
266
 
 
267
  if real_prob is None or fake_prob is None:
268
  print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
269
  fake_prob = float(probs[0])
 
273
 
274
 
275
  def single_model_vote(x, entry):
 
 
 
 
276
  model_id = entry["id"]
277
  fe = entry["extractor"]
278
  m = entry["model"]
 
305
 
306
  def deepfakes_audio_predict(input_audio):
307
  """
308
+ Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
 
309
 
310
  Gradio gr.Audio() returns (sample_rate, numpy_array).
311
+ Detects if input is live recording or uploaded file and applies
312
+ appropriate preprocessing accordingly.
313
  """
314
  sr, x = input_audio
315
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
316
 
317
+ # ── Detect if live recording ──────────────────────────────────────────────
318
+ # Live recordings from browser microphone typically arrive at 48000 Hz.
319
+ # Uploaded files can be any sample rate but are rarely exactly 48000.
320
+ # Duration under 30s also strongly suggests live recording.
321
+ duration = len(x) / sr
322
+ is_live = (sr == 48000 and duration < 30.0)
323
+ print(f"[Audio] Source: {'πŸŽ™οΈ Live recording' if is_live else 'πŸ“ Uploaded file'} | duration={duration:.2f}s")
324
 
325
+ # ── Preprocess ────────────────────────────────────────────────────────────
326
+ x = preprocess_audio(x, sr, is_live)
 
 
 
 
 
 
 
327
 
328
+ # ── Ensemble voting ───────────────────────────────────────────────────────
329
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
330
  all_real_probs = []
331
  all_fake_probs = []
 
344
  if len(all_real_probs) == 0:
345
  return "⚠️ All models failed. Please try again."
346
 
347
+ # ── Majority vote with tie-break ──────────────────────────────────────────
348
  max_votes = max(votes.values())
349
  winners = [label for label, count in votes.items() if count == max_votes]
350
 
351
+ # Tie-break: bias toward real to avoid false positives on genuine voices
352
  if "real" in winners:
353
  final = "real"
354
  elif "ai_synth" in winners: