pavankumarvk committed on
Commit
3cbb0e7
·
verified ·
1 Parent(s): d9f3145

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +124 -140
pipeline.py CHANGED
@@ -7,34 +7,71 @@ import subprocess
7
  import tempfile
8
  import numpy as np
9
  import tensorflow as tf
10
- from facenet_pytorch import MTCNN
11
- from rawnet import RawNet
12
-
13
 
14
  # Set random seed for reproducibility.
15
  tf.random.set_seed(42)
16
 
17
- # Extract model if not already extracted
18
  if not os.path.exists("efficientnet-b0"):
19
  local_zip = "./efficientnet-b0.zip"
20
  if os.path.exists(local_zip):
21
  zip_ref = zipfile.ZipFile(local_zip, 'r')
22
  zip_ref.extractall()
23
  zip_ref.close()
24
- print("Model extracted successfully!")
25
 
26
- # Load EfficientNet model using TFSMLayer (Keras 3 compatible)
27
- model = tf.keras.layers.TFSMLayer(
28
  "efficientnet-b0/",
29
  call_endpoint="serving_default"
30
  )
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def convert_to_mp4(input_path):
34
- """
35
- Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
36
- Returns the path to the converted file, or the original path if already mp4.
37
- """
38
  ext = os.path.splitext(input_path)[-1].lower()
39
  if ext == ".mp4":
40
  cap = cv2.VideoCapture(input_path)
@@ -48,13 +85,9 @@ def convert_to_mp4(input_path):
48
  output_path = tmp.name
49
 
50
  cmd = [
51
- "ffmpeg", "-y",
52
- "-i", input_path,
53
- "-c:v", "libx264",
54
- "-preset", "fast",
55
- "-crf", "23",
56
- "-c:a", "aac",
57
- output_path
58
  ]
59
  result = subprocess.run(cmd, capture_output=True)
60
  if result.returncode != 0:
@@ -64,7 +97,7 @@ def convert_to_mp4(input_path):
64
 
65
 
66
  class DetectionPipeline:
67
- """Pipeline class for detecting faces in the frames of a video file."""
68
 
69
  def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
70
  self.n_frames = n_frames
@@ -87,15 +120,14 @@ class DetectionPipeline:
87
  if v_len == 0:
88
  raise RuntimeError("Video has 0 frames after conversion.")
89
 
90
- if self.n_frames is None:
91
- sample = np.arange(0, v_len)
92
- else:
93
- sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
94
 
95
- faces = []
96
- frames = []
97
  for j in range(v_len):
98
- success = v_cap.grab()
99
  if j in sample:
100
  success, frame = v_cap.retrieve()
101
  if not success:
@@ -105,9 +137,7 @@ class DetectionPipeline:
105
  frame = frame.resize([int(d * self.resize) for d in frame.size])
106
  frames.append(frame)
107
  if len(frames) % self.batch_size == 0 or j == sample[-1]:
108
- face2 = cv2.resize(frame, (224, 224))
109
- faces.append(face2)
110
-
111
  v_cap.release()
112
  finally:
113
  if is_temp and os.path.exists(converted_path):
@@ -120,18 +150,10 @@ class DetectionPipeline:
120
  elif self.input_modality == 'image':
121
  print('Input modality is image.')
122
  image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
123
- image = cv2.resize(image, (224, 224))
124
- return image
125
-
126
- elif self.input_modality == 'audio':
127
- print("Input modality is audio.")
128
- x, sr = librosa.load(filename)
129
- x_pt = torch.Tensor(x)
130
- x_pt = torch.unsqueeze(x_pt, dim=0)
131
- return x_pt
132
 
133
  else:
134
- raise ValueError("Invalid input modality. Must be either 'video' or 'image'")
135
 
136
 
137
  detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
@@ -140,126 +162,75 @@ detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image
140
 
141
  def deepfakes_video_predict(input_video):
142
  faces = detection_video_pipeline(input_video)
143
- total = 0
144
- real_res = []
145
- fake_res = []
146
 
147
  for face in faces:
148
  face2 = face / 255
149
- pred = model(np.expand_dims(face2, axis=0))
150
  pred = list(pred.values())[0].numpy()[0]
151
- real, fake = pred[0], pred[1]
152
- real_res.append(real)
153
- fake_res.append(fake)
154
- total += 1
155
- pred2 = pred[1]
156
- if pred2 > 0.5:
157
- fake += 1
158
- else:
159
- real += 1
160
 
161
  real_mean = np.mean(real_res)
162
  fake_mean = np.mean(fake_res)
163
- print(f"Real Faces: {real_mean}")
164
- print(f"Fake Faces: {fake_mean}")
165
 
166
  if real_mean >= 0.5:
167
- text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
168
  else:
169
- text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
170
- return text
171
 
172
 
173
  def deepfakes_image_predict(input_image):
174
- faces = detection_image_pipeline(input_image)
175
- face2 = faces / 255
176
- pred = model(np.expand_dims(face2, axis=0))
177
  pred = list(pred.values())[0].numpy()[0]
178
  real, fake = pred[0], pred[1]
179
  if real > 0.5:
180
- text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
181
  else:
182
- text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
183
- return text2
184
-
185
-
186
- def load_audio_model():
187
- d_args = {
188
- "nb_samp": 64600,
189
- "first_conv": 1024,
190
- "in_channels": 1,
191
- "filts": [20, [20, 20], [20, 128], [128, 128]],
192
- "blocks": [2, 4],
193
- "nb_fc_node": 1024,
194
- "gru_node": 1024,
195
- "nb_gru_layer": 3,
196
- "nb_classes": 2
197
- }
198
- audio_model = RawNet(d_args=d_args, device='cpu')
199
- ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
200
- audio_model.load_state_dict(ckpt)
201
- audio_model.eval()
202
- return audio_model
203
-
204
-
205
- RAWNET_SAMPLE_RATE = 16000 # RawNet2 was trained strictly on 16kHz β€” never change
206
- NB_SAMP = 64600 # Exactly 4.0375 seconds at 16kHz
207
-
208
- # ─── Confidence thresholds for 3-class labelling ────────────────────────────
209
- # RawNet2 has 2 output classes (real / fake). We derive a 3rd class
210
- # "AI Synthesized" from the confidence score:
211
- #
212
- # real_prob >= REAL_THRESHOLD → Genuine human voice
213
- # fake_prob >= FAKE_THRESHOLD → Manipulated / spliced audio
214
- # anything in between → AI Synthesized / TTS / Voice-cloned
215
- #
216
- # Why this works: TTS and voice-clone audio confuses RawNet2 — it produces
217
- # low-confidence outputs for both classes because it was trained on older
218
- # spoofing attacks. That uncertainty is the signal we exploit.
219
- REAL_THRESHOLD = 0.75
220
- FAKE_THRESHOLD = 0.75
221
 
222
 
223
  def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
224
  """
225
- Map RawNet2 2-class probabilities β†’ 3-class human-readable label.
226
 
227
- Classes:
228
- - Real Human Voice : model is confident it's real
229
- - AI Synthesized : model is uncertain (TTS / voice-clone zone)
230
- - Fake / Manipulated : model is confident it's fake (spliced, replayed)
231
  """
232
  print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")
233
 
234
  if real_prob >= REAL_THRESHOLD:
235
- confidence = round(real_prob * 100, 2)
236
- return f"βœ… Real Human Voice\nConfidence: {confidence}%"
237
 
238
  elif fake_prob >= FAKE_THRESHOLD:
239
- confidence = round(fake_prob * 100, 2)
240
- return f"🚨 Fake / Manipulated Audio\nConfidence: {confidence}%"
241
 
242
  else:
243
- # Low confidence on both sides β†’ hallmark of modern TTS / voice cloning
244
- ai_confidence = round(fake_prob * 100, 2)
245
  return (
246
  f"πŸ€– AI Synthesized / Voice Cloned\n"
247
- f"Confidence: {ai_confidence}%\n"
248
- f"(Model uncertainty indicates TTS or neural voice cloning)"
249
  )
250
 
251
 
252
  def deepfakes_audio_predict(input_audio):
253
  """
254
- Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array).
255
 
256
- Pipeline:
 
 
257
  1. float32 conversion + int16 normalisation
258
  2. Stereo β†’ mono
259
- 3. Resample to 16000 Hz ← critical: RawNet2 SincConv assumes 16kHz
260
- 4. Pad / trim to NB_SAMP (64600) samples
261
- 5. RawNet2 inference β†’ log-softmax β†’ probabilities
262
- 6. 3-class decision via confidence thresholds
263
  """
264
  sr, x = input_audio
265
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
@@ -267,34 +238,47 @@ def deepfakes_audio_predict(input_audio):
267
  # Step 1 β€” float32 + normalise
268
  x = x.astype(np.float32)
269
  if np.abs(x).max() > 1.0:
270
- x = x / 32768.0 # int16 β†’ [-1, 1]
271
 
272
- # Step 2 β€” stereo β†’ mono (must precede librosa.resample which needs 1-D)
273
  if x.ndim == 2:
274
  x = x.mean(axis=1)
275
 
276
- # Step 3 β€” resample to 16 kHz (THE root-cause fix)
277
- if sr != RAWNET_SAMPLE_RATE:
278
- print(f"[Audio] Resampling {sr} Hz β†’ {RAWNET_SAMPLE_RATE} Hz …")
279
- x = librosa.resample(x, orig_sr=sr, target_sr=RAWNET_SAMPLE_RATE)
280
- print(f"[Audio] After resample: {len(x)} samples ({len(x)/RAWNET_SAMPLE_RATE:.2f}s)")
281
 
282
- # Step 4 β€” pad or trim to exactly NB_SAMP
283
- if len(x) < NB_SAMP:
284
- x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
285
- else:
286
- x = x[:NB_SAMP]
287
-
288
- # Step 5 β€” inference
289
- x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0) # [1, NB_SAMP]
290
- audio_model = load_audio_model()
291
 
292
  with torch.no_grad():
293
- log_probs = audio_model(x_pt) # log-softmax output
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- probs = torch.exp(log_probs).numpy()[0] # convert log β†’ actual probabilities
296
- real_prob = float(probs[0])
297
- fake_prob = float(probs[1])
 
 
298
 
299
- # Step 6 β€” 3-class label
300
  return classify_audio_3class(real_prob, fake_prob)
 
7
  import tempfile
8
  import numpy as np
9
  import tensorflow as tf
10
+ from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 
 
11
 
12
  # Set random seed for reproducibility.
13
  tf.random.set_seed(42)
14
 
# Extract the EfficientNet SavedModel if it has not been extracted yet.
if not os.path.exists("efficientnet-b0"):
    local_zip = "./efficientnet-b0.zip"
    if os.path.exists(local_zip):
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
            zip_ref.extractall()
        print("EfficientNet model extracted successfully!")

# Load the EfficientNet model used for image/video prediction.
efficientnet_model = tf.keras.layers.TFSMLayer(
    "efficientnet-b0/",
    call_endpoint="serving_default"
)

# ─────────────────────────────────────────────────────────────────────────────
# Audio Model: Wav2Vec2 fine-tuned for deepfake detection
#
# NOTE(review): the rationale below is the commit author's description of the
# models and their training data; confirm it against the model cards.
#
# Why replace RawNet2?
#   RawNet2 was trained on ASVspoof 2019 — a dataset that predates modern TTS
#   systems (ElevenLabs, Vall-E, XTTS, Bark, etc.). It has never seen this
#   class of audio and consistently misclassifies it as "Real".
#
# Why Wav2Vec2?
#   "mo-thecreator/deepfake-audio-detection" is a Wav2Vec2-base model
#   fine-tuned on FakeAVCeleb + ASVspoof 2021 LA, covering:
#     - Genuine human speech
#     - Neural TTS (modern AI voices)
#     - Voice conversion / cloning
#     - Replay / splicing attacks
# ─────────────────────────────────────────────────────────────────────────────
AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
AUDIO_SAMPLE_RATE = 16000  # Wav2Vec2 expects 16kHz

# Both objects are loaded once at import time and shared by every request.
print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
audio_model = AutoModelForAudioClassification.from_pretrained(AUDIO_MODEL_ID)
audio_model.eval()
print("Audio model loaded successfully!")

# Map the model's raw label → "real" or "fake".
LABEL_MAP = {
    "LABEL_0": "real",
    "LABEL_1": "fake",
    "real": "real",
    "fake": "fake",
}

# ─── Confidence thresholds ────────────────────────────────────────────────────
# High confidence real → Genuine Human Voice
# High confidence fake → Fake / Manipulated Audio
# Low confidence both  → AI Synthesized / Voice Cloned
# Author's heuristic: modern TTS confuses the model, so it lands in the
# uncertain middle zone, and that low-confidence signature is treated as the
# AI-synthesis signal.
REAL_THRESHOLD = 0.75
FAKE_THRESHOLD = 0.70
71
+
72
 
73
  def convert_to_mp4(input_path):
74
+ """Convert any video to .mp4 using ffmpeg (handles webcam .webm, etc.)"""
 
 
 
75
  ext = os.path.splitext(input_path)[-1].lower()
76
  if ext == ".mp4":
77
  cap = cv2.VideoCapture(input_path)
 
85
  output_path = tmp.name
86
 
87
  cmd = [
88
+ "ffmpeg", "-y", "-i", input_path,
89
+ "-c:v", "libx264", "-preset", "fast",
90
+ "-crf", "23", "-c:a", "aac", output_path
 
 
 
 
91
  ]
92
  result = subprocess.run(cmd, capture_output=True)
93
  if result.returncode != 0:
 
97
 
98
 
99
  class DetectionPipeline:
100
+ """Pipeline for detecting faces in video frames or processing images."""
101
 
102
  def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
103
  self.n_frames = n_frames
 
120
  if v_len == 0:
121
  raise RuntimeError("Video has 0 frames after conversion.")
122
 
123
+ sample = (
124
+ np.arange(0, v_len) if self.n_frames is None
125
+ else np.linspace(0, v_len - 1, self.n_frames).astype(int)
126
+ )
127
 
128
+ faces, frames = [], []
 
129
  for j in range(v_len):
130
+ v_cap.grab()
131
  if j in sample:
132
  success, frame = v_cap.retrieve()
133
  if not success:
 
137
  frame = frame.resize([int(d * self.resize) for d in frame.size])
138
  frames.append(frame)
139
  if len(frames) % self.batch_size == 0 or j == sample[-1]:
140
+ faces.append(cv2.resize(frame, (224, 224)))
 
 
141
  v_cap.release()
142
  finally:
143
  if is_temp and os.path.exists(converted_path):
 
150
  elif self.input_modality == 'image':
151
  print('Input modality is image.')
152
  image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
153
+ return cv2.resize(image, (224, 224))
 
 
 
 
 
 
 
 
154
 
155
  else:
156
+ raise ValueError(f"Invalid input modality: {self.input_modality}")
157
 
158
 
159
  detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
 
162
 
def deepfakes_video_predict(input_video):
    """Classify a video as REAL or FAKE from its sampled frames.

    The frames come from ``detection_video_pipeline``; each one is scored by
    ``efficientnet_model`` and the per-frame real/fake scores are averaged.

    Args:
        input_video: Video input forwarded unchanged to
            ``detection_video_pipeline``.

    Returns:
        str: A verdict line followed by the "Deepfakes Confidence" percentage,
        or a short notice when the pipeline yields no frames.
    """
    faces = detection_video_pipeline(input_video)

    real_res, fake_res = [], []
    for face in faces:
        # Presumably 0-255 pixel data scaled into [0, 1] — confirm against
        # the model's expected input range.
        face2 = face / 255
        pred = efficientnet_model(np.expand_dims(face2, axis=0))
        # The serving endpoint returns a dict; take its first output tensor.
        pred = list(pred.values())[0].numpy()[0]
        real_res.append(pred[0])
        fake_res.append(pred[1])

    # Without this guard np.mean([]) is NaN, which fails the >= 0.5 test and
    # would be reported as "FAKE ... nan%".
    if not real_res:
        return "No frames could be analysed in this video."

    real_mean = np.mean(real_res)
    fake_mean = np.mean(fake_res)
    print(f"Real Faces: {real_mean:.4f} | Fake Faces: {fake_mean:.4f}")

    if real_mean >= 0.5:
        # The figure shown is always the deepfake likelihood, hence 100 - real.
        return "The video is REAL.\nDeepfakes Confidence: " + str(round(100 - real_mean * 100, 3)) + "%"
    return "The video is FAKE.\nDeepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
 
182
 
183
 
def deepfakes_image_predict(input_image):
    """Classify a single image as REAL or FAKE.

    The image is prepared by ``detection_image_pipeline``, divided by 255 and
    scored by ``efficientnet_model``.

    Args:
        input_image: Image input forwarded unchanged to
            ``detection_image_pipeline``.

    Returns:
        str: A verdict line followed by the "Deepfakes Confidence" percentage.
    """
    scaled = detection_image_pipeline(input_image) / 255
    batch = np.expand_dims(scaled, axis=0)
    # The serving endpoint returns a dict; take its first output tensor.
    scores = list(efficientnet_model(batch).values())[0].numpy()[0]
    real, fake = scores[0], scores[1]

    if real > 0.5:
        # The figure shown is always the deepfake likelihood, hence 100 - real.
        return "The image is REAL.\nDeepfakes Confidence: " + str(round(100 - real * 100, 3)) + "%"
    return "The image is FAKE.\nDeepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
def classify_audio_3class(real_prob: float, fake_prob: float, *,
                          real_threshold=None, fake_threshold=None) -> str:
    """
    Map 2-class probabilities → 3-class human-readable result.

        real_prob >= real_threshold   → Real Human Voice
        fake_prob >= fake_threshold   → Fake / Manipulated Audio
        both below their thresholds   → AI Synthesized / Voice Cloned

    Args:
        real_prob: Probability assigned to the "real" class.
        fake_prob: Probability assigned to the "fake" class.
        real_threshold: Optional override (float); defaults to the
            module-level ``REAL_THRESHOLD``.
        fake_threshold: Optional override (float); defaults to the
            module-level ``FAKE_THRESHOLD``.

    Returns:
        str: Label line followed by a "Confidence: N%" line; the uncertain
        case adds a third explanatory line.
    """
    # Resolve defaults lazily so explicit overrides never touch the globals.
    if real_threshold is None:
        real_threshold = REAL_THRESHOLD
    if fake_threshold is None:
        fake_threshold = FAKE_THRESHOLD

    print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")

    # The "real" check comes first, so it wins if both thresholds are met.
    if real_prob >= real_threshold:
        return f"✅ Real Human Voice\nConfidence: {round(real_prob * 100, 2)}%"

    if fake_prob >= fake_threshold:
        return f"🚨 Fake / Manipulated Audio\nConfidence: {round(fake_prob * 100, 2)}%"

    # Neither class wins confidently → treated as modern TTS / voice cloning.
    # The reported figure is the larger of the two "not real" estimates.
    ai_conf = round(max(fake_prob, 1 - real_prob) * 100, 2)
    return (
        "🤖 AI Synthesized / Voice Cloned\n"
        f"Confidence: {ai_conf}%\n"
        "(Model uncertainty indicates modern neural TTS or voice cloning)"
    )
220
 
221
 
def deepfakes_audio_predict(input_audio):
    """
    Detect whether audio is: Real Human Voice / AI Synthesized / Fake.

    Args:
        input_audio: ``(sample_rate, numpy_array)`` tuple as returned by
            Gradio ``gr.Audio()``, or ``None`` when no clip was supplied.
            A 2-D array is treated as (samples, channels).

    Returns:
        str: The label text from ``classify_audio_3class``, or a short notice
        when there is no audio to analyse.

    Steps:
        1. float32 conversion + integer-range normalisation
        2. Stereo → mono
        3. Resample to AUDIO_SAMPLE_RATE (16000 Hz)
        4. Feature extraction + inference → softmax probabilities
        5. 3-class decision via confidence thresholds
    """
    # No clip supplied: unpacking None below would raise TypeError.
    if input_audio is None:
        return "No audio provided. Please record or upload a clip."

    sr, x = input_audio
    x = np.asarray(x)
    print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")

    # An empty clip would make .max() raise ValueError further down.
    if x.size == 0:
        return "The audio clip is empty. Please record or upload a clip."

    # Step 1 — float32 + normalise
    if np.issubdtype(x.dtype, np.integer):
        # Scale by the integer type's own range (32768 for int16) instead of
        # guessing from the peak amplitude, so int32 and quiet clips are right.
        full_scale = float(np.iinfo(x.dtype).max) + 1.0
        x = x.astype(np.float32) / full_scale
    else:
        x = x.astype(np.float32)
        if np.abs(x).max() > 1.0:
            # Float samples outside [-1, 1] are assumed to be on an int16 scale.
            x = x / 32768.0

    # Step 2 — stereo → mono (must precede resample — librosa needs 1-D)
    if x.ndim == 2:
        x = x.mean(axis=1)

    # Step 3 — resample to 16 kHz
    if sr != AUDIO_SAMPLE_RATE:
        print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
        x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
        print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")

    # Step 4 — Wav2Vec2 inference
    inputs = audio_feature_extractor(
        x,
        sampling_rate=AUDIO_SAMPLE_RATE,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        logits = audio_model(**inputs).logits

    # Probabilities of the single clip in the batch, as plain Python floats.
    probs = torch.softmax(logits, dim=-1)[0].tolist()

    # Map model label indices → real / fake probabilities.
    # None marks "label not found"; a 0.0 sentinel cannot be told apart from
    # a genuine zero probability.
    id2label = audio_model.config.id2label
    real_prob, fake_prob = None, None

    for idx, prob in enumerate(probs):
        raw_label = id2label[idx]
        mapped = LABEL_MAP.get(raw_label, raw_label.lower())
        if mapped == "real":
            real_prob = prob
        elif mapped == "fake":
            fake_prob = prob

    # Fallback: if either label is missing, assume index order (0=real, 1=fake)
    if real_prob is None or fake_prob is None:
        print("[Audio] Warning: label mapping failed — using index order (0=real, 1=fake)")
        real_prob = probs[0]
        fake_prob = probs[1]

    # Step 5 — 3-class decision
    return classify_audio_3class(real_prob, fake_prob)