pavankumarvk committed on
Commit
5808494
·
verified ·
1 Parent(s): b23d2b0

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +88 -37
pipeline.py CHANGED
@@ -3,6 +3,8 @@ import cv2
3
  import torch
4
  import zipfile
5
  import librosa
 
 
6
  import numpy as np
7
  import tensorflow as tf
8
  from facenet_pytorch import MTCNN
@@ -31,6 +33,42 @@ model = tf.keras.layers.TFSMLayer(
31
  )
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  class DetectionPipeline:
@@ -45,33 +83,53 @@ class DetectionPipeline:
45
  def __call__(self, filename):
46
  if self.input_modality == 'video':
47
  print('Input modality is video.')
48
- v_cap = cv2.VideoCapture(filename)
49
- v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
50
-
51
- if self.n_frames is None:
52
- sample = np.arange(0, v_len)
53
- else:
54
- sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
55
-
56
- faces = []
57
- frames = []
58
- for j in range(v_len):
59
- success = v_cap.grab()
60
- if j in sample:
61
- success, frame = v_cap.retrieve()
62
- if not success:
63
- continue
64
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
65
-
66
- if self.resize is not None:
67
- frame = frame.resize([int(d * self.resize) for d in frame.size])
68
- frames.append(frame)
69
-
70
- if len(frames) % self.batch_size == 0 or j == sample[-1]:
71
- face2 = cv2.resize(frame, (224, 224))
72
- faces.append(face2)
73
-
74
- v_cap.release()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return faces
76
 
77
  elif self.input_modality == 'image':
@@ -161,9 +219,8 @@ def load_audio_model():
161
 
162
  audio_model = RawNet(d_args=d_args, device='cpu')
163
 
164
- # BUG FIX 2: Correct load_state_dict usage — second arg was wrongly a state_dict
165
  ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
166
- audio_model.load_state_dict(ckpt) # Fixed: removed incorrect model_dict argument
167
  audio_model.eval()
168
  return audio_model
169
 
@@ -179,10 +236,8 @@ def deepfakes_audio_predict(input_audio):
179
  Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
180
  numpy_array is int16 by default and needs float32 normalization.
181
  """
182
- # BUG FIX 1: Gradio returns (sample_rate, data) original code had reversed order
183
- sr, x = input_audio # was: x, sr = input_audio ← WRONG
184
 
185
- # BUG FIX 3: Convert int16 audio from Gradio to float32 and normalize to [-1, 1]
186
  x = x.astype(np.float32)
187
  if x.max() > 1.0:
188
  x = x / 32768.0 # Normalize int16 range to float32
@@ -191,24 +246,20 @@ def deepfakes_audio_predict(input_audio):
191
  if x.ndim == 2:
192
  x = x.mean(axis=1)
193
 
194
- # BUG FIX 4: RawNet2 expects exactly nb_samp=64600 samples — pad or trim
195
  if len(x) < NB_SAMP:
196
- # Pad with zeros if audio is too short
197
  x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
198
  else:
199
- # Trim to expected length if audio is too long
200
  x = x[:NB_SAMP]
201
 
202
  # Convert to tensor with batch dimension: [1, nb_samp]
203
  x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
204
 
205
- # Load model and run inference
206
  audio_model = load_audio_model()
207
 
208
  with torch.no_grad():
209
  grads = audio_model(x_pt)
210
 
211
- # Get the predicted class index
212
  grads_np = grads.detach().numpy()
213
  result = np.argmax(grads_np)
214
 
 
3
  import torch
4
  import zipfile
5
  import librosa
6
+ import subprocess
7
+ import tempfile
8
  import numpy as np
9
  import tensorflow as tf
10
  from facenet_pytorch import MTCNN
 
33
  )
34
 
35
 
36
def convert_to_mp4(input_path):
    """
    Make sure a video is available as an .mp4 file that OpenCV can open.

    If ``input_path`` already has an ``.mp4`` extension and
    ``cv2.VideoCapture`` can open it, the file is used as is. In every other
    case (different extension, or an .mp4 that OpenCV cannot open) the video
    is re-encoded with ffmpeg (libx264 video, AAC audio) into a new temporary
    ``.mp4`` file.

    Args:
        input_path: Path to the source video (e.g. a .webm webcam recording).

    Returns:
        A ``(path, is_temp)`` tuple. ``path`` is the file to hand to OpenCV.
        ``is_temp`` is ``True`` when ``path`` is a temporary file created
        here; the caller is then responsible for deleting it when done.
        ``is_temp`` is ``False`` when ``path`` is the unchanged ``input_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The message
            contains ffmpeg's stderr output.
        OSError: If the ffmpeg executable cannot be launched (for example it
            is not installed); propagated unchanged from ``subprocess.run``.
    """
    ext = os.path.splitext(input_path)[-1].lower()
    if ext == ".mp4":
        # Already mp4 — verify OpenCV can actually open it before trusting it.
        cap = cv2.VideoCapture(input_path)
        try:
            ok = cap.isOpened()
        finally:
            cap.release()
        if ok:
            return input_path, False  # (path, is_temp)

    # Reserve a named temp file so OpenCV can later open the result by path.
    # It is closed immediately so ffmpeg can overwrite it (hence "-y").
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    output_path = tmp.name

    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        output_path,
    ]

    converted = False
    try:
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            # errors="replace": ffmpeg output is not guaranteed to be valid
            # UTF-8, and a decode error must not mask the real failure.
            stderr_text = result.stderr.decode(errors="replace")
            raise RuntimeError(
                f"ffmpeg conversion failed:\n{stderr_text}"
            )
        converted = True
    finally:
        # On ANY failure (non-zero exit, ffmpeg missing, interruption) remove
        # the reserved temp file so it does not leak.
        if not converted:
            try:
                os.unlink(output_path)
            except FileNotFoundError:
                pass

    return output_path, True  # (path, is_temp)
72
 
73
 
74
  class DetectionPipeline:
 
83
  def __call__(self, filename):
84
  if self.input_modality == 'video':
85
  print('Input modality is video.')
86
+
87
+ # BUG FIX: Webcam recordings from Gradio arrive as .webm (VP8/VP9).
88
+ # OpenCV has no WebM support in headless builds — convert to .mp4 first.
89
+ converted_path, is_temp = convert_to_mp4(filename)
90
+ print(f"Processing video: {converted_path} (converted={is_temp})")
91
+
92
+ try:
93
+ v_cap = cv2.VideoCapture(converted_path)
94
+ if not v_cap.isOpened():
95
+ raise RuntimeError(f"OpenCV could not open video: {converted_path}")
96
+
97
+ v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
98
+ if v_len == 0:
99
+ raise RuntimeError("Video has 0 frames after conversion.")
100
+
101
+ if self.n_frames is None:
102
+ sample = np.arange(0, v_len)
103
+ else:
104
+ sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
105
+
106
+ faces = []
107
+ frames = []
108
+ for j in range(v_len):
109
+ success = v_cap.grab()
110
+ if j in sample:
111
+ success, frame = v_cap.retrieve()
112
+ if not success:
113
+ continue
114
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
115
+
116
+ if self.resize is not None:
117
+ frame = frame.resize([int(d * self.resize) for d in frame.size])
118
+ frames.append(frame)
119
+
120
+ if len(frames) % self.batch_size == 0 or j == sample[-1]:
121
+ face2 = cv2.resize(frame, (224, 224))
122
+ faces.append(face2)
123
+
124
+ v_cap.release()
125
+ finally:
126
+ # Clean up the temp converted file
127
+ if is_temp and os.path.exists(converted_path):
128
+ os.unlink(converted_path)
129
+
130
+ if len(faces) == 0:
131
+ raise RuntimeError("No frames could be extracted from the video.")
132
+
133
  return faces
134
 
135
  elif self.input_modality == 'image':
 
219
 
220
  audio_model = RawNet(d_args=d_args, device='cpu')
221
 
 
222
  ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
223
+ audio_model.load_state_dict(ckpt)
224
  audio_model.eval()
225
  return audio_model
226
 
 
236
  Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
237
  numpy_array is int16 by default and needs float32 normalization.
238
  """
239
+ sr, x = input_audio
 
240
 
 
241
  x = x.astype(np.float32)
242
  if x.max() > 1.0:
243
  x = x / 32768.0 # Normalize int16 range to float32
 
246
  if x.ndim == 2:
247
  x = x.mean(axis=1)
248
 
249
+ # RawNet2 expects exactly nb_samp=64600 samples — pad or trim
250
  if len(x) < NB_SAMP:
 
251
  x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
252
  else:
 
253
  x = x[:NB_SAMP]
254
 
255
  # Convert to tensor with batch dimension: [1, nb_samp]
256
  x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
257
 
 
258
  audio_model = load_audio_model()
259
 
260
  with torch.no_grad():
261
  grads = audio_model(x_pt)
262
 
 
263
  grads_np = grads.detach().numpy()
264
  result = np.argmax(grads_np)
265