Update pipeline.py
Browse files- pipeline.py +88 -37
pipeline.py
CHANGED
|
@@ -3,6 +3,8 @@ import cv2
|
|
| 3 |
import torch
|
| 4 |
import zipfile
|
| 5 |
import librosa
|
|
|
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import tensorflow as tf
|
| 8 |
from facenet_pytorch import MTCNN
|
|
@@ -31,6 +33,42 @@ model = tf.keras.layers.TFSMLayer(
|
|
| 31 |
)
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class DetectionPipeline:
|
|
@@ -45,33 +83,53 @@ class DetectionPipeline:
|
|
| 45 |
def __call__(self, filename):
|
| 46 |
if self.input_modality == 'video':
|
| 47 |
print('Input modality is video.')
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
if
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return faces
|
| 76 |
|
| 77 |
elif self.input_modality == 'image':
|
|
@@ -161,9 +219,8 @@ def load_audio_model():
|
|
| 161 |
|
| 162 |
audio_model = RawNet(d_args=d_args, device='cpu')
|
| 163 |
|
| 164 |
-
# BUG FIX 2: Correct load_state_dict usage — second arg was wrongly a state_dict
|
| 165 |
ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
|
| 166 |
-
audio_model.load_state_dict(ckpt)
|
| 167 |
audio_model.eval()
|
| 168 |
return audio_model
|
| 169 |
|
|
@@ -179,10 +236,8 @@ def deepfakes_audio_predict(input_audio):
|
|
| 179 |
Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
|
| 180 |
numpy_array is int16 by default and needs float32 normalization.
|
| 181 |
"""
|
| 182 |
-
|
| 183 |
-
sr, x = input_audio # was: x, sr = input_audio ← WRONG
|
| 184 |
|
| 185 |
-
# BUG FIX 3: Convert int16 audio from Gradio to float32 and normalize to [-1, 1]
|
| 186 |
x = x.astype(np.float32)
|
| 187 |
if x.max() > 1.0:
|
| 188 |
x = x / 32768.0 # Normalize int16 range to float32
|
|
@@ -191,24 +246,20 @@ def deepfakes_audio_predict(input_audio):
|
|
| 191 |
if x.ndim == 2:
|
| 192 |
x = x.mean(axis=1)
|
| 193 |
|
| 194 |
-
#
|
| 195 |
if len(x) < NB_SAMP:
|
| 196 |
-
# Pad with zeros if audio is too short
|
| 197 |
x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
|
| 198 |
else:
|
| 199 |
-
# Trim to expected length if audio is too long
|
| 200 |
x = x[:NB_SAMP]
|
| 201 |
|
| 202 |
# Convert to tensor with batch dimension: [1, nb_samp]
|
| 203 |
x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
|
| 204 |
|
| 205 |
-
# Load model and run inference
|
| 206 |
audio_model = load_audio_model()
|
| 207 |
|
| 208 |
with torch.no_grad():
|
| 209 |
grads = audio_model(x_pt)
|
| 210 |
|
| 211 |
-
# Get the predicted class index
|
| 212 |
grads_np = grads.detach().numpy()
|
| 213 |
result = np.argmax(grads_np)
|
| 214 |
|
|
|
|
| 3 |
import torch
|
| 4 |
import zipfile
|
| 5 |
import librosa
|
| 6 |
+
import subprocess
|
| 7 |
+
import tempfile
|
| 8 |
import numpy as np
|
| 9 |
import tensorflow as tf
|
| 10 |
from facenet_pytorch import MTCNN
|
|
|
|
| 33 |
)
|
| 34 |
|
| 35 |
|
| 36 |
+
def convert_to_mp4(input_path):
    """Return a path to an .mp4 version of ``input_path`` that OpenCV can open.

    A file whose extension is already ``.mp4`` and that ``cv2.VideoCapture``
    can open is returned unchanged. Anything else (e.g. a ``.webm`` webcam
    recording, or an ``.mp4`` that OpenCV fails to open) is re-encoded with
    ffmpeg (libx264 video, AAC audio) into a newly created temporary file.

    Args:
        input_path: Path to the source video file.

    Returns:
        A ``(path, is_temp)`` tuple. ``is_temp`` is ``False`` when ``path`` is
        the untouched ``input_path`` and ``True`` when ``path`` is a temporary
        file created here; in the latter case the caller is responsible for
        deleting it when done.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero return code.
        OSError: If the ffmpeg executable cannot be launched.
    """
    ext = os.path.splitext(input_path)[1].lower()
    if ext == ".mp4":
        # Already mp4 — verify OpenCV can actually open it before trusting it.
        cap = cv2.VideoCapture(input_path)
        try:
            ok = cap.isOpened()
        finally:
            cap.release()
        if ok:
            return input_path, False  # (path, is_temp)

    # Reserve a named temp file so OpenCV can later open the result by path.
    # The handle is closed immediately; ffmpeg overwrites the file ("-y").
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    output_path = tmp.name

    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError:
        # ffmpeg could not be started at all (e.g. not installed); do not
        # leave the empty placeholder file behind.
        os.unlink(output_path)
        raise

    if result.returncode != 0:
        os.unlink(output_path)
        # errors="replace": ffmpeg's stderr is not guaranteed to be valid
        # UTF-8, and a decode failure here would mask the real error.
        raise RuntimeError(
            f"ffmpeg conversion failed:\n{result.stderr.decode(errors='replace')}"
        )
    return output_path, True  # (path, is_temp)
|
| 72 |
|
| 73 |
|
| 74 |
class DetectionPipeline:
|
|
|
|
| 83 |
def __call__(self, filename):
|
| 84 |
if self.input_modality == 'video':
|
| 85 |
print('Input modality is video.')
|
| 86 |
+
|
| 87 |
+
# BUG FIX: Webcam recordings from Gradio arrive as .webm (VP8/VP9).
|
| 88 |
+
# OpenCV has no WebM support in headless builds — convert to .mp4 first.
|
| 89 |
+
converted_path, is_temp = convert_to_mp4(filename)
|
| 90 |
+
print(f"Processing video: {converted_path} (converted={is_temp})")
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
v_cap = cv2.VideoCapture(converted_path)
|
| 94 |
+
if not v_cap.isOpened():
|
| 95 |
+
raise RuntimeError(f"OpenCV could not open video: {converted_path}")
|
| 96 |
+
|
| 97 |
+
v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 98 |
+
if v_len == 0:
|
| 99 |
+
raise RuntimeError("Video has 0 frames after conversion.")
|
| 100 |
+
|
| 101 |
+
if self.n_frames is None:
|
| 102 |
+
sample = np.arange(0, v_len)
|
| 103 |
+
else:
|
| 104 |
+
sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
|
| 105 |
+
|
| 106 |
+
faces = []
|
| 107 |
+
frames = []
|
| 108 |
+
for j in range(v_len):
|
| 109 |
+
success = v_cap.grab()
|
| 110 |
+
if j in sample:
|
| 111 |
+
success, frame = v_cap.retrieve()
|
| 112 |
+
if not success:
|
| 113 |
+
continue
|
| 114 |
+
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
| 115 |
+
|
| 116 |
+
if self.resize is not None:
|
| 117 |
+
frame = frame.resize([int(d * self.resize) for d in frame.size])
|
| 118 |
+
frames.append(frame)
|
| 119 |
+
|
| 120 |
+
if len(frames) % self.batch_size == 0 or j == sample[-1]:
|
| 121 |
+
face2 = cv2.resize(frame, (224, 224))
|
| 122 |
+
faces.append(face2)
|
| 123 |
+
|
| 124 |
+
v_cap.release()
|
| 125 |
+
finally:
|
| 126 |
+
# Clean up the temp converted file
|
| 127 |
+
if is_temp and os.path.exists(converted_path):
|
| 128 |
+
os.unlink(converted_path)
|
| 129 |
+
|
| 130 |
+
if len(faces) == 0:
|
| 131 |
+
raise RuntimeError("No frames could be extracted from the video.")
|
| 132 |
+
|
| 133 |
return faces
|
| 134 |
|
| 135 |
elif self.input_modality == 'image':
|
|
|
|
| 219 |
|
| 220 |
audio_model = RawNet(d_args=d_args, device='cpu')
|
| 221 |
|
|
|
|
| 222 |
ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
|
| 223 |
+
audio_model.load_state_dict(ckpt)
|
| 224 |
audio_model.eval()
|
| 225 |
return audio_model
|
| 226 |
|
|
|
|
| 236 |
Gradio gr.Audio() returns a tuple: (sample_rate, numpy_array)
|
| 237 |
numpy_array is int16 by default and needs float32 normalization.
|
| 238 |
"""
|
| 239 |
+
sr, x = input_audio
|
|
|
|
| 240 |
|
|
|
|
| 241 |
x = x.astype(np.float32)
|
| 242 |
if x.max() > 1.0:
|
| 243 |
x = x / 32768.0 # Normalize int16 range to float32
|
|
|
|
| 246 |
if x.ndim == 2:
|
| 247 |
x = x.mean(axis=1)
|
| 248 |
|
| 249 |
+
# RawNet2 expects exactly nb_samp=64600 samples — pad or trim
|
| 250 |
if len(x) < NB_SAMP:
|
|
|
|
| 251 |
x = np.pad(x, (0, NB_SAMP - len(x)), mode='constant')
|
| 252 |
else:
|
|
|
|
| 253 |
x = x[:NB_SAMP]
|
| 254 |
|
| 255 |
# Convert to tensor with batch dimension: [1, nb_samp]
|
| 256 |
x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
|
| 257 |
|
|
|
|
| 258 |
audio_model = load_audio_model()
|
| 259 |
|
| 260 |
with torch.no_grad():
|
| 261 |
grads = audio_model(x_pt)
|
| 262 |
|
|
|
|
| 263 |
grads_np = grads.detach().numpy()
|
| 264 |
result = np.argmax(grads_np)
|
| 265 |
|