Spaces:
Running
Running
Vinh.Vu committed on
Commit ·
ee1da4c
1
Parent(s): 218059f
Improve generate video speed
Browse files
- App/app.py +57 -46
- App/static/Technology.jsx +0 -6
App/app.py
CHANGED
|
@@ -175,65 +175,76 @@ def extract_faces_from_video(video_path):
|
|
| 175 |
|
| 176 |
|
| 177 |
def create_processed_video(video_path, output_path, face_scores=None):
|
| 178 |
-
"""
|
| 179 |
logger.info('Creating processed video with bounding boxes: %s', output_path)
|
| 180 |
|
| 181 |
cap = cv2.VideoCapture(video_path)
|
| 182 |
fps = cap.get(cv2.CAP_PROP_FPS) or 30
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
-
#
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
out = cv2.VideoWriter(temp_path, fourcc, fps, (w, h))
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
# Only run detection every N frames; reuse cached overlays in between
|
| 197 |
-
detect_interval = max(1, int(fps // 3)) # ~3 detections per second
|
| 198 |
-
frame_count = 0
|
| 199 |
-
cached_boxes = [] # list of (x1, y1, x2, y2)
|
| 200 |
-
|
| 201 |
-
while cap.isOpened():
|
| 202 |
ret, frame = cap.read()
|
| 203 |
if not ret:
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
for box in results.boxes:
|
| 211 |
-
if box.conf[0] > 0.5:
|
| 212 |
-
bx1, by1, bx2, by2 = map(int, box.xyxy[0])
|
| 213 |
-
cached_boxes.append((max(0, bx1), max(0, by1), bx2, by2))
|
| 214 |
-
|
| 215 |
-
# Draw face boxes on every frame (green color, no labels)
|
| 216 |
-
for (x1, y1, x2, y2) in cached_boxes:
|
| 217 |
-
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
|
| 218 |
-
|
| 219 |
-
out.write(frame)
|
| 220 |
-
frame_count += 1
|
| 221 |
|
| 222 |
cap.release()
|
| 223 |
-
out.release()
|
| 224 |
-
logger.info('Wrote %d frames to temp file, re-encoding to H.264', frame_count)
|
| 225 |
|
| 226 |
-
#
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
else:
|
| 230 |
-
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
def predict_deepfake(faces):
|
|
|
|
| 175 |
|
| 176 |
|
| 177 |
def _merge_similar_boxes(boxes, tolerance=40):
    """Collapse near-duplicate boxes into a single averaged box each.

    Two boxes are treated as the same face when every one of their four
    coordinates differs by less than ``tolerance`` pixels. The stored box is
    then replaced by the integer midpoint of the stored and the new box.

    Args:
        boxes: iterable of ``(x1, y1, x2, y2)`` integer tuples.
        tolerance: per-coordinate pixel distance below which boxes merge.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, in first-seen order.
    """
    merged = []
    for box in boxes:
        for idx, kept in enumerate(merged):
            if all(abs(new - old) < tolerance for new, old in zip(box, kept)):
                merged[idx] = tuple((old + new) // 2 for old, new in zip(kept, box))
                break
        else:
            # No stored box was close enough: this is a new region.
            merged.append(box)
    return merged


def _build_drawbox_filter(boxes):
    """Return the ffmpeg ``-vf`` expression that outlines ``boxes`` in green.

    Falls back to the pass-through ``null`` filter when there is nothing to
    draw, so the caller can always supply a ``-vf`` argument.
    """
    filters = [
        f"drawbox=x={x1}:y={y1}:w={x2 - x1}:h={y2 - y1}:color=green:t=2"
        for (x1, y1, x2, y2) in boxes
    ]
    return ','.join(filters) if filters else 'null'


def create_processed_video(video_path, output_path, face_scores=None):
    """Re-encode a video to H.264 with face bounding boxes drawn by ffmpeg.

    Faces are detected on at most five frames sampled evenly across the
    video. The detections are de-duplicated and rendered with ffmpeg's
    ``drawbox`` filter in a single encoding pass. The boxes carry no time
    condition, so each one stays at a fixed position for the whole clip.

    Args:
        video_path: path of the input video.
        output_path: path the processed video is written to (overwritten).
        face_scores: not used by this function; kept so existing callers
            that pass it continue to work.

    Returns:
        None. An ffmpeg failure is logged rather than raised.
    """
    logger.info('Creating processed video with bounding boxes: %s', output_path)

    all_boxes = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = total_frames / fps if fps > 0 else 0

        # Sample a few frames spread across the video to detect faces
        sample_count = min(5, max(1, int(duration)))  # ~1 sample per second, max 5
        sample_positions = [int(i * total_frames / sample_count) for i in range(sample_count)]

        # Collect all face boxes across sampled frames
        for pos in sample_positions:
            cap.set(cv2.CAP_PROP_POS_FRAMES, pos)
            ret, frame = cap.read()
            if not ret:
                continue
            results = face_detector(frame, verbose=False)[0]
            for box in results.boxes:
                if box.conf[0] > 0.5:
                    bx1, by1, bx2, by2 = map(int, box.xyxy[0])
                    x1, y1 = max(0, bx1), max(0, by1)
                    # Skip boxes left with no area after clamping: drawbox
                    # interprets w=0 / h=0 as "full input width / height".
                    if bx2 <= x1 or by2 <= y1:
                        continue
                    all_boxes.append((x1, y1, bx2, by2))
    finally:
        # Release the capture even if detection raises.
        cap.release()

    # Build ffmpeg drawbox filter from the de-duplicated detections
    ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
    unique_boxes = _merge_similar_boxes(all_boxes)
    filter_str = _build_drawbox_filter(unique_boxes)

    cmd = [
        ffmpeg_exe, '-y', '-i', video_path,
        '-vf', filter_str,
        '-c:v', 'libx264', '-preset', 'fast',
        '-movflags', '+faststart', '-pix_fmt', 'yuv420p',
        output_path
    ]
    # Report the number of boxes actually drawn, not the raw detection count.
    logger.info('Running ffmpeg with %d face boxes', len(unique_boxes))
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error('ffmpeg drawbox failed: %s', result.stderr[-500:])
    else:
        logger.info('Processed video saved: %s', output_path)
|
| 248 |
|
| 249 |
|
| 250 |
def predict_deepfake(faces):
|
App/static/Technology.jsx
CHANGED
|
@@ -26,7 +26,6 @@ function TechnologyPage() {
|
|
| 26 |
number: 1,
|
| 27 |
title: 'Video to Frames',
|
| 28 |
color: '#6c8cff',
|
| 29 |
-
file: '00-convert_video_to_image.py',
|
| 30 |
description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
|
| 31 |
details: [
|
| 32 |
'Reads MP4 videos from the FaceForensics++ dataset',
|
|
@@ -55,7 +54,6 @@ function TechnologyPage() {
|
|
| 55 |
number: 2,
|
| 56 |
title: 'Face Detection & Cropping',
|
| 57 |
color: '#ff9800',
|
| 58 |
-
file: '01-crop_faces_with_mtcnn.py',
|
| 59 |
description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
|
| 60 |
details: [
|
| 61 |
'Uses MTCNN deep learning face detector for accurate face localization',
|
|
@@ -88,7 +86,6 @@ function TechnologyPage() {
|
|
| 88 |
number: 3,
|
| 89 |
title: 'Dataset Preparation',
|
| 90 |
color: '#4caf50',
|
| 91 |
-
file: '02-prepare_fake_real_dataset.py',
|
| 92 |
description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
|
| 93 |
details: [
|
| 94 |
'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
|
|
@@ -123,7 +120,6 @@ function TechnologyPage() {
|
|
| 123 |
number: 4,
|
| 124 |
title: 'CNN Training (EfficientNetB0)',
|
| 125 |
color: '#f44336',
|
| 126 |
-
file: '03-train_cnn.py',
|
| 127 |
description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
|
| 128 |
details: [
|
| 129 |
'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
|
|
@@ -197,7 +193,6 @@ function TechnologyPage() {
|
|
| 197 |
<h2 className="tech-step-title" style={{ color: step.color }}>
|
| 198 |
Step {step.number}: {step.title}
|
| 199 |
</h2>
|
| 200 |
-
<code className="tech-file">{step.file}</code>
|
| 201 |
</div>
|
| 202 |
</div>
|
| 203 |
<p className="tech-step-desc">{step.description}</p>
|
|
@@ -221,7 +216,6 @@ function TechnologyPage() {
|
|
| 221 |
<h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
|
| 222 |
Real-Time Inference
|
| 223 |
</h2>
|
| 224 |
-
<code className="tech-file">App/app.py</code>
|
| 225 |
</div>
|
| 226 |
</div>
|
| 227 |
<p className="tech-step-desc">
|
|
|
|
| 26 |
number: 1,
|
| 27 |
title: 'Video to Frames',
|
| 28 |
color: '#6c8cff',
|
|
|
|
| 29 |
description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
|
| 30 |
details: [
|
| 31 |
'Reads MP4 videos from the FaceForensics++ dataset',
|
|
|
|
| 54 |
number: 2,
|
| 55 |
title: 'Face Detection & Cropping',
|
| 56 |
color: '#ff9800',
|
|
|
|
| 57 |
description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
|
| 58 |
details: [
|
| 59 |
'Uses MTCNN deep learning face detector for accurate face localization',
|
|
|
|
| 86 |
number: 3,
|
| 87 |
title: 'Dataset Preparation',
|
| 88 |
color: '#4caf50',
|
|
|
|
| 89 |
description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
|
| 90 |
details: [
|
| 91 |
'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
|
|
|
|
| 120 |
number: 4,
|
| 121 |
title: 'CNN Training (EfficientNetB0)',
|
| 122 |
color: '#f44336',
|
|
|
|
| 123 |
description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
|
| 124 |
details: [
|
| 125 |
'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
|
|
|
|
| 193 |
<h2 className="tech-step-title" style={{ color: step.color }}>
|
| 194 |
Step {step.number}: {step.title}
|
| 195 |
</h2>
|
|
|
|
| 196 |
</div>
|
| 197 |
</div>
|
| 198 |
<p className="tech-step-desc">{step.description}</p>
|
|
|
|
| 216 |
<h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
|
| 217 |
Real-Time Inference
|
| 218 |
</h2>
|
|
|
|
| 219 |
</div>
|
| 220 |
</div>
|
| 221 |
<p className="tech-step-desc">
|