Vinh.Vu committed on
Commit
ee1da4c
·
1 Parent(s): 218059f

Improve generate video speed

Browse files
Files changed (2) hide show
  1. App/app.py +57 -46
  2. App/static/Technology.jsx +0 -6
App/app.py CHANGED
@@ -175,65 +175,76 @@ def extract_faces_from_video(video_path):
175
 
176
 
177
  def create_processed_video(video_path, output_path, face_scores=None):
178
- """Re-encode video with face bounding boxes (detection only, no labels)."""
179
  logger.info('Creating processed video with bounding boxes: %s', output_path)
180
 
181
  cap = cv2.VideoCapture(video_path)
182
  fps = cap.get(cv2.CAP_PROP_FPS) or 30
183
- w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
184
- h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
185
 
186
- # Write to a temp file with mp4v codec first
187
- temp_path = output_path + '.tmp.mp4'
188
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
189
- out = cv2.VideoWriter(temp_path, fourcc, fps, (w, h))
190
 
191
- if not out.isOpened():
192
- logger.error('VideoWriter failed to open: %s', temp_path)
193
- cap.release()
194
- return
195
-
196
- # Only run detection every N frames; reuse cached overlays in between
197
- detect_interval = max(1, int(fps // 3)) # ~3 detections per second
198
- frame_count = 0
199
- cached_boxes = [] # list of (x1, y1, x2, y2)
200
-
201
- while cap.isOpened():
202
  ret, frame = cap.read()
203
  if not ret:
204
- break
205
-
206
- if frame_count % detect_interval == 0:
207
- results = face_detector(frame, verbose=False)[0]
208
- cached_boxes = []
209
-
210
- for box in results.boxes:
211
- if box.conf[0] > 0.5:
212
- bx1, by1, bx2, by2 = map(int, box.xyxy[0])
213
- cached_boxes.append((max(0, bx1), max(0, by1), bx2, by2))
214
-
215
- # Draw face boxes on every frame (green color, no labels)
216
- for (x1, y1, x2, y2) in cached_boxes:
217
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
218
-
219
- out.write(frame)
220
- frame_count += 1
221
 
222
  cap.release()
223
- out.release()
224
- logger.info('Wrote %d frames to temp file, re-encoding to H.264', frame_count)
225
 
226
- # Re-encode to H.264 for browser compatibility
227
- if reencode_to_h264(temp_path, output_path):
228
- logger.info('Processed video saved (H.264): %s', output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  else:
230
- logger.error('Failed to re-encode processed video')
231
 
232
- # Clean up temp file
233
- try:
234
- os.remove(temp_path)
235
- except OSError:
236
- pass
 
 
 
 
 
 
 
 
237
 
238
 
239
  def predict_deepfake(faces):
 
175
 
176
 
177
  def create_processed_video(video_path, output_path, face_scores=None):
178
+ """Create video with face bounding boxes using ffmpeg drawbox (much faster than OpenCV)."""
179
  logger.info('Creating processed video with bounding boxes: %s', output_path)
180
 
181
  cap = cv2.VideoCapture(video_path)
182
  fps = cap.get(cv2.CAP_PROP_FPS) or 30
183
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
184
+ duration = total_frames / fps if fps > 0 else 0
185
 
186
+ # Sample a few frames spread across the video to detect faces
187
+ sample_count = min(5, max(1, int(duration))) # ~1 sample per second, max 5
188
+ sample_positions = [int(i * total_frames / sample_count) for i in range(sample_count)]
 
189
 
190
+ # Collect all face boxes across sampled frames
191
+ all_boxes = []
192
+ for pos in sample_positions:
193
+ cap.set(cv2.CAP_PROP_POS_FRAMES, pos)
 
 
 
 
 
 
 
194
  ret, frame = cap.read()
195
  if not ret:
196
+ continue
197
+ results = face_detector(frame, verbose=False)[0]
198
+ for box in results.boxes:
199
+ if box.conf[0] > 0.5:
200
+ bx1, by1, bx2, by2 = map(int, box.xyxy[0])
201
+ all_boxes.append((max(0, bx1), max(0, by1), bx2, by2))
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  cap.release()
 
 
204
 
205
+ # Build ffmpeg drawbox filter from detected boxes
206
+ ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
207
+ if all_boxes:
208
+ # Collapse repeated detections of the same face into one box for a stable overlay
209
+ # Deduplicate similar boxes by averaging nearby ones
210
+ unique_boxes = []
211
+ for box in all_boxes:
212
+ merged = False
213
+ for i, ub in enumerate(unique_boxes):
214
 # If all four box coordinates are within 40 px of an existing box, merge them
215
+ if (abs(box[0] - ub[0]) < 40 and abs(box[1] - ub[1]) < 40 and
216
+ abs(box[2] - ub[2]) < 40 and abs(box[3] - ub[3]) < 40):
217
+ unique_boxes[i] = (
218
+ (ub[0] + box[0]) // 2, (ub[1] + box[1]) // 2,
219
+ (ub[2] + box[2]) // 2, (ub[3] + box[3]) // 2
220
+ )
221
+ merged = True
222
+ break
223
+ if not merged:
224
+ unique_boxes.append(box)
225
+
226
+ drawbox_filters = []
227
+ for (x1, y1, x2, y2) in unique_boxes:
228
+ w = x2 - x1
229
+ h = y2 - y1
230
+ drawbox_filters.append(f"drawbox=x={x1}:y={y1}:w={w}:h={h}:color=green:t=2")
231
+ filter_str = ','.join(drawbox_filters)
232
  else:
233
+ filter_str = 'null'
234
 
235
+ cmd = [
236
+ ffmpeg_exe, '-y', '-i', video_path,
237
+ '-vf', filter_str,
238
+ '-c:v', 'libx264', '-preset', 'fast',
239
+ '-movflags', '+faststart', '-pix_fmt', 'yuv420p',
240
+ output_path
241
+ ]
242
+ logger.info('Running ffmpeg with %d face boxes', len(all_boxes))
243
+ result = subprocess.run(cmd, capture_output=True, text=True)
244
+ if result.returncode != 0:
245
+ logger.error('ffmpeg drawbox failed: %s', result.stderr[-500:])
246
+ else:
247
+ logger.info('Processed video saved: %s', output_path)
248
 
249
 
250
  def predict_deepfake(faces):
App/static/Technology.jsx CHANGED
@@ -26,7 +26,6 @@ function TechnologyPage() {
26
  number: 1,
27
  title: 'Video to Frames',
28
  color: '#6c8cff',
29
- file: '00-convert_video_to_image.py',
30
  description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
31
  details: [
32
  'Reads MP4 videos from the FaceForensics++ dataset',
@@ -55,7 +54,6 @@ function TechnologyPage() {
55
  number: 2,
56
  title: 'Face Detection & Cropping',
57
  color: '#ff9800',
58
- file: '01-crop_faces_with_mtcnn.py',
59
  description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
60
  details: [
61
  'Uses MTCNN deep learning face detector for accurate face localization',
@@ -88,7 +86,6 @@ function TechnologyPage() {
88
  number: 3,
89
  title: 'Dataset Preparation',
90
  color: '#4caf50',
91
- file: '02-prepare_fake_real_dataset.py',
92
  description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
93
  details: [
94
  'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
@@ -123,7 +120,6 @@ function TechnologyPage() {
123
  number: 4,
124
  title: 'CNN Training (EfficientNetB0)',
125
  color: '#f44336',
126
- file: '03-train_cnn.py',
127
  description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
128
  details: [
129
  'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
@@ -197,7 +193,6 @@ function TechnologyPage() {
197
  <h2 className="tech-step-title" style={{ color: step.color }}>
198
  Step {step.number}: {step.title}
199
  </h2>
200
- <code className="tech-file">{step.file}</code>
201
  </div>
202
  </div>
203
  <p className="tech-step-desc">{step.description}</p>
@@ -221,7 +216,6 @@ function TechnologyPage() {
221
  <h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
222
  Real-Time Inference
223
  </h2>
224
- <code className="tech-file">App/app.py</code>
225
  </div>
226
  </div>
227
  <p className="tech-step-desc">
 
26
  number: 1,
27
  title: 'Video to Frames',
28
  color: '#6c8cff',
 
29
  description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
30
  details: [
31
  'Reads MP4 videos from the FaceForensics++ dataset',
 
54
  number: 2,
55
  title: 'Face Detection & Cropping',
56
  color: '#ff9800',
 
57
  description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
58
  details: [
59
  'Uses MTCNN deep learning face detector for accurate face localization',
 
86
  number: 3,
87
  title: 'Dataset Preparation',
88
  color: '#4caf50',
 
89
  description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
90
  details: [
91
  'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
 
120
  number: 4,
121
  title: 'CNN Training (EfficientNetB0)',
122
  color: '#f44336',
 
123
  description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
124
  details: [
125
  'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
 
193
  <h2 className="tech-step-title" style={{ color: step.color }}>
194
  Step {step.number}: {step.title}
195
  </h2>
 
196
  </div>
197
  </div>
198
  <p className="tech-step-desc">{step.description}</p>
 
216
  <h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
217
  Real-Time Inference
218
  </h2>
 
219
  </div>
220
  </div>
221
  <p className="tech-step-desc">