Spaces:

vu0018
/

Deepface_detect

Running

App Files Community

Vinh.Vu committed on Apr 12

Commit

ee1da4c

1 Parent(s): 218059f

Improve generate video speed

Browse files

Files changed (2) hide show

App/app.py +57 -46
App/static/Technology.jsx +0 -6

App/app.py CHANGED Viewed

@@ -175,65 +175,76 @@ def extract_faces_from_video(video_path):
 def create_processed_video(video_path, output_path, face_scores=None):
-    """Re-encode video with face bounding boxes (detection only, no labels)."""
     logger.info('Creating processed video with bounding boxes: %s', output_path)
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS) or 30
-    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    # Write to a temp file with mp4v codec first
-    temp_path = output_path + '.tmp.mp4'
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(temp_path, fourcc, fps, (w, h))
-    if not out.isOpened():
-        logger.error('VideoWriter failed to open: %s', temp_path)
-        cap.release()
-        return
-    # Only run detection every N frames; reuse cached overlays in between
-    detect_interval = max(1, int(fps // 3))  # ~3 detections per second
-    frame_count = 0
-    cached_boxes = []  # list of (x1, y1, x2, y2)
-    while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
-            break
-        if frame_count % detect_interval == 0:
-            results = face_detector(frame, verbose=False)[0]
-            cached_boxes = []
-            for box in results.boxes:
-                if box.conf[0] > 0.5:
-                    bx1, by1, bx2, by2 = map(int, box.xyxy[0])
-                    cached_boxes.append((max(0, bx1), max(0, by1), bx2, by2))
-        # Draw face boxes on every frame (green color, no labels)
-        for (x1, y1, x2, y2) in cached_boxes:
-            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-        out.write(frame)
-        frame_count += 1
     cap.release()
-    out.release()
-    logger.info('Wrote %d frames to temp file, re-encoding to H.264', frame_count)
-    # Re-encode to H.264 for browser compatibility
-    if reencode_to_h264(temp_path, output_path):
-        logger.info('Processed video saved (H.264): %s', output_path)
     else:
-        logger.error('Failed to re-encode processed video')
-    # Clean up temp file
-    try:
-        os.remove(temp_path)
-    except OSError:
-        pass
 def predict_deepfake(faces):

 def create_processed_video(video_path, output_path, face_scores=None):
+    """Create video with face bounding boxes using ffmpeg drawbox (much faster than OpenCV)."""
     logger.info('Creating processed video with bounding boxes: %s', output_path)
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS) or 30
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    duration = total_frames / fps if fps > 0 else 0
+    # Sample a few frames spread across the video to detect faces
+    sample_count = min(5, max(1, int(duration)))  # ~1 sample per second, max 5
+    sample_positions = [int(i * total_frames / sample_count) for i in range(sample_count)]
+    # Collect all face boxes across sampled frames
+    all_boxes = []
+    for pos in sample_positions:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, pos)
         ret, frame = cap.read()
         if not ret:
+            continue
+        results = face_detector(frame, verbose=False)[0]
+        for box in results.boxes:
+            if box.conf[0] > 0.5:
+                bx1, by1, bx2, by2 = map(int, box.xyxy[0])
+                all_boxes.append((max(0, bx1), max(0, by1), bx2, by2))
     cap.release()
+    # Build ffmpeg drawbox filter from detected boxes
+    ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
+    if all_boxes:
+        # Use the most common box region (largest by area) for a stable overlay
+        # Deduplicate similar boxes by averaging nearby ones
+        unique_boxes = []
+        for box in all_boxes:
+            merged = False
+            for i, ub in enumerate(unique_boxes):
+                # If boxes overlap significantly, merge them
+                if (abs(box[0] - ub[0]) < 40 and abs(box[1] - ub[1]) < 40 and
+                        abs(box[2] - ub[2]) < 40 and abs(box[3] - ub[3]) < 40):
+                    unique_boxes[i] = (
+                        (ub[0] + box[0]) // 2, (ub[1] + box[1]) // 2,
+                        (ub[2] + box[2]) // 2, (ub[3] + box[3]) // 2
+                    )
+                    merged = True
+                    break
+            if not merged:
+                unique_boxes.append(box)
+        drawbox_filters = []
+        for (x1, y1, x2, y2) in unique_boxes:
+            w = x2 - x1
+            h = y2 - y1
+            drawbox_filters.append(f"drawbox=x={x1}:y={y1}:w={w}:h={h}:color=green:t=2")
+        filter_str = ','.join(drawbox_filters)
     else:
+        filter_str = 'null'
+    cmd = [
+        ffmpeg_exe, '-y', '-i', video_path,
+        '-vf', filter_str,
+        '-c:v', 'libx264', '-preset', 'fast',
+        '-movflags', '+faststart', '-pix_fmt', 'yuv420p',
+        output_path
+    ]
+    logger.info('Running ffmpeg with %d face boxes', len(all_boxes))
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        logger.error('ffmpeg drawbox failed: %s', result.stderr[-500:])
+    else:
+        logger.info('Processed video saved: %s', output_path)
 def predict_deepfake(faces):

App/static/Technology.jsx CHANGED Viewed

@@ -26,7 +26,6 @@ function TechnologyPage() {
             number: 1,
             title: 'Video to Frames',
             color: '#6c8cff',
-            file: '00-convert_video_to_image.py',
             description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
             details: [
                 'Reads MP4 videos from the FaceForensics++ dataset',
@@ -55,7 +54,6 @@ function TechnologyPage() {
             number: 2,
             title: 'Face Detection & Cropping',
             color: '#ff9800',
-            file: '01-crop_faces_with_mtcnn.py',
             description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
             details: [
                 'Uses MTCNN deep learning face detector for accurate face localization',
@@ -88,7 +86,6 @@ function TechnologyPage() {
             number: 3,
             title: 'Dataset Preparation',
             color: '#4caf50',
-            file: '02-prepare_fake_real_dataset.py',
             description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
             details: [
                 'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
@@ -123,7 +120,6 @@ function TechnologyPage() {
             number: 4,
             title: 'CNN Training (EfficientNetB0)',
             color: '#f44336',
-            file: '03-train_cnn.py',
             description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
             details: [
                 'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
@@ -197,7 +193,6 @@ function TechnologyPage() {
                             <h2 className="tech-step-title" style={{ color: step.color }}>
                                 Step {step.number}: {step.title}
                             </h2>
-                            <code className="tech-file">{step.file}</code>
                         </div>
                     </div>
                     <p className="tech-step-desc">{step.description}</p>
@@ -221,7 +216,6 @@ function TechnologyPage() {
                         <h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
                             Real-Time Inference
                         </h2>
-                        <code className="tech-file">App/app.py</code>
                     </div>
                 </div>
                 <p className="tech-step-desc">

             number: 1,
             title: 'Video to Frames',
             color: '#6c8cff',
             description: 'Raw training videos are split into individual image frames. One frame is extracted per second of video using OpenCV. Each frame is automatically scaled based on its resolution to normalize image sizes across the dataset.',
             details: [
                 'Reads MP4 videos from the FaceForensics++ dataset',
             number: 2,
             title: 'Face Detection & Cropping',
             color: '#ff9800',
             description: 'MTCNN (Multi-task Cascaded Convolutional Network) scans each extracted frame to detect faces. Detected faces are cropped with a 30% margin around the bounding box to preserve context like hair and jawline, which helps the model detect manipulation artifacts.',
             details: [
                 'Uses MTCNN deep learning face detector for accurate face localization',
             number: 3,
             title: 'Dataset Preparation',
             color: '#4caf50',
             description: 'Cropped face images are organized into "real" and "fake" categories based on FaceForensics++ metadata. Small or corrupted images (<90px) are filtered out. The dataset is then split into training (80%), validation (10%), and test (10%) sets using stratified splitting.',
             details: [
                 'Labels faces as REAL or FAKE using FaceForensics++ CSV metadata',
             number: 4,
             title: 'CNN Training (EfficientNetB0)',
             color: '#f44336',
             description: 'A two-phase transfer learning approach trains an EfficientNetB0-based classifier. Phase 1 freezes the pre-trained ImageNet backbone and trains only the classification head. Phase 2 unfreezes the entire network for fine-tuning with a very low learning rate, achieving ~92% accuracy.',
             details: [
                 'EfficientNetB0 backbone pre-trained on ImageNet (224\u00d7224 input)',
                             <h2 className="tech-step-title" style={{ color: step.color }}>
                                 Step {step.number}: {step.title}
                             </h2>
                         </div>
                     </div>
                     <p className="tech-step-desc">{step.description}</p>
                         <h2 className="tech-step-title" style={{ color: '#6c8cff' }}>
                             Real-Time Inference
                         </h2>
                     </div>
                 </div>
                 <p className="tech-step-desc">