Upload 4 files

- README.md +15 -24
- clipsaver.py +238 -0
- main.py +1 -1
README.md
CHANGED

```diff
@@ -1,44 +1,35 @@
----
-license: cc-by-nc-3.0
----
-# facesaver
+facesaver
 
-
+A tool to process video files into stills and clips for image and video AI training, using YOLOv11 face detection to find scenes with people in them within a certain size and position range.
 
-
+Requirements:
 CUDA 12.x
 A GPU with 6GB or more VRAM
 Raw video rips, unless you want subtitles in your training data.
 
-
+Usage:
 1. create a conda env
-
-```conda env create -n facesaver python=3.12```
-
+conda env create -n facesaver python=3.12
 2. activate the env
-
-```conda activate facesaver```
-
+conda activate facesaver
 3. install the requirements
-
-```pip3 install -r requirements.txt```
-
+pip3 install -r requirements.txt
 4. put your video files into the input directory
+5. run the command for stills
+
+python3 main.py -I ./input -O ./output -w 200 -m 200
 
-
-
-```python3 main.py -I ./input -O ./output -w 200 -m 200```
+run the command for clips
+python3 clipsaver.py -I ./input -O ./output -w 200 -m 200
 
-
+notes:
 You can use -w and -m to specify the minimum bounding box for face detection, to avoid triggering on background faces.
 If you find you're getting too many false positives or not enough faces, adjust the code here:
-```
 # Perform face detection if no face has been detected in this scene
 if not face_detected_in_scene:
     try:
         results = model.predict(frame, classes=[0], conf=0.75, device=device)
-```
-by changing ```conf``` to somethihng bigger or smaller
+by changing conf to something bigger or smaller
 
 You will have to do some cleanup to remove the occasional non-face and faces in credit scenes.
-If you process something like as 12-episode anime, you should end up with 250-1000 usable stills after manual cleanup.
+If you process something like a 12-episode anime, you should end up with 250-1000 usable stills or clips after manual cleanup.
```
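Step 3 of the README assumes a requirements.txt at the repo root, which this upload does not include. A minimal sketch consistent with the imports in clipsaver.py and main.py (an assumption, not the repo's actual file) would be:

```
torch
ultralytics
opencv-python
numpy
scenedetect
```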
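The conf threshold in that snippet is the main tuning knob. A minimal sketch of the trade-off, assuming a local test image and the model file name the scripts load (illustrative, not part of the repo):

```python
from ultralytics import YOLO
import cv2

# Illustrative only: the predict call from main.py / clipsaver.py with the
# confidence threshold adjusted. "yolov11l.pt" is the model file the scripts
# load; "sample_frame.png" is a hypothetical test image.
model = YOLO("yolov11l.pt")
frame = cv2.imread("sample_frame.png")

strict = model.predict(frame, classes=[0], conf=0.85)  # fewer false positives, may miss faces
loose = model.predict(frame, classes=[0], conf=0.60)   # catches more faces, more noise
```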
clipsaver.py
ADDED
```python
#!/usr/bin/env python3

import argparse
import os
import cv2
import numpy as np
from ultralytics import YOLO
from scenedetect import open_video, SceneManager, ContentDetector
import torch

def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Detect full faces in videos and capture 15-second video clips on scene changes.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--input-dir", "-I",
        required=True,
        help="Directory containing input video files."
    )
    parser.add_argument(
        "--output-dir", "-O",
        required=True,
        help="Directory to save video clip outputs."
    )
    parser.add_argument(
        "--min-width", "-w",
        type=int,
        default=200,
        help="Minimum width of face bounding box to trigger capture."
    )
    parser.add_argument(
        "--min-height", "-m",
        type=int,
        default=200,
        help="Minimum height of face bounding box to trigger capture."
    )
    return parser.parse_args()

def ensure_directory(directory):
    """Create directory if it doesn't exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)

def check_cuda():
    """Check CUDA availability and return device."""
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"CUDA is available! Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"CUDA version: {torch.version.cuda}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
    else:
        device = torch.device("cpu")
        print("CUDA is not available. Falling back to CPU.")
    return device

def is_full_face(box, frame_shape, min_width, min_height, min_proportion=0.1):
    """Check if the bounding box represents a full face within the frame."""
    x1, y1, x2, y2 = box
    frame_height, frame_width = frame_shape[:2]

    # Check if box is fully within frame (not touching edges)
    if x1 <= 0 or y1 <= 0 or x2 >= frame_width or y2 >= frame_height:
        return False

    # Check minimum size
    width = x2 - x1
    height = y2 - y1
    if width < min_width or height < min_height:
        return False

    # Check if box is large enough relative to frame (likely a face)
    if width < frame_width * min_proportion or height < frame_height * min_proportion:
        return False

    return True

def process_video(video_path, output_dir, min_width, min_height, model, device):
    """Process a single video for face detection and capture 15-second video clips."""
    # Initialize PySceneDetect for scene detection
    try:
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=30.0))
    except Exception as e:
        print(f"Error initializing video for scene detection in {video_path}: {e}")
        return

    # Get video capture for OpenCV
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if fps <= 0:
        print(f"Invalid FPS for {video_path}. Skipping.")
        cap.release()
        return

    # Calculate frames for 15-second clip
    num_frames = int(fps * 15)

    # Get original dimensions
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    if frame_height == 0:
        print(f"Invalid frame height for {video_path}. Skipping.")
        cap.release()
        return

    # Calculate scaled dimensions (height=480, maintain aspect ratio)
    scale = 480 / frame_height
    new_width = int(frame_width * scale)
    new_height = 480

    # Find scenes
    try:
        scene_manager.detect_scenes(video=video)
        scene_list = scene_manager.get_scene_list()
        scene_starts = [scene[0].get_frames() for scene in scene_list]
    except Exception as e:
        print(f"Error detecting scenes in {video_path}: {e}")
        cap.release()
        return

    scene_index = 0
    face_detected_in_scene = False
    frame_idx = 0
    output_count = 0
    video_name = os.path.splitext(os.path.basename(video_path))[0]

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Check if current frame is start of a new scene
        if scene_index < len(scene_starts) and frame_idx >= scene_starts[scene_index]:
            face_detected_in_scene = False  # Reset face detection for new scene
            scene_index += 1
            print(f"New scene detected at frame {frame_idx}")

        # Perform face detection if no face has been detected in this scene
        if not face_detected_in_scene:
            try:
                results = model.predict(frame, classes=[0], conf=0.75, device=device)

                for result in results:
                    boxes = result.boxes.xyxy.cpu().numpy()
                    confidences = result.boxes.conf.cpu().numpy()
                    classes = result.boxes.cls.cpu().numpy()

                    for box, conf, cls in zip(boxes, confidences, classes):
                        if cls == 0:  # Class 0 is 'person' in COCO, used as proxy for face
                            if is_full_face(box, frame.shape, min_width, min_height):
                                # Initialize VideoWriter
                                output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.mp4")
                                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                                out = cv2.VideoWriter(output_path, fourcc, fps, (new_width, new_height))
                                if not out.isOpened():
                                    print(f"Error initializing VideoWriter for {output_path}")
                                    break

                                # Capture 15 seconds of frames
                                frames_captured = 0
                                start_frame_idx = frame_idx
                                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame_idx)  # Reset to start frame

                                while frames_captured < num_frames:
                                    ret, frame = cap.read()
                                    if not ret:
                                        print(f"Warning: Clip at frame {start_frame_idx} in {video_path} is shorter than 15 seconds ({frames_captured/fps:.2f} seconds)")
                                        break

                                    # Scale frame
                                    scaled_frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_AREA)
                                    out.write(scaled_frame)
                                    frames_captured += 1
                                    frame_idx += 1

                                out.release()
                                print(f"Saved video clip: {output_path} ({frames_captured/fps:.2f} seconds)")
                                output_count += 1
                                face_detected_in_scene = True
                                # Skip to frame after clip
                                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame_idx + frames_captured)
                                break  # Stop checking boxes after first valid face
                    if face_detected_in_scene:
                        break  # Stop checking results after first valid face

            except Exception as e:
                print(f"Error during face detection in {video_path}: {e}")

            if not face_detected_in_scene:
                # No clip was captured on this frame, so advance the frame counter
                # to keep frame_idx in step with the frames read from cap
                frame_idx += 1

        else:
            frame_idx += 1

    cap.release()
    print(f"Processed {video_path}: {output_count} video clips saved.")

def main():
    """Main function to process videos in input directory."""
    args = parse_arguments()

    # Validate input directory
    if not os.path.isdir(args.input_dir):
        print(f"Error: Input directory '{args.input_dir}' does not exist.")
        return

    # Ensure output directory exists
    ensure_directory(args.output_dir)

    # Check CUDA and set device once
    device = check_cuda()

    # Load YOLO model once
    try:
        model = YOLO("yolov11l.pt")
        model.to(device)
        print(f"YOLO model loaded on device: {device}")
    except Exception as e:
        print(f"Error loading YOLO model: {e}")
        return

    # Supported video extensions
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    # Iterate over video files in input directory
    for filename in os.listdir(args.input_dir):
        if filename.lower().endswith(video_extensions):
            video_path = os.path.join(args.input_dir, filename)
            print(f"Processing video: {video_path}")
            process_video(video_path, args.output_dir, args.min_width, args.min_height, model, device)

if __name__ == "__main__":
    main()
```
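To make the gating in is_full_face concrete: for a 1080p frame, a candidate box must avoid the frame edges, be at least min-width by min-height pixels, and cover at least 10% of the frame in each dimension. A small sanity check, assuming clipsaver.py is importable from the working directory (the boxes are made-up examples):

```python
# Hypothetical spot checks of is_full_face on a 1920x1080 frame.
from clipsaver import is_full_face

frame_shape = (1080, 1920, 3)  # (height, width, channels), as returned by frame.shape

# 300x350 box away from the edges: passes all three gates
print(is_full_face((860, 290, 1160, 640), frame_shape, 200, 200))   # True

# Touches the left edge (x1 == 0): rejected as a partial face
print(is_full_face((0, 290, 300, 640), frame_shape, 200, 200))      # False

# 150x160 box: under the 200px minimum width/height
print(is_full_face((900, 400, 1050, 560), frame_shape, 200, 200))   # False
```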
main.py
CHANGED

```diff
@@ -137,7 +137,7 @@ def process_video(video_path, output_dir, min_width, min_height, model, device):
                         if cls == 0:  # Class 0 is 'person' in COCO, used as proxy for face
                             if is_full_face(box, frame.shape, min_width, min_height):
                                 # Save screenshot
-                                output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.
+                                output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.png")
                                 cv2.imwrite(output_path, frame)
                                 print(f"Saved screenshot: {output_path}")
                                 output_count += 1
```
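With the format string completed, both scripts share one naming scheme; a quick illustration (the video name and counter are made-up values):

```python
# Hypothetical values showing the output naming used by main.py and clipsaver.py
video_name, output_count = "episode01", 7
print(f"{video_name}_face_{output_count:04d}.png")  # episode01_face_0007.png (stills from main.py)
print(f"{video_name}_face_{output_count:04d}.mp4")  # episode01_face_0007.mp4 (clips from clipsaver.py)
```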