quarterturn commited on
Commit
e862c01
·
verified ·
1 Parent(s): 74e8816

Upload 4 files

Browse files
Files changed (3) hide show
  1. README.md +15 -24
  2. clipsaver.py +238 -0
  3. main.py +1 -1
README.md CHANGED
@@ -1,44 +1,35 @@
1
- ---
2
- license: cc-by-nc-3.0
3
- ---
4
- # facesaver
5
 
6
- # A tool to process video files into still for image and video AI training, using yolov11 face detection to find scenes with people in them, within a certain size and position range.
7
 
8
- # Requirements:
9
  CUDA 12.x
10
  A GPU with 6GB or more VRAM
11
  Raw video rips, unless you want subtitles in your training data.
12
 
13
- # Usage:
14
  1. create a conda env
15
-
16
- ```conda env create -n facesaver python=3.12```
17
-
18
  2. activate the env
19
-
20
- ```conda activate facesaver```
21
-
22
  3. install the requiremnts
23
-
24
- ```pip3 install -r requirements.txt```
25
-
26
  4. put your video files into the input directory
 
 
 
27
 
28
- 5. run the command
29
-
30
- ```python3 main.py -I ./input -O ./output -w 200 -m 200```
31
 
32
- # notes:
33
  You can use -w and -m to specify the minimum bounding box for face detection, to avoid triggering on background faces
34
  If you find you're getting too many false positives or not enough faces, adjust the code here:
35
- ```
36
  # Perform face detection if no face has been detected in this scene
37
  if not face_detected_in_scene:
38
  try:
39
  results = model.predict(frame, classes=[0], conf=0.75, device=device)
40
- ```
41
- by changing ```conf``` to somethihng bigger or smaller
42
 
43
  You will have to do some cleanup to remove the occasional non-face and faces in credit scenes.
44
- If you process something like as 12-episode anime, you should end up with 250-1000 usable stills after manual cleanup.
 
1
+ facesaver
 
 
 
2
 
3
+ A tool to process video files into stills for image and video AI training, using yolov11 face detection to find scenes with people in them, within a certain size and position range.
4
 
5
+ Requirements:
6
  CUDA 12.x
7
  A GPU with 6GB or more VRAM
8
  Raw video rips, unless you want subtitles in your training data.
9
 
10
+ Usage:
11
  1. create a conda env
12
+ conda env create -n facesaver python=3.12
 
 
13
  2. activate the env
14
+ conda activate facesaver
 
 
15
  3. install the requirements
16
+ pip3 install -r requirements.txt
 
 
17
  4. put your video files into the input directory
18
+ 5.
19
+ run the command for stills
20
+ python3 main.py -I ./input -O ./output -w 200 -m 200
21
 
22
+ run the command for clips
23
+ python3 clipsaver.py -I ./input -O ./output -w 200 -m 200
 
24
 
25
+ notes:
26
  You can use -w and -m to specify the minimum bounding box for face detection, to avoid triggering on background faces
27
  If you find you're getting too many false positives or not enough faces, adjust the code here:
 
28
  # Perform face detection if no face has been detected in this scene
29
  if not face_detected_in_scene:
30
  try:
31
  results = model.predict(frame, classes=[0], conf=0.75, device=device)
32
+ by changing conf to something bigger or smaller
 
33
 
34
  You will have to do some cleanup to remove the occasional non-face and faces in credit scenes.
35
+ If you process something like a 12-episode anime, you should end up with 250-1000 usable stills or clips after manual cleanup.
clipsaver.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import os
5
+ import cv2
6
+ import numpy as np
7
+ from ultralytics import YOLO
8
+ from scenedetect import open_video, SceneManager, ContentDetector
9
+ import torch
10
+
11
def parse_arguments():
    """Read and validate the command-line options for the clip saver.

    Returns:
        argparse.Namespace with input_dir, output_dir, min_width, min_height.
    """
    parser = argparse.ArgumentParser(
        description="Detect full faces in videos and capture 15-second video clips on scene changes.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--input-dir", "-I", required=True,
                        help="Directory containing input video files.")
    parser.add_argument("--output-dir", "-O", required=True,
                        help="Directory to save video clip outputs.")
    parser.add_argument("--min-width", "-w", type=int, default=200,
                        help="Minimum width of face bounding box to trigger capture.")
    parser.add_argument("--min-height", "-m", type=int, default=200,
                        help="Minimum height of face bounding box to trigger capture.")
    return parser.parse_args()
40
+
41
def ensure_directory(directory):
    """Create *directory* (including parents) if it doesn't already exist.

    Uses ``exist_ok=True`` instead of an exists-then-create check, which is
    race-prone if another process creates the directory between the check
    and the ``makedirs`` call.
    """
    os.makedirs(directory, exist_ok=True)
45
+
46
def check_cuda():
    """Report CUDA availability and return the torch device to run inference on."""
    if not torch.cuda.is_available():
        print("CUDA is not available. Falling back to CPU.")
        return torch.device("cpu")
    print(f"CUDA is available! Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    return torch.device("cuda")
57
+
58
def is_full_face(box, frame_shape, min_width, min_height, min_proportion=0.1):
    """Return True when *box* looks like a complete face inside the frame.

    A box qualifies only if it does not touch any frame edge (a face cut
    off by the border is likely partial), meets the absolute minimum size,
    and spans at least *min_proportion* of the frame in both dimensions.
    """
    left, top, right, bottom = box
    frame_h, frame_w = frame_shape[:2]

    # Touching (or crossing) any edge means the face may be clipped.
    if left <= 0 or top <= 0 or right >= frame_w or bottom >= frame_h:
        return False

    box_w = right - left
    box_h = bottom - top

    big_enough = box_w >= min_width and box_h >= min_height
    proportional = (box_w >= frame_w * min_proportion
                    and box_h >= frame_h * min_proportion)
    return big_enough and proportional
78
+
79
def process_video(video_path, output_dir, min_width, min_height, model, device):
    """Process one video: detect faces at scene changes and save 15-second clips.

    Scenes are found with PySceneDetect. Within each scene, the first frame
    containing a qualifying face (per is_full_face) triggers a clip: the
    capture is rewound to that frame, up to 15 seconds of frames are re-read,
    scaled to 480p height (aspect preserved), and written as an .mp4 in
    *output_dir*. At most one clip is saved per scene.

    Args:
        video_path: Path to the input video file.
        output_dir: Directory where clips are written.
        min_width: Minimum face bounding-box width to trigger a clip.
        min_height: Minimum face bounding-box height to trigger a clip.
        model: Loaded ultralytics YOLO model.
        device: torch device to run inference on.
    """
    # Initialize PySceneDetect for scene detection
    try:
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=30.0))
    except Exception as e:
        print(f"Error initializing video for scene detection in {video_path}: {e}")
        return

    # Get video capture for OpenCV
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        print(f"Invalid FPS for {video_path}. Skipping.")
        cap.release()
        return

    # Number of frames in a 15-second clip.
    num_frames = int(fps * 15)

    # Original dimensions, used to derive the scaled output size.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    if frame_height == 0:
        print(f"Invalid frame height for {video_path}. Skipping.")
        cap.release()
        return

    # Scaled dimensions: height fixed at 480, width keeps the aspect ratio.
    scale = 480 / frame_height
    new_width = int(frame_width * scale)
    new_height = 480

    # Find scene-start frame numbers.
    try:
        scene_manager.detect_scenes(video=video)
        scene_list = scene_manager.get_scene_list()
        scene_starts = [scene[0].get_frames() for scene in scene_list]
    except Exception as e:
        print(f"Error detecting scenes in {video_path}: {e}")
        cap.release()
        return

    scene_index = 0
    face_detected_in_scene = False
    # frame_idx tracks the index of the frame most recently returned by
    # cap.read(). BUG FIX vs. the original: it is now advanced on EVERY
    # successful read. Previously it was incremented only after a face had
    # already been found in the scene, so while scanning a faceless scene
    # it stalled and drifted out of sync with scene_starts, breaking
    # scene-boundary detection for the rest of the video.
    frame_idx = -1
    output_count = 0
    video_name = os.path.splitext(os.path.basename(video_path))[0]

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_idx += 1

        # Check if the current frame is the start of a new scene.
        if scene_index < len(scene_starts) and frame_idx >= scene_starts[scene_index]:
            face_detected_in_scene = False  # Reset face detection for the new scene
            scene_index += 1
            print(f"New scene detected at frame {frame_idx}")

        # Once a clip has been saved for this scene, skip detection until
        # the next scene boundary.
        if face_detected_in_scene:
            continue

        # Best-effort detection: failures are reported but do not abort
        # the rest of the video.
        try:
            results = model.predict(frame, classes=[0], conf=0.75, device=device)

            for result in results:
                boxes = result.boxes.xyxy.cpu().numpy()
                confidences = result.boxes.conf.cpu().numpy()
                classes = result.boxes.cls.cpu().numpy()

                for box, conf, cls in zip(boxes, confidences, classes):
                    if cls == 0:  # Class 0 is 'person' in COCO, used as proxy for face
                        if is_full_face(box, frame.shape, min_width, min_height):
                            # Initialize VideoWriter
                            output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.mp4")
                            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                            out = cv2.VideoWriter(output_path, fourcc, fps, (new_width, new_height))
                            if not out.isOpened():
                                print(f"Error initializing VideoWriter for {output_path}")
                                break

                            # Capture 15 seconds of frames, starting from the
                            # frame the face was detected on (rewind includes it).
                            frames_captured = 0
                            start_frame_idx = frame_idx
                            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame_idx)

                            while frames_captured < num_frames:
                                ret, frame = cap.read()
                                if not ret:
                                    print(f"Warning: Clip at frame {start_frame_idx} in {video_path} is shorter than 15 seconds ({frames_captured/fps:.2f} seconds)")
                                    break

                                # Scale frame to the 480p output size.
                                scaled_frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_AREA)
                                out.write(scaled_frame)
                                frames_captured += 1

                            out.release()
                            print(f"Saved video clip: {output_path} ({frames_captured/fps:.2f} seconds)")
                            output_count += 1
                            face_detected_in_scene = True
                            # Resume reading right after the clip. The next
                            # read returns index start+frames_captured, and the
                            # top-of-loop increment expects frame_idx to be one
                            # less than that.
                            frame_idx = start_frame_idx + frames_captured - 1
                            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame_idx + frames_captured)
                            break  # Stop checking boxes after first valid face
                if face_detected_in_scene:
                    break  # Stop checking results after first valid face

        except Exception as e:
            print(f"Error during face detection in {video_path}: {e}")

    cap.release()
    print(f"Processed {video_path}: {output_count} video clips saved.")
202
+
203
def main():
    """Locate videos in the input directory and run clip extraction on each.

    Loads the YOLO model a single time and reuses it for every video.
    """
    opts = parse_arguments()

    # The input directory must already exist; the output directory is created.
    if not os.path.isdir(opts.input_dir):
        print(f"Error: Input directory '{opts.input_dir}' does not exist.")
        return

    ensure_directory(opts.output_dir)

    # Pick the compute device once so every video shares it.
    device = check_cuda()

    # Load the detector a single time; bail out early if that fails.
    try:
        model = YOLO("yolov11l.pt")
        model.to(device)
        print(f"YOLO model loaded on device: {device}")
    except Exception as e:
        print(f"Error loading YOLO model: {e}")
        return

    supported = ('.mp4', '.avi', '.mov', '.mkv')
    for entry in os.listdir(opts.input_dir):
        if not entry.lower().endswith(supported):
            continue
        video_path = os.path.join(opts.input_dir, entry)
        print(f"Processing video: {video_path}")
        process_video(video_path, opts.output_dir, opts.min_width, opts.min_height, model, device)


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -137,7 +137,7 @@ def process_video(video_path, output_dir, min_width, min_height, model, device):
137
  if cls == 0: # Class 0 is 'person' in COCO, used as proxy for face
138
  if is_full_face(box, frame.shape, min_width, min_height):
139
  # Save screenshot
140
- output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.jpg")
141
  cv2.imwrite(output_path, frame)
142
  print(f"Saved screenshot: {output_path}")
143
  output_count += 1
 
137
  if cls == 0: # Class 0 is 'person' in COCO, used as proxy for face
138
  if is_full_face(box, frame.shape, min_width, min_height):
139
  # Save screenshot
140
+ output_path = os.path.join(output_dir, f"{video_name}_face_{output_count:04d}.png")
141
  cv2.imwrite(output_path, frame)
142
  print(f"Saved screenshot: {output_path}")
143
  output_count += 1