themalinery commited on
Commit
c3cafb2
·
0 Parent(s):

first push

Browse files
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.9
README.md ADDED
File without changes
config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration file for pose estimation project
2
+ # Add your configuration parameters below
3
+
4
+ task: "object_detection" # Options: "pose", "hand"
5
+
6
+ input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4" #
7
+
8
+ output_dir: ./results
9
+
10
+ output_name: "cats.mp4"
11
+
12
+ frames_dir: ./frames
13
+
14
+ # Hand drawing parameters
15
+ hand_drawing:
16
+ radius: 20
17
+ color_landmarks: [179, 124, 247] # BGR
18
+ color_connections: [225, 225, 225] # BGR
fonts/Minecraft.ttf ADDED
Binary file (14.5 kB). View file
 
fonts/Perfect DOS VGA 437.ttf ADDED
Binary file (81.2 kB). View file
 
fonts/PressStart2P.ttf ADDED
Binary file (82.5 kB). View file
 
main.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
2
+ from transformers import pipeline
3
+ from transformers.image_utils import load_image
4
+ from PIL import ImageDraw, Image, ImageFont
5
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
6
+ import torch
7
+ import yaml
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from src.utils import create_video_from_images
11
+ import cv2
12
+ import os
13
+
14
+ device = "cuda" if torch.cuda.is_available() else "cpu"
15
+ print(device)
16
+
17
def get_paths_from_config(config: dict) -> tuple[list[str], Path, list[str]]:
    """Resolve input, output, and frames paths from the configuration dict.

    Side effects:
      - Creates the per-task output directory and one frames subfolder per
        input file.
      - Mutates *config*, adding the keys ``"input_path_list"`` and
        ``"frames_subdirs"``.

    Args:
        config: Parsed configuration. Must contain ``"input_path"``; also
            reads ``"output_dir"``, ``"output_name"``, ``"task"`` and
            ``"frames_dir"``.

    Returns:
        A tuple of (input file paths, output video path, frames
        subdirectories) — note the first and last elements are lists of
        strings, not single paths.

    Raises:
        ValueError: If ``"input_path"`` is missing from the config.
    """
    raw_input = config.get("input_path")
    if raw_input is None:
        raise ValueError("config missing 'input_path'")

    raw_path = Path(raw_input)

    if raw_path.exists() and raw_path.is_dir():
        # Directory input: every regular file inside becomes an input.
        files = sorted(p for p in raw_path.iterdir() if p.is_file())
        config["input_path_list"] = [str(p) for p in files]
    else:
        # Single path (file or non-existent) stored as a single-item list.
        config["input_path_list"] = [str(raw_path)]

    output_dir = Path(config["output_dir"])
    output_name = config.get("output_name")
    task = config.get("task")

    # Results land in a per-task subdirectory, e.g. results/object_detection.
    output_dir = output_dir.joinpath(task)
    output_dir.mkdir(parents=True, exist_ok=True)

    # One frames subfolder per input file, keyed by the file's base name.
    target_root = Path(config.get("frames_dir"))
    target_root.mkdir(parents=True, exist_ok=True)

    input_list = config.get("input_path_list", [])
    for p in input_list:
        target_root.joinpath(Path(p).stem).mkdir(parents=True, exist_ok=True)

    # Store the mapping for later use by callers.
    config["frames_subdirs"] = [str(target_root.joinpath(Path(p).stem)) for p in input_list]

    if output_name:
        output_path = output_dir.joinpath(output_name)
    else:
        # Fall back to the input file's own name for the output video.
        output_path = output_dir.joinpath(raw_path.name)

    return input_list, output_path, config["frames_subdirs"]
64
+
65
+
66
def object_detection(path_video, output_folder):
    """Run zero-shot object detection ("cat") on a video, saving annotated frames.

    Each processed frame is written as ``<output_folder>/<frame_index>.png``.
    Processing stops at end of video or after 90 frames, whichever comes first.

    Args:
        path_video (str): Path to the input video file.
        output_folder (str): Existing directory that receives the PNG frames.
    """
    # Zero-shot grounding detector; alternative checkpoint kept for reference:
    # "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
    checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Hoisted out of the loops: the font never changes between detections.
    font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)

    # Initialize video capture
    vidcap = cv2.VideoCapture(path_video)

    frame_count = 0
    while vidcap.isOpened():
        ret, frame = vidcap.read()
        if not ret:
            break

        print(f"Processing frame {frame_count}")

        # OpenCV decodes to BGR; PIL and the model expect RGB.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(rgb_frame).convert("RGB")

        # Flat list of labels for single-image inference.
        text_labels = ["cat"]
        inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        results = processor.post_process_grounded_object_detection(
            outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

        draw = ImageDraw.Draw(image)

        scores = results.get("scores", [])
        text_labels_res = results.get("text_labels", [])
        boxes = results.get("boxes", [])

        for box, score, text_label in zip(boxes, scores, text_labels_res):
            xmin, ymin, xmax, ymax = box
            draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
            # float() handles both plain numbers and 0-d tensors.
            score_val = float(score)
            draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}",
                      fill="black", stroke_width=1, stroke_fill="black", font=font)

        # The PIL image was modified in place; persist the annotated frame.
        image.save(f"{output_folder}/{frame_count}.png")

        # NOTE(review): without a cv2.imshow window this key check is a no-op
        # (waitKey returns -1 immediately); kept for interactive-use parity.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1
        if frame_count == 90:  # limit processing to the first 90 frames
            break

    # Release the video capture and close windows
    vidcap.release()
    cv2.destroyAllWindows()
133
+
134
+
135
def main():
    """Entry point: load ``config.yaml`` and run detection on each input video.

    Annotated frames for every input video are written into that video's
    per-name frames subdirectory created by ``get_paths_from_config``.
    """
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)

    # output_path is currently unused here; frame-to-video assembly via
    # create_video_from_images() is a separate, not-yet-wired step.
    input_path_list, output_path, frames_subdirs = get_paths_from_config(config)

    for input_path, frames_dir in zip(input_path_list, frames_subdirs):
        object_detection(str(input_path), str(frames_dir))


if __name__ == "__main__":
    main()
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "object-detection"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ dependencies = [
8
+ "accelerate>=1.10.1",
9
+ "moviepy>=2.2.1",
10
+ "natsort>=8.4.0",
11
+ "opencv-python>=4.12.0.88",
12
+ "pillow>=11.3.0",
13
+ "six>=1.17.0",
14
+ "torch>=2.8.0",
15
+ "transformers>=4.57.1",
16
+ ]
src/utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from moviepy import ImageSequenceClip
3
+ from natsort import natsorted
4
+
5
def create_video_from_images(folder_path, output_video_file, fps):
    """
    Build a video from the image files found in a folder.

    Images are picked up in natural sort order (so image-2 comes before
    image-10) and stitched together at the requested frame rate. Problems
    are reported on stdout rather than raised.

    Args:
        folder_path (str): Directory that holds the source images.
        output_video_file (str): Destination video path (e.g. 'my_video.mp4').
        fps (int): Frame rate of the resulting video.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Collect supported images in human-friendly (natural) order.
    allowed = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    frames = []
    for entry in natsorted(os.listdir(folder_path)):
        if entry.lower().endswith(allowed):
            frames.append(os.path.join(folder_path, entry))

    if not frames:
        print(f"Error: No supported image files found in '{folder_path}'.")
        return

    if len(frames) < 2:
        print("Error: At least two images are required to create a video.")
        return

    print(f"Found {len(frames)} images. Creating video...")

    try:
        # Assemble the sequence and encode it to the requested file.
        video = ImageSequenceClip(frames, fps=fps)
        video.write_videofile(output_video_file, fps=fps)
        print(f"Successfully created video: '{output_video_file}'")
    except Exception as e:
        print(f"An error occurred while creating the video: {e}")
uv.lock ADDED
The diff for this file is too large to render. See raw diff