Spaces:
Sleeping
Sleeping
Commit ·
c3cafb2
0
Parent(s):
first push
Browse files- .python-version +1 -0
- README.md +0 -0
- config.yaml +18 -0
- fonts/Minecraft.ttf +0 -0
- fonts/Perfect DOS VGA 437.ttf +0 -0
- fonts/PressStart2P.ttf +0 -0
- main.py +153 -0
- pyproject.toml +16 -0
- src/utils.py +47 -0
- uv.lock +0 -0
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.9
|
README.md
ADDED
|
File without changes
|
config.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration file for pose estimation project
|
| 2 |
+
# Add your configuration parameters below
|
| 3 |
+
|
| 4 |
+
task: "object_detection" # Options: "object_detection", "pose", "hand"
|
| 5 |
+
|
| 6 |
+
input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4" # a single video file, or a directory of videos
|
| 7 |
+
|
| 8 |
+
output_dir: ./results
|
| 9 |
+
|
| 10 |
+
output_name: "cats.mp4"
|
| 11 |
+
|
| 12 |
+
frames_dir: ./frames
|
| 13 |
+
|
| 14 |
+
# Hand drawing parameters
|
| 15 |
+
hand_drawing:
|
| 16 |
+
radius: 20
|
| 17 |
+
color_landmarks: [179, 124, 247] # BGR
|
| 18 |
+
color_connections: [225, 225, 225] # BGR
|
fonts/Minecraft.ttf
ADDED
|
Binary file (14.5 kB). View file
|
|
|
fonts/Perfect DOS VGA 437.ttf
ADDED
|
Binary file (81.2 kB). View file
|
|
|
fonts/PressStart2P.ttf
ADDED
|
Binary file (82.5 kB). View file
|
|
|
main.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
from transformers.image_utils import load_image
|
| 4 |
+
from PIL import ImageDraw, Image, ImageFont
|
| 5 |
+
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
| 6 |
+
import torch
|
| 7 |
+
import yaml
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from src.utils import create_video_from_images
|
| 11 |
+
import cv2
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Select inference device: use the GPU when CUDA is available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
|
| 16 |
+
|
| 17 |
+
def get_paths_from_config(config: dict) -> tuple[list, Path, list]:
    """Resolve input, output and frames paths from the configuration.

    Args:
        config: Parsed ``config.yaml`` contents. Must contain ``input_path``
            and ``frames_dir``; ``output_dir``, ``task`` and ``output_name``
            are also read. The dict is mutated: ``input_path_list`` and
            ``frames_subdirs`` are stored back into it for later use.

    Returns:
        A tuple ``(input_list, output_path, frames_subdirs)`` where
        ``input_list`` is a list of input file paths (strings),
        ``output_path`` is the output video ``Path``, and
        ``frames_subdirs`` is a list of per-input frame directories (strings).

    Raises:
        ValueError: If ``input_path`` or ``frames_dir`` is missing.
    """
    raw_input = config.get("input_path")
    if raw_input is None:
        raise ValueError("config missing 'input_path'")

    raw_path = Path(raw_input)

    if raw_path.is_dir():
        # Directory input: expand to every file inside, sorted for stable order.
        files = sorted(p for p in raw_path.iterdir() if p.is_file())
        config["input_path_list"] = [str(p) for p in files]
    else:
        # Single path (file or non-existent) stored as a single-item list.
        config["input_path_list"] = [str(raw_path)]

    input_path = Path(config["input_path"])
    output_dir = Path(config["output_dir"])
    output_name = config.get("output_name")
    task = config.get("task")

    raw_frames_dir = config.get("frames_dir")
    if raw_frames_dir is None:
        # Fail with a clear message instead of Path(None)'s TypeError.
        raise ValueError("config missing 'frames_dir'")
    frames_dir = Path(raw_frames_dir)

    # Group outputs per task, e.g. ./results/object_detection/
    output_dir = output_dir.joinpath(task)
    output_dir.mkdir(parents=True, exist_ok=True)

    frames_dir.mkdir(parents=True, exist_ok=True)

    # One frames subfolder per input file, keyed by the file's base name.
    input_list = config.get("input_path_list", [])
    frames_subdirs = []
    for p in input_list:
        subfolder = frames_dir.joinpath(Path(p).stem)
        subfolder.mkdir(parents=True, exist_ok=True)
        frames_subdirs.append(str(subfolder))

    # Store mapping for later use if needed.
    config["frames_subdirs"] = frames_subdirs

    if output_name:
        output_path = output_dir.joinpath(output_name)
    else:
        # Fall back to the input file's own name.
        output_path = output_dir.joinpath(input_path.name)

    return input_list, output_path, config["frames_subdirs"]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def object_detection(path_video, output_folder, max_frames=90):
    """Run zero-shot object detection on a video, saving annotated frames.

    Each processed frame is annotated with bounding boxes and scored labels,
    then written as '<output_folder>/<frame_index>.png'.
    See https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection

    Args:
        path_video: Path to the input video file.
        output_folder: Directory where annotated frame images are written
            (assumed to exist — created by get_paths_from_config).
        max_frames: Stop after this many frames (default 90, matching the
            previous hard-coded limit).
    """
    checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large"  # alt: "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Load the label font once, instead of re-reading it from disk for
    # every detection inside the frame loop.
    font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)

    # Initialize video capture
    vidcap = cv2.VideoCapture(path_video)
    frame_count = 0
    try:
        while vidcap.isOpened():
            ret, frame = vidcap.read()
            if not ret:
                break

            print(f"Processing frame {frame_count}")

            # OpenCV yields BGR; PIL expects RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(rgb_frame).convert("RGB")

            # Use a flat list of labels for single-image inference.
            text_labels = ["cat"]
            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            results = processor.post_process_grounded_object_detection(
                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

            draw = ImageDraw.Draw(image)

            scores = results.get("scores", [])
            text_labels_res = results.get("text_labels", [])
            boxes = results.get("boxes", [])

            for box, score, text_label in zip(boxes, scores, text_labels_res):
                xmin, ymin, xmax, ymax = box
                draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
                # Convert score (tensor or float) to a plain float safely.
                try:
                    score_val = float(score)
                except Exception:
                    score_val = round(score.item(), 2)

                draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}",
                          fill="black", stroke_width=1, stroke_fill="black", font=font)

            # Save the annotated image (PIL image is modified in-place).
            image.save(f"{output_folder}/{frame_count}.png")

            # Allow early exit by pressing 'q' (only effective when an
            # OpenCV window is open).
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            frame_count += 1
            if frame_count == max_frames:  # cap the number of processed frames
                break
    finally:
        # Release the capture and close windows even if processing raises.
        vidcap.release()
        cv2.destroyAllWindows()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def main():
    """Entry point: load config.yaml and run detection on every input video."""
    with open('config.yaml', 'r') as cfg_file:
        config = yaml.safe_load(cfg_file)

    videos, output_path, frame_dirs = get_paths_from_config(config)

    # Process each input video into its own frames subdirectory.
    for video, frames_out in zip(videos, frame_dirs):
        object_detection(str(video), str(frames_out))

    # TODO: stitch the annotated frames back into videos, e.g. with
    # create_video_from_images(frames_out, output_file, fps=30).


if __name__ == "__main__":
    main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "object-detection"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.9"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"accelerate>=1.10.1",
|
| 9 |
+
"moviepy>=2.2.1",
|
| 10 |
+
"natsort>=8.4.0",
|
| 11 |
+
"opencv-python>=4.12.0.88",
|
| 12 |
+
"pillow>=11.3.0",
|
| 13 |
+
"six>=1.17.0",
|
| 14 |
+
"torch>=2.8.0",
|
| 15 |
+
"transformers>=4.57.1",
|
| 16 |
+
]
|
src/utils.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from moviepy import ImageSequenceClip
|
| 3 |
+
from natsort import natsorted
|
| 4 |
+
|
| 5 |
+
def create_video_from_images(folder_path, output_video_file, fps):
    """
    Build a video file from a sequence of images in a folder.

    Args:
        folder_path (str): Folder containing the source images.
        output_video_file (str): Destination video file (e.g., 'my_video.mp4').
        fps (int): Frames per second for the output video.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Collect supported images in natural (human-friendly) order, so that
    # e.g. image-2.png sorts before image-10.png.
    supported_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    image_files = []
    for entry in natsorted(os.listdir(folder_path)):
        if entry.lower().endswith(supported_extensions):
            image_files.append(os.path.join(folder_path, entry))

    if not image_files:
        print(f"Error: No supported image files found in '{folder_path}'.")
        return

    if len(image_files) < 2:
        print("Error: At least two images are required to create a video.")
        return

    print(f"Found {len(image_files)} images. Creating video...")

    try:
        # Assemble the image sequence and encode it to the output path.
        clip = ImageSequenceClip(image_files, fps=fps)
        clip.write_videofile(output_video_file, fps=fps)
        print(f"Successfully created video: '{output_video_file}'")
    except Exception as e:
        print(f"An error occurred while creating the video: {e}")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|