Spaces:
Sleeping
Sleeping
Commit ·
c3cafb2
0
Parent(s):
first push
Browse files- .python-version +1 -0
- README.md +0 -0
- config.yaml +18 -0
- fonts/Minecraft.ttf +0 -0
- fonts/Perfect DOS VGA 437.ttf +0 -0
- fonts/PressStart2P.ttf +0 -0
- main.py +153 -0
- pyproject.toml +16 -0
- src/utils.py +47 -0
- uv.lock +0 -0
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.9
|
README.md
ADDED
|
File without changes
|
config.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration file for pose estimation project
|
| 2 |
+
# Add your configuration parameters below
|
| 3 |
+
|
| 4 |
+
task: "object_detection" # Options: "object_detection", "pose", "hand"
|
| 5 |
+
|
| 6 |
+
input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4" # a single video file, or a directory of videos
|
| 7 |
+
|
| 8 |
+
output_dir: ./results
|
| 9 |
+
|
| 10 |
+
output_name: "cats.mp4"
|
| 11 |
+
|
| 12 |
+
frames_dir: ./frames
|
| 13 |
+
|
| 14 |
+
# Hand drawing parameters
|
| 15 |
+
hand_drawing:
|
| 16 |
+
radius: 20
|
| 17 |
+
color_landmarks: [179, 124, 247] # BGR
|
| 18 |
+
color_connections: [225, 225, 225] # BGR
|
fonts/Minecraft.ttf
ADDED
|
Binary file (14.5 kB). View file
|
|
|
fonts/Perfect DOS VGA 437.ttf
ADDED
|
Binary file (81.2 kB). View file
|
|
|
fonts/PressStart2P.ttf
ADDED
|
Binary file (82.5 kB). View file
|
|
|
main.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
from transformers.image_utils import load_image
|
| 4 |
+
from PIL import ImageDraw, Image, ImageFont
|
| 5 |
+
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
| 6 |
+
import torch
|
| 7 |
+
import yaml
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from src.utils import create_video_from_images
|
| 11 |
+
import cv2
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Select inference device: use the GPU when CUDA is available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
|
| 16 |
+
|
| 17 |
+
def get_paths_from_config(config: dict) -> tuple[list, Path, list]:
    """Resolve input, output and frames paths from the configuration.

    Args:
        config: Parsed ``config.yaml`` contents. Must contain ``input_path``
            and ``frames_dir``; ``output_dir``, ``task`` and ``output_name``
            are also read. The dict is mutated: ``input_path_list`` and
            ``frames_subdirs`` are stored back into it for later use.

    Returns:
        A tuple ``(input_list, output_path, frames_subdirs)`` where
        ``input_list`` is a list of input file paths (strings),
        ``output_path`` is the output video ``Path``, and
        ``frames_subdirs`` is a list of per-input frame directories (strings).

    Raises:
        ValueError: If ``input_path`` or ``frames_dir`` is missing.
    """
    raw_input = config.get("input_path")
    if raw_input is None:
        raise ValueError("config missing 'input_path'")

    raw_path = Path(raw_input)

    if raw_path.is_dir():
        # Directory input: expand to every file inside, sorted for stable order.
        files = sorted(p for p in raw_path.iterdir() if p.is_file())
        config["input_path_list"] = [str(p) for p in files]
    else:
        # Single path (file or non-existent) stored as a single-item list.
        config["input_path_list"] = [str(raw_path)]

    input_path = Path(config["input_path"])
    output_dir = Path(config["output_dir"])
    output_name = config.get("output_name")
    task = config.get("task")

    raw_frames_dir = config.get("frames_dir")
    if raw_frames_dir is None:
        # Fail with a clear message instead of Path(None)'s TypeError.
        raise ValueError("config missing 'frames_dir'")
    frames_dir = Path(raw_frames_dir)

    # Group outputs per task, e.g. ./results/object_detection/
    output_dir = output_dir.joinpath(task)
    output_dir.mkdir(parents=True, exist_ok=True)

    frames_dir.mkdir(parents=True, exist_ok=True)

    # One frames subfolder per input file, keyed by the file's base name.
    input_list = config.get("input_path_list", [])
    frames_subdirs = []
    for p in input_list:
        subfolder = frames_dir.joinpath(Path(p).stem)
        subfolder.mkdir(parents=True, exist_ok=True)
        frames_subdirs.append(str(subfolder))

    # Store mapping for later use if needed.
    config["frames_subdirs"] = frames_subdirs

    if output_name:
        output_path = output_dir.joinpath(output_name)
    else:
        # Fall back to the input file's own name.
        output_path = output_dir.joinpath(input_path.name)

    return input_list, output_path, config["frames_subdirs"]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def object_detection(path_video, output_folder, max_frames=90):
    """Run zero-shot object detection on a video, saving annotated frames.

    Each processed frame is annotated with bounding boxes and scored labels,
    then written as '<output_folder>/<frame_index>.png'.
    See https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection

    Args:
        path_video: Path to the input video file.
        output_folder: Directory where annotated frame images are written
            (assumed to exist — created by get_paths_from_config).
        max_frames: Stop after this many frames (default 90, matching the
            previous hard-coded limit).
    """
    checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large"  # alt: "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Load the label font once, instead of re-reading it from disk for
    # every detection inside the frame loop.
    font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)

    # Initialize video capture
    vidcap = cv2.VideoCapture(path_video)
    frame_count = 0
    try:
        while vidcap.isOpened():
            ret, frame = vidcap.read()
            if not ret:
                break

            print(f"Processing frame {frame_count}")

            # OpenCV yields BGR; PIL expects RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(rgb_frame).convert("RGB")

            # Use a flat list of labels for single-image inference.
            text_labels = ["cat"]
            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            results = processor.post_process_grounded_object_detection(
                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

            draw = ImageDraw.Draw(image)

            scores = results.get("scores", [])
            text_labels_res = results.get("text_labels", [])
            boxes = results.get("boxes", [])

            for box, score, text_label in zip(boxes, scores, text_labels_res):
                xmin, ymin, xmax, ymax = box
                draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
                # Convert score (tensor or float) to a plain float safely.
                try:
                    score_val = float(score)
                except Exception:
                    score_val = round(score.item(), 2)

                draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}",
                          fill="black", stroke_width=1, stroke_fill="black", font=font)

            # Save the annotated image (PIL image is modified in-place).
            image.save(f"{output_folder}/{frame_count}.png")

            # Allow early exit by pressing 'q' (only effective when an
            # OpenCV window is open).
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            frame_count += 1
            if frame_count == max_frames:  # cap the number of processed frames
                break
    finally:
        # Release the capture and close windows even if processing raises.
        vidcap.release()
        cv2.destroyAllWindows()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def main():
    """Entry point: load config.yaml and run detection on every input video."""
    with open('config.yaml', 'r') as cfg_file:
        config = yaml.safe_load(cfg_file)

    videos, output_path, frame_dirs = get_paths_from_config(config)

    # Process each input video into its own frames subdirectory.
    for video, frames_out in zip(videos, frame_dirs):
        object_detection(str(video), str(frames_out))

    # TODO: stitch the annotated frames back into videos, e.g. with
    # create_video_from_images(frames_out, output_file, fps=30).


if __name__ == "__main__":
    main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "object-detection"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.9"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"accelerate>=1.10.1",
|
| 9 |
+
"moviepy>=2.2.1",
|
| 10 |
+
"natsort>=8.4.0",
|
| 11 |
+
"opencv-python>=4.12.0.88",
|
| 12 |
+
"pillow>=11.3.0",
|
| 13 |
+
"six>=1.17.0",
|
| 14 |
+
"torch>=2.8.0",
|
| 15 |
+
"transformers>=4.57.1",
|
| 16 |
+
]
|
src/utils.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from moviepy import ImageSequenceClip
|
| 3 |
+
from natsort import natsorted
|
| 4 |
+
|
| 5 |
+
def create_video_from_images(folder_path, output_video_file, fps):
    """
    Build a video file from a sequence of images in a folder.

    Args:
        folder_path (str): Folder containing the source images.
        output_video_file (str): Destination video file (e.g., 'my_video.mp4').
        fps (int): Frames per second for the output video.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Collect supported images in natural (human-friendly) order, so that
    # e.g. image-2.png sorts before image-10.png.
    supported_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    image_files = []
    for entry in natsorted(os.listdir(folder_path)):
        if entry.lower().endswith(supported_extensions):
            image_files.append(os.path.join(folder_path, entry))

    if not image_files:
        print(f"Error: No supported image files found in '{folder_path}'.")
        return

    if len(image_files) < 2:
        print("Error: At least two images are required to create a video.")
        return

    print(f"Found {len(image_files)} images. Creating video...")

    try:
        # Assemble the image sequence and encode it to the output path.
        clip = ImageSequenceClip(image_files, fps=fps)
        clip.write_videofile(output_video_file, fps=fps)
        print(f"Successfully created video: '{output_video_file}'")
    except Exception as e:
        print(f"An error occurred while creating the video: {e}")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|