themalinery commited on
Commit
8023e2e
·
1 Parent(s): 754ab40
Files changed (6) hide show
  1. app.py +80 -4
  2. config.yaml +3 -13
  3. main.py +2 -81
  4. pyproject.toml +2 -1
  5. src/utils.py +80 -1
  6. uv.lock +0 -0
app.py CHANGED
@@ -1,7 +1,83 @@
1
  import gradio as gr
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import tempfile
3
+ import shutil
4
+ from pathlib import Path
5
 
6
+ from src.utils import create_video_from_images, object_detection
 
7
 
8
def process_video(video_file, labels_text, frame_color, fps=30):
    """Run zero-shot object detection on a video and return the output path.

    Args:
        video_file: Path to the uploaded video file (None if the user did
            not upload anything).
        labels_text: Comma-separated detection labels, e.g. "cat, dog".
        frame_color: Bounding-box colour (hex string from the colour picker).
        fps: Frame rate of the rendered output video. Defaults to 30, the
            previously hard-coded value, so existing callers are unaffected.

    Returns:
        str: Path to the processed video under ./results/gradio_outputs.

    Raises:
        gr.Error: If no video was uploaded or no labels were provided.
    """
    # Guard against a missing upload: Path(None) below would otherwise raise
    # a confusing TypeError deep inside the pipeline.
    if not video_file:
        raise gr.Error("Please upload a video first")

    # Parse comma-separated labels, dropping empty entries.
    text_labels = [label.strip() for label in labels_text.split(',') if label.strip()]

    if not text_labels:
        raise gr.Error("Please enter at least one label")

    # Config dict consumed by src.utils.object_detection.
    config = {
        'labels': text_labels,
        'frame_colour': frame_color,
    }

    # All intermediate frames live in a temp dir that is cleaned up
    # automatically when the with-block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        frames_dir = temp_path / "frames"
        frames_dir.mkdir()
        output_video = temp_path / "output.mp4"

        # Detect objects frame-by-frame, writing annotated PNGs.
        object_detection(str(video_file), str(frames_dir), config)

        # Stitch the annotated frames back into a video.
        create_video_from_images(str(frames_dir), str(output_video), fps=fps)

        # Copy out of the temp dir so the file survives for download.
        results_dir = Path("./results/gradio_outputs")
        results_dir.mkdir(parents=True, exist_ok=True)
        final_output = results_dir / f"detected_{Path(video_file).stem}.mp4"
        shutil.copy(output_video, final_output)

        return str(final_output)
41
+
42
# Gradio interface
with gr.Blocks(title="Video Object Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Video Object Detection")
    gr.Markdown("Upload a video, enter labels to detect, choose frame color, and download the processed video.")

    with gr.Row():
        with gr.Column():
            # Input controls
            video_input = gr.Video(label="Upload Video")
            labels_input = gr.Textbox(label="Detection Labels (comma-separated)", placeholder="e.g., cat, dog, person")
            color_input = gr.ColorPicker(label="Bounding Box Color", value="#FF0000")
            process_btn = gr.Button("Process Video", variant="primary")

        with gr.Column():
            # Output section
            gr.Markdown("## Output")

            output_video = gr.Video(label="Processed Video", interactive=False)

            download_button = gr.File(label="Download Processed Video", visible=False)

    # Handle processing
    def process_and_update(video, labels_text, frame_color):
        """Click handler: run the pipeline and surface progress/errors in the UI."""
        try:
            gr.Info("Processing video... This may take a few minutes.")

            output_path = process_video(video, labels_text, frame_color)

            gr.Info("Video processing complete!")

            # Bug fix: download_button is created with visible=False and was
            # never un-hidden, so the download link never appeared.  Return an
            # update that also flips visibility on.
            return output_path, gr.update(value=output_path, visible=True)
        except gr.Error:
            # Let user-facing errors raised by process_video (e.g. "no labels")
            # pass through without double-wrapping the message.
            raise
        except Exception as e:
            raise gr.Error(f"Processing failed: {str(e)}")

    process_btn.click(
        fn=process_and_update,
        inputs=[video_input, labels_input, color_input],
        outputs=[output_video, download_button],
    )

if __name__ == "__main__":
    demo.launch()
config.yaml CHANGED
@@ -1,18 +1,8 @@
1
- # Configuration file for pose estimation project
2
- # Add your configuration parameters below
3
-
4
  task: "object_detection" # Options: "pose", "hand"
5
-
6
- input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4"
7
-
8
  output_dir: ./results
9
-
10
  output_name: "cats.mp4"
11
-
12
  frames_dir: ./frames
 
 
13
 
14
- # Hand drawing parameters
15
- hand_drawing:
16
- radius: 20
17
- color_landmarks: [179, 124, 247] # BGR
18
- color_connections: [225, 225, 225] # BGR
 
 
 
 
1
  task: "object_detection" # Options: "object_detection", "pose", "hand"
2
+ input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\cat_20250910_163543016.mp4"
 
 
3
  output_dir: ./results
 
4
  output_name: "cats.mp4"
 
5
  frames_dir: ./frames
6
+ frame_colour: "white"
7
+ labels: ["cat"]
8
 
 
 
 
 
 
main.py CHANGED
@@ -1,18 +1,8 @@
1
  #https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
2
- from transformers import pipeline
3
- from transformers.image_utils import load_image
4
- from PIL import ImageDraw, Image, ImageFont
5
- from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
6
- import torch
7
  import yaml
8
  from pathlib import Path
9
  from datetime import datetime
10
- from src.utils import create_video_from_images
11
- import cv2
12
- import os
13
-
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
- print(device)
16
 
17
  def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
18
  """Extract paths from configuration dictionary."""
@@ -63,75 +53,6 @@ def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
63
  return input_list, output_path, config["frames_subdirs"]
64
 
65
 
66
- def object_detection(path_video, output_folder):
67
- checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large" #"openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
68
- model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
69
- processor = AutoProcessor.from_pretrained(checkpoint)
70
-
71
- # Initialize video capture
72
- vidcap = cv2.VideoCapture(path_video)
73
-
74
- frame_count = 0
75
- # Initialize hand tracking
76
- while vidcap.isOpened():
77
- ret, frame = vidcap.read()
78
- if not ret:
79
- break
80
-
81
- print(f"Processing frame {frame_count}")
82
-
83
- # Convert the BGR image to RGB and ensure RGB mode
84
- rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
85
- image = Image.fromarray(rgb_frame).convert("RGB")
86
-
87
- # use a flat list of labels for single-image inference
88
- text_labels = ["cat"]
89
- inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)
90
-
91
- with torch.no_grad():
92
- outputs = model(**inputs)
93
-
94
- # monkeypatch ImageDraw.text to accept a `fontsize` argument (absolute pixels or fraction of image height)
95
-
96
-
97
- results = processor.post_process_grounded_object_detection(
98
- outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]
99
-
100
- draw = ImageDraw.Draw(image)
101
-
102
- scores = results.get("scores", [])
103
- text_labels_res = results.get("text_labels", [])
104
- boxes = results.get("boxes", [])
105
-
106
- for box, score, text_label in zip(boxes, scores, text_labels_res):
107
- xmin, ymin, xmax, ymax = box
108
- draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
109
- # convert score to float safely
110
- try:
111
- score_val = float(score)
112
- except Exception:
113
- score_val = round(score.item(), 2)
114
-
115
- # font_size = max(10, int(0.1 * image.height)) # 10% of image height, minimum 10 pixels
116
- #font = ImageFont.load_default(size=80)
117
- font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
118
- draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}", fill="black", stroke_width=1, stroke_fill="black", font=font)
119
- # save the annotated image (PIL image is modified in-place)
120
- image.save(f"{output_folder}/{frame_count}.png")
121
-
122
- # Exit loop by pressing 'q'
123
- if cv2.waitKey(1) & 0xFF == ord('q'):
124
- break
125
-
126
- frame_count += 1
127
- if frame_count == 90: # limit to first 30 frames
128
- break
129
-
130
- # Release the video capture and close windows
131
- vidcap.release()
132
- cv2.destroyAllWindows()
133
-
134
-
135
  def main():
136
 
137
  with open('config.yaml', 'r') as file:
@@ -140,7 +61,7 @@ def main():
140
  input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
141
 
142
  for input_path, frames_dir in zip(input_path_list, frames_subdirs):
143
- object_detection(str(input_path), str(frames_dir))
144
 
145
  # path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
146
  # output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
 
1
  #https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
 
 
 
 
 
2
  import yaml
3
  from pathlib import Path
4
  from datetime import datetime
5
+ from src.utils import create_video_from_images, object_detection
 
 
 
 
 
6
 
7
  def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
8
  """Extract paths from configuration dictionary."""
 
53
  return input_list, output_path, config["frames_subdirs"]
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def main():
57
 
58
  with open('config.yaml', 'r') as file:
 
61
  input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
62
 
63
  for input_path, frames_dir in zip(input_path_list, frames_subdirs):
64
+ object_detection(str(input_path), str(frames_dir), config)
65
 
66
  # path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
67
  # output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
pyproject.toml CHANGED
@@ -6,10 +6,11 @@ readme = "README.md"
6
  requires-python = ">=3.9"
7
  dependencies = [
8
  "accelerate>=1.10.1",
 
9
  "moviepy>=2.2.1",
10
  "natsort>=8.4.0",
11
  "opencv-python>=4.12.0.88",
12
- "pillow>=11.3.0",
13
  "six>=1.17.0",
14
  "torch>=2.8.0",
15
  "transformers>=4.57.1",
 
6
  requires-python = ">=3.9"
7
  dependencies = [
8
  "accelerate>=1.10.1",
9
+ "gradio>=4.0.0",
10
  "moviepy>=2.2.1",
11
  "natsort>=8.4.0",
12
  "opencv-python>=4.12.0.88",
13
+ "pillow>=8.0,<11.0",
14
  "six>=1.17.0",
15
  "torch>=2.8.0",
16
  "transformers>=4.57.1",
src/utils.py CHANGED
@@ -1,6 +1,13 @@
1
  import os
2
  from moviepy import ImageSequenceClip
3
  from natsort import natsorted
 
 
 
 
 
 
 
4
 
5
  def create_video_from_images(folder_path, output_video_file, fps):
6
  """
@@ -44,4 +51,76 @@ def create_video_from_images(folder_path, output_video_file, fps):
44
 
45
  print(f"Successfully created video: '{output_video_file}'")
46
  except Exception as e:
47
- print(f"An error occurred while creating the video: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from moviepy import ImageSequenceClip
3
  from natsort import natsorted
4
+ from transformers import pipeline
5
+ from transformers.image_utils import load_image
6
+ from PIL import ImageDraw, Image, ImageFont
7
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
8
+ import torch
9
+ import cv2
10
+ import os
11
 
12
  def create_video_from_images(folder_path, output_video_file, fps):
13
  """
 
51
 
52
  print(f"Successfully created video: '{output_video_file}'")
53
  except Exception as e:
54
+ print(f"An error occurred while creating the video: {e}")
55
+
56
+
57
def object_detection(path_video, output_folder, config):
    """Run zero-shot object detection on a video, saving annotated frames.

    Each processed frame is written to ``output_folder`` as
    ``<frame_index>.png`` with bounding boxes and score labels drawn on it,
    ready for ``create_video_from_images``.

    Args:
        path_video: Path to the input video file.
        output_folder: Directory where annotated PNG frames are written
            (assumed to already exist).
        config: Dict with keys:
            - 'labels': list of text prompts to detect (e.g. ["cat"]).
            - 'frame_colour': bounding-box outline colour; defaults to
              "white" (the colour the pre-refactor code hard-coded).
            - 'max_frames': optional cap on processed frames; defaults to
              90, matching the previous hard-coded limit.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    text_labels = config.get('labels', [])
    # Default to "white" so a missing/empty key doesn't yield outline=None,
    # which would draw no visible box at all.
    frame_color = config.get('frame_colour') or "white"
    # Was a hard-coded 90 with a stale "first 30 frames" comment; now
    # configurable with the same default.
    max_frames = config.get('max_frames', 90)

    checkpoint = "iSEE-Laboratory/llmdet_tiny"  # "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Load the label font once (loop-invariant). Fall back to PIL's built-in
    # font instead of crashing when the bundled TTF is missing.
    try:
        font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
    except OSError:
        font = ImageFont.load_default()

    # Initialize video capture
    vidcap = cv2.VideoCapture(path_video)
    frame_count = 0
    try:
        while vidcap.isOpened():
            ret, frame = vidcap.read()
            if not ret:
                break

            print(f"Processing frame {frame_count}")

            # OpenCV decodes to BGR; the model and PIL expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(rgb_frame).convert("RGB")

            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            results = processor.post_process_grounded_object_detection(
                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

            draw = ImageDraw.Draw(image)

            scores = results.get("scores", [])
            text_labels_res = results.get("text_labels", [])
            boxes = results.get("boxes", [])

            for box, score, text_label in zip(boxes, scores, text_labels_res):
                xmin, ymin, xmax, ymax = box
                draw.rectangle((xmin, ymin, xmax, ymax), outline=frame_color, width=10)
                # Tensor scalars need .item(); plain numbers cast directly.
                try:
                    score_val = float(score)
                except Exception:
                    score_val = round(score.item(), 2)
                draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}",
                          fill="black", stroke_width=1, stroke_fill="black", font=font)

            # Save every frame -- including ones with no detections -- so the
            # numbered frame sequence stays contiguous for the video stitcher.
            image.save(f"{output_folder}/{frame_count}.png")

            frame_count += 1
            if frame_count == max_frames:
                break
    finally:
        # Always release the capture, even if the model raises mid-loop.
        # (The old cv2.waitKey / destroyAllWindows calls were dead code in
        # this headless pipeline: no window is ever created.)
        vidcap.release()
uv.lock CHANGED
The diff for this file is too large to render. See raw diff