Spaces:

rdjarbeng
/

YOLO-Video-Segmentation

Runtime error

App Files Community

rdjarbeng committed on Aug 27, 2025

Commit

801bb4d

verified ·

1 Parent(s): 93c5c43

Try to process individual frames to fix error

Browse files

Files changed (1) hide show

app.py +52 -65

app.py CHANGED Viewed

@@ -2,98 +2,85 @@ import gradio as gr
 import cv2
 import numpy as np
 import torch
-from ultralytics.models.sam import SAM2VideoPredictor
-from ultralytics import YOLOWorld
-import supervision as sv
 import os
 # Initialize models
-overrides = dict(model="sam2.1_t.pt", device="cpu")
-predictor = SAM2VideoPredictor(overrides=overrides)
 yolo_model = YOLOWorld("yolov8s-world.pt")  # Lightweight YOLO-World model
-def detect_motorcycles(frame, prompt="motorcycle"):
-    """Detect motorcycles in a frame using YOLO-World and return bounding boxes."""
     yolo_model.set_classes([prompt])
-    results = yolo_model.predict(frame, device="cpu")
     boxes = []
     for result in results:
-        for box in result.boxes:
-            # Check if the detected class matches the prompt
-            if result.names[int(box.cls)] == prompt:
-                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-                boxes.append([x1, y1, x2, y2])
     return boxes
 def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color="red"):
     """Segment and highlight motorcycles in a video using SAM 2 and YOLO-World."""
-    # Create temporary directory for video frames
-    frames_dir = "video_frames"
-    os.makedirs(frames_dir, exist_ok=True)
-    # Extract frames
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     # Limit resolution for CPU
     if width > 640:
-        height = int(height * 640 / width)
         width = 640
-    frame_paths = []
-    # Save frames as JPEG
-    frame_idx = 0
-    with sv.ImageSink(target_dir_path=frames_dir, image_name_pattern="{:05d}.jpeg") as sink:
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frame = cv2.resize(frame, (width, height))
-            sink.save_image(frame)
-            frame_paths.append(os.path.join(frames_dir, f"{frame_idx:05d}.jpeg"))
-            frame_idx += 1
-    cap.release()
-    # Initialize SAM 2 inference state
-    with torch.inference_mode():
-        state = predictor.init_state(video_path=frames_dir)
-        # Detect motorcycles in the first frame
-        first_frame = cv2.imread(frame_paths[0])
-        boxes = detect_motorcycles(first_frame, prompt)
-        # Add boxes as prompts for SAM 2
-        if boxes:
-            frame_idx, obj_ids, masks = predictor.add_new_points_or_box(
-                state, frame_idx=0, obj_ids=[1], boxes=np.array(boxes)
-            )
-        # Create output video
-        output_path = "output.mp4"
-        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
-        # Color map for highlighting
-        color_map = {"red": (0, 0, 255), "green": (0, 255, 0), "blue": (255, 0, 0)}
-        highlight_rgb = color_map.get(highlight_color.lower(), (0, 0, 255))
-        # Propagate masks and apply highlights
-        for frame_idx, obj_ids, masks in predictor.propagate_in_video(state):
-            frame = cv2.imread(frame_paths[frame_idx])
-            mask = masks[0].astype(np.uint8) * 255  # Assuming one object
             mask_colored = np.zeros_like(frame)
-            mask_colored[:, :, 0] = mask * highlight_rgb[0]
-            mask_colored[:, :, 1] = mask * highlight_rgb[1]
-            mask_colored[:, :, 2] = mask * highlight_rgb[2]
             highlighted_frame = cv2.addWeighted(frame, 0.7, mask_colored, 0.3, 0)
-            out.write(highlighted_frame)
-        out.release()
-    # Clean up
-    for frame_path in frame_paths:
-        os.remove(frame_path)
-    os.rmdir(frames_dir)
     return output_path

 import cv2
 import numpy as np
 import torch
+from ultralytics import SAM, YOLOWorld
 import os
 # Initialize models
# SAM's constructor only takes the weights path; passing `device` here raises
# TypeError at import time. The device is chosen per inference call instead.
sam_model = SAM("sam2.1_t.pt")
yolo_model = YOLOWorld("yolov8s-world.pt")  # Lightweight YOLO-World model
def detect_motorcycles(first_frame, prompt="motorcycle"):
    """Detect objects matching ``prompt`` in a frame using YOLO-World.

    Args:
        first_frame: Image handed directly to ``yolo_model.predict``.
        prompt: Single class name the detector is restricted to.

    Returns:
        A NumPy array of ``xyxy`` boxes with one row per detection. The
        array has zero rows (shape ``(0, 4)``) when nothing is detected.
    """
    # Restrict the open-vocabulary detector to the requested class only.
    yolo_model.set_classes([prompt])
    results = yolo_model.predict(first_frame, device="cpu")
    per_result = [result.boxes.xyxy.cpu().numpy() for result in results]
    if not per_result:
        # Keep the same 2-D layout as the non-empty path so callers can rely
        # on one shape regardless of whether the model produced any results.
        return np.empty((0, 4), dtype=np.float32)
    return np.vstack(per_result)  # Stack boxes from all results
 def _overlay_masks(frame, masks, color_bgr):
     """Blend the union of ``masks`` onto ``frame`` using ``color_bgr``."""
     union = np.any(masks, axis=0)
     if union.shape != frame.shape[:2]:
         # The frame may have been downscaled while the masks were not;
         # bring the mask to the frame size before using it as an index.
         union = cv2.resize(
             union.astype(np.uint8),
             (frame.shape[1], frame.shape[0]),
             interpolation=cv2.INTER_NEAREST,
         )
     union = union.astype(bool)
     overlay = np.zeros_like(frame)
     # Assign through a boolean index instead of multiplying a 0/255 uint8
     # mask by a 0-255 channel value, which wraps around in uint8 arithmetic.
     overlay[union] = color_bgr
     return cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)

 def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color="red"):
     """Segment and highlight motorcycles in a video using SAM 2 and YOLO-World.

     Boxes are detected on the first frame only and passed to the SAM model
     as prompts for the whole video. Each frame is then written to
     ``output.mp4`` with the combined mask blended in.

     Args:
         video_path: Path of the input video.
         prompt: Class name forwarded to ``detect_motorcycles``.
         highlight_color: ``"red"``, ``"green"`` or ``"blue"`` (case-insensitive);
             any other value falls back to red.

     Returns:
         ``"output.mp4"`` on success, or ``video_path`` unchanged when no box
         was detected in the first frame.

     Raises:
         ValueError: If the first frame cannot be read.
     """
     # Get first frame for detection; release the capture even when the read fails.
     cap = cv2.VideoCapture(video_path)
     try:
         ret, first_frame = cap.read()
     finally:
         cap.release()
     if not ret:
         raise ValueError("Could not read first frame from video.")

     # Detect boxes in first frame. `boxes` is a NumPy array, so test its
     # length rather than its truth value.
     boxes = detect_motorcycles(first_frame, prompt)
     if len(boxes) == 0:
         return video_path  # No motorcycles detected, return original

     # Run SAM2 on video with boxes prompt; the device is an inference-time option.
     results = sam_model(source=video_path, bboxes=boxes, device="cpu")

     # Colours are in OpenCV's BGR channel order.
     color_map = {"red": (0, 0, 255), "green": (0, 255, 0), "blue": (255, 0, 0)}
     highlight_bgr = color_map.get(highlight_color.lower(), (0, 0, 255))

     output_path = "output.mp4"
     cap = cv2.VideoCapture(video_path)
     out = None
     try:
         fps = cap.get(cv2.CAP_PROP_FPS)
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         # Limit resolution for CPU
         if width > 640:
             scale = 640 / width
             width = 640
             height = int(height * scale)
         out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
         frame_idx = 0
         while cap.isOpened():
             ret, frame = cap.read()
             if not ret:
                 break
             frame = cv2.resize(frame, (width, height))
             # Frames beyond the returned results are written without a highlight.
             frame_masks = results[frame_idx].masks if frame_idx < len(results) else None
             if frame_masks is not None:
                 frame = _overlay_masks(frame, frame_masks.data.cpu().numpy(), highlight_bgr)
             out.write(frame)
             frame_idx += 1
     finally:
         cap.release()
         if out is not None:
             out.release()
     return output_path