Spaces:

rdjarbeng
/

YOLO-Video-Segmentation

Runtime error

App Files Community

rdjarbeng committed on Aug 27, 2025

Commit

e5ea4ad

verified ·

1 Parent(s): 5ada4a5

Use mobile sam to reduce processing time

Browse files

Files changed (1) hide show

app.py +21 -21

app.py CHANGED Viewed

@@ -6,18 +6,18 @@ from ultralytics import SAM, YOLOWorld
 import os
 # Initialize models
-sam_model = SAM("sam2.1_t.pt")  # SAM 2.1 tiny model, no device argument
-yolo_model = YOLOWorld("yolov8s-world.pt")  # Lightweight YOLO-World model
 def detect_motorcycles(first_frame, prompt="motorcycle"):
     """Detect motorcycles in the first frame using YOLO-World and return bounding boxes."""
     yolo_model.set_classes([prompt])
-    results = yolo_model.predict(first_frame, device="cpu")
     boxes = []
     for result in results:
         boxes.extend(result.boxes.xyxy.cpu().numpy())
     if len(boxes) > 0:
-        boxes = np.vstack(boxes)  # Stack all boxes if multiple results
     else:
         boxes = np.array([])
     return boxes
@@ -29,6 +29,8 @@ def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color
     ret, first_frame = cap.read()
     if not ret:
         raise ValueError("Could not read first frame from video.")
     cap.release()
     # Detect boxes in first frame
@@ -37,19 +39,19 @@ def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color
     if len(boxes) == 0:
         return video_path  # No motorcycles detected, return original
-    # Run SAM2 on video with boxes prompt
-    results = sam_model.predict(source=video_path, bboxes=boxes)
     # Prepare output video
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    # Limit resolution for CPU
-    if width > 640:
-        scale = 640 / width
-        width = 640
-        height = int(height * scale)
     output_path = "output.mp4"
     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
@@ -58,16 +60,14 @@ def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color
     highlight_rgb = color_map.get(highlight_color.lower(), (0, 0, 255))
     frame_idx = 0
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
         frame = cv2.resize(frame, (width, height))
         # Get masks for this frame
-        if results[frame_idx].masks is not None:
-            masks = results[frame_idx].masks.data.cpu().numpy()  # (num_masks, h, w)
-            combined_mask = np.any(masks, axis=0).astype(np.uint8) * 255  # Combine all masks
             mask_colored = np.zeros_like(frame)
             mask_colored[:, :, 0] = combined_mask * highlight_rgb[0]
             mask_colored[:, :, 1] = combined_mask * highlight_rgb[1]
@@ -93,7 +93,7 @@ iface = gr.Interface(
         gr.Dropdown(choices=["red", "green", "blue"], label="Highlight Color")
     ],
     outputs=gr.Video(label="Highlighted Video"),
-    title="Video Segmentation with SAM 2 and YOLO-World (CPU)",
     description="Upload a short video (5-10 seconds), specify a text prompt (e.g., 'motorcycle'), and choose a highlight color. Optimized for CPU."
 )
 iface.launch()

 import os
 # Initialize models
+sam_model = SAM("mobile_sam.pt")  # Switch to MobileSAM for faster CPU inference
+yolo_model = YOLOWorld("yolov8n-world.pt")  # Nano model for faster detection
 def detect_motorcycles(first_frame, prompt="motorcycle"):
     """Detect motorcycles in the first frame using YOLO-World and return bounding boxes."""
     yolo_model.set_classes([prompt])
+    results = yolo_model.predict(first_frame, device="cpu", max_det=2)  # Limit to 2 detections
     boxes = []
     for result in results:
         boxes.extend(result.boxes.xyxy.cpu().numpy())
     if len(boxes) > 0:
+        boxes = np.vstack(boxes)
     else:
         boxes = np.array([])
     return boxes
     ret, first_frame = cap.read()
     if not ret:
         raise ValueError("Could not read first frame from video.")
+    # Resize first frame for detection
+    first_frame = cv2.resize(first_frame, (320, 180))
     cap.release()
     # Detect boxes in first frame
     if len(boxes) == 0:
         return video_path  # No motorcycles detected, return original
+    # Resize boxes to match SAM input resolution (320x180)
+    scale_x = 320 / first_frame.shape[1]
+    scale_y = 180 / first_frame.shape[0]
+    boxes = boxes * [scale_x, scale_y, scale_x, scale_y]
+    # Run SAM on video with boxes prompt
+    results = sam_model.predict(source=video_path, bboxes=boxes, stream=True, imgsz=320)  # Stream and low resolution
     # Prepare output video
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
+    width = 320
+    height = 180
     output_path = "output.mp4"
     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
     highlight_rgb = color_map.get(highlight_color.lower(), (0, 0, 255))
     frame_idx = 0
+    for result in results:
+        frame = cv2.VideoCapture(video_path).read()[1]
         frame = cv2.resize(frame, (width, height))
         # Get masks for this frame
+        if result.masks is not None:
+            masks = result.masks.data.cpu().numpy()  # (num_masks, h, w)
+            combined_mask = np.any(masks, axis=0).astype(np.uint8) * 255
             mask_colored = np.zeros_like(frame)
             mask_colored[:, :, 0] = combined_mask * highlight_rgb[0]
             mask_colored[:, :, 1] = combined_mask * highlight_rgb[1]
         gr.Dropdown(choices=["red", "green", "blue"], label="Highlight Color")
     ],
     outputs=gr.Video(label="Highlighted Video"),
+    title="Video Segmentation with MobileSAM and YOLO-World (CPU)",
     description="Upload a short video (5-10 seconds), specify a text prompt (e.g., 'motorcycle'), and choose a highlight color. Optimized for CPU."
 )
 iface.launch()