Limit frames, optimize code
1. SAM Resolution Problem: The original code was calling sam_model.predict(source=video_path, ...), which processes the entire video at SAM's default resolution (1024). I changed this to process individual frames with an explicit imgsz=320 parameter.
2. Inefficient Frame Processing: Your code was opening a new VideoCapture for every frame in the loop (cv2.VideoCapture(video_path).read()[1]), which is extremely inefficient; see the sketch after this list.
3. Missing Resolution Control for YOLO: Added imgsz=320 to the YOLO prediction so detection runs at a consistent resolution.
4. Box Scaling Issues: Removed the now-unnecessary box scaling, since detection and segmentation both work in the target resolution.
5. Memory Leaks: Fixed VideoCapture resource management so captures are released on every code path, including the early-exit error path.
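To make fix 2 concrete, here is a minimal sketch of the capture pattern only (process() is a hypothetical stand-in for the per-frame resize/segment/overlay work, and "input.mp4" is a placeholder path):

import cv2

def process(frame):
    pass  # hypothetical per-frame work: resize, SAM segmentation, overlay

# Removed pattern: cv2.VideoCapture(video_path).read()[1] inside the loop
# builds a fresh capture per iteration, re-opening and re-decoding the file
# every time; a fresh capture always yields the stream's first frame.

# Replacement: open once, read frames sequentially, release once.
cap = cv2.VideoCapture("input.mp4")
while True:
    ret, frame = cap.read()
    if not ret:  # end of stream or decode failure
        break
    process(frame)
cap.release()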
app.py
CHANGED
@@ -12,10 +12,12 @@ yolo_model = YOLOWorld("yolov8n-world.pt")  # Nano model for faster detection
 def detect_motorcycles(first_frame, prompt="motorcycle"):
     """Detect motorcycles in the first frame using YOLO-World and return bounding boxes."""
     yolo_model.set_classes([prompt])
-    results = yolo_model.predict(first_frame, device="cpu", max_det=2)  #
+    results = yolo_model.predict(first_frame, device="cpu", max_det=2, imgsz=320)  # Force YOLO to use 320 resolution
     boxes = []
     for result in results:
-
+        if result.boxes is not None and len(result.boxes.xyxy) > 0:
+            boxes.extend(result.boxes.xyxy.cpu().numpy())
+
     if len(boxes) > 0:
         boxes = np.vstack(boxes)
     else:
@@ -24,64 +26,100 @@ def detect_motorcycles(first_frame, prompt="motorcycle"):
 
 def segment_and_highlight_video(video_path, prompt="motorcycle", highlight_color="red"):
     """Segment and highlight motorcycles in a video using SAM 2 and YOLO-World."""
-
+
+    # Get video properties first
     cap = cv2.VideoCapture(video_path)
+    original_fps = cap.get(cv2.CAP_PROP_FPS)
+    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    # Target resolution
+    target_width, target_height = 320, 180
+
+    # Get first frame for detection
     ret, first_frame = cap.read()
     if not ret:
+        cap.release()
         raise ValueError("Could not read first frame from video.")
+
     # Resize first frame for detection
-
+    first_frame_resized = cv2.resize(first_frame, (target_width, target_height))
     cap.release()
-
-    # Detect boxes in first frame
-    boxes = detect_motorcycles(
-
+
+    # Detect boxes in resized first frame
+    boxes = detect_motorcycles(first_frame_resized, prompt)
     if len(boxes) == 0:
         return video_path  # No motorcycles detected, return original
-
-    #
-
-
-    boxes = boxes * [scale_x, scale_y, scale_x, scale_y]
-
-    # Run SAM on video with boxes prompt
-    results = sam_model.predict(source=video_path, bboxes=boxes, stream=True, imgsz=320)  # Stream and low resolution
-
-    # Prepare output video
-    cap = cv2.VideoCapture(video_path)
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    width = 320
-    height = 180
-    output_path = "output.mp4"
-    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
-
+
+    # Boxes are already in the target resolution coordinate system
+    print(f"Detected {len(boxes)} objects with boxes: {boxes}")
+
     # Color map for highlighting
     color_map = {"red": (0, 0, 255), "green": (0, 255, 0), "blue": (255, 0, 0)}
     highlight_rgb = color_map.get(highlight_color.lower(), (0, 0, 255))
-    # (18 removed lines: the old frame-processing loop; not recoverable from the page)
+
+    # Process video frame by frame instead of using SAM's video prediction
+    cap = cv2.VideoCapture(video_path)
+    output_path = "output.mp4"
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), original_fps, (target_width, target_height))
+
+    frame_count = 0
+    max_frames = min(total_frames, 150)  # Limit to 150 frames (~5 seconds at 30fps)
+
+    print(f"Processing {max_frames} frames...")
+
+    while frame_count < max_frames:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Resize frame to target resolution
+        frame_resized = cv2.resize(frame, (target_width, target_height))
+
+        try:
+            # Run SAM on individual frame with explicit resolution control
+            sam_results = sam_model.predict(
+                source=frame_resized,
+                bboxes=boxes,
+                device="cpu",
+                imgsz=320,  # Force SAM resolution
+                conf=0.25,
+                verbose=False
+            )
+
+            highlighted_frame = frame_resized.copy()
+
+            # Process SAM results
+            if len(sam_results) > 0 and sam_results[0].masks is not None:
+                masks = sam_results[0].masks.data.cpu().numpy()
+
+                if len(masks) > 0:
+                    # Combine all masks
+                    combined_mask = np.any(masks, axis=0).astype(np.uint8)
+
+                    # Create colored overlay
+                    overlay = np.zeros_like(frame_resized)
+                    overlay[combined_mask == 1] = highlight_rgb
+
+                    # Blend with original frame
+                    highlighted_frame = cv2.addWeighted(frame_resized, 0.7, overlay, 0.3, 0)
+
+        except Exception as e:
+            print(f"Error processing frame {frame_count}: {e}")
+            highlighted_frame = frame_resized
+
         out.write(highlighted_frame)
-
-
+        frame_count += 1
+
+        # Progress indicator
+        if frame_count % 30 == 0:
+            print(f"Processed {frame_count}/{max_frames} frames")
+
     cap.release()
     out.release()
-
+
+    print(f"Video processing complete. Output saved to {output_path}")
     return output_path
 
 # Gradio interface
@@ -89,11 +127,18 @@ iface = gr.Interface(
     fn=segment_and_highlight_video,
     inputs=[
         gr.Video(label="Upload Video"),
-        gr.Textbox(label="Prompt", placeholder="e.g., motorcycle"),
-        gr.Dropdown(choices=["red", "green", "blue"], label="Highlight Color")
+        gr.Textbox(label="Prompt", placeholder="e.g., motorcycle", value="motorcycle"),
+        gr.Dropdown(choices=["red", "green", "blue"], label="Highlight Color", value="red")
     ],
     outputs=gr.Video(label="Highlighted Video"),
-    title="Video Segmentation with MobileSAM and YOLO-World (CPU)",
-    description="Upload a short video (5-10 seconds), specify a text prompt (e.g., 'motorcycle'), and choose a highlight color. Optimized for CPU."
+    title="Video Segmentation with MobileSAM and YOLO-World (CPU Optimized)",
+    description="Upload a short video (5-10 seconds), specify a text prompt (e.g., 'motorcycle'), and choose a highlight color. Optimized for CPU with 320x180 resolution.",
+    examples=[
+        [None, "motorcycle", "red"],
+        [None, "car", "green"],
+        [None, "person", "blue"]
+    ]
 )
-
+
+if __name__ == "__main__":
+    iface.launch()
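For reference, the highlight step in the new loop is a plain alpha blend: cv2.addWeighted computes 0.7*frame + 0.3*overlay per pixel, so masked pixels shift 30% toward the chosen BGR color while unmasked pixels (black in the overlay) are only dimmed by the same factor. A self-contained sketch with dummy data (shapes and colors mirror the app; the values are illustrative):

import cv2
import numpy as np

frame = np.full((180, 320, 3), 128, dtype=np.uint8)  # dummy mid-gray frame
mask = np.zeros((180, 320), dtype=np.uint8)
mask[60:120, 100:220] = 1                            # dummy segmentation mask

overlay = np.zeros_like(frame)
overlay[mask == 1] = (0, 0, 255)                     # BGR red, as in color_map

# 0.7*frame + 0.3*overlay: masked pixels become (90, 90, 166), the rest (90, 90, 90)
highlighted = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)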