Spaces:

furbola
/

chaskick

Build error

App Files Community

Mirko Trasciatti committed on Nov 10, 2025

Commit

8457ca9

1 Parent(s): a2fc2ab

Restore to last known working version (single object only)

Browse files

Files changed (1) hide show

app.py +34 -215

app.py CHANGED Viewed

@@ -65,138 +65,6 @@ def load_video_cv2(video_path):
     return frames, {'fps': fps}
-@spaces.GPU
-def segment_video_multi_objects(video_file, annotations_json, remove_bg):
-    """
-    Segment video with MULTIPLE objects.
-    annotations_json: JSON string with format:
-    [
-        {"x": 360, "y": 640, "frame": 0, "obj_id": 1},
-        {"x": 360, "y": 640, "frame": 189, "obj_id": 2}
-    ]
-    """
-    global device, model, processor
-    if model is None:
-        initialize_model()
-    try:
-        if video_file is None:
-            return None, "❌ Error: No video file provided"
-        video_path = str(video_file)
-        if not os.path.exists(video_path):
-            return None, f"❌ Error: Video file not found: {video_path}"
-        print(f"Processing video from: {video_path}")
-        # Parse annotations
-        try:
-            annotations = json.loads(annotations_json)
-        except:
-            return None, f"❌ Error: Invalid JSON format for annotations"
-        if not annotations or len(annotations) == 0:
-            return None, "❌ Error: No annotations provided"
-        print(f"Processing {len(annotations)} objects...")
-        # Load video
-        video_frames, video_info = load_video_cv2(video_path)
-        fps = video_info.get('fps', 30.0)
-        # Initialize inference session
-        dtype = torch.float32
-        inference_session = processor.init_video_session(
-            video=video_frames,
-            inference_device=device,
-            dtype=dtype,
-        )
-        # Add all annotations
-        for ann in annotations:
-            x = int(ann['x'])
-            y = int(ann['y'])
-            frame = int(ann['frame'])
-            obj_id = int(ann.get('obj_id', 1))
-            print(f"  Adding object {obj_id} at ({x}, {y}) on frame {frame}")
-            processor.add_inputs_to_inference_session(
-                inference_session=inference_session,
-                frame_idx=frame,
-                obj_ids=obj_id,
-                input_points=[[[[x, y]]]],
-                input_labels=[[[1]]],
-            )
-            # Run inference on this frame
-            model(
-                inference_session=inference_session,
-                frame_idx=frame,
-            )
-        # Propagate through video (will track ALL objects)
-        video_segments = {}
-        for sam2_output in model.propagate_in_video_iterator(inference_session):
-            video_res_masks = processor.post_process_masks(
-                [sam2_output.pred_masks],
-                original_sizes=[[inference_session.video_height, inference_session.video_width]],
-                binarize=False,
-            )[0]
-            video_segments[sam2_output.frame_idx] = video_res_masks
-        # Create output video
-        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
-        first_frame = np.array(video_frames[0])
-        height, width = first_frame.shape[:2]
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-        for frame_idx, frame_pil in enumerate(video_frames):
-            frame = np.array(frame_pil)
-            if frame_idx in video_segments:
-                mask = video_segments[frame_idx].cpu().numpy()
-                # Combine ALL object masks
-                if mask.ndim == 4:
-                    # Shape: [batch, num_objects, height, width]
-                    # Combine across object dimension
-                    mask = mask[0]  # Remove batch dim
-                if mask.ndim == 3:
-                    # Combine all object masks with max (OR operation)
-                    mask = mask.max(axis=0)
-                if mask.shape != (height, width):
-                    mask = cv2.resize(mask, (width, height), interpolation=cv2.INTER_NEAREST)
-                mask_binary = (mask > 0.5).astype(np.uint8)
-                if remove_bg:
-                    background = np.zeros_like(frame)
-                    mask_3d = np.repeat(mask_binary[:, :, np.newaxis], 3, axis=2)
-                    frame = frame * mask_3d + background * (1 - mask_3d)
-            frame_bgr = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR)
-            out.write(frame_bgr)
-        out.release()
-        if os.path.exists(output_path):
-            return output_path, f"✅ Success! Processed {len(annotations)} objects across {len(video_segments)} frames"
-        else:
-            return None, f"❌ Error: Output file was not created"
-    except Exception as e:
-        import traceback
-        error_details = traceback.format_exc()
-        print(f"Error in segment_video_multi_objects: {error_details}")
-        return None, f"❌ Error: {str(e)}"
 @spaces.GPU
 def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
     """Simple video segmentation with a single point."""
@@ -261,7 +129,7 @@ def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
             video_segments[sam2_output.frame_idx] = video_res_masks
         # Create output video
-        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
         first_frame = np.array(video_frames[0])
         height, width = first_frame.shape[:2]
@@ -316,95 +184,46 @@ def create_app():
         # 🎥 SAM2 Video Background Remover
         Remove backgrounds from videos by tracking objects with Meta's SAM2.
         """)
-        with gr.Tabs():
-            # Tab 1: Single Object
-            with gr.Tab("Single Object"):
-                gr.Markdown("""
-                **Track ONE object:**
-                1. Upload a video
-                2. Enter X, Y coordinates of the object to track
-                3. Click "Process Video"
-                """)
                 with gr.Row():
-                    with gr.Column():
-                        video_input = gr.File(label="Upload Video", file_types=["video"])
-                        with gr.Row():
-                            point_x = gr.Textbox(label="Point X", value="320")
-                            point_y = gr.Textbox(label="Point Y", value="240")
-                        frame_idx = gr.Textbox(label="Frame Index", value="0")
-                        remove_bg = gr.Checkbox(label="Remove Background", value=True)
-                        process_btn = gr.Button("🎬 Process Video", variant="primary")
-                    with gr.Column():
-                        output_video = gr.File(label="Output Video")
-                        status_text = gr.Textbox(label="Status", lines=3)
-                process_btn.click(
-                    fn=segment_video_simple,
-                    inputs=[video_input, point_x, point_y, frame_idx, remove_bg],
-                    outputs=[output_video, status_text]
-                )
-                gr.Markdown("""
-                ### Tips:
-                - Point X, Y: Coordinates of the object in the video
-                - Frame Index: Usually 0 (first frame)
-                - Portrait and landscape videos are both supported!
-                """)
-            # Tab 2: Multiple Objects
-            with gr.Tab("Multiple Objects"):
-                gr.Markdown("""
-                **Track MULTIPLE objects:**
-                1. Upload a video
-                2. Enter annotations as JSON (see example below)
-                3. Click "Process Video"
-                **Example JSON** (ball at frame 0, player at frame 189):
-                ```json
-                [
-                    {"x": 360, "y": 640, "frame": 0, "obj_id": 1},
-                    {"x": 360, "y": 640, "frame": 189, "obj_id": 2}
-                ]
-                ```
-                """)
-                with gr.Row():
-                    with gr.Column():
-                        video_input_multi = gr.File(label="Upload Video", file_types=["video"])
-                        annotations_json = gr.Textbox(
-                            label="Annotations (JSON)",
-                            value='[{"x": 360, "y": 640, "frame": 0, "obj_id": 1}, {"x": 360, "y": 640, "frame": 189, "obj_id": 2}]',
-                            lines=5
-                        )
-                        remove_bg_multi = gr.Checkbox(label="Remove Background", value=True)
-                        process_btn_multi = gr.Button("🎬 Process Multiple Objects", variant="primary")
-                    with gr.Column():
-                        output_video_multi = gr.File(label="Output Video")
-                        status_text_multi = gr.Textbox(label="Status", lines=3)
-                process_btn_multi.click(
-                    fn=segment_video_multi_objects,
-                    inputs=[video_input_multi, annotations_json, remove_bg_multi],
-                    outputs=[output_video_multi, status_text_multi]
-                )
-                gr.Markdown("""
-                ### Tips:
-                - Each object needs: `x`, `y`, `frame`, and unique `obj_id`
-                - Pick frames where each object is clearly visible
-                - All objects will be tracked and combined in the output!
-                """)
     return app

     return frames, {'fps': fps}
 @spaces.GPU
 def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
     """Simple video segmentation with a single point."""
             video_segments[sam2_output.frame_idx] = video_res_masks
         # Create output video
+        output_path = tempfile.mktemp(suffix=".mp4")
         first_frame = np.array(video_frames[0])
         height, width = first_frame.shape[:2]
         # 🎥 SAM2 Video Background Remover
         Remove backgrounds from videos by tracking objects with Meta's SAM2.
+        **How to use:**
+        1. Upload a video
+        2. Enter X, Y coordinates of the object to track (from first frame)
+        3. Click "Process Video"
         """)
+        with gr.Row():
+            with gr.Column():
+                # Using gr.File instead of gr.Video for better API compatibility
+                video_input = gr.File(label="Upload Video", file_types=["video"])
                 with gr.Row():
+                    point_x = gr.Textbox(label="Point X", value="320")
+                    point_y = gr.Textbox(label="Point Y", value="240")
+                frame_idx = gr.Textbox(label="Frame Index", value="0")
+                remove_bg = gr.Checkbox(label="Remove Background", value=True)
+                process_btn = gr.Button("🎬 Process Video", variant="primary")
+            with gr.Column():
+                output_video = gr.File(label="Output Video")
+                status_text = gr.Textbox(label="Status", lines=3)
+        process_btn.click(
+            fn=segment_video_simple,
+            inputs=[video_input, point_x, point_y, frame_idx, remove_bg],
+            outputs=[output_video, status_text]
+        )
+        gr.Markdown("""
+        ### Tips:
+        - Point X, Y: Coordinates of the object in the video
+        - For a 720x1280 portrait video, center is typically X=360, Y=640
+        - For a 1920x1080 landscape video, center is typically X=960, Y=540
+        - Frame Index: Usually 0 (first frame)
+        - Processing time depends on video length (CPU processing is slow)
+        - Portrait and landscape videos are both supported!
+        """)
     return app