Spaces: Sleeping
Upload folder using huggingface_hub
Browse files
- .gitattributes +1 -0
- app.py +136 -78
- examples/sample_surgical.png +3 -0
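The commit message above refers to the huggingface_hub upload API. As a rough sketch, a commit like this is typically produced with upload_folder; the repo_id and folder_path below are illustrative placeholders, not values taken from this commit:

from huggingface_hub import upload_folder

# Illustrative placeholders only: push a local working copy to a Space
# in a single commit with the auto-generated message shown above.
upload_folder(
    folder_path=".",
    repo_id="<user>/<space-name>",
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)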
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 models/sam/notebooks/images/groceries.jpg filter=lfs diff=lfs merge=lfs -text
 models/sam/notebooks/images/truck.jpg filter=lfs diff=lfs merge=lfs -text
+examples/sample_surgical.png filter=lfs diff=lfs merge=lfs -text
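The added rule stores the new example image through Git LFS rather than as a regular blob. A quick local sanity check, sketched with Python's subprocess (the expected output line assumes standard git-lfs behavior):

import subprocess

# Ask git which filter applies to the new file; with the rule above in
# .gitattributes it should report the LFS filter.
out = subprocess.run(
    ["git", "check-attr", "filter", "examples/sample_surgical.png"],
    capture_output=True, text=True, check=True,
).stdout
print(out)  # expected: examples/sample_surgical.png: filter: lfs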
app.py CHANGED
@@ -1,6 +1,6 @@
 """
 Surgical-DeSAM Gradio App for Hugging Face Spaces
-
+Supports both Image and Video segmentation with ZeroGPU
 """
 import os
 import spaces
@@ -10,8 +10,9 @@ import numpy as np
 import cv2
 from PIL import Image
 from huggingface_hub import hf_hub_download
+import tempfile
 
-# Model imports
+# Model imports
 from models.detr_seg import DETR, SAMModel
 from models.backbone import build_backbone
 from models.transformer import build_transformer
@@ -44,7 +45,6 @@ def download_weights():
     weights_dir = "weights"
     os.makedirs(weights_dir, exist_ok=True)
 
-    # Download DeSAM weights
     desam_path = hf_hub_download(
         repo_id=MODEL_REPO,
         filename="surgical_desam_1024.pth",
@@ -52,7 +52,6 @@ def download_weights():
         local_dir=weights_dir
     )
 
-    # Download SAM weights
     sam_path = hf_hub_download(
         repo_id=MODEL_REPO,
         filename="sam_vit_b_01ec64.pth",
@@ -60,10 +59,9 @@ def download_weights():
         local_dir=weights_dir
     )
 
-    # Download Swin backbone
     swin_dir = "swin_backbone"
     os.makedirs(swin_dir, exist_ok=True)
-
+    hf_hub_download(
         repo_id=MODEL_REPO,
         filename="swin_base_patch4_window7_224_22kto1k.pth",
         token=HF_TOKEN,
@@ -99,11 +97,8 @@ def load_models():
     global model, seg_model, device
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Download weights
     desam_path, sam_path = download_weights()
 
-    # Build model
     args = Args()
     args.device = str(device)
 
@@ -113,18 +108,16 @@ def load_models():
     model = DETR(
         backbone,
         transformer,
-        num_classes=9,
+        num_classes=9,
         num_queries=args.num_queries,
         aux_loss=args.aux_loss,
     )
 
-    # Load weights
     checkpoint = torch.load(desam_path, map_location='cpu')
     model.load_state_dict(checkpoint['model'], strict=False)
     model.to(device)
     model.eval()
 
-    # Load SAM model
     seg_model = SAMModel(device=device, ckpt_path=sam_path)
     if 'seg_model' in checkpoint:
         seg_model.load_state_dict(checkpoint['seg_model'])
@@ -134,18 +127,13 @@ def load_models():
     print("Models loaded successfully!")
 
 
-def
-"""Preprocess
-
-    img = cv2.resize(np.array(image), (1024, 1024))
+def preprocess_frame(frame):
+    """Preprocess frame for model input"""
+    img = cv2.resize(frame, (1024, 1024))
     img = img.astype(np.float32) / 255.0
-
-    # Normalize
     mean = np.array([0.485, 0.456, 0.406])
     std = np.array([0.229, 0.224, 0.225])
     img = (img - mean) / std
-
-    # Convert to tensor
     img_tensor = torch.from_numpy(img.transpose(2, 0, 1)).float()
     return img_tensor
 
@@ -158,38 +146,66 @@ def box_cxcywh_to_xyxy(x):
     return torch.stack(b, dim=-1)
 
 
-def
-"""
-
-
+def process_single_frame(frame_rgb, h, w):
+    """Process a single frame and return segmented result"""
+    global model, seg_model, device
+
+    img_tensor = preprocess_frame(frame_rgb).unsqueeze(0).to(device)
+
+    mask = torch.zeros((1, 1024, 1024), dtype=torch.bool, device=device)
+    samples = NestedTensor(img_tensor, mask)
+
+    with torch.no_grad():
+        outputs, image_embeddings = model(samples)
+
+    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
+    keep = probas.max(-1).values > 0.3
+
+    if not keep.any():
+        return frame_rgb  # No detections
+
+    boxes = outputs['pred_boxes'][0, keep]
+    scores = probas[keep].max(-1).values.cpu().numpy()
+    labels = probas[keep].argmax(-1).cpu().numpy()
+
+    boxes_scaled = box_cxcywh_to_xyxy(boxes) * torch.tensor([w, h, w, h], device=device)
+    boxes_np = boxes_scaled.cpu().numpy()
+
+    low_res_masks, pred_masks, _ = seg_model(
+        img_tensor, boxes, image_embeddings,
+        sizes=(1024, 1024), add_noise=False
+    )
+    masks_np = pred_masks.cpu().numpy()
 
-
+    # Draw on frame
+    result = frame_rgb.copy()
+    for i, (box, label, mask_pred, score) in enumerate(zip(boxes_np, labels, masks_np, scores)):
         if score < 0.3:
             continue
-
+
         color = COLORS[label % len(COLORS)]
 
         # Draw mask
-        mask_resized = cv2.resize(
+        mask_resized = cv2.resize(mask_pred, (w, h))
         mask_bool = mask_resized > 0.5
-        overlay =
+        overlay = result.copy()
         overlay[mask_bool] = color
-
+        result = cv2.addWeighted(result, 0.6, overlay, 0.4, 0)
 
         # Draw box
         x1, y1, x2, y2 = box.astype(int)
-        cv2.rectangle(
+        cv2.rectangle(result, (x1, y1), (x2, y2), color, 2)
 
         # Draw label
         label_text = f"{INSTRUMENT_CLASSES[label]}: {score:.2f}"
-        cv2.putText(
+        cv2.putText(result, label_text, (x1, y1 - 10),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
 
-    return
+    return result
 
 
 @spaces.GPU
-def predict(image):
+def predict_image(image):
     """Run inference on input image"""
     global model, seg_model, device
 
@@ -199,66 +215,108 @@ def predict(image):
     if image is None:
         return None
 
-
-
+    frame_rgb = np.array(image)
+    h, w = frame_rgb.shape[:2]
 
-
-    mask = torch.zeros((1, 1024, 1024), dtype=torch.bool, device=device)
-    samples = NestedTensor(img_tensor, mask)
+    result = process_single_frame(frame_rgb, h, w)
 
-
-
-
-
-
-
-
+    return Image.fromarray(result)
+
+
+@spaces.GPU(duration=300)
+def predict_video(video_path, progress=gr.Progress()):
+    """Process video and return segmented video"""
+    global model, seg_model, device
+
+    if model is None:
+        progress(0, desc="Loading models...")
+        load_models()
+
+    if video_path is None:
+        return None
+
+    # Open video
+    cap = cv2.VideoCapture(video_path)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+    # Output video
+    output_path = tempfile.mktemp(suffix=".mp4")
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+    frame_count = 0
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
 
-
-
+        # BGR to RGB
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
-    #
-
-    scores = probas[keep].max(-1).values.cpu().numpy()
-    labels = probas[keep].argmax(-1).cpu().numpy()
+        # Process frame
+        result_rgb = process_single_frame(frame_rgb, height, width)
 
-    #
-
-
-    boxes_np = boxes_scaled.cpu().numpy()
+        # RGB to BGR for output
+        result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
+        out.write(result_bgr)
 
-
-
-        img_tensor, boxes, image_embeddings,
-        sizes=(1024, 1024), add_noise=False
-    )
-    masks_np = pred_masks.cpu().numpy()
+        frame_count += 1
+        progress(frame_count / total_frames, desc=f"Processing frame {frame_count}/{total_frames}")
 
-
-
+    cap.release()
+    out.release()
 
-    return
+    return output_path
 
 
 # Create Gradio interface
-with gr.Blocks(title="Surgical-DeSAM") as demo:
+with gr.Blocks(title="Surgical-DeSAM", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🔬 Surgical-DeSAM")
-    gr.Markdown("
+    gr.Markdown("Segment surgical instruments in images or videos using DeSAM architecture.")
 
-    with gr.
-
-
-
+    with gr.Tabs():
+        # Image Tab
+        with gr.TabItem("🖼️ Image Segmentation"):
+            with gr.Row():
+                with gr.Column():
+                    input_image = gr.Image(type="pil", label="Input Image")
+                    image_btn = gr.Button("Segment Image", variant="primary")
+                with gr.Column():
+                    output_image = gr.Image(type="pil", label="Segmentation Result")
+
+            image_btn.click(fn=predict_image, inputs=input_image, outputs=output_image)
+
+            gr.Examples(
+                examples=["examples/sample_surgical.png"] if os.path.exists("examples/sample_surgical.png") else [],
+                inputs=input_image,
+                label="Example Images"
+            )
 
-
-
-
-
+        # Video Tab
+        with gr.TabItem("🎬 Video Segmentation"):
+            with gr.Row():
+                with gr.Column():
+                    input_video = gr.Video(label="Input Video")
+                    video_btn = gr.Button("Segment Video", variant="primary")
+                with gr.Column():
+                    output_video = gr.Video(label="Segmentation Result")
+
+            video_btn.click(fn=predict_video, inputs=input_video, outputs=output_video)
+
+            gr.Examples(
+                examples=["examples/demo_surgical.mp4"] if os.path.exists("examples/demo_surgical.mp4") else [],
+                inputs=input_video,
+                label="Example Videos"
+            )
 
-    gr.
-
-
-
+    gr.Markdown("""
+    ## Detected Classes
+    Bipolar Forceps | Prograsp Forceps | Large Needle Driver | Monopolar Curved Scissors |
+    Ultrasound Probe | Suction | Clip Applier | Stapler
+    """)
 
 if __name__ == "__main__":
     demo.launch()
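process_single_frame above rescales DETR's normalized (cx, cy, w, h) predictions with box_cxcywh_to_xyxy(boxes) * torch.tensor([w, h, w, h]). A self-contained sketch of that conversion, assuming the helper follows the standard DETR formulation:

import torch

# Standard DETR box conversion: center/size -> corner coordinates.
def box_cxcywh_to_xyxy(x):
    cx, cy, bw, bh = x.unbind(-1)
    return torch.stack(
        [cx - 0.5 * bw, cy - 0.5 * bh, cx + 0.5 * bw, cy + 0.5 * bh], dim=-1
    )

box = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # one normalized prediction
# Scaling by [w, h, w, h] maps the normalized corners to pixel coordinates.
print(box_cxcywh_to_xyxy(box) * torch.tensor([1024, 1024, 1024, 1024]))
# tensor([[409.6000, 307.2000, 614.4000, 716.8000]])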
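One note on the video path: predict_video writes its output to tempfile.mktemp(suffix=".mp4"), and mktemp is documented as unsafe because the returned name can be claimed by another process before it is used. A minimal non-racy alternative, sketched as a suggestion rather than as part of this commit:

import tempfile

# NamedTemporaryFile creates the file atomically; delete=False keeps it on
# disk so cv2.VideoWriter can reopen the path and Gradio can serve it later.
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
    output_path = tmp.name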
examples/sample_surgical.png ADDED
Git LFS Details