tp53 (ashish) committed on
Commit
c94cf50
·
1 Parent(s): 49e4f07

Remove model folder - use fallback mode

Browse files
Files changed (2) hide show
  1. model/__init__.py +0 -0
  2. model/medsam3.py +0 -379
model/__init__.py DELETED
File without changes
model/medsam3.py DELETED
@@ -1,379 +0,0 @@
1
- import os
2
- import torch
3
- import torch.nn as nn
4
- from typing import Dict, Optional, List, Any
5
-
6
- try:
7
- from sam3.model_builder import build_sam3_image_model as build_sam3_model
8
- from sam3.model.data_misc import BatchedDatapoint, FindStage, BatchedFindTarget, BatchedInferenceMetadata
9
- from sam3.model import decoder as sam3_decoder
10
- SAM3_AVAILABLE = True
11
- except ImportError:
12
- build_sam3_model = None
13
- BatchedDatapoint = None
14
- FindStage = None
15
- BatchedFindTarget = None
16
- BatchedInferenceMetadata = None
17
- sam3_decoder = None
18
- SAM3_AVAILABLE = False
19
-
20
- from peft import LoraConfig, get_peft_model
21
-
22
-
23
def _patch_sam3_decoder_for_ddp():
    """
    Monkey-patch SAM3's decoder to fix a DDP device-placement bug.

    The bug: SAM3 caches coords_h/coords_w in ``compilable_cord_cache`` and
    ``coord_cache``. In DDP, these get created on cuda:0 first, then other
    ranks fail because the cached coords are on the wrong device.

    The fix: patch ``_get_rpb_matrix`` to always move cached coords to the
    correct device before delegating to the original implementation.

    The patch is idempotent: calling this function more than once (e.g. on
    a module reload) is a no-op instead of stacking wrapper upon wrapper.
    """
    if not SAM3_AVAILABLE or sam3_decoder is None:
        return

    # Find the decoder class that has _get_rpb_matrix
    decoder_cls = None
    for name in dir(sam3_decoder):
        cls = getattr(sam3_decoder, name)
        if isinstance(cls, type) and hasattr(cls, '_get_rpb_matrix'):
            decoder_cls = cls
            break

    if decoder_cls is None:
        print("[MedSAM3] Warning: Could not find decoder class to patch")
        return

    # Idempotency guard: if the method is already our wrapper, do nothing.
    # Without this, a second call would wrap the wrapper, growing an
    # ever-deeper delegation chain on every invocation.
    if getattr(decoder_cls._get_rpb_matrix, "_medsam3_ddp_patched", False):
        return

    # Store original method so the wrapper can delegate to it
    original_get_rpb_matrix = decoder_cls._get_rpb_matrix

    def patched_get_rpb_matrix(self, *args, **kwargs):
        """Patched version that ensures coords are on the correct device."""
        # Get device from first tensor argument (reference_boxes)
        target_device = None
        for arg in args:
            if torch.is_tensor(arg):
                target_device = arg.device
                break
        if target_device is None:
            for v in kwargs.values():
                if torch.is_tensor(v):
                    target_device = v.device
                    break

        if target_device is not None:
            # Fix compilable_cord_cache if device mismatch
            if hasattr(self, 'compilable_cord_cache') and self.compilable_cord_cache is not None:
                cached_h, cached_w = self.compilable_cord_cache
                if cached_h.device != target_device:
                    self.compilable_cord_cache = (
                        cached_h.to(target_device),
                        cached_w.to(target_device)
                    )

            # Also fix every entry of the coord_cache dict
            if hasattr(self, 'coord_cache') and self.coord_cache:
                for key in list(self.coord_cache.keys()):
                    cached_h, cached_w = self.coord_cache[key]
                    if cached_h.device != target_device:
                        self.coord_cache[key] = (
                            cached_h.to(target_device),
                            cached_w.to(target_device)
                        )

        return original_get_rpb_matrix(self, *args, **kwargs)

    # Mark the wrapper so a repeated patch attempt becomes a no-op
    patched_get_rpb_matrix._medsam3_ddp_patched = True

    # Apply patch
    decoder_cls._get_rpb_matrix = patched_get_rpb_matrix
    print("[MedSAM3] Successfully patched SAM3 decoder for DDP compatibility")
90
-
91
-
92
# Apply the patch at module load time
_patch_sam3_decoder_for_ddp()
94
-
95
class MedSAM3Model(nn.Module):
    """LoRA-adapted SAM3 wrapper for medical image segmentation.

    Builds the official SAM3 image model, optionally loads a full-model
    checkpoint, freezes the perception-encoder backbone, injects LoRA
    adapters into the attention ``qkv``/``proj`` layers, and translates
    plain (pixel_values, boxes/points) inputs into SAM3's
    ``BatchedDatapoint`` structure in ``forward``.
    """

    def __init__(self, model_id: str = "sam3_hiera_base", lora_rank: int = 16, image_size: int = 1024, checkpoint_path: Optional[str] = None):
        """
        Args:
            model_id: Kept for interface compatibility; the architecture is
                built via ``build_sam3_model`` and this id is not consumed here.
            lora_rank: LoRA rank ``r``; ``lora_alpha`` is set to ``2 * r``.
            image_size: Nominal square input size, stored for metadata and
                coordinate normalization fallback.
            checkpoint_path: Optional path to a checkpoint file; weights may
                be wrapped under a top-level ``"model"`` key.

        Raises:
            ImportError: If the SAM3 core libraries failed to import.
        """
        super().__init__()
        self._logged_shapes = False  # For one-time debug logging
        self._buffers_migrated = False  # Track if we've done buffer device migration
        self.image_size = image_size  # Store for coordinate normalization
        # --- 1. Initialize SAM 3 Architecture ---
        if build_sam3_model:
            # Initialize SAM3 architecture without downloading from HuggingFace
            # (our checkpoint already contains full weights including base SAM3)
            self.model = build_sam3_model(load_from_HF=False, eval_mode=False)

            # --- 2. Load Weights ---
            if checkpoint_path and os.path.exists(checkpoint_path):
                # NOTE(review): torch.load without weights_only=True can run
                # arbitrary pickled code - only load trusted checkpoints.
                state_dict = torch.load(checkpoint_path, map_location="cpu")
                if "model" in state_dict:
                    state_dict = state_dict["model"]
                # strict=False tolerates LoRA/auxiliary keys, but report any
                # mismatch instead of discarding it silently.
                load_result = self.model.load_state_dict(state_dict, strict=False)
                if load_result.missing_keys or load_result.unexpected_keys:
                    print(f"[MedSAM3] Checkpoint loaded with "
                          f"{len(load_result.missing_keys)} missing / "
                          f"{len(load_result.unexpected_keys)} unexpected keys")
        else:
            raise ImportError(
                "CRITICAL: SAM3 core libraries not found. "
                "Ensure you have installed sam3 correctly (e.g. via pip install git+...sam3.git). "
                "Check logs for previous import errors."
            )

        # --- 3. Freeze Backbone ---
        for name, param in self.model.named_parameters():
            if "perception_encoder" in name:
                param.requires_grad = False

        # --- 4. Apply LoRA ---
        lora_config = LoraConfig(
            r=lora_rank,
            lora_alpha=lora_rank * 2,
            target_modules=["qkv", "proj"],
            lora_dropout=0.1,
            bias="none",
            task_type=None  # Important: prevents peft from injecting 'input_ids'
        )
        self.model = get_peft_model(self.model, lora_config)

        # --- 5. Foundation Specialist Fix: Dummy Matcher ---
        # SAM3's forward_grounding path (which handles boxes/points) sometimes
        # attempts to call self.matcher even if no targets are provided.
        # We inject a dummy matcher that returns empty indices to prevent
        # 'NoneType object is not callable' crashes.
        base_model = self.model.get_base_model()
        if hasattr(base_model, 'matcher') and base_model.matcher is None:
            # Matcher expected signature: func(outputs, targets) -> list of matches
            base_model.matcher = lambda outputs, targets: []
            print("[MedSAM3] Injected dummy matcher for grounding stability")

    def forward(self, pixel_values, input_boxes=None, input_points=None, point_labels=None, text_prompt=None):
        """Run SAM3 on a batch of images with optional box/point/text prompts.

        Args:
            pixel_values: (B, C, H, W) images or (B, C, T, H, W) volumes
                (volumes are flattened to (B*T, C, H, W)).
            input_boxes: Optional xyxy pixel-coordinate boxes, (B, 4),
                (B, 1, 4), or (B, T, 4) for volumetric input.
            input_points: Optional pixel-coordinate points, e.g. (B, N, 2)
                or (B, T, 1, 2) for volumetric input.
            point_labels: Optional labels matching ``input_points``.
            text_prompt: Optional text prompt broadcast to the whole batch
                (falls back to "medical").

        Returns:
            Raw SAM3 output from ``self.model(data)``, or
            ``self.model(pixel_values)`` when SAM3 data structures are
            unavailable.
        """
        # DDP Fix: Ensure all model buffers are on the same device as input
        # SAM3 has some internal buffers that don't auto-migrate in DDP
        # Only do this once per device to avoid overhead on every forward pass
        target_device = pixel_values.device
        if not self._buffers_migrated:
            migrated_count = 0
            for name, buf in self.model.named_buffers():
                if buf.device != target_device:
                    buf.data = buf.data.to(target_device)
                    migrated_count += 1
            if migrated_count > 0:
                print(f"[MedSAM3] Migrated {migrated_count} buffers to {target_device}")
            self._buffers_migrated = True

        # Debug: Log shapes once on first forward pass
        if not self._logged_shapes:
            print("[MedSAM3] First forward - Input shapes:")
            print(f"  pixel_values: {pixel_values.shape}")
            print(f"  input_boxes: {input_boxes.shape if input_boxes is not None else None}")
            print(f"  input_points: {input_points.shape if input_points is not None else None}")
            print(f"  point_labels: {point_labels.shape if point_labels is not None else None}")

        # --- 1. Handle 3D to 2D Flattening (Robust) ---
        if pixel_values.dim() == 5:
            # Input: (B, C, T, H, W) -> Goal: (B*T, C, H, W)
            B_orig, C, T, H, W = pixel_values.shape
            # Permute to (B, T, C, H, W) then flatten
            pixel_values = pixel_values.permute(0, 2, 1, 3, 4).reshape(B_orig * T, C, H, W)

            if input_boxes is not None:
                # input_boxes is (B, T, 4) -> (B*T, 4)
                input_boxes = input_boxes.view(B_orig * T, 4)
            if input_points is not None:
                # input_points is (B, T, 1, 2) -> (B*T, 1, 2)
                input_points = input_points.view(B_orig * T, -1, 2)
            if point_labels is not None:
                # point_labels is (B, T, 1) -> (B*T, 1)
                point_labels = point_labels.view(B_orig * T, -1)

        # After reshaping, get the actual batch size
        B = pixel_values.shape[0]

        # --- 2. Channel Handling (Ensuring 3 channels for SAM3) ---
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            # Single-channel (e.g., CT) -> replicate to 3 channels
            pixel_values = pixel_values.repeat(1, 3, 1, 1)
        elif num_channels == 3:
            # Already 3 channels (e.g., multi-modal MRI after SelectMRIChannels)
            pass
        elif num_channels == 4:
            # 4-channel MRI (BrainTumour) - use first 3 channels [FLAIR, T1w, T1gd]
            # This is a fallback; ideally SelectMedicalChannels should handle this in transforms
            if not self._logged_shapes:
                print(f"[MedSAM3 WARNING] Received 4-channel input - using first 3 channels. "
                      f"Consider enabling SelectMedicalChannels transform.")
            pixel_values = pixel_values[:, :3, :, :]
        else:
            # Unexpected channel count - average and replicate
            if not self._logged_shapes:
                print(f"[MedSAM3 WARNING] Unexpected {num_channels} channels - averaging to single then replicating to 3.")
            pixel_values = pixel_values.mean(dim=1, keepdim=True).repeat(1, 3, 1, 1)

        # --- 3. Prompt Dimension Enforcement ---
        # Boxes: (B_total, 1, 4)
        if input_boxes is not None and input_boxes.dim() == 2:
            input_boxes = input_boxes.unsqueeze(1)

        # Points: (B_total, 1, N, 2), Labels: (B_total, 1, N)
        if input_points is not None and input_points.dim() == 3:
            input_points = input_points.unsqueeze(1)
        if point_labels is not None and point_labels.dim() == 2:
            point_labels = point_labels.unsqueeze(1)

        # --- 4. Package into Official SAM 3 Structure ---
        if BatchedDatapoint is not None and FindStage is not None:
            # Get device from model parameters (critical for DDP multi-GPU)
            device = pixel_values.device

            # SAM 3 expects a SINGLE FindStage object that aggregates prompts for the entire batch.
            # We must flatten the batch dimension of the prompts and create corresponding img_ids.

            # Current shapes:
            #   input_boxes: (B, 1, 4)
            #   input_points: (B, 1, N, 2)
            #   point_labels: (B, 1, N)

            # We treat each image in the batch as having 1 prompt (since we have 1 box/point set per slice)
            # So we just flatten the first dimension.

            # img_ids: [0, 1, 2, ... B-1] (since 1 prompt per image)
            img_ids = torch.arange(B, device=device, dtype=torch.long)

            # Text ids: all 0 (dummy)
            text_ids = torch.zeros(B, device=device, dtype=torch.long)

            # SAM3 expects SEQUENCE-FIRST format for embeddings, BATCH-FIRST for masks:
            #   input_boxes: [num_boxes, num_prompts, 4] - sequence first
            #   input_boxes_mask: [num_prompts, num_boxes] - batch first (1=padded/invalid)
            #   input_boxes_label: [num_boxes, num_prompts]
            #   input_points: [num_points, num_prompts, 2] - sequence first
            #   input_points_mask: [num_prompts, num_points] - batch first
            #
            # For our case: 1 box per image, B images → num_boxes=1, num_prompts=B

            # Boxes: [B, 1, 4] → [1, B, 4] (sequence first)
            # SAM3 expects boxes in CxCyWH format, normalized to [0, 1]
            # Our input is xyxy in pixel coordinates
            if input_boxes is not None:
                boxes_xyxy = input_boxes.squeeze(1).float().to(device)  # [B, 4] - x_min, y_min, x_max, y_max

                # Use actual tensor dimensions for normalization (more robust than stored image_size)
                actual_h, actual_w = pixel_values.shape[2], pixel_values.shape[3]

                # Convert xyxy to cxcywh and normalize to [0, 1]
                x_min, y_min, x_max, y_max = boxes_xyxy[:, 0], boxes_xyxy[:, 1], boxes_xyxy[:, 2], boxes_xyxy[:, 3]
                cx = (x_min + x_max) / 2.0 / actual_w
                cy = (y_min + y_max) / 2.0 / actual_h
                w = (x_max - x_min) / actual_w
                h = (y_max - y_min) / actual_h

                # Clamp to ensure valid boxes (min size 1% of image to avoid ROI align issues)
                min_size = 0.01
                w = torch.clamp(w, min=min_size)
                h = torch.clamp(h, min=min_size)

                boxes_cxcywh = torch.stack([cx, cy, w, h], dim=1)  # [B, 4]
                flat_boxes = boxes_cxcywh.unsqueeze(0)  # [1, B, 4]
                flat_boxes_mask = torch.zeros(B, 1, device=device, dtype=torch.bool)  # [B, 1] - 0=valid
                flat_boxes_label = torch.zeros(1, B, device=device, dtype=torch.long)  # [1, B]
            else:
                flat_boxes = torch.zeros(1, B, 4, device=device)
                flat_boxes_mask = torch.ones(B, 1, device=device, dtype=torch.bool)  # 1=invalid/padded
                flat_boxes_label = torch.zeros(1, B, device=device, dtype=torch.long)

            # Points: [B, 1, N, 2] → [N, B, 2] (sequence first)
            # SAM3 expects points normalized to [0, 1]
            n_points = input_points.shape[2] if input_points is not None else 1
            if input_points is not None:
                points_pixel = input_points.squeeze(1).float().to(device)  # [B, N, 2] - x, y in pixel coords
                # Normalize using actual tensor dimensions
                actual_h, actual_w = pixel_values.shape[2], pixel_values.shape[3]
                points_normalized = points_pixel.clone()
                points_normalized[..., 0] = points_pixel[..., 0] / actual_w  # x normalized
                points_normalized[..., 1] = points_pixel[..., 1] / actual_h  # y normalized
                flat_points = points_normalized.permute(1, 0, 2)  # [B, N, 2] → [N, B, 2]
                flat_points_mask = torch.zeros(B, n_points, device=device, dtype=torch.bool)  # 0=valid
            else:
                flat_points = torch.zeros(1, B, 2, device=device)
                flat_points_mask = torch.ones(B, 1, device=device, dtype=torch.bool)  # 1=invalid

            stage = FindStage(
                img_ids=img_ids,
                text_ids=text_ids,
                input_boxes=flat_boxes,
                input_boxes_mask=flat_boxes_mask,
                input_boxes_label=flat_boxes_label,
                input_points=flat_points,
                input_points_mask=flat_points_mask,
            )

            # Text batch for grounding head - use provided text_prompt or fallback
            if text_prompt is not None:
                find_text_batch = [text_prompt] * B
            else:
                find_text_batch = ["medical"] * B

            # Create dummy target structure to satisfy SAM3's internal indexing [0]
            # We use the dummy matcher injected in __init__ to ensure this doesn't
            # actually trigger any real loss computation.
            dummy_target = BatchedFindTarget(
                num_boxes=torch.zeros(B, device=device, dtype=torch.long),
                boxes=torch.zeros(B, 4, device=device),
                boxes_padded=torch.zeros(B, 1, 4, device=device),
                repeated_boxes=torch.zeros(B, 4, device=device),
                segments=None,
                semantic_segments=None,
                is_valid_segment=None,
                is_exhaustive=torch.zeros(B, device=device, dtype=torch.bool),
                object_ids=torch.zeros(B, device=device, dtype=torch.long),
                object_ids_padded=torch.zeros(B, 1, device=device, dtype=torch.long),
            )

            # Create proper metadata structure (required by SAM3's type hints)
            # BatchedInferenceMetadata requires: coco_image_id, original_image_id, original_category_id,
            # original_size, object_id, frame_index, is_conditioning_only
            dummy_metadata = BatchedInferenceMetadata(
                coco_image_id=torch.zeros(B, device=device, dtype=torch.long),
                original_image_id=torch.zeros(B, device=device, dtype=torch.long),
                original_category_id=torch.zeros(B, device=device, dtype=torch.int),
                original_size=torch.tensor([[self.image_size, self.image_size]] * B, device=device, dtype=torch.long),
                object_id=torch.zeros(B, device=device, dtype=torch.long),
                frame_index=torch.zeros(B, device=device, dtype=torch.long),
                is_conditioning_only=[None] * B,
            ) if BatchedInferenceMetadata is not None else {}

            # Package into BatchedDatapoint
            # find_targets=[dummy_target]: satisfy internal 'input.find_targets[0]' access
            find_targets_list = [dummy_target]  # Pre-create to verify it's not empty
            find_metadatas_list = [dummy_metadata]
            data = BatchedDatapoint(
                img_batch=pixel_values,
                find_text_batch=find_text_batch,
                find_inputs=[stage],
                find_targets=find_targets_list,
                find_metadatas=find_metadatas_list
            )
            # Immediate verification. Explicit raise instead of `assert`
            # (asserts are stripped under `python -O`); this also covers the
            # empty-list case, so no separate emptiness check is needed below.
            if len(data.find_targets) != 1:
                raise ValueError(f"find_targets should have 1 element, got {len(data.find_targets)}")

            # Debug: Log processed shapes once
            if not self._logged_shapes:
                print("[MedSAM3] Processed shapes before SAM3 call:")
                print(f"  img_batch: {pixel_values.shape}")
                print(f"  flat_boxes: {flat_boxes.shape} (CxCyWH normalized)")
                print(f"  flat_boxes sample: {flat_boxes[0, 0, :] if flat_boxes.numel() > 0 else 'empty'}")
                print(f"  flat_boxes_mask: {flat_boxes_mask.shape}")
                print(f"  flat_points: {flat_points.shape} (normalized)")
                print(f"  flat_points_mask: {flat_points_mask.shape}")
                print(f"  img_ids: {img_ids.shape}")
                print(f"  find_targets: {data.find_targets}, len={len(data.find_targets)}")
                print(f"  find_inputs: {data.find_inputs}, len={len(data.find_inputs)}")
                self._logged_shapes = True

            return self.model(data)
        else:
            return self.model(pixel_values)