Upload 13 files
Browse files
- .gitattributes +1 -0
- LICENSE +21 -0
- README.md +47 -3
- docs/MCP-MedSAM.png +3 -0
- infer.py +738 -0
- modality_npz_dataset.py +317 -0
- models/__init__.py +4 -0
- models/common.py +44 -0
- models/lite_medsam.py +54 -0
- models/mask_decoder.py +465 -0
- models/prompt_encoder.py +306 -0
- models/tiny_vit.py +645 -0
- models/transformer.py +243 -0
- train.py +502 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/MCP-MedSAM.png filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Leo-Lyu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,47 @@
# MCP-MedSAM

PyTorch implementation of the paper:
"[MCP-MedSAM: A Powerful Lightweight Medical Segment Anything Model Trained with a Single GPU in Just One Day](https://arxiv.org/abs/2412.05888)"



## 📄 Overview

This work proposes a lightweight variant of MedSAM by integrating:

- A **pre-trained Tiny ViT** as the vision backbone
- Two novel prompt types:
  - **Modality Prompt**
  - **Content Prompt**
- A **modified mask decoder** adapted to these prompts

To further improve performance across imaging modalities, we introduce a **modality-aware data sampling strategy** that ensures better balance and generalization.

With these enhancements, our model achieves strong multi-modality segmentation performance and can be trained in approximately **1 day on a single A100 (40GB)** GPU.

<!--
We are currently releasing the inference code along with the model weights. You can download them from [here](https://drive.google.com/drive/folders/1NW4aSNhk-dtiK-dicTAUp0g0eR2fryNi?usp=sharing).

The training code has also been released, so you can train your own model. -->

## Requirements

* Python==3.10.14
* torch==2.0.0
* torchvision==0.15.0
* transformers==4.49.0

## Training and Inference

Training and inference can be done by running train.py and infer.py. Additionally, we also release the model weights for inference, which can be downloaded from [here](https://drive.google.com/drive/folders/1NW4aSNhk-dtiK-dicTAUp0g0eR2fryNi?usp=sharing).
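
As a quick illustration inferred from the inference script in this commit (not an official example), the snippet below prepares one 2D case in the `.npz` layout that `infer.py` expects: the file name encodes the modality as its second underscore-separated token, the image is stored under `imgs` as `(H, W, 3)` uint8, and prompt boxes under `boxes`. Directory names and the checkpoint path are placeholders.

```python
# Hypothetical example: build a single 2D X-ray input case for infer.py.
import os
import numpy as np

os.makedirs("inputs", exist_ok=True)
image = np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8)  # (H, W, 3), values in [0, 255]
boxes = np.array([[100, 120, 300, 340]])  # one [x_min, y_min, x_max, y_max] box per target
np.savez_compressed("inputs/2D_XRay_demo.npz", imgs=image, boxes=boxes)

# Run inference on the folder (paths are placeholders); predictions are saved
# as <case>.npz files containing a `segs` array, plus an efficiency.csv log:
#   python infer.py -i inputs -o outputs \
#       -lite_medsam_checkpoint_path work_dir/medsam_lite.pth \
#       --save_overlay -png_save_dir overlays
```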

## Citation

```bibtex
@article{lyu2024mcp,
  title={MCP-MedSAM: A Powerful Lightweight Medical Segment Anything Model Trained with a Single GPU in Just One Day},
  author={Lyu, Donghang and Gao, Ruochen and Staring, Marius},
  journal={arXiv preprint arXiv:2412.05888},
  year={2024}
}
```
docs/MCP-MedSAM.png
ADDED
Git LFS Details
infer.py
ADDED
@@ -0,0 +1,738 @@
from os import makedirs
from os.path import join, basename
from glob import glob
from tqdm import tqdm
from time import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from models import PromptEncoder, TwoWayTransformer, TinyViT, MaskDecoder_F4
from matplotlib import pyplot as plt
import cv2
import argparse
from collections import OrderedDict
import pandas as pd
from datetime import datetime
from transformers import CLIPModel, CLIPTokenizer

torch.set_float32_matmul_precision('high')
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)

parser = argparse.ArgumentParser()

parser.add_argument(
    '-i',
    '--input_dir',
    type=str,
    default='',
    # required=True,
    help='root directory of the data',
)
parser.add_argument(
    '-o',
    '--output_dir',
    type=str,
    default='',
    help='directory to save the prediction',
)
parser.add_argument(
    '-lite_medsam_checkpoint_path',
    type=str,
    default="",
    help='path to the checkpoint of MedSAM-Lite',
)
parser.add_argument(
    '-device',
    type=str,
    default="cuda:0",
    help='device to run the inference',
)
parser.add_argument(
    '-num_workers',
    type=int,
    default=4,
    help='number of workers for inference with multiprocessing',
)
parser.add_argument(
    '--save_overlay',
    default=False,
    action='store_true',
    help='whether to save the overlay image'
)

parser.add_argument(
    '-png_save_dir',
    type=str,
    default=None,
    help='directory to save the overlay image'
)

args = parser.parse_args()

data_root = args.input_dir
pred_save_dir = args.output_dir
save_overlay = args.save_overlay
num_workers = args.num_workers

if save_overlay:
    assert args.png_save_dir is not None, "Please specify the directory to save the overlay image"
    png_save_dir = args.png_save_dir
    makedirs(png_save_dir, exist_ok=True)

lite_medsam_checkpoint_path = args.lite_medsam_checkpoint_path
makedirs(pred_save_dir, exist_ok=True)
device = torch.device(args.device)
image_size = 256
model1 = CLIPModel.from_pretrained("flaviagiammarino/pubmed-clip-vit-base-patch32", resume_download=True)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16", resume_download=True)
model1.requires_grad_(False)


def resize_longest_side(image, target_length=256):
    """
    Resize image to target_length while keeping the aspect ratio
    Expects a numpy array with shape HxWxC in uint8 format.
    """
    oldh, oldw = image.shape[0], image.shape[1]
    scale = target_length * 1.0 / max(oldh, oldw)
    newh, neww = oldh * scale, oldw * scale
    neww, newh = int(neww + 0.5), int(newh + 0.5)
    target_size = (neww, newh)

    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

def pad_image(image, target_size=256):
    """
    Pad image to target_size
    Expects a numpy array with shape HxWxC in uint8 format.
    """
    # Pad
    h, w = image.shape[0], image.shape[1]
    padh = target_size - h
    padw = target_size - w
    if len(image.shape) == 3: ## Pad image
        image_padded = np.pad(image, ((0, padh), (0, padw), (0, 0)))
    else: ## Pad gt mask
        image_padded = np.pad(image, ((0, padh), (0, padw)))

    return image_padded

class MedSAM_Lite(nn.Module):
    def __init__(
        self,
        image_encoder,
        mask_decoder,
        prompt_encoder
    ):
        super().__init__()
        self.image_encoder = image_encoder
        self.mask_decoder = mask_decoder
        self.prompt_encoder = prompt_encoder

    def forward(self, image, points, boxes, masks, features, crops, text_features, category_idx):
        image_embedding = self.image_encoder(image)
        with torch.no_grad():
            boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device)
            if len(boxes.shape) == 2:
                boxes = boxes[:, None, :] # (B, 1, 4)

        sparse_embeddings, dense_embeddings = self.prompt_encoder(
            points=points,
            boxes=boxes,
            masks=masks,
            features=features,
            crops=crops,
            text_features = text_features,
            category_idx=category_idx
        )
        low_res_masks, iou_predictions, category_predictions, clip_vec, img_vec = self.mask_decoder(
            image_embeddings=image_embedding, # (B, 256, 64, 64)
            image_pe=self.prompt_encoder.get_dense_pe(), # (1, 256, 64, 64)
            sparse_prompt_embeddings=sparse_embeddings, # (B, 2, 256)
            dense_prompt_embeddings=dense_embeddings, # (B, 256, 64, 64)
            multimask_output=False,
        ) # (B, 1, 256, 256)

        return low_res_masks

    @torch.no_grad()
    def postprocess_masks(self, masks, new_size, original_size):
        """
        Do cropping and resizing

        Parameters
        ----------
        masks : torch.Tensor
            masks predicted by the model
        new_size : tuple
            the shape of the image after resizing to the longest side of 256
        original_size : tuple
            the original shape of the image

        Returns
        -------
        torch.Tensor
            the upsampled mask to the original size
        """
        # Crop
        masks = masks[..., :new_size[0], :new_size[1]]
        # Resize
        masks = F.interpolate(
            masks,
            size=(original_size[0], original_size[1]),
            mode="bilinear",
            align_corners=False,
        )

        return masks


def show_mask(mask, ax, mask_color=None, alpha=0.5):
    """
    show mask on the image

    Parameters
    ----------
    mask : numpy.ndarray
        mask of the image
    ax : matplotlib.axes.Axes
        axes to plot the mask
    mask_color : numpy.ndarray
        color of the mask
    alpha : float
        transparency of the mask
    """
    if mask_color is not None:
        color = np.concatenate([mask_color, np.array([alpha])], axis=0)
    else:
        color = np.array([251/255, 252/255, 30/255, alpha])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax, edgecolor='blue'):
    """
    show bounding box on the image

    Parameters
    ----------
    box : numpy.ndarray
        bounding box coordinates in the original image
    ax : matplotlib.axes.Axes
        axes to plot the bounding box
    edgecolor : str
        color of the bounding box
    """
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=edgecolor, facecolor=(0,0,0,0), lw=2))

def show_points(points, ax):
    points = points.numpy()
    for i, (x, y) in enumerate(points):
        ax.scatter(x, y, color='yellow', s=15)

def get_bbox256(mask_256, bbox_shift=3):
    """
    Get the bounding box coordinates from the mask (256x256)

    Parameters
    ----------
    mask_256 : numpy.ndarray
        the mask of the resized image

    bbox_shift : int
        Add perturbation to the bounding box coordinates

    Returns
    -------
    numpy.ndarray
        bounding box coordinates in the resized image
    """
    y_indices, x_indices = np.where(mask_256 > 0)
    x_min, x_max = np.min(x_indices), np.max(x_indices)
    y_min, y_max = np.min(y_indices), np.max(y_indices)
    # add perturbation to bounding box coordinates and test the robustness
    # this can be removed if you do not want to test the robustness
    H, W = mask_256.shape
    x_min = max(0, x_min - bbox_shift)
    x_max = min(W, x_max + bbox_shift)
    y_min = max(0, y_min - bbox_shift)
    y_max = min(H, y_max + bbox_shift)

    bboxes256 = np.array([x_min, y_min, x_max, y_max])

    return bboxes256

def resize_box_to_256(box, original_size):
    """
    the input bounding box is obtained from the original image
    here, we rescale it to the coordinates of the resized image

    Parameters
    ----------
    box : numpy.ndarray
        bounding box coordinates in the original image
    original_size : tuple
        the original size of the image

    Returns
    -------
    numpy.ndarray
        bounding box coordinates in the resized image
    """
    new_box = np.zeros_like(box)
    ratio = 256 / max(original_size)
    for i in range(len(box)):
        new_box[i] = int(box[i] * ratio)

    return new_box, ratio


def get_points_256(box, gt2D):
    gt2D = np.mean(gt2D, axis=-1)
    if len(box)==1:
        x_min, y_min, x_max, y_max = box[0]
    else:
        x_min, y_min, x_max, y_max = box

    try:
        bounder_shiftx = np.random.randint(int((x_max-x_min)/5), int(2*(x_max-x_min)/5), (1,))
        # bounder_shiftx = int((x_max-x_min)/5)
    except:
        bounder_shiftx = 0
    try:
        bounder_shifty = np.random.randint(int((y_max-y_min)/5), int(2*(y_max-y_min)/5), (1,))
        # bounder_shifty = int((y_max-y_min)/5)
    except:
        bounder_shifty = 0

    mid_x = int((x_min+x_max)//2)
    mid_y = int((y_min+y_max)//2)
    x_min = int(x_min+bounder_shiftx)
    x_max = int(x_max-bounder_shiftx)
    y_min = int(y_min+bounder_shifty)
    y_max = int(y_max-bounder_shifty)
    cl = [[y_min, mid_y, x_min, mid_x], [mid_y,y_max,x_min,mid_x], [mid_y,y_max, mid_x,x_max], [y_min,mid_y, mid_x,x_max]]

    coords = []
    for i in range(4):
        gt2D_tmp = np.zeros((256, 256))
        gt2D_tmp[cl[i][0]:cl[i][1], cl[i][2]:cl[i][3]] = gt2D[cl[i][0]:cl[i][1], cl[i][2]:cl[i][3]]
        y_indices, x_indices = np.where(gt2D_tmp > 0)
        if y_indices.size==0:
            coords.append([mid_x, mid_y])
        else:
            x_point = np.random.choice(x_indices)
            y_point = np.random.choice(y_indices)
            coords.append([x_point, y_point])
    coords = np.array(coords).reshape(4, 2)
    coords = torch.tensor(coords).float()
    return coords

def get_points_256_v0(box, gt2D):
    gt2D = np.mean(gt2D, axis=-1)
    if len(box)==1:
        x_min, y_min, x_max, y_max = box[0]
    else:
        x_min, y_min, x_max, y_max = box
    mid_x = int((x_min+x_max)//2)
    mid_y = int((y_min+y_max)//2)
    try:
        bounder_shiftx = np.random.randint(int((x_max-x_min)/3), int(2*(x_max-x_min)/4)-1, (1,))
        # bounder_shiftx = 0
    except:
        bounder_shiftx = 0
    try:
        bounder_shifty = np.random.randint(int((y_max-y_min)/3), int(2*(y_max-y_min)/4)-1, (1,))
        # bounder_shifty = 0
    except:
        bounder_shifty = 0
    x_min = int(x_min+bounder_shiftx)
    x_max = int(x_max-bounder_shiftx)
    y_min = int(y_min+bounder_shifty)
    y_max = int(y_max-bounder_shifty)
    # cl = [[y_min, mid_y, x_min, mid_x], [mid_y,y_max,x_min,mid_x], [mid_y,y_max, mid_x,x_max], [y_min,mid_y, mid_x,x_max]]

    coords = []
    gt2D_tmp = np.zeros((256, 256))
    gt2D_tmp[y_min:y_max, x_min:x_max] = gt2D[y_min:y_max, x_min:x_max]
    for i in range(4):
        y_indices, x_indices = np.where(gt2D_tmp > 0)
        if y_indices.size==0:
            coords.append([mid_x, mid_y])
        else:
            x_point = np.random.choice(x_indices)
            y_point = np.random.choice(y_indices)
            coords.append([x_point, y_point])
    coords = np.array(coords).reshape(4, 2)
    coords = torch.tensor(coords).float()
    return coords

@torch.no_grad()
def medsam_inference(medsam_model, img_embed, box_256, features, crops, text_features, category_idx, new_size, original_size):
    """
    Perform inference using the LiteMedSAM model.

    Args:
        medsam_model (MedSAMModel): The MedSAM model.
        img_embed (torch.Tensor): The image embeddings.
        box_256 (numpy.ndarray): The bounding box coordinates.
        new_size (tuple): The new size of the image.
        original_size (tuple): The original size of the image.
    Returns:
        tuple: A tuple containing the segmented image and the intersection over union (IoU) score.
    """
    box_torch = torch.as_tensor(box_256[None, None, ...], dtype=torch.float, device=img_embed.device)
    features = features.unsqueeze(0).to(device)
    crops = crops.unsqueeze(0).to(device)
    category_idx = torch.tensor([category_idx]).to(device)
    sparse_embeddings, dense_embeddings = medsam_model.prompt_encoder(
        points=None,
        boxes=box_torch,
        masks=None,
        features=features,
        crops=crops,
        text_features = text_features,
        category_idx=category_idx
    )

    low_res_logits, iou, _, _, _ = medsam_model.mask_decoder(
        image_embeddings=img_embed, # (B, 256, 64, 64)
        image_pe=medsam_model.prompt_encoder.get_dense_pe(), # (1, 256, 64, 64)
        sparse_prompt_embeddings=sparse_embeddings, # (B, 2, 256)
        dense_prompt_embeddings=dense_embeddings, # (B, 256, 64, 64)
        multimask_output=False
    )

    low_res_pred = medsam_model.postprocess_masks(low_res_logits, new_size, original_size)
    low_res_pred = torch.sigmoid(low_res_pred)
    low_res_pred = low_res_pred.squeeze().cpu().numpy()
    medsam_seg = (low_res_pred > 0.5).astype(np.uint8)
    return medsam_seg, iou

medsam_lite_image_encoder = TinyViT(
    img_size=256,
    in_chans=3,
    embed_dims=[
        64, ## (64, 256, 256)
        128, ## (128, 128, 128)
        160, ## (160, 64, 64)
        320 ## (320, 64, 64)
    ],
    depths=[2, 2, 6, 2],
    num_heads=[2, 4, 5, 10],
    window_sizes=[7, 7, 14, 7],
    mlp_ratio=4.,
    drop_rate=0.,
    drop_path_rate=0.0,
    use_checkpoint=False,
    mbconv_expand_ratio=4.0,
    local_conv_size=3,
    layer_lr_decay=0.8
)

medsam_lite_prompt_encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),
    input_image_size=(256, 256),
    mask_in_chans=16
)

medsam_lite_mask_decoder = MaskDecoder_F4(
    num_multimask_outputs=3,
    transformer=TwoWayTransformer(
        depth=2,
        embedding_dim=256,
        mlp_dim=2048,
        num_heads=8,
    ),
    modality=True,
    contents=True,
    transformer_dim=256,
    iou_head_depth=3,
    iou_head_hidden_dim=256,
)


medsam_lite_model = MedSAM_Lite(
    image_encoder = medsam_lite_image_encoder,
    mask_decoder = medsam_lite_mask_decoder,
    prompt_encoder = medsam_lite_prompt_encoder
)

lite_medsam_checkpoint = torch.load(lite_medsam_checkpoint_path, map_location='cpu')
medsam_lite_model.load_state_dict(lite_medsam_checkpoint["model"])
medsam_lite_model.to(device)
medsam_lite_model.eval()


def m2_pre_img(image_data, image_size=224):
    transform1 = transforms.Compose([
        transforms.ToTensor(), # normalize to [0.0,1.0]
        transforms.Resize([image_size, image_size], interpolation=transforms.InterpolationMode.BILINEAR, antialias=True)
        ]
    )

    resize_img_torch = transform1(image_data)
    return resize_img_torch

def get_contents(img, box):
    if len(box)==1:
        x_mino, y_mino, x_maxo, y_maxo = box[0]
    else:
        x_mino, y_mino, x_maxo, y_maxo = box
    crops = img[y_mino:y_maxo,x_mino:x_maxo,:]
    crops_128 = m2_pre_img(crops, image_size=64)
    crops_224 = m2_pre_img(crops)
    crops_224 = crops_224.unsqueeze(0)
    with torch.no_grad():
        image_features = model1.get_image_features(crops_224)
    return crops_128, image_features

def get_text_features(modality_text):

    text_token = tokenizer(modality_text, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
    with torch.no_grad():
        text_features = model1.get_text_features(text_token)
    return text_features


def get_category(idx):
    categories_map = {
        "CT": 0,
        "MR": 1,
        "Endoscopy": 2,
        "XRay": 3,
        "X-Ray": 3,
        "PET": 4,
        "Dermoscopy": 5,
        "Mammography": 6,
        "Mammo": 6,
        "US": 7,
        "OCT": 8,
        "Fundus": 9,
        "Microscopy": 10,
        "Microscope": 10
    }
    return categories_map[idx]

def change_name(name):
    if name=="Microscope":
        name = "Microscopy"
    return name

def MedSAM_infer_npz_2D(img_npz_file):
    npz_name = basename(img_npz_file)
    c_name = change_name(npz_name.split('_')[1])
    modality_text = f"{c_name} Image"
    category_idx = get_category(c_name)
    npz_data = np.load(img_npz_file, 'r', allow_pickle=True) # (H, W, 3)
    img_3c = npz_data['imgs'] # (H, W, 3)
    assert np.max(img_3c)<256, f'input data should be in range [0, 255], but got {np.unique(img_3c)}'
    H, W = img_3c.shape[:2]
    boxes = npz_data['boxes']
    segs = np.zeros(img_3c.shape[:2], dtype=np.uint8)
    text_features = get_text_features(modality_text)
    text_features = torch.tensor(text_features).unsqueeze(0).to(device)

    ## preprocessing
    img_256 = resize_longest_side(img_3c, 256)
    newh, neww = img_256.shape[:2]
    img_256_norm = (img_256 - img_256.min()) / np.clip(
        img_256.max() - img_256.min(), a_min=1e-8, a_max=None
    )
    img_256_padded = pad_image(img_256_norm, 256)
    img_256_tensor = torch.tensor(img_256_padded).float().permute(2, 0, 1).unsqueeze(0).to(device)
    with torch.no_grad():
        image_embedding = medsam_lite_model.image_encoder(img_256_tensor)

    for idx, box in enumerate(boxes, start=1):
        crops, features = get_contents(img_3c, box)
        box256, ratio = resize_box_to_256(box, original_size=(H, W))
        box256 = box256[None, ...] # (1, 4)
        medsam_mask, iou_pred = medsam_inference(medsam_lite_model, image_embedding, box256, features, crops, text_features, category_idx, (newh, neww), (H, W))
        segs[medsam_mask>0] = idx%256
        # print(f'{npz_name}, box: {box}, predicted iou: {np.round(iou_pred.item(), 4)}')

    np.savez_compressed(
        join(pred_save_dir, npz_name),
        segs=segs,
    )

    # visualize image, mask and bounding box
    if save_overlay and "Microscope" not in npz_name:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        ax[0].imshow(img_3c)
        ax[1].imshow(img_3c)
        ax[0].set_title("Image")
        ax[1].set_title("LiteMedSAM Segmentation")
        ax[0].axis('off')
        ax[1].axis('off')

        for i, box in enumerate(boxes):
            color = np.random.rand(3)
            box_viz = box
            show_box(box_viz, ax[1], edgecolor=color)
            # show_points(points[i], ax[1])
            show_mask((segs == i+1).astype(np.uint8), ax[1], mask_color=color)

        plt.tight_layout()
        plt.savefig(join(png_save_dir, npz_name.split(".")[0] + '.png'), dpi=300)
        plt.close()


def MedSAM_infer_npz_3D(img_npz_file):
    npz_name = basename(img_npz_file)
    c_name = change_name(npz_name.split('_')[1])
    modality_text = f"{c_name} Image"
    category_idx = get_category(c_name)
    npz_data = np.load(img_npz_file, 'r', allow_pickle=True)
    img_3D = npz_data['imgs'] # (D, H, W)
    # not used in this demo because it treats each slice independently
    # spacing = npz_data['spacing']
    segs = np.zeros_like(img_3D, dtype=np.uint8)
    boxes_3D = npz_data['boxes'] # [[x_min, y_min, z_min, x_max, y_max, z_max]]
    text_features = get_text_features(modality_text)
    text_features = torch.tensor(text_features).unsqueeze(0).to(device)

    for idx, box3D in enumerate(boxes_3D, start=1):
        segs_3d_temp = np.zeros_like(img_3D, dtype=np.uint8)
        x_min, y_min, z_min, x_max, y_max, z_max = box3D
        assert z_min < z_max, f"z_min should be smaller than z_max, but got {z_min=} and {z_max=}"
        mid_slice_bbox_2d = np.array([x_min, y_min, x_max, y_max])
        z_middle = int((z_max - z_min)/2 + z_min)

        # infer from middle slice to the z_max
        # print(npz_name, 'infer from middle slice to the z_max')
        for z in range(z_middle, z_max):
            img_2d = img_3D[z, :, :]
            if len(img_2d.shape) == 2:
                img_3c = np.repeat(img_2d[:, :, None], 3, axis=-1)
            else:
                img_3c = img_2d
            H, W, _ = img_3c.shape

            img_256 = resize_longest_side(img_3c, 256)
            new_H, new_W = img_256.shape[:2]

            img_256 = (img_256 - img_256.min()) / np.clip(
                img_256.max() - img_256.min(), a_min=1e-8, a_max=None
            ) # normalize to [0, 1], (H, W, 3)
            ## Pad image to 256x256
            img_256 = pad_image(img_256)

            # convert the shape to (3, H, W)
            img_256_tensor = torch.tensor(img_256).float().permute(2, 0, 1).unsqueeze(0).to(device)
            # get the image embedding
            with torch.no_grad():
                image_embedding = medsam_lite_model.image_encoder(img_256_tensor) # (1, 256, 64, 64)
            if z == z_middle:
                crops, features = get_contents(img_3c, mid_slice_bbox_2d)
                box_256, _ = resize_box_to_256(mid_slice_bbox_2d, original_size=(H, W))
            else:
                pre_seg = segs_3d_temp[z-1, :, :]
                if np.max(pre_seg) > 0:
                    box_original = get_bbox256(pre_seg)
                    crops, features = get_contents(img_3c, box_original)
                    pre_seg256 = resize_longest_side(pre_seg)
                    pre_seg256 = pad_image(pre_seg256)
                    box_256 = get_bbox256(pre_seg256)
                else:
                    crops, features = get_contents(img_3c, mid_slice_bbox_2d)
                    box_256, _ = resize_box_to_256(mid_slice_bbox_2d, original_size=(H, W))
            img_2d_seg, iou_pred = medsam_inference(medsam_lite_model, image_embedding, box_256, features, crops, text_features, category_idx, [new_H, new_W], [H, W])
            segs_3d_temp[z, img_2d_seg>0] = idx

        # infer from middle slice to the z_max
        # print(npz_name, 'infer from middle slice to the z_min')
        for z in range(z_middle-1, z_min, -1):
            img_2d = img_3D[z, :, :]
            if len(img_2d.shape) == 2:
                img_3c = np.repeat(img_2d[:, :, None], 3, axis=-1)
            else:
                img_3c = img_2d
            H, W, _ = img_3c.shape

            img_256 = resize_longest_side(img_3c)
            new_H, new_W = img_256.shape[:2]

            img_256 = (img_256 - img_256.min()) / np.clip(
                img_256.max() - img_256.min(), a_min=1e-8, a_max=None
            ) # normalize to [0, 1], (H, W, 3)
            ## Pad image to 256x256
            img_256 = pad_image(img_256)

            img_256_tensor = torch.tensor(img_256).float().permute(2, 0, 1).unsqueeze(0).to(device)
            # get the image embedding
            with torch.no_grad():
                image_embedding = medsam_lite_model.image_encoder(img_256_tensor) # (1, 256, 64, 64)

            pre_seg = segs_3d_temp[z+1, :, :]
            # pre_seg = segs[z+1, :, :]
            if np.max(pre_seg) > 0:
                box_original = get_bbox256(pre_seg)
                crops, features = get_contents(img_3c, box_original)
                pre_seg256 = resize_longest_side(pre_seg)
                pre_seg256 = pad_image(pre_seg256)
                box_256 = get_bbox256(pre_seg256)
            else:
                crops, features = get_contents(img_3c, mid_slice_bbox_2d)
                scale_256 = 256 / max(H, W)
                box_256 = mid_slice_bbox_2d * scale_256
            img_2d_seg, iou_pred = medsam_inference(medsam_lite_model, image_embedding, box_256, features, crops, text_features, category_idx, [new_H, new_W], [H, W])
            segs_3d_temp[z, img_2d_seg>0] = idx
        segs[segs_3d_temp>0] = idx
    np.savez_compressed(
        join(pred_save_dir, npz_name),
        segs=segs,
    )

    # visualize image, mask and bounding box
    if save_overlay and "Microscope" not in npz_name:
        idx = int(segs.shape[0] / 2)
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        ax[0].imshow(img_3D[idx], cmap='gray')
        ax[1].imshow(img_3D[idx], cmap='gray')
        ax[0].set_title("Image")
        ax[1].set_title("LiteMedSAM Segmentation")
        ax[0].axis('off')
        ax[1].axis('off')

        for i, box3D in enumerate(boxes_3D, start=1):
            if np.sum(segs[idx]==i) > 0:
                color = np.random.rand(3)
                x_min, y_min, z_min, x_max, y_max, z_max = box3D
                box_viz = np.array([x_min, y_min, x_max, y_max])
                show_box(box_viz, ax[1], edgecolor=color)
                show_mask(segs[idx]==i, ax[1], mask_color=color)

        plt.tight_layout()
        plt.savefig(join(png_save_dir, npz_name.split(".")[0] + '.png'), dpi=300)
        plt.close()


if __name__ == '__main__':

    img_npz_files = sorted(glob(join(data_root, '*.npz'), recursive=True))
    efficiency = OrderedDict()
    efficiency['case'] = []
    efficiency['time'] = []
    for img_npz_file in tqdm(img_npz_files):
        start_time = time()
        if basename(img_npz_file).startswith('3D'):
            MedSAM_infer_npz_3D(img_npz_file)
        else:
            MedSAM_infer_npz_2D(img_npz_file)
        end_time = time()
        efficiency['case'].append(basename(img_npz_file))
        efficiency['time'].append(end_time - start_time)
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # print(current_time, 'file name:', basename(img_npz_file), 'time cost:', np.round(end_time - start_time, 4))
    efficiency_df = pd.DataFrame(efficiency)
    efficiency_df.to_csv(join(pred_save_dir, 'efficiency.csv'), index=False)
modality_npz_dataset.py
ADDED
@@ -0,0 +1,317 @@
import numpy as np
import matplotlib.pyplot as plt
import os
from torchvision import transforms
from torch.utils.data import Dataset
import torch
import cv2
from transformers import CLIPModel, CLIPTokenizer
from os.path import join, exists, isfile, isdir, basename
import random

join = os.path.join
import json


def reshape_MR(img):

    original_shape = img.shape
    sorted_axes = np.argsort(original_shape)
    new_img = img.transpose(sorted_axes)

    return new_img

class ModalityNpzDataset(Dataset):
    def __init__(self,
                 data_root,
                 points=True,
                 contents=True,
                 image_size=256,
                 bbox_shift=5,
                 data_aug=True):

        self.data_root = data_root


        json_data = json.load(open("case_data.json", "r"))
        self.file_paths = json_data

        assert len(self.file_paths) == 11

        self.image_size = image_size
        self.target_length = image_size
        self.bbox_shift = bbox_shift
        self.data_aug = data_aug
        self.points = points
        self.contents = contents

        self.categories_map = {
            "CT": 0,
            "MR": 1,
            "Endoscopy": 2,
            "XRay": 3,
            "X-Ray": 3,
            "PET": 4,
            "Dermoscopy": 5,
            "Mammography": 6,
            "Mammo": 6,
            "US": 7,
            "OCT": 8,
            "Fundus": 9,
            "Microscopy": 10,
            "Microscope": 10
        }

        self.model1 = CLIPModel.from_pretrained("flaviagiammarino/pubmed-clip-vit-base-patch32")
        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
        self.model1.requires_grad_(False)



    def show_box(self, box, ax):
        x0, y0 = box[0], box[1]
        w, h = box[2] - box[0], box[3] - box[1]
        ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='blue', facecolor=(0,0,0,0), lw=2))

    def vis(self, image, bboxes, title):
        _, axs = plt.subplots(1, 2, figsize=(10, 10))

        axs[0].imshow(image, cmap="gray")
        self.show_box(bboxes, axs[0])
        axs[0].axis('off')
        axs[0].set_title(title)

        plt.subplots_adjust(wspace=0.01, hspace=0)
        plt.savefig(
            "test.png",
            bbox_inches='tight',
            dpi=300
        )
        plt.close()

    def vis_crop(self, image, title):

        plt.imshow(np.transpose(image, (1,2,0)))
        plt.axis('off')
        plt.title(title)

        plt.savefig(
            "test.png",
            bbox_inches='tight',
            dpi=300
        )
        plt.close()

    def __getitem__(self, index):
        #! add the random index

        modality_map = [
            "CT",
            "MR",
            "Endoscopy",
            "X-ray",
            "PET",
            "Dermoscopy",
            "Mammography",
            "US",
            "OCT",
            "Fundus",
            "Microscopy"
        ]
        modality_index = random.randint(0, 10)
        index = random.randint(0, len(self.file_paths[modality_map[modality_index]])-1)
        file_path = self.file_paths[modality_map[modality_index]][index][0]
        temp = '/'.join(file_path.split('/')[7:])
        file_path = self.data_root+'/'+temp


        npz = np.load(file_path, 'r', allow_pickle=True)
        img_name = basename(file_path)

        mt = img_name.split("_")[0]
        if mt=="2D" or mt=="3D":
            mt = img_name.split("_")[1]
        category_text = f"{mt} Image"
        category_idx = self.categories_map[mt]
        gts = npz["gts"]
        img = npz["imgs"]

        # special case for MR_totalseg
        if "MR_totalseg" in img_name:
            img = reshape_MR(img)
            gts = reshape_MR(gts)
            if img.shape[1] <=100:
                return self.__getitem__(random.randint(0,len(self)-1))

        if len(gts.shape) > 2: ## 3D image
            i=random.randint(0,gts.shape[0]-1)
            img = img[i, :, :]
            gts = gts[i, :, :]
            img_3c = np.repeat(img[:, :, None], 3, axis=-1) # (H, W, 3)
            img_resized = self.resize_longest_side(img_3c)
        else:
            if len(img.shape) < 3:
                img_3c = np.repeat(img[:, :, None], 3, axis=-1)
            else:
                img_3c = img
            img_resized = self.resize_longest_side(img_3c)
            gts = np.uint16(gts)

        # Resizing
        img_resized = (img_resized - img_resized.min()) / np.clip(img_resized.max() - img_resized.min(), a_min=1e-8, a_max=None) # normalize to [0, 1], (H, W, 3)
        img_padded = self.pad_image(img_resized) #self.pad_image(img_resize) # (256, 256, 3)
        # convert the shape to (3, H, W)
        img_padded = np.transpose(img_padded, (2, 0, 1)) # (3, 256, 256)
        assert np.max(img_padded)<=1.0 and np.min(img_padded)>=0.0, 'image should be normalized to [0, 1]'

        label_ids = np.unique(gts)
        label_ids = label_ids.tolist()

        try:
            label_ids.remove(0)
            label_id = random.choice(label_ids)
            gt2D_original = np.uint8(gts == label_id)
            gt = cv2.resize(
                gt2D_original,
                (img_resized.shape[1], img_resized.shape[0]),
                interpolation=cv2.INTER_NEAREST
            ).astype(np.uint8)
            gt2D = self.pad_image(gt)

        except:
            return self.__getitem__(random.randint(0,len(self)-1))


        box_original = self.get_bbox(gt2D_original)
        x_mino, y_mino, x_maxo, y_maxo = box_original

        if self.data_aug:
            if random.random() > 0.5:
                img_padded = np.ascontiguousarray(np.flip(img_padded, axis=-1))
                gt2D = np.ascontiguousarray(np.flip(gt2D, axis=-1))
            if random.random() > 0.5:
                img_padded = np.ascontiguousarray(np.flip(img_padded, axis=-2))
                gt2D = np.ascontiguousarray(np.flip(gt2D, axis=-2))

        try:
            gt2D = np.uint8(gt2D > 0)
            y_indices, x_indices = np.where(gt2D > 0)
            x_min, x_max = np.min(x_indices), np.max(x_indices)
            y_min, y_max = np.min(y_indices), np.max(y_indices)
            H, W = gt2D.shape
            x_min = max(0, x_min - random.randint(0, self.bbox_shift))
            x_max = min(W, x_max + random.randint(0, self.bbox_shift))
            y_min = max(0, y_min - random.randint(0, self.bbox_shift))
            y_max = min(H, y_max + random.randint(0, self.bbox_shift))
            bboxes = np.array([x_min, y_min, x_max, y_max])
        except:
            return self.__getitem__(random.randint(0,len(self)-1))

        if self.points:
            mid_x = (x_min+x_max)//2
            mid_y = (y_min+y_max)//2
            cl = [[y_min, mid_y, x_min, mid_x], [mid_y,y_max,x_min,mid_x], [mid_y,y_max, mid_x,x_max], [y_min,mid_y, mid_x,x_max]]
            coords = []
            for i in range(4):
                gt2D_tmp = np.zeros((H, W))
                gt2D_tmp[cl[i][0]:cl[i][1], cl[i][2]:cl[i][3]] = gt2D[cl[i][0]:cl[i][1], cl[i][2]:cl[i][3]]
                y_indices, x_indices = np.where(gt2D_tmp > 0)
                if y_indices.size==0:
                    coords.append([mid_x, mid_y])
                else:
                    x_point = np.random.choice(x_indices)
                    y_point = np.random.choice(y_indices)
                    coords.append([x_point, y_point])
            coords = np.array(coords).reshape(4, 2)
            coords = torch.tensor(coords).float()
        else:
            coords = None

        if self.contents:
            try:
                crops = img_3c[y_mino:y_maxo,x_mino:x_maxo,:]
                crops_64 = self.m2_pre_img(crops, image_size=64) # change here for the size of cropped part
                crops_224 = self.m2_pre_img(crops)
            except:
                crops_64 = torch.zeros((3, 64, 64))
                crops_224 = torch.zeros((3, 224, 224))
            crops_224 = crops_224.unsqueeze(0)
            text_token = self.tokenizer(category_text, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
            with torch.no_grad():
                image_features = self.model1.get_image_features(crops_224)
                text_features = self.model1.get_text_features(text_token)
        else:
            crops_64 = None
            image_features = None
            text_features = None


        return {
            "image": torch.tensor(img_padded).float(),
            "gt2D": torch.tensor(gt2D[None, :,:]).long(),
            "coords": coords,
            "bboxes": torch.tensor(bboxes[None, None, ...]).float(),
            "image_crop": crops_64.float(),
            "image_feature": image_features.float(),
            "text_feature": text_features.float(),
            "category_idx": category_idx,
            "image_name": img_name,
            "new_size": torch.tensor(np.array([img_padded.shape[0], img_padded.shape[1]])).long(),
            "original_size": torch.tensor(np.array([img_3c.shape[0], img_3c.shape[1]])).long()
        }

    def __len__(self):
        return 108714

    def get_bbox(self, mask_256, bbox_shift=5):
        y_indices, x_indices = np.where(mask_256 > 0)
        x_min, x_max = np.min(x_indices), np.max(x_indices)
        y_min, y_max = np.min(y_indices), np.max(y_indices)
        H, W = mask_256.shape
        x_min = max(0, x_min - random.randint(0, bbox_shift))
        x_max = min(W, x_max + random.randint(0, bbox_shift))
        y_min = max(0, y_min - random.randint(0, bbox_shift))
        y_max = min(H, y_max + random.randint(0, bbox_shift))

        bboxes256 = np.array([x_min, y_min, x_max, y_max])

        return bboxes256

    def m2_pre_img(self, image_data, image_size=224):
        transform1 = transforms.Compose([
            transforms.ToTensor(), # normalize to [0.0,1.0]
            transforms.Resize([image_size, image_size], interpolation=transforms.InterpolationMode.BILINEAR, antialias=True)
            ]
        )

        resize_img_torch = transform1(image_data)
        return resize_img_torch

    def resize_longest_side(self, image):
        """
        Expects a numpy array with shape HxWxC in uint8 format.
        """
        long_side_length = self.target_length
        oldh, oldw = image.shape[0], image.shape[1]
        scale = long_side_length * 1.0 / max(oldh, oldw)
        newh, neww = oldh * scale, oldw * scale
        neww, newh = int(neww + 0.5), int(newh + 0.5)
        target_size = (neww, newh)

        return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

    def pad_image(self, image):
        """
        Expects a numpy array with shape HxWxC in uint8 format.
        """
        # Pad
        h, w = image.shape[0], image.shape[1]
        padh = self.image_size - h
        padw = self.image_size - w
        if len(image.shape) == 3: ## Pad image
            image_padded = np.pad(image, ((0, padh), (0, padw), (0, 0)))
        else: ## Pad gt mask
            image_padded = np.pad(image, ((0, padh), (0, padw)))

        return image_padded
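
The dataset class above builds its per-modality index from a `case_data.json` file whose layout is not shown in this commit; the sketch below is a hypothetical reconstruction inferred from how `__getitem__` reads it, with placeholder paths and one demo case per modality. Each referenced `.npz` file is expected to contain `imgs` and `gts` arrays.

```python
# Hypothetical sketch of case_data.json as read by ModalityNpzDataset.
# Eleven keys (one per entry of modality_map); each value is a list of cases,
# and each case is a list whose first element is the stored .npz path.
# Inside __getitem__ the first seven path components are stripped and the rest
# is re-rooted onto data_root; the file name must follow "<2D|3D>_<Modality>_..."
# with the modality spelled as in categories_map.
import json

modalities = ["CT", "MR", "Endoscopy", "X-ray", "PET", "Dermoscopy",
              "Mammography", "US", "OCT", "Fundus", "Microscopy"]
prefix = "/some/absolute/prefix/kept/by/preprocessing"  # placeholder, 7 components
fname_token = {"X-ray": "XRay"}  # file-name spelling differs from the json key here

case_data = {
    m: [[f"{prefix}/{m}/2D_{fname_token.get(m, m)}_demo_case.npz"]]
    for m in modalities
}
with open("case_data.json", "w") as f:
    json.dump(case_data, f, indent=2)
```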
models/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .mask_decoder import MaskDecoder, MaskDecoder_F4
from .prompt_encoder import PromptEncoder
from .transformer import TwoWayTransformer
from .tiny_vit import TinyViT
models/common.py
ADDED
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn

from typing import Type


class MLPBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.lin2(self.act(self.lin1(x)))


# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
models/lite_medsam.py
ADDED
@@ -0,0 +1,54 @@
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
from .mask_decoder import MaskDecoder
|
| 6 |
+
from .prompt_encoder import PromptEncoder
|
| 7 |
+
from .transform import TwoWayTransformer
|
| 8 |
+
|
| 9 |
+
class MedSAM_Lite(nn.Module):
|
| 10 |
+
def __init__(self,
|
| 11 |
+
image_encoder,
|
| 12 |
+
mask_decoder,
|
| 13 |
+
prompt_encoder
|
| 14 |
+
):
|
| 15 |
+
super().__init__()
|
| 16 |
+
self.image_encoder = image_encoder
|
| 17 |
+
self.mask_decoder = mask_decoder
|
| 18 |
+
self.prompt_encoder = prompt_encoder
|
| 19 |
+
|
| 20 |
+
def forward(self, image, boxes):
|
| 21 |
+
image_embedding = self.image_encoder(image) # (B, 256, 64, 64)
|
| 22 |
+
|
| 23 |
+
sparse_embeddings, dense_embeddings = self.prompt_encoder(
|
| 24 |
+
points=None,
|
| 25 |
+
boxes=boxes,
|
| 26 |
+
masks=None,
|
| 27 |
+
)
|
| 28 |
+
low_res_masks, iou_predictions = self.mask_decoder(
|
| 29 |
+
image_embeddings=image_embedding, # (B, 256, 64, 64)
|
| 30 |
+
image_pe=self.prompt_encoder.get_dense_pe(), # (1, 256, 64, 64)
|
| 31 |
+
sparse_prompt_embeddings=sparse_embeddings, # (B, 2, 256)
|
| 32 |
+
dense_prompt_embeddings=dense_embeddings, # (B, 256, 64, 64)
|
| 33 |
+
multimask_output=False,
|
| 34 |
+
) # (B, 1, 256, 256)
|
| 35 |
+
|
| 36 |
+
return low_res_masks, iou_predictions
|
| 37 |
+
|
| 38 |
+
@torch.no_grad()
|
| 39 |
+
def postprocess_masks(self, masks, new_size, original_size):
|
| 40 |
+
"""
|
| 41 |
+
Do cropping and resizing
|
| 42 |
+
"""
|
| 43 |
+
# Crop
|
| 44 |
+
masks = masks[:, :, :new_size[0], :new_size[1]]
|
| 45 |
+
# Resize
|
| 46 |
+
masks = F.interpolate(
|
| 47 |
+
masks,
|
| 48 |
+
size=(original_size[0], original_size[1]),
|
| 49 |
+
mode="bilinear",
|
| 50 |
+
align_corners=False,
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
return masks
|
| 54 |
+
|
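postprocess_masks undoes the resize-and-pad preprocessing: it first crops the prediction back to the un-padded size, then resizes it to the raw image resolution. A minimal sketch with dummy sub-modules (postprocess_masks does not touch them; all sizes below are assumptions for illustration):

```python
import torch
import torch.nn as nn
from models.lite_medsam import MedSAM_Lite

# Dummy encoder/decoder stand-ins: only postprocess_masks is exercised here.
model = MedSAM_Lite(image_encoder=nn.Identity(),
                    mask_decoder=nn.Identity(),
                    prompt_encoder=nn.Identity())

low_res_masks = torch.randn(1, 1, 256, 256)   # decoder output on the padded 256x256 grid
new_size = (256, 212)                          # H, W after resizing the raw image (before padding)
original_size = (512, 424)                     # H, W of the raw image
masks = model.postprocess_masks(low_res_masks, new_size, original_size)
print(masks.shape)                             # torch.Size([1, 1, 512, 424])
```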
models/mask_decoder.py
ADDED
|
@@ -0,0 +1,465 @@
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
|
| 5 |
+
# This source code is licensed under the license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
from torch import nn
|
| 10 |
+
from torch.nn import functional as F
|
| 11 |
+
|
| 12 |
+
from typing import List, Tuple, Type
|
| 13 |
+
|
| 14 |
+
from .common import LayerNorm2d
|
| 15 |
+
from .transformer import TwoWayTransformer
|
| 16 |
+
|
| 17 |
+
class Classifier(nn.Module):
|
| 18 |
+
def __init__(self, in_dim, hid_dim=None, out_dim=None, act=nn.GELU, drop=0.):
|
| 19 |
+
super().__init__()
|
| 20 |
+
out_dim = out_dim or in_dim
|
| 21 |
+
hid_dim = hid_dim or in_dim
|
| 22 |
+
self.fc1 = nn.Linear(in_dim, hid_dim)
|
| 23 |
+
self.act = act()
|
| 24 |
+
self.fc2 = nn.Linear(hid_dim, out_dim)
|
| 25 |
+
self.drop = nn.Dropout(drop)
|
| 26 |
+
|
| 27 |
+
def forward(self, x):
|
| 28 |
+
x = self.fc1(x)
|
| 29 |
+
x = self.act(x)
|
| 30 |
+
x = self.drop(x)
|
| 31 |
+
x = self.fc2(x)
|
| 32 |
+
return x
|
| 33 |
+
|
| 34 |
+
class Block(nn.Module):
|
| 35 |
+
def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
|
| 36 |
+
super(Block, self).__init__()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
|
| 40 |
+
self.batch_norm1 = nn.BatchNorm2d(out_channels)
|
| 41 |
+
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
|
| 42 |
+
|
| 43 |
+
self.i_downsample = i_downsample
|
| 44 |
+
self.stride = stride
|
| 45 |
+
self.relu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
|
| 46 |
+
|
| 47 |
+
def forward(self, x):
|
| 48 |
+
identity = x.clone()
|
| 49 |
+
|
| 50 |
+
x = self.relu(self.batch_norm1(self.conv1(x)))
|
| 51 |
+
x = self.conv2(x)
|
| 52 |
+
|
| 53 |
+
if self.i_downsample is not None:
|
| 54 |
+
identity = self.i_downsample(identity)
|
| 55 |
+
|
| 56 |
+
x += identity
|
| 57 |
+
return x
|
| 58 |
+
|
| 59 |
+
class MaskDecoder(nn.Module):
|
| 60 |
+
def __init__(
|
| 61 |
+
self,
|
| 62 |
+
*,
|
| 63 |
+
transformer_dim: int,
|
| 64 |
+
transformer: nn.Module,
|
| 65 |
+
modality,
|
| 66 |
+
contents,
|
| 67 |
+
num_multimask_outputs: int = 3,
|
| 68 |
+
activation: Type[nn.Module] = nn.GELU,
|
| 69 |
+
iou_head_depth: int = 3,
|
| 70 |
+
iou_head_hidden_dim: int = 256,
|
| 71 |
+
category_num = 11
|
| 72 |
+
) -> None:
|
| 73 |
+
"""
|
| 74 |
+
Predicts masks given an image and prompt embeddings, using a
|
| 75 |
+
transformer architecture.
|
| 76 |
+
|
| 77 |
+
Arguments:
|
| 78 |
+
transformer_dim (int): the channel dimension of the transformer
|
| 79 |
+
transformer (nn.Module): the transformer used to predict masks
|
| 80 |
+
num_multimask_outputs (int): the number of masks to predict
|
| 81 |
+
when disambiguating masks
|
| 82 |
+
activation (nn.Module): the type of activation to use when
|
| 83 |
+
upscaling masks
|
| 84 |
+
iou_head_depth (int): the depth of the MLP used to predict
|
| 85 |
+
mask quality
|
| 86 |
+
iou_head_hidden_dim (int): the hidden dimension of the MLP
|
| 87 |
+
used to predict mask quality
|
| 88 |
+
"""
|
| 89 |
+
super().__init__()
|
| 90 |
+
self.transformer_dim = transformer_dim
|
| 91 |
+
self.transformer = transformer
|
| 92 |
+
self.category_num = category_num
|
| 93 |
+
self.modality = modality
|
| 94 |
+
self.contents = contents
|
| 95 |
+
|
| 96 |
+
self.num_multimask_outputs = num_multimask_outputs
|
| 97 |
+
|
| 98 |
+
self.iou_token = nn.Embedding(1, transformer_dim)
|
| 99 |
+
self.num_mask_tokens = num_multimask_outputs + 1
|
| 100 |
+
self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
|
| 101 |
+
|
| 102 |
+
self.convs = Block(transformer_dim, transformer_dim)
|
| 103 |
+
self.w_lin = nn.Linear(transformer_dim, transformer_dim)
|
| 104 |
+
self.b_lin = nn.Linear(transformer_dim, transformer_dim)
|
| 105 |
+
|
| 106 |
+
self.output_upscaling = nn.Sequential(
|
| 107 |
+
nn.ConvTranspose2d(
|
| 108 |
+
transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
|
| 109 |
+
),
|
| 110 |
+
LayerNorm2d(transformer_dim // 4),
|
| 111 |
+
activation(),
|
| 112 |
+
nn.ConvTranspose2d(
|
| 113 |
+
transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
|
| 114 |
+
),
|
| 115 |
+
activation(),
|
| 116 |
+
)
|
| 117 |
+
self.output_hypernetworks_mlps = nn.ModuleList(
|
| 118 |
+
[
|
| 119 |
+
MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
|
| 120 |
+
for i in range(self.num_mask_tokens)
|
| 121 |
+
]
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self.iou_prediction_head = MLP(
|
| 125 |
+
transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
self.category_prediction_head = Classifier(
|
| 129 |
+
transformer_dim, transformer_dim//4, category_num
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
def forward(
|
| 133 |
+
self,
|
| 134 |
+
image_embeddings: torch.Tensor,
|
| 135 |
+
image_pe: torch.Tensor,
|
| 136 |
+
sparse_prompt_embeddings: torch.Tensor,
|
| 137 |
+
dense_prompt_embeddings: torch.Tensor,
|
| 138 |
+
multimask_output: bool,
|
| 139 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 140 |
+
"""
|
| 141 |
+
Predict masks given image and prompt embeddings.
|
| 142 |
+
|
| 143 |
+
Arguments:
|
| 144 |
+
image_embeddings (torch.Tensor): the embeddings from the image encoder
|
| 145 |
+
image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
|
| 146 |
+
sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
|
| 147 |
+
dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
|
| 148 |
+
multimask_output (bool): Whether to return multiple masks or a single
|
| 149 |
+
mask.
|
| 150 |
+
|
| 151 |
+
Returns:
|
| 152 |
+
torch.Tensor: batched predicted masks
|
| 153 |
+
torch.Tensor: batched predictions of mask quality
|
| 154 |
+
"""
|
| 155 |
+
masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out = self.predict_masks(
|
| 156 |
+
image_embeddings=image_embeddings,
|
| 157 |
+
image_pe=image_pe,
|
| 158 |
+
sparse_prompt_embeddings=sparse_prompt_embeddings,
|
| 159 |
+
dense_prompt_embeddings=dense_prompt_embeddings,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
# Select the correct mask or masks for output
|
| 163 |
+
if multimask_output:
|
| 164 |
+
mask_slice = slice(1, None)
|
| 165 |
+
else:
|
| 166 |
+
mask_slice = slice(0, 1)
|
| 167 |
+
masks = masks[:, mask_slice, :, :]
|
| 168 |
+
iou_pred = iou_pred[:, mask_slice]
|
| 169 |
+
|
| 170 |
+
# Prepare output
|
| 171 |
+
return masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out
|
| 172 |
+
|
| 173 |
+
def predict_masks(
|
| 174 |
+
self,
|
| 175 |
+
image_embeddings: torch.Tensor,
|
| 176 |
+
image_pe: torch.Tensor,
|
| 177 |
+
sparse_prompt_embeddings: torch.Tensor,
|
| 178 |
+
dense_prompt_embeddings: torch.Tensor,
|
| 179 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 180 |
+
"""Predicts masks. See 'forward' for more details."""
|
| 181 |
+
# Concatenate output tokens
|
| 182 |
+
output_tokens = torch.cat(
|
| 183 |
+
[self.iou_token.weight, self.mask_tokens.weight], dim=0
|
| 184 |
+
)
|
| 185 |
+
output_tokens = output_tokens.unsqueeze(0).expand(
|
| 186 |
+
sparse_prompt_embeddings.size(0), -1, -1
|
| 187 |
+
)
|
| 188 |
+
tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
|
| 189 |
+
|
| 190 |
+
# Expand per-image data in batch direction to be per-mask
|
| 191 |
+
if image_embeddings.shape[0] != tokens.shape[0]:
|
| 192 |
+
src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
|
| 193 |
+
else:
|
| 194 |
+
src = image_embeddings
|
| 195 |
+
src = src + dense_prompt_embeddings
|
| 196 |
+
pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
|
| 197 |
+
b, c, h, w = src.shape
|
| 198 |
+
|
| 199 |
+
# Run the transformer
|
| 200 |
+
hs, src = self.transformer(src, pos_src, tokens)
|
| 201 |
+
iou_token_out = hs[:, 0, :]
|
| 202 |
+
mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
|
| 203 |
+
|
| 204 |
+
# Upscale mask embeddings and predict masks using the mask tokens
|
| 205 |
+
src = src.transpose(1, 2).view(b, c, h, w)
|
| 206 |
+
if self.contents:
|
| 207 |
+
clip_tokens_out = tokens[:,-2,:]
|
| 208 |
+
image_tokens_out = F.adaptive_avg_pool2d(dense_prompt_embeddings, output_size=(1, 1)).squeeze(-1).squeeze(-1)
|
| 209 |
+
clip_new_out = hs[:,-2,:].unsqueeze(-1).unsqueeze(-1)
|
| 210 |
+
src = dense_prompt_embeddings+src+clip_new_out
|
| 211 |
+
src = self.convs(src)
|
| 212 |
+
else:
|
| 213 |
+
clip_tokens_out = None
|
| 214 |
+
image_tokens_out = None
|
| 215 |
+
|
| 216 |
+
if self.modality:
|
| 217 |
+
category_tokens_out = hs[:,-1,:]
|
| 218 |
+
wc = self.w_lin(category_tokens_out).unsqueeze(-1).unsqueeze(-1)
|
| 219 |
+
bc = self.b_lin(category_tokens_out).unsqueeze(-1).unsqueeze(-1)
|
| 220 |
+
src = wc*src+bc+src
|
| 221 |
+
category_pred = self.category_prediction_head(category_tokens_out)
|
| 222 |
+
else:
|
| 223 |
+
category_pred = None
|
| 224 |
+
|
| 225 |
+
upscaled_embedding = self.output_upscaling(src)
|
| 226 |
+
hyper_in_list: List[torch.Tensor] = []
|
| 227 |
+
for i in range(self.num_mask_tokens):
|
| 228 |
+
hyper_in_list.append(
|
| 229 |
+
self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
|
| 230 |
+
)
|
| 231 |
+
hyper_in = torch.stack(hyper_in_list, dim=1)
|
| 232 |
+
b, c, h, w = upscaled_embedding.shape
|
| 233 |
+
masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
|
| 234 |
+
|
| 235 |
+
# Generate mask quality predictions
|
| 236 |
+
iou_pred = self.iou_prediction_head(iou_token_out)
|
| 237 |
+
|
| 238 |
+
return masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out
|
| 239 |
+
|
| 240 |
+
# Lightly adapted from
|
| 241 |
+
# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
|
| 242 |
+
class MLP(nn.Module):
|
| 243 |
+
def __init__(
|
| 244 |
+
self,
|
| 245 |
+
input_dim: int,
|
| 246 |
+
hidden_dim: int,
|
| 247 |
+
output_dim: int,
|
| 248 |
+
num_layers: int,
|
| 249 |
+
sigmoid_output: bool = False,
|
| 250 |
+
) -> None:
|
| 251 |
+
super().__init__()
|
| 252 |
+
self.num_layers = num_layers
|
| 253 |
+
h = [hidden_dim] * (num_layers - 1)
|
| 254 |
+
self.layers = nn.ModuleList(
|
| 255 |
+
nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
|
| 256 |
+
)
|
| 257 |
+
self.sigmoid_output = sigmoid_output
|
| 258 |
+
|
| 259 |
+
def forward(self, x):
|
| 260 |
+
for i, layer in enumerate(self.layers):
|
| 261 |
+
x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
|
| 262 |
+
if self.sigmoid_output:
|
| 263 |
+
x = F.sigmoid(x)
|
| 264 |
+
return x
|
| 265 |
+
|
| 266 |
+
class MaskDecoder_F4(nn.Module):
|
| 267 |
+
def __init__(
|
| 268 |
+
self,
|
| 269 |
+
*,
|
| 270 |
+
transformer_dim: int,
|
| 271 |
+
transformer: nn.Module,
|
| 272 |
+
modality,
|
| 273 |
+
contents,
|
| 274 |
+
num_multimask_outputs: int = 3,
|
| 275 |
+
activation: Type[nn.Module] = nn.GELU,
|
| 276 |
+
iou_head_depth: int = 3,
|
| 277 |
+
iou_head_hidden_dim: int = 256,
|
| 278 |
+
category_num = 11
|
| 279 |
+
) -> None:
|
| 280 |
+
"""
|
| 281 |
+
Predicts masks given an image and prompt embeddings, using a
|
| 282 |
+
transformer architecture.
|
| 283 |
+
|
| 284 |
+
Arguments:
|
| 285 |
+
transformer_dim (int): the channel dimension of the transformer
|
| 286 |
+
transformer (nn.Module): the transformer used to predict masks
|
| 287 |
+
num_multimask_outputs (int): the number of masks to predict
|
| 288 |
+
when disambiguating masks
|
| 289 |
+
activation (nn.Module): the type of activation to use when
|
| 290 |
+
upscaling masks
|
| 291 |
+
iou_head_depth (int): the depth of the MLP used to predict
|
| 292 |
+
mask quality
|
| 293 |
+
iou_head_hidden_dim (int): the hidden dimension of the MLP
|
| 294 |
+
used to predict mask quality
|
| 295 |
+
"""
|
| 296 |
+
super().__init__()
|
| 297 |
+
self.transformer_dim = transformer_dim
|
| 298 |
+
self.transformer = transformer
|
| 299 |
+
self.category_num = category_num
|
| 300 |
+
self.modality = modality
|
| 301 |
+
self.contents = contents
|
| 302 |
+
|
| 303 |
+
self.num_multimask_outputs = num_multimask_outputs
|
| 304 |
+
|
| 305 |
+
self.iou_token = nn.Embedding(1, transformer_dim)
|
| 306 |
+
self.num_mask_tokens = num_multimask_outputs + 1
|
| 307 |
+
self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
|
| 308 |
+
|
| 309 |
+
self.convs = Block(transformer_dim, transformer_dim)
|
| 310 |
+
self.conv1 = nn.Conv2d(transformer_dim*2, transformer_dim, 1)
|
| 311 |
+
self.c_conv = Block(transformer_dim, transformer_dim)
|
| 312 |
+
self.w_lin = nn.Linear(transformer_dim, transformer_dim)
|
| 313 |
+
self.b_lin = nn.Linear(transformer_dim, transformer_dim)
|
| 314 |
+
self.m_conv = Block(transformer_dim, transformer_dim)
|
| 315 |
+
|
| 316 |
+
self.output_upscaling = nn.Sequential(
|
| 317 |
+
nn.ConvTranspose2d(
|
| 318 |
+
transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
|
| 319 |
+
),
|
| 320 |
+
LayerNorm2d(transformer_dim // 4),
|
| 321 |
+
activation(),
|
| 322 |
+
nn.ConvTranspose2d(
|
| 323 |
+
transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
|
| 324 |
+
),
|
| 325 |
+
activation(),
|
| 326 |
+
)
|
| 327 |
+
self.output_hypernetworks_mlps = nn.ModuleList(
|
| 328 |
+
[
|
| 329 |
+
MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
|
| 330 |
+
for i in range(self.num_mask_tokens)
|
| 331 |
+
]
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
+
self.iou_prediction_head = MLP(
|
| 335 |
+
transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
# self.category_prediction_head = Classifier(
|
| 339 |
+
# transformer_dim, transformer_dim//4, category_num
|
| 340 |
+
# )
|
| 341 |
+
self.category_prediction_head = Classifier(
|
| 342 |
+
transformer_dim, transformer_dim//4, category_num
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
+
def forward(
|
| 346 |
+
self,
|
| 347 |
+
image_embeddings: torch.Tensor,
|
| 348 |
+
image_pe: torch.Tensor,
|
| 349 |
+
sparse_prompt_embeddings: torch.Tensor,
|
| 350 |
+
dense_prompt_embeddings: torch.Tensor,
|
| 351 |
+
multimask_output: bool,
|
| 352 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 353 |
+
"""
|
| 354 |
+
Predict masks given image and prompt embeddings.
|
| 355 |
+
|
| 356 |
+
Arguments:
|
| 357 |
+
image_embeddings (torch.Tensor): the embeddings from the image encoder
|
| 358 |
+
image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
|
| 359 |
+
sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
|
| 360 |
+
dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
|
| 361 |
+
multimask_output (bool): Whether to return multiple masks or a single
|
| 362 |
+
mask.
|
| 363 |
+
|
| 364 |
+
Returns:
|
| 365 |
+
torch.Tensor: batched predicted masks
|
| 366 |
+
torch.Tensor: batched predictions of mask quality
|
| 367 |
+
"""
|
| 368 |
+
masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out = self.predict_masks(
|
| 369 |
+
image_embeddings=image_embeddings,
|
| 370 |
+
image_pe=image_pe,
|
| 371 |
+
sparse_prompt_embeddings=sparse_prompt_embeddings,
|
| 372 |
+
dense_prompt_embeddings=dense_prompt_embeddings,
|
| 373 |
+
)
|
| 374 |
+
|
| 375 |
+
# Select the correct mask or masks for output
|
| 376 |
+
if multimask_output:
|
| 377 |
+
mask_slice = slice(1, None)
|
| 378 |
+
else:
|
| 379 |
+
mask_slice = slice(0, 1)
|
| 380 |
+
masks = masks[:, mask_slice, :, :]
|
| 381 |
+
iou_pred = iou_pred[:, mask_slice]
|
| 382 |
+
|
| 383 |
+
# Prepare output
|
| 384 |
+
return masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out
|
| 385 |
+
|
| 386 |
+
def predict_masks(
|
| 387 |
+
self,
|
| 388 |
+
image_embeddings: torch.Tensor,
|
| 389 |
+
image_pe: torch.Tensor,
|
| 390 |
+
sparse_prompt_embeddings: torch.Tensor,
|
| 391 |
+
dense_prompt_embeddings: torch.Tensor,
|
| 392 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 393 |
+
"""Predicts masks. See 'forward' for more details."""
|
| 394 |
+
# Concatenate output tokens
|
| 395 |
+
output_tokens = torch.cat(
|
| 396 |
+
[self.iou_token.weight, self.mask_tokens.weight], dim=0
|
| 397 |
+
)
|
| 398 |
+
output_tokens = output_tokens.unsqueeze(0).expand(
|
| 399 |
+
sparse_prompt_embeddings.size(0), -1, -1
|
| 400 |
+
)
|
| 401 |
+
tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
|
| 402 |
+
m_token = tokens[:,-1,:]
|
| 403 |
+
|
| 404 |
+
# Expand per-image data in batch direction to be per-mask
|
| 405 |
+
if image_embeddings.shape[0] != tokens.shape[0]:
|
| 406 |
+
src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
|
| 407 |
+
else:
|
| 408 |
+
src = image_embeddings
|
| 409 |
+
src = src + dense_prompt_embeddings
|
| 410 |
+
pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
|
| 411 |
+
b, c, h, w = src.shape
|
| 412 |
+
|
| 413 |
+
# Run the transformer
|
| 414 |
+
hs, src = self.transformer(src, pos_src, tokens)
|
| 415 |
+
iou_token_out = hs[:, 0, :]
|
| 416 |
+
mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
|
| 417 |
+
|
| 418 |
+
# Upscale mask embeddings and predict masks using the mask tokens
|
| 419 |
+
src = src.transpose(1, 2).view(b, c, h, w)
|
| 420 |
+
|
| 421 |
+
if self.modality:
|
| 422 |
+
category_tokens_out = hs[:,-1,:]
|
| 423 |
+
wc = self.w_lin(category_tokens_out).unsqueeze(-1).unsqueeze(-1)
|
| 424 |
+
bc = self.b_lin(category_tokens_out).unsqueeze(-1).unsqueeze(-1)
|
| 425 |
+
src_m = wc*src+bc+src
|
| 426 |
+
m_info = wc.squeeze(-1).squeeze(-1)+bc.squeeze(-1).squeeze(-1)+category_tokens_out
|
| 427 |
+
category_pred = self.category_prediction_head(m_info)
|
| 428 |
+
src_m = self.m_conv(src_m)
|
| 429 |
+
else:
|
| 430 |
+
category_pred = None
|
| 431 |
+
|
| 432 |
+
if self.contents:
|
| 433 |
+
clip_tokens_out = tokens[:,-2,:]
|
| 434 |
+
image_tokens_out = F.adaptive_avg_pool2d(dense_prompt_embeddings, output_size=(1, 1)).squeeze(-1).squeeze(-1)
|
| 435 |
+
clip_new_out = hs[:,-2,:].unsqueeze(-1).unsqueeze(-1)
|
| 436 |
+
src_vp = dense_prompt_embeddings+src+clip_new_out
|
| 437 |
+
src_vp = self.convs(src_vp)
|
| 438 |
+
else:
|
| 439 |
+
clip_tokens_out = None
|
| 440 |
+
image_tokens_out = None
|
| 441 |
+
|
| 442 |
+
if self.contents and self.modality:
|
| 443 |
+
src = torch.cat((src_m, src_vp), dim=1)
|
| 444 |
+
src = self.conv1(src)
|
| 445 |
+
src = self.c_conv(src)
|
| 446 |
+
elif self.contents:
|
| 447 |
+
src = src_vp
|
| 448 |
+
elif self.modality:
|
| 449 |
+
src = src_m
|
| 450 |
+
|
| 451 |
+
upscaled_embedding = self.output_upscaling(src)
|
| 452 |
+
hyper_in_list: List[torch.Tensor] = []
|
| 453 |
+
for i in range(self.num_mask_tokens):
|
| 454 |
+
hyper_in_list.append(
|
| 455 |
+
self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
|
| 456 |
+
)
|
| 457 |
+
hyper_in = torch.stack(hyper_in_list, dim=1)
|
| 458 |
+
b, c, h, w = upscaled_embedding.shape
|
| 459 |
+
masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
|
| 460 |
+
|
| 461 |
+
# Generate mask quality predictions
|
| 462 |
+
iou_pred = self.iou_prediction_head(iou_token_out)
|
| 463 |
+
|
| 464 |
+
return masks, iou_pred, category_pred, clip_tokens_out, image_tokens_out
|
| 465 |
+
|
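The key difference from SAM's decoder is that MaskDecoder_F4 conditions the image features on the two extra prompt tokens: the modality token is turned into a per-channel scale and bias (w_lin / b_lin), and the dense content prompt plus the content token are fused back into the feature map before upscaling, with an auxiliary modality classification head on the side. Below is a minimal sketch with toy tensors; it assumes the SAM-style TwoWayTransformer constructor signature and that the last two sparse prompt tokens are the content token and the modality token, matching how predict_masks indexes hs[:, -2, :] and hs[:, -1, :]:

```python
import torch
from models import MaskDecoder_F4, TwoWayTransformer

# Minimal sketch, not the exact training setup: toy random tensors stand in for real prompts.
decoder = MaskDecoder_F4(
    transformer_dim=256,
    transformer=TwoWayTransformer(depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048),
    modality=True,
    contents=True,
)

image_embeddings = torch.randn(2, 256, 64, 64)   # image encoder output
image_pe = torch.randn(1, 256, 64, 64)           # dense positional encoding
sparse_prompts = torch.randn(2, 4, 256)          # 2 box corners + content token + modality token
dense_prompts = torch.randn(2, 256, 64, 64)      # dense content prompt

masks, iou_pred, category_pred, clip_tok, img_tok = decoder(
    image_embeddings=image_embeddings,
    image_pe=image_pe,
    sparse_prompt_embeddings=sparse_prompts,
    dense_prompt_embeddings=dense_prompts,
    multimask_output=False,
)
print(masks.shape, iou_pred.shape, category_pred.shape)  # (2, 1, 256, 256) (2, 1) (2, 11)
```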
models/prompt_encoder.py
ADDED
|
@@ -0,0 +1,306 @@
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
|
| 5 |
+
# This source code is licensed under the license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
from torch import nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from typing import Any, Optional, Tuple, Type
|
| 13 |
+
|
| 14 |
+
from .common import LayerNorm2d
|
| 15 |
+
|
| 16 |
+
class PositionEmbeddingRandom(nn.Module):
|
| 17 |
+
"""
|
| 18 |
+
Positional encoding using random spatial frequencies.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
|
| 22 |
+
super().__init__()
|
| 23 |
+
if scale is None or scale <= 0.0:
|
| 24 |
+
scale = 1.0
|
| 25 |
+
self.register_buffer(
|
| 26 |
+
"positional_encoding_gaussian_matrix",
|
| 27 |
+
scale * torch.randn((2, num_pos_feats)),
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
|
| 31 |
+
"""Positionally encode points that are normalized to [0,1]."""
|
| 32 |
+
# assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
|
| 33 |
+
coords = 2 * coords - 1
|
| 34 |
+
coords = coords @ self.positional_encoding_gaussian_matrix
|
| 35 |
+
coords = 2 * np.pi * coords
|
| 36 |
+
# outputs d_1 x ... x d_n x C shape
|
| 37 |
+
return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
|
| 38 |
+
|
| 39 |
+
def forward(self, size: Tuple[int, int]) -> torch.Tensor:
|
| 40 |
+
"""Generate positional encoding for a grid of the specified size."""
|
| 41 |
+
h, w = size
|
| 42 |
+
device: Any = self.positional_encoding_gaussian_matrix.device
|
| 43 |
+
grid = torch.ones((h, w), device=device, dtype=torch.float32)
|
| 44 |
+
y_embed = grid.cumsum(dim=0) - 0.5
|
| 45 |
+
x_embed = grid.cumsum(dim=1) - 0.5
|
| 46 |
+
y_embed = y_embed / h
|
| 47 |
+
x_embed = x_embed / w
|
| 48 |
+
|
| 49 |
+
pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
|
| 50 |
+
return pe.permute(2, 0, 1) # C x H x W
|
| 51 |
+
|
| 52 |
+
def forward_with_coords(
|
| 53 |
+
self, coords_input: torch.Tensor, image_size: Tuple[int, int]
|
| 54 |
+
) -> torch.Tensor:
|
| 55 |
+
"""Positionally encode points that are not normalized to [0,1]."""
|
| 56 |
+
coords = coords_input.clone()
|
| 57 |
+
coords[:, :, 0] = coords[:, :, 0] / image_size[1]
|
| 58 |
+
coords[:, :, 1] = coords[:, :, 1] / image_size[0]
|
| 59 |
+
return self._pe_encoding(coords.to(torch.float)) # B x N x C
|
| 60 |
+
|
| 61 |
+
class Block(nn.Module):
|
| 62 |
+
def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
|
| 63 |
+
super(Block, self).__init__()
|
| 64 |
+
|
| 65 |
+
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
|
| 66 |
+
self.batch_norm1 = nn.BatchNorm2d(out_channels)
|
| 67 |
+
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
|
| 68 |
+
self.batch_norm2 = nn.BatchNorm2d(out_channels)
|
| 69 |
+
|
| 70 |
+
self.i_downsample = i_downsample
|
| 71 |
+
self.stride = stride
|
| 72 |
+
self.relu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
|
| 73 |
+
|
| 74 |
+
def forward(self, x):
|
| 75 |
+
identity = x.clone()
|
| 76 |
+
|
| 77 |
+
x = self.relu(self.batch_norm1(self.conv1(x)))
|
| 78 |
+
x = self.batch_norm2(self.conv2(x))
|
| 79 |
+
|
| 80 |
+
if self.i_downsample is not None:
|
| 81 |
+
identity = self.i_downsample(identity)
|
| 82 |
+
|
| 83 |
+
x += identity
|
| 84 |
+
x = self.relu(x)
|
| 85 |
+
return x
|
| 86 |
+
|
| 87 |
+
class Crop_Net_New(nn.Module):
|
| 88 |
+
def __init__(self, dim):
|
| 89 |
+
super().__init__()
|
| 90 |
+
self.conv = nn.Conv2d(3, dim, 3, 1, 1)
|
| 91 |
+
|
| 92 |
+
self.conv1 = Block(dim, dim)
|
| 93 |
+
self.conv2 = Block(dim, dim)
|
| 94 |
+
self.conv3 = Block(dim, dim)
|
| 95 |
+
|
| 96 |
+
self.conv4 = nn.Conv2d(dim, dim, 5, 1, 2)
|
| 97 |
+
|
| 98 |
+
def forward(self, x):
|
| 99 |
+
x = self.conv(x)
|
| 100 |
+
x = self.conv1(x)
|
| 101 |
+
x = self.conv2(x)
|
| 102 |
+
x = self.conv3(x)
|
| 103 |
+
return self.conv4(x)
|
| 104 |
+
|
| 105 |
+
class Mlp(nn.Module):
|
| 106 |
+
def __init__(self, in_dim, hid_dim=None, out_dim=None, act=nn.GELU, drop=0.):
|
| 107 |
+
super().__init__()
|
| 108 |
+
out_dim = out_dim or in_dim
|
| 109 |
+
hid_dim = hid_dim or in_dim
|
| 110 |
+
self.fc1 = nn.Linear(in_dim, hid_dim)
|
| 111 |
+
self.act = act()
|
| 112 |
+
self.fc2 = nn.Linear(hid_dim, out_dim)
|
| 113 |
+
self.drop = nn.Dropout(drop)
|
| 114 |
+
|
| 115 |
+
def forward(self, x):
|
| 116 |
+
x = self.fc1(x)
|
| 117 |
+
x = self.act(x)
|
| 118 |
+
x = self.drop(x)
|
| 119 |
+
x = self.fc2(x)
|
| 120 |
+
x = self.drop(x)
|
| 121 |
+
return x
|
| 122 |
+
|
| 123 |
+
class PromptEncoder(nn.Module):
|
| 124 |
+
def __init__(
|
| 125 |
+
self,
|
| 126 |
+
embed_dim: int,
|
| 127 |
+
image_embedding_size: Tuple[int, int],
|
| 128 |
+
input_image_size: Tuple[int, int],
|
| 129 |
+
mask_in_chans: int,
|
| 130 |
+
activation: Type[nn.Module] = nn.GELU,
|
| 131 |
+
) -> None:
|
| 132 |
+
"""
|
| 133 |
+
Encodes prompts for input to SAM's mask decoder.
|
| 134 |
+
|
| 135 |
+
Arguments:
|
| 136 |
+
embed_dim (int): The prompts' embedding dimension
|
| 137 |
+
image_embedding_size (tuple(int, int)): The spatial size of the
|
| 138 |
+
image embedding, as (H, W).
|
| 139 |
+
input_image_size (int): The padded size of the image as input
|
| 140 |
+
to the image encoder, as (H, W).
|
| 141 |
+
mask_in_chans (int): The number of hidden channels used for
|
| 142 |
+
encoding input masks.
|
| 143 |
+
activation (nn.Module): The activation to use when encoding
|
| 144 |
+
input masks.
|
| 145 |
+
"""
|
| 146 |
+
super().__init__()
|
| 147 |
+
self.embed_dim = embed_dim
|
| 148 |
+
self.input_image_size = input_image_size
|
| 149 |
+
self.image_embedding_size = image_embedding_size
|
| 150 |
+
self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
|
| 151 |
+
|
| 152 |
+
self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners
|
| 153 |
+
point_embeddings = [
|
| 154 |
+
nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)
|
| 155 |
+
]
|
| 156 |
+
self.point_embeddings = nn.ModuleList(point_embeddings)
|
| 157 |
+
self.not_a_point_embed = nn.Embedding(1, embed_dim)
|
| 158 |
+
|
| 159 |
+
self.mask_input_size = (
|
| 160 |
+
4 * image_embedding_size[0],
|
| 161 |
+
4 * image_embedding_size[1],
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
self.no_mask_embed = nn.Embedding(1, embed_dim)
|
| 165 |
+
|
| 166 |
+
self.crop_nets = Crop_Net_New(embed_dim)
|
| 167 |
+
|
| 168 |
+
self.clip_img_mlp = Mlp(in_dim=512, hid_dim=256, out_dim=256)
|
| 169 |
+
self.clip_text_mlp = Mlp(in_dim=512, hid_dim=256, out_dim=256)
|
| 170 |
+
self.mlps = Mlp(in_dim=512, hid_dim=512, out_dim=256)
|
| 171 |
+
|
| 172 |
+
self.categories = nn.Embedding(11, 256)
|
| 173 |
+
|
| 174 |
+
def get_dense_pe(self) -> torch.Tensor:
|
| 175 |
+
"""
|
| 176 |
+
Returns the positional encoding used to encode point prompts,
|
| 177 |
+
applied to a dense set of points the shape of the image encoding.
|
| 178 |
+
|
| 179 |
+
Returns:
|
| 180 |
+
torch.Tensor: Positional encoding with shape
|
| 181 |
+
1x(embed_dim)x(embedding_h)x(embedding_w)
|
| 182 |
+
"""
|
| 183 |
+
return self.pe_layer(self.image_embedding_size).unsqueeze(0)
|
| 184 |
+
|
| 185 |
+
def _embed_points(
|
| 186 |
+
self,
|
| 187 |
+
points: torch.Tensor,
|
| 188 |
+
labels: torch.Tensor,
|
| 189 |
+
pad: bool,
|
| 190 |
+
) -> torch.Tensor:
|
| 191 |
+
"""Embeds point prompts."""
|
| 192 |
+
points = points + 0.5 # Shift to center of pixel
|
| 193 |
+
if pad:
|
| 194 |
+
padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
|
| 195 |
+
padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
|
| 196 |
+
points = torch.cat([points, padding_point], dim=1)
|
| 197 |
+
labels = torch.cat([labels, padding_label], dim=1)
|
| 198 |
+
point_embedding = self.pe_layer.forward_with_coords(
|
| 199 |
+
points, self.input_image_size
|
| 200 |
+
)
|
| 201 |
+
point_embedding[labels == -1] = 0.0
|
| 202 |
+
point_embedding[labels == -1] += self.not_a_point_embed.weight
|
| 203 |
+
point_embedding[labels == 0] += self.point_embeddings[0].weight
|
| 204 |
+
point_embedding[labels == 1] += self.point_embeddings[1].weight
|
| 205 |
+
return point_embedding
|
| 206 |
+
|
| 207 |
+
def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
|
| 208 |
+
"""Embeds box prompts."""
|
| 209 |
+
boxes = boxes + 0.5 # Shift to center of pixel
|
| 210 |
+
coords = boxes.reshape(-1, 2, 2)
|
| 211 |
+
corner_embedding = self.pe_layer.forward_with_coords(
|
| 212 |
+
coords, self.input_image_size
|
| 213 |
+
)
|
| 214 |
+
corner_embedding[:, 0, :] += self.point_embeddings[2].weight
|
| 215 |
+
corner_embedding[:, 1, :] += self.point_embeddings[3].weight
|
| 216 |
+
return corner_embedding
|
| 217 |
+
|
| 218 |
+
# def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
|
| 219 |
+
# """Embeds mask inputs."""
|
| 220 |
+
# mask_embedding = self.mask_downscaling(masks)
|
| 221 |
+
# return mask_embedding
|
| 222 |
+
|
| 223 |
+
def _get_batch_size(
|
| 224 |
+
self,
|
| 225 |
+
points: Optional[Tuple[torch.Tensor, torch.Tensor]],
|
| 226 |
+
boxes: Optional[torch.Tensor],
|
| 227 |
+
masks: Optional[torch.Tensor],
|
| 228 |
+
) -> int:
|
| 229 |
+
"""
|
| 230 |
+
Gets the batch size of the output given the batch size of the input prompts.
|
| 231 |
+
"""
|
| 232 |
+
if points is not None:
|
| 233 |
+
return points[0].shape[0]
|
| 234 |
+
elif boxes is not None:
|
| 235 |
+
return boxes.shape[0]
|
| 236 |
+
# elif tokens is not None:
|
| 237 |
+
# return tokens.shape[0]
|
| 238 |
+
elif masks is not None:
|
| 239 |
+
return masks.shape[0]
|
| 240 |
+
else:
|
| 241 |
+
return 1
|
| 242 |
+
|
| 243 |
+
def _get_device(self) -> torch.device:
|
| 244 |
+
return self.point_embeddings[0].weight.device
|
| 245 |
+
|
| 246 |
+
def forward(
|
| 247 |
+
self,
|
| 248 |
+
points: Optional[Tuple[torch.Tensor, torch.Tensor]],
|
| 249 |
+
boxes: Optional[torch.Tensor],
|
| 250 |
+
masks,
|
| 251 |
+
features,
|
| 252 |
+
crops,
|
| 253 |
+
text_features,
|
| 254 |
+
category_idx
|
| 255 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 256 |
+
"""
|
| 257 |
+
Embeds different types of prompts, returning both sparse and dense
|
| 258 |
+
embeddings.
|
| 259 |
+
|
| 260 |
+
Arguments:
|
| 261 |
+
points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
|
| 262 |
+
and labels to embed.
|
| 263 |
+
boxes (torch.Tensor or none): boxes to embed
|
| 264 |
+
masks (torch.Tensor or none): masks to embed
|
| 265 |
+
|
| 266 |
+
Returns:
|
| 267 |
+
torch.Tensor: sparse embeddings for the points and boxes, with shape
|
| 268 |
+
BxNx(embed_dim), where N is determined by the number of input points
|
| 269 |
+
and boxes.
|
| 270 |
+
torch.Tensor: dense embeddings for the masks, in the shape
|
| 271 |
+
Bx(embed_dim)x(embed_H)x(embed_W)
|
| 272 |
+
"""
|
| 273 |
+
bs = self._get_batch_size(points, boxes, masks)
|
| 274 |
+
sparse_embeddings = torch.empty(
|
| 275 |
+
(bs, 0, self.embed_dim), device=self._get_device()
|
| 276 |
+
)
|
| 277 |
+
if points is not None:
|
| 278 |
+
coords, labels = points
|
| 279 |
+
point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
|
| 280 |
+
sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
|
| 281 |
+
|
| 282 |
+
if boxes is not None:
|
| 283 |
+
box_embeddings = self._embed_boxes(boxes)
|
| 284 |
+
sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
|
| 285 |
+
|
| 286 |
+
if features is not None:
|
| 287 |
+
clip_embeddings = self.clip_img_mlp(features)
|
| 288 |
+
sparse_embeddings = torch.cat([sparse_embeddings, clip_embeddings], dim=1)
|
| 289 |
+
|
| 290 |
+
if category_idx is not None:
|
| 291 |
+
text_embeddings = self.clip_text_mlp(text_features)
|
| 292 |
+
category_embeddings = torch.zeros((bs, 1, 256)).to(boxes.device)
|
| 293 |
+
for i in range(bs):
|
| 294 |
+
category_embeddings[i,0,:] = self.categories(category_idx[i].long())
|
| 295 |
+
modality_embeddings = torch.cat((text_embeddings, category_embeddings), dim=-1)
|
| 296 |
+
text_embeddings = self.mlps(modality_embeddings)
|
| 297 |
+
sparse_embeddings = torch.cat([sparse_embeddings, text_embeddings], dim=1)
|
| 298 |
+
|
| 299 |
+
if crops is not None:
|
| 300 |
+
dense_embeddings = self.crop_nets(crops)
|
| 301 |
+
else:
|
| 302 |
+
dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
|
| 303 |
+
bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
return sparse_embeddings, dense_embeddings
|
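Compared with SAM's prompt encoder, the forward pass here also accepts `features` (a 512-d feature of the box region that becomes an extra sparse content token via clip_img_mlp), `crops` (the resized box crop, encoded by Crop_Net_New into the dense embedding), and `text_features` plus `category_idx` (a 512-d modality text feature combined with one of the 11 learned modality embeddings to form the modality token). A minimal sketch with toy tensors; in the real pipeline the 512-d features are assumed to come from a pretrained CLIP image/text encoder, and the shapes below are illustrative:

```python
import torch
from models import PromptEncoder

prompt_encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),
    input_image_size=(256, 256),
    mask_in_chans=16,
)

boxes = torch.tensor([[10., 20., 120., 160.]])   # (B, 4) box in padded-image coordinates
clip_image_feat = torch.randn(1, 1, 512)         # stand-in for the CLIP image feature of the box crop
clip_text_feat = torch.randn(1, 1, 512)          # stand-in for the CLIP text feature of the modality name
category_idx = torch.tensor([3])                 # modality index in [0, 10]
crop = torch.randn(1, 3, 64, 64)                 # resized box crop -> dense content prompt

sparse_embeddings, dense_embeddings = prompt_encoder(
    points=None,
    boxes=boxes,
    masks=None,
    features=clip_image_feat,
    crops=crop,
    text_features=clip_text_feat,
    category_idx=category_idx,
)
print(sparse_embeddings.shape, dense_embeddings.shape)  # (1, 4, 256) (1, 256, 64, 64)
```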
models/tiny_vit.py
ADDED
|
@@ -0,0 +1,645 @@
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# TinyViT Model Architecture
|
| 3 |
+
# Copyright (c) 2022 Microsoft
|
| 4 |
+
# Adapted from LeViT and Swin Transformer
|
| 5 |
+
# LeViT: (https://github.com/facebookresearch/levit)
|
| 6 |
+
# Swin: (https://github.com/microsoft/swin-transformer)
|
| 7 |
+
# Build the TinyViT Model
|
| 8 |
+
# --------------------------------------------------------
|
| 9 |
+
# The TinyViT model is adapted from MobileSAM's variant.
|
| 10 |
+
# --------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
import itertools
|
| 13 |
+
import torch
|
| 14 |
+
import torch.nn as nn
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
import torch.utils.checkpoint as checkpoint
|
| 17 |
+
from timm.models.layers import DropPath as TimmDropPath,\
|
| 18 |
+
to_2tuple, trunc_normal_
|
| 19 |
+
from typing import Tuple
|
| 20 |
+
|
| 21 |
+
class Conv2d_BN(torch.nn.Sequential):
|
| 22 |
+
def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
|
| 23 |
+
groups=1, bn_weight_init=1):
|
| 24 |
+
super().__init__()
|
| 25 |
+
self.add_module('c', torch.nn.Conv2d(
|
| 26 |
+
a, b, ks, stride, pad, dilation, groups, bias=False))
|
| 27 |
+
bn = torch.nn.BatchNorm2d(b)
|
| 28 |
+
torch.nn.init.constant_(bn.weight, bn_weight_init)
|
| 29 |
+
torch.nn.init.constant_(bn.bias, 0)
|
| 30 |
+
self.add_module('bn', bn)
|
| 31 |
+
|
| 32 |
+
@torch.no_grad()
|
| 33 |
+
def fuse(self):
|
| 34 |
+
c, bn = self._modules.values()
|
| 35 |
+
w = bn.weight / (bn.running_var + bn.eps)**0.5
|
| 36 |
+
w = c.weight * w[:, None, None, None]
|
| 37 |
+
b = bn.bias - bn.running_mean * bn.weight / \
|
| 38 |
+
(bn.running_var + bn.eps)**0.5
|
| 39 |
+
m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
|
| 40 |
+
0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
|
| 41 |
+
m.weight.data.copy_(w)
|
| 42 |
+
m.bias.data.copy_(b)
|
| 43 |
+
return m
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class DropPath(TimmDropPath):
|
| 47 |
+
def __init__(self, drop_prob=None):
|
| 48 |
+
super().__init__(drop_prob=drop_prob)
|
| 49 |
+
self.drop_prob = drop_prob
|
| 50 |
+
|
| 51 |
+
def __repr__(self):
|
| 52 |
+
msg = super().__repr__()
|
| 53 |
+
msg += f'(drop_prob={self.drop_prob})'
|
| 54 |
+
return msg
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class PatchEmbed(nn.Module):
|
| 58 |
+
def __init__(self, in_chans, embed_dim, resolution, activation):
|
| 59 |
+
super().__init__()
|
| 60 |
+
img_size: Tuple[int, int] = to_2tuple(resolution)
|
| 61 |
+
#self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
|
| 62 |
+
self.patches_resolution = img_size
|
| 63 |
+
self.num_patches = self.patches_resolution[0] * \
|
| 64 |
+
self.patches_resolution[1]
|
| 65 |
+
self.in_chans = in_chans
|
| 66 |
+
self.embed_dim = embed_dim
|
| 67 |
+
n = embed_dim
|
| 68 |
+
#self.seq = nn.Sequential(
|
| 69 |
+
# Conv2d_BN(in_chans, n // 2, 3, 2, 1),
|
| 70 |
+
# activation(),
|
| 71 |
+
# Conv2d_BN(n // 2, n, 3, 2, 1),
|
| 72 |
+
#)
|
| 73 |
+
self.seq = nn.Sequential(
|
| 74 |
+
Conv2d_BN(in_chans, n // 2, 1, 1, 0),
|
| 75 |
+
activation(),
|
| 76 |
+
Conv2d_BN(n // 2, n, 1, 1, 0),
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
def forward(self, x):
|
| 80 |
+
return self.seq(x)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class MBConv(nn.Module):
|
| 84 |
+
def __init__(self, in_chans, out_chans, expand_ratio,
|
| 85 |
+
activation, drop_path):
|
| 86 |
+
super().__init__()
|
| 87 |
+
self.in_chans = in_chans
|
| 88 |
+
self.hidden_chans = int(in_chans * expand_ratio)
|
| 89 |
+
self.out_chans = out_chans
|
| 90 |
+
|
| 91 |
+
self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
|
| 92 |
+
self.act1 = activation()
|
| 93 |
+
|
| 94 |
+
self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans,
|
| 95 |
+
ks=3, stride=1, pad=1, groups=self.hidden_chans)
|
| 96 |
+
self.act2 = activation()
|
| 97 |
+
|
| 98 |
+
self.conv3 = Conv2d_BN(
|
| 99 |
+
self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
|
| 100 |
+
self.act3 = activation()
|
| 101 |
+
|
| 102 |
+
self.drop_path = DropPath(
|
| 103 |
+
drop_path) if drop_path > 0. else nn.Identity()
|
| 104 |
+
|
| 105 |
+
def forward(self, x):
|
| 106 |
+
shortcut = x
|
| 107 |
+
|
| 108 |
+
x = self.conv1(x)
|
| 109 |
+
x = self.act1(x)
|
| 110 |
+
|
| 111 |
+
x = self.conv2(x)
|
| 112 |
+
x = self.act2(x)
|
| 113 |
+
|
| 114 |
+
x = self.conv3(x)
|
| 115 |
+
|
| 116 |
+
x = self.drop_path(x)
|
| 117 |
+
|
| 118 |
+
x += shortcut
|
| 119 |
+
x = self.act3(x)
|
| 120 |
+
|
| 121 |
+
return x
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class PatchMerging(nn.Module):
|
| 125 |
+
def __init__(self, input_resolution, dim, out_dim, activation):
|
| 126 |
+
super().__init__()
|
| 127 |
+
|
| 128 |
+
self.input_resolution = input_resolution
|
| 129 |
+
self.dim = dim
|
| 130 |
+
self.out_dim = out_dim
|
| 131 |
+
self.act = activation()
|
| 132 |
+
self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
|
| 133 |
+
stride_c=2
|
| 134 |
+
if(out_dim==320 or out_dim==448 or out_dim==576):
|
| 135 |
+
stride_c=1
|
| 136 |
+
self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
|
| 137 |
+
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
|
| 138 |
+
|
| 139 |
+
def forward(self, x):
|
| 140 |
+
if x.ndim == 3:
|
| 141 |
+
H, W = self.input_resolution
|
| 142 |
+
B = len(x)
|
| 143 |
+
# (B, C, H, W)
|
| 144 |
+
x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
|
| 145 |
+
|
| 146 |
+
x = self.conv1(x)
|
| 147 |
+
x = self.act(x)
|
| 148 |
+
|
| 149 |
+
x = self.conv2(x)
|
| 150 |
+
x = self.act(x)
|
| 151 |
+
x = self.conv3(x)
|
| 152 |
+
x = x.flatten(2).transpose(1, 2)
|
| 153 |
+
return x
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
class ConvLayer(nn.Module):
|
| 157 |
+
def __init__(self, dim, input_resolution, depth,
|
| 158 |
+
activation,
|
| 159 |
+
drop_path=0., downsample=None, use_checkpoint=False,
|
| 160 |
+
out_dim=None,
|
| 161 |
+
conv_expand_ratio=4.,
|
| 162 |
+
):
|
| 163 |
+
|
| 164 |
+
super().__init__()
|
| 165 |
+
self.dim = dim
|
| 166 |
+
self.input_resolution = input_resolution
|
| 167 |
+
self.depth = depth
|
| 168 |
+
self.use_checkpoint = use_checkpoint
|
| 169 |
+
|
| 170 |
+
# build blocks
|
| 171 |
+
self.blocks = nn.ModuleList([
|
| 172 |
+
MBConv(dim, dim, conv_expand_ratio, activation,
|
| 173 |
+
drop_path[i] if isinstance(drop_path, list) else drop_path,
|
| 174 |
+
)
|
| 175 |
+
for i in range(depth)])
|
| 176 |
+
|
| 177 |
+
# patch merging layer
|
| 178 |
+
if downsample is not None:
|
| 179 |
+
self.downsample = downsample(
|
| 180 |
+
input_resolution, dim=dim, out_dim=out_dim, activation=activation)
|
| 181 |
+
else:
|
| 182 |
+
self.downsample = None
|
| 183 |
+
|
| 184 |
+
def forward(self, x):
|
| 185 |
+
for blk in self.blocks:
|
| 186 |
+
if self.use_checkpoint:
|
| 187 |
+
x = checkpoint.checkpoint(blk, x)
|
| 188 |
+
else:
|
| 189 |
+
x = blk(x)
|
| 190 |
+
if self.downsample is not None:
|
| 191 |
+
x = self.downsample(x)
|
| 192 |
+
return x
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class Mlp(nn.Module):
|
| 196 |
+
def __init__(self, in_features, hidden_features=None,
|
| 197 |
+
out_features=None, act_layer=nn.GELU, drop=0.):
|
| 198 |
+
super().__init__()
|
| 199 |
+
out_features = out_features or in_features
|
| 200 |
+
hidden_features = hidden_features or in_features
|
| 201 |
+
self.norm = nn.LayerNorm(in_features)
|
| 202 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 203 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 204 |
+
self.act = act_layer()
|
| 205 |
+
self.drop = nn.Dropout(drop)
|
| 206 |
+
|
| 207 |
+
def forward(self, x):
|
| 208 |
+
x = self.norm(x)
|
| 209 |
+
|
| 210 |
+
x = self.fc1(x)
|
| 211 |
+
x = self.act(x)
|
| 212 |
+
x = self.drop(x)
|
| 213 |
+
x = self.fc2(x)
|
| 214 |
+
x = self.drop(x)
|
| 215 |
+
return x
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class Attention(torch.nn.Module):
|
| 219 |
+
def __init__(self, dim, key_dim, num_heads=8,
|
| 220 |
+
attn_ratio=4,
|
| 221 |
+
resolution=(14, 14),
|
| 222 |
+
):
|
| 223 |
+
super().__init__()
|
| 224 |
+
# (h, w)
|
| 225 |
+
assert isinstance(resolution, tuple) and len(resolution) == 2
|
| 226 |
+
self.num_heads = num_heads
|
| 227 |
+
self.scale = key_dim ** -0.5
|
| 228 |
+
self.key_dim = key_dim
|
| 229 |
+
self.nh_kd = nh_kd = key_dim * num_heads
|
| 230 |
+
self.d = int(attn_ratio * key_dim)
|
| 231 |
+
self.dh = int(attn_ratio * key_dim) * num_heads
|
| 232 |
+
self.attn_ratio = attn_ratio
|
| 233 |
+
h = self.dh + nh_kd * 2
|
| 234 |
+
|
| 235 |
+
self.norm = nn.LayerNorm(dim)
|
| 236 |
+
self.qkv = nn.Linear(dim, h)
|
| 237 |
+
self.proj = nn.Linear(self.dh, dim)
|
| 238 |
+
|
| 239 |
+
points = list(itertools.product(
|
| 240 |
+
range(resolution[0]), range(resolution[1])))
|
| 241 |
+
N = len(points)
|
| 242 |
+
attention_offsets = {}
|
| 243 |
+
idxs = []
|
| 244 |
+
for p1 in points:
|
| 245 |
+
for p2 in points:
|
| 246 |
+
offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
|
| 247 |
+
if offset not in attention_offsets:
|
| 248 |
+
attention_offsets[offset] = len(attention_offsets)
|
| 249 |
+
idxs.append(attention_offsets[offset])
|
| 250 |
+
self.attention_biases = torch.nn.Parameter(
|
| 251 |
+
torch.zeros(num_heads, len(attention_offsets)))
|
| 252 |
+
self.register_buffer('attention_bias_idxs',
|
| 253 |
+
torch.LongTensor(idxs).view(N, N),
|
| 254 |
+
persistent=False)
|
| 255 |
+
|
| 256 |
+
@torch.no_grad()
|
| 257 |
+
def train(self, mode=True):
|
| 258 |
+
super().train(mode)
|
| 259 |
+
if mode and hasattr(self, 'ab'):
|
| 260 |
+
del self.ab
|
| 261 |
+
else:
|
| 262 |
+
self.register_buffer('ab',
|
| 263 |
+
self.attention_biases[:, self.attention_bias_idxs],
|
| 264 |
+
persistent=False)
|
| 265 |
+
|
| 266 |
+
def forward(self, x): # x (B,N,C)
|
| 267 |
+
B, N, _ = x.shape
|
| 268 |
+
|
| 269 |
+
# Normalization
|
| 270 |
+
x = self.norm(x)
|
| 271 |
+
|
| 272 |
+
qkv = self.qkv(x)
|
| 273 |
+
# (B, N, num_heads, d)
|
| 274 |
+
q, k, v = qkv.view(B, N, self.num_heads, -
|
| 275 |
+
1).split([self.key_dim, self.key_dim, self.d], dim=3)
|
| 276 |
+
# (B, num_heads, N, d)
|
| 277 |
+
q = q.permute(0, 2, 1, 3)
|
| 278 |
+
k = k.permute(0, 2, 1, 3)
|
| 279 |
+
v = v.permute(0, 2, 1, 3)
|
| 280 |
+
|
| 281 |
+
attn = (
|
| 282 |
+
(q @ k.transpose(-2, -1)) * self.scale
|
| 283 |
+
+
|
| 284 |
+
(self.attention_biases[:, self.attention_bias_idxs]
|
| 285 |
+
if self.training else self.ab)
|
| 286 |
+
)
|
| 287 |
+
attn = attn.softmax(dim=-1)
|
| 288 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
|
| 289 |
+
x = self.proj(x)
|
| 290 |
+
return x
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
class TinyViTBlock(nn.Module):
|
| 294 |
+
r""" TinyViT Block.
|
| 295 |
+
|
| 296 |
+
Args:
|
| 297 |
+
dim (int): Number of input channels.
|
| 298 |
+
input_resolution (tuple[int, int]): Input resolution.
|
| 299 |
+
num_heads (int): Number of attention heads.
|
| 300 |
+
window_size (int): Window size.
|
| 301 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 302 |
+
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        local_conv_size (int): the kernel size of the convolution between
            Attention and MLP. Default: 3
        activation: the activation function. Default: nn.GELU
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7,
                 mlp_ratio=4., drop=0., drop_path=0.,
                 local_conv_size=3,
                 activation=nn.GELU,
                 ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        assert window_size > 0, 'window_size must be greater than 0'
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()

        assert dim % num_heads == 0, 'dim must be divisible by num_heads'
        head_dim = dim // num_heads

        window_resolution = (window_size, window_size)
        self.attn = Attention(dim, head_dim, num_heads,
                              attn_ratio=1, resolution=window_resolution)

        mlp_hidden_dim = int(dim * mlp_ratio)
        mlp_activation = activation
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=mlp_activation, drop=drop)

        pad = local_conv_size // 2
        self.local_conv = Conv2d_BN(
            dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)

    def forward(self, x):
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        res_x = x
        if H == self.window_size and W == self.window_size:
            x = self.attn(x)
        else:
            x = x.view(B, H, W, C)
            pad_b = (self.window_size - H %
                     self.window_size) % self.window_size
            pad_r = (self.window_size - W %
                     self.window_size) % self.window_size
            padding = pad_b > 0 or pad_r > 0

            if padding:
                x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))

            pH, pW = H + pad_b, W + pad_r
            nH = pH // self.window_size
            nW = pW // self.window_size
            # window partition
            x = x.view(B, nH, self.window_size, nW, self.window_size, C).transpose(2, 3).reshape(
                B * nH * nW, self.window_size * self.window_size, C)
            x = self.attn(x)
            # window reverse
            x = x.view(B, nH, nW, self.window_size, self.window_size,
                       C).transpose(2, 3).reshape(B, pH, pW, C)

            if padding:
                x = x[:, :H, :W].contiguous()

            x = x.view(B, L, C)

        x = res_x + self.drop_path(x)

        x = x.transpose(1, 2).reshape(B, C, H, W)
        x = self.local_conv(x)
        x = x.view(B, C, L).transpose(1, 2)

        x = x + self.drop_path(self.mlp(x))
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"


class BasicLayer(nn.Module):
    """ A basic TinyViT layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
        activation: the activation function. Default: nn.GELU
        out_dim: the output dimension of the layer. Default: dim
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., drop=0.,
                 drop_path=0., downsample=None, use_checkpoint=False,
                 local_conv_size=3,
                 activation=nn.GELU,
                 out_dim=None,
                 ):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            TinyViTBlock(dim=dim, input_resolution=input_resolution,
                         num_heads=num_heads, window_size=window_size,
                         mlp_ratio=mlp_ratio,
                         drop=drop,
                         drop_path=drop_path[i] if isinstance(
                             drop_path, list) else drop_path,
                         local_conv_size=local_conv_size,
                         activation=activation,
                         )
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(
                input_resolution, dim=dim, out_dim=out_dim, activation=activation)
        else:
            self.downsample = None

    def forward(self, x):
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"


class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class TinyViT(nn.Module):
    def __init__(self,
                 img_size=224,
                 in_chans=3,
                 # num_classes=1000,
                 embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_sizes=[7, 7, 14, 7],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.1,
                 use_checkpoint=False,
                 mbconv_expand_ratio=4.0,
                 local_conv_size=3,
                 layer_lr_decay=1.0,
                 ):
        super().__init__()
        self.img_size = img_size
        # self.num_classes = num_classes
        self.depths = depths
        self.num_layers = len(depths)
        self.mlp_ratio = mlp_ratio

        activation = nn.GELU

        self.patch_embed = PatchEmbed(in_chans=in_chans,
                                      embed_dim=embed_dims[0],
                                      resolution=img_size,
                                      activation=activation)

        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
                                                sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            kwargs = dict(dim=embed_dims[i_layer],
                          input_resolution=(
                              patches_resolution[0] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
                              patches_resolution[1] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer))
                          ),
                          # input_resolution=(patches_resolution[0] // (2 ** i_layer),
                          #                   patches_resolution[1] // (2 ** i_layer)),
                          depth=depths[i_layer],
                          drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                          downsample=PatchMerging if (
                              i_layer < self.num_layers - 1) else None,
                          use_checkpoint=use_checkpoint,
                          out_dim=embed_dims[min(
                              i_layer + 1, len(embed_dims) - 1)],
                          activation=activation,
                          )
            if i_layer == 0:
                layer = ConvLayer(
                    conv_expand_ratio=mbconv_expand_ratio,
                    **kwargs,
                )
            else:
                layer = BasicLayer(
                    num_heads=num_heads[i_layer],
                    window_size=window_sizes[i_layer],
                    mlp_ratio=self.mlp_ratio,
                    drop=drop_rate,
                    local_conv_size=local_conv_size,
                    **kwargs)
            self.layers.append(layer)

        # init weights
        self.apply(self._init_weights)
        self.set_layer_lr_decay(layer_lr_decay)

        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dims[-1],
                256,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(256),
            nn.Conv2d(
                256,
                256,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(256),
        )

    def set_layer_lr_decay(self, layer_lr_decay):
        decay_rate = layer_lr_decay

        # layers -> blocks (depth)
        depth = sum(self.depths)
        lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]

        def _set_lr_scale(m, scale):
            for p in m.parameters():
                p.lr_scale = scale

        self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
        i = 0
        for layer in self.layers:
            for block in layer.blocks:
                block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
                i += 1
            if layer.downsample is not None:
                layer.downsample.apply(
                    lambda x: _set_lr_scale(x, lr_scales[i - 1]))
        assert i == depth

        for k, p in self.named_parameters():
            p.param_name = k

        def _check_lr_scale(m):
            for p in m.parameters():
                assert hasattr(p, 'lr_scale'), p.param_name

        self.apply(_check_lr_scale)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {'attention_biases'}

    def forward_features(self, x):
        # x: (N, C, H, W)
        x = self.patch_embed(x)
        x = self.layers[0](x)
        start_i = 1

        for i in range(start_i, len(self.layers)):
            layer = self.layers[i]
            x = layer(x)
        B, _, C = x.size()
        x = x.view(B, 64, 64, C)
        x = x.permute(0, 3, 1, 2)
        x = self.neck(x)

        return x

    def forward(self, x):
        x = self.forward_features(x)
        return x

# model = TinyViT(
#     img_size=256,
#     in_chans=3,
#     embed_dims=[
#         64,   ## (64, 256, 256)
#         128,  ## (128, 128, 128)
#         160,  ## (160, 64, 64)
#         320   ## (320, 64, 64)
#     ],
#     depths=[2, 2, 6, 2],
#     num_heads=[2, 4, 5, 10],
#     window_sizes=[7, 7, 14, 7],
#     mlp_ratio=4.,
#     drop_rate=0.,
#     drop_path_rate=0.0,
#     use_checkpoint=False,
#     mbconv_expand_ratio=4.0,
#     local_conv_size=3,
#     layer_lr_decay=0.8
# )
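The commented-out block above mirrors the encoder configuration that `train.py` uses. As a hedged illustration only (not part of the upload), a minimal smoke test of this encoder might look like the sketch below; it assumes the rest of `models/tiny_vit.py` from this commit (PatchEmbed, ConvLayer, PatchMerging, Attention, Mlp, Conv2d_BN, DropPath) is importable through the `models` package, and the expected output shape is taken from the `(B, 256, 64, 64)` comments in `train.py`.

```python
# Hypothetical smoke test for the TinyViT image encoder (illustration, not repo code).
import torch
from models import TinyViT  # exported by models/__init__.py, as in train.py

encoder = TinyViT(
    img_size=256,
    in_chans=3,
    embed_dims=[64, 128, 160, 320],
    depths=[2, 2, 6, 2],
    num_heads=[2, 4, 5, 10],
    window_sizes=[7, 7, 14, 7],
    drop_path_rate=0.0,
    layer_lr_decay=0.8,
)

with torch.no_grad():
    dummy = torch.randn(1, 3, 256, 256)  # one normalized 256x256 slice
    feats = encoder(dummy)               # neck projects the last stage to 256 channels

print(feats.shape)  # expected per train.py's comments: torch.Size([1, 256, 64, 64])
```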
models/transformer.py
ADDED
@@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch import Tensor, nn

import math
from typing import Tuple, Type

from .common import MLPBlock


class TwoWayTransformer(nn.Module):
    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries
        queries = point_embedding
        keys = image_embedding

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=queries,
                keys=keys,
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class TwoWayAttentionBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert (
            self.internal_dim % num_heads == 0
        ), "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    def _recombine_heads(self, x: Tensor) -> Tensor:
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        out = self.out_proj(out)

        return out
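For intuition on how the mask decoder drives this module, a minimal shape-check sketch is given below. It uses random tensors with the shapes documented in the `forward` docstrings above and the 256-dim / 64x64 configuration from `train.py`; the tensors are dummies, not real prompt or image embeddings.

```python
# Hypothetical shape check for TwoWayTransformer (illustration, not repo code).
import torch
from models.transformer import TwoWayTransformer

transformer = TwoWayTransformer(depth=2, embedding_dim=256, mlp_dim=2048, num_heads=8)

B = 2
image_embedding = torch.randn(B, 256, 64, 64)  # B x C x H x W image features
image_pe = torch.randn(B, 256, 64, 64)         # dense positional encoding, same shape
point_embedding = torch.randn(B, 5, 256)       # B x N_tokens x C sparse prompt tokens

queries, keys = transformer(image_embedding, image_pe, point_embedding)
print(queries.shape)  # torch.Size([2, 5, 256])    refined prompt tokens
print(keys.shape)     # torch.Size([2, 4096, 256]) flattened, updated image tokens
```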
train.py
ADDED
@@ -0,0 +1,502 @@
import os
import random
import monai
from os import listdir, makedirs
from os.path import join, exists, isfile, isdir, basename
from tqdm import tqdm
from time import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datetime import datetime
from shutil import copyfile
from models import PromptEncoder, TwoWayTransformer, TinyViT, MaskDecoder_F4
import torch.nn.functional as F
import gc
from matplotlib import pyplot as plt
import argparse
from modality_npz_dataset import ModalityNpzDataset

torch.cuda.empty_cache()
os.environ["OMP_NUM_THREADS"] = "4"  # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4"  # export OPENBLAS_NUM_THREADS=4
os.environ["MKL_NUM_THREADS"] = "6"  # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4"  # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6"  # export NUMEXPR_NUM_THREADS=6


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


setup_seed(2024)


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root",
                        type=str,
                        default="",
                        help="Path to the npy data root.")

    parser.add_argument('--task_name', type=str, default='MedSAM-Lite-All')

    parser.add_argument("--pretrained_checkpoint",
                        type=str,
                        default=None,
                        help="Path to the pretrained Lite-MedSAM checkpoint.")

    parser.add_argument("--resume",
                        type=str,
                        default=None,
                        help="Path to the checkpoint to continue training.")
    parser.add_argument(
        "--work_dir",
        type=str,
        default="./work_dir",
        help="Path to the working directory where checkpoints and logs will be saved.")

    parser.add_argument('--data_aug',
                        action='store_true',
                        default=False,
                        help='use data augmentation during training')

    parser.add_argument("--num_epochs",
                        type=int,
                        default=25,
                        help="Number of epochs to train.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=16,
                        help="Batch size.")
    parser.add_argument("--num_workers",
                        type=int,
                        default=8,
                        help="Number of workers for dataloader.")

    parser.add_argument(
        "--bbox_shift",
        type=int,
        default=5,
        help="Perturbation to bounding box coordinates during training.")

    parser.add_argument("-lr", type=float, default=2e-4, help="Learning rate.")

    parser.add_argument("-weight_decay",
                        type=float,
                        default=0.001,
                        help="Weight decay.")

    parser.add_argument("-iou_loss_weight",
                        type=float,
                        default=1.0,
                        help="Weight of IoU loss.")

    parser.add_argument("-seg_loss_weight",
                        type=float,
                        default=1.0,
                        help="Weight of segmentation loss.")
    parser.add_argument("-ce_loss_weight",
                        type=float,
                        default=1.0,
                        help="Weight of cross entropy loss.")

    parser.add_argument("--sanity_check",
                        action="store_true",
                        default=True,
                        help="Whether to do sanity check for dataloading.")

    args = parser.parse_args()
    return args


def show_mask(mask, ax, random_color=True):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.45])], axis=0)
    else:
        color = np.array([251 / 255, 252 / 255, 30 / 255, 0.45])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(
        plt.Rectangle((x0, y0),
                      w,
                      h,
                      edgecolor='blue',
                      facecolor=(0, 0, 0, 0),
                      lw=2))


def show_points(points, ax):
    for i, (x, y) in enumerate(points):
        ax.scatter(x, y, color='red', s=10)


def cal_iou(result, reference):

    intersection = torch.count_nonzero(torch.logical_and(result, reference),
                                       dim=[i for i in range(1, result.ndim)])
    union = torch.count_nonzero(torch.logical_or(result, reference),
                                dim=[i for i in range(1, result.ndim)])

    iou = intersection.float() / union.float()

    return iou.unsqueeze(1)


def sanity_check_dataset(args):

    tr_dataset = ModalityNpzDataset(args.data_root, data_aug=True)
    tr_dataloader = DataLoader(tr_dataset, batch_size=8, shuffle=True)

    for step, batch in enumerate(tr_dataloader):
        # show the example
        _, axs = plt.subplots(1, 2, figsize=(10, 10))
        idx = random.randint(0, 4)

        image = batch["image"]
        gt = batch["gt2D"]
        bboxes = batch["bboxes"]
        names_temp = batch["image_name"]

        axs[0].imshow(image[idx].cpu().permute(1, 2, 0).numpy())
        show_mask(gt[idx].cpu().squeeze().numpy(), axs[0])
        show_box(bboxes[idx].numpy().squeeze(), axs[0])
        axs[0].axis('off')
        # set title
        axs[0].set_title(names_temp[idx])
        idx = random.randint(4, 7)
        axs[1].imshow(image[idx].cpu().permute(1, 2, 0).numpy())
        show_mask(gt[idx].cpu().squeeze().numpy(), axs[1])
        show_box(bboxes[idx].numpy().squeeze(), axs[1])
        axs[1].axis('off')
        # set title
        axs[1].set_title(names_temp[idx])
        plt.subplots_adjust(wspace=0.01, hspace=0)
        plt.savefig(join(args.work_dir, 'Sanitycheck_DA.png'),
                    bbox_inches='tight',
                    dpi=300)
        plt.close()
        break


class MedSAM_Lite(nn.Module):

    def __init__(
        self,
        image_encoder,
        mask_decoder,
        prompt_encoder,
    ):
        super().__init__()
        self.image_encoder = image_encoder
        self.mask_decoder = mask_decoder
        self.prompt_encoder = prompt_encoder
        encoder_weight_file = ""  # path for vision encoder (tiny vit) weights

        self.image_encoder.load_state_dict(torch.load(encoder_weight_file))

    def forward(self, image, points, boxes, masks, features, crops,
                text_features, category_idx):
        image_embedding = self.image_encoder(image)

        sparse_embeddings, dense_embeddings = self.prompt_encoder(
            points=points,
            boxes=boxes,
            masks=masks,
            features=features,
            crops=crops,
            text_features=text_features,
            category_idx=category_idx)

        low_res_masks, iou_predictions, category_predictions, clip_vec, img_vec = self.mask_decoder(
            image_embeddings=image_embedding,  # (B, 256, 64, 64)
            image_pe=self.prompt_encoder.get_dense_pe(),  # (1, 256, 64, 64)
            sparse_prompt_embeddings=sparse_embeddings,  # (B, 2, 256)
            dense_prompt_embeddings=dense_embeddings,  # (B, 256, 64, 64)
            multimask_output=False,
        )  # (B, 1, 256, 256)

        return low_res_masks, iou_predictions, category_predictions, clip_vec, img_vec

    @torch.no_grad()
    def postprocess_masks(self, masks, new_size, original_size):
        """
        Do cropping and resizing
        """
        # Crop
        masks = masks[:, :, :new_size[0], :new_size[1]]
        # Resize
        masks = F.interpolate(
            masks,
            size=(original_size[0], original_size[1]),
            mode="bilinear",
            align_corners=False,
        )

        return masks


def collate_fn(batch):
    """
    Collate function for PyTorch DataLoader.
    """
    batch_dict = {}
    for key in batch[0].keys():
        if key == "image_name" or key == "category_idx":
            batch_dict[key] = [sample[key] for sample in batch]
        else:
            batch_dict[key] = torch.stack([sample[key] for sample in batch],
                                          dim=0)

    return batch_dict


if __name__ == "__main__":

    args = get_args()
    sanity_check_dataset(args)

    run_id = datetime.now().strftime("%Y%m%d-%H%M")
    print(f"Run ID: {run_id}")

    model_save_path = join(args.work_dir, args.task_name + "-" + run_id)
    makedirs(model_save_path, exist_ok=True)
    copyfile(__file__,
             join(model_save_path, run_id + "_" + os.path.basename(__file__)))

    device = torch.device("cuda")

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_workers = args.num_workers

    medsam_lite_image_encoder = TinyViT(
        img_size=256,
        in_chans=3,
        embed_dims=[
            64,   ## (64, 256, 256)
            128,  ## (128, 128, 128)
            160,  ## (160, 64, 64)
            320   ## (320, 64, 64)
        ],
        depths=[2, 2, 6, 2],
        num_heads=[2, 4, 5, 10],
        window_sizes=[7, 7, 14, 7],
        mlp_ratio=4.,
        drop_rate=0.,
        drop_path_rate=0.0,
        use_checkpoint=False,
        mbconv_expand_ratio=4.0,
        local_conv_size=3,
        layer_lr_decay=0.8)

    medsam_lite_prompt_encoder = PromptEncoder(embed_dim=256,
                                               image_embedding_size=(64, 64),
                                               input_image_size=(256, 256),
                                               mask_in_chans=16)

    medsam_lite_mask_decoder = MaskDecoder_F4(
        num_multimask_outputs=3,
        transformer=TwoWayTransformer(
            depth=2,
            embedding_dim=256,
            mlp_dim=2048,
            num_heads=8,
        ),
        modality=True,
        contents=True,
        transformer_dim=256,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
    )

    medsam_lite_model = MedSAM_Lite(image_encoder=medsam_lite_image_encoder,
                                    mask_decoder=medsam_lite_mask_decoder,
                                    prompt_encoder=medsam_lite_prompt_encoder)

    if args.resume is None and args.pretrained_checkpoint is not None:
        ## Load pretrained checkpoint if there's no checkpoint to resume from and there's a pretrained checkpoint
        print(
            f"Loading pretrained checkpoint from {args.pretrained_checkpoint}")
        medsam_lite_checkpoint = torch.load(args.pretrained_checkpoint,
                                            map_location="cpu")
        medsam_lite_model.load_state_dict(medsam_lite_checkpoint["model"],
                                          strict=True)

    medsam_lite_model = medsam_lite_model.to(device)

    medsam_lite_model.train()

    print(
        f"MedSAM Lite size: {sum(p.numel() for p in medsam_lite_model.parameters())}"
    )

    print('lr:', args.lr)

    optimizer = optim.AdamW(
        medsam_lite_model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        mode='min',
                                                        factor=0.9,
                                                        patience=5,
                                                        cooldown=0)
    seg_loss = monai.losses.DiceLoss(sigmoid=True,
                                     squared_pred=True,
                                     reduction='mean')
    bce_loss = nn.BCEWithLogitsLoss(reduction='mean')
    iou_loss = nn.MSELoss(reduction='mean')
    ce_loss = nn.CrossEntropyLoss(reduction='mean')

    train_dataset = ModalityNpzDataset(data_root=args.data_root, data_aug=True)

    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers,
                              pin_memory=True)

    if args.resume is not None:
        ckpt_folders = sorted(listdir(args.resume))
        ckpt_folders = [
            f for f in ckpt_folders
            if (f.startswith(args.task_name)
                and isfile(join(args.resume, f, 'medsam_lite_latest.pth')))
        ]
        print('*' * 20)
        print('existing ckpts in', args.resume, ckpt_folders)
        # find the latest ckpt folders
        time_strings = [
            f.split(args.task_name + '-')[-1] for f in ckpt_folders
        ]
        dates = [datetime.strptime(f, '%Y%m%d-%H%M') for f in time_strings]
        latest_date = max(dates)
        latest_ckpt = join(
            args.work_dir,
            args.task_name + '-' + latest_date.strftime('%Y%m%d-%H%M'),
            'medsam_lite_latest.pth')
        print('Loading from', latest_ckpt)
        checkpoint = torch.load(latest_ckpt, map_location=device)
        medsam_lite_model.module.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        start_epoch = checkpoint["epoch"] + 1
        best_loss = checkpoint["loss"]
        print(f"Loaded checkpoint from epoch {start_epoch}")
    else:
        start_epoch = 0
        best_loss = 1e10

    train_losses = []
    epoch_times = []

    print("Training")
    for epoch in range(start_epoch, num_epochs):
        if epoch == num_epochs - 1:
            for param_group in optimizer.param_groups:
                param_group['lr'] = 5e-5

        epoch_loss = [1e10 for _ in range(len(train_loader))]
        epoch_start_time = time()
        pbar = tqdm(train_loader)
        for step, batch in enumerate(pbar):
            gc.collect()
            torch.cuda.empty_cache()
            image = batch["image"]
            gt2D = batch["gt2D"]
            boxes = batch["bboxes"]
            coords = batch["coords"]
            crops = batch["image_crop"]
            features = batch["image_feature"]
            text_features = batch["text_feature"]
            class_idx = batch["category_idx"]
            class_idx = torch.tensor(class_idx)

            optimizer.zero_grad()
            image, gt2D, boxes, coords, crops, features, text_features, class_idx = image.to(
                device), gt2D.to(device), boxes.to(device), coords.to(
                    device), crops.to(device), features.to(
                        device), text_features.to(device), class_idx.to(device)
            labels_torch = torch.ones(coords.shape[0]).long()
            labels_torch = labels_torch.unsqueeze(1).expand(-1, 4)
            labels_torch = labels_torch.to(device)
            point_prompt = (coords, labels_torch)
            logits_pred, iou_pred, category_predictions, clip_vec, img_vec = medsam_lite_model(
                image, None, boxes, None, features, crops, text_features, class_idx)

            clip_img_features = clip_vec / clip_vec.norm(dim=-1, keepdim=True)
            img_features = img_vec / img_vec.norm(dim=-1, keepdim=True)
            similarity1 = torch.matmul(clip_img_features, img_features.T)
            similarity2 = torch.matmul(img_features, clip_img_features.T)
            sim_labels = torch.arange(similarity1.shape[0]).to(image.device)

            l_seg = seg_loss(logits_pred, gt2D)
            l_bce = bce_loss(logits_pred, gt2D.float())
            l_ce_sim = 0.5 * (ce_loss(similarity1, sim_labels.long()) +
                              ce_loss(similarity2, sim_labels.long()))
            l_ce = ce_loss(category_predictions, class_idx.long())
            mask_loss = l_seg + l_bce
            with torch.no_grad():
                iou_gt = cal_iou(torch.sigmoid(logits_pred) > 0.5, gt2D.bool())
            l_iou = iou_loss(iou_pred, iou_gt)
            loss = mask_loss + l_iou + 0.01 * l_ce_sim + 0.01 * l_ce
            epoch_loss[step] = loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            pbar.set_description(
                f"Epoch {epoch} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, loss: {loss.item():.4f}"
            )

        epoch_end_time = time()
        epoch_duration = epoch_end_time - epoch_start_time
        epoch_times.append(epoch_duration)

        epoch_loss_reduced = sum(epoch_loss) / len(epoch_loss)

        train_losses.append(epoch_loss_reduced)
        lr_scheduler.step(epoch_loss_reduced)

        model_weights = medsam_lite_model.state_dict()

        checkpoint = {
            "model": model_weights,
            "epoch": epoch,
            "optimizer": optimizer.state_dict(),
            "loss": epoch_loss_reduced,
            "best_loss": best_loss,
        }
        torch.save(checkpoint, join(model_save_path, "medsam_lite_latest.pth"))

        if epoch_loss_reduced < best_loss:
            print(
                f"New best loss: {best_loss:.4f} -> {epoch_loss_reduced:.4f}")
            best_loss = epoch_loss_reduced
            checkpoint["best_loss"] = best_loss
            torch.save(checkpoint, join(model_save_path,
                                        "medsam_lite_best.pth"))
        epoch_loss_reduced = 1e10

        fig, axes = plt.subplots(2, 1, figsize=(10, 8))
        axes[0].title.set_text("Dice + Binary Cross Entropy + IoU Loss")
        axes[0].plot(train_losses)
        axes[0].set_ylabel("Loss")
        axes[1].plot(epoch_times)
        axes[1].title.set_text("Epoch Duration")
        axes[1].set_ylabel("Duration (s)")
        axes[1].set_xlabel("Epoch")
        plt.tight_layout()
        plt.savefig(join(model_save_path, "log.png"))
        plt.close()
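The IoU regression target for the mask decoder's IoU head is produced by `cal_iou` above. As a small, self-contained illustration of its behavior (toy 4x4 masks, values computed by hand; not data or code from this upload beyond the copied function), the sketch below shows the per-sample IoU it returns.

```python
# Toy illustration of cal_iou from train.py (hypothetical masks, for intuition only).
import torch


def cal_iou(result, reference):
    # Per-sample IoU over all non-batch dimensions, returned with shape (B, 1).
    intersection = torch.count_nonzero(torch.logical_and(result, reference),
                                       dim=[i for i in range(1, result.ndim)])
    union = torch.count_nonzero(torch.logical_or(result, reference),
                                dim=[i for i in range(1, result.ndim)])
    return (intersection.float() / union.float()).unsqueeze(1)


pred = torch.zeros(1, 1, 4, 4, dtype=torch.bool)
gt = torch.zeros(1, 1, 4, 4, dtype=torch.bool)
pred[0, 0, :2, :2] = True  # predicted mask covers a 2x2 corner
gt[0, 0, :2, :4] = True    # ground truth covers a 2x4 strip

print(cal_iou(pred, gt))   # tensor([[0.5000]]): 4 overlapping pixels / 8 in the union
```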