Upload 8 files

- factory/.DS_Store +0 -0
- factory/loss.py +269 -0
- factory/metric.py +23 -0
- factory/utils.py +403 -0
- models/.DS_Store +0 -0
- models/clip_tqn.py +546 -0
- models/resnet.py +1382 -0
- models/transformer_decoder.py +320 -0
factory/.DS_Store
ADDED
Binary file (6.15 kB)
factory/loss.py
ADDED
@@ -0,0 +1,269 @@

import numpy as np

import torch
import torch.nn as nn
from torch.nn import functional as F

try:
    import torch.distributed.nn
    from torch import distributed as dist
    has_distributed = True
except ImportError:
    has_distributed = False

try:
    import horovod.torch as hvd
except ImportError:
    hvd = None


def gather_features(
        image_features,
        text_features,
        local_loss=False,
        gather_with_grad=False,
        rank=0,
        world_size=1,
        use_horovod=False
):
    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
    if use_horovod:
        assert hvd is not None, 'Please install horovod'
        if gather_with_grad:
            all_image_features = hvd.allgather(image_features)
            all_text_features = hvd.allgather(text_features)
        else:
            with torch.no_grad():
                all_image_features = hvd.allgather(image_features)
                all_text_features = hvd.allgather(text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
                gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
                all_image_features = torch.cat(gathered_image_features, dim=0)
                all_text_features = torch.cat(gathered_text_features, dim=0)
    else:
        # We gather tensors from all gpus
        if gather_with_grad:
            all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
            all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
        else:
            gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
            gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
            dist.all_gather(gathered_image_features, image_features)
            dist.all_gather(gathered_text_features, text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
            all_image_features = torch.cat(gathered_image_features, dim=0)
            all_text_features = torch.cat(gathered_text_features, dim=0)

    return all_image_features, all_text_features


class ClipLoss(nn.Module):
    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # cache state
        self.prev_num_logits = 0
        self.labels = {}

    def forward(self, image_features, text_features):
        logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        device = image_features.device
        if self.world_size > 1:
            all_image_features, all_text_features = gather_features(
                image_features, text_features,
                self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)

            if self.local_loss:
                logits_per_image = logit_scale * image_features @ all_text_features.T
                logits_per_text = logit_scale * text_features @ all_image_features.T
            else:
                logits_per_image = logit_scale * all_image_features @ all_text_features.T
                logits_per_text = logits_per_image.T
        else:
            logits_per_image = logit_scale * image_features @ text_features.T
            logits_per_text = logit_scale * text_features @ image_features.T

        # calculated ground-truth and cache if enabled
        num_logits = logits_per_image.shape[0]
        labels = torch.eye(num_logits, device=device, dtype=torch.float)
        pred_1 = F.log_softmax(logits_per_image, dim=-1)
        pred_2 = F.log_softmax(logits_per_text, dim=-1)
        loss_a = F.kl_div(pred_1, labels, reduction='sum') / num_logits
        loss_b = F.kl_div(pred_2, labels, reduction='sum') / num_logits
        total_loss = (loss_a + loss_b) / 2
        return total_loss


class AsymmetricLoss(nn.Module):
    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
        super(AsymmetricLoss, self).__init__()

        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

    def forward(self, x, y, use_weight=False):
        """
        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """

        # Calculating Probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        # Basic CE calculation
        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg

        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            pt0 = xs_pos * y
            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            loss *= one_sided_w
        if use_weight:
            return loss
        return -loss.sum()

class RalSingleLoss(nn.Module):
    '''
    This loss is intended for single-label classification problems
    '''
    def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, epsilon_pos_pow=-2.5, reduction='mean'):
        super(RalSingleLoss, self).__init__()

        self.eps = eps
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self.targets_classes = []
        self.gamma_pos = gamma_pos
        self.gamma_neg = gamma_neg
        self.reduction = reduction
        self.epsilon_pos = 1.0
        self.epsilon_neg = 0.0
        self.epsilon_pos_pow = epsilon_pos_pow
        self.lamb = 1.5

    def forward(self, inputs, target):
        '''
        "input" dimensions: - (batch_size, number_classes)
        "target" dimensions: - (batch_size)
        '''
        num_classes = inputs.size()[-1]
        log_preds = self.logsoftmax(inputs)
        self.targets_classes = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1)

        # ASL weights
        targets = self.targets_classes
        anti_targets = 1 - targets
        xs_pos = torch.exp(log_preds)
        xs_neg = 1 - xs_pos
        xs_pos = torch.exp(log_preds) * (torch.log(xs_pos.clamp(min=self.eps)) + self.epsilon_pos * (1 - xs_pos.clamp(min=self.eps)) + self.epsilon_pos_pow * 0.5 * torch.pow(1 - xs_pos.clamp(min=self.eps), 2)) * torch.log(xs_pos)
        xs_neg = (1 - xs_pos) * (torch.log(xs_neg.clamp(min=self.eps)) + self.epsilon_neg * (xs_neg.clamp(min=self.eps))) * -(self.lamb - xs_neg) * xs_neg ** 2
        asymmetric_w = torch.pow(1 - xs_pos - xs_neg,
                                 self.gamma_pos * targets + self.gamma_neg * anti_targets)
        log_preds = log_preds * asymmetric_w

        if self.eps > 0:  # label smoothing
            self.targets_classes = self.targets_classes.mul(1 - self.eps).add(self.eps / num_classes)

        # loss calculation
        loss = - self.targets_classes.mul(log_preds)

        loss = loss.sum(dim=-1)
        if self.reduction == 'mean':
            loss = loss.mean()

        return loss

class Ralloss(nn.Module):
    def __init__(self, gamma_neg=4, gamma_pos=0, clip=0.05, eps=1e-8, lamb=1.5, epsilon_neg=0.0, epsilon_pos=1.0, epsilon_pos_pow=-2.5, disable_torch_grad_focal_loss=False):
        super(Ralloss, self).__init__()

        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

        # parameters of Taylor expansion polynomials
        self.epsilon_pos = epsilon_pos
        self.epsilon_neg = epsilon_neg
        self.epsilon_pos_pow = epsilon_pos_pow
        self.margin = 1.0
        self.lamb = lamb

    def forward(self, x, y, use_weight=False):
        """
        x: input logits with size (batch_size, number of labels).
        y: binarized multi-label targets with size (batch_size, number of labels).
        """
        # Calculating Probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        # Basic Taylor expansion polynomials
        los_pos = y * (torch.log(xs_pos.clamp(min=self.eps)) + self.epsilon_pos * (1 - xs_pos.clamp(min=self.eps)) + self.epsilon_pos_pow * 0.5 * torch.pow(1 - xs_pos.clamp(min=self.eps), 2))
        los_neg = (1 - y) * (torch.log(xs_neg.clamp(min=self.eps)) + self.epsilon_neg * (xs_neg.clamp(min=self.eps))) * (self.lamb - x_sigmoid) * x_sigmoid ** 2 * (self.lamb - xs_neg)
        loss = los_pos + los_neg

        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            pt0 = xs_pos * y
            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            loss *= one_sided_w
        if use_weight:
            return loss
        return -loss.sum()
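The losses above are plain nn.Modules. As a quick single-process sanity check, here is a minimal sketch of ClipLoss on random embeddings; the import path assumes the repository root is on sys.path, and the batch/dim shapes are illustrative rather than taken from the training code. Note that, as written, ClipLoss recreates logit_scale inside forward, so the scale is a fresh constant each call rather than a learned parameter.

import torch
import torch.nn.functional as F
from factory.loss import ClipLoss

# Random, L2-normalized CLIP-style embeddings (batch, dim); values are illustrative.
batch, dim = 8, 768
image_features = F.normalize(torch.randn(batch, dim), dim=-1)
text_features = F.normalize(torch.randn(batch, dim), dim=-1)

loss_fn = ClipLoss()  # single-process defaults: world_size=1, rank=0
loss = loss_fn(image_features, text_features)  # symmetric KL against the identity matching
print(loss.item())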
factory/metric.py
ADDED
@@ -0,0 +1,23 @@

import sklearn
from sklearn.metrics import roc_auc_score, precision_recall_curve, accuracy_score, matthews_corrcoef

# gt_np = [0,1,1,0]
# pred_np = [0.2,0.7,0.8,0.3]
# fps,tps,thresholds = sklearn.metrics._ranking._binary_clf_curve(gt_np, pred_np)
# precision, recall, thresholds = precision_recall_curve(gt_np, pred_np)
# print(fps,tps,thresholds)  # tps: an increasing count of true positives
# print(precision[:-1], recall[:-1], thresholds)
# tns = tps/recall[:-1] - tps

# print(tns + fps + tps)
# fns =
# [0. 1. 2.] [2. 2. 2.] [0.8 0.3 0.2]
# precision, recall, thresholds = precision_recall_curve(gt_np, pred_np)
# numerator = 2 * recall * precision
# denom = recall + precision
# f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom!=0))
# max_f1 = np.max(f1_scores)
# max_f1_thresh = thresholds[np.argmax(f1_scores)]
# print('The max_f1_thresh is', max_f1_thresh)
# print('The average f1_score is', max_f1)
# print('The average acc_score is', accuracy_score(gt_np, pred_np>max_f1_thresh))
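The commented scratch above sketches how to pick a decision threshold from the precision-recall curve. For reference, a runnable version of that recipe, using the toy labels from the comments (illustrative only, not part of the committed file):

import numpy as np
from sklearn.metrics import precision_recall_curve, accuracy_score

gt_np = np.array([0, 1, 1, 0])
pred_np = np.array([0.2, 0.7, 0.8, 0.3])

# F1 at every point on the precision-recall curve, guarding against 0/0.
precision, recall, thresholds = precision_recall_curve(gt_np, pred_np)
numerator = 2 * recall * precision
denom = recall + precision
f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))

# Threshold that maximizes F1, then accuracy when binarizing at that threshold.
max_f1 = np.max(f1_scores)
max_f1_thresh = thresholds[np.argmax(f1_scores)]
print('The max_f1_thresh is', max_f1_thresh)
print('The max f1_score is', max_f1)
print('The acc_score at that threshold is', accuracy_score(gt_np, pred_np > max_f1_thresh))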
factory/utils.py
ADDED
@@ -0,0 +1,403 @@

import numpy as np
import io
import os
import time
import random
from collections import defaultdict, deque
import datetime
import subprocess
import torch
import torch.distributed as dist

class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        if self.count == 0:
            return self.total
        else:
            return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value
        )


class MetricLogger(object):
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def global_avg(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {:.4f}".format(name, meter.global_avg)
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        if torch.cuda.is_available():
            log_msg.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(log_msg)
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def compute_acc(logits, label, reduction='mean'):
    ret = (torch.argmax(logits, dim=1) == label).float()
    if reduction == 'none':
        return ret.detach()
    elif reduction == 'mean':
        return ret.mean().item()

def compute_n_params(model, return_str=True):
    tot = 0
    for p in model.parameters():
        w = 1
        for x in p.shape:
            w *= x
        tot += w
    if return_str:
        if tot >= 1e6:
            return '{:.1f}M'.format(tot / 1e6)
        else:
            return '{:.1f}K'.format(tot / 1e3)
    else:
        return tot

def setup_for_distributed(is_master):
    """
    This function disables printing when not in the master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)

def init_distributed_mode(args):
    if args.dist_on_itp:
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        os.environ['LOCAL_RANK'] = str(args.gpu)
        os.environ['RANK'] = str(args.rank)
        os.environ['WORLD_SIZE'] = str(args.world_size)
        print('on itp')
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        print('rank')
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    # args.distributed = False

    # torch.cuda.set_device(args.gpu)
    # args.dist_backend = 'gloo'
    # print('| distributed init (rank {}): {}, gpu {}'.format(
    #     args.rank, args.dist_url, args.gpu), flush=True)
    # print("flag1")
    # print(args)
    # torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
    #                                      world_size=args.world_size, rank=args.rank)

    # print("flag2")
    # torch.distributed.barrier()
    # setup_for_distributed(args.rank == 0)

    args.distributed = False
    args.dist_url = 'tcp://localhost:12345'
    args.world_size = 1
    args.rank = 0


# def init_distributed_mode(args, port='29511'):
#     num_gpus = torch.cuda.device_count()
#     if args.dist_on_itp:
#         args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
#         args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
#         args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
#         args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
#         os.environ['LOCAL_RANK'] = str(args.gpu)
#         os.environ['RANK'] = str(args.rank)
#         os.environ['WORLD_SIZE'] = str(args.world_size)
#         # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
#     elif "SLURM_JOB_ID" in os.environ:
#         print('SLURM_JOB_ID')
#         args.rank = int(os.environ["SLURM_PROCID"])
#         args.world_size = int(os.environ["SLURM_NTASKS"])
#         node_list = os.environ["SLURM_NODELIST"]
#         addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1")
#         # specify master port
#         if port is not None:
#             os.environ["MASTER_PORT"] = str(port)
#         elif "MASTER_PORT" not in os.environ:
#             os.environ["MASTER_PORT"] = "29400"
#         if "MASTER_ADDR" not in os.environ:
#             os.environ["MASTER_ADDR"] = addr
#         os.environ["WORLD_SIZE"] = str(args.world_size)
#         os.environ["LOCAL_RANK"] = str(args.rank % num_gpus)
#         os.environ["RANK"] = os.environ["WORLD_SIZE"]
#         args.gpu = args.rank % torch.cuda.device_count()
#     elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
#         print('RANK')
#         args.rank = int(os.environ["RANK"])
#         args.world_size = int(os.environ['WORLD_SIZE'])
#         args.gpu = int(os.environ['LOCAL_RANK'])
#     else:
#         print('Not using distributed mode')
#         args.distributed = False
#         return
#
#     args.distributed = True
#
#     torch.cuda.set_device(args.gpu)
#     args.dist_backend = 'nccl'
#
#     print('| distributed init (rank {}): {}, gpu {}'.format(
#         args.rank, args.dist_url, args.gpu), flush=True)
#     torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
#                                          world_size=args.world_size, rank=args.rank)
#     print('Init_process_group')
#     torch.distributed.barrier()
#     print('distributed.barrier')
#     setup_for_distributed(args.rank == 0)
#     print('Finished distributed')


# def init_distributed_mode(args):
#     # os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
#     # args.local_rank = os.environ['LOCAL_RANK']
#     if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
#         args.rank = int(os.environ["RANK"])
#         args.world_size = int(os.environ['WORLD_SIZE'])
#         args.local_rank = int(os.environ['LOCAL_RANK'])
#     elif 'SLURM_PROCID' in os.environ:
#         args.rank = int(os.environ['SLURM_PROCID'])
#         args.local_rank = args.rank % torch.cuda.device_count()
#     else:
#         print('Not using distributed mode')
#         args.distributed = False
#         return
#
#     args.distributed = True
#
#     torch.cuda.set_device(args.local_rank)
#     args.dist_backend = 'nccl'
#     print('| distributed init (rank {}): {}'.format(
#         args.rank, args.dist_url), flush=True)
#     torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
#                                          world_size=args.world_size, rank=args.rank)
#     torch.distributed.barrier()
#     setup_for_distributed(args.rank == 0)

# def init_distributed_mode(args):
#     # if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
#     #     args.rank = int(os.environ["RANK"])
#     #     args.world_size = int(os.environ['WORLD_SIZE'])
#     #     args.gpu = int(os.environ['LOCAL_RANK'])
#     # elif 'SLURM_PROCID' in os.environ:
#     #     args.rank = int(os.environ['SLURM_PROCID'])
#     #     args.gpu = args.rank % torch.cuda.device_count()
#     # else:
#     #     print('Not using distributed mode')
#     #     args.distributed = False
#     #     return
#     # rank = int(os.environ['RANK'])  # system env process ranks
#     # print(torch.distributed.get_world_size())
#
#     args.distributed = True
#     # torch.cuda.set_device(args.gpu)
#     num_gpus = torch.cuda.device_count()  # Returns the number of GPUs available
#     torch.cuda.set_device(args.rank % num_gpus)
#     # args.gpu = args.rank % torch.cuda.device_count()
#
#     args.dist_backend = 'nccl'
#     print('| distributed init (rank {}): {}'.format(
#         args.rank, args.dist_url), flush=True)
#     torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
#                                          world_size=args.world_size, rank=args.rank)
#     torch.distributed.barrier()
#     print('using distributed mode', args.rank, args.dist_url)
#     setup_for_distributed(args.rank == 0)

# # export MASTER_ADDR=localhost
# export MASTER_PORT=5678
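SmoothedValue and MetricLogger are meant to be used together: named meters accumulate scalar stats while log_every wraps the data loader and prints timing, ETA, and meter values. A minimal sketch (the meter names and the stand-in loop are illustrative assumptions, not taken from this commit's training script):

import torch
from factory.utils import MetricLogger, SmoothedValue

logger = MetricLogger(delimiter="  ")
# A window of 1 makes the 'lr' meter report the latest value verbatim.
logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))

batches = [torch.randn(4, 3) for _ in range(10)]  # stand-in for a DataLoader
for batch in logger.log_every(batches, print_freq=5, header='Train:'):
    loss = batch.pow(2).mean()  # placeholder computation
    logger.update(loss=loss.item(), lr=1e-4)

print('Averaged stats:', logger.global_avg())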
models/.DS_Store
ADDED
Binary file (6.15 kB)
models/clip_tqn.py
ADDED
|
@@ -0,0 +1,546 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import sys
|
| 3 |
+
# 加入父文件夹路径到sys.path中
|
| 4 |
+
sys.path.append(sys.path[0].replace('models', ''))
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import logging
|
| 8 |
+
import math
|
| 9 |
+
import json
|
| 10 |
+
import pathlib
|
| 11 |
+
import numpy as np
|
| 12 |
+
from copy import deepcopy
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from einops import rearrange
|
| 15 |
+
from collections import OrderedDict
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from typing import Tuple, Union, Callable, Optional
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
from torch import nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
import torchvision.models as models
|
| 24 |
+
from torch.utils.checkpoint import checkpoint
|
| 25 |
+
|
| 26 |
+
from transformers import AutoModel,BertConfig,AutoTokenizer
|
| 27 |
+
# from pytorch_pretrained_vit import ViT
|
| 28 |
+
|
| 29 |
+
# from visualizer import get_local
|
| 30 |
+
from models.transformer_decoder import *
|
| 31 |
+
|
| 32 |
+
# from io import BytesIO
|
| 33 |
+
# from petrel_client.client import Client
|
| 34 |
+
|
| 35 |
+
# conf_path = '~/petreloss.conf'
|
| 36 |
+
# client = Client(conf_path)
|
| 37 |
+
from torch.autograd import Function
|
| 38 |
+
import timm
|
| 39 |
+
|
| 40 |
+
class ReverseLayerF(Function):
|
| 41 |
+
|
| 42 |
+
@staticmethod
|
| 43 |
+
def forward(ctx, x, alpha):
|
| 44 |
+
ctx.alpha = alpha
|
| 45 |
+
return x.view_as(x)
|
| 46 |
+
|
| 47 |
+
@staticmethod
|
| 48 |
+
def backward(ctx, grad_output):
|
| 49 |
+
return grad_output.neg() * ctx.alpha, None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class DomainClassifier(nn.Module):
|
| 53 |
+
'''一个单层分类器 带梯度反转层'''
|
| 54 |
+
def __init__(self, domain_nums=4, feature_dims=768):
|
| 55 |
+
super().__init__()
|
| 56 |
+
self.domain_nums = domain_nums
|
| 57 |
+
self.feature_dims = feature_dims
|
| 58 |
+
self.fc = nn.Linear(feature_dims, domain_nums)
|
| 59 |
+
|
| 60 |
+
def forward(self, x):
|
| 61 |
+
reverse_x = ReverseLayerF.apply(x, 1.0)
|
| 62 |
+
return self.fc(reverse_x)
|
| 63 |
+
|
| 64 |
+
class CLP_clinical(nn.Module):
|
| 65 |
+
def __init__(self,
|
| 66 |
+
bert_model_name: str,
|
| 67 |
+
embed_dim: int = 768,
|
| 68 |
+
freeze_layers:Union[Tuple[int, int], int] = None):
|
| 69 |
+
super().__init__()
|
| 70 |
+
self.bert_model = self._get_bert_basemodel(bert_model_name=bert_model_name, freeze_layers=freeze_layers)
|
| 71 |
+
self.mlp_embed = nn.Sequential(
|
| 72 |
+
nn.Linear(embed_dim, embed_dim),
|
| 73 |
+
nn.GELU(),
|
| 74 |
+
nn.Linear(embed_dim, embed_dim)
|
| 75 |
+
)
|
| 76 |
+
self.embed_dim = embed_dim
|
| 77 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
| 78 |
+
self.init_parameters()
|
| 79 |
+
|
| 80 |
+
def init_parameters(self):
|
| 81 |
+
nn.init.constant_(self.logit_scale, np.log(1 / 0.07))
|
| 82 |
+
for m in self.mlp_embed:
|
| 83 |
+
if isinstance(m, nn.Linear):
|
| 84 |
+
nn.init.normal_(m.weight, std=self.embed_dim ** -0.5)
|
| 85 |
+
|
| 86 |
+
def _get_bert_basemodel(self, bert_model_name, freeze_layers=None):#12
|
| 87 |
+
try:
|
| 88 |
+
print(bert_model_name)
|
| 89 |
+
config = BertConfig.from_pretrained(bert_model_name, output_hidden_states=True)#bert-base-uncased
|
| 90 |
+
model = AutoModel.from_pretrained(bert_model_name, config=config)#, return_dict=True)
|
| 91 |
+
print("Text feature extractor:", bert_model_name)
|
| 92 |
+
print("bert encoder layers:",len(model.encoder.layer))
|
| 93 |
+
except:
|
| 94 |
+
raise ("Invalid model name. Check the config file and pass a BERT model from transformers lybrary")
|
| 95 |
+
|
| 96 |
+
if freeze_layers is not None:
|
| 97 |
+
for layer_idx in freeze_layers:
|
| 98 |
+
for param in list(model.encoder.layer[layer_idx].parameters()):
|
| 99 |
+
param.requires_grad = False
|
| 100 |
+
return model
|
| 101 |
+
|
| 102 |
+
def encode_text(self, text):
|
| 103 |
+
#input batch_size,token, return batch_size,dim
|
| 104 |
+
output = self.bert_model(input_ids = text['input_ids'],attention_mask = text['attention_mask'] )
|
| 105 |
+
last_hidden_state, pooler_output, hidden_states = output[0],output[1],output[2]
|
| 106 |
+
encode_out = self.mlp_embed(pooler_output)
|
| 107 |
+
# encode_out = pooler_output
|
| 108 |
+
return encode_out
|
| 109 |
+
|
| 110 |
+
def forward(self, text):
|
| 111 |
+
#input batch_size,token, return batch_size,dim
|
| 112 |
+
output = self.bert_model(input_ids = text['input_ids'],attention_mask = text['attention_mask'] )
|
| 113 |
+
last_hidden_state, pooler_output, hidden_states = output[0],output[1],output[2]
|
| 114 |
+
encode_out = self.mlp_embed(pooler_output)
|
| 115 |
+
# encode_out = pooler_output
|
| 116 |
+
return encode_out
|
| 117 |
+
|
| 118 |
+
# def forward(self,text1,text2):
|
| 119 |
+
# text1_features = self.encode_text(text1)
|
| 120 |
+
# text2_features = self.encode_text(text2)
|
| 121 |
+
# text1_features = F.normalize(text1_features, dim=-1)
|
| 122 |
+
# text2_features = F.normalize(text2_features, dim=-1)
|
| 123 |
+
# return text1_features, text2_features, self.logit_scale.exp()
|
| 124 |
+
|
| 125 |
+
class ModelRes(nn.Module):
|
| 126 |
+
def __init__(self, res_base_model):
|
| 127 |
+
super(ModelRes, self).__init__()
|
| 128 |
+
self.resnet_dict = {
|
| 129 |
+
"resnet50": models.resnet50(pretrained=True),
|
| 130 |
+
"resnet101": models.resnet101(pretrained=True),
|
| 131 |
+
"resnet152": models.resnet152(pretrained=True),
|
| 132 |
+
"resnet50_openai": None,
|
| 133 |
+
'resnet101_openai': None,
|
| 134 |
+
'resnet50x4_openai': None,
|
| 135 |
+
}
|
| 136 |
+
# "resnet50": models.resnet50(pretrained=True)}
|
| 137 |
+
self.resnet = self._get_res_basemodel(res_base_model)
|
| 138 |
+
# num_ftrs = int(self.resnet.fc.in_features/2)
|
| 139 |
+
# self.res_features = nn.Sequential(*list(self.resnet.children())[:-3]) 224
|
| 140 |
+
if 'openai' in res_base_model:
|
| 141 |
+
# 重新定义res_features
|
| 142 |
+
num_ftrs = int(self.resnet.attnpool.v_proj.in_features)
|
| 143 |
+
self.res_features = nn.Sequential(*list(self.resnet.children())[:-1])
|
| 144 |
+
else:
|
| 145 |
+
num_ftrs = int(self.resnet.fc.in_features)
|
| 146 |
+
self.res_features = nn.Sequential(*list(self.resnet.children())[:-2])
|
| 147 |
+
# here num_ftrs = 2048
|
| 148 |
+
self.res_l1 = nn.Linear(num_ftrs, num_ftrs)
|
| 149 |
+
self.res_l2 = nn.Linear(num_ftrs, 768)
|
| 150 |
+
|
| 151 |
+
def _get_res_basemodel(self, res_model_name):
|
| 152 |
+
try:
|
| 153 |
+
res_model = self.resnet_dict[res_model_name]
|
| 154 |
+
print("Image feature extractor:", res_model_name)
|
| 155 |
+
return res_model
|
| 156 |
+
except:
|
| 157 |
+
raise ("Invalid model name. Check the config file and pass one of: resnet18 or resnet50")
|
| 158 |
+
|
| 159 |
+
def forward(self, img):
|
| 160 |
+
#return (batchsize, patch_num, dim)
|
| 161 |
+
batch_size = img.shape[0]
|
| 162 |
+
res_fea = self.res_features(img)
|
| 163 |
+
# return res_fea
|
| 164 |
+
# res_fea = F.adaptive_avg_pool2d(res_fea, (1, 1))
|
| 165 |
+
res_fea = rearrange(res_fea,'b d n1 n2 -> b (n1 n2) d')
|
| 166 |
+
h = rearrange(res_fea,'b n d -> (b n) d')
|
| 167 |
+
x = self.res_l1(h)
|
| 168 |
+
x = F.relu(x)
|
| 169 |
+
x = self.res_l2(x)
|
| 170 |
+
out_emb = rearrange(x,'(b n) d -> b n d',b=batch_size)
|
| 171 |
+
out_pool = torch.mean(out_emb,dim=1)
|
| 172 |
+
return out_emb,out_pool
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
class ModelConvNeXt(nn.Module):
|
| 176 |
+
def __init__(self, convnext_base_model):
|
| 177 |
+
super(ModelConvNeXt, self).__init__()
|
| 178 |
+
self.convnext_dict = {"convnext-tiny": timm.create_model('convnextv2_tiny.fcmae_ft_in22k_in1k_384', pretrained=True, num_classes=1000),
|
| 179 |
+
"convnext-base": timm.create_model('convnextv2_base.fcmae_ft_in22k_in1k_384', pretrained=True, num_classes=1000),
|
| 180 |
+
}
|
| 181 |
+
convnext = self._get_convnext_basemodel(convnext_base_model)
|
| 182 |
+
num_ftrs = int(convnext.head.in_features)
|
| 183 |
+
self.conv_features = nn.Sequential(*list(convnext.children())[:-2])
|
| 184 |
+
self.conv_l1 = nn.Linear(num_ftrs, num_ftrs)
|
| 185 |
+
self.conv_l2 = nn.Linear(num_ftrs, 768)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _get_convnext_basemodel(self, convnext_model_name):
|
| 189 |
+
try:
|
| 190 |
+
convnext_model = self.convnext_dict[convnext_model_name]
|
| 191 |
+
print("Image feature extractor:", convnext_model_name)
|
| 192 |
+
return convnext_model
|
| 193 |
+
except:
|
| 194 |
+
raise ("Invalid model name. Check the config file and pass one of: convnext-tiny, convnext-small or convnext-base")
|
| 195 |
+
|
| 196 |
+
def forward(self, img):
|
| 197 |
+
#return (batchsize, patch_num, dim)
|
| 198 |
+
batch_size = img.shape[0]
|
| 199 |
+
conv_fea = self.conv_features(img)
|
| 200 |
+
conv_fea = F.adaptive_avg_pool2d(conv_fea, (1, 1))
|
| 201 |
+
conv_fea = rearrange(conv_fea,'b d n1 n2 -> b (n1 n2) d')
|
| 202 |
+
h = rearrange(conv_fea,'b n d -> (b n) d')
|
| 203 |
+
x = self.conv_l1(h)
|
| 204 |
+
x = F.relu(x)
|
| 205 |
+
x = self.conv_l2(x)
|
| 206 |
+
out_emb = rearrange(x,'(b n) d -> b n d',b=batch_size)
|
| 207 |
+
out_pool = torch.mean(out_emb,dim=1)
|
| 208 |
+
return out_emb,out_pool
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# class ModelConvNeXt(nn.Module):
|
| 212 |
+
# def __init__(self, convnext_base_model):
|
| 213 |
+
# super(ModelConvNeXt, self).__init__()
|
| 214 |
+
# self.convnext_dict = {"convnext-tiny": models.convnext_tiny(weights='ConvNeXt_Tiny_Weights.DEFAULT'),
|
| 215 |
+
# "convnext-small": models.convnext_small(weights='ConvNeXt_Small_Weights.DEFAULT'),
|
| 216 |
+
# "convnext-base": models.convnext_base(weights='ConvNeXt_Base_Weights.DEFAULT'),
|
| 217 |
+
# }
|
| 218 |
+
# convnext = self._get_convnext_basemodel(convnext_base_model)
|
| 219 |
+
# num_ftrs = int(convnext.classifier[-1].in_features)
|
| 220 |
+
# self.conv_features = nn.Sequential(*list(convnext.children())[:-2])
|
| 221 |
+
# self.conv_l1 = nn.Linear(num_ftrs, num_ftrs)
|
| 222 |
+
# self.conv_l2 = nn.Linear(num_ftrs, 768)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# def _get_convnext_basemodel(self, convnext_model_name):
|
| 226 |
+
# try:
|
| 227 |
+
# convnext_model = self.convnext_dict[convnext_model_name]
|
| 228 |
+
# print("Image feature extractor:", convnext_model_name)
|
| 229 |
+
# return convnext_model
|
| 230 |
+
# except:
|
| 231 |
+
# raise ("Invalid model name. Check the config file and pass one of: convnext-tiny, convnext-small or convnext-base")
|
| 232 |
+
|
| 233 |
+
# def forward(self, img):
|
| 234 |
+
# #return (batchsize, patch_num, dim)
|
| 235 |
+
# batch_size = img.shape[0]
|
| 236 |
+
# conv_fea = self.conv_features(img)
|
| 237 |
+
# conv_fea = F.adaptive_avg_pool2d(conv_fea, (1, 1))
|
| 238 |
+
# conv_fea = rearrange(conv_fea,'b d n1 n2 -> b (n1 n2) d')
|
| 239 |
+
# h = rearrange(conv_fea,'b n d -> (b n) d')
|
| 240 |
+
# x = self.conv_l1(h)
|
| 241 |
+
# x = F.relu(x)
|
| 242 |
+
# x = self.conv_l2(x)
|
| 243 |
+
# out_emb = rearrange(x,'(b n) d -> b n d',b=batch_size)
|
| 244 |
+
# out_pool = torch.mean(out_emb,dim=1)
|
| 245 |
+
# return out_emb,out_pool
|
| 246 |
+
|
| 247 |
+
# import open_clip
|
| 248 |
+
# class ModelCLIP(nn.Module):
|
| 249 |
+
# def __init__(self, clip_base_model):
|
| 250 |
+
# super(ModelCLIP, self).__init__()
|
| 251 |
+
# # 根据clip_base_model加载不同的模型
|
| 252 |
+
# if clip_base_model == 'openai_EVA02-B-16':
|
| 253 |
+
# model, _, preprocess = open_clip.create_model_and_transforms('EVA02-B-16', pretrained='merged2b_s8b_b131k')
|
| 254 |
+
|
| 255 |
+
# elif clip_base_model == 'openai_convnext_base_w':
|
| 256 |
+
# model, _, preprocess = open_clip.create_model_and_transforms('convnext_base_w', pretrained='laion2b_s13b_b82k_augreg')
|
| 257 |
+
|
| 258 |
+
# else:
|
| 259 |
+
# raise ("Invalid model name. Check the config file and pass one of: EVA02-B-16 or convnext_base_w")
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class ModelEfficientV2(nn.Module):
|
| 264 |
+
def __init__(self, efficientv2_base_model):
|
| 265 |
+
super(ModelEfficientV2, self).__init__()
|
| 266 |
+
self.efficientv2_dict = {"efficientnet_v2_s": models.efficientnet_v2_s(weights='EfficientNet_V2_S_Weights.IMAGENET1K_V1'),}
|
| 267 |
+
self.efficientv2_model = self._get_efficientv2_basemodel(efficientv2_base_model)
|
| 268 |
+
num_ftrs = int(self.efficientv2_model.classifier[-1].in_features)
|
| 269 |
+
self.efficientv2_features = nn.Sequential(*list(self.efficientv2_model.children())[:-2])
|
| 270 |
+
self.efficientv2_l1 = nn.Linear(num_ftrs, num_ftrs)
|
| 271 |
+
self.efficientv2_l2 = nn.Linear(num_ftrs, 768)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def _get_efficientv2_basemodel(self, efficientv2_model_name):
|
| 275 |
+
try:
|
| 276 |
+
efficientv2_model = self.efficientv2_dict[efficientv2_model_name]
|
| 277 |
+
print("Image feature extractor:", efficientv2_model_name)
|
| 278 |
+
return efficientv2_model
|
| 279 |
+
except:
|
| 280 |
+
raise ("Invalid model name. Check the config file and pass one of: efficientnetv2_rw_s")
|
| 281 |
+
|
| 282 |
+
def forward(self, img):
|
| 283 |
+
batch_size = img.shape[0]
|
| 284 |
+
efficientv2_fea = self.efficientv2_features(img)
|
| 285 |
+
# efficientv2_fea = F.adaptive_avg_pool2d(efficientv2_fea, (1, 1))
|
| 286 |
+
# print(efficientv2_fea.shape)
|
| 287 |
+
efficientv2_fea = rearrange(efficientv2_fea,'b d n1 n2 -> b (n1 n2) d')
|
| 288 |
+
# print(efficientv2_fea.shape)
|
| 289 |
+
h = rearrange(efficientv2_fea,'b n d -> (b n) d')
|
| 290 |
+
# print(h.shape)
|
| 291 |
+
x = self.efficientv2_l1(h)
|
| 292 |
+
x = F.relu(x)
|
| 293 |
+
x = self.efficientv2_l2(x)
|
| 294 |
+
# print(x.shape)
|
| 295 |
+
out_emb = rearrange(x,'(b n) d -> b n d',b=batch_size)
|
| 296 |
+
out_pool = torch.mean(out_emb,dim=1)
|
| 297 |
+
return out_emb,out_pool
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
class ModelDense(nn.Module):
|
| 302 |
+
def __init__(self, dense_base_model):
|
| 303 |
+
super(ModelDense, self).__init__()
|
| 304 |
+
|
| 305 |
+
self.densenet_dict = {"densenet121": models.densenet121(weights='DenseNet121_Weights.IMAGENET1K_V1'),
|
| 306 |
+
"densenet161": models.densenet161(weights='DenseNet161_Weights.IMAGENET1K_V1'),
|
| 307 |
+
"densenet201": models.densenet201(weights='DenseNet201_Weights.IMAGENET1K_V1'),}
|
| 308 |
+
self.densenet = self._get_dense_basemodel(dense_base_model)
|
| 309 |
+
num_ftrs = int(self.densenet.classifier.in_features)
|
| 310 |
+
self.dense_features = self.densenet.features
|
| 311 |
+
self.dense_l1 = nn.Linear(num_ftrs, num_ftrs)
|
| 312 |
+
self.dense_l2 = nn.Linear(num_ftrs, 768)
|
| 313 |
+
|
| 314 |
+
def _get_dense_basemodel(self, dense_base_model):
|
| 315 |
+
try:
|
| 316 |
+
dense_model = self.densenet_dict[dense_base_model]
|
| 317 |
+
print("Image feature extractor:", dense_base_model)
|
| 318 |
+
return dense_model
|
| 319 |
+
except:
|
| 320 |
+
raise ("Invalid model name. Check the config file and pass one of: densenet121 or densenet161")
|
| 321 |
+
|
| 322 |
+
def forward(self, img):
|
| 323 |
+
batch_size = img.shape[0]
|
| 324 |
+
dense_fea = self.dense_features(img)#N, 1024, 7,7
|
| 325 |
+
dense_fea = rearrange(dense_fea,'b d n1 n2 -> b (n1 n2) d')
|
| 326 |
+
h = rearrange(dense_fea,'b n d -> (b n) d')
|
| 327 |
+
x = self.dense_l1(h)
|
| 328 |
+
x = F.relu(x)
|
| 329 |
+
x = self.dense_l2(x)
|
| 330 |
+
out_emb = rearrange(x,'(b n) d -> b n d',b=batch_size)
|
| 331 |
+
out_pool = torch.mean(out_emb,dim=1)
|
| 332 |
+
return out_emb,out_pool
|
| 333 |
+
|
| 334 |
+
class TQN_Model(nn.Module):
|
| 335 |
+
def __init__(self,
|
| 336 |
+
embed_dim: int = 768,
|
| 337 |
+
class_num: int = 1,
|
| 338 |
+
lam: list = [1, 0]
|
| 339 |
+
):
|
| 340 |
+
super().__init__()
|
| 341 |
+
self.d_model = embed_dim
|
| 342 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
| 343 |
+
# decoder_layer = TransformerDecoderLayer(self.d_model, 4, 1024,
|
| 344 |
+
# 0.1, 'relu',normalize_before=True)
|
| 345 |
+
decoder_layerV1 = TransformerDecoderLayerV1(self.d_model, 4, 1024,
|
| 346 |
+
0.1, 'relu', True, lam)
|
| 347 |
+
self.decoder_norm = nn.LayerNorm(self.d_model)
|
| 348 |
+
# self.decoder = TransformerDecoder(decoder_layer, 4, self.decoder_norm,
|
| 349 |
+
# return_intermediate=False)
|
| 350 |
+
self.decoderV1 = TransformerDecoderV1(decoder_layerV1, 4, self.decoder_norm,
|
| 351 |
+
return_intermediate=False)
|
| 352 |
+
|
| 353 |
+
self.dropout_feas = nn.Dropout(0.1)
|
| 354 |
+
|
| 355 |
+
# class_num = 2
|
| 356 |
+
self.mlp_head = nn.Sequential( # nn.LayerNorm(768),
|
| 357 |
+
nn.Linear(embed_dim, class_num)
|
| 358 |
+
)
|
| 359 |
+
self.apply(self._init_weights)
|
| 360 |
+
|
| 361 |
+
@staticmethod
|
| 362 |
+
def _init_weights(module):
|
| 363 |
+
if isinstance(module, nn.Linear):
|
| 364 |
+
module.weight.data.normal_(mean=0.0, std=0.02)
|
| 365 |
+
|
| 366 |
+
elif isinstance(module, nn.MultiheadAttention):
|
| 367 |
+
module.in_proj_weight.data.normal_(mean=0.0, std=0.02)
|
| 368 |
+
module.out_proj.weight.data.normal_(mean=0.0, std=0.02)
|
| 369 |
+
|
| 370 |
+
elif isinstance(module, nn.Embedding):
|
| 371 |
+
module.weight.data.normal_(mean=0.0, std=0.02)
|
| 372 |
+
if module.padding_idx is not None:
|
| 373 |
+
module.weight.data[module.padding_idx].zero_()
|
| 374 |
+
|
| 375 |
+
# def forward(self, image_features, text_features):
|
| 376 |
+
# #image_features (batch_size,patch_num,dim)
|
| 377 |
+
# #text_features (query_num,dim)
|
| 378 |
+
# batch_size = image_features.shape[0]
|
| 379 |
+
# image_features = image_features.transpose(0,1)
|
| 380 |
+
# text_features = text_features.unsqueeze(1).repeat(1, batch_size, 1)
|
| 381 |
+
# image_features = self.decoder_norm(image_features)
|
| 382 |
+
# text_features = self.decoder_norm(text_features)
|
| 383 |
+
|
| 384 |
+
# # features = self.decoder(text_features, image_features,
|
| 385 |
+
# # memory_key_padding_mask=None, pos=None, query_pos=None)
|
| 386 |
+
|
| 387 |
+
# image_features_pool = torch.mean(image_features,dim=0).unsqueeze(0)
|
| 388 |
+
# features = self.decoderV1(text_features, image_features, image_features_pool,
|
| 389 |
+
# memory_key_padding_mask=None, pos=None, query_pos=None)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
# features = self.dropout_feas(features).transpose(0,1) #b,embed_dim
|
| 394 |
+
# out = self.mlp_head(features) #(batch_size, query_num)
|
| 395 |
+
# # out = out.squeeze(-1)
|
| 396 |
+
# return out
|
| 397 |
+
|
| 398 |
+
def forward(self, image_features, text_features, return_atten = False):
|
| 399 |
+
#image_features (batch_size,patch_num,dim)
|
| 400 |
+
#text_features (query_num,dim)
|
| 401 |
+
batch_size = image_features.shape[0]
|
| 402 |
+
image_features = image_features.transpose(0,1)
|
| 403 |
+
text_features = text_features.unsqueeze(1).repeat(1, batch_size, 1)
|
| 404 |
+
image_features = self.decoder_norm(image_features)
|
| 405 |
+
text_features = self.decoder_norm(text_features)
|
| 406 |
+
|
| 407 |
+
image_features_pool = torch.mean(image_features,dim=0).unsqueeze(0)
|
| 408 |
+
features,atten_map = self.decoderV1(text_features, image_features, image_features_pool,
|
| 409 |
+
memory_key_padding_mask=None, pos=None, query_pos=None)
|
| 410 |
+
features = self.dropout_feas(features).transpose(0,1) #b,embed_dim
|
| 411 |
+
out = self.mlp_head(features) #(batch_size, query_num)
|
| 412 |
+
if return_atten:
|
| 413 |
+
return out, atten_map
|
| 414 |
+
else:
|
| 415 |
+
return out
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
class TQN_Model_Ensemble(nn.Module):
    def __init__(self,
                 embed_dim: int = 768,
                 class_num: int = 1,
                 lam: list = [1, 0]
                 ):
        super().__init__()
        self.d_model = embed_dim
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        decoder_layerV1 = TransformerDecoderLayerV1(self.d_model, 4, 1024,
                                                    0.1, 'relu', True, lam)
        self.decoder_norm = nn.LayerNorm(self.d_model)
        self.decoder_norm_1 = nn.LayerNorm(self.d_model)
        self.decoder_norm_2 = nn.LayerNorm(self.d_model)
        self.decoderV1 = TransformerDecoderV1(decoder_layerV1, 4, self.decoder_norm,
                                              return_intermediate=False)
        self.decoderV1_1 = TransformerDecoderV1(decoder_layerV1, 4, self.decoder_norm_1,
                                                return_intermediate=False)
        self.decoderV1_2 = TransformerDecoderV1(decoder_layerV1, 4, self.decoder_norm_2,
                                                return_intermediate=False)

        self.dropout_feas = nn.Dropout(0.1)

        # class_num = 2
        self.mlp_head = nn.Sequential(nn.Linear(embed_dim, class_num))
        self.mlp_head_1 = nn.Sequential(nn.Linear(embed_dim, class_num))
        self.mlp_head_2 = nn.Sequential(nn.Linear(embed_dim, class_num))
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)

        elif isinstance(module, nn.MultiheadAttention):
            module.in_proj_weight.data.normal_(mean=0.0, std=0.02)
            module.out_proj.weight.data.normal_(mean=0.0, std=0.02)

        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(self, image_features, text_features, return_atten=False):

        batch_size = image_features.shape[0]
        image_features = image_features.transpose(0, 1)
        text_features = text_features.unsqueeze(1).repeat(1, batch_size, 1)
        image_features = self.decoder_norm(image_features)
        # note: image_features / text_features have already been normalized by
        # decoder_norm above, so the _1 / _2 branches normalize twice
        image_features_1 = self.decoder_norm_1(image_features)
        image_features_2 = self.decoder_norm_2(image_features)

        text_features = self.decoder_norm(text_features)
        text_features_1 = self.decoder_norm_1(text_features)
        text_features_2 = self.decoder_norm_2(text_features)

        image_features_pool = torch.mean(image_features, dim=0).unsqueeze(0)
        image_features_pool_1 = torch.mean(image_features_1, dim=0).unsqueeze(0)
        image_features_pool_2 = torch.mean(image_features_2, dim=0).unsqueeze(0)

        features, atten_map = self.decoderV1(text_features, image_features, image_features_pool,
                                             memory_key_padding_mask=None, pos=None, query_pos=None)
        features = self.dropout_feas(features).transpose(0, 1)
        out = self.mlp_head(features)

        features_1, atten_map_1 = self.decoderV1_1(text_features_1, image_features_1, image_features_pool_1,
                                                   memory_key_padding_mask=None, pos=None, query_pos=None)
        features_1 = self.dropout_feas(features_1).transpose(0, 1)
        out_1 = self.mlp_head_1(features_1)

        features_2, atten_map_2 = self.decoderV1_2(text_features_2, image_features_2, image_features_pool_2,
                                                   memory_key_padding_mask=None, pos=None, query_pos=None)
        features_2 = self.dropout_feas(features_2).transpose(0, 1)
        out_2 = self.mlp_head_2(features_2)

        # Average the three heads' logits
        out_stack = torch.stack([out, out_1, out_2])
        out = torch.mean(out_stack, dim=0)

        if return_atten:
            # Note: only the first decoder's attention map is returned
            return out, atten_map
        else:
            return out


# For MIMIC: batch_size=32, query_num=41, patch_num=256, dim=768
# img    256, 32, 768
# txt      1, 32, 768
# query   41, 32, 768
# fts     41, 32, 768
# out     41, 32, 1
# No sigmoid is applied here -- apply the sigmoid when computing the loss!


if __name__ == "__main__":
|
| 521 |
+
|
| 522 |
+
#torch 1.10.2 to torch 1.12.1
|
| 523 |
+
#torchvision-0.11.3 to torchvision-0.13.1
|
| 524 |
+
|
| 525 |
+
# image = torch.randn(1, 3, 224, 224)
|
| 526 |
+
# image_encoder = ModelRes(res_base_model = 'resnet50')
|
| 527 |
+
# # image_encoder = ModelDense(dense_base_model = 'densenet121')
|
| 528 |
+
# # image_encoder = ModelViT(vit_base_model = 'vit_b_32')
|
| 529 |
+
# image_encoder(image)
|
| 530 |
+
|
| 531 |
+
# image = torch.randn(256, 1, 768)
|
| 532 |
+
# query = torch.randn(41, 768)
|
| 533 |
+
# model = TQN_Model()
|
| 534 |
+
# out = model(image, query)
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
# img = torch.randn(1,3,512,512)
|
| 538 |
+
img = torch.randn(2,3,224,224)
|
| 539 |
+
# model = ModelConvNeXt(convnext_base_model = 'convnext-base')
|
| 540 |
+
# model = ModelEfficientV2(efficientv2_base_model = 'efficientnet_v2_s')
|
| 541 |
+
model = ModelRes(res_base_model = 'resnet50_openai')
|
| 542 |
+
out_emb, out_pool = model(img)
|
| 543 |
+
|
| 544 |
+
print(out_emb.size(), out_pool.size())
|
| 545 |
+
|
| 546 |
+
|
models/resnet.py
ADDED
@@ -0,0 +1,1382 @@
from functools import partial
from typing import Type, Any, Callable, Union, List, Optional

import torch
import torch.nn as nn
from torch import Tensor

from torchvision.transforms._presets import ImageClassification
from torchvision.utils import _log_api_usage_once
from torchvision.models._api import WeightsEnum, Weights
from torchvision.models._meta import _IMAGENET_CATEGORIES
from torchvision.models._utils import handle_legacy_interface, _ovewrite_named_param
import math
import torch.nn.functional as F

import random

from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t


class LoRALayer(nn.Module):
    """
    Base LoRA class
    """
    def __init__(
        self,
        r,
        lora_alpha,
    ):
        super().__init__()
        self.r = r
        self.lora_alpha = lora_alpha
        # Mark the weight as unmerged
        self.merged = False

    def reset_parameters(self):
        raise NotImplementedError

    def train(self, mode: bool = True):
        raise NotImplementedError

    def eval(self):
        raise NotImplementedError


class LoRALinear(LoRALayer):
    def __init__(self, r, lora_alpha, linear_layer):
        """
        LoRA class for nn.Linear class
        :param r: low rank dimension
        :param lora_alpha: scaling factor
        :param linear_layer: target nn.Linear layer for applying LoRA
        """
        super().__init__(r, lora_alpha)
        self.linear = linear_layer

        in_features = self.linear.in_features
        out_features = self.linear.out_features

        # LoRA configuration
        self.lora_A = nn.Parameter(self.linear.weight.new_zeros((r, in_features)))
        self.lora_B = nn.Parameter(self.linear.weight.new_zeros((out_features, r)))
        self.scaling = self.lora_alpha / self.r
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        self.linear.train(mode)
        if self.merged:
            # Unmerge the LoRA update from the base weight
            self.linear.weight.data -= (self.lora_B @ self.lora_A) * self.scaling
            self.merged = False

    def eval(self):
        self.linear.eval()
        if not self.merged:
            # Merge the LoRA update into the base weight
            self.linear.weight.data += (self.lora_B @ self.lora_A) * self.scaling
            self.merged = True

    def forward(self, x):
        if not self.merged:
            result = F.linear(x, self.linear.weight, bias=self.linear.bias)
            # Scale the low-rank update so the unmerged path matches the
            # merged eval() path (the original omitted * self.scaling here)
            out = (x @ self.lora_A.T @ self.lora_B.T) * self.scaling
            result += out
            return result
        else:
            return F.linear(x, self.linear.weight, bias=self.linear.bias)

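# Sanity-check sketch (an editor's addition, not from the original file): with
# the scaling applied in forward(), the unmerged training path and the merged
# eval() path should produce the same output.
#
#   base = nn.Linear(16, 8)
#   lora = LoRALinear(r=4, lora_alpha=8, linear_layer=base)
#   x = torch.randn(2, 16)
#   y_unmerged = lora(x)      # base output + scaled low-rank update
#   lora.eval()               # folds (lora_B @ lora_A) * scaling into weight
#   y_merged = lora(x)
#   assert torch.allclose(y_unmerged, y_merged, atol=1e-6)

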
class LoraConv2d(nn.Conv2d):
    def __init__(
        self,
        r: int,
        lora_alpha: float,
        in_channels: int,
        out_channels: int,
        kernel_size: _size_2_t,
        stride: _size_2_t = 1,
        padding: Union[str, _size_2_t] = 0,
        dilation: _size_2_t = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',  # TODO: refine this type
        device=None,
        dtype=None
    ):
        """
        LoRA class for nn.Conv2d class
        """
        super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode, device, dtype)
        self.r = r
        self.lora_alpha = lora_alpha
        # LoRA configuration (assumes a square integer kernel_size)
        self.lora_A = nn.Parameter(
            self.weight.new_zeros((r * kernel_size, in_channels * kernel_size))
        )
        self.lora_B = nn.Parameter(
            self.weight.new_zeros((out_channels * kernel_size, r * kernel_size))
        )
        self.scaling = self.lora_alpha / self.r
        self.reset_parameters_lora()
        self.merged = False
        self.drop_lora_rate = 0.9  # only used by the commented-out LoRA-dropout path in forward()

    def reset_parameters_lora(self):
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        super().train(mode)
        if self.merged:
            # Make sure that the weights are not merged
            self.weight.data -= (self.lora_B @ self.lora_A).view(self.weight.shape) * self.scaling
            self.merged = False

    def eval(self):
        super().eval()
        if not self.merged:
            # Merge the weights and mark it
            self.weight.data += (self.lora_B @ self.lora_A).view(self.weight.shape) * self.scaling
            self.merged = True

    def forward(self, x):
        # Draw a random number
        # drop_rate = random.random()

        # # During training, skip the LoRA branch with a fixed probability
        # if drop_rate <= self.drop_lora_rate and self.training:
        #     return F.conv2d(
        #         x,
        #         self.weight,
        #         self.bias, self.stride, self.padding, self.dilation, self.groups
        #     )
        # else:
        return F.conv2d(
            x,
            self.weight + (self.lora_B @ self.lora_A).view(self.weight.shape) * self.scaling,
            self.bias, self.stride, self.padding, self.dilation, self.groups
        )

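# Usage sketch (an editor's illustration, not from the original file); the
# low-rank factors reshape into the full (out, in, k, k) kernel via .view():
#
#   conv = LoraConv2d(r=4, lora_alpha=8, in_channels=64, out_channels=64,
#                     kernel_size=3, padding=1, bias=False)
#   x = torch.randn(2, 64, 32, 32)
#   y = conv(x)   # uses weight + (lora_B @ lora_A).view(weight.shape) * scaling

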
class MultiLoRALinear(LoRALayer):
    def __init__(self, r, lora_alpha, linear_layer, lora_num):
        """
        Multi-adapter LoRA class for nn.Linear
        :param r: list of low-rank dimensions, one per adapter
        :param lora_alpha: scaling factor
        :param linear_layer: target nn.Linear layer for applying LoRA
        :param lora_num: number of LoRA adapters
        """
        super().__init__(r, lora_alpha)
        self.linear = linear_layer
        self.lora_num = lora_num
        self.r_list = r

        in_features = self.linear.in_features
        out_features = self.linear.out_features

        # LoRA configuration: one (A, B) pair per adapter
        self.lora_A_list = nn.ParameterList([nn.Parameter(self.linear.weight.new_zeros((self.r_list[th], in_features))) for th in range(self.lora_num)])
        self.lora_B_list = nn.ParameterList([nn.Parameter(self.linear.weight.new_zeros((out_features, self.r_list[th]))) for th in range(self.lora_num)])
        # self.lora_A = nn.Parameter(self.linear.weight.new_zeros((r, in_features)))
        # self.lora_B = nn.Parameter(self.linear.weight.new_zeros((out_features, r)))
        self.scaling = [self.lora_alpha / self.r_list[th] for th in range(self.lora_num)]
        self.reset_parameters()

    def reset_parameters(self):
        for th in range(self.lora_num):
            nn.init.kaiming_uniform_(self.lora_A_list[th], a=math.sqrt(5))
            nn.init.zeros_(self.lora_B_list[th])

    def train(self, mode: bool = True):
        self.linear.train(mode)

    def eval(self):
        self.linear.eval()

    def forward(self, x, weights):
        # `weights` is currently unused: the per-adapter weighted sum below is
        # commented out, so the adapters are summed uniformly instead.
        if not self.merged:
            result = F.linear(x, self.linear.weight, bias=self.linear.bias)  # (247, batch, 768)
            out_stack = torch.stack([(x @ self.lora_A_list[th].T @ self.lora_B_list[th].T) * self.scaling[th] for th in range(self.lora_num)], dim=2)  # e.g. (2353, 16, 3, 768)
            # (247, batch, lora_num, 768)
            # weights = weights.unsqueeze(0).unsqueeze(-1)
            # (1, batch, lora_num, 1)
            # out = torch.sum(out_stack * weights, dim=2)
            out = torch.sum(out_stack, dim=2)
            # (247, batch, 768)
            result += out
            # (247, batch, 768)
            return result
        else:
            return F.linear(x, self.linear.weight, bias=self.linear.bias)

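# Sketch (an editor's illustration; the per-adapter ranks are an assumption
# drawn from the indexing of r_list above):
#
#   base = nn.Linear(768, 768)
#   multi = MultiLoRALinear(r=[4, 8], lora_alpha=8, linear_layer=base, lora_num=2)
#   x = torch.randn(247, 16, 768)     # (seq, batch, dim)
#   y = multi(x, weights=None)        # adapters summed uniformly, see above

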
class MultiLoraConv2d(LoRALayer):
    def __init__(self, r, lora_alpha, conv_layer, num_task):
        """
        Multi-adapter LoRA class for nn.Conv2d
        """
        super().__init__(r, lora_alpha)
        self.conv = conv_layer
        self.num_task = num_task

        in_channels = self.conv.in_channels
        out_channels = self.conv.out_channels
        kernel_size = self.conv.kernel_size[0]

        # LoRA configuration: one (A, B) pair per task
        self.lora_A_list = nn.ParameterList([nn.Parameter(self.conv.weight.new_zeros((r * kernel_size, in_channels * kernel_size))) for th in range(num_task)])
        self.lora_B_list = nn.ParameterList([nn.Parameter(self.conv.weight.new_zeros((out_channels * kernel_size, r * kernel_size))) for th in range(num_task)])

        self.scaling = self.lora_alpha / self.r
        self.reset_parameters()

        self.merged = False
        self.label_batch = None

    def reset_parameters(self):
        for th in range(self.num_task):
            nn.init.kaiming_uniform_(self.lora_A_list[th], a=math.sqrt(5))
            nn.init.zeros_(self.lora_B_list[th])

    def train(self, mode: bool = True):
        self.conv.train(mode)

    def eval(self):
        self.conv.eval()

    def forward(self, input_x, alphas=None):
        if not self.merged:
            conv_weight_stack = torch.cat([(self.lora_B_list[th] @ self.lora_A_list[th]).view(self.conv.weight.shape).unsqueeze(0) * self.scaling for th in range(self.num_task)], dim=0)

            if isinstance(input_x, dict):
                # print('input is dict')
                x, alphas = input_x[0], input_x[1]
            else:
                x = input_x
            batch_size, c = x.shape[0], x.shape[1]
            # print(alphas)
            if alphas is None:
                print('alphas is only None inside lora_fast')
            # Per-sample kernels: mix the task-specific LoRA updates with the
            # per-sample weights `alphas`, then add the shared base weight
            agg_weights = self.conv.weight + torch.sum(
                torch.mul(conv_weight_stack.unsqueeze(0), alphas.view(batch_size, -1, 1, 1, 1, 1)), dim=1)

            # Apply all per-sample kernels in one grouped convolution
            agg_weights = agg_weights.view(-1, *agg_weights.shape[-3:])
            x_grouped = x.view(1, -1, *x.shape[-2:])

            outputs = F.conv2d(x_grouped, agg_weights, self.conv.bias, self.conv.stride, self.conv.padding, self.conv.dilation, groups=batch_size)
            outputs = outputs.view(batch_size, -1, *outputs.shape[-2:])

            return outputs
        else:
            # fixed: was self.conv(x), but x is undefined on this branch
            return self.conv(input_x)

    def merged_weight(self, th):  # only for test
        self.conv.weight.data += (self.lora_B_list[th] @ self.lora_A_list[th]).view(self.conv.weight.shape) * self.scaling
        self.merged = True

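# Sketch of the per-sample mixing above (an editor's illustration; names and
# shapes are assumptions): each sample gets its own aggregated kernel, and a
# single convolution with groups=batch_size applies all of them in one call.
#
#   conv = nn.Conv2d(16, 32, kernel_size=3, padding=1, bias=False)
#   mconv = MultiLoraConv2d(r=4, lora_alpha=8, conv_layer=conv, num_task=3)
#   x = torch.randn(5, 16, 28, 28)
#   alphas = torch.softmax(torch.randn(5, 3), dim=1)   # per-sample task weights
#   y = mconv(x, alphas)                               # (5, 32, 28, 28)

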
__all__ = [
    "ResNet",
    "ResNet18_Weights",
    "ResNet34_Weights",
    "ResNet50_Weights",
    "ResNet101_Weights",
    "ResNet152_Weights",
    "ResNeXt50_32X4D_Weights",
    "ResNeXt101_32X8D_Weights",
    "ResNeXt101_64X4D_Weights",
    "Wide_ResNet50_2_Weights",
    "Wide_ResNet101_2_Weights",
    "resnet18",
    "resnet34",
    "resnet50",
    "resnet101",
    "resnet152",
    "resnext50_32x4d",
    "resnext101_32x8d",
    "resnext101_64x4d",
    "wide_resnet50_2",
    "wide_resnet101_2",
]


def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


def conv3x3_lora(r: int, lora_alpha: float, in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """3x3 LoRA convolution with padding"""
    return LoraConv2d(
        r, lora_alpha,
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )


def conv1x1_lora(r: int, lora_alpha: float, in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 LoRA convolution"""
    return LoraConv2d(r, lora_alpha, in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock_Lora(nn.Module):
    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        r: int,
        lora_alpha: float,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3_lora(r, lora_alpha, inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3_lora(r, lora_alpha, planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class BasicBlock(nn.Module):
    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2)
    # while the original implementation places the stride at the first 1x1 convolution (self.conv1)
    # according to "Deep residual learning for image recognition": https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck_Lora(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2)
    # while the original implementation places the stride at the first 1x1 convolution (self.conv1)
    # according to "Deep residual learning for image recognition": https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        r: int,
        lora_alpha: float,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1_lora(r, lora_alpha, inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3_lora(r, lora_alpha, width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1_lora(r, lora_alpha, width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        planes: int,
        blocks: int,
        stride: int = 1,
        dilate: bool = False,
    ) -> nn.Sequential:
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(
            block(
                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
            )
        )
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)


class ResNet_Lora(nn.Module):
    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        r: int,
        lora_alpha: float,
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )
        self.groups = groups
        self.base_width = width_per_group
        self.r = r
        self.lora_alpha = lora_alpha
        self.conv1 = LoraConv2d(self.r, self.lora_alpha, 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, LoraConv2d):
                # note: unreachable, since LoraConv2d subclasses nn.Conv2d and
                # is already handled by the branch above
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        planes: int,
        blocks: int,
        stride: int = 1,
        dilate: bool = False,
    ) -> nn.Sequential:
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1_lora(self.r, self.lora_alpha, self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(
            block(
                self.inplanes, planes, self.r, self.lora_alpha, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
            )
        )
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    self.r,
                    self.lora_alpha,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)


def _resnet(
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> ResNet:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))

    model = ResNet(block, layers, **kwargs)

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress))

    return model


def _resnet_lora(
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    r: int,
    lora_alpha: float,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> ResNet_Lora:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))

    model = ResNet_Lora(block, layers, r, lora_alpha, **kwargs)
    if weights is not None:
        missing_keys, unexpected_keys = model.load_state_dict(weights.get_state_dict(progress=progress), strict=False)

        # Only the LoRA factors may legitimately be absent from the checkpoint
        for key_name in missing_keys:
            if 'lora_A' in key_name or 'lora_B' in key_name:
                pass
            else:
                raise ValueError(f'{key_name} in missing keys')

        if unexpected_keys != []:
            raise ValueError(f'Unexpected keys in state_dict: {unexpected_keys}')

    return model

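# Construction sketch (an editor's illustration, not from the original file):
# a LoRA-augmented ResNet-50 initialized from pretrained ImageNet weights;
# ResNet50_Weights is defined further below in this file.
#
#   model = _resnet_lora(Bottleneck_Lora, [3, 4, 6, 3], r=4, lora_alpha=8,
#                        weights=ResNet50_Weights.IMAGENET1K_V1, progress=True)
#   # only the lora_A / lora_B tensors are missing from the checkpoint
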
_COMMON_META = {
    "min_size": (1, 1),
    "categories": _IMAGENET_CATEGORIES,
}


class ResNet18_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnet18-f37072fd.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 11689512,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 69.758,
                    "acc@5": 89.078,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ResNet34_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnet34-b627a593.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 21797672,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 73.314,
                    "acc@5": 91.420,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ResNet50_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnet50-0676ba61.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 25557032,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 76.130,
                    "acc@5": 92.862,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/resnet50-11ad3fa6.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 25557032,
            "recipe": "https://github.com/pytorch/vision/issues/3995#issuecomment-1013906621",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 80.858,
                    "acc@5": 95.434,
                }
            },
            "_docs": """
                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class ResNet101_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnet101-63fe2227.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 44549160,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 77.374,
                    "acc@5": 93.546,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/resnet101-cd907fc2.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 44549160,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 81.886,
                    "acc@5": 95.780,
                }
            },
            "_docs": """
                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class ResNet152_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnet152-394f9c45.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 60192808,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 78.312,
                    "acc@5": 94.046,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/resnet152-f82ba261.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 60192808,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 82.284,
                    "acc@5": 96.002,
                }
            },
            "_docs": """
                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class ResNeXt50_32X4D_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 25028904,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 77.618,
                    "acc@5": 93.698,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 25028904,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 81.198,
                    "acc@5": 95.340,
                }
            },
            "_docs": """
                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class ResNeXt101_32X8D_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 88791336,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 79.312,
                    "acc@5": 94.526,
                }
            },
            "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/resnext101_32x8d-110c445d.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 88791336,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 82.834,
                    "acc@5": 96.228,
                }
            },
            "_docs": """
                These weights improve upon the results of the original paper by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class ResNeXt101_64X4D_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 83455272,
            "recipe": "https://github.com/pytorch/vision/pull/5935",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 83.246,
                    "acc@5": 96.454,
                }
            },
            "_docs": """
                These weights were trained from scratch by using TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


class Wide_ResNet50_2_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 68883240,
            "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
            "_metrics": {
"ImageNet-1K": {
|
| 1187 |
+
"acc@1": 78.468,
|
| 1188 |
+
"acc@5": 94.086,
|
| 1189 |
+
}
|
| 1190 |
+
},
|
| 1191 |
+
"_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
|
| 1192 |
+
},
|
| 1193 |
+
)
|
| 1194 |
+
IMAGENET1K_V2 = Weights(
|
| 1195 |
+
url="https://download.pytorch.org/models/wide_resnet50_2-9ba9bcbe.pth",
|
| 1196 |
+
transforms=partial(ImageClassification, crop_size=224, resize_size=232),
|
| 1197 |
+
meta={
|
| 1198 |
+
**_COMMON_META,
|
| 1199 |
+
"num_params": 68883240,
|
| 1200 |
+
"recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
|
| 1201 |
+
"_metrics": {
|
| 1202 |
+
"ImageNet-1K": {
|
| 1203 |
+
"acc@1": 81.602,
|
| 1204 |
+
"acc@5": 95.758,
|
| 1205 |
+
}
|
| 1206 |
+
},
|
| 1207 |
+
"_docs": """
|
| 1208 |
+
These weights improve upon the results of the original paper by using TorchVision's `new training recipe
|
| 1209 |
+
<https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
|
| 1210 |
+
""",
|
| 1211 |
+
},
|
| 1212 |
+
)
|
| 1213 |
+
DEFAULT = IMAGENET1K_V2
|
| 1214 |
+
|
| 1215 |
+
|
| 1216 |
+
class Wide_ResNet101_2_Weights(WeightsEnum):
|
| 1217 |
+
IMAGENET1K_V1 = Weights(
|
| 1218 |
+
url="https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth",
|
| 1219 |
+
transforms=partial(ImageClassification, crop_size=224),
|
| 1220 |
+
meta={
|
| 1221 |
+
**_COMMON_META,
|
| 1222 |
+
"num_params": 126886696,
|
| 1223 |
+
"recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
|
| 1224 |
+
"_metrics": {
|
| 1225 |
+
"ImageNet-1K": {
|
| 1226 |
+
"acc@1": 78.848,
|
| 1227 |
+
"acc@5": 94.284,
|
| 1228 |
+
}
|
| 1229 |
+
},
|
| 1230 |
+
"_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
|
| 1231 |
+
},
|
| 1232 |
+
)
|
| 1233 |
+
IMAGENET1K_V2 = Weights(
|
| 1234 |
+
url="https://download.pytorch.org/models/wide_resnet101_2-d733dc28.pth",
|
| 1235 |
+
transforms=partial(ImageClassification, crop_size=224, resize_size=232),
|
| 1236 |
+
meta={
|
| 1237 |
+
**_COMMON_META,
|
| 1238 |
+
"num_params": 126886696,
|
| 1239 |
+
"recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
|
| 1240 |
+
"_metrics": {
|
| 1241 |
+
"ImageNet-1K": {
|
| 1242 |
+
"acc@1": 82.510,
|
| 1243 |
+
"acc@5": 96.020,
|
| 1244 |
+
}
|
| 1245 |
+
},
|
| 1246 |
+
"_docs": """
|
| 1247 |
+
These weights improve upon the results of the original paper by using TorchVision's `new training recipe
|
| 1248 |
+
<https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
|
| 1249 |
+
""",
|
| 1250 |
+
},
|
| 1251 |
+
)
|
| 1252 |
+
DEFAULT = IMAGENET1K_V2
|
| 1253 |
+
|
| 1254 |
+
|
| 1255 |
+
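# A minimal consumption sketch for the WeightsEnum entries above (hedged,
# standard torchvision usage; `img` is an illustrative PIL image, not a name
# from this file). Each entry bundles a checkpoint URL with the evaluation
# transforms its accuracy numbers were measured with:
#
#     weights = ResNet152_Weights.IMAGENET1K_V2
#     model = resnet152(weights=weights).eval()
#     preprocess = weights.transforms()   # resize 232 -> center-crop 224 -> normalize
#     logits = model(preprocess(img).unsqueeze(0))
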
@handle_legacy_interface(weights=("pretrained", ResNet18_Weights.IMAGENET1K_V1))
def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet18_Weights.verify(weights)

    return _resnet(BasicBlock, [2, 2, 2, 2], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet34_Weights.IMAGENET1K_V1))
def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet34_Weights.verify(weights)

    return _resnet(BasicBlock, [3, 4, 6, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1))
def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet50_Weights.verify(weights)

    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1))
def resnet50_lora(*, r: int, lora_alpha: float, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet50_Weights.verify(weights)

    return _resnet_lora(Bottleneck_Lora, [3, 4, 6, 3], r, lora_alpha, weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1))
def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet101_Weights.verify(weights)

    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1))
def resnet101_lora(*, r: int, lora_alpha: float, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet101_Weights.verify(weights)

    return _resnet_lora(Bottleneck_Lora, [3, 4, 23, 3], r, lora_alpha, weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1))
def resnet152(*, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet152_Weights.verify(weights)

    return _resnet(Bottleneck, [3, 8, 36, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1))
def resnet152_lora(*, r: int, lora_alpha: float, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
    weights = ResNet152_Weights.verify(weights)

    return _resnet_lora(Bottleneck_Lora, [3, 8, 36, 3], r, lora_alpha, weights, progress, **kwargs)

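# A hedged fine-tuning sketch for the *_lora builders above, which swap in
# Bottleneck_Lora (defined earlier in this file) via _resnet_lora. It assumes
# the adapter weights carry "lora_" in their parameter names, the loralib
# convention; that naming is not verified here:
#
#     model = resnet50_lora(r=16, lora_alpha=16,
#                           weights=ResNet50_Weights.IMAGENET1K_V2)
#     for name, param in model.named_parameters():
#         param.requires_grad = "lora_" in name   # train only the low-rank factors
#     optimizer = torch.optim.AdamW(
#         [p for p in model.parameters() if p.requires_grad], lr=1e-4)
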
@handle_legacy_interface(weights=("pretrained", ResNeXt50_32X4D_Weights.IMAGENET1K_V1))
def resnext50_32x4d(
    *, weights: Optional[ResNeXt50_32X4D_Weights] = None, progress: bool = True, **kwargs: Any
) -> ResNet:
    weights = ResNeXt50_32X4D_Weights.verify(weights)

    _ovewrite_named_param(kwargs, "groups", 32)
    _ovewrite_named_param(kwargs, "width_per_group", 4)
    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", ResNeXt101_32X8D_Weights.IMAGENET1K_V1))
def resnext101_32x8d(
    *, weights: Optional[ResNeXt101_32X8D_Weights] = None, progress: bool = True, **kwargs: Any
) -> ResNet:
    weights = ResNeXt101_32X8D_Weights.verify(weights)

    _ovewrite_named_param(kwargs, "groups", 32)
    _ovewrite_named_param(kwargs, "width_per_group", 8)
    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)


def resnext101_64x4d(
    *, weights: Optional[ResNeXt101_64X4D_Weights] = None, progress: bool = True, **kwargs: Any
) -> ResNet:
    weights = ResNeXt101_64X4D_Weights.verify(weights)

    _ovewrite_named_param(kwargs, "groups", 64)
    _ovewrite_named_param(kwargs, "width_per_group", 4)
    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", Wide_ResNet50_2_Weights.IMAGENET1K_V1))
def wide_resnet50_2(
    *, weights: Optional[Wide_ResNet50_2_Weights] = None, progress: bool = True, **kwargs: Any
) -> ResNet:
    weights = Wide_ResNet50_2_Weights.verify(weights)

    _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)
    return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)


@handle_legacy_interface(weights=("pretrained", Wide_ResNet101_2_Weights.IMAGENET1K_V1))
def wide_resnet101_2(
    *, weights: Optional[Wide_ResNet101_2_Weights] = None, progress: bool = True, **kwargs: Any
) -> ResNet:
    weights = Wide_ResNet101_2_Weights.verify(weights)

    _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)
    return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)


# The dictionary below is internal implementation detail and will be removed in v0.15
from torchvision.models._utils import _ModelURLs


model_urls = _ModelURLs(
    {
        "resnet18": ResNet18_Weights.IMAGENET1K_V1.url,
        "resnet34": ResNet34_Weights.IMAGENET1K_V1.url,
        "resnet50": ResNet50_Weights.IMAGENET1K_V1.url,
        "resnet101": ResNet101_Weights.IMAGENET1K_V1.url,
        "resnet152": ResNet152_Weights.IMAGENET1K_V1.url,
        "resnext50_32x4d": ResNeXt50_32X4D_Weights.IMAGENET1K_V1.url,
        "resnext101_32x8d": ResNeXt101_32X8D_Weights.IMAGENET1K_V1.url,
        "wide_resnet50_2": Wide_ResNet50_2_Weights.IMAGENET1K_V1.url,
        "wide_resnet101_2": Wide_ResNet101_2_Weights.IMAGENET1K_V1.url,
    }
)


if __name__ == '__main__':
    # WeightsEnum.verify also accepts the string form and resolves it to the
    # enum member, so this is equivalent to passing ResNet50_Weights.IMAGENET1K_V2.
    model = resnet50_lora(r=16, lora_alpha=16, weights='ResNet50_Weights.IMAGENET1K_V2')

models/transformer_decoder.py
ADDED
@@ -0,0 +1,320 @@
import copy
from typing import Optional, List
import pickle as cp

import torch
import torch.nn.functional as F
from torch import nn, Tensor
# from visualizer import get_local


class TransformerDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt
        T, B, C = memory.shape  # sequence-first layout: (seq_len, batch, d_model)
        intermediate = []
        for n, layer in enumerate(self.layers):
            residual = True
            output, ws = layer(output, memory, tgt_mask=tgt_mask,
                               memory_mask=memory_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,
                               memory_key_padding_mask=memory_key_padding_mask,
                               pos=pos, query_pos=query_pos, residual=residual)
            if self.return_intermediate:
                intermediate.append(self.norm(output))
        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)
        return output

class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None,
                     residual=True):
        # tgt: text features, e.g. torch.Size([14, 1, 768])
        # memory: image features, e.g. torch.Size([49, 1, 768])
        # `residual` is accepted for interface compatibility but never read.
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2, ws = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)
        # NOTE: unlike the standard DETR layer, the self-attention output tgt2 is
        # discarded here; there is no `tgt = tgt + self.dropout1(tgt2)` before norm1.
        tgt = self.norm1(tgt)
        tgt2, ws = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                       key=self.with_pos_embed(memory, pos),
                                       need_weights=True,
                                       value=memory, attn_mask=memory_mask,
                                       key_padding_mask=memory_key_padding_mask)

        # cross-attention weights ws: [B, NUM_Q, T]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt, ws

    # @get_local('attn_weights')
    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2, ws = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        # attn_weights are averaged over heads by nn.MultiheadAttention,
        # so their shape is (N, L, S), not (N, num_heads, L, S)
        tgt2, attn_weights = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                                 key=self.with_pos_embed(memory, pos),
                                                 value=memory, attn_mask=memory_mask,
                                                 key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt, attn_weights

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None,
                residual=True):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, residual)

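# A minimal usage sketch for the two classes above (hedged; all sizes are
# illustrative and follow the shape comments in forward_post):
#
#     layer = TransformerDecoderLayer(d_model=768, nhead=8, normalize_before=True)
#     decoder = TransformerDecoder(layer, num_layers=4, norm=nn.LayerNorm(768))
#     tgt = torch.zeros(14, 1, 768)     # label/text queries (L, B, C)
#     memory = torch.randn(49, 1, 768)  # image patch tokens (S, B, C)
#     out = decoder(tgt, memory)        # -> (14, 1, 768)
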
class TransformerDecoderV1(nn.Module):
    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                memory_global,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt
        T, B, C = memory.shape
        intermediate = []
        for n, layer in enumerate(self.layers):
            residual = True
            output, ws = layer(output, memory,
                               memory_global,
                               tgt_mask=tgt_mask,
                               memory_mask=memory_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,
                               memory_key_padding_mask=memory_key_padding_mask,
                               pos=pos, query_pos=query_pos, residual=residual)

            if self.return_intermediate:
                intermediate.append(self.norm(output))
        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        # unlike TransformerDecoder, this variant also returns the last
        # layer's (blended) cross-attention weights
        return output, ws


class TransformerDecoderLayerV1(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0.1,
                 activation="relu", normalize_before=False, lam=[1, 0]):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        # fixed mixing coefficients: lam[0] weights the local (patch-level)
        # cross-attention branch, lam[1] the global branch
        self.lam_l = lam[0]
        self.lam_g = lam[1]

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     memory_global,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None,
                     residual=True):
        # tgt: text features, e.g. torch.Size([14, 1, 768])
        # memory: image features, e.g. torch.Size([49, 1, 768])
        # memory_global is unused in the post-norm path.
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2, ws = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)
        # As in TransformerDecoderLayer.forward_post, the self-attention output
        # is discarded before norm1.
        tgt = self.norm1(tgt)
        tgt2, ws = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                       key=self.with_pos_embed(memory, pos),
                                       need_weights=True,
                                       value=memory, attn_mask=memory_mask,
                                       key_padding_mask=memory_key_padding_mask)

        # cross-attention weights ws: [B, NUM_Q, T]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt, ws

    # @get_local('attn_weights')
    def forward_pre(self, tgt, memory,
                    memory_global,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2, ws = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        if memory.shape[0] == 1:
            # a single memory token: only the fine-grained branch applies
            tgt2, attn_weights = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                                     key=self.with_pos_embed(memory, pos),
                                                     value=memory, attn_mask=memory_mask,
                                                     key_padding_mask=memory_key_padding_mask)
        else:
            tgt2_fine, attn_weights = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                                          key=self.with_pos_embed(memory, pos),
                                                          value=memory, attn_mask=memory_mask,
                                                          key_padding_mask=memory_key_padding_mask)
            tgt2_global, attn_weights_global = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                                                   key=self.with_pos_embed(memory_global, pos),
                                                                   value=memory_global, attn_mask=memory_mask,
                                                                   key_padding_mask=memory_key_padding_mask)
            # blend the local and global branches with the fixed lam weights
            tgt2 = tgt2_fine * self.lam_l + tgt2_global * self.lam_g
            attn_weights = attn_weights * self.lam_l + attn_weights_global * self.lam_g

        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)

        return tgt, attn_weights

    def forward(self, tgt, memory,
                memory_global,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None,
                residual=True):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, memory_global, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, memory_global, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, residual)

def _get_clones(module, N):
    # N independent deep copies, so the stacked layers do not share weights
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
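# A hedged end-to-end sketch of the global + local fusion implemented by the V1
# classes above. The sizes and the lam split are illustrative choices, not
# values taken from this repository.
if __name__ == '__main__':
    layer = TransformerDecoderLayerV1(d_model=768, nhead=8,
                                      normalize_before=True, lam=[0.9, 0.1])
    decoder = TransformerDecoderV1(layer, num_layers=4, norm=nn.LayerNorm(768))
    tgt = torch.zeros(14, 1, 768)     # 14 label queries (L, B, C)
    memory = torch.randn(49, 1, 768)  # 49 patch tokens (S, B, C)
    # Broadcast a pooled global feature to memory's length so the fine and
    # global attention maps can be blended elementwise in forward_pre.
    memory_global = memory.mean(dim=0, keepdim=True).expand_as(memory)
    out, attn = decoder(tgt, memory, memory_global)
    print(out.shape, attn.shape)      # torch.Size([14, 1, 768]) torch.Size([1, 14, 49])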