diff --git a/ckpts/lllyasviel/Annotators/body_pose_model.pth b/ckpts/lllyasviel/Annotators/body_pose_model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9acb77e68f31906a8875f1daef2f3f7ef94acb1e
--- /dev/null
+++ b/ckpts/lllyasviel/Annotators/body_pose_model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
+size 209267595
diff --git a/src/custom_mesh_graphormer/modeling/data/J_regressor_extra.npy b/src/custom_mesh_graphormer/modeling/data/J_regressor_extra.npy
new file mode 100644
index 0000000000000000000000000000000000000000..c15c7c4294d859ee037404876073a969c0da5524
--- /dev/null
+++ b/src/custom_mesh_graphormer/modeling/data/J_regressor_extra.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40dfaa71fcc7eed6966a6ed046311b7e8ea0eb9a5172b298e3df6fc4b6ec0eb0
+size 771808
diff --git a/src/custom_mesh_graphormer/modeling/data/J_regressor_h36m_correct.npy b/src/custom_mesh_graphormer/modeling/data/J_regressor_h36m_correct.npy
new file mode 100644
index 0000000000000000000000000000000000000000..dff7bedc5d08289a308299a6c82df39484e4b62b
--- /dev/null
+++ b/src/custom_mesh_graphormer/modeling/data/J_regressor_h36m_correct.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1835d64133d5f66bd80a814ab1c1dc0900ef01950f568320acf5f9390c1f2c8c
+size 937168
diff --git a/src/custom_timm/data/__init__.py b/src/custom_timm/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eb10a660c1195250fc418884fc93482efd4f144
--- /dev/null
+++ b/src/custom_timm/data/__init__.py
@@ -0,0 +1,13 @@
+from .auto_augment import RandAugment, AutoAugment, rand_augment_ops, auto_augment_policy,\
+    rand_augment_transform, auto_augment_transform
+from .config import resolve_data_config
+from .constants import *
+from .dataset import ImageDataset, IterableImageDataset, AugMixDataset
+from .dataset_factory import create_dataset
+from .loader import create_loader
+from .mixup import Mixup, FastCollateMixup
+from .parsers import create_parser,\
+    get_img_extensions, is_img_extension, set_img_extensions, add_img_extensions, del_img_extensions
+from .real_labels import RealLabelsImagenet
+from .transforms import *
+from .transforms_factory import create_transform
diff --git a/src/custom_timm/data/random_erasing.py b/src/custom_timm/data/random_erasing.py
new file mode 100644
index 0000000000000000000000000000000000000000..98108488da5392787d6502e2d21487259fe8c5e3
--- /dev/null
+++ b/src/custom_timm/data/random_erasing.py
@@ -0,0 +1,103 @@
+""" Random Erasing (Cutout)
+
+Originally inspired by impl at https://github.com/zhunzhong07/Random-Erasing, Apache 2.0
+Copyright Zhun Zhong & Liang Zheng
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+import random
+import math
+import torch
+
+
+def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'):
+    # Builds the fill values for an erased patch; patch_size[0] is used as the channel count.
+    # NOTE: CUDA illegal memory access errors were once seen on the normal_() paths (run normal
+    # on CPU if that recurs); reported fixed in https://github.com/pytorch/pytorch/issues/19508
+    if per_pixel:  # independent normal sample for every element of the patch
+        return torch.empty(patch_size, dtype=dtype, device=device).normal_()
+    elif rand_color:  # one normal sample per channel, shape (C, 1, 1)
+        return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_()
+    else:  # constant zero fill, shape (C, 1, 1)
+        return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device)
+
+
+class RandomErasing:
+    """ Randomly selects a rectangle region in an image and erases its pixels.
+        'Random Erasing Data Augmentation' by Zhong et al., see https://arxiv.org/pdf/1708.04896.pdf
+
+        This variant of RandomErasing is intended to be applied to either a batch
+        or single image tensor after it has been normalized by dataset mean and std.
+    Args:
+         probability: Probability that the Random Erasing operation will be performed.
+         min_area, max_area: Minimum / maximum fraction of the input image area that is erased.
+         min_aspect, max_aspect: Aspect ratio range of the erased area; max_aspect defaults to 1 / min_aspect.
+         mode: pixel color mode, one of 'const', 'rand', or 'pixel'
+            'const' - erase block is constant color of 0 for all channels
+            'rand'  - erase block is same per-channel random (normal) color
+            'pixel' - erase block is per-pixel random (normal) color
+         min_count, max_count: range of erasing blocks per image (max_count defaults to min_count);
+            per-image count is randomly chosen within it and the area per box is divided by the count.
+         num_splits: when > 1, the first batch_size // num_splits samples of a batch are left untouched.
+         device: device on which the replacement pixel values are created.
+    """
+
+    def __init__(
+            self,
+            probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None,
+            mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'):
+        self.probability = probability
+        self.min_area = min_area
+        self.max_area = max_area
+        max_aspect = max_aspect or 1 / min_aspect  # reciprocal of min_aspect unless given
+        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))  # sampled in log space by _erase
+        self.min_count = min_count
+        self.max_count = max_count or min_count  # a falsy max_count means a fixed count of min_count
+        self.num_splits = num_splits
+        self.mode = mode.lower()
+        self.rand_color = False
+        self.per_pixel = False
+        if self.mode == 'rand':
+            self.rand_color = True  # per block random normal
+        elif self.mode == 'pixel':
+            self.per_pixel = True  # per pixel random normal
+        else:
+            assert not self.mode or self.mode == 'const'  # empty mode is treated like 'const'
+        self.device = device
+
+    def _erase(self, img, chan, img_h, img_w, dtype):  # modifies img in place, returns None
+        if random.random() > self.probability:
+            return
+        area = img_h * img_w
+        count = self.min_count if self.min_count == self.max_count else \
+            random.randint(self.min_count, self.max_count)
+        for _ in range(count):
+            for attempt in range(10):  # up to 10 tries per box; the box is skipped if none fits
+                target_area = random.uniform(self.min_area, self.max_area) * area / count
+                aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+                h = int(round(math.sqrt(target_area * aspect_ratio)))
+                w = int(round(math.sqrt(target_area / aspect_ratio)))
+                if w < img_w and h < img_h:  # box must be strictly smaller than the image
+                    top = random.randint(0, img_h - h)
+                    left = random.randint(0, img_w - w)
+                    img[:, top:top + h, left:left + w] = _get_pixels(
+                        self.per_pixel, self.rand_color, (chan, h, w),
+                        dtype=dtype, device=self.device)
+                    break
+
+    def __call__(self, input):
+        if len(input.size()) == 3:  # single image, size unpacked as (chan, img_h, img_w)
+            self._erase(input, *input.size(), input.dtype)
+        else:  # otherwise the size is unpacked as a 4-d batch
+            batch_size, chan, img_h, img_w = input.size()
+            # skip first slice of batch if num_splits is set (for clean portion of samples)
+            batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0
+            for i in range(batch_start, batch_size):
+                self._erase(input[i], chan, img_h, img_w, input.dtype)
+        return input  # the same tensor object that was passed in
+
+    def __repr__(self):
+        # NOTE simplified state for repr
+        fs = self.__class__.__name__ + f'(p={self.probability}, mode={self.mode}'
+        fs += f', count=({self.min_count}, {self.max_count}))'
+        return fs
diff --git a/src/custom_timm/data/real_labels.py b/src/custom_timm/data/real_labels.py
new file mode 100644
index 0000000000000000000000000000000000000000..939c34867e7915ce3e4cc7da04a5bc1653ec4f2c
--- /dev/null
+++ b/src/custom_timm/data/real_labels.py
@@ -0,0 +1,42 @@
+""" Real labels evaluator for ImageNet
+Paper: `Are we done with ImageNet?` - https://arxiv.org/abs/2006.07159
+Based on Numpy example at https://github.com/google-research/reassessed-imagenet
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import os
+import json
+import numpy as np
+
+
+class RealLabelsImagenet:  # accumulates top-k correctness of predictions against the 'real' label sets
+
+    def __init__(self, filenames, real_json='real.json', topk=(1, 5)):
+        with open(real_json) as real_labels:
+            real_labels = json.load(real_labels)  # entry i holds the labels of validation image i + 1
+            real_labels = {f'ILSVRC2012_val_{i + 1:08d}.JPEG': labels for i, labels in enumerate(real_labels)}
+        self.real_labels = real_labels
+        self.filenames = filenames
+        assert len(self.filenames) == len(self.real_labels)
+        self.topk = topk
+        self.is_correct = {k: [] for k in topk}  # per-k list of booleans, one per scored sample
+        self.sample_idx = 0  # position in self.filenames of the next prediction to score
+
+    def add_result(self, output):
+        maxk = max(self.topk)
+        _, pred_batch = output.topk(maxk, 1, True, True)  # indices of the maxk highest scores along dim 1
+        pred_batch = pred_batch.cpu().numpy()
+        for pred in pred_batch:
+            filename = self.filenames[self.sample_idx]
+            filename = os.path.basename(filename)  # label keys are bare file names
+            if self.real_labels[filename]:  # samples with an empty label list are not scored
+                for k in self.topk:
+                    self.is_correct[k].append(
+                        any([p in self.real_labels[filename] for p in pred[:k]]))
+            self.sample_idx += 1  # outputs must arrive in the same order as self.filenames
+
+    def get_accuracy(self, k=None):
+        if k is None:  # report every configured k as {k: percent}
+            return {k: float(np.mean(self.is_correct[k])) * 100 for k in self.topk}
+        else:  # percent for the single requested k
+            return float(np.mean(self.is_correct[k])) * 100
diff --git a/src/custom_timm/data/tf_preprocessing.py b/src/custom_timm/data/tf_preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..44b4a3af7372c6865b1cdddda0a8da0ccc6b93a0
--- /dev/null
+++ b/src/custom_timm/data/tf_preprocessing.py
@@ -0,0 +1,232 @@
+""" Tensorflow Preprocessing Adapter
+
+Allows use of Tensorflow preprocessing pipeline in PyTorch Transform
+
+Copyright of original Tensorflow code below.
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ImageNet preprocessing for MnasNet."""
+import tensorflow as tf
+import numpy as np
+
+IMAGE_SIZE = 224
+CROP_PADDING = 32
+
+
+def distorted_bounding_box_crop(image_bytes,
+                                bbox,
+                                min_object_covered=0.1,
+                                aspect_ratio_range=(0.75, 1.33),
+                                area_range=(0.05, 1.0),
+                                max_attempts=100,
+                                scope=None):
+    """Generates cropped_image using one of the bboxes randomly distorted.
+
+    See `tf.image.sample_distorted_bounding_box` for more documentation.
+
+    Args:
+      image_bytes: `Tensor` of binary image data.
+      bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]`
+          where each coordinate is [0, 1) and the coordinates are arranged
+          as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole
+          image.
+      min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
+          area of the image must contain at least this fraction of any bounding
+          box supplied.
+      aspect_ratio_range: An optional list of `float`s. The cropped area of the
+          image must have an aspect ratio = width / height within this range.
+      area_range: An optional list of `float`s. The cropped area of the image
+          must contain a fraction of the supplied image within this range.
+      max_attempts: An optional `int`. Number of attempts at generating a cropped
+          region of the image of the specified constraints. After `max_attempts`
+          failures, return the entire image.
+      scope: Optional `str` for name scope.
+    Returns:
+      cropped image `Tensor`
+    """
+    with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]):
+        shape = tf.image.extract_jpeg_shape(image_bytes)  # read from the encoded bytes, before decoding
+        sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+            shape,
+            bounding_boxes=bbox,
+            min_object_covered=min_object_covered,
+            aspect_ratio_range=aspect_ratio_range,
+            area_range=area_range,
+            max_attempts=max_attempts,
+            use_image_if_no_bounding_boxes=True)
+        bbox_begin, bbox_size, _ = sample_distorted_bounding_box  # third element is not used here
+
+        # Crop the image to the specified bounding box.
+        offset_y, offset_x, _ = tf.unstack(bbox_begin)
+        target_height, target_width, _ = tf.unstack(bbox_size)
+        crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+        image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)  # decode + crop in one op
+
+        return image
+
+
+def _at_least_x_are_equal(a, b, x):
+    """Return a boolean `Tensor`: whether at least `x` elements of `a` and `b` compare equal."""
+    match = tf.equal(a, b)  # element-wise comparison
+    match = tf.cast(match, tf.int32)  # booleans -> 0/1 so they can be summed
+    return tf.greater_equal(tf.reduce_sum(match), x)
+
+
+def _decode_and_random_crop(image_bytes, image_size, resize_method):
+    """Make a random crop resized to image_size; fall back to a center crop if the crop is the whole image."""
+    bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])  # one box covering everything
+    image = distorted_bounding_box_crop(
+        image_bytes,
+        bbox,
+        min_object_covered=0.1,
+        aspect_ratio_range=(3. / 4, 4. / 3.),
+        area_range=(0.08, 1.0),
+        max_attempts=10,
+        scope=None)
+    original_shape = tf.image.extract_jpeg_shape(image_bytes)
+    bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3)  # all 3 dims match: nothing was cropped
+
+    image = tf.cond(
+        bad,
+        lambda: _decode_and_center_crop(image_bytes, image_size, resize_method),  # resize_method is required
+        lambda: tf.image.resize([image], [image_size, image_size], resize_method)[0])
+
+    return image
+
+
+def _decode_and_center_crop(image_bytes, image_size, resize_method):
+    """Crops a centered square (shrunk by the CROP_PADDING ratio) and resizes it to image_size."""
+    shape = tf.image.extract_jpeg_shape(image_bytes)
+    image_height = shape[0]
+    image_width = shape[1]
+
+    padded_center_crop_size = tf.cast(  # shorter side * image_size / (image_size + CROP_PADDING)
+        ((image_size / (image_size + CROP_PADDING)) *
+         tf.cast(tf.minimum(image_height, image_width), tf.float32)),
+        tf.int32)
+
+    offset_height = ((image_height - padded_center_crop_size) + 1) // 2  # centers the window vertically
+    offset_width = ((image_width - padded_center_crop_size) + 1) // 2  # centers the window horizontally
+    crop_window = tf.stack([offset_height, offset_width,
+                            padded_center_crop_size, padded_center_crop_size])
+    image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+    image = tf.image.resize([image], [image_size, image_size], resize_method)[0]  # batch of one, then unwrap
+
+    return image
+
+
+def _flip(image):
+    """Random horizontal image flip (delegates to `tf.image.random_flip_left_right`)."""
+    image = tf.image.random_flip_left_right(image)
+    return image
+
+
+def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'):
+    """Preprocesses the given image for training (random crop + random horizontal flip).
+
+    Args:
+      image_bytes: `Tensor` representing an image binary of arbitrary size.
+      use_bfloat16: `bool` for whether to use bfloat16.
+      image_size: image size.
+      interpolation: image interpolation method
+
+    Returns:
+      A preprocessed image `Tensor`.
+    """
+    resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR
+    image = _decode_and_random_crop(image_bytes, image_size, resize_method)
+    image = _flip(image)
+    image = tf.reshape(image, [image_size, image_size, 3])  # pins the static shape to (size, size, 3)
+    image = tf.image.convert_image_dtype(
+        image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+    return image
+
+
+def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'):
+    """Preprocesses the given image for evaluation (deterministic center crop, no flip).
+
+    Args:
+      image_bytes: `Tensor` representing an image binary of arbitrary size.
+      use_bfloat16: `bool` for whether to use bfloat16.
+      image_size: image size.
+      interpolation: image interpolation method; any value other than 'bicubic' selects bilinear
+
+    Returns:
+      A preprocessed image `Tensor`.
+    """
+    resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR
+    image = _decode_and_center_crop(image_bytes, image_size, resize_method)
+    image = tf.reshape(image, [image_size, image_size, 3])  # pins the static shape to (size, size, 3)
+    image = tf.image.convert_image_dtype(
+        image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+    return image
+
+
+def preprocess_image(image_bytes,
+                     is_training=False,
+                     use_bfloat16=False,
+                     image_size=IMAGE_SIZE,
+                     interpolation='bicubic'):
+    """Preprocesses the given image with the training or the evaluation pipeline.
+
+    Args:
+      image_bytes: `Tensor` representing an image binary of arbitrary size.
+      is_training: `bool` for whether the preprocessing is for training.
+      use_bfloat16: `bool` for whether to use bfloat16.
+      image_size: image size.
+      interpolation: image interpolation method
+
+    Returns:
+      A preprocessed image `Tensor` with value range of [0, 255].
+    """
+    if is_training:  # random crop + flip
+        return preprocess_for_train(image_bytes, use_bfloat16, image_size, interpolation)
+    else:  # center crop only
+        return preprocess_for_eval(image_bytes, use_bfloat16, image_size, interpolation)
+
+
+class TfPreprocessTransform:  # callable: encoded image bytes -> uint8 CHW numpy array via a TF graph
+
+    def __init__(self, is_training=False, size=224, interpolation='bicubic'):
+        self.is_training = is_training
+        self.size = size[0] if isinstance(size, (tuple, list)) else size  # square output; list sizes come from create_transform
+        self.interpolation = interpolation
+        self._image_bytes = None  # placeholder, assigned by _build_tf_graph
+        self.process_image = self._build_tf_graph()
+        self.sess = None  # session is created on first call
+
+    def _build_tf_graph(self):
+        with tf.device('/cpu:0'):
+            self._image_bytes = tf.placeholder(  # NOTE(review): TF1-style graph API - confirm TF version in use
+                shape=[],
+                dtype=tf.string,
+            )
+            img = preprocess_image(
+                self._image_bytes, self.is_training, False, self.size, self.interpolation)
+        return img
+
+    def __call__(self, image_bytes):
+        if self.sess is None:
+            self.sess = tf.Session()
+        img = self.sess.run(self.process_image, feed_dict={self._image_bytes: image_bytes})
+        img = img.round().clip(0, 255).astype(np.uint8)  # quantize the float output to uint8
+        if img.ndim < 3:
+            img = np.expand_dims(img, axis=-1)
+        img = np.rollaxis(img, 2)  # HWC to CHW
+        return img
diff --git a/src/custom_timm/data/transforms.py b/src/custom_timm/data/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb3bc32768f8c153233dc5bf7aa19dff9a80d39
--- /dev/null
+++ b/src/custom_timm/data/transforms.py
@@ -0,0 +1,197 @@
+import torch
+import torchvision.transforms.functional as F
+try:
+    from torchvision.transforms.functional import InterpolationMode
+    has_interpolation_mode = True
+except ImportError:
+    has_interpolation_mode = False
+from PIL import Image
+import warnings
+import math
+import random
+import numpy as np
+
+
+class ToNumpy:  # transform: PIL image -> uint8 numpy array in CHW layout
+
+    def __call__(self, pil_img):
+        np_img = np.array(pil_img, dtype=np.uint8)
+        if np_img.ndim < 3:  # 2-d input gets a trailing channel axis
+            np_img = np.expand_dims(np_img, axis=-1)
+        np_img = np.rollaxis(np_img, 2)  # HWC to CHW
+        return np_img
+
+
+class ToTensor:  # transform: PIL image -> CHW torch tensor cast to `dtype` (values are not rescaled)
+
+    def __init__(self, dtype=torch.float32):
+        self.dtype = dtype
+
+    def __call__(self, pil_img):
+        np_img = np.array(pil_img, dtype=np.uint8)
+        if np_img.ndim < 3:  # 2-d input gets a trailing channel axis
+            np_img = np.expand_dims(np_img, axis=-1)
+        np_img = np.rollaxis(np_img, 2)  # HWC to CHW
+        return torch.from_numpy(np_img).to(dtype=self.dtype)
+
+
+# Pillow is deprecating the top-level resampling attributes (e.g., Image.BILINEAR) in
+# favor of the Image.Resampling enum. The top-level resampling attributes will be
+# removed in Pillow 10.
+if hasattr(Image, "Resampling"):  # Pillow exposing the Image.Resampling enum
+    _pil_interpolation_to_str = {
+        Image.Resampling.NEAREST: 'nearest',
+        Image.Resampling.BILINEAR: 'bilinear',
+        Image.Resampling.BICUBIC: 'bicubic',
+        Image.Resampling.BOX: 'box',
+        Image.Resampling.HAMMING: 'hamming',
+        Image.Resampling.LANCZOS: 'lanczos',
+    }
+else:  # Pillow that only has the top-level constants
+    _pil_interpolation_to_str = {
+        Image.NEAREST: 'nearest',
+        Image.BILINEAR: 'bilinear',
+        Image.BICUBIC: 'bicubic',
+        Image.BOX: 'box',
+        Image.HAMMING: 'hamming',
+        Image.LANCZOS: 'lanczos',
+    }
+
+_str_to_pil_interpolation = {b: a for a, b in _pil_interpolation_to_str.items()}  # reverse lookup: name -> PIL mode
+
+
+if has_interpolation_mode:  # torchvision provides InterpolationMode
+    _torch_interpolation_to_str = {
+        InterpolationMode.NEAREST: 'nearest',
+        InterpolationMode.BILINEAR: 'bilinear',
+        InterpolationMode.BICUBIC: 'bicubic',
+        InterpolationMode.BOX: 'box',
+        InterpolationMode.HAMMING: 'hamming',
+        InterpolationMode.LANCZOS: 'lanczos',
+    }
+    _str_to_torch_interpolation = {b: a for a, b in _torch_interpolation_to_str.items()}  # name -> InterpolationMode
+else:  # no InterpolationMode: the helper functions below use the PIL tables instead
+    _pil_interpolation_to_torch = {}  # NOTE(review): not referenced anywhere in the visible code
+    _torch_interpolation_to_str = {}
+
+
+def str_to_pil_interp(mode_str):  # name such as 'bilinear' -> PIL resampling constant
+    return _str_to_pil_interpolation[mode_str]  # KeyError for an unknown name
+
+
+def str_to_interp_mode(mode_str):  # name -> torchvision InterpolationMode, or PIL constant as fallback
+    if has_interpolation_mode:
+        return _str_to_torch_interpolation[mode_str]
+    else:  # torchvision without InterpolationMode
+        return _str_to_pil_interpolation[mode_str]
+
+
+def interp_mode_to_str(mode):  # inverse of str_to_interp_mode: mode object -> name
+    if has_interpolation_mode:
+        return _torch_interpolation_to_str[mode]
+    else:  # torchvision without InterpolationMode
+        return _pil_interpolation_to_str[mode]
+
+
+_RANDOM_INTERPOLATION = (str_to_interp_mode('bilinear'), str_to_interp_mode('bicubic'))  # candidates for interpolation='random'
+
+
+class RandomResizedCropAndInterpolation:
+    """Crop the given PIL Image to random size and aspect ratio with random interpolation.
+
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
+    is finally resized to given size.
+    This is popularly used to train the Inception networks.
+
+    Args:
+        size: expected output size of each edge
+        scale: range of size of the origin size cropped
+        ratio: range of aspect ratio of the origin aspect ratio cropped
+        interpolation: mode name (e.g. 'bilinear', 'bicubic'), or 'random' to choose bilinear/bicubic per call
+    """
+
+    def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
+                 interpolation='bilinear'):
+        if isinstance(size, (list, tuple)):
+            self.size = tuple(size)
+        else:  # a scalar size means a square output
+            self.size = (size, size)
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("range should be of kind (min, max)")  # warns only; the values are still used
+
+        if interpolation == 'random':
+            self.interpolation = _RANDOM_INTERPOLATION  # tuple; one entry is picked in __call__
+        else:
+            self.interpolation = str_to_interp_mode(interpolation)
+        self.scale = scale
+        self.ratio = ratio
+
+    @staticmethod
+    def get_params(img, scale, ratio):
+        """Get parameters for ``crop`` for a random sized crop.
+
+        Args:
+            img (PIL Image): Image to be cropped.
+            scale (tuple): range of size of the origin size cropped
+            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
+
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+                sized crop.
+        """
+        area = img.size[0] * img.size[1]
+
+        for attempt in range(10):  # up to 10 random tries before the central-crop fallback
+            target_area = random.uniform(*scale) * area
+            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))  # aspect ratio sampled in log space
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            if w <= img.size[0] and h <= img.size[1]:  # img.size[0] bounds w, img.size[1] bounds h
+                i = random.randint(0, img.size[1] - h)
+                j = random.randint(0, img.size[0] - w)
+                return i, j, h, w
+
+        # Fallback to central crop
+        in_ratio = img.size[0] / img.size[1]
+        if in_ratio < min(ratio):  # image narrower than allowed: keep full width, derive height
+            w = img.size[0]
+            h = int(round(w / min(ratio)))
+        elif in_ratio > max(ratio):  # image wider than allowed: keep full height, derive width
+            h = img.size[1]
+            w = int(round(h * max(ratio)))
+        else:  # whole image
+            w = img.size[0]
+            h = img.size[1]
+        i = (img.size[1] - h) // 2
+        j = (img.size[0] - w) // 2
+        return i, j, h, w
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL Image): Image to be cropped and resized.
+
+        Returns:
+            PIL Image: Randomly cropped and resized image.
+        """
+        i, j, h, w = self.get_params(img, self.scale, self.ratio)
+        if isinstance(self.interpolation, (tuple, list)):  # 'random' mode: pick one candidate per call
+            interpolation = random.choice(self.interpolation)
+        else:
+            interpolation = self.interpolation
+        return F.resized_crop(img, i, j, h, w, self.size, interpolation)
+
+    def __repr__(self):
+        if isinstance(self.interpolation, (tuple, list)):  # list every candidate mode, space separated
+            interpolate_str = ' '.join([interp_mode_to_str(x) for x in self.interpolation])
+        else:
+            interpolate_str = interp_mode_to_str(self.interpolation)
+        format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
+        format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
+        format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
+        format_string += ', interpolation={0})'.format(interpolate_str)
+        return format_string
diff --git a/src/custom_timm/data/transforms_factory.py b/src/custom_timm/data/transforms_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f05dbf8393d94f41999cfa599b3e4bdf80f8e6
--- /dev/null
+++ b/src/custom_timm/data/transforms_factory.py
@@ -0,0 +1,236 @@
+""" Transforms Factory
+Factory methods for building image transforms for use with TIMM (PyTorch Image Models)
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+import math
+
+import torch
+from torchvision import transforms
+
+from custom_timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
+from custom_timm.data.auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform
+from custom_timm.data.transforms import str_to_interp_mode, str_to_pil_interp, RandomResizedCropAndInterpolation, ToNumpy
+from custom_timm.data.random_erasing import RandomErasing
+
+
+def transforms_noaug_train(  # training transform without augmentation: resize + center crop only
+        img_size=224,
+        interpolation='bilinear',
+        use_prefetcher=False,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+):
+    if interpolation == 'random':
+        # random interpolation not supported with no-aug
+        interpolation = 'bilinear'
+    tfl = [
+        transforms.Resize(img_size, interpolation=str_to_interp_mode(interpolation)),
+        transforms.CenterCrop(img_size)
+    ]
+    if use_prefetcher:
+        # prefetcher and collate will handle tensor conversion and norm
+        tfl += [ToNumpy()]
+    else:  # convert to a tensor and normalize here
+        tfl += [
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=torch.tensor(mean),
+                std=torch.tensor(std))
+        ]
+    return transforms.Compose(tfl)
+
+
+def transforms_imagenet_train(
+        img_size=224,
+        scale=None,
+        ratio=None,
+        hflip=0.5,
+        vflip=0.,
+        color_jitter=0.4,
+        auto_augment=None,
+        interpolation='random',
+        use_prefetcher=False,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        re_prob=0.,
+        re_mode='const',
+        re_count=1,
+        re_num_splits=0,
+        separate=False,
+):
+    """
+    If separate==True, the transforms are returned as a tuple of 3 separate transforms
+    for use in a mixing dataset that passes
+     * all data through the first (primary) transform, called the 'clean' data
+     * a portion of the data through the secondary transform
+     * normalizes and converts the branches above with the third, final transform
+    Otherwise the three stages are concatenated into a single composed transform.
+    """
+    scale = tuple(scale or (0.08, 1.0))  # default imagenet scale range
+    ratio = tuple(ratio or (3./4., 4./3.))  # default imagenet ratio range
+    primary_tfl = [  # stage 1: random resized crop plus optional flips
+        RandomResizedCropAndInterpolation(img_size, scale=scale, ratio=ratio, interpolation=interpolation)]
+    if hflip > 0.:
+        primary_tfl += [transforms.RandomHorizontalFlip(p=hflip)]
+    if vflip > 0.:
+        primary_tfl += [transforms.RandomVerticalFlip(p=vflip)]
+
+    secondary_tfl = []  # stage 2: auto-augment policy, or color jitter when no policy is given
+    if auto_augment:
+        assert isinstance(auto_augment, str)
+        if isinstance(img_size, (tuple, list)):
+            img_size_min = min(img_size)
+        else:
+            img_size_min = img_size
+        aa_params = dict(
+            translate_const=int(img_size_min * 0.45),
+            img_mean=tuple([min(255, round(255 * x)) for x in mean]),  # mean rescaled to 0..255 ints
+        )
+        if interpolation and interpolation != 'random':  # 'random' leaves the policy's own default in place
+            aa_params['interpolation'] = str_to_pil_interp(interpolation)
+        if auto_augment.startswith('rand'):  # policy family is selected by the string prefix
+            secondary_tfl += [rand_augment_transform(auto_augment, aa_params)]
+        elif auto_augment.startswith('augmix'):
+            aa_params['translate_pct'] = 0.3
+            secondary_tfl += [augment_and_mix_transform(auto_augment, aa_params)]
+        else:
+            secondary_tfl += [auto_augment_transform(auto_augment, aa_params)]
+    elif color_jitter is not None:
+        # color jitter is enabled when not using AA
+        if isinstance(color_jitter, (list, tuple)):
+            # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation
+            # or 4 if also augmenting hue
+            assert len(color_jitter) in (3, 4)
+        else:
+            # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue
+            color_jitter = (float(color_jitter),) * 3
+        secondary_tfl += [transforms.ColorJitter(*color_jitter)]
+
+    final_tfl = []  # stage 3: conversion, normalization and optional random erasing
+    if use_prefetcher:
+        # prefetcher and collate will handle tensor conversion and norm
+        final_tfl += [ToNumpy()]
+    else:
+        final_tfl += [
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=torch.tensor(mean),
+                std=torch.tensor(std))
+        ]
+        if re_prob > 0.:  # random erasing is only added on the non-prefetcher path
+            final_tfl.append(
+                RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu'))
+
+    if separate:
+        return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl)
+    else:
+        return transforms.Compose(primary_tfl + secondary_tfl + final_tfl)
+
+
+def transforms_imagenet_eval(  # evaluation transform: resize to img_size / crop_pct, then center crop
+        img_size=224,
+        crop_pct=None,
+        interpolation='bilinear',
+        use_prefetcher=False,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD):
+    crop_pct = crop_pct or DEFAULT_CROP_PCT  # a falsy crop_pct selects the package default
+
+    if isinstance(img_size, (tuple, list)):
+        assert len(img_size) == 2
+        if img_size[-1] == img_size[-2]:
+            # fall-back to older behaviour so Resize scales to shortest edge if target is square
+            scale_size = int(math.floor(img_size[0] / crop_pct))
+        else:  # non-square target: scale each edge separately
+            scale_size = tuple([int(x / crop_pct) for x in img_size])
+    else:
+        scale_size = int(math.floor(img_size / crop_pct))
+
+    tfl = [
+        transforms.Resize(scale_size, interpolation=str_to_interp_mode(interpolation)),
+        transforms.CenterCrop(img_size),
+    ]
+    if use_prefetcher:
+        # prefetcher and collate will handle tensor conversion and norm
+        tfl += [ToNumpy()]
+    else:  # convert to a tensor and normalize here
+        tfl += [
+            transforms.ToTensor(),
+            transforms.Normalize(
+                     mean=torch.tensor(mean),
+                     std=torch.tensor(std))
+        ]
+
+    return transforms.Compose(tfl)
+
+
+def create_transform(  # dispatches to the TF, no-aug train, augmented train or eval transform builder
+        input_size,
+        is_training=False,
+        use_prefetcher=False,
+        no_aug=False,
+        scale=None,
+        ratio=None,
+        hflip=0.5,
+        vflip=0.,
+        color_jitter=0.4,
+        auto_augment=None,
+        interpolation='bilinear',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        re_prob=0.,
+        re_mode='const',
+        re_count=1,
+        re_num_splits=0,
+        crop_pct=None,
+        tf_preprocessing=False,
+        separate=False):
+
+    if isinstance(input_size, (tuple, list)):
+        img_size = input_size[-2:]  # keep only the last two entries of input_size
+    else:
+        img_size = input_size
+
+    if tf_preprocessing and use_prefetcher:  # the TF pipeline is only used together with the prefetcher
+        assert not separate, "Separate transforms not supported for TF preprocessing"
+        from custom_timm.data.tf_preprocessing import TfPreprocessTransform  # lazy: tensorflow needed only here
+        transform = TfPreprocessTransform(
+            is_training=is_training, size=img_size, interpolation=interpolation)
+    else:
+        if is_training and no_aug:
+            assert not separate, "Cannot perform split augmentation with no_aug"
+            transform = transforms_noaug_train(
+                img_size,
+                interpolation=interpolation,
+                use_prefetcher=use_prefetcher,
+                mean=mean,
+                std=std)
+        elif is_training:  # full augmentation; may return a 3-tuple when separate=True
+            transform = transforms_imagenet_train(
+                img_size,
+                scale=scale,
+                ratio=ratio,
+                hflip=hflip,
+                vflip=vflip,
+                color_jitter=color_jitter,
+                auto_augment=auto_augment,
+                interpolation=interpolation,
+                use_prefetcher=use_prefetcher,
+                mean=mean,
+                std=std,
+                re_prob=re_prob,
+                re_mode=re_mode,
+                re_count=re_count,
+                re_num_splits=re_num_splits,
+                separate=separate)
+        else:  # evaluation / validation
+            assert not separate, "Separate transforms not supported for validation preprocessing"
+            transform = transforms_imagenet_eval(
+                img_size,
+                interpolation=interpolation,
+                use_prefetcher=use_prefetcher,
+                mean=mean,
+                std=std,
+                crop_pct=crop_pct)
+
+    return transform
diff --git a/src/custom_timm/loss/__init__.py b/src/custom_timm/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f15f2f79673c962f68d6d4b06898e73ac1df6
--- /dev/null
+++ b/src/custom_timm/loss/__init__.py
@@ -0,0 +1,4 @@
+from .asymmetric_loss import AsymmetricLossMultiLabel, AsymmetricLossSingleLabel
+from .binary_cross_entropy import BinaryCrossEntropy
+from .cross_entropy import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
+from .jsd import JsdCrossEntropy
diff --git a/src/custom_timm/loss/asymmetric_loss.py b/src/custom_timm/loss/asymmetric_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10f9c797c2cb3b2652302717b592dada216f3
--- /dev/null
+++ b/src/custom_timm/loss/asymmetric_loss.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+
+
+class AsymmetricLossMultiLabel(nn.Module):
+    """Asymmetric focal-style loss on sigmoid probabilities for multi-label targets (summed, not averaged)."""
+    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False):
+        super(AsymmetricLossMultiLabel, self).__init__()
+        self.gamma_neg = gamma_neg  # focusing exponent used where y == 0
+        self.gamma_pos = gamma_pos  # focusing exponent used where y == 1
+        self.clip = clip  # margin added to the negative probability; None or 0 disables it
+        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
+        self.eps = eps  # lower clamp applied to probabilities before log()
+
+    def forward(self, x, y):
+        """Return the negated sum of the focally weighted log-likelihood terms.
+
+        Parameters
+        ----------
+        x: input logits
+        y: targets (multi-label binarized vector)
+        """
+        # Calculating Probabilities
+        xs_pos = torch.sigmoid(x)
+        xs_neg = 1 - xs_pos
+
+        # Asymmetric Clipping
+        if self.clip is not None and self.clip > 0:
+            xs_neg = (xs_neg + self.clip).clamp(max=1)
+
+        # Basic CE calculation
+        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
+        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
+        loss = los_pos + los_neg
+
+        # Asymmetric Focusing
+        if self.gamma_neg > 0 or self.gamma_pos > 0:
+            # The focal weight can be computed without autograd tracking. The context
+            # manager restores the caller's grad mode on exit, so an enclosing
+            # torch.no_grad() is not switched back on by this method.
+            track_grad = torch.is_grad_enabled() and not self.disable_torch_grad_focal_loss
+            with torch.set_grad_enabled(track_grad):
+                pt0 = xs_pos * y
+                pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
+                pt = pt0 + pt1
+                one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
+                one_sided_w = torch.pow(1 - pt, one_sided_gamma)
+            loss *= one_sided_w
+
+        return -loss.sum()
+
+
+class AsymmetricLossSingleLabel(nn.Module):
+    """Asymmetric loss over softmax probabilities for single-label targets, with label smoothing (eps)."""
+    def __init__(self, gamma_pos=1, gamma_neg=4, eps: float = 0.1, reduction='mean'):
+        super(AsymmetricLossSingleLabel, self).__init__()
+        self.eps = eps  # label-smoothing factor; values <= 0 disable smoothing
+        self.logsoftmax = nn.LogSoftmax(dim=-1)
+        self.targets_classes = []  # (smoothed) one-hot targets of the most recent forward()
+        self.gamma_pos = gamma_pos
+        self.gamma_neg = gamma_neg
+        self.reduction = reduction
+
+    def forward(self, inputs, target, reduction=None):
+        """Return the per-sample loss, averaged when the effective reduction is 'mean'.
+
+        Parameters
+        ----------
+        inputs: input logits, classes on the last dim (one-hot scatter runs along dim 1)
+        target: class indices, one per row of ``inputs``
+        reduction: optional override of ``self.reduction`` for this call
+        """
+        reduction = self.reduction if reduction is None else reduction
+        num_classes = inputs.size()[-1]
+        log_preds = self.logsoftmax(inputs)
+        targets = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1)
+
+        # ASL weights
+        anti_targets = 1 - targets
+        xs_pos = torch.exp(log_preds)
+        xs_neg = 1 - xs_pos
+        xs_pos = xs_pos * targets
+        xs_neg = xs_neg * anti_targets
+        asymmetric_w = torch.pow(1 - xs_pos - xs_neg,
+                                 self.gamma_pos * targets + self.gamma_neg * anti_targets)
+        log_preds = log_preds * asymmetric_w
+
+        # Label smoothing, out of place: autograd saved `targets` for the products
+        # above, and mutating it in place makes the later backward() raise.
+        if self.eps > 0:
+            targets = targets.mul(1 - self.eps).add(self.eps / num_classes)
+        self.targets_classes = targets
+        # loss calculation
+        loss = -targets.mul(log_preds).sum(dim=-1)
+        if reduction == 'mean':
+            loss = loss.mean()
+        return loss
diff --git a/src/custom_timm/loss/binary_cross_entropy.py b/src/custom_timm/loss/binary_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed76c1e8e004ca9a7255cf3650e322e6525c0577
--- /dev/null
+++ b/src/custom_timm/loss/binary_cross_entropy.py
@@ -0,0 +1,47 @@
+""" Binary Cross Entropy w/ a few extras
+
+Hacked together by / Copyright 2021 Ross Wightman
+"""
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BinaryCrossEntropy(nn.Module):
+    """ BCE-with-logits loss that also accepts class-index targets, with label smoothing and thresholding
+    NOTE for experiments comparing CE to BCE /w label smoothing, may remove
+    """
+    def __init__(
+            self, smoothing=0.1, target_threshold: Optional[float] = None, weight: Optional[torch.Tensor] = None,
+            reduction: str = 'mean', pos_weight: Optional[torch.Tensor] = None):
+        super(BinaryCrossEntropy, self).__init__()
+        assert 0. <= smoothing < 1.0
+        self.smoothing = smoothing
+        self.target_threshold = target_threshold
+        self.reduction = reduction
+        # both weights are registered as buffers; either may be None
+        self.register_buffer('weight', weight)
+        self.register_buffer('pos_weight', pos_weight)
+
+    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """ Return BCE-with-logits of `x` against `target` (used as-is when shaped like `x`, else expanded). """
+        assert x.shape[0] == target.shape[0]
+        if target.shape != x.shape:
+            # NOTE currently assume smoothing or other label softening is applied upstream if targets are already sparse
+            # FIXME should off/on be different for smoothing w/ BCE? Other impl out there differ
+            num_classes = x.shape[-1]
+            off_value = self.smoothing / num_classes
+            on_value = 1. - self.smoothing + off_value
+            class_idx = target.long().view(-1, 1)
+            smoothed = torch.full(
+                (class_idx.size()[0], num_classes), off_value, device=x.device, dtype=x.dtype)
+            target = smoothed.scatter_(1, class_idx, on_value)
+        if self.target_threshold is not None:
+            # Make target 0, or 1 if threshold set
+            target = target.gt(self.target_threshold).to(dtype=target.dtype)
+        return F.binary_cross_entropy_with_logits(
+            x, target, self.weight,
+            pos_weight=self.pos_weight,
+            reduction=self.reduction)
diff --git a/src/custom_timm/loss/cross_entropy.py b/src/custom_timm/loss/cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..85198107f3ad2a1ff775a677d77c03569ff5d04d
--- /dev/null
+++ b/src/custom_timm/loss/cross_entropy.py
@@ -0,0 +1,36 @@
+""" Cross Entropy w/ smoothing or soft targets
+
+Hacked together by / Copyright 2021 Ross Wightman
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class LabelSmoothingCrossEntropy(nn.Module):
+    """ Cross entropy against class-index targets with uniform label smoothing, reduced with a mean.
+    """
+    def __init__(self, smoothing=0.1):
+        super(LabelSmoothingCrossEntropy, self).__init__()
+        assert smoothing < 1.0
+        self.smoothing = smoothing
+        self.confidence = 1. - smoothing
+
+    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        log_p = F.log_softmax(x, dim=-1)
+        # negative log-likelihood of the labelled class for each sample
+        nll = -log_p.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
+        # negative mean log-probability over all classes (the smoothing term)
+        uniform = -log_p.mean(dim=-1)
+        return (self.confidence * nll + self.smoothing * uniform).mean()
+
+
+class SoftTargetCrossEntropy(nn.Module):
+    """ Cross entropy of the softmax of `x` against per-class soft targets, reduced with a mean. """
+    def __init__(self):
+        super(SoftTargetCrossEntropy, self).__init__()
+
+    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        per_sample = -(target * F.log_softmax(x, dim=-1)).sum(dim=-1)
+        return per_sample.mean()
diff --git a/src/custom_timm/loss/jsd.py b/src/custom_timm/loss/jsd.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd64e156c23d27aa03817a587ae367e8175fc126
--- /dev/null
+++ b/src/custom_timm/loss/jsd.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .cross_entropy import LabelSmoothingCrossEntropy
+
+
+class JsdCrossEntropy(nn.Module):
+    """ Jensen-Shannon Divergence + Cross-Entropy Loss
+
+    Based on impl here: https://github.com/google-research/augmix/blob/master/imagenet.py
+    From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty' -
+    https://arxiv.org/abs/1912.02781
+
+    Hacked together by / Copyright 2020 Ross Wightman
+    """
+    def __init__(self, num_splits=3, alpha=12, smoothing=0.1):
+        super().__init__()
+        self.num_splits = num_splits  # number of equal chunks the batch is split into
+        self.alpha = alpha  # weight of the divergence term
+        use_smoothing = smoothing is not None and smoothing > 0
+        if use_smoothing:
+            self.cross_entropy_loss = LabelSmoothingCrossEntropy(smoothing)
+        else:
+            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
+
+    def __call__(self, output, target):
+        """ Cross entropy on the first split plus alpha * mean KL(split || clamped mixture) over all splits. """
+        split_size = output.shape[0] // self.num_splits
+        assert split_size * self.num_splits == output.shape[0]
+        chunks = torch.split(output, split_size)
+        # Cross-entropy is only computed on clean images
+        loss = self.cross_entropy_loss(chunks[0], target[:split_size])
+        probs = [F.softmax(chunk, dim=1) for chunk in chunks]
+        # Clamp mixture distribution to avoid exploding KL divergence
+        logp_mixture = torch.clamp(torch.stack(probs).mean(axis=0), 1e-7, 1).log()
+        kl_terms = [F.kl_div(logp_mixture, p, reduction='batchmean') for p in probs]
+        loss += self.alpha * sum(kl_terms) / len(probs)
+        return loss
diff --git a/src/custom_timm/models/__init__.py b/src/custom_timm/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ff79595d83197ecfb9a164ae9b9125ec3804863
--- /dev/null
+++ b/src/custom_timm/models/__init__.py
@@ -0,0 +1,74 @@
+from .beit import *
+from .byoanet import *
+from .byobnet import *
+from .cait import *
+from .coat import *
+from .convit import *
+from .convmixer import *
+from .convnext import *
+from .crossvit import *
+from .cspnet import *
+from .deit import *
+from .densenet import *
+from .dla import *
+from .dpn import *
+from .edgenext import *
+from .efficientformer import *
+from .efficientnet import *
+from .gcvit import *
+from .ghostnet import *
+from .gluon_resnet import *
+from .gluon_xception import *
+from .hardcorenas import *
+from .hrnet import *
+from .inception_resnet_v2 import *
+from .inception_v3 import *
+from .inception_v4 import *
+from .levit import *
+from .maxxvit import *
+from .mlp_mixer import *
+from .mobilenetv3 import *
+from .mobilevit import *
+from .mvitv2 import *
+from .nasnet import *
+from .nest import *
+from .nfnet import *
+from .pit import *
+from .pnasnet import *
+from .poolformer import *
+from .pvt_v2 import *
+from .regnet import *
+from .res2net import *
+from .resnest import *
+from .resnet import *
+from .resnetv2 import *
+from .rexnet import *
+from .selecsls import *
+from .senet import *
+from .sequencer import *
+from .sknet import *
+from .swin_transformer import *
+from .swin_transformer_v2 import *
+from .swin_transformer_v2_cr import *
+from .tnt import *
+from .tresnet import *
+from .twins import *
+from .vgg import *
+from .visformer import *
+from .vision_transformer import *
+from .vision_transformer_hybrid import *
+from .vision_transformer_relpos import *
+from .volo import *
+from .vovnet import *
+from .xception import *
+from .xception_aligned import *
+from .xcit import *
+
+from .factory import create_model, parse_model_name, safe_model_name
+from .helpers import load_checkpoint, resume_checkpoint, model_parameters
+from .layers import TestTimePoolHead, apply_test_time_pool
+from .layers import convert_splitbn_model, convert_sync_batchnorm
+from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit
+from .layers import set_fast_norm
+from .registry import register_model, model_entrypoint, list_models, is_model, list_modules, is_model_in_modules,\
+    is_model_pretrained, get_pretrained_cfg, has_pretrained_cfg_key, is_pretrained_cfg_key, get_pretrained_cfg_value
diff --git a/src/custom_timm/models/beit.py b/src/custom_timm/models/beit.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f81b008ebfc372aef4c211babc95be32c910629
--- /dev/null
+++ b/src/custom_timm/models/beit.py
@@ -0,0 +1,502 @@
+""" BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
+
+Model from official source: https://github.com/microsoft/unilm/tree/master/beit
+and
+https://github.com/microsoft/unilm/tree/master/beit2
+
+@inproceedings{beit,
+title={{BEiT}: {BERT} Pre-Training of Image Transformers},
+author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},
+booktitle={International Conference on Learning Representations},
+year={2022},
+url={https://openreview.net/forum?id=p-BhZSz59o4}
+}
+
+@article{beitv2,
+title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
+author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
+year={2022},
+eprint={2208.06366},
+archivePrefix={arXiv},
+primaryClass={cs.CV}
+}
+
+At this point only the 1k fine-tuned classification weights and model configs have been added,
+see original source above for pre-training models and procedure.
+
+Modifications by / Copyright 2021 Ross Wightman, original copyrights below
+"""
+# --------------------------------------------------------
+# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# By Hangbo Bao
+# Based on timm and DeiT code bases
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/facebookresearch/deit/
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------
+import math
+from functools import partial
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_
+from .registry import register_model
+from .vision_transformer import checkpoint_filter_fn
+
+
+def _cfg(url='', **kwargs):
+    """Default pretrained-config dict for the BEiT variants; ``kwargs`` override or extend the entries."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        first_conv='patch_embed.proj', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # pretrained configs; keys match the @register_model factory names in this file
+    'beit_base_patch16_224': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'),
+    'beit_base_patch16_384': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0,
+    ),
+    'beit_base_patch16_224_in22k': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth',
+        num_classes=21841,
+    ),
+    'beit_large_patch16_224': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'),
+    'beit_large_patch16_384': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0,
+    ),
+    'beit_large_patch16_512': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth',
+        input_size=(3, 512, 512), crop_pct=1.0,
+    ),
+    'beit_large_patch16_224_in22k': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth',
+        num_classes=21841,
+    ),
+    # BEiT v2 entries: these override mean/std with the ImageNet default statistics
+    'beitv2_base_patch16_224': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
+    ),
+    'beitv2_base_patch16_224_in22k': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth',
+        num_classes=21841,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
+    ),
+    'beitv2_large_patch16_224': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth',
+        crop_pct=0.95,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
+    ),
+    'beitv2_large_patch16_224_in22k': _cfg(
+        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth',
+        num_classes=21841,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
+    ),
+}
+
+
+def gen_relative_position_index(window_size: Tuple[int, int]) -> torch.Tensor:
+    """Build the (Wh*Ww + 1, Wh*Ww + 1) index into a relative position bias table; row/column 0 is the cls token."""
+    win_h, win_w = window_size[0], window_size[1]
+    # the last three table entries are reserved: cls to token & token 2 cls & cls to cls
+    num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
+    window_area = win_h * win_w
+    # get pair-wise relative position index for each token inside the window
+    grid = torch.meshgrid([torch.arange(win_h), torch.arange(win_w)])
+    coords_flatten = torch.flatten(torch.stack(grid), 1)  # 2, Wh*Ww
+    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+    relative_coords[:, :, 0] += win_h - 1  # shift to start from 0
+    relative_coords[:, :, 1] += win_w - 1
+    relative_coords[:, :, 0] *= 2 * win_w - 1
+    index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
+    index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+    index[0, 0:] = num_relative_distance - 3
+    index[0:, 0] = num_relative_distance - 2
+    index[0, 0] = num_relative_distance - 1
+    return index
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with separate q/k/v biases and an optional per-layer relative position bias."""
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, attn_drop=0.,
+            proj_drop=0., window_size=None, attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:  # an explicit per-head width overrides the default
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            # q and v biases are parameters; the k bias is a zero, non-persistent buffer
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.register_buffer('k_bias', torch.zeros(all_head_dim), persistent=False)
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = self.k_bias = self.v_bias = None
+
+        if window_size:
+            self.window_size = window_size
+            # +3 entries for cls to token & token 2 cls & cls to cls
+            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+            self.register_buffer("relative_position_index", gen_relative_position_index(window_size))
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def _get_rel_pos_bias(self):
+        """Gather the bias table into a (1, nH, Wh*Ww + 1, Wh*Ww + 1) tensor."""
+        num_tokens = self.window_size[0] * self.window_size[1] + 1
+        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
+        bias = bias.view(num_tokens, num_tokens, -1)  # Wh*Ww,Wh*Ww,nH
+        bias = bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        return bias.unsqueeze(0)
+
+    def forward(self, x, shared_rel_pos_bias: Optional[torch.Tensor] = None):
+        """Attend over ``x`` of shape (B, N, C); ``shared_rel_pos_bias`` is added to the logits when given."""
+        B, N, C = x.shape
+
+        # fused projection; the three biases are concatenated in q, k, v order
+        qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+
+        scores = (q * self.scale) @ k.transpose(-2, -1)
+        if self.relative_position_bias_table is not None:
+            scores = scores + self._get_rel_pos_bias()
+        if shared_rel_pos_bias is not None:
+            scores = scores + shared_rel_pos_bias
+
+        weights = self.attn_drop(scores.softmax(dim=-1))
+
+        out = (weights @ v).transpose(1, 2).reshape(B, N, -1)
+        out = self.proj(out)
+        out = self.proj_drop(out)
+        return out
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and MLP residual branches with optional learned per-channel scales."""
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+            drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+            window_size=None, attn_head_dim=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
+            window_size=window_size, attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        # residual scales exist only for a truthy init_values; otherwise both are None
+        self.gamma_1 = nn.Parameter(init_values * torch.ones(dim)) if init_values else None
+        self.gamma_2 = nn.Parameter(init_values * torch.ones(dim)) if init_values else None
+
+    def forward(self, x, shared_rel_pos_bias: Optional[torch.Tensor] = None):
+        """Return ``x`` after both residual branches; the bias is forwarded to the attention layer."""
+        attn_out = self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias)
+        if self.gamma_1 is not None:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+        mlp_out = self.mlp(self.norm2(x))
+        if self.gamma_1 is not None:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class RelativePositionBias(nn.Module):
+    """Stand-alone relative position bias table; forward() returns it as (nH, Wh*Ww + 1, Wh*Ww + 1)."""
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        win_h, win_w = window_size[0], window_size[1]
+        self.window_area = win_h * win_w
+        num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
+        self.relative_position_bias_table = nn.Parameter(torch.zeros(num_relative_distance, num_heads))
+        self.register_buffer("relative_position_index", gen_relative_position_index(window_size))
+
+    def forward(self):
+        side = self.window_area + 1
+        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(side, side, -1)
+        return bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+class Beit(nn.Module):
+    """ BEiT image transformer: patch embedding, cls token, a stack of Blocks and a linear classifier head.
+    With global_pool='avg' the head sees the fc_norm'ed mean of the patch tokens, otherwise the normed cls token.
+    """
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='avg',
+            embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0.,
+            attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            init_values=None, use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
+            head_init_scale=0.001):
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.grad_checkpointing = False
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # the optional absolute position embedding covers the cls token plus every patch
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.grid_size, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                init_values=init_values, window_size=self.patch_embed.grid_size if use_rel_pos_bias else None)
+            for i in range(depth)])
+        use_fc_norm = self.global_pool == 'avg'
+        self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else None
+        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        self.apply(self._init_weights)
+        if self.pos_embed is not None:
+            trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        # shrink each block's output projections according to its position in the stack
+        self.fix_init_weight()
+        if isinstance(self.head, nn.Linear):
+            trunc_normal_(self.head.weight, std=.02)
+            self.head.weight.data.mul_(head_init_scale)
+            self.head.bias.data.mul_(head_init_scale)
+
+    def fix_init_weight(self):
+        """Divide each block's attn.proj and mlp.fc2 weights by sqrt(2 * (block index + 1)), in place."""
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def _init_weights(self, m):  # per-module initialiser, run through self.apply() in __init__
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return 'pos_embed', 'cls_token' and the name of every relative position bias table parameter."""
+        nwd = {'pos_embed', 'cls_token'}
+        nwd.update(
+            n for n, _ in self.named_parameters() if 'relative_position_bias_table' in n)
+        return nwd
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable  # read by forward_features()
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex groups over parameter names; ``coarse`` is accepted but not used here."""
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed|rel_pos_bias',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))],
+        )
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            self.global_pool = global_pool  # NOTE(review): norm / fc_norm are not rebuilt for the new mode
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Embed patches, prepend the cls token, run every block and return the token sequence after self.norm."""
+        x = self.patch_embed(x)
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        x = self.pos_drop(x)
+        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                # passed positionally: reentrant checkpoint() raises on keyword arguments
+                x = checkpoint(blk, x, rel_pos_bias)
+            else:
+                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
+        return self.norm(x)
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Pool (fc_norm of the mean non-cls token, else the cls token), then apply the head unless pre_logits."""
+        if self.fc_norm is not None:
+            x = self.fc_norm(x[:, 1:].mean(dim=1))
+        else:
+            x = x[:, 0]
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        """Run forward_features followed by forward_head."""
+        x = self.forward_features(x)
+        return self.forward_head(x)
+
+
+def _beit_checkpoint_filter_fn(state_dict, model):
+    """Unwrap a top-level 'module' entry when present, then delegate to checkpoint_filter_fn."""
+    # beit v2 didn't strip module
+    weights = state_dict['module'] if 'module' in state_dict else state_dict
+    return checkpoint_filter_fn(weights, model)
+
+
+def _create_beit(variant, pretrained=False, **kwargs):
+    """Build a Beit for ``variant`` through build_model_with_cfg; a truthy features_only is rejected."""
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Beit models.')
+
+    # FIXME an updated filter fn needed to interpolate rel pos emb if fine tuning to diff model sizes
+    return build_model_with_cfg(
+        Beit, variant, pretrained,
+        pretrained_filter_fn=_beit_checkpoint_filter_fn,
+        **kwargs)
+
+
+@register_model
+def beit_base_patch16_224(pretrained=False, **kwargs):
+    """BEiT base (768-d, 12 blocks, 12 heads), patch 16, built under the 'beit_base_patch16_224' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
+    return _create_beit('beit_base_patch16_224', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_base_patch16_384(pretrained=False, **kwargs):
+    """BEiT base (768-d, 12 blocks, 12 heads), patch 16, img_size 384, 'beit_base_patch16_384' config."""
+    cfg = dict(
+        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
+    return _create_beit('beit_base_patch16_384', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
+    """BEiT base (768-d, 12 blocks, 12 heads), patch 16, built under the 'beit_base_patch16_224_in22k' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
+    return _create_beit('beit_base_patch16_224_in22k', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_large_patch16_224(pretrained=False, **kwargs):
+    """BEiT large (1024-d, 24 blocks, 16 heads), patch 16, built under the 'beit_large_patch16_224' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beit_large_patch16_224', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_large_patch16_384(pretrained=False, **kwargs):
+    """BEiT large (1024-d, 24 blocks, 16 heads), patch 16, img_size 384, 'beit_large_patch16_384' config."""
+    cfg = dict(
+        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beit_large_patch16_384', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_large_patch16_512(pretrained=False, **kwargs):
+    """BEiT large (1024-d, 24 blocks, 16 heads), patch 16, img_size 512, 'beit_large_patch16_512' config."""
+    cfg = dict(
+        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beit_large_patch16_512', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beit_large_patch16_224_in22k(pretrained=False, **kwargs):
+    """BEiT large (1024-d, 24 blocks, 16 heads), patch 16, 'beit_large_patch16_224_in22k' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beit_large_patch16_224_in22k', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beitv2_base_patch16_224(pretrained=False, **kwargs):
+    """BEiT v2 base (768-d, 12 blocks, 12 heads), patch 16, built under the 'beitv2_base_patch16_224' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beitv2_base_patch16_224_in22k(pretrained=False, **kwargs):
+    """BEiT v2 base (768-d, 12 blocks, 12 heads), patch 16, 'beitv2_base_patch16_224_in22k' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beitv2_base_patch16_224_in22k', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beitv2_large_patch16_224(pretrained=False, **kwargs):
+    """BEiT v2 large (1024-d, 24 blocks, 16 heads), patch 16, 'beitv2_large_patch16_224' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **cfg)
+
+
+@register_model
+def beitv2_large_patch16_224_in22k(pretrained=False, **kwargs):
+    """BEiT v2 large (1024-d, 24 blocks, 16 heads), patch 16, 'beitv2_large_patch16_224_in22k' config."""
+    cfg = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
+    return _create_beit('beitv2_large_patch16_224_in22k', pretrained=pretrained, **cfg)
diff --git a/src/custom_timm/models/byoanet.py b/src/custom_timm/models/byoanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a557be90fc1af1ed858a08feb1987ed2281dac
--- /dev/null
+++ b/src/custom_timm/models/byoanet.py
@@ -0,0 +1,442 @@
+""" Bring-Your-Own-Attention Network
+
+A flexible network w/ dataclass based config for stacking NN blocks including
+self-attention (or similar) layers.
+
+Currently used to implement experimental variants of:
+  * Bottleneck Transformers
+  * Lambda ResNets
+  * HaloNets
+
+Consider all of the model definitions here as experimental WIP and likely to change.
+
+Hacked together by / copyright Ross Wightman, 2021.
+"""
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .byobnet import ByoBlockCfg, ByoModelCfg, ByobNet, interleave_blocks
+from .helpers import build_model_with_cfg
+from .registry import register_model
+
+__all__ = []
+
+
+def _cfg(url='', **kwargs):
+    """ Build a default pretrained-cfg dict for the models in this file.
+
+    Entries passed through `kwargs` replace the defaults of the same name and
+    any additional keys are appended.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=(7, 7),
+        crop_pct=0.95,
+        interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.conv1.conv',
+        classifier='head.fc',
+        fixed_input_size=False,
+        min_input_size=(3, 224, 224),
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # Bottleneck Transformer (BoTNet) weights
+    'botnet26t_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/botnet26t_c1_256-167a0e9f.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
+    'sebotnet33ts_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/sebotnet33ts_a1h2_256-957e3c3e.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94),
+    'botnet50ts_256': _cfg(
+        url='',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
+    'eca_botnext26ts_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_botnext26ts_c_256-95a898f6.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    'halonet_h1': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
+    'halonet26t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halonet26t_a1h_256-3083328c.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
+    'sehalonet33ts': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/sehalonet33ts_256-87e053f9.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94),
+    'halonet50ts': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halonet50ts_a1h2_256-f3a3daee.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94),
+    'eca_halonext26ts': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_halonext26ts_c_256-06906299.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94),
+
+    'lambda_resnet26t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet26t_c_256-e5a5c857.pth',
+        min_input_size=(3, 128, 128), input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94),
+    'lambda_resnet50ts': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet50ts_a1h_256-b87370f7.pth',
+        min_input_size=(3, 128, 128), input_size=(3, 256, 256), pool_size=(8, 8)),
+    'lambda_resnet26rpt_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet26rpt_c_256-ab00292d.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94),
+
+    'haloregnetz_b': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/haloregnetz_c_raa_256-c8ad7616.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        first_conv='stem.conv', input_size=(3, 224, 224), pool_size=(7, 7), min_input_size=(3, 224, 224), crop_pct=0.94),
+
+    'lamhalobotnet50ts_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lamhalobotnet50ts_a1h2_256-fe3d9445.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
+    'halo2botnet50ts_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halo2botnet50ts_a1h2_256-fd9c11a3.pth',
+        fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
+}
+
+
+# Architecture definitions keyed by cfg name. `_create_byoanet` looks an entry up by its
+# `cfg_variant` argument when one is given, otherwise by the variant name itself.
+model_cfgs = dict(
+
+    # models using self_attn_layer='bottleneck'
+    botnet26t=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        fixed_input_size=True,
+        self_attn_layer='bottleneck',
+        self_attn_kwargs=dict()
+    ),
+    sebotnet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg('self_attn', d=2, c=1536, s=2, gs=0, br=0.333),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        act_layer='silu',
+        num_features=1280,
+        attn_layer='se',
+        self_attn_layer='bottleneck',
+        self_attn_kwargs=dict()
+    ),
+    botnet50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        fixed_input_size=True,
+        self_attn_layer='bottleneck',
+        self_attn_kwargs=dict()
+    ),
+    eca_botnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        fixed_input_size=True,
+        act_layer='silu',
+        attn_layer='eca',
+        self_attn_layer='bottleneck',
+        self_attn_kwargs=dict(dim_head=16)
+    ),
+
+    # models using self_attn_layer='halo'
+    halonet_h1=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='self_attn', d=3, c=64, s=1, gs=0, br=1.0),
+            ByoBlockCfg(type='self_attn', d=3, c=128, s=2, gs=0, br=1.0),
+            ByoBlockCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0),
+            ByoBlockCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0),
+        ),
+        stem_chs=64,
+        stem_type='7x7',
+        stem_pool='maxpool',
+
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=8, halo_size=3),
+    ),
+    halonet26t=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=8, halo_size=2)
+    ),
+    sehalonet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg('self_attn', d=2, c=1536, s=2, gs=0, br=0.333),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        act_layer='silu',
+        num_features=1280,
+        attn_layer='se',
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=8, halo_size=3)
+    ),
+    halonet50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
+            # this stage sets its own self-attn kwargs (adds num_heads=4) instead of the model-level ones
+            interleave_blocks(
+                types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25,
+                self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=3, num_heads=4)),
+            interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=8, halo_size=3)
+    ),
+    eca_halonext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        attn_layer='eca',
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=8, halo_size=2, dim_head=16)
+    ),
+
+    # models using self_attn_layer='lambda'
+    lambda_resnet26t=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        self_attn_layer='lambda',
+        self_attn_kwargs=dict(r=9)
+    ),
+    lambda_resnet50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        self_attn_layer='lambda',
+        self_attn_kwargs=dict(r=9)
+    ),
+    lambda_resnet26rpt_256=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
+            interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        self_attn_layer='lambda',
+        self_attn_kwargs=dict(r=None)
+    ),
+
+    # experimental
+    haloregnetz_b=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3),
+            interleave_blocks(types=('bottle', 'self_attn'), every=3, d=12, c=192, s=2, gs=16, br=3),
+            ByoBlockCfg('self_attn', d=2, c=288, s=2, gs=16, br=3),
+        ),
+        stem_chs=32,
+        stem_pool='',
+        downsample='',
+        num_features=1536,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+        self_attn_layer='halo',
+        self_attn_kwargs=dict(block_size=7, halo_size=2, qk_ratio=0.33)
+    ),
+
+    # experimental: the self-attn layer is chosen per stage (no model-level self_attn_layer is set)
+    lamhalobotnet50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=4, c=512, s=2, gs=0, br=0.25,
+                self_attn_layer='lambda', self_attn_kwargs=dict(r=13)),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25,
+                self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25,
+                self_attn_layer='bottleneck', self_attn_kwargs=dict()),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        act_layer='silu',
+    ),
+    halo2botnet50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=4, c=512, s=2, gs=0, br=0.25,
+                self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25,
+                self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)),
+            interleave_blocks(
+                types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25,
+                self_attn_layer='bottleneck', self_attn_kwargs=dict()),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        act_layer='silu',
+    ),
+)
+
+
+def _create_byoanet(variant, cfg_variant=None, pretrained=False, **kwargs):
+    """ Instantiate a ByobNet for `variant` via `build_model_with_cfg`.
+
+    The architecture is read from `model_cfgs[cfg_variant]` when `cfg_variant`
+    is truthy, otherwise from `model_cfgs[variant]`. Remaining `kwargs` are
+    forwarded unchanged.
+    """
+    cfg_key = cfg_variant or variant
+    return build_model_with_cfg(
+        ByobNet,
+        variant,
+        pretrained,
+        model_cfg=model_cfgs[cfg_key],
+        feature_cfg={'flatten_sequential': True},
+        **kwargs)
+
+
+@register_model
+def botnet26t_256(pretrained=False, **kwargs):
+    """ Bottleneck Transformer w/ ResNet26-T backbone.
+
+    `img_size` defaults to 256 unless the caller passes one; the model is built
+    as variant 'botnet26t_256' from the 'botnet26t' model cfg.
+    """
+    if 'img_size' not in kwargs:
+        kwargs['img_size'] = 256
+    return _create_byoanet(
+        'botnet26t_256', cfg_variant='botnet26t', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def sebotnet33ts_256(pretrained=False, **kwargs):
+    """ Bottleneck Transformer w/ a ResNet33-t backbone, SE attn, SiLU act.
+
+    Built as variant 'sebotnet33ts_256' from the 'sebotnet33ts' model cfg.
+    """
+    return _create_byoanet(
+        'sebotnet33ts_256', cfg_variant='sebotnet33ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def botnet50ts_256(pretrained=False, **kwargs):
+    """ Bottleneck Transformer w/ ResNet50-T backbone, silu act.
+
+    `img_size` defaults to 256 unless the caller passes one; the model is built
+    as variant 'botnet50ts_256' from the 'botnet50ts' model cfg.
+    """
+    if 'img_size' not in kwargs:
+        kwargs['img_size'] = 256
+    return _create_byoanet(
+        'botnet50ts_256', cfg_variant='botnet50ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_botnext26ts_256(pretrained=False, **kwargs):
+    """ Bottleneck Transformer w/ ResNet26-T backbone, ECA attn, silu act.
+
+    `img_size` defaults to 256 unless the caller passes one; the model is built
+    as variant 'eca_botnext26ts_256' from the 'eca_botnext26ts' model cfg.
+    """
+    if 'img_size' not in kwargs:
+        kwargs['img_size'] = 256
+    return _create_byoanet(
+        'eca_botnext26ts_256', cfg_variant='eca_botnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def halonet_h1(pretrained=False, **kwargs):
+    """ HaloNet-H1: halo attention in every stage, as per the paper.
+
+    NOTE: This runs very slowly!
+    """
+    model = _create_byoanet('halonet_h1', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def halonet26t(pretrained=False, **kwargs):
+    """ HaloNet w/ a ResNet26-t backbone; halo attention in the final two stages.
+    """
+    model = _create_byoanet('halonet26t', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def sehalonet33ts(pretrained=False, **kwargs):
+    """ HaloNet w/ a ResNet33-t backbone: SE attn for non Halo blocks, SiLU act,
+    1-2 Halo blocks in stages 2, 3 and 4.
+    """
+    model = _create_byoanet('sehalonet33ts', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def halonet50ts(pretrained=False, **kwargs):
+    """ HaloNet w/ a ResNet50-t backbone and silu act; halo attention in the final two stages.
+    """
+    model = _create_byoanet('halonet50ts', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def eca_halonext26ts(pretrained=False, **kwargs):
+    """ HaloNet w/ a ResNet26-t backbone and silu act; halo attention in the final two stages.
+    """
+    model = _create_byoanet('eca_halonext26ts', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lambda_resnet26t(pretrained=False, **kwargs):
+    """ Lambda-ResNet-26-T: lambda layers w/ conv pos in the last two stages.
+    """
+    model = _create_byoanet('lambda_resnet26t', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lambda_resnet50ts(pretrained=False, **kwargs):
+    """ Lambda-ResNet-50-TS, SiLU act: lambda layers w/ conv pos in the last two stages.
+    """
+    model = _create_byoanet('lambda_resnet50ts', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lambda_resnet26rpt_256(pretrained=False, **kwargs):
+    """ Lambda-ResNet-26-R-T: lambda layers w/ rel pos embed in the last two stages.
+
+    `img_size` defaults to 256 unless the caller passes one.
+    """
+    if 'img_size' not in kwargs:
+        kwargs['img_size'] = 256
+    model = _create_byoanet('lambda_resnet26rpt_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def haloregnetz_b(pretrained=False, **kwargs):
+    """ Halo attention + RegNetZ (experimental).
+    """
+    model = _create_byoanet('haloregnetz_b', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lamhalobotnet50ts_256(pretrained=False, **kwargs):
+    """ Combo Attention (Lambda + Halo + Bot) Network.
+
+    Built as variant 'lamhalobotnet50ts_256' from the 'lamhalobotnet50ts' model cfg.
+    """
+    return _create_byoanet(
+        'lamhalobotnet50ts_256', cfg_variant='lamhalobotnet50ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def halo2botnet50ts_256(pretrained=False, **kwargs):
+    """ Combo Attention (Halo + Halo + Bot) Network.
+
+    Built as variant 'halo2botnet50ts_256' from the 'halo2botnet50ts' model cfg.
+    """
+    return _create_byoanet(
+        'halo2botnet50ts_256', cfg_variant='halo2botnet50ts', pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/byobnet.py b/src/custom_timm/models/byobnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..71b6dd446af4d779012a6ea149fb7862b2ff3e27
--- /dev/null
+++ b/src/custom_timm/models/byobnet.py
@@ -0,0 +1,1587 @@
+""" Bring-Your-Own-Blocks Network
+
+A flexible network w/ dataclass based config for stacking those NN blocks.
+
+This model is currently used to implement the following networks:
+
+GPU Efficient (ResNets) - gernet_l/m/s (original versions called genet, but this was already used (by SENet author)).
+Paper: `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
+Code and weights: https://github.com/idstcv/GPU-Efficient-Networks, licensed Apache 2.0
+
+RepVGG - repvgg_*
+Paper: `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+Code and weights: https://github.com/DingXiaoH/RepVGG, licensed MIT
+
+In all cases the models have been modified to fit within the design of ByobNet. I've remapped
+the original weights and verified accuracies.
+
+For GPU Efficient nets, I used the original names for the blocks since they were for the most part
+the same as original residual blocks in ResNe(X)t, DarkNet, and other existing models. Note also some
+changes introduced in RegNet were also present in the stem and bottleneck blocks for this model.
+
+A significant number of different network archs can be implemented here, including variants of the
+above nets that include attention.
+
+Hacked together by / copyright Ross Wightman, 2021.
+"""
+import math
+from dataclasses import dataclass, field, replace
+from typing import Tuple, List, Dict, Optional, Union, Any, Callable, Sequence
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, named_apply, checkpoint_seq
+from .layers import ClassifierHead, ConvNormAct, BatchNormAct2d, DropPath, AvgPool2dSame, \
+    create_conv2d, get_act_layer, get_norm_act_layer, get_attn, make_divisible, to_2tuple, EvoNorm2dS0, EvoNorm2dS0a,\
+    EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a, FilterResponseNormAct2d, FilterResponseNormTlu2d
+from .registry import register_model
+
+__all__ = ['ByobNet', 'ByoModelCfg', 'ByoBlockCfg', 'create_byob_stem', 'create_block']
+
+
+def _cfg(url='', **kwargs):
+    """ Build a default pretrained-cfg dict (224x224 input, bilinear interpolation).
+
+    Entries passed through `kwargs` replace the defaults of the same name and
+    any additional keys are appended.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=(7, 7),
+        crop_pct=0.875,
+        interpolation='bilinear',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.conv',
+        classifier='head.fc',
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+def _cfgr(url='', **kwargs):
+    """ Build a default pretrained-cfg dict (256x256 input, bicubic interpolation).
+
+    Entries passed through `kwargs` replace the defaults of the same name and
+    any additional keys are appended.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 256, 256),
+        pool_size=(8, 8),
+        crop_pct=0.9,
+        interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.conv1.conv',
+        classifier='head.fc',
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # GPU-Efficient (ResNet) weights
+    'gernet_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_s-756b4751.pth'),
+    'gernet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_m-0873c53a.pth'),
+    'gernet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_l-f31e2e8d.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    # RepVGG weights
+    'repvgg_a2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_a2-c1ee6d2b.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b0-80ac3f1b.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1-77ca2989.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b1g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1g4-abde5d92.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2-25b7494e.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b2g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2g4-165a85f2.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3-199bc50d.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b3g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3g4-73c370bf.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+
+    # experimental configs
+    'resnet51q': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet51q_ra2-d47dcc76.pth',
+        first_conv='stem.conv1', input_size=(3, 256, 256), pool_size=(8, 8),
+        test_input_size=(3, 288, 288), crop_pct=1.0),
+    'resnet61q': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet61q_ra2-6afc536c.pth',
+        test_input_size=(3, 288, 288), crop_pct=1.0),
+
+    'resnext26ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnext26ts_256_ra2-8bbd9106.pth'),
+    'gcresnext26ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext26ts_256-e414378b.pth'),
+    'seresnext26ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnext26ts_256-6f0d74a3.pth'),
+    'eca_resnext26ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnext26ts_256-5a1d030f.pth'),
+    'bat_resnext26ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/bat_resnext26ts_256-fa6fd595.pth',
+        min_input_size=(3, 256, 256)),
+
+    'resnet32ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet32ts_256-aacf5250.pth'),
+    'resnet33ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet33ts_256-e91b09a4.pth'),
+    'gcresnet33ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet33ts_256-0e0cd345.pth'),
+    'seresnet33ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnet33ts_256-f8ad44d9.pth'),
+    'eca_resnet33ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnet33ts_256-8f98face.pth'),
+
+    'gcresnet50t': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet50t_256-96374d1c.pth'),
+
+    'gcresnext50ts': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext50ts_256-3e0f515e.pth'),
+
+    # experimental models, likely to change or be removed
+    'regnetz_b16': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_b_raa-677d9606.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 224, 224), pool_size=(7, 7), test_input_size=(3, 288, 288), first_conv='stem.conv', crop_pct=0.94),
+    'regnetz_c16': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_c_rab2_256-a54bf36a.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), first_conv='stem.conv', crop_pct=0.94),
+    'regnetz_d32': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_d_rab_256-b8073a89.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=0.95),
+    'regnetz_d8': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_d8_bh-afc03c55.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=1.0),
+    'regnetz_e8': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_e8_bh-aace8e6e.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=1.0),
+
+    'regnetz_b16_evos': _cfgr(
+        url='',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 224, 224), pool_size=(7, 7), test_input_size=(3, 288, 288), first_conv='stem.conv',
+        crop_pct=0.94),
+    'regnetz_c16_evos': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_c16_evos_ch-d8311942.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), first_conv='stem.conv', crop_pct=0.95),
+    'regnetz_d8_evos': _cfgr(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_d8_evos_ch-2bc12646.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=0.95),
+}
+
+
+@dataclass
+class ByoBlockCfg:
+    """ Configuration of the blocks in one stage; instances populate `ByoModelCfg.blocks`.
+    """
+    # block type; the string keys used in this file are 'basic', 'bottle', 'self_attn' and 'rep'
+    type: Union[str, nn.Module]
+    d: int  # block depth (number of block repeats in stage)
+    c: int  # number of output channels for each block in stage
+    s: int = 2  # stride of stage (first block)
+    # gs may also be a callable; `_rep_vgg_bcfg` passes one taking (chs, idx)
+    gs: Optional[Union[int, Callable]] = None  # group-size of blocks in stage, conv is depthwise if gs == 1
+    br: float = 1.  # bottleneck-ratio of blocks in stage
+
+    # NOTE: these config items override the model cfgs that are applied to all blocks by default
+    # (all default to None, i.e. nothing set at block level)
+    attn_layer: Optional[str] = None
+    attn_kwargs: Optional[Dict[str, Any]] = None
+    self_attn_layer: Optional[str] = None
+    self_attn_kwargs: Optional[Dict[str, Any]] = None
+    block_kwargs: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class ByoModelCfg:
+    """ Model-level configuration: the stage/block layout plus stem, activation,
+    norm and attention defaults shared by all blocks.
+    """
+    # one entry per stage: a single ByoBlockCfg or a tuple of them
+    blocks: Tuple[Union[ByoBlockCfg, Tuple[ByoBlockCfg, ...]], ...]
+    downsample: str = 'conv1x1'
+    stem_type: str = '3x3'
+    stem_pool: Optional[str] = 'maxpool'
+    stem_chs: int = 32
+    width_factor: float = 1.0
+    # num out_channels for final conv, no final 1x1 conv if 0
+    num_features: int = 0
+    # zero init last weight (usually bn) in residual path
+    zero_init_last: bool = True
+    # model constrained to a fixed-input size / img_size must be provided on creation
+    fixed_input_size: bool = False
+
+    act_layer: str = 'relu'
+    norm_layer: str = 'batchnorm'
+
+    # NOTE: these config items will be overridden by the block cfg (per-block) if they are set there
+    attn_layer: Optional[str] = None
+    attn_kwargs: dict = field(default_factory=dict)
+    self_attn_layer: Optional[str] = None
+    self_attn_kwargs: dict = field(default_factory=dict)
+    block_kwargs: Dict[str, Any] = field(default_factory=dict)
+
+
+def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0):
+    """ Build the tuple of 'rep' stage configs, one per (depth, width factor) pair.
+
+    Base channels are (64, 128, 256, 512), each multiplied by its width factor.
+    With `groups > 0` the group size is a callable of (chs, idx) that returns
+    `chs // groups` when `idx + 1` is even and 0 otherwise; with `groups == 0`
+    the group size is the constant 0.
+    """
+    base_chs = (64, 128, 256, 512)
+    if groups > 0:
+        group_size = lambda chs, idx: chs // groups if (idx + 1) % 2 == 0 else 0
+    else:
+        group_size = 0
+    stages = []
+    for depth, chs, width in zip(d, base_chs, wf):
+        stages.append(ByoBlockCfg(type='rep', d=depth, c=chs * width, gs=group_size))
+    return tuple(stages)
+
+
+def interleave_blocks(
+        types: Tuple[str, str], d, every: Union[int, List[int]] = 1, first: bool = False, **kwargs
+) -> Tuple[ByoBlockCfg, ...]:
+    """ Interleave 2 block types in a stack of `d` blocks.
+
+    Args:
+        types: pair of block types; `types[1]` is used at the selected positions,
+            `types[0]` everywhere else.
+        d: number of blocks to produce (each is emitted with d=1).
+        every: when an int, the selected positions are
+            `range(0 if first else every, d, every + 1)`, falling back to the last
+            position (`d - 1`) if that range is empty. Otherwise an explicit
+            collection of block indices.
+        first: only consulted when `every` is an int; start selecting at index 0.
+        **kwargs: passed unchanged to every ByoBlockCfg.
+
+    Returns:
+        Tuple of `d` ByoBlockCfg entries.
+    """
+    assert len(types) == 2
+    if isinstance(every, int):
+        every = list(range(0 if first else every, d, every + 1))
+        if not every:
+            every = [d - 1]
+    # The original evaluated `set(every)` and discarded the result; bind it so the
+    # per-index membership test below is an actual set lookup.
+    selected = set(every)
+    return tuple(
+        ByoBlockCfg(type=types[1] if i in selected else types[0], d=1, **kwargs)
+        for i in range(d))
+
+
+# Named architecture configs. Each key matches the name passed to `_create_byobnet` by the
+# registered entry point function of the same name further down in this file.
+model_cfgs = dict(
+    # GENet family: two 'basic' stages followed by three 'bottle' stages, no stem pooling
+    gernet_l=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4),
+            ByoBlockCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.),
+            ByoBlockCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.),
+        ),
+        stem_chs=32,
+        stem_pool=None,
+        num_features=2560,
+    ),
+    gernet_m=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4),
+            ByoBlockCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.),
+            ByoBlockCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.),
+        ),
+        stem_chs=32,
+        stem_pool=None,
+        num_features=2560,
+    ),
+    gernet_s=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.),
+            ByoBlockCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4),
+            ByoBlockCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.),
+            ByoBlockCfg(type='bottle', d=1, c=256, s=1, gs=1, br=3.),
+        ),
+        stem_chs=13,
+        stem_pool=None,
+        num_features=1920,
+    ),
+
+    # RepVGG family: stages come from _rep_vgg_bcfg (base channels 64/128/256/512 scaled by wf),
+    # 'g4' variants pass groups=4
+    repvgg_a2=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(d=(2, 4, 14, 1), wf=(1.5, 1.5, 1.5, 2.75)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b0=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(1., 1., 1., 2.5)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b1=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b1g4=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b2=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b2g4=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b3=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b3g4=ByoModelCfg(
+        blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+
+    # 4 x conv stem w/ 2 act, no maxpool, 2,4,6,4 repeats, group size 32 in first 3 blocks
+    # DW convs in last block, 2048 pre-FC, silu act
+    resnet51q=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0),
+        ),
+        stem_chs=128,
+        stem_type='quad2',
+        stem_pool=None,
+        num_features=2048,
+        act_layer='silu',
+    ),
+
+    # 4 x conv stem w/ 4 act, no maxpool, 1,4,6,4 repeats, edge block first, group size 32 in next 2 blocks
+    # DW convs in last block, 4 conv for each bottle block, 2048 pre-FC, silu act
+    resnet61q=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='edge', d=1, c=256, s=1, gs=0, br=1.0, block_kwargs=dict()),
+            ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0),
+        ),
+        stem_chs=128,
+        stem_type='quad',
+        stem_pool=None,
+        num_features=2048,
+        act_layer='silu',
+        block_kwargs=dict(extra_conv=True),
+    ),
+
+    # A series of ResNeXt-26 models w/ one of none, GC, SE, ECA, BAT attn, group size 32, SiLU act,
+    # and a tiered stem w/ maxpool
+    resnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+    ),
+    gcresnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        attn_layer='gca',
+    ),
+    seresnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        attn_layer='se',
+    ),
+    eca_resnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        attn_layer='eca',
+    ),
+    bat_resnext26ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        act_layer='silu',
+        attn_layer='bat',
+        attn_kwargs=dict(block_size=8)
+    ),
+
+    # ResNet-32 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, no pre-fc feat layer, tiered stem w/o maxpool
+    resnet32ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        num_features=0,
+        act_layer='silu',
+    ),
+
+    # ResNet-33 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, 1280 pre-FC feat, tiered stem w/o maxpool
+    resnet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        num_features=1280,
+        act_layer='silu',
+    ),
+
+    # A series of ResNet-33 (2, 3, 3, 2) models w/ one of GC, SE, ECA attn, no groups, SiLU act, 1280 pre-FC feat
+    # and a tiered stem w/ no maxpool
+    gcresnet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        num_features=1280,
+        act_layer='silu',
+        attn_layer='gca',
+    ),
+    seresnet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        num_features=1280,
+        act_layer='silu',
+        attn_layer='se',
+    ),
+    eca_resnet33ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
+            ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        num_features=1280,
+        act_layer='silu',
+        attn_layer='eca',
+    ),
+
+    # ResNet-50 style (3, 4, 6, 3) repeats w/ GC attn, tiered stem w/o pooling, default act layer
+    gcresnet50t=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, br=0.25),
+            ByoBlockCfg(type='bottle', d=4, c=512, s=2, br=0.25),
+            ByoBlockCfg(type='bottle', d=6, c=1024, s=2, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=2048, s=2, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        attn_layer='gca',
+    ),
+
+    # ResNeXt-50 style (3, 4, 6, 3) repeats w/ GC attn, group size 32, SiLU act, tiered stem w/ maxpool
+    gcresnext50ts=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=6, c=1024, s=2, gs=32, br=0.25),
+            ByoBlockCfg(type='bottle', d=3, c=2048, s=2, gs=32, br=0.25),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='maxpool',
+        # stem_pool=None,
+        act_layer='silu',
+        attn_layer='gca',
+    ),
+
+    # experimental models, closer to a RegNetZ than a ResNet. Similar to EfficientNets but w/ groups instead of DW
+    regnetz_b16=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=3),
+        ),
+        stem_chs=32,
+        stem_pool='',
+        downsample='',
+        num_features=1536,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_c16=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=4),
+        ),
+        stem_chs=32,
+        stem_pool='',
+        downsample='',
+        num_features=1536,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_d32=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=32, br=4),
+            ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=32, br=4),
+            ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=32, br=4),
+            ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=32, br=4),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        downsample='',
+        num_features=1792,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_d8=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=8, br=4),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        downsample='',
+        num_features=1792,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_e8=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=96, s=1, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=8, c=192, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=16, c=384, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=8, br=4),
+        ),
+        stem_chs=64,
+        stem_type='tiered',
+        stem_pool='',
+        downsample='',
+        num_features=2048,
+        act_layer='silu',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+
+    # experimental EvoNorm configs
+    regnetz_b16_evos=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=3),
+            ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=3),
+        ),
+        stem_chs=32,
+        stem_pool='',
+        downsample='',
+        num_features=1536,
+        act_layer='silu',
+        norm_layer=partial(EvoNorm2dS0a, group_size=16),
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_c16_evos=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=4),
+            ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=4),
+        ),
+        stem_chs=32,
+        stem_pool='',
+        downsample='',
+        num_features=1536,
+        act_layer='silu',
+        norm_layer=partial(EvoNorm2dS0a, group_size=16),
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+    regnetz_d8_evos=ByoModelCfg(
+        blocks=(
+            ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=8, br=4),
+            ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=8, br=4),
+        ),
+        stem_chs=64,
+        stem_type='deep',
+        stem_pool='',
+        downsample='',
+        num_features=1792,
+        act_layer='silu',
+        norm_layer=partial(EvoNorm2dS0a, group_size=16),
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=0.25),
+        block_kwargs=dict(bottle_in=True, linear_out=True),
+    ),
+)
+
+@register_model
+def gernet_l(pretrained=False, **kwargs):
+    """ GEResNet-Large (GENet-Large from official impl)
+    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
+
+    Uses the 'gernet_l' entry of `model_cfgs`; `pretrained` and `**kwargs` are forwarded
+    unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gernet_m(pretrained=False, **kwargs):
+    """ GEResNet-Medium (GENet-Normal from official impl)
+    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
+
+    Uses the 'gernet_m' entry of `model_cfgs`; `pretrained` and `**kwargs` are forwarded
+    unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gernet_s(pretrained=False, **kwargs):
+    """ GEResNet-Small (GENet-Small from official impl)
+    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
+
+    Uses the 'gernet_s' entry of `model_cfgs`; `pretrained` and `**kwargs` are forwarded
+    unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_a2(pretrained=False, **kwargs):
+    """ RepVGG-A2
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_a2': stage depths (2, 4, 14, 1), width factors (1.5, 1.5, 1.5, 2.75).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b0(pretrained=False, **kwargs):
+    """ RepVGG-B0
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b0': default stage depths (4, 6, 16, 1), width factors (1., 1., 1., 2.5).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b1(pretrained=False, **kwargs):
+    """ RepVGG-B1
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b1': default stage depths (4, 6, 16, 1), width factors (2., 2., 2., 4.).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b1g4(pretrained=False, **kwargs):
+    """ RepVGG-B1g4
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b1g4': same width factors as 'repvgg_b1' (2., 2., 2., 4.) but built with groups=4.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b2(pretrained=False, **kwargs):
+    """ RepVGG-B2
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b2': default stage depths (4, 6, 16, 1), width factors (2.5, 2.5, 2.5, 5.).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b2g4(pretrained=False, **kwargs):
+    """ RepVGG-B2g4
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b2g4': same width factors as 'repvgg_b2' (2.5, 2.5, 2.5, 5.) but built with groups=4.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b3(pretrained=False, **kwargs):
+    """ RepVGG-B3
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b3': default stage depths (4, 6, 16, 1), width factors (3., 3., 3., 5.).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b3g4(pretrained=False, **kwargs):
+    """ RepVGG-B3g4
+    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+
+    Config 'repvgg_b3g4': same width factors as 'repvgg_b3' (3., 3., 3., 5.) but built with groups=4.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def resnet51q(pretrained=False, **kwargs):
+    """ ResNet-51-Q
+    Config 'resnet51q': 'quad2' stem w/o pooling, bottleneck stages w/ (2, 4, 6, 4) repeats,
+    group size 32 in the first three stages and 1 in the last, 2048 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('resnet51q', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def resnet61q(pretrained=False, **kwargs):
+    """ ResNet-61-Q
+    Config 'resnet61q': 'quad' stem w/o pooling, one edge block followed by bottleneck stages
+    w/ (4, 6, 4) repeats, extra_conv=True block kwargs, 2048 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('resnet61q', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def resnext26ts(pretrained=False, **kwargs):
+    """ ResNeXt-26-TS
+    Config 'resnext26ts': (2, 2, 2, 2) bottleneck repeats, group size 32, SiLU act,
+    tiered stem w/ maxpool, no attention layer.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('resnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gcresnext26ts(pretrained=False, **kwargs):
+    """ GC-ResNeXt-26-TS
+    Config 'gcresnext26ts': the 'resnext26ts' layout with attn_layer='gca'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gcresnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def seresnext26ts(pretrained=False, **kwargs):
+    """ SE-ResNeXt-26-TS
+    Config 'seresnext26ts': the 'resnext26ts' layout with attn_layer='se'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('seresnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_resnext26ts(pretrained=False, **kwargs):
+    """ ECA-ResNeXt-26-TS
+    Config 'eca_resnext26ts': the 'resnext26ts' layout with attn_layer='eca'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('eca_resnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def bat_resnext26ts(pretrained=False, **kwargs):
+    """ BAT-ResNeXt-26-TS
+    Config 'bat_resnext26ts': the 'resnext26ts' layout with attn_layer='bat' and
+    attn_kwargs block_size=8.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('bat_resnext26ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def resnet32ts(pretrained=False, **kwargs):
+    """ ResNet-32-TS
+    Config 'resnet32ts': (2, 3, 3, 2) bottleneck repeats, no groups, no attention, SiLU act,
+    tiered stem w/o maxpool, num_features=0 (no pre-FC feature layer).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('resnet32ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def resnet33ts(pretrained=False, **kwargs):
+    """ ResNet-33-TS
+    Config 'resnet33ts': (2, 3, 3, 2) bottleneck repeats, no groups, no attention, SiLU act,
+    tiered stem w/o maxpool, 1280 pre-FC features.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('resnet33ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gcresnet33ts(pretrained=False, **kwargs):
+    """ GC-ResNet-33-TS
+    Config 'gcresnet33ts': the 'resnet33ts' layout with attn_layer='gca'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gcresnet33ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def seresnet33ts(pretrained=False, **kwargs):
+    """ SE-ResNet-33-TS
+    Config 'seresnet33ts': the 'resnet33ts' layout with attn_layer='se'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('seresnet33ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_resnet33ts(pretrained=False, **kwargs):
+    """ ECA-ResNet-33-TS
+    Config 'eca_resnet33ts': the 'resnet33ts' layout with attn_layer='eca'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('eca_resnet33ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gcresnet50t(pretrained=False, **kwargs):
+    """ GC-ResNet-50-T
+    Config 'gcresnet50t': (3, 4, 6, 3) bottleneck repeats, tiered stem w/o pooling,
+    attn_layer='gca'; the config does not override the activation layer.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gcresnet50t', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gcresnext50ts(pretrained=False, **kwargs):
+    """ GC-ResNeXt-50-TS
+    Config 'gcresnext50ts': (3, 4, 6, 3) bottleneck repeats, group size 32, SiLU act,
+    tiered stem w/ maxpool, attn_layer='gca'.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('gcresnext50ts', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_b16(pretrained=False, **kwargs):
+    """ RegNetZ-B16 (experimental)
+    Config 'regnetz_b16': (2, 6, 12, 2) bottleneck repeats, group size 16, bottle ratio 3,
+    SE attn (rd_ratio=0.25), bottle_in + linear_out blocks, 1536 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_b16', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_c16(pretrained=False, **kwargs):
+    """ RegNetZ-C16 (experimental)
+    Config 'regnetz_c16': (2, 6, 12, 2) bottleneck repeats, group size 16, bottle ratio 4,
+    SE attn (rd_ratio=0.25), bottle_in + linear_out blocks, 1536 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_c16', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_d32(pretrained=False, **kwargs):
+    """ RegNetZ-D32 (experimental)
+    Config 'regnetz_d32': (3, 6, 12, 3) bottleneck repeats, group size 32, bottle ratio 4,
+    tiered stem, SE attn (rd_ratio=0.25), 1792 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_d32', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_d8(pretrained=False, **kwargs):
+    """ RegNetZ-D8 (experimental)
+    Config 'regnetz_d8': (3, 6, 12, 3) bottleneck repeats, group size 8, bottle ratio 4,
+    tiered stem, SE attn (rd_ratio=0.25), 1792 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_d8', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_e8(pretrained=False, **kwargs):
+    """ RegNetZ-E8 (experimental)
+    Config 'regnetz_e8': (3, 8, 16, 3) bottleneck repeats, group size 8, bottle ratio 4,
+    tiered stem, SE attn (rd_ratio=0.25), 2048 pre-FC features, SiLU act.
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_e8', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_b16_evos(pretrained=False, **kwargs):
+    """ RegNetZ-B16 w/ EvoNorm (experimental)
+    Config 'regnetz_b16_evos': same block layout as 'regnetz_b16' with
+    norm_layer=partial(EvoNorm2dS0a, group_size=16).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_b16_evos', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_c16_evos(pretrained=False, **kwargs):
+    """ RegNetZ-C16 w/ EvoNorm (experimental)
+    Config 'regnetz_c16_evos': same block layout as 'regnetz_c16' with
+    norm_layer=partial(EvoNorm2dS0a, group_size=16).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_c16_evos', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def regnetz_d8_evos(pretrained=False, **kwargs):
+    """ RegNetZ-D8 w/ EvoNorm (experimental)
+    Config 'regnetz_d8_evos': same block layout as 'regnetz_d8' but with stem_type='deep' and
+    norm_layer=partial(EvoNorm2dS0a, group_size=16).
+    `pretrained` and `**kwargs` are forwarded unchanged to `_create_byobnet`.
+    """
+    return _create_byobnet('regnetz_d8_evos', pretrained=pretrained, **kwargs)
+
+
+def expand_blocks_cfg(stage_blocks_cfg: Union[ByoBlockCfg, Sequence[ByoBlockCfg]]) -> List[ByoBlockCfg]:
+    if not isinstance(stage_blocks_cfg, Sequence):
+        stage_blocks_cfg = (stage_blocks_cfg,)
+    block_cfgs = []
+    for i, cfg in enumerate(stage_blocks_cfg):
+        block_cfgs += [replace(cfg, d=1) for _ in range(cfg.d)]
+    return block_cfgs
+
+
+def num_groups(group_size, channels):
+    if not group_size:  # 0 or None
+        return 1  # normal conv with 1 group
+    else:
+        # NOTE group_size == 1 -> depthwise conv
+        assert channels % group_size == 0
+        return channels // group_size
+
+
+@dataclass
+class LayerFn:
+    """ Bundle of layer factories that the block classes in this file receive as `layers`. """
+    # conv + norm + act factory; blocks call it as conv_norm_act(in_chs, out_chs, kernel_size, ...)
+    conv_norm_act: Callable = ConvNormAct
+    # NOTE(review): presumably a norm + act factory; it is not called in the nearby blocks - confirm usage
+    norm_act: Callable = BatchNormAct2d
+    # activation class; blocks instantiate it as act(inplace=True)
+    act: Callable = nn.ReLU
+    # optional attention factory; blocks call attn(channels) and use nn.Identity when it is None
+    attn: Optional[Callable] = None
+    # NOTE(review): optional self-attention factory; its call sites are outside this section - confirm usage
+    self_attn: Optional[Callable] = None
+
+
+class DownsampleAvg(nn.Module):
+    def __init__(self, in_chs, out_chs, stride=1, dilation=1, apply_act=False, layers: LayerFn = None):
+        """ AvgPool Downsampling as in 'D' ResNet variants."""
+        super(DownsampleAvg, self).__init__()
+        layers = layers or LayerFn()
+        avg_stride = stride if dilation == 1 else 1
+        if stride > 1 or dilation > 1:
+            avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
+            self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
+        else:
+            self.pool = nn.Identity()
+        self.conv = layers.conv_norm_act(in_chs, out_chs, 1, apply_act=apply_act)
+
+    def forward(self, x):
+        return self.conv(self.pool(x))
+
+
+def create_shortcut(downsample_type, layers: LayerFn, in_chs, out_chs, stride, dilation, **kwargs):
+    assert downsample_type in ('avg', 'conv1x1', '')
+    if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+        if not downsample_type:
+            return None  # no shortcut
+        elif downsample_type == 'avg':
+            return DownsampleAvg(in_chs, out_chs, stride=stride, dilation=dilation[0], **kwargs)
+        else:
+            return layers.conv_norm_act(in_chs, out_chs, kernel_size=1, stride=stride, dilation=dilation[0], **kwargs)
+    else:
+        return nn.Identity()  # identity shortcut
+
+
+class BasicBlock(nn.Module):
+    """ ResNet Basic Block - kxk + kxk
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), group_size=None, bottle_ratio=1.0,
+            downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = None, drop_block=None,
+            drop_path_rate=0.):
+        super(BasicBlock, self).__init__()
+        layers = layers or LayerFn()
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        self.shortcut = create_shortcut(
+            downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation,
+            apply_act=False, layers=layers)
+
+        self.conv1_kxk = layers.conv_norm_act(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0])
+        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
+        self.conv2_kxk = layers.conv_norm_act(
+            mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_layer=drop_block, apply_act=False)
+        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        if zero_init_last and self.shortcut is not None:
+            nn.init.zeros_(self.conv2_kxk.bn.weight)
+        for attn in (self.attn, self.attn_last):
+            if hasattr(attn, 'reset_parameters'):
+                attn.reset_parameters()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1_kxk(x)
+        x = self.conv2_kxk(x)
+        x = self.attn(x)
+        x = self.drop_path(x)
+        if self.shortcut is not None:
+            x = x + self.shortcut(shortcut)
+        return self.act(x)
+
+
+class BottleneckBlock(nn.Module):
+    """ ResNet-like Bottleneck Block - 1x1 - kxk - 1x1
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None,
+            downsample='avg', attn_last=False, linear_out=False, extra_conv=False, bottle_in=False,
+            layers: LayerFn = None, drop_block=None, drop_path_rate=0.):
+        super(BottleneckBlock, self).__init__()
+        layers = layers or LayerFn()
+        mid_chs = make_divisible((in_chs if bottle_in else out_chs) * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        self.shortcut = create_shortcut(
+            downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation,
+            apply_act=False, layers=layers)
+
+        self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1)
+        self.conv2_kxk = layers.conv_norm_act(
+            mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block)
+        if extra_conv:
+            self.conv2b_kxk = layers.conv_norm_act(mid_chs, mid_chs, kernel_size, dilation=dilation[1], groups=groups)
+        else:
+            self.conv2b_kxk = nn.Identity()
+        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
+        self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False)
+        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        if zero_init_last and self.shortcut is not None:
+            nn.init.zeros_(self.conv3_1x1.bn.weight)
+        for attn in (self.attn, self.attn_last):
+            if hasattr(attn, 'reset_parameters'):
+                attn.reset_parameters()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1_1x1(x)
+        x = self.conv2_kxk(x)
+        x = self.conv2b_kxk(x)
+        x = self.attn(x)
+        x = self.conv3_1x1(x)
+        x = self.attn_last(x)
+        x = self.drop_path(x)
+        if self.shortcut is not None:
+            x = x + self.shortcut(shortcut)
+        return self.act(x)
+
+
+class DarkBlock(nn.Module):
+    """ DarkNet-like (1x1 + 3x3 w/ stride) block
+
+    The GE-Net impl included a 1x1 + 3x3 block in their search space. It was not used in the feature models.
+    This block is pretty much a DarkNet block (also DenseNet) hence the name. Neither DarkNet or DenseNet
+    uses strides within the block (external 3x3 or maxpool downsampling is done in front of the block repeats).
+
+    If one does want to use a lot of these blocks w/ stride, I'd recommend using the EdgeBlock (3x3 /w stride + 1x1)
+    for more optimal compute.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+            downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = None, drop_block=None,
+            drop_path_rate=0.):
+        super(DarkBlock, self).__init__()
+        layers = layers or LayerFn()
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        self.shortcut = create_shortcut(
+            downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation,
+            apply_act=False, layers=layers)
+
+        self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1)
+        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
+        self.conv2_kxk = layers.conv_norm_act(
+            mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups, drop_layer=drop_block, apply_act=False)
+        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        if zero_init_last and self.shortcut is not None:
+            nn.init.zeros_(self.conv2_kxk.bn.weight)
+        for attn in (self.attn, self.attn_last):
+            if hasattr(attn, 'reset_parameters'):
+                attn.reset_parameters()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1_1x1(x)
+        x = self.attn(x)
+        x = self.conv2_kxk(x)
+        x = self.attn_last(x)
+        x = self.drop_path(x)
+        if self.shortcut is not None:
+            x = x + self.shortcut(shortcut)
+        return self.act(x)
+
+
+class EdgeBlock(nn.Module):
+    """ EdgeResidual-like (3x3 + 1x1) block
+
+    A two layer block like DarkBlock, but with the order of the 3x3 and 1x1 convs reversed.
+    Very similar to the EfficientNet Edge-Residual block, but this block ends with activations, is
+    intended to be used with either expansion or bottleneck contraction, and can use DW/group/non-grouped convs.
+
+    FIXME is there a more common 3x3 + 1x1 conv block to name this after?
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+            downsample='avg', attn_last=False, linear_out=False, layers: LayerFn = None,
+            drop_block=None, drop_path_rate=0.):
+        super(EdgeBlock, self).__init__()
+        layers = layers or LayerFn()  # fall back to default layer factories
+        mid_chs = make_divisible(out_chs * bottle_ratio)  # width between the kxk and 1x1 convs
+        groups = num_groups(group_size, mid_chs)
+
+        self.shortcut = create_shortcut(
+            downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation,
+            apply_act=False, layers=layers)
+
+        self.conv1_kxk = layers.conv_norm_act(
+            in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block)
+        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
+        self.conv2_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False)  # act applied after the add
+        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        if zero_init_last and self.shortcut is not None:  # only zero the branch when a shortcut exists
+            nn.init.zeros_(self.conv2_1x1.bn.weight)
+        for attn in (self.attn, self.attn_last):
+            if hasattr(attn, 'reset_parameters'):
+                attn.reset_parameters()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1_kxk(x)
+        x = self.attn(x)  # Identity when attn_last is set or no attn layer is configured
+        x = self.conv2_1x1(x)
+        x = self.attn_last(x)  # Identity unless attn_last is set and an attn layer is configured
+        x = self.drop_path(x)
+        if self.shortcut is not None:  # no residual add when create_shortcut returned None
+            x = x + self.shortcut(shortcut)
+        return self.act(x)
+
+
+class RepVggBlock(nn.Module):
+    """ RepVGG Block.
+
+    Adapted from impl at https://github.com/DingXiaoH/RepVGG
+
+    This version does not currently support the deploy optimization. It is currently fixed in 'train' mode.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+            downsample='', layers: LayerFn = None, drop_block=None, drop_path_rate=0.):
+        super(RepVggBlock, self).__init__()
+        layers = layers or LayerFn()  # fall back to default layer factories
+        groups = num_groups(group_size, in_chs)
+
+        use_ident = in_chs == out_chs and stride == 1 and dilation[0] == dilation[1]
+        self.identity = layers.norm_act(out_chs, apply_act=False) if use_ident else None  # norm-only branch
+        self.conv_kxk = layers.conv_norm_act(
+            in_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups, drop_layer=drop_block, apply_act=False)
+        self.conv_1x1 = layers.conv_norm_act(in_chs, out_chs, 1, stride=stride, groups=groups, apply_act=False)
+        self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. and use_ident else nn.Identity()
+        self.act = layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        # NOTE this init overrides the base model init with specific changes for this block type
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                nn.init.normal_(m.weight, .1, .1)  # mean 0.1, std 0.1
+                nn.init.normal_(m.bias, 0, .1)  # mean 0, std 0.1
+        if hasattr(self.attn, 'reset_parameters'):
+            self.attn.reset_parameters()
+
+    def forward(self, x):
+        if self.identity is None:  # no identity branch: sum of the 1x1 and kxk branches only
+            x = self.conv_1x1(x) + self.conv_kxk(x)
+        else:
+            identity = self.identity(x)
+            x = self.conv_1x1(x) + self.conv_kxk(x)
+            x = self.drop_path(x)  # not in the paper / official impl, experimental
+            x = x + identity
+        x = self.attn(x)  # no attn in the paper / official impl, experimental
+        return self.act(x)
+
+
+class SelfAttnBlock(nn.Module):
+    """ ResNet-like Bottleneck Block - 1x1 - optional kxk - self attn - 1x1
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None,
+            downsample='avg', extra_conv=False, linear_out=False, bottle_in=False, post_attn_na=True,
+            feat_size=None, layers: LayerFn = None, drop_block=None, drop_path_rate=0.):
+        super(SelfAttnBlock, self).__init__()
+        assert layers is not None  # no LayerFn() fallback here: layers.self_attn is called below
+        mid_chs = make_divisible((in_chs if bottle_in else out_chs) * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        self.shortcut = create_shortcut(
+            downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation,
+            apply_act=False, layers=layers)
+
+        self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1)
+        if extra_conv:
+            self.conv2_kxk = layers.conv_norm_act(
+                mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
+                groups=groups, drop_layer=drop_block)
+            stride = 1  # striding done via conv if enabled
+        else:
+            self.conv2_kxk = nn.Identity()
+        opt_kwargs = {} if feat_size is None else dict(feat_size=feat_size)  # forwarded only when given
+        # FIXME need to dilate self attn to have dilated network support, moop moop
+        self.self_attn = layers.self_attn(mid_chs, stride=stride, **opt_kwargs)
+        self.post_attn = layers.norm_act(mid_chs) if post_attn_na else nn.Identity()
+        self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False)  # act applied after the add
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
+
+    def init_weights(self, zero_init_last: bool = False):
+        if zero_init_last and self.shortcut is not None:  # only zero the branch when a shortcut exists
+            nn.init.zeros_(self.conv3_1x1.bn.weight)
+        if hasattr(self.self_attn, 'reset_parameters'):
+            self.self_attn.reset_parameters()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1_1x1(x)
+        x = self.conv2_kxk(x)  # Identity unless extra_conv was set
+        x = self.self_attn(x)
+        x = self.post_attn(x)  # Identity when post_attn_na is False
+        x = self.conv3_1x1(x)
+        x = self.drop_path(x)
+        if self.shortcut is not None:  # no residual add when create_shortcut returned None
+            x = x + self.shortcut(shortcut)
+        return self.act(x)
+
+_block_registry = dict(  # block type name -> block class, looked up by create_block()
+    basic=BasicBlock,
+    bottle=BottleneckBlock,
+    dark=DarkBlock,
+    edge=EdgeBlock,
+    rep=RepVggBlock,
+    self_attn=SelfAttnBlock,
+)
+
+
+def register_block(block_type: str, block_fn: nn.Module):  # add (or overwrite) one registry entry
+    _block_registry.update({block_type: block_fn})
+
+
+def create_block(block: Union[str, nn.Module], **kwargs):  # build a block from a callable or a registry name
+    if isinstance(block, (nn.Module, partial)):
+        return block(**kwargs)  # module instance or partial: call it directly with the block kwargs
+    assert block in _block_registry, f'Unknown block type ({block})'
+    return _block_registry[block](**kwargs)
+
+
+class Stem(nn.Sequential):  # conv stack with an optional max pool; records feature_info while it is built
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=4, pool='maxpool',
+            num_rep=3, num_act=None, chs_decay=0.5, layers: LayerFn = None):
+        super().__init__()
+        assert stride in (2, 4)
+        layers = layers or LayerFn()
+
+        if isinstance(out_chs, (list, tuple)):  # explicit per-conv channels override num_rep
+            num_rep = len(out_chs)
+            stem_chs = out_chs
+        else:
+            stem_chs = [round(out_chs * chs_decay ** i) for i in range(num_rep)][::-1]  # smallest first
+
+        self.stride = stride
+        self.feature_info = []  # track intermediate features
+        prev_feat = ''
+        stem_strides = [2] + [1] * (num_rep - 1)  # the first conv is always strided
+        if stride == 4 and not pool:
+            # set last conv in stack to be strided if stride == 4 and no pooling layer
+            stem_strides[-1] = 2
+
+        num_act = num_rep if num_act is None else num_act
+        # if num_act < num_rep, first convs in stack won't have bn + act
+        stem_norm_acts = [False] * (num_rep - num_act) + [True] * num_act
+        prev_chs = in_chs
+        curr_stride = 1
+        for i, (ch, s, na) in enumerate(zip(stem_chs, stem_strides, stem_norm_acts)):
+            layer_fn = layers.conv_norm_act if na else create_conv2d
+            conv_name = f'conv{i + 1}'
+            if i > 0 and s > 1:  # record the layer just before each later strided conv
+                self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat))
+            self.add_module(conv_name, layer_fn(prev_chs, ch, kernel_size=kernel_size, stride=s))
+            prev_chs = ch
+            curr_stride *= s
+            prev_feat = conv_name
+
+        if pool and 'max' in pool.lower():  # any other non-empty pool value adds no layer
+            self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat))
+            self.add_module('pool', nn.MaxPool2d(3, 2, 1))
+            curr_stride *= 2
+            prev_feat = 'pool'
+
+        self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat))
+        assert curr_stride == stride  # the convs and pool must add up to the requested stride
+
+
+def create_byob_stem(in_chs, out_chs, stem_type='', pool_type='', feat_prefix='stem', layers: LayerFn = None):
+    layers = layers or LayerFn()  # fall back to default layer factories
+    assert stem_type in ('', 'quad', 'quad2', 'tiered', 'deep', 'rep', '7x7', '3x3')
+    if 'quad' in stem_type:
+        # based on NFNet stem, stack of 4 3x3 convs
+        num_act = 2 if 'quad2' in stem_type else None
+        stem = Stem(in_chs, out_chs, num_rep=4, num_act=num_act, pool=pool_type, layers=layers)
+    elif 'tiered' in stem_type:
+        # 3x3 stack of 3 convs as in my ResNet-T
+        stem = Stem(in_chs, (3 * out_chs // 8, out_chs // 2, out_chs), pool=pool_type, layers=layers)
+    elif 'deep' in stem_type:
+        # 3x3 stack of 3 convs as in ResNet-D
+        stem = Stem(in_chs, out_chs, num_rep=3, chs_decay=1.0, pool=pool_type, layers=layers)
+    elif 'rep' in stem_type:
+        stem = RepVggBlock(in_chs, out_chs, stride=2, layers=layers)  # single stride-2 RepVGG block
+    elif '7x7' in stem_type:
+        # 7x7 stem conv as in ResNet
+        if pool_type:
+            stem = Stem(in_chs, out_chs, 7, num_rep=1, pool=pool_type, layers=layers)
+        else:
+            stem = layers.conv_norm_act(in_chs, out_chs, 7, stride=2)
+    else:
+        # 3x3 stem conv as in RegNet is the default
+        if pool_type:
+            stem = Stem(in_chs, out_chs, 3, num_rep=1, pool=pool_type, layers=layers)
+        else:
+            stem = layers.conv_norm_act(in_chs, out_chs, 3, stride=2)
+
+    if isinstance(stem, Stem):  # reuse the features the Stem recorded, prefixing each module name
+        feature_info = [dict(f, module='.'.join([feat_prefix, f['module']])) for f in stem.feature_info]
+    else:  # every non-Stem stem above is a single stride-2 module
+        feature_info = [dict(num_chs=out_chs, reduction=2, module=feat_prefix)]
+    return stem, feature_info
+
+
+def reduce_feat_size(feat_size, stride=2):  # floor-divide every entry by stride; None passes through
+    return tuple(dim // stride for dim in feat_size) if feat_size is not None else None
+
+
+def override_kwargs(block_kwargs, model_kwargs):
+    """ Pick the kwargs for one block: block level wins over model level.
+
+    The two dicts are never merged. Any block_kwargs that is not None (an empty dict included)
+    replaces model_kwargs outright, so an empty dict clears the model level kwargs for the block.
+
+    Always returns a dict; a falsy selection (None or empty) comes back as a new empty dict.
+    """
+    chosen = model_kwargs if block_kwargs is None else block_kwargs
+    return chosen or {}  # falsy selection -> fresh {}
+
+
+def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, model_cfg: ByoModelCfg, ):
+    layer_fns = block_kwargs['layers']  # layer factories currently set for this block
+
+    # override attn layer / args with block local config
+    attn_set = block_cfg.attn_layer is not None
+    if attn_set or block_cfg.attn_kwargs is not None:
+        # override attn layer config
+        if attn_set and not block_cfg.attn_layer:
+            # empty string for attn_layer type will disable attn for this block
+            attn_layer = None
+        else:
+            attn_kwargs = override_kwargs(block_cfg.attn_kwargs, model_cfg.attn_kwargs)
+            attn_layer = block_cfg.attn_layer or model_cfg.attn_layer  # block level layer wins if set
+            attn_layer = partial(get_attn(attn_layer), **attn_kwargs) if attn_layer is not None else None
+        layer_fns = replace(layer_fns, attn=attn_layer)
+
+    # override self-attn layer / args with block local cfg
+    self_attn_set = block_cfg.self_attn_layer is not None
+    if self_attn_set or block_cfg.self_attn_kwargs is not None:
+        # override self-attn layer config
+        if self_attn_set and not block_cfg.self_attn_layer:  # self_attn_layer == ''
+            # empty string for self_attn_layer type will disable self-attn for this block
+            self_attn_layer = None
+        else:
+            self_attn_kwargs = override_kwargs(block_cfg.self_attn_kwargs, model_cfg.self_attn_kwargs)
+            self_attn_layer = block_cfg.self_attn_layer or model_cfg.self_attn_layer
+            self_attn_layer = partial(get_attn(self_attn_layer), **self_attn_kwargs) \
+                if self_attn_layer is not None else None
+        layer_fns = replace(layer_fns, self_attn=self_attn_layer)
+
+    block_kwargs['layers'] = layer_fns  # block_kwargs is mutated in place; nothing is returned
+
+    # add additional block_kwargs specified in block_cfg or model_cfg, precedence to block if set
+    block_kwargs.update(override_kwargs(block_cfg.block_kwargs, model_cfg.block_kwargs))
+
+
+def create_byob_stages(
+        cfg: ByoModelCfg, drop_path_rate: float, output_stride: int, stem_feat: Dict[str, Any],
+        feat_size: Optional[int] = None,
+        layers: Optional[LayerFn] = None,
+        block_kwargs_fn: Optional[Callable] = update_block_kwargs):
+
+    layers = layers or LayerFn()  # fall back to default layer factories
+    feature_info = []
+    block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks]
+    depths = [sum([bc.d for bc in stage_bcs]) for stage_bcs in block_cfgs]
+    dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]  # per stage, per block
+    dilation = 1
+    net_stride = stem_feat['reduction']
+    prev_chs = stem_feat['num_chs']
+    prev_feat = stem_feat
+    stages = []
+    for stage_idx, stage_block_cfgs in enumerate(block_cfgs):
+        stride = stage_block_cfgs[0].s
+        if stride != 1 and prev_feat:  # the previous feature is recorded whenever this stage is strided
+            feature_info.append(prev_feat)
+        if net_stride >= output_stride and stride > 1:  # target stride reached: dilate instead of striding
+            dilation *= stride
+            stride = 1
+        net_stride *= stride
+        first_dilation = 1 if dilation in (1, 2) else 2
+
+        blocks = []
+        for block_idx, block_cfg in enumerate(stage_block_cfgs):
+            out_chs = make_divisible(block_cfg.c * cfg.width_factor)
+            group_size = block_cfg.gs
+            if isinstance(group_size, Callable):  # gs may be a function of (out_chs, block_idx)
+                group_size = group_size(out_chs, block_idx)
+            block_kwargs = dict(  # Blocks used in this model must accept these arguments
+                in_chs=prev_chs,
+                out_chs=out_chs,
+                stride=stride if block_idx == 0 else 1,
+                dilation=(first_dilation, dilation),
+                group_size=group_size,
+                bottle_ratio=block_cfg.br,
+                downsample=cfg.downsample,
+                drop_path_rate=dpr[stage_idx][block_idx],
+                layers=layers,
+            )
+            if block_cfg.type in ('self_attn',):
+                # add feat_size arg for blocks that support/need it
+                block_kwargs['feat_size'] = feat_size
+            block_kwargs_fn(block_kwargs, block_cfg=block_cfg, model_cfg=cfg)  # mutates block_kwargs in place
+            blocks += [create_block(block_cfg.type, **block_kwargs)]
+            first_dilation = dilation
+            prev_chs = out_chs
+            if stride > 1 and block_idx == 0:  # only the first block of a stage is strided
+                feat_size = reduce_feat_size(feat_size, stride)
+
+        stages += [nn.Sequential(*blocks)]
+        prev_feat = dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}')
+
+    feature_info.append(prev_feat)  # the last stage is always recorded
+    return nn.Sequential(*stages), feature_info
+
+
+def get_layer_fns(cfg: ByoModelCfg):  # build the LayerFn bundle of layer factories from the model cfg
+    act = get_act_layer(cfg.act_layer)
+    norm_act = get_norm_act_layer(norm_layer=cfg.norm_layer, act_layer=act)
+    conv_norm_act = partial(ConvNormAct, norm_layer=cfg.norm_layer, act_layer=act)
+    attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None  # None if unset/empty
+    self_attn = partial(get_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None
+    layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn)
+    return layer_fn
+
+
+class ByobNet(nn.Module):
+    """ 'Bring-your-own-blocks' Net
+
+    A flexible network backbone that allows building model stem + blocks via
+    dataclass cfg definition w/ factory functions for module instantiation.
+
+    Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act).
+    """
+    def __init__(
+            self, cfg: ByoModelCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32,
+            zero_init_last=True, img_size=None, drop_rate=0., drop_path_rate=0.):
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+        layers = get_layer_fns(cfg)
+        if cfg.fixed_input_size:
+            assert img_size is not None, 'img_size argument is required for fixed input size model'
+        feat_size = to_2tuple(img_size) if img_size is not None else None
+
+        self.feature_info = []
+        stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor))
+        self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers)
+        self.feature_info.extend(stem_feat[:-1])  # the last stem feature is handed to the stage builder
+        feat_size = reduce_feat_size(feat_size, stride=stem_feat[-1]['reduction'])
+
+        self.stages, stage_feat = create_byob_stages(
+            cfg, drop_path_rate, output_stride, stem_feat[-1], layers=layers, feat_size=feat_size)
+        self.feature_info.extend(stage_feat[:-1])  # the last stage feature is reported as 'final_conv' below
+
+        prev_chs = stage_feat[-1]['num_chs']
+        if cfg.num_features:  # optional 1x1 conv to the configured feature width
+            self.num_features = int(round(cfg.width_factor * cfg.num_features))
+            self.final_conv = layers.conv_norm_act(prev_chs, self.num_features, 1)
+        else:
+            self.num_features = prev_chs
+            self.final_conv = nn.Identity()
+        self.feature_info += [
+            dict(num_chs=self.num_features, reduction=stage_feat[-1]['reduction'], module='final_conv')]
+
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+        # init weights
+        named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(  # regex groups: stem, then per stage (coarse) or per block, then final_conv
+            stem=r'^stem',
+            blocks=[
+                (r'^stages\.(\d+)' if coarse else r'^stages\.(\d+)\.(\d+)', None),
+                (r'^final_conv', (99999,))
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes  # keep the attribute in sync with the rebuilt head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.stages, x)
+        else:
+            x = self.stages(x)
+        x = self.final_conv(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_weights(module, name='', zero_init_last=False):  # per-module init, dispatched on the module type
+    if isinstance(module, nn.Conv2d):
+        fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels  # kernel area * out chans
+        fan_out //= module.groups
+        module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))  # mean 0, std sqrt(2 / fan_out)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Linear):
+        nn.init.normal_(module.weight, mean=0.0, std=0.01)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.BatchNorm2d):
+        nn.init.ones_(module.weight)
+        nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):  # other modules exposing init_weights() run their own init
+        module.init_weights(zero_init_last=zero_init_last)
+
+
+def _create_byobnet(variant, pretrained=False, **kwargs):  # build a ByobNet for a named entry of model_cfgs
+    return build_model_with_cfg(
+        ByobNet, variant, pretrained,
+        model_cfg=model_cfgs[variant],  # raises KeyError for a variant missing from model_cfgs
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs)
diff --git a/src/custom_timm/models/cait.py b/src/custom_timm/models/cait.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8ec277e8fa8027b340872ccb7a6179479d4bee
--- /dev/null
+++ b/src/custom_timm/models/cait.py
@@ -0,0 +1,421 @@
+""" Class-Attention in Image Transformers (CaiT)
+
+Paper: 'Going deeper with Image Transformers' - https://arxiv.org/abs/2103.17239
+
+Original code and weights from https://github.com/facebookresearch/deit, copyright below
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+# Copyright (c) 2015-present, Facebook, Inc.
+# All rights reserved.
+from copy import deepcopy
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_
+from .registry import register_model
+
+
+__all__ = ['Cait', 'ClassAttn', 'LayerScaleBlockClassAttn', 'LayerScaleBlock', 'TalkingHeadAttn']
+
+
+def _cfg(url='', **kwargs):  # default pretrained-cfg dict for CaiT; kwargs override or extend the defaults
+    cfg = {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 384, 384), 'pool_size': None,
+        'crop_pct': 1.0, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head'}
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = dict(  # per-variant pretrained cfg; 384x384 input unless input_size is overridden
+    cait_xxs24_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/XXS24_224.pth',
+        input_size=(3, 224, 224),
+    ),
+    cait_xxs24_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/XXS24_384.pth',
+    ),
+    cait_xxs36_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/XXS36_224.pth',
+        input_size=(3, 224, 224),
+    ),
+    cait_xxs36_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/XXS36_384.pth',
+    ),
+    cait_xs24_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/XS24_384.pth',
+    ),
+    cait_s24_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/S24_224.pth',
+        input_size=(3, 224, 224),
+    ),
+    cait_s24_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/S24_384.pth',
+    ),
+    cait_s36_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/S36_384.pth',
+    ),
+    cait_m36_384=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/M36_384.pth',
+    ),
+    cait_m48_448=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/M48_448.pth',
+        input_size=(3, 448, 448),
+    ),
+)
+
+
+class ClassAttn(nn.Module):
+    # Class attention: only the first token (x[:, 0]) forms the query; keys and values come from all tokens.
+    # Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = (dim // num_heads) ** -0.5  # 1 / sqrt(per-head width)
+
+        # separate projections; q is only ever applied to the first token
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.k = nn.Linear(dim, dim, bias=qkv_bias)
+        self.v = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        batch, num_tokens, chans = x.shape
+        heads, head_dim = self.num_heads, chans // self.num_heads
+
+        # (B, heads, 1, head_dim) query from the first token, pre-scaled
+        query = self.q(x[:, 0]).unsqueeze(1).reshape(batch, 1, heads, head_dim).permute(0, 2, 1, 3) * self.scale
+        keys = self.k(x).reshape(batch, num_tokens, heads, head_dim).permute(0, 2, 1, 3)
+        values = self.v(x).reshape(batch, num_tokens, heads, head_dim).permute(0, 2, 1, 3)
+
+        weights = (query @ keys.transpose(-2, -1)).softmax(dim=-1)
+        weights = self.attn_drop(weights)
+
+        out = (weights @ values).transpose(1, 2).reshape(batch, 1, chans)
+        out = self.proj(out)
+        out = self.proj_drop(out)
+
+        return out
+
+
+class LayerScaleBlockClassAttn(nn.Module):
+    # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+    # with slight modifications to add CA and LayerScale
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=ClassAttn,
+            mlp_block=Mlp, init_values=1e-4):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_block(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = mlp_block(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+        self.gamma_1 = nn.Parameter(init_values * torch.ones(dim))  # learnable per-channel scale, attn branch
+        self.gamma_2 = nn.Parameter(init_values * torch.ones(dim))  # learnable per-channel scale, mlp branch
+
+    def forward(self, x, x_cls):
+        u = torch.cat((x_cls, x), dim=1)  # x_cls is placed ahead of x along dim 1
+        x_cls = x_cls + self.drop_path(self.gamma_1 * self.attn(self.norm1(u)))
+        x_cls = x_cls + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x_cls)))
+        return x_cls  # only x_cls receives the residual updates; x is not returned
+
+
+class TalkingHeadAttn(nn.Module):
+    # Multi-head self-attention with talking heads (https://arxiv.org/pdf/2003.02436v1.pdf): attention maps are
+    # mixed across heads by a linear layer before the softmax (proj_l) and again after it (proj_w).
+    # Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = (dim // num_heads) ** -0.5  # 1 / sqrt(per-head width)
+
+        # fused query/key/value projection
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+
+        # output projection over channels
+        self.proj = nn.Linear(dim, dim)
+
+        # head-mixing layers, applied with the head axis moved last
+        self.proj_l = nn.Linear(num_heads, num_heads)
+        self.proj_w = nn.Linear(num_heads, num_heads)
+
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        batch, num_tokens, chans = x.shape
+        heads = self.num_heads
+        qkv = self.qkv(x).reshape(batch, num_tokens, 3, heads, chans // heads).permute(2, 0, 3, 1, 4)
+        query, keys, values = qkv[0] * self.scale, qkv[1], qkv[2]
+
+        # raw scores, (B, heads, N, N)
+        scores = query @ keys.transpose(-2, -1)
+
+        # mix across heads before and after the softmax; the permutes move the head axis last and back
+        scores = self.proj_l(scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        weights = scores.softmax(dim=-1)
+        weights = self.proj_w(weights.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        weights = self.attn_drop(weights)
+
+        out = (weights @ values).transpose(1, 2).reshape(batch, num_tokens, chans)
+        out = self.proj_drop(self.proj(out))
+        return out
+
+
+class LayerScaleBlock(nn.Module):
+    # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+    # with slight modifications to add LayerScale
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=TalkingHeadAttn,
+            mlp_block=Mlp, init_values=1e-4):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_block(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = mlp_block(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+        self.gamma_1 = nn.Parameter(init_values * torch.ones(dim))  # learnable per-channel scale, attn branch
+        self.gamma_2 = nn.Parameter(init_values * torch.ones(dim))  # learnable per-channel scale, mlp branch
+
+    def forward(self, x):
+        x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))  # pre-norm attention residual
+        x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))  # pre-norm mlp residual
+        return x
+
+
+class Cait(nn.Module):
+    # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+    # with slight modifications to adapt to our cait models
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token',
+            embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+            block_layers=LayerScaleBlock,
+            block_layers_token=LayerScaleBlockClassAttn,
+            patch_layer=PatchEmbed,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            act_layer=nn.GELU,
+            attn_block=TalkingHeadAttn,
+            mlp_block=Mlp,
+            init_values=1e-4,
+            attn_block_token_only=ClassAttn,
+            mlp_block_token_only=Mlp,
+            depth_token_only=2,
+            mlp_ratio_token_only=4.0
+    ):
+        super().__init__()
+        assert global_pool in ('', 'token', 'avg')
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim
+        self.grad_checkpointing = False
+
+        self.patch_embed = patch_layer(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = [drop_path_rate for i in range(depth)]
+        self.blocks = nn.Sequential(*[
+            block_layers(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                act_layer=act_layer, attn_block=attn_block, mlp_block=mlp_block, init_values=init_values)
+            for i in range(depth)])
+
+        self.blocks_token_only = nn.ModuleList([
+            block_layers_token(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio_token_only, qkv_bias=qkv_bias,
+                drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=norm_layer,
+                act_layer=act_layer, attn_block=attn_block_token_only,
+                mlp_block=mlp_block_token_only, init_values=init_values)
+            for i in range(depth_token_only)])
+
+        self.norm = norm_layer(embed_dim)
+
+        self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')]
+        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the set of parameter names ``pos_embed`` and ``cls_token``.
+
+        Going by the method name, these are the parameters meant to be exempt from weight decay.
+        """
+        exempt = {'cls_token', 'pos_embed'}
+        return exempt
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Store ``enable`` on ``self.grad_checkpointing``; ``forward_features`` reads that flag."""
+        requested = enable
+        self.grad_checkpointing = requested
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return a function that maps a parameter name to a group index.
+
+        Stem parameters map to 0, ``blocks.<i>`` to ``i + 1``, ``norm.`` to ``len(self.blocks)``,
+        and anything unmatched to ``float('inf')``. ``coarse`` is accepted but not used here.
+        """
+        stem_prefixes = ('cls_token', 'pos_embed', 'patch_embed')
+
+        def _matcher(name):
+            if name.startswith(stem_prefixes):
+                return 0
+            if name.startswith('blocks.'):
+                return 1 + int(name.split('.')[1])
+            if name.startswith('blocks_token_only.'):
+                # overlap token only blocks with last blocks
+                shift = len(self.blocks) - len(self.blocks_token_only) + 1
+                return shift + int(name.split('.')[1])
+            if name.startswith('norm.'):
+                return len(self.blocks)
+            return float('inf')
+        return _matcher
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the module currently stored in ``self.head``."""
+        classifier = self.head
+        return classifier
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """Rebuild ``self.head`` for ``num_classes`` outputs and optionally change the pooling mode.
+
+        ``global_pool``, when given, must be one of ``''``, ``'token'`` or ``'avg'``.
+        A non-positive ``num_classes`` installs ``nn.Identity`` as the head.
+        """
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'token', 'avg')
+            self.global_pool = global_pool
+        if num_classes > 0:
+            new_head = nn.Linear(self.num_features, num_classes)
+        else:
+            new_head = nn.Identity()
+        self.head = new_head
+
+    def forward_features(self, x):
+        """Embed the input, run the main blocks, then the token-only blocks, and normalise.
+
+        The class token is not part of the sequence seen by ``self.blocks``; it is expanded over the
+        batch afterwards, updated by each token-only block, and concatenated in front of the patch
+        tokens before ``self.norm``.
+        """
+        x = self.pos_drop(self.patch_embed(x) + self.pos_embed)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
+        for blk in self.blocks_token_only:
+            cls_tokens = blk(x, cls_tokens)
+        return self.norm(torch.cat((cls_tokens, x), dim=1))
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Pool the token sequence and, unless ``pre_logits`` is set, apply ``self.head``.
+
+        With ``global_pool == 'avg'`` the tokens after index 0 are averaged; any other truthy
+        value selects token 0; a falsy value skips pooling.
+        """
+        if self.global_pool:
+            if self.global_pool == 'avg':
+                x = x[:, 1:].mean(dim=1)
+            else:
+                x = x[:, 0]
+        if pre_logits:
+            return x
+        return self.head(x)
+
+    def forward(self, x):
+        """Run ``forward_features`` and feed its result to ``forward_head``."""
+        features = self.forward_features(x)
+        return self.forward_head(features)
+
+
+def checkpoint_filter_fn(state_dict, model=None):
+    """Return a copy of the checkpoint with every ``'module.'`` substring removed from its keys.
+
+    If the checkpoint has a ``'model'`` entry, that nested dict is used instead. ``model`` is unused.
+    """
+    weights = state_dict['model'] if 'model' in state_dict else state_dict
+    return {name.replace('module.', ''): value for name, value in weights.items()}
+
+
+def _create_cait(variant, pretrained=False, **kwargs):
+    """Build a ``Cait`` model for ``variant`` through ``build_model_with_cfg``.
+
+    Raises:
+        RuntimeError: if a truthy ``features_only`` keyword is supplied.
+    """
+    wants_features_only = kwargs.get('features_only', None)
+    if wants_features_only:
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    return build_model_with_cfg(
+        Cait, variant, pretrained, pretrained_filter_fn=checkpoint_filter_fn, **kwargs)
+
+
+@register_model
+def cait_xxs24_224(pretrained=False, **kwargs):
+    """Create ``cait_xxs24_224``: patch 16, embed dim 192, depth 24, 4 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=192, depth=24, num_heads=4, init_values=1e-5, **kwargs)
+    return _create_cait('cait_xxs24_224', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_xxs24_384(pretrained=False, **kwargs):
+    """Create ``cait_xxs24_384``: patch 16, embed dim 192, depth 24, 4 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=192, depth=24, num_heads=4, init_values=1e-5, **kwargs)
+    return _create_cait('cait_xxs24_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_xxs36_224(pretrained=False, **kwargs):
+    """Create ``cait_xxs36_224``: patch 16, embed dim 192, depth 36, 4 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=192, depth=36, num_heads=4, init_values=1e-5, **kwargs)
+    return _create_cait('cait_xxs36_224', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_xxs36_384(pretrained=False, **kwargs):
+    """Create ``cait_xxs36_384``: patch 16, embed dim 192, depth 36, 4 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=192, depth=36, num_heads=4, init_values=1e-5, **kwargs)
+    return _create_cait('cait_xxs36_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_xs24_384(pretrained=False, **kwargs):
+    """Create ``cait_xs24_384``: patch 16, embed dim 288, depth 24, 6 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=288, depth=24, num_heads=6, init_values=1e-5, **kwargs)
+    return _create_cait('cait_xs24_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_s24_224(pretrained=False, **kwargs):
+    """Create ``cait_s24_224``: patch 16, embed dim 384, depth 24, 8 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=384, depth=24, num_heads=8, init_values=1e-5, **kwargs)
+    return _create_cait('cait_s24_224', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_s24_384(pretrained=False, **kwargs):
+    """Create ``cait_s24_384``: patch 16, embed dim 384, depth 24, 8 heads, init_values 1e-5.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=384, depth=24, num_heads=8, init_values=1e-5, **kwargs)
+    return _create_cait('cait_s24_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_s36_384(pretrained=False, **kwargs):
+    """Create ``cait_s36_384``: patch 16, embed dim 384, depth 36, 8 heads, init_values 1e-6.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=384, depth=36, num_heads=8, init_values=1e-6, **kwargs)
+    return _create_cait('cait_s36_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_m36_384(pretrained=False, **kwargs):
+    """Create ``cait_m36_384``: patch 16, embed dim 768, depth 36, 16 heads, init_values 1e-6.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=768, depth=36, num_heads=16, init_values=1e-6, **kwargs)
+    return _create_cait('cait_m36_384', pretrained=pretrained, **arch)
+
+
+@register_model
+def cait_m48_448(pretrained=False, **kwargs):
+    """Create ``cait_m48_448``: patch 16, embed dim 768, depth 48, 16 heads, init_values 1e-6.
+
+    Extra ``kwargs`` are merged into the constructor arguments passed to ``_create_cait``.
+    """
+    arch = dict(patch_size=16, embed_dim=768, depth=48, num_heads=16, init_values=1e-6, **kwargs)
+    return _create_cait('cait_m48_448', pretrained=pretrained, **arch)
diff --git a/src/custom_timm/models/coat.py b/src/custom_timm/models/coat.py
new file mode 100644
index 0000000000000000000000000000000000000000..6af1bd8824141c9bfe4404970606d0d9def9ce6a
--- /dev/null
+++ b/src/custom_timm/models/coat.py
@@ -0,0 +1,689 @@
+""" 
+CoaT architecture.
+
+Paper: Co-Scale Conv-Attentional Image Transformers - https://arxiv.org/abs/2104.06399
+
+Official CoaT code at: https://github.com/mlpc-ucsd/CoaT
+
+Modified from custom_timm/models/vision_transformer.py
+"""
+from copy import deepcopy
+from functools import partial
+from typing import Tuple, List, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, trunc_normal_
+from .registry import register_model
+from .layers import _assert
+
+
+# Public names of this module: the five CoaT / CoaT-Lite model factory functions defined below.
+__all__ = [
+    "coat_tiny",
+    "coat_mini",
+    "coat_lite_tiny",
+    "coat_lite_mini",
+    "coat_lite_small"
+]
+
+
+def _cfg_coat(url='', **kwargs):
+    """Build the default pretrained-config dict for a CoaT model.
+
+    ``url`` is stored under ``'url'``; any ``kwargs`` override or extend the defaults below.
+    """
+    cfg = {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed1.proj', 'classifier': 'head',
+    }
+    cfg.update(kwargs)
+    return cfg
+
+
+# Per-variant pretrained configs, keyed by model name. Each entry is the `_cfg_coat` defaults
+# with only the checkpoint download URL filled in.
+default_cfgs = {
+    'coat_tiny': _cfg_coat(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_tiny-473c2a20.pth'
+    ),
+    'coat_mini': _cfg_coat(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_mini-2c6baf49.pth'
+    ),
+    'coat_lite_tiny': _cfg_coat(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_tiny-461b07a7.pth'
+    ),
+    'coat_lite_mini': _cfg_coat(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_mini-d7842000.pth'
+    ),
+    'coat_lite_small': _cfg_coat(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_small-fea1d5a1.pth'
+    ),
+}
+
+
+class ConvRelPosEnc(nn.Module):
+    """ Convolutional relative position encoding. """
+    def __init__(self, Ch, h, window):
+        """
+        Initialization.
+            Ch: Channels per head.
+            h: Number of heads.
+            window: Window size(s) in convolutional relative positional encoding. It can have two forms:
+                1. An integer window size, which assigns the same window size to all attention heads.
+                2. A dict mapping window size to #attention head splits
+                    (e.g. {window size 1: #attention head split 1, window size 2: #attention head split 2}).
+                    It will apply a different window size to each attention head split.
+
+        Raises:
+            ValueError: If `window` is neither an int nor a dict.
+        """
+        super().__init__()
+
+        if isinstance(window, int):
+            # Set the same window size for all attention heads.
+            window = {window: h}
+        elif not isinstance(window, dict):
+            raise ValueError(
+                f'window must be an int or a dict of {{window_size: head_split}}, '
+                f'got {type(window).__name__}')
+        self.window = window
+
+        # One depthwise conv per (window size, head split) pair, kept in dict iteration order.
+        self.conv_list = nn.ModuleList()
+        self.head_splits = []
+        for cur_window, cur_head_split in window.items():
+            dilation = 1
+            # Determine padding size.
+            # Ref: https://discuss.pytorch.org/t/how-to-keep-the-shape-of-input-and-output-same-when-dilation-conv/14338
+            padding_size = (cur_window + (cur_window - 1) * (dilation - 1)) // 2
+            cur_channels = cur_head_split * Ch
+            cur_conv = nn.Conv2d(
+                cur_channels, cur_channels,
+                kernel_size=(cur_window, cur_window),
+                padding=(padding_size, padding_size),
+                dilation=(dilation, dilation),
+                groups=cur_channels,
+            )
+            self.conv_list.append(cur_conv)
+            self.head_splits.append(cur_head_split)
+        # Channel count handled by each conv; used to split V along the channel axis in forward().
+        self.channel_splits = [x*Ch for x in self.head_splits]
+
+    def forward(self, q, v, size: Tuple[int, int]):
+        """
+        Compute the position-encoding term from Q and V.
+            q, v: Tensors of shape [B, h, N, Ch], where N must equal 1 + H * W.
+            size: (H, W) grid of the image tokens.
+        Returns a [B, h, N, Ch] tensor whose first token (index 0 along N) is zero.
+        """
+        B, h, N, Ch = q.shape
+        H, W = size
+        _assert(N == 1 + H * W, '')
+
+        # Convolutional relative position encoding.
+        q_img = q[:, :, 1:, :]  # [B, h, H*W, Ch]
+        v_img = v[:, :, 1:, :]  # [B, h, H*W, Ch]
+
+        v_img = v_img.transpose(-1, -2).reshape(B, h * Ch, H, W)
+        v_img_list = torch.split(v_img, self.channel_splits, dim=1)  # Split according to channels
+        conv_v_img_list = []
+        for i, conv in enumerate(self.conv_list):
+            conv_v_img_list.append(conv(v_img_list[i]))
+        conv_v_img = torch.cat(conv_v_img_list, dim=1)
+        conv_v_img = conv_v_img.reshape(B, h, Ch, H * W).transpose(-1, -2)
+
+        EV_hat = q_img * conv_v_img
+        # Pad one zero row at the front of the token axis so the output lines up with q again.
+        EV_hat = F.pad(EV_hat, (0, 0, 1, 0, 0, 0))  # [B, h, N, Ch].
+        return EV_hat
+
+
+class FactorAttnConvRelPosEnc(nn.Module):
+    """ Factorized attention combined with a convolutional relative position encoding. """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., shared_crpe=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = (dim // num_heads) ** -0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)  # Note: created but never applied in forward().
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        # Position-encoding callable supplied by the owner; invoked as crpe(q, v, size=size).
+        self.crpe = shared_crpe
+
+    def forward(self, x, size: Tuple[int, int]):
+        """ Apply factorized attention plus the position term to x of shape [B, N, C]. """
+        B, N, C = x.shape
+        head_dim = C // self.num_heads
+
+        # Project once, then separate into Q, K, V, each [B, h, N, Ch].
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
+        q = qkv[0]
+        k = qkv[1]
+        v = qkv[2]
+
+        # Factorized attention: softmax over the token axis of K, form K^T V first, then Q (K^T V).
+        context = torch.matmul(k.softmax(dim=2).transpose(-1, -2), v)
+        factor_att = torch.matmul(q, context)
+
+        # Convolutional relative position encoding, [B, h, N, Ch].
+        crpe = self.crpe(q, v, size=size)
+
+        # Merge heads: [B, h, N, Ch] -> [B, N, h, Ch] -> [B, N, C].
+        merged = (self.scale * factor_att + crpe).transpose(1, 2).reshape(B, N, C)
+
+        # Output projection followed by dropout.
+        return self.proj_drop(self.proj(merged))
+
+
+class ConvPosEnc(nn.Module):
+    """ Convolutional position encoding: a depthwise conv over the image tokens with a residual add.
+        Note: This module is similar to the conditional position encoding in CPVT.
+    """
+    def __init__(self, dim, k=3):
+        super().__init__()
+        self.proj = nn.Conv2d(dim, dim, kernel_size=k, stride=1, padding=k // 2, groups=dim)
+
+    def forward(self, x, size: Tuple[int, int]):
+        """ x is [B, N, C] with N == 1 + H * W; the first token bypasses the convolution. """
+        B, N, C = x.shape
+        H, W = size
+        _assert(N == 1 + H * W, '')
+
+        # Keep the CLS token aside; lay the image tokens out as a [B, C, H, W] feature map.
+        cls_token = x[:, :1]
+        feat = x[:, 1:].transpose(1, 2).view(B, C, H, W)
+
+        # Depthwise convolution with a skip connection, then back to [B, H*W, C].
+        encoded = self.proj(feat) + feat
+        tokens = encoded.flatten(2).transpose(1, 2)
+
+        # Re-attach the CLS token at the front.
+        return torch.cat((cls_token, tokens), dim=1)
+
+
+class SerialBlock(nn.Module):
+    """ Serial block: position encoding, then conv-attention and an FFN (MLP), each with a residual.
+        Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module. """
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_cpe=None, shared_crpe=None):
+        super().__init__()
+
+        # Conv-Attention branch; the position-encoding modules are passed in by the caller.
+        self.cpe = shared_cpe
+
+        self.norm1 = norm_layer(dim)
+        self.factoratt_crpe = FactorAttnConvRelPosEnc(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpe)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+        # MLP branch.
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+    def forward(self, x, size: Tuple[int, int]):
+        """ Run the block on tokens x for an image-token grid of the given (H, W) size. """
+        # Conv-Attention.
+        x = self.cpe(x, size)
+        attended = self.factoratt_crpe(self.norm1(x), size)
+        x = x + self.drop_path(attended)
+
+        # MLP.
+        expanded = self.mlp(self.norm2(x))
+        return x + self.drop_path(expanded)
+
+
+class ParallelBlock(nn.Module):
+    """ Parallel block class.
+
+    Applies conv-attention to scales 2, 3 and 4 independently, exchanges the results across
+    scales by bilinear up-/down-sampling, and finishes with one MLP shared by the three scales.
+    Scale 1 (`x1`) is passed through forward() unchanged.
+    """
+    def __init__(self, dims, num_heads, mlp_ratios=(), qkv_bias=False, drop=0., attn_drop=0.,
+                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_crpes=None):
+        """
+        Initialization.
+            dims: Per-scale embedding dims; indices 1..3 are used and must be equal.
+            num_heads: Number of attention heads.
+            mlp_ratios: Per-scale MLP ratios; indices 1..3 are used and must be equal.
+                The default is an immutable empty tuple, so callers must supply real values.
+            shared_crpes: Per-scale position-encoding modules; indices 1..3 are used.
+        """
+        super().__init__()
+
+        # Conv-Attention.
+        self.norm12 = norm_layer(dims[1])
+        self.norm13 = norm_layer(dims[2])
+        self.norm14 = norm_layer(dims[3])
+        self.factoratt_crpe2 = FactorAttnConvRelPosEnc(
+            dims[1], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
+            shared_crpe=shared_crpes[1]
+        )
+        self.factoratt_crpe3 = FactorAttnConvRelPosEnc(
+            dims[2], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
+            shared_crpe=shared_crpes[2]
+        )
+        self.factoratt_crpe4 = FactorAttnConvRelPosEnc(
+            dims[3], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
+            shared_crpe=shared_crpes[3]
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        # MLP.
+        self.norm22 = norm_layer(dims[1])
+        self.norm23 = norm_layer(dims[2])
+        self.norm24 = norm_layer(dims[3])
+        # In parallel block, we assume dimensions are the same and share the linear transformation.
+        assert dims[1] == dims[2] == dims[3]
+        assert mlp_ratios[1] == mlp_ratios[2] == mlp_ratios[3]
+        mlp_hidden_dim = int(dims[1] * mlp_ratios[1])
+        # A single Mlp instance is registered under three names, so its weights are shared.
+        self.mlp2 = self.mlp3 = self.mlp4 = Mlp(
+            in_features=dims[1], hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def upsample(self, x, factor: float, size: Tuple[int, int]):
+        """ Feature map up-sampling by `factor`; `size` is the (H, W) grid of the input x. """
+        return self.interpolate(x, scale_factor=factor, size=size)
+
+    def downsample(self, x, factor: float, size: Tuple[int, int]):
+        """ Feature map down-sampling by `factor`; `size` is the (H, W) grid of the input x. """
+        return self.interpolate(x, scale_factor=1.0/factor, size=size)
+
+    def interpolate(self, x, scale_factor: float, size: Tuple[int, int]):
+        """ Feature map interpolation.
+
+        Bilinearly rescales the image tokens of x ([B, N, C] with N == 1 + H * W) by
+        `scale_factor`; the first (CLS) token is carried over untouched.
+        """
+        B, N, C = x.shape
+        H, W = size
+        _assert(N == 1 + H * W, '')
+
+        cls_token = x[:, :1, :]
+        img_tokens = x[:, 1:, :]
+
+        img_tokens = img_tokens.transpose(1, 2).reshape(B, C, H, W)
+        img_tokens = F.interpolate(
+            img_tokens, scale_factor=scale_factor, recompute_scale_factor=False, mode='bilinear', align_corners=False)
+        img_tokens = img_tokens.reshape(B, C, -1).transpose(1, 2)
+
+        out = torch.cat((cls_token, img_tokens), dim=1)
+
+        return out
+
+    def forward(self, x1, x2, x3, x4, sizes: List[Tuple[int, int]]):
+        """ Run one parallel step; `sizes` holds the (H, W) grid of each of the four scales. """
+        _, S2, S3, S4 = sizes
+        cur2 = self.norm12(x2)
+        cur3 = self.norm13(x3)
+        cur4 = self.norm14(x4)
+        cur2 = self.factoratt_crpe2(cur2, size=S2)
+        cur3 = self.factoratt_crpe3(cur3, size=S3)
+        cur4 = self.factoratt_crpe4(cur4, size=S4)
+        # Resample every attention output to the resolution of the other two scales.
+        upsample3_2 = self.upsample(cur3, factor=2., size=S3)
+        upsample4_3 = self.upsample(cur4, factor=2., size=S4)
+        upsample4_2 = self.upsample(cur4, factor=4., size=S4)
+        downsample2_3 = self.downsample(cur2, factor=2., size=S2)
+        downsample3_4 = self.downsample(cur3, factor=2., size=S3)
+        downsample2_4 = self.downsample(cur2, factor=4., size=S2)
+        # Each scale sums its own output with the resampled outputs of the other two.
+        cur2 = cur2 + upsample3_2 + upsample4_2
+        cur3 = cur3 + upsample4_3 + downsample2_3
+        cur4 = cur4 + downsample3_4 + downsample2_4
+        x2 = x2 + self.drop_path(cur2)
+        x3 = x3 + self.drop_path(cur3)
+        x4 = x4 + self.drop_path(cur4)
+
+        # MLP.
+        cur2 = self.norm22(x2)
+        cur3 = self.norm23(x3)
+        cur4 = self.norm24(x4)
+        cur2 = self.mlp2(cur2)
+        cur3 = self.mlp3(cur3)
+        cur4 = self.mlp4(cur4)
+        x2 = x2 + self.drop_path(cur2)
+        x3 = x3 + self.drop_path(cur3)
+        x4 = x4 + self.drop_path(cur4)
+
+        return x1, x2, x3, x4
+
+
+# CoaT model: four serial stages (patch embedding + SerialBlocks), optionally followed by
+# ParallelBlocks over stages 2-4. With parallel_depth == 0 only the serial stages are built
+# (the "CoaT-Lite series" per the comments in __init__).
+class CoaT(nn.Module):
+    """ CoaT class. """
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=(0, 0, 0, 0),
+            serial_depths=(0, 0, 0, 0), parallel_depth=0, num_heads=0, mlp_ratios=(0, 0, 0, 0), qkv_bias=True,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            return_interm_layers=False, out_features=None, crpe_window=None, global_pool='token'):
+        super().__init__()
+        assert global_pool in ('token', 'avg')
+        # Default window layout: window size -> number of heads using it.
+        crpe_window = crpe_window or {3: 2, 5: 3, 7: 3}
+        self.return_interm_layers = return_interm_layers
+        self.out_features = out_features
+        self.embed_dims = embed_dims
+        self.num_features = embed_dims[-1]
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+
+        # Patch embeddings.
+        # Stages 2-4 embed the previous stage's feature map with patch_size=2; their declared
+        # input sizes are img_size // 4, // 8 and // 16 respectively.
+        img_size = to_2tuple(img_size)
+        self.patch_embed1 = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+            embed_dim=embed_dims[0], norm_layer=nn.LayerNorm)
+        self.patch_embed2 = PatchEmbed(
+            img_size=[x // 4 for x in img_size], patch_size=2, in_chans=embed_dims[0],
+            embed_dim=embed_dims[1], norm_layer=nn.LayerNorm)
+        self.patch_embed3 = PatchEmbed(
+            img_size=[x // 8 for x in img_size], patch_size=2, in_chans=embed_dims[1],
+            embed_dim=embed_dims[2], norm_layer=nn.LayerNorm)
+        self.patch_embed4 = PatchEmbed(
+            img_size=[x // 16 for x in img_size], patch_size=2, in_chans=embed_dims[2],
+            embed_dim=embed_dims[3], norm_layer=nn.LayerNorm)
+
+        # Class tokens.
+        self.cls_token1 = nn.Parameter(torch.zeros(1, 1, embed_dims[0]))
+        self.cls_token2 = nn.Parameter(torch.zeros(1, 1, embed_dims[1]))
+        self.cls_token3 = nn.Parameter(torch.zeros(1, 1, embed_dims[2]))
+        self.cls_token4 = nn.Parameter(torch.zeros(1, 1, embed_dims[3]))
+
+        # Convolutional position encodings.
+        self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
+        self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
+        self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
+        self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)
+
+        # Convolutional relative position encodings.
+        self.crpe1 = ConvRelPosEnc(Ch=embed_dims[0] // num_heads, h=num_heads, window=crpe_window)
+        self.crpe2 = ConvRelPosEnc(Ch=embed_dims[1] // num_heads, h=num_heads, window=crpe_window)
+        self.crpe3 = ConvRelPosEnc(Ch=embed_dims[2] // num_heads, h=num_heads, window=crpe_window)
+        self.crpe4 = ConvRelPosEnc(Ch=embed_dims[3] // num_heads, h=num_heads, window=crpe_window)
+
+        # Disable stochastic depth.
+        # Any non-zero drop_path_rate is rejected by the assert below.
+        dpr = drop_path_rate
+        assert dpr == 0.0
+        
+        # Serial blocks 1.
+        # Every block of a stage receives the same cpe/crpe instance, so those weights are shared.
+        self.serial_blocks1 = nn.ModuleList([
+            SerialBlock(
+                dim=embed_dims[0], num_heads=num_heads, mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, 
+                shared_cpe=self.cpe1, shared_crpe=self.crpe1
+            )
+            for _ in range(serial_depths[0])]
+        )
+
+        # Serial blocks 2.
+        self.serial_blocks2 = nn.ModuleList([
+            SerialBlock(
+                dim=embed_dims[1], num_heads=num_heads, mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, 
+                shared_cpe=self.cpe2, shared_crpe=self.crpe2
+            )
+            for _ in range(serial_depths[1])]
+        )
+
+        # Serial blocks 3.
+        self.serial_blocks3 = nn.ModuleList([
+            SerialBlock(
+                dim=embed_dims[2], num_heads=num_heads, mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, 
+                shared_cpe=self.cpe3, shared_crpe=self.crpe3
+            )
+            for _ in range(serial_depths[2])]
+        )
+
+        # Serial blocks 4.
+        self.serial_blocks4 = nn.ModuleList([
+            SerialBlock(
+                dim=embed_dims[3], num_heads=num_heads, mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, 
+                shared_cpe=self.cpe4, shared_crpe=self.crpe4
+            )
+            for _ in range(serial_depths[3])]
+        )
+
+        # Parallel blocks.
+        # parallel_blocks stays None when parallel_depth == 0; forward_features branches on that.
+        self.parallel_depth = parallel_depth
+        if self.parallel_depth > 0:
+            self.parallel_blocks = nn.ModuleList([
+                ParallelBlock(
+                    dims=embed_dims, num_heads=num_heads, mlp_ratios=mlp_ratios, qkv_bias=qkv_bias,
+                    drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer,
+                    shared_crpes=(self.crpe1, self.crpe2, self.crpe3, self.crpe4)
+                )
+                for _ in range(parallel_depth)]
+            )
+        else:
+            self.parallel_blocks = None
+
+        # Classification head(s).
+        # NOTE(review): norm2/norm3/norm4, aggregate and head are only created when
+        # return_interm_layers is False; get_classifier() reads self.head unconditionally.
+        if not self.return_interm_layers:
+            if self.parallel_blocks is not None:
+                self.norm2 = norm_layer(embed_dims[1])
+                self.norm3 = norm_layer(embed_dims[2])
+            else:
+                self.norm2 = self.norm3 = None
+            self.norm4 = norm_layer(embed_dims[3])
+
+            if self.parallel_depth > 0:
+                # CoaT series: Aggregate features of last three scales for classification.
+                assert embed_dims[1] == embed_dims[2] == embed_dims[3]
+                # 1x1 Conv1d mixing the three per-scale feature vectors into a single one.
+                self.aggregate = torch.nn.Conv1d(in_channels=3, out_channels=1, kernel_size=1)
+                self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+            else:
+                # CoaT-Lite series: Use feature of last scale for classification.
+                self.aggregate = None
+                self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        # Initialize weights.
+        trunc_normal_(self.cls_token1, std=.02)
+        trunc_normal_(self.cls_token2, std=.02)
+        trunc_normal_(self.cls_token3, std=.02)
+        trunc_normal_(self.cls_token4, std=.02)
+        self.apply(self._init_weights)
+
+    # Per-module initialiser used with self.apply: truncated-normal weights / zero bias for
+    # nn.Linear, zero bias / unit weight for nn.LayerNorm; other modules are left as created.
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    # Returns the names of the four class-token parameters.
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'cls_token1', 'cls_token2', 'cls_token3', 'cls_token4'}
+
+    # Gradient checkpointing is not implemented for this model; enabling it fails the assert.
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    # Returns regex patterns that assign parameter names to groups; `coarse` is not used here.
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem1=r'^cls_token1|patch_embed1|crpe1|cpe1',
+            serial_blocks1=r'^serial_blocks1\.(\d+)',
+            stem2=r'^cls_token2|patch_embed2|crpe2|cpe2',
+            serial_blocks2=r'^serial_blocks2\.(\d+)',
+            stem3=r'^cls_token3|patch_embed3|crpe3|cpe3',
+            serial_blocks3=r'^serial_blocks3\.(\d+)',
+            stem4=r'^cls_token4|patch_embed4|crpe4|cpe4',
+            serial_blocks4=r'^serial_blocks4\.(\d+)',
+            parallel_blocks=[  # FIXME (partially?) overlap parallel w/ serial blocks??
+                (r'^parallel_blocks\.(\d+)', None),
+                (r'^norm|aggregate', (99999,)),
+            ]
+        )
+        return matcher
+
+    # Returns the classification head module.
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    # Rebuilds self.head for `num_classes` outputs (nn.Identity when num_classes <= 0) and,
+    # if given, switches global_pool to 'token' or 'avg'.
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('token', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    # Runs the four serial stages and, if present, the parallel blocks.
+    # Returns one of:
+    #   * a dict of '<name>_nocls' feature maps when return_interm_layers is set (eager mode only),
+    #   * the normalised stage-4 tokens when there are no parallel blocks,
+    #   * the list [x2, x3, x4] of normalised tokens when parallel blocks are used.
+    def forward_features(self, x0):
+        B = x0.shape[0]
+
+        # Serial blocks 1.
+        x1 = self.patch_embed1(x0)
+        H1, W1 = self.patch_embed1.grid_size
+        x1 = insert_cls(x1, self.cls_token1)
+        for blk in self.serial_blocks1:
+            x1 = blk(x1, size=(H1, W1))
+        # Drop the CLS token and reshape the tokens to [B, C, H1, W1] for the next patch embedding.
+        x1_nocls = remove_cls(x1).reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous()
+        
+        # Serial blocks 2.
+        x2 = self.patch_embed2(x1_nocls)
+        H2, W2 = self.patch_embed2.grid_size
+        x2 = insert_cls(x2, self.cls_token2)
+        for blk in self.serial_blocks2:
+            x2 = blk(x2, size=(H2, W2))
+        x2_nocls = remove_cls(x2).reshape(B, H2, W2, -1).permute(0, 3, 1, 2).contiguous()
+
+        # Serial blocks 3.
+        x3 = self.patch_embed3(x2_nocls)
+        H3, W3 = self.patch_embed3.grid_size
+        x3 = insert_cls(x3, self.cls_token3)
+        for blk in self.serial_blocks3:
+            x3 = blk(x3, size=(H3, W3))
+        x3_nocls = remove_cls(x3).reshape(B, H3, W3, -1).permute(0, 3, 1, 2).contiguous()
+
+        # Serial blocks 4.
+        x4 = self.patch_embed4(x3_nocls)
+        H4, W4 = self.patch_embed4.grid_size
+        x4 = insert_cls(x4, self.cls_token4)
+        for blk in self.serial_blocks4:
+            x4 = blk(x4, size=(H4, W4))
+        x4_nocls = remove_cls(x4).reshape(B, H4, W4, -1).permute(0, 3, 1, 2).contiguous()
+
+        # Only serial blocks: Early return.
+        if self.parallel_blocks is None:
+            if not torch.jit.is_scripting() and self.return_interm_layers:
+                # Return intermediate features for down-stream tasks (e.g. Deformable DETR and Detectron2).
+                # NOTE(review): out_features defaults to None, which these membership tests cannot
+                # handle; presumably callers always pass a collection of names in this mode.
+                feat_out = {}   
+                if 'x1_nocls' in self.out_features:
+                    feat_out['x1_nocls'] = x1_nocls
+                if 'x2_nocls' in self.out_features:
+                    feat_out['x2_nocls'] = x2_nocls
+                if 'x3_nocls' in self.out_features:
+                    feat_out['x3_nocls'] = x3_nocls
+                if 'x4_nocls' in self.out_features:
+                    feat_out['x4_nocls'] = x4_nocls
+                return feat_out
+            else:
+                # Return features for classification.
+                x4 = self.norm4(x4)
+                return x4
+
+        # Parallel blocks.
+        # The position encodings of stages 2-4 are re-applied before every parallel block.
+        for blk in self.parallel_blocks:
+            x2, x3, x4 = self.cpe2(x2, (H2, W2)), self.cpe3(x3, (H3, W3)), self.cpe4(x4, (H4, W4))
+            x1, x2, x3, x4 = blk(x1, x2, x3, x4, sizes=[(H1, W1), (H2, W2), (H3, W3), (H4, W4)])
+
+        if not torch.jit.is_scripting() and self.return_interm_layers:
+            # Return intermediate features for down-stream tasks (e.g. Deformable DETR and Detectron2).
+            # The feature maps are recomputed here from the tokens as updated by the parallel blocks.
+            feat_out = {}   
+            if 'x1_nocls' in self.out_features:
+                x1_nocls = remove_cls(x1).reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous()
+                feat_out['x1_nocls'] = x1_nocls
+            if 'x2_nocls' in self.out_features:
+                x2_nocls = remove_cls(x2).reshape(B, H2, W2, -1).permute(0, 3, 1, 2).contiguous()
+                feat_out['x2_nocls'] = x2_nocls
+            if 'x3_nocls' in self.out_features:
+                x3_nocls = remove_cls(x3).reshape(B, H3, W3, -1).permute(0, 3, 1, 2).contiguous()
+                feat_out['x3_nocls'] = x3_nocls
+            if 'x4_nocls' in self.out_features:
+                x4_nocls = remove_cls(x4).reshape(B, H4, W4, -1).permute(0, 3, 1, 2).contiguous()
+                feat_out['x4_nocls'] = x4_nocls
+            return feat_out
+        else:
+            x2 = self.norm2(x2)
+            x3 = self.norm3(x3)
+            x4 = self.norm4(x4)
+            return [x2, x3, x4]
+
+    # Pools the features and applies the head (skipped when pre_logits is True).
+    # A list input (three scales) is pooled per scale and merged by self.aggregate;
+    # a single tensor is pooled directly. 'avg' averages tokens 1.., otherwise token 0 is used.
+    def forward_head(self, x_feat: Union[torch.Tensor, List[torch.Tensor]], pre_logits: bool = False):
+        if isinstance(x_feat, list):
+            assert self.aggregate is not None
+            if self.global_pool == 'avg':
+                x = torch.cat([xl[:, 1:].mean(dim=1, keepdim=True) for xl in x_feat], dim=1)  # [B, 3, C]
+            else:
+                x = torch.stack([xl[:, 0] for xl in x_feat], dim=1)  # [B, 3, C]
+            x = self.aggregate(x).squeeze(dim=1)  # Shape: [B, C]
+        else:
+            x = x_feat[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x_feat[:, 0]
+        return x if pre_logits else self.head(x)
+
+    # In eager mode with return_interm_layers set, returns the forward_features output as is;
+    # otherwise returns the forward_head output.
+    def forward(self, x) -> torch.Tensor:
+        if not torch.jit.is_scripting() and self.return_interm_layers:
+            # Return intermediate features (for down-stream tasks).
+            return self.forward_features(x)
+        else:
+            # Return features for classification.
+            x_feat = self.forward_features(x)
+            x = self.forward_head(x_feat)
+            return x
+
+
+def insert_cls(x, cls_token):
+    """ Prepend the CLS token, expanded over the batch dimension of x, along dim 1. """
+    batch = x.shape[0]
+    return torch.cat((cls_token.expand(batch, -1, -1), x), dim=1)
+
+
+def remove_cls(x):
+    """ Return x without its first token along dim 1 (the CLS position). """
+    without_cls = x[:, 1:, :]
+    return without_cls
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """Return a copy of ``state_dict`` without entries for norm layers the model does not have.
+
+    Keys starting with ``norm1`` are always dropped; keys starting with ``norm2`` / ``norm3`` are
+    dropped only when the corresponding attribute on ``model`` is ``None``.
+    """
+    def _is_unused(key):
+        # original model had unused norm layers, removing them requires filtering pretrained checkpoints
+        if key.startswith('norm1'):
+            return True
+        if model.norm2 is None and key.startswith('norm2'):
+            return True
+        return model.norm3 is None and key.startswith('norm3')
+
+    return {key: value for key, value in state_dict.items() if not _is_unused(key)}
+
+
+def _create_coat(variant, pretrained=False, default_cfg=None, **kwargs):
+    """Build a ``CoaT`` model for ``variant`` through ``build_model_with_cfg``.
+
+    ``default_cfg`` is accepted but not used in this function.
+
+    Raises:
+        RuntimeError: if a truthy ``features_only`` keyword is supplied.
+    """
+    wants_features_only = kwargs.get('features_only', None)
+    if wants_features_only:
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    return build_model_with_cfg(
+        CoaT, variant, pretrained, pretrained_filter_fn=checkpoint_filter_fn, **kwargs)
+
+
+@register_model
+def coat_tiny(pretrained=False, **kwargs):
+    """Create ``coat_tiny``: patch 4, dims 152 at every stage, 2 serial blocks per stage,
+    6 parallel blocks, 8 heads, MLP ratio 4. Extra ``kwargs`` are merged into these arguments.
+    """
+    arch = dict(
+        patch_size=4, embed_dims=[152, 152, 152, 152], serial_depths=[2, 2, 2, 2], parallel_depth=6,
+        num_heads=8, mlp_ratios=[4, 4, 4, 4], **kwargs)
+    return _create_coat('coat_tiny', pretrained=pretrained, **arch)
+
+
+@register_model
+def coat_mini(pretrained=False, **kwargs):
+    """Create ``coat_mini``: patch 4, dims 152/216/216/216, 2 serial blocks per stage,
+    6 parallel blocks, 8 heads, MLP ratio 4. Extra ``kwargs`` are merged into these arguments.
+    """
+    arch = dict(
+        patch_size=4, embed_dims=[152, 216, 216, 216], serial_depths=[2, 2, 2, 2], parallel_depth=6,
+        num_heads=8, mlp_ratios=[4, 4, 4, 4], **kwargs)
+    return _create_coat('coat_mini', pretrained=pretrained, **arch)
+
+
+@register_model
+def coat_lite_tiny(pretrained=False, **kwargs):
+    """Create ``coat_lite_tiny``: patch 4, dims 64/128/256/320, 2 serial blocks per stage,
+    no parallel blocks, 8 heads, MLP ratios 8/8/4/4. Extra ``kwargs`` are merged into these arguments.
+    """
+    arch = dict(
+        patch_size=4, embed_dims=[64, 128, 256, 320], serial_depths=[2, 2, 2, 2], parallel_depth=0,
+        num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs)
+    return _create_coat('coat_lite_tiny', pretrained=pretrained, **arch)
+
+
+@register_model
+def coat_lite_mini(pretrained=False, **kwargs):
+    """Create ``coat_lite_mini``: patch 4, dims 64/128/320/512, 2 serial blocks per stage,
+    no parallel blocks, 8 heads, MLP ratios 8/8/4/4. Extra ``kwargs`` are merged into these arguments.
+    """
+    arch = dict(
+        patch_size=4, embed_dims=[64, 128, 320, 512], serial_depths=[2, 2, 2, 2], parallel_depth=0,
+        num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs)
+    return _create_coat('coat_lite_mini', pretrained=pretrained, **arch)
+
+
+@register_model
+def coat_lite_small(pretrained=False, **kwargs):
+    """Create ``coat_lite_small``: patch 4, dims 64/128/320/512, serial depths 3/4/6/3,
+    no parallel blocks, 8 heads, MLP ratios 8/8/4/4. Extra ``kwargs`` are merged into these arguments.
+    """
+    arch = dict(
+        patch_size=4, embed_dims=[64, 128, 320, 512], serial_depths=[3, 4, 6, 3], parallel_depth=0,
+        num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs)
+    return _create_coat('coat_lite_small', pretrained=pretrained, **arch)
\ No newline at end of file
diff --git a/src/custom_timm/models/convit.py b/src/custom_timm/models/convit.py
new file mode 100644
index 0000000000000000000000000000000000000000..b23e1c5504cfb12a47a651e45eb7ffd488e32acb
--- /dev/null
+++ b/src/custom_timm/models/convit.py
@@ -0,0 +1,369 @@
+""" ConViT Model
+
+@article{d2021convit,
+  title={ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases},
+  author={d'Ascoli, St{\'e}phane and Touvron, Hugo and Leavitt, Matthew and Morcos, Ari and Biroli, Giulio and Sagun, Levent},
+  journal={arXiv preprint arXiv:2103.10697},
+  year={2021}
+}
+
+Paper link: https://arxiv.org/abs/2103.10697
+Original code: https://github.com/facebookresearch/convit, original copyright below
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+# Copyright (c) 2015-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the CC-by-NC license found in the
+# LICENSE file in the root directory of this source tree.
+#
+'''These modules are adapted from those of timm, see
+https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+'''
+
+import torch
+import torch.nn as nn
+from functools import partial
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import DropPath, to_2tuple, trunc_normal_, PatchEmbed, Mlp
+from .registry import register_model
+from .vision_transformer_hybrid import HybridEmbed
+from .fx_features import register_notrace_module
+
+import torch
+import torch.nn as nn
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'fixed_input_size': True,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+        **kwargs
+    }
+
+
+# Pretrained-weight configs, keyed by the variant names that the factory
+# functions at the bottom of this module pass to `_create_convit`.
+default_cfgs = {
+    # ConViT
+    'convit_tiny': _cfg(
+        url="https://dl.fbaipublicfiles.com/convit/convit_tiny.pth"),
+    'convit_small': _cfg(
+        url="https://dl.fbaipublicfiles.com/convit/convit_small.pth"),
+    'convit_base': _cfg(
+        url="https://dl.fbaipublicfiles.com/convit/convit_base.pth")
+}
+
+
+@register_notrace_module  # reason: FX can't symbolically trace control flow in forward method
+class GPSA(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., locality_strength=1.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.dim = dim
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5
+        self.locality_strength = locality_strength
+
+        self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias)
+        self.v = nn.Linear(dim, dim, bias=qkv_bias)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.pos_proj = nn.Linear(3, num_heads)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.gating_param = nn.Parameter(torch.ones(self.num_heads))
+        self.rel_indices: torch.Tensor = torch.zeros(1, 1, 1, 3)  # silly torchscript hack, won't work with None
+
+    def forward(self, x):
+        B, N, C = x.shape
+        if self.rel_indices is None or self.rel_indices.shape[1] != N:
+            self.rel_indices = self.get_rel_indices(N)
+        attn = self.get_attention(x)
+        v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    def get_attention(self, x):
+        B, N, C = x.shape
+        qk = self.qk(x).reshape(B, N, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k = qk[0], qk[1]
+        pos_score = self.rel_indices.expand(B, -1, -1, -1)
+        pos_score = self.pos_proj(pos_score).permute(0, 3, 1, 2)
+        patch_score = (q @ k.transpose(-2, -1)) * self.scale
+        patch_score = patch_score.softmax(dim=-1)
+        pos_score = pos_score.softmax(dim=-1)
+
+        gating = self.gating_param.view(1, -1, 1, 1)
+        attn = (1. - torch.sigmoid(gating)) * patch_score + torch.sigmoid(gating) * pos_score
+        attn /= attn.sum(dim=-1).unsqueeze(-1)
+        attn = self.attn_drop(attn)
+        return attn
+
+    def get_attention_map(self, x, return_map=False):
+        attn_map = self.get_attention(x).mean(0)  # average over batch
+        distances = self.rel_indices.squeeze()[:, :, -1] ** .5
+        dist = torch.einsum('nm,hnm->h', (distances, attn_map)) / distances.size(0)
+        if return_map:
+            return dist, attn_map
+        else:
+            return dist
+
+    def local_init(self):
+        self.v.weight.data.copy_(torch.eye(self.dim))
+        locality_distance = 1  # max(1,1/locality_strength**.5)
+
+        kernel_size = int(self.num_heads ** .5)
+        center = (kernel_size - 1) / 2 if kernel_size % 2 == 0 else kernel_size // 2
+        for h1 in range(kernel_size):
+            for h2 in range(kernel_size):
+                position = h1 + kernel_size * h2
+                self.pos_proj.weight.data[position, 2] = -1
+                self.pos_proj.weight.data[position, 1] = 2 * (h1 - center) * locality_distance
+                self.pos_proj.weight.data[position, 0] = 2 * (h2 - center) * locality_distance
+        self.pos_proj.weight.data *= self.locality_strength
+
+    def get_rel_indices(self, num_patches: int) -> torch.Tensor:
+        img_size = int(num_patches ** .5)
+        rel_indices = torch.zeros(1, num_patches, num_patches, 3)
+        ind = torch.arange(img_size).view(1, -1) - torch.arange(img_size).view(-1, 1)
+        indx = ind.repeat(img_size, img_size)
+        indy = ind.repeat_interleave(img_size, dim=0).repeat_interleave(img_size, dim=1)
+        indd = indx ** 2 + indy ** 2
+        rel_indices[:, :, :, 2] = indd.unsqueeze(0)
+        rel_indices[:, :, :, 1] = indy.unsqueeze(0)
+        rel_indices[:, :, :, 0] = indx.unsqueeze(0)
+        device = self.qk.weight.device
+        return rel_indices.to(device)
+
+
+class MHSA(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def get_attention_map(self, x, return_map=False):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn_map = (q @ k.transpose(-2, -1)) * self.scale
+        attn_map = attn_map.softmax(dim=-1).mean(0)
+
+        img_size = int(N ** .5)
+        ind = torch.arange(img_size).view(1, -1) - torch.arange(img_size).view(-1, 1)
+        indx = ind.repeat(img_size, img_size)
+        indy = ind.repeat_interleave(img_size, dim=0).repeat_interleave(img_size, dim=1)
+        indd = indx ** 2 + indy ** 2
+        distances = indd ** .5
+        distances = distances.to(x.device)
+
+        dist = torch.einsum('nm,hnm->h', (distances, attn_map)) / N
+        if return_map:
+            return dist, attn_map
+        else:
+            return dist
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_gpsa=True, **kwargs):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.use_gpsa = use_gpsa
+        if self.use_gpsa:
+            self.attn = GPSA(
+                dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, **kwargs)
+        else:
+            self.attn = MHSA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class ConViT(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+
+    The first ``local_up_to_layer`` blocks use GPSA attention and the remaining blocks use MHSA.
+    The class token is concatenated to the patch tokens right before block ``local_up_to_layer``.
+    """
+
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token',
+            embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, drop_rate=0., attn_drop_rate=0.,
+            drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm,
+            local_up_to_layer=3, locality_strength=1., use_pos_embed=True):
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        # The incoming `embed_dim` is multiplied by the head count; everything below uses the product.
+        embed_dim *= num_heads
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.local_up_to_layer = local_up_to_layer
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.locality_strength = locality_strength
+        self.use_pos_embed = use_pos_embed
+
+        # Patch embedding: CNN-backed when a hybrid backbone is given, plain patch projection otherwise.
+        if hybrid_backbone is not None:
+            self.patch_embed = HybridEmbed(
+                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
+        else:
+            self.patch_embed = PatchEmbed(
+                img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.num_patches = num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # The positional embedding covers the patch tokens only (no slot for the class token).
+        if self.use_pos_embed:
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+            trunc_normal_(self.pos_embed, std=.02)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        # GPSA blocks for indices below `local_up_to_layer`, MHSA blocks afterwards.
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                use_gpsa=True,
+                locality_strength=locality_strength)
+            if i < local_up_to_layer else
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                use_gpsa=False)
+            for i in range(depth)])
+        self.norm = norm_layer(embed_dim)
+
+        # Classifier head
+        self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')]
+        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+        # Runs after the generic init so modules exposing `local_init` (GPSA) overwrite those weights.
+        for n, m in self.named_modules():
+            if hasattr(m, 'local_init'):
+                m.local_init()
+
+    def _init_weights(self, m):
+        """Truncated-normal init for Linear weights (zero bias); LayerNorm gets weight 1 and bias 0."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the parameter names to be excluded from weight decay."""
+        return {'pos_embed', 'cls_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns that group parameters into stem and per-block groups."""
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Gradient checkpointing is unsupported; asserts that ``enable`` is falsy."""
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classifier head module."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """Replace the head with a fresh Linear (or Identity when ``num_classes`` <= 0).
+
+        ``global_pool`` is updated only when it is not ``None``.
+        """
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'token', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Embed the input, run all blocks and apply the final norm.
+
+        NOTE(review): the class token is concatenated only when the loop reaches index
+        ``local_up_to_layer``; if that value is >= the number of blocks it is never added.
+        """
+        x = self.patch_embed(x)
+        if self.use_pos_embed:
+            x = x + self.pos_embed
+        x = self.pos_drop(x)
+        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
+        for u, blk in enumerate(self.blocks):
+            # Prepend the class token just before the first non-GPSA block.
+            if u == self.local_up_to_layer:
+                x = torch.cat((cls_tokens, x), dim=1)
+            x = blk(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Pool the token sequence and, unless ``pre_logits`` is set, apply the head.
+
+        With ``global_pool == 'avg'`` tokens from index 1 onward are averaged; any other
+        non-empty value selects token 0. An empty ``global_pool`` skips pooling.
+        """
+        if self.global_pool:
+            x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        """Run ``forward_features`` followed by ``forward_head``."""
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_convit(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    return build_model_with_cfg(ConViT, variant, pretrained, **kwargs)
+
+
+@register_model
+def convit_tiny(pretrained=False, **kwargs):
+    model_args = dict(
+        local_up_to_layer=10, locality_strength=1.0, embed_dim=48,
+        num_heads=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
+    model = _create_convit(variant='convit_tiny', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convit_small(pretrained=False, **kwargs):
+    model_args = dict(
+        local_up_to_layer=10, locality_strength=1.0, embed_dim=48,
+        num_heads=9, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
+    model = _create_convit(variant='convit_small', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convit_base(pretrained=False, **kwargs):
+    model_args = dict(
+        local_up_to_layer=10, locality_strength=1.0, embed_dim=48,
+        num_heads=16, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
+    model = _create_convit(variant='convit_base', pretrained=pretrained, **model_args)
+    return model
diff --git a/src/custom_timm/models/convmixer.py b/src/custom_timm/models/convmixer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2140241a2af7f6e7a7427d9fc926e9b71c233b0
--- /dev/null
+++ b/src/custom_timm/models/convmixer.py
@@ -0,0 +1,125 @@
+""" ConvMixer
+
+"""
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from custom_timm.models.registry import register_model
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import SelectAdaptivePool2d
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': .96, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'classifier': 'head',
+        'first_conv': 'stem.0',
+        **kwargs
+    }
+
+
+# Pretrained-weight configs, keyed by the variant names that the factory
+# functions at the bottom of this module pass to `_create_convmixer`.
+default_cfgs = {
+    'convmixer_1536_20': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1536_20_ks9_p7.pth.tar'),
+    'convmixer_768_32': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_768_32_ks7_p7_relu.pth.tar'),
+    'convmixer_1024_20_ks9_p14': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1024_20_ks9_p14.pth.tar')
+}
+
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x):
+        return self.fn(x) + x
+
+
+class ConvMixer(nn.Module):
+    def __init__(
+            self, dim, depth, kernel_size=9, patch_size=7, in_chans=3, num_classes=1000, global_pool='avg',
+            act_layer=nn.GELU, **kwargs):
+        super().__init__()
+        self.num_classes = num_classes
+        self.num_features = dim
+        self.grad_checkpointing = False
+
+        self.stem = nn.Sequential(
+            nn.Conv2d(in_chans, dim, kernel_size=patch_size, stride=patch_size),
+            act_layer(),
+            nn.BatchNorm2d(dim)
+        )
+        self.blocks = nn.Sequential(
+            *[nn.Sequential(
+                    Residual(nn.Sequential(
+                        nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"),
+                        act_layer(),
+                        nn.BatchNorm2d(dim)
+                    )),
+                    nn.Conv2d(dim, dim, kernel_size=1),
+                    act_layer(),
+                    nn.BatchNorm2d(dim)
+            ) for i in range(depth)]
+        )
+        self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)
+        self.head = nn.Linear(dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(stem=r'^stem', blocks=r'^blocks\.(\d+)')
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+          
+    def forward_features(self, x):
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.pooling(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_convmixer(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(ConvMixer, variant, pretrained, **kwargs)
+
+
+@register_model
+def convmixer_1536_20(pretrained=False, **kwargs):
+    model_args = dict(dim=1536, depth=20, kernel_size=9, patch_size=7, **kwargs)
+    return _create_convmixer('convmixer_1536_20', pretrained, **model_args)
+
+
+@register_model
+def convmixer_768_32(pretrained=False, **kwargs):
+    model_args = dict(dim=768, depth=32, kernel_size=7, patch_size=7, act_layer=nn.ReLU, **kwargs)
+    return _create_convmixer('convmixer_768_32', pretrained, **model_args)
+
+
+@register_model
+def convmixer_1024_20_ks9_p14(pretrained=False, **kwargs):
+    model_args = dict(dim=1024, depth=20, kernel_size=9, patch_size=14, **kwargs)
+    return _create_convmixer('convmixer_1024_20_ks9_p14', pretrained, **model_args)
\ No newline at end of file
diff --git a/src/custom_timm/models/convnext.py b/src/custom_timm/models/convnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76d972236dbae1a8df24d70ee35f05f6207f815
--- /dev/null
+++ b/src/custom_timm/models/convnext.py
@@ -0,0 +1,673 @@
+""" ConvNeXt
+
+Paper: `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
+
+Original code and weights from https://github.com/facebookresearch/ConvNeXt, original copyright below
+
+Model defs atto, femto, pico, nano and _ols / _hnf variants are timm specific.
+
+Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
+"""
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the MIT license
+from collections import OrderedDict
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import named_apply, build_model_with_cfg, checkpoint_seq
+from .layers import trunc_normal_, SelectAdaptivePool2d, DropPath, ConvMlp, Mlp, LayerNorm2d, LayerNorm, \
+    create_conv2d, get_act_layer, make_divisible, to_ntuple
+from .registry import register_model
+
+
+# Explicit export list for this module; only the model class is listed here by hand.
+__all__ = ['ConvNeXt']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.0', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+# Pretrained-weight configs keyed by model variant name. Each entry starts from the
+# `_cfg` defaults and overrides only the fields given (test-time size/crop, input
+# size, pool size, class count).
+default_cfgs = dict(
+    # timm specific variants
+    convnext_atto=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    convnext_atto_ols=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    convnext_femto=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    convnext_femto_ols=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    convnext_pico=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    convnext_pico_ols=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth',
+        crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_nano=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth',
+        crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_nano_ols=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth',
+        crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_tiny_hnf=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth',
+        crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    # ImageNet-1k weights (URLs named *_1k_224_ema)
+    convnext_tiny=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_small=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_base=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_large=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    # *_in22ft1k entries at 224 input (URLs named *_22k_1k_224)
+    convnext_tiny_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_small_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_base_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_large_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    convnext_xlarge_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    # *_384_in22ft1k entries: 384x384 input with a 12x12 pool size
+    convnext_tiny_384_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    convnext_small_384_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    convnext_base_384_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    convnext_large_384_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    convnext_xlarge_384_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+
+    # *_in22k entries: 21841-class heads
+    convnext_tiny_in22k=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841),
+    convnext_small_in22k=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841),
+    convnext_base_in22k=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841),
+    convnext_large_in22k=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841),
+    convnext_xlarge_in22k=_cfg(
+        url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841),
+)
+
+
+class ConvNeXtBlock(nn.Module):
+    """ ConvNeXt Block
+    There are two equivalent implementations:
+      (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+      (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+
+    Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
+    choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
+    is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
+
+    Args:
+        in_chs (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        ls_init_value (float): Init value for Layer Scale. Default: 1e-6.
+    """
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs=None,
+            kernel_size=7,
+            stride=1,
+            dilation=1,
+            mlp_ratio=4,
+            conv_mlp=False,
+            conv_bias=True,
+            ls_init_value=1e-6,
+            act_layer='gelu',
+            norm_layer=None,
+            drop_path=0.,
+    ):
+        super().__init__()
+        out_chs = out_chs or in_chs
+        act_layer = get_act_layer(act_layer)
+        if not norm_layer:
+            norm_layer = LayerNorm2d if conv_mlp else LayerNorm
+        mlp_layer = ConvMlp if conv_mlp else Mlp
+        self.use_conv_mlp = conv_mlp
+
+        self.conv_dw = create_conv2d(
+            in_chs, out_chs, kernel_size=kernel_size, stride=stride, dilation=dilation, depthwise=True, bias=conv_bias)
+        self.norm = norm_layer(out_chs)
+        self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
+        self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value > 0 else None
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv_dw(x)
+        if self.use_conv_mlp:
+            x = self.norm(x)
+            x = self.mlp(x)
+        else:
+            x = x.permute(0, 2, 3, 1)
+            x = self.norm(x)
+            x = self.mlp(x)
+            x = x.permute(0, 3, 1, 2)
+        if self.gamma is not None:
+            x = x.mul(self.gamma.reshape(1, -1, 1, 1))
+
+        x = self.drop_path(x) + shortcut
+        return x
+
+
+class ConvNeXtStage(nn.Module):
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            kernel_size=7,
+            stride=2,
+            depth=2,
+            dilation=(1, 1),
+            drop_path_rates=None,
+            ls_init_value=1.0,
+            conv_mlp=False,
+            conv_bias=True,
+            act_layer='gelu',
+            norm_layer=None,
+            norm_layer_cl=None
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
+            ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
+            pad = 'same' if dilation[1] > 1 else 0  # same padding needed if dilation used
+            self.downsample = nn.Sequential(
+                norm_layer(in_chs),
+                create_conv2d(
+                    in_chs, out_chs, kernel_size=ds_ks, stride=stride,
+                    dilation=dilation[0], padding=pad, bias=conv_bias),
+            )
+            in_chs = out_chs
+        else:
+            self.downsample = nn.Identity()
+
+        drop_path_rates = drop_path_rates or [0.] * depth
+        stage_blocks = []
+        for i in range(depth):
+            stage_blocks.append(ConvNeXtBlock(
+                in_chs=in_chs,
+                out_chs=out_chs,
+                kernel_size=kernel_size,
+                dilation=dilation[1],
+                drop_path=drop_path_rates[i],
+                ls_init_value=ls_init_value,
+                conv_mlp=conv_mlp,
+                conv_bias=conv_bias,
+                act_layer=act_layer,
+                norm_layer=norm_layer if conv_mlp else norm_layer_cl
+            ))
+            in_chs = out_chs
+        self.blocks = nn.Sequential(*stage_blocks)
+
+    def forward(self, x):
+        x = self.downsample(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+
+
+class ConvNeXt(nn.Module):
+    r""" ConvNeXt
+        A PyTorch impl of : `A ConvNet for the 2020s`  - https://arxiv.org/pdf/2201.03545.pdf
+
+    Args:
+        in_chans (int): Number of input image channels. Default: 3
+        num_classes (int): Number of classes for classification head. Default: 1000
+        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+        dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
+        drop_rate (float): Head dropout rate
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        ls_init_value (float): Init value for Layer Scale. Default: 1e-6.
+        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
+    """
+
+    def __init__(
+            self,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            output_stride=32,
+            depths=(3, 3, 9, 3),
+            dims=(96, 192, 384, 768),
+            kernel_sizes=7,
+            ls_init_value=1e-6,
+            stem_type='patch',
+            patch_size=4,
+            head_init_scale=1.,
+            head_norm_first=False,
+            conv_mlp=False,
+            conv_bias=True,
+            act_layer='gelu',
+            norm_layer=None,
+            drop_rate=0.,
+            drop_path_rate=0.,
+    ):
+        super().__init__()
+        assert output_stride in (8, 16, 32)
+        kernel_sizes = to_ntuple(4)(kernel_sizes)
+        if norm_layer is None:
+            norm_layer = LayerNorm2d
+            norm_layer_cl = norm_layer if conv_mlp else LayerNorm
+        else:
+            assert conv_mlp,\
+                'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
+            norm_layer_cl = norm_layer
+
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.feature_info = []
+
+        assert stem_type in ('patch', 'overlap', 'overlap_tiered')
+        if stem_type == 'patch':
+            # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
+                norm_layer(dims[0])
+            )
+            stem_stride = patch_size
+        else:
+            mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                norm_layer(dims[0]),
+            )
+            stem_stride = 4
+
+        self.stages = nn.Sequential()
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        stages = []
+        prev_chs = dims[0]
+        curr_stride = stem_stride
+        dilation = 1
+        # 4 feature resolution stages, each consisting of multiple residual blocks
+        for i in range(4):
+            stride = 2 if curr_stride == 2 or i > 0 else 1
+            if curr_stride >= output_stride and stride > 1:
+                dilation *= stride
+                stride = 1
+            curr_stride *= stride
+            first_dilation = 1 if dilation in (1, 2) else 2
+            out_chs = dims[i]
+            stages.append(ConvNeXtStage(
+                prev_chs,
+                out_chs,
+                kernel_size=kernel_sizes[i],
+                stride=stride,
+                dilation=(first_dilation, dilation),
+                depth=depths[i],
+                drop_path_rates=dp_rates[i],
+                ls_init_value=ls_init_value,
+                conv_mlp=conv_mlp,
+                conv_bias=conv_bias,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                norm_layer_cl=norm_layer_cl
+            ))
+            prev_chs = out_chs
+            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
+            self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
+        self.stages = nn.Sequential(*stages)
+        self.num_features = prev_chs
+
+        # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
+        # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
+        self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity()
+        self.head = nn.Sequential(OrderedDict([
+                ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
+                ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)),
+                ('flatten', nn.Flatten(1) if global_pool else nn.Identity()),
+                ('drop', nn.Dropout(self.drop_rate)),
+                ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())]))
+
+        named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+)\.downsample', (0,)),  # blocks
+                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
+                (r'^norm_pre', (99999,))
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes=0, global_pool=None):
+        if global_pool is not None:
+            self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+            self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+        self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.norm_pre(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :(
+        x = self.head.global_pool(x)
+        x = self.head.norm(x)
+        x = self.head.flatten(x)
+        x = self.head.drop(x)
+        return x if pre_logits else self.head.fc(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_weights(module, name=None, head_init_scale=1.0):
+    if isinstance(module, nn.Conv2d):
+        trunc_normal_(module.weight, std=.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=.02)
+        nn.init.zeros_(module.bias)
+        if name and 'head.' in name:
+            module.weight.data.mul_(head_init_scale)
+            module.bias.data.mul_(head_init_scale)
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ Remap FB checkpoints -> timm """
+    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
+        return state_dict  # non-FB checkpoint
+    if 'model' in state_dict:
+        state_dict = state_dict['model']
+    out_dict = {}
+    import re
+    for k, v in state_dict.items():
+        k = k.replace('downsample_layers.0.', 'stem.')
+        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
+        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
+        k = k.replace('dwconv', 'conv_dw')
+        k = k.replace('pwconv', 'mlp.fc')
+        k = k.replace('head.', 'head.fc.')
+        if k.startswith('norm.'):
+            k = k.replace('norm', 'head.norm')
+        if v.ndim == 2 and 'head' not in k:
+            model_shape = model.state_dict()[k].shape
+            v = v.reshape(model_shape)
+        out_dict[k] = v
+    return out_dict
+
+
+def _create_convnext(variant, pretrained=False, **kwargs):
+    model = build_model_with_cfg(
+        ConvNeXt, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
+        **kwargs)
+    return model
+
+
+@register_model
+def convnext_atto(pretrained=False, **kwargs):
+    # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, **kwargs)
+    model = _create_convnext('convnext_atto', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_atto_ols(pretrained=False, **kwargs):
+    # timm femto variant with overlapping 3x3 conv stem, wider than non-ols femto above, current param count 3.7M
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, stem_type='overlap_tiered', **kwargs)
+    model = _create_convnext('convnext_atto_ols', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_femto(pretrained=False, **kwargs):
+    # timm femto variant
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, **kwargs)
+    model = _create_convnext('convnext_femto', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_femto_ols(pretrained=False, **kwargs):
+    # timm femto variant
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, stem_type='overlap_tiered', **kwargs)
+    model = _create_convnext('convnext_femto_ols', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_pico(pretrained=False, **kwargs):
+    # timm pico variant
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True, **kwargs)
+    model = _create_convnext('convnext_pico', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_pico_ols(pretrained=False, **kwargs):
+    # timm nano variant with overlapping 3x3 conv stem
+    model_args = dict(
+        depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True,  stem_type='overlap_tiered', **kwargs)
+    model = _create_convnext('convnext_pico_ols', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_nano(pretrained=False, **kwargs):
+    # timm nano variant with standard stem and head
+    model_args = dict(
+        depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, **kwargs)
+    model = _create_convnext('convnext_nano', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_nano_ols(pretrained=False, **kwargs):
+    # experimental nano variant with overlapping conv stem
+    model_args = dict(
+        depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, stem_type='overlap', **kwargs)
+    model = _create_convnext('convnext_nano_ols', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_tiny_hnf(pretrained=False, **kwargs):
+    # experimental tiny variant with norm before pooling in head (head norm first)
+    model_args = dict(
+        depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), head_norm_first=True, conv_mlp=True, **kwargs)
+    model = _create_convnext('convnext_tiny_hnf', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_tiny(pretrained=False, **kwargs):
+    model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
+    model = _create_convnext('convnext_tiny', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_small(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    model = _create_convnext('convnext_small', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_base(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    model = _create_convnext('convnext_base', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_large(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    model = _create_convnext('convnext_large', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_tiny_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
+    model = _create_convnext('convnext_tiny_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_small_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    model = _create_convnext('convnext_small_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_base_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    model = _create_convnext('convnext_base_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_large_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    model = _create_convnext('convnext_large_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_xlarge_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    model = _create_convnext('convnext_xlarge_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_tiny_384_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
+    model = _create_convnext('convnext_tiny_384_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_small_384_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    model = _create_convnext('convnext_small_384_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_base_384_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    model = _create_convnext('convnext_base_384_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_large_384_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    model = _create_convnext('convnext_large_384_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_xlarge_384_in22ft1k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    model = _create_convnext('convnext_xlarge_384_in22ft1k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_tiny_in22k(pretrained=False, **kwargs):
+    model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
+    model = _create_convnext('convnext_tiny_in22k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_small_in22k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    model = _create_convnext('convnext_small_in22k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_base_in22k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    model = _create_convnext('convnext_base_in22k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_large_in22k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    model = _create_convnext('convnext_large_in22k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def convnext_xlarge_in22k(pretrained=False, **kwargs):
+    model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    model = _create_convnext('convnext_xlarge_in22k', pretrained=pretrained, **model_args)
+    return model
diff --git a/src/custom_timm/models/crossvit.py b/src/custom_timm/models/crossvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb996207da81e19b932c44d36af020267e227357
--- /dev/null
+++ b/src/custom_timm/models/crossvit.py
@@ -0,0 +1,539 @@
+""" CrossViT Model
+
+@inproceedings{
+    chen2021crossvit,
+    title={{CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification}},
+    author={Chun-Fu (Richard) Chen and Quanfu Fan and Rameswar Panda},
+    booktitle={International Conference on Computer Vision (ICCV)},
+    year={2021}
+}
+
+Paper link: https://arxiv.org/abs/2103.14899
+Original code: https://github.com/IBM/CrossViT/blob/main/models/crossvit.py
+
+NOTE: model names have been renamed from originals to represent actual input res all *_224 -> *_240 and *_384 -> *_408
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+
+# Copyright IBM All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+Modified from custom_timm. https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+
+"""
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.hub
+from functools import partial
+from typing import List
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg
+from .layers import DropPath, to_2tuple, trunc_normal_, _assert
+from .registry import register_model
+from .vision_transformer import Mlp, Block
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 240, 240), 'pool_size': None, 'crop_pct': 0.875,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'fixed_input_size': True,
+        'first_conv': ('patch_embed.0.proj', 'patch_embed.1.proj'),
+        'classifier': ('head.0', 'head.1'),
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'crossvit_15_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_224.pth'),
+    'crossvit_15_dagger_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_dagger_224.pth',
+        first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'),
+    ),
+    'crossvit_15_dagger_408': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_dagger_384.pth',
+        input_size=(3, 408, 408), first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), crop_pct=1.0,
+    ),
+    'crossvit_18_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_224.pth'),
+    'crossvit_18_dagger_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_dagger_224.pth',
+        first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'),
+    ),
+    'crossvit_18_dagger_408': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_dagger_384.pth',
+        input_size=(3, 408, 408), first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), crop_pct=1.0,
+    ),
+    'crossvit_9_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_9_224.pth'),
+    'crossvit_9_dagger_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_9_dagger_224.pth',
+        first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'),
+    ),
+    'crossvit_base_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_base_224.pth'),
+    'crossvit_small_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_small_224.pth'),
+    'crossvit_tiny_240': _cfg(
+        url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_tiny_224.pth'),
+}
+
+
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, multi_conv=False):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        if multi_conv:
+            if patch_size[0] == 12:
+                self.proj = nn.Sequential(
+                    nn.Conv2d(in_chans, embed_dim // 4, kernel_size=7, stride=4, padding=3),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(embed_dim // 4, embed_dim // 2, kernel_size=3, stride=3, padding=0),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=3, stride=1, padding=1),
+                )
+            elif patch_size[0] == 16:
+                self.proj = nn.Sequential(
+                    nn.Conv2d(in_chans, embed_dim // 4, kernel_size=7, stride=4, padding=3),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(embed_dim // 4, embed_dim // 2, kernel_size=3, stride=2, padding=1),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=3, stride=2, padding=1),
+                )
+        else:
+            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        _assert(H == self.img_size[0],
+                f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).")
+        _assert(W == self.img_size[1],
+                f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).")
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+
+
+class CrossAttention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+        self.scale = qk_scale or head_dim ** -0.5
+
+        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
+        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
+        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        # B1C -> B1H(C/H) -> BH1(C/H)
+        q = self.wq(x[:, 0:1, ...]).reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        # BNC -> BNH(C/H) -> BHN(C/H)
+        k = self.wk(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        # BNC -> BNH(C/H) -> BHN(C/H)
+        v = self.wv(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # BH1(C/H) @ BH(C/H)N -> BH1N
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, 1, C)  # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class CrossAttentionBlock(nn.Module):
+
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = CrossAttention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        x = x[:, 0:1, ...] + self.drop_path(self.attn(self.norm1(x)))
+        return x
+
+
+class MultiScaleBlock(nn.Module):
+    """One multi-branch stage: per-branch transformer blocks followed by cross-branch fusion.
+
+    ``dim``, ``num_heads`` and ``mlp_ratio`` are indexed per branch. ``depth[d]`` is the number
+    of transformer blocks of branch ``d`` and ``depth[-1]`` is the number of cross-attention
+    blocks used for fusion. ``drop_path`` is a sequence of rates: entry ``i`` is used for the
+    ``i``-th block of every branch and the last entry for all fusion blocks. ``patches`` is
+    accepted but not referenced in this class.
+    """
+
+    def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, drop=0., attn_drop=0.,
+                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+
+        num_branches = len(dim)
+        self.num_branches = num_branches
+        # different branch could have different embedding size, the first one is the base
+        self.blocks = nn.ModuleList()
+        for d in range(num_branches):
+            tmp = []
+            for i in range(depth[d]):
+                tmp.append(Block(
+                    dim=dim[d], num_heads=num_heads[d], mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias,
+                    drop=drop, attn_drop=attn_drop, drop_path=drop_path[i], norm_layer=norm_layer))
+            if len(tmp) != 0:
+                self.blocks.append(nn.Sequential(*tmp))
+
+        # NOTE(review): forward() iterates self.blocks unconditionally, so a None here would
+        # fail at call time; branches with depth 0 are also skipped above, which shifts indices.
+        if len(self.blocks) == 0:
+            self.blocks = None
+
+        # projections of each branch's first token into the embedding size of the next branch
+        self.projs = nn.ModuleList()
+        for d in range(num_branches):
+            # the trailing `and False` disables the identity shortcut: a projection is always built
+            if dim[d] == dim[(d + 1) % num_branches] and False:
+                tmp = [nn.Identity()]
+            else:
+                tmp = [norm_layer(dim[d]), act_layer(), nn.Linear(dim[d], dim[(d + 1) % num_branches])]
+            self.projs.append(nn.Sequential(*tmp))
+
+        # cross-attention modules, built with the width and head count of the *next* branch
+        self.fusion = nn.ModuleList()
+        for d in range(num_branches):
+            d_ = (d + 1) % num_branches
+            nh = num_heads[d_]
+            if depth[-1] == 0:  # backward capability:
+                self.fusion.append(
+                    CrossAttentionBlock(
+                        dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias,
+                        drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], norm_layer=norm_layer))
+            else:
+                tmp = []
+                for _ in range(depth[-1]):
+                    tmp.append(CrossAttentionBlock(
+                        dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias,
+                        drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], norm_layer=norm_layer))
+                self.fusion.append(nn.Sequential(*tmp))
+
+        # projections of the fused token back to the embedding size of its own branch
+        self.revert_projs = nn.ModuleList()
+        for d in range(num_branches):
+            # as above, `and False` means the identity branch is never taken
+            if dim[(d + 1) % num_branches] == dim[d] and False:
+                tmp = [nn.Identity()]
+            else:
+                tmp = [norm_layer(dim[(d + 1) % num_branches]), act_layer(),
+                       nn.Linear(dim[(d + 1) % num_branches], dim[d])]
+            self.revert_projs.append(nn.Sequential(*tmp))
+
+    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
+        """Process one tensor per branch and return one fused tensor per branch.
+
+        Each branch first runs its own blocks. Its first token is then projected to the next
+        branch's width, attends over that branch's remaining tokens, is projected back and is
+        re-attached in front of its own branch's remaining tokens.
+        """
+
+        outs_b = []
+        for i, block in enumerate(self.blocks):
+            outs_b.append(block(x[i]))
+
+        # only take the cls token out
+        proj_cls_token = torch.jit.annotate(List[torch.Tensor], [])
+        for i, proj in enumerate(self.projs):
+            proj_cls_token.append(proj(outs_b[i][:, 0:1, ...]))
+
+        # cross attention
+        outs = []
+        for i, (fusion, revert_proj) in enumerate(zip(self.fusion, self.revert_projs)):
+            # projected first token of branch i + all other tokens of the next branch
+            tmp = torch.cat((proj_cls_token[i], outs_b[(i + 1) % self.num_branches][:, 1:, ...]), dim=1)
+            tmp = fusion(tmp)
+            reverted_proj_cls_token = revert_proj(tmp[:, 0:1, ...])
+            # fused first token goes back in front of branch i's own remaining tokens
+            tmp = torch.cat((reverted_proj_cls_token, outs_b[i][:, 1:, ...]), dim=1)
+            outs.append(tmp)
+        return outs
+
+
+def _compute_num_patches(img_size, patches):
+    return [i[0] // p * i[1] // p for i, p in zip(img_size, patches)]
+
+
+@register_notrace_function
+def scale_image(x, ss: Tuple[int, int], crop_scale: bool = False):  # annotations for torchscript
+    """
+    Pulled out of CrossViT.forward_features to bury conditional logic in a leaf node for FX tracing.
+    Args:
+        x (Tensor): input image
+        ss (tuple[int, int]): height and width to scale to
+        crop_scale (bool): whether to crop instead of interpolate to achieve the desired scale. Defaults to False
+    Returns:
+        Tensor: the "scaled" image batch tensor
+    """
+    H, W = x.shape[-2:]
+    if H != ss[0] or W != ss[1]:
+        if crop_scale and ss[0] <= H and ss[1] <= W:
+            cu, cl = int(round((H - ss[0]) / 2.)), int(round((W - ss[1]) / 2.))
+            x = x[:, :, cu:cu + ss[0], cl:cl + ss[1]]
+        else:
+            x = torch.nn.functional.interpolate(x, size=ss, mode='bicubic', align_corners=False)
+    return x
+
+
+class CrossViT(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+
+    def __init__(
+            self, img_size=224, img_scale=(1.0, 1.0), patch_size=(8, 16), in_chans=3, num_classes=1000,
+            embed_dim=(192, 384), depth=((1, 3, 1), (1, 3, 1), (1, 3, 1)), num_heads=(6, 12), mlp_ratio=(2., 2., 4.),
+            multi_conv=False, crop_scale=False, qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6), global_pool='token',
+    ):
+        super().__init__()
+        assert global_pool in ('token', 'avg')
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.img_size = to_2tuple(img_size)
+        img_scale = to_2tuple(img_scale)
+        self.img_size_scaled = [tuple([int(sj * si) for sj in self.img_size]) for si in img_scale]
+        self.crop_scale = crop_scale  # crop instead of interpolate for scale
+        num_patches = _compute_num_patches(self.img_size_scaled, patch_size)
+        self.num_branches = len(patch_size)
+        self.embed_dim = embed_dim
+        self.num_features = sum(embed_dim)
+        self.patch_embed = nn.ModuleList()
+
+        # hard-coded for torch jit script
+        for i in range(self.num_branches):
+            setattr(self, f'pos_embed_{i}', nn.Parameter(torch.zeros(1, 1 + num_patches[i], embed_dim[i])))
+            setattr(self, f'cls_token_{i}', nn.Parameter(torch.zeros(1, 1, embed_dim[i])))
+
+        for im_s, p, d in zip(self.img_size_scaled, patch_size, embed_dim):
+            self.patch_embed.append(
+                PatchEmbed(img_size=im_s, patch_size=p, in_chans=in_chans, embed_dim=d, multi_conv=multi_conv))
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        total_depth = sum([sum(x[-2:]) for x in depth])
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth)]  # stochastic depth decay rule
+        dpr_ptr = 0
+        self.blocks = nn.ModuleList()
+        for idx, block_cfg in enumerate(depth):
+            curr_depth = max(block_cfg[:-1]) + block_cfg[-1]
+            dpr_ = dpr[dpr_ptr:dpr_ptr + curr_depth]
+            blk = MultiScaleBlock(
+                embed_dim, num_patches, block_cfg, num_heads=num_heads, mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr_, norm_layer=norm_layer)
+            dpr_ptr += curr_depth
+            self.blocks.append(blk)
+
+        self.norm = nn.ModuleList([norm_layer(embed_dim[i]) for i in range(self.num_branches)])
+        self.head = nn.ModuleList([
+            nn.Linear(embed_dim[i], num_classes) if num_classes > 0 else nn.Identity()
+            for i in range(self.num_branches)])
+
+        for i in range(self.num_branches):
+            trunc_normal_(getattr(self, f'pos_embed_{i}'), std=.02)
+            trunc_normal_(getattr(self, f'cls_token_{i}'), std=.02)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        out = set()
+        for i in range(self.num_branches):
+            out.add(f'cls_token_{i}')
+            pe = getattr(self, f'pos_embed_{i}', None)
+            if pe is not None and pe.requires_grad:
+                out.add(f'pos_embed_{i}')
+        return out
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Reject requests to enable gradient checkpointing; only ``enable=False`` is accepted."""
+        # Raises AssertionError when a caller asks to turn checkpointing on.
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return ``self.head``, the ``nn.ModuleList`` holding one classifier head per branch."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('token', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.ModuleList(
+            [nn.Linear(self.embed_dim[i], num_classes) if num_classes > 0 else nn.Identity() for i in
+             range(self.num_branches)])
+
+    def forward_features(self, x) -> List[torch.Tensor]:
+        """Embed the input once per branch, run the multi-scale blocks, and normalise each branch.
+
+        Returns a list with one tensor per branch, each passed through that branch's norm layer.
+        """
+        B = x.shape[0]
+        xs = []
+        for i, patch_embed in enumerate(self.patch_embed):
+            x_ = x
+            ss = self.img_size_scaled[i]
+            # Bring the input to this branch's target size; scale_image is defined outside this
+            # view (presumably crops or interpolates depending on self.crop_scale -- confirm).
+            x_ = scale_image(x_, ss, self.crop_scale)
+            x_ = patch_embed(x_)
+            cls_tokens = self.cls_token_0 if i == 0 else self.cls_token_1  # hard-coded for torch jit script
+            # Broadcast the class token over the batch and prepend it along dim 1.
+            cls_tokens = cls_tokens.expand(B, -1, -1)
+            x_ = torch.cat((cls_tokens, x_), dim=1)
+            pos_embed = self.pos_embed_0 if i == 0 else self.pos_embed_1  # hard-coded for torch jit script
+            x_ = x_ + pos_embed
+            x_ = self.pos_drop(x_)
+            xs.append(x_)
+
+        # Each block consumes and returns the full list of branch tensors.
+        for i, blk in enumerate(self.blocks):
+            xs = blk(xs)
+
+        # NOTE: was before branch token section, move to here to assure all branch token are before layer norm
+        xs = [norm(xs[i]) for i, norm in enumerate(self.norm)]
+        return xs
+
+    def forward_head(self, xs: List[torch.Tensor], pre_logits: bool = False) -> torch.Tensor:
+        """Pool each branch's tokens and merge the branches into one tensor.
+
+        With ``global_pool == 'avg'`` a branch is reduced to the mean of all tokens after the
+        first; otherwise its first token is used. When ``pre_logits`` is set, or the heads are
+        ``nn.Identity``, the pooled branch features are concatenated along dim 1. Otherwise each
+        branch's head is applied and the per-branch outputs are averaged.
+        """
+        xs = [x[:, 1:].mean(dim=1) for x in xs] if self.global_pool == 'avg' else [x[:, 0] for x in xs]
+        if pre_logits or isinstance(self.head[0], nn.Identity):
+            return torch.cat([x for x in xs], dim=1)
+        # Stack the per-branch head outputs on a new leading dim and average over it.
+        return torch.mean(torch.stack([head(xs[i]) for i, head in enumerate(self.head)], dim=0), dim=0)
+
+    def forward(self, x):
+        xs = self.forward_features(x)
+        x = self.forward_head(xs)
+        return x
+
+
+def _create_crossvit(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    def pretrained_filter_fn(state_dict):
+        new_state_dict = {}
+        for key in state_dict.keys():
+            if 'pos_embed' in key or 'cls_token' in key:
+                new_key = key.replace(".", "_")
+            else:
+                new_key = key
+            new_state_dict[new_key] = state_dict[key]
+        return new_state_dict
+
+    return build_model_with_cfg(
+        CrossViT, variant, pretrained,
+        pretrained_filter_fn=pretrained_filter_fn,
+        **kwargs)
+
+
+@register_model
+def crossvit_tiny_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[96, 192], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]],
+        num_heads=[3, 3], mlp_ratio=[4, 4, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_tiny_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_small_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]],
+        num_heads=[6, 6], mlp_ratio=[4, 4, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_small_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_base_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[384, 768], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]],
+        num_heads=[12, 12], mlp_ratio=[4, 4, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_base_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_9_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[128, 256], depth=[[1, 3, 0], [1, 3, 0], [1, 3, 0]],
+        num_heads=[4, 4], mlp_ratio=[3, 3, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_9_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_15_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]],
+        num_heads=[6, 6], mlp_ratio=[3, 3, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_15_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_18_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224 / 240), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]],
+        num_heads=[7, 7], mlp_ratio=[3, 3, 1], **kwargs)
+    model = _create_crossvit(variant='crossvit_18_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_9_dagger_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224 / 240), patch_size=[12, 16], embed_dim=[128, 256], depth=[[1, 3, 0], [1, 3, 0], [1, 3, 0]],
+        num_heads=[4, 4], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs)
+    model = _create_crossvit(variant='crossvit_9_dagger_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_15_dagger_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]],
+        num_heads=[6, 6], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs)
+    model = _create_crossvit(variant='crossvit_15_dagger_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_15_dagger_408(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 384/408), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]],
+        num_heads=[6, 6], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs)
+    model = _create_crossvit(variant='crossvit_15_dagger_408', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_18_dagger_240(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]],
+        num_heads=[7, 7], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs)
+    model = _create_crossvit(variant='crossvit_18_dagger_240', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def crossvit_18_dagger_408(pretrained=False, **kwargs):
+    model_args = dict(
+        img_scale=(1.0, 384/408), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]],
+        num_heads=[7, 7], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs)
+    model = _create_crossvit(variant='crossvit_18_dagger_408', pretrained=pretrained, **model_args)
+    return model
diff --git a/src/custom_timm/models/cspnet.py b/src/custom_timm/models/cspnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e19ec29f7b14cdf58368a8cbea5cdccee43b07e
--- /dev/null
+++ b/src/custom_timm/models/cspnet.py
@@ -0,0 +1,1083 @@
+"""PyTorch CspNet
+
+A PyTorch implementation of Cross Stage Partial Networks including:
+* CSPResNet50
+* CSPResNeXt50
+* CSPDarkNet53
+* and DarkNet53 for good measure
+
+Based on paper `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
+
+Reference impl via darknet cfg files at https://github.com/WongKinYiu/CrossStagePartialNetworks
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import collections.abc
+from dataclasses import dataclass, field, asdict
+from functools import partial
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, named_apply, MATCH_PREV_GROUP
+from .layers import ClassifierHead, ConvNormAct, ConvNormActAa, DropPath, get_attn, create_act_layer, make_divisible
+from .registry import register_model
+
+
+__all__ = ['CspNet']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8),
+        'crop_pct': 0.887, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+# Default configs keyed by model name, each built with `_cfg`. An empty `url` means no
+# weights URL is listed here for that entry.
+default_cfgs = {
+    'cspresnet50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnet50_ra-d3e8d487.pth'),
+    'cspresnet50d': _cfg(url=''),
+    'cspresnet50w': _cfg(url=''),
+    'cspresnext50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnext50_ra_224-648b4713.pth',
+    ),
+    'cspdarknet53': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspdarknet53_ra_256-d05c7c21.pth'),
+
+    'darknet17': _cfg(url=''),
+    'darknet21': _cfg(url=''),
+    'sedarknet21': _cfg(url=''),
+    'darknet53': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/darknet53_256_c2ns-3aeff817.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    'darknetaa53': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/darknetaa53_c2ns-5c28ec8a.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    'cs3darknet_s': _cfg(
+        url='', interpolation='bicubic'),
+    'cs3darknet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_m_c2ns-43f06604.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95,
+    ),
+    'cs3darknet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_l_c2ns-16220c5d.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    'cs3darknet_x': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_x_c2ns-4e4490aa.pth',
+        interpolation='bicubic', crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    'cs3darknet_focus_s': _cfg(
+        url='', interpolation='bicubic'),
+    'cs3darknet_focus_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_focus_m_c2ns-e23bed41.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    'cs3darknet_focus_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_focus_l_c2ns-65ef8888.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    'cs3darknet_focus_x': _cfg(
+        url='', interpolation='bicubic'),
+
+    'cs3sedarknet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3sedarknet_l_c2ns-e8d1dc13.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95),
+    'cs3sedarknet_x': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3sedarknet_x_c2ns-b4d0abc0.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0),
+
+    'cs3sedarknet_xdw': _cfg(
+        url='', interpolation='bicubic'),
+
+    'cs3edgenet_x': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3edgenet_x_c2-2e1610a9.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    'cs3se_edgenet_x': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3se_edgenet_x_c2ns-76f8e3ac.pth',
+        interpolation='bicubic', crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0),
+}
+
+
+@dataclass
+class CspStemCfg:
+    """Stem settings for a CspNet model config."""
+    # Stem output channels: a single int or a tuple (both forms appear in the configs below;
+    # presumably one entry per stem conv -- confirm against the stem builder).
+    out_chs: Union[int, Tuple[int, ...]] = 32
+    # Stem stride, as an int or a tuple.
+    stride: Union[int, Tuple[int, ...]] = 2
+    kernel_size: int = 3
+    # Padding as an int or a string; '' is the default.
+    padding: Union[int, str] = ''
+    # Pooling type name; the configs in this file use 'max' or '' (empty).
+    pool: Optional[str] = ''
+
+
+def _pad_arg(x, n):
+    # pads an argument tuple to specified n by padding with last value
+    if not isinstance(x, (tuple, list)):
+        x = (x,)
+    curr_n = len(x)
+    pad_n = n - curr_n
+    if pad_n <= 0:
+        return x[:n]
+    return tuple(x + (x[-1],) * pad_n)
+
+
+@dataclass
+class CspStagesCfg:
+    depth: Tuple[int, ...] = (3, 3, 5, 2)  # block depth (number of block repeats in stages)
+    out_chs: Tuple[int, ...] = (128, 256, 512, 1024)  # number of output channels for blocks in stage
+    stride: Union[int, Tuple[int, ...]] = 2  # stride of stage
+    groups: Union[int, Tuple[int, ...]] = 1  # num kxk conv groups
+    block_ratio: Union[float, Tuple[float, ...]] = 1.0
+    bottle_ratio: Union[float, Tuple[float, ...]] = 1.  # bottleneck-ratio of blocks in stage
+    avg_down: Union[bool, Tuple[bool, ...]] = False
+    attn_layer: Optional[Union[str, Tuple[str, ...]]] = None
+    attn_kwargs: Optional[Union[Dict, Tuple[Dict]]] = None
+    stage_type: Union[str, Tuple[str]] = 'csp'  # stage type ('csp', 'cs2', 'dark')
+    block_type: Union[str, Tuple[str]] = 'bottle'  # blocks type for stages ('bottle', 'dark')
+
+    # cross-stage only
+    expand_ratio: Union[float, Tuple[float, ...]] = 1.0
+    cross_linear: Union[bool, Tuple[bool, ...]] = False
+    down_growth: Union[bool, Tuple[bool, ...]] = False
+
+    def __post_init__(self):
+        n = len(self.depth)
+        assert len(self.out_chs) == n
+        self.stride = _pad_arg(self.stride, n)
+        self.groups = _pad_arg(self.groups, n)
+        self.block_ratio = _pad_arg(self.block_ratio, n)
+        self.bottle_ratio = _pad_arg(self.bottle_ratio, n)
+        self.avg_down = _pad_arg(self.avg_down, n)
+        self.attn_layer = _pad_arg(self.attn_layer, n)
+        self.attn_kwargs = _pad_arg(self.attn_kwargs, n)
+        self.stage_type = _pad_arg(self.stage_type, n)
+        self.block_type = _pad_arg(self.block_type, n)
+
+        self.expand_ratio = _pad_arg(self.expand_ratio, n)
+        self.cross_linear = _pad_arg(self.cross_linear, n)
+        self.down_growth = _pad_arg(self.down_growth, n)
+
+
+@dataclass
+class CspModelCfg:
+    """Top-level model config: a stem config, a stages config, and network-wide layer choices."""
+    stem: CspStemCfg
+    stages: CspStagesCfg
+    zero_init_last: bool = True  # zero init last weight (usually bn) in residual path
+    act_layer: str = 'leaky_relu'  # activation layer name
+    norm_layer: str = 'batchnorm'  # normalization layer name
+    aa_layer: Optional[str] = None  # FIXME support string factory for this
+
+
+def _cs3_cfg(
+        width_multiplier=1.0,
+        depth_multiplier=1.0,
+        avg_down=False,
+        act_layer='silu',
+        focus=False,
+        attn_layer=None,
+        attn_kwargs=None,
+        bottle_ratio=1.0,
+        block_type='dark',
+):
+    if focus:
+        stem_cfg = CspStemCfg(
+            out_chs=make_divisible(64 * width_multiplier),
+            kernel_size=6, stride=2, padding=2, pool='')
+    else:
+        stem_cfg = CspStemCfg(
+            out_chs=tuple([make_divisible(c * width_multiplier) for c in (32, 64)]),
+            kernel_size=3, stride=2, pool='')
+    return CspModelCfg(
+        stem=stem_cfg,
+        stages=CspStagesCfg(
+            out_chs=tuple([make_divisible(c * width_multiplier) for c in (128, 256, 512, 1024)]),
+            depth=tuple([int(d * depth_multiplier) for d in (3, 6, 9, 3)]),
+            stride=2,
+            bottle_ratio=bottle_ratio,
+            block_ratio=0.5,
+            avg_down=avg_down,
+            attn_layer=attn_layer,
+            attn_kwargs=attn_kwargs,
+            stage_type='cs3',
+            block_type=block_type,
+        ),
+        act_layer=act_layer,
+    )
+
+
+# Architecture configs keyed by model name; the keys match those of `default_cfgs`.
+model_cfgs = dict(
+    cspresnet50=CspModelCfg(
+        stem=CspStemCfg(out_chs=64, kernel_size=7, stride=4, pool='max'),
+        stages=CspStagesCfg(
+            depth=(3, 3, 5, 2),
+            out_chs=(128, 256, 512, 1024),
+            stride=(1, 2),
+            expand_ratio=2.,
+            bottle_ratio=0.5,
+            cross_linear=True,
+        ),
+    ),
+    cspresnet50d=CspModelCfg(
+        stem=CspStemCfg(out_chs=(32, 32, 64), kernel_size=3, stride=4, pool='max'),
+        stages=CspStagesCfg(
+            depth=(3, 3, 5, 2),
+            out_chs=(128, 256, 512, 1024),
+            stride=(1,) + (2,),
+            expand_ratio=2.,
+            bottle_ratio=0.5,
+            block_ratio=1.,
+            cross_linear=True,
+        ),
+    ),
+    cspresnet50w=CspModelCfg(
+        stem=CspStemCfg(out_chs=(32, 32, 64), kernel_size=3, stride=4, pool='max'),
+        stages=CspStagesCfg(
+            depth=(3, 3, 5, 2),
+            out_chs=(256, 512, 1024, 2048),
+            stride=(1,) + (2,),
+            expand_ratio=1.,
+            bottle_ratio=0.25,
+            block_ratio=0.5,
+            cross_linear=True,
+        ),
+    ),
+    cspresnext50=CspModelCfg(
+        stem=CspStemCfg(out_chs=64, kernel_size=7, stride=4, pool='max'),
+        stages=CspStagesCfg(
+            depth=(3, 3, 5, 2),
+            out_chs=(256, 512, 1024, 2048),
+            stride=(1,) + (2,),
+            groups=32,
+            expand_ratio=1.,
+            bottle_ratio=1.,
+            block_ratio=0.5,
+            cross_linear=True,
+        ),
+    ),
+    cspdarknet53=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1, 2, 8, 8, 4),
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=2,
+            expand_ratio=(2.,) + (1.,),
+            bottle_ratio=(0.5,) + (1.,),
+            block_ratio=(1.,) + (0.5,),
+            down_growth=True,
+            block_type='dark',
+        ),
+    ),
+    darknet17=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1,) * 5,
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=(2,),
+            bottle_ratio=(0.5,),
+            block_ratio=(1.,),
+            stage_type='dark',
+            block_type='dark',
+        ),
+    ),
+    darknet21=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1, 1, 1, 2, 2),
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=(2,),
+            bottle_ratio=(0.5,),
+            block_ratio=(1.,),
+            stage_type='dark',
+            block_type='dark',
+
+        ),
+    ),
+    sedarknet21=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1, 1, 1, 2, 2),
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=2,
+            bottle_ratio=0.5,
+            block_ratio=1.,
+            attn_layer='se',
+            stage_type='dark',
+            block_type='dark',
+
+        ),
+    ),
+    darknet53=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1, 2, 8, 8, 4),
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=2,
+            bottle_ratio=0.5,
+            block_ratio=1.,
+            stage_type='dark',
+            block_type='dark',
+        ),
+    ),
+    darknetaa53=CspModelCfg(
+        stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stages=CspStagesCfg(
+            depth=(1, 2, 8, 8, 4),
+            out_chs=(64, 128, 256, 512, 1024),
+            stride=2,
+            bottle_ratio=0.5,
+            block_ratio=1.,
+            avg_down=True,
+            stage_type='dark',
+            block_type='dark',
+        ),
+    ),
+
+    # The cs3* entries below are generated by `_cs3_cfg`, except cs3sedarknet_xdw.
+    cs3darknet_s=_cs3_cfg(width_multiplier=0.5, depth_multiplier=0.5),
+    cs3darknet_m=_cs3_cfg(width_multiplier=0.75, depth_multiplier=0.67),
+    cs3darknet_l=_cs3_cfg(),
+    cs3darknet_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33),
+
+    cs3darknet_focus_s=_cs3_cfg(width_multiplier=0.5, depth_multiplier=0.5, focus=True),
+    cs3darknet_focus_m=_cs3_cfg(width_multiplier=0.75, depth_multiplier=0.67, focus=True),
+    cs3darknet_focus_l=_cs3_cfg(focus=True),
+    cs3darknet_focus_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33, focus=True),
+
+    cs3sedarknet_l=_cs3_cfg(attn_layer='se', attn_kwargs=dict(rd_ratio=.25)),
+    cs3sedarknet_x=_cs3_cfg(attn_layer='se', width_multiplier=1.25, depth_multiplier=1.33),
+
+    # NOTE(review): written out by hand and leaves stage_type/block_type at their defaults
+    # ('csp' / 'bottle') despite the cs3 name -- confirm this is intended.
+    cs3sedarknet_xdw=CspModelCfg(
+        stem=CspStemCfg(out_chs=(32, 64), kernel_size=3, stride=2, pool=''),
+        stages=CspStagesCfg(
+            depth=(3, 6, 12, 4),
+            out_chs=(256, 512, 1024, 2048),
+            stride=2,
+            groups=(1, 1, 256, 512),
+            bottle_ratio=0.5,
+            block_ratio=0.5,
+            attn_layer='se',
+        ),
+        act_layer='silu',
+    ),
+
+    cs3edgenet_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33, bottle_ratio=1.5, block_type='edge'),
+    cs3se_edgenet_x=_cs3_cfg(
+        width_multiplier=1.25, depth_multiplier=1.33, bottle_ratio=1.5, block_type='edge',
+        attn_layer='se', attn_kwargs=dict(rd_ratio=.25)),
+)
+
+
+class BottleneckBlock(nn.Module):
+    """ ResNe(X)t Bottleneck Block
+    """
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            dilation=1,
+            bottle_ratio=0.25,
+            groups=1,
+            act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d,
+            attn_last=False,
+            attn_layer=None,
+            drop_block=None,
+            drop_path=0.
+    ):
+        super(BottleneckBlock, self).__init__()
+        mid_chs = int(round(out_chs * bottle_ratio))
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+        attn_last = attn_layer is not None and attn_last
+        attn_first = attn_layer is not None and not attn_last
+
+        self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.conv2 = ConvNormAct(
+            mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups,
+            drop_layer=drop_block, **ckwargs)
+        self.attn2 = attn_layer(mid_chs, act_layer=act_layer) if attn_first else nn.Identity()
+        self.conv3 = ConvNormAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
+        self.attn3 = attn_layer(out_chs, act_layer=act_layer) if attn_last else nn.Identity()
+        self.drop_path = DropPath(drop_path) if drop_path else nn.Identity()
+        self.act3 = create_act_layer(act_layer)
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.conv3.bn.weight)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.attn2(x)
+        x = self.conv3(x)
+        x = self.attn3(x)
+        x = self.drop_path(x) + shortcut
+        # FIXME partial shortcut needed if first block handled as per original, not used for my current impl
+        #x[:, :shortcut.size(1)] += shortcut
+        x = self.act3(x)
+        return x
+
+
+class DarkBlock(nn.Module):
+    """ DarkNet Block
+    """
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            dilation=1,
+            bottle_ratio=0.5,
+            groups=1,
+            act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d,
+            attn_layer=None,
+            drop_block=None,
+            drop_path=0.
+    ):
+        super(DarkBlock, self).__init__()
+        mid_chs = int(round(out_chs * bottle_ratio))
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+
+        self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.attn = attn_layer(mid_chs, act_layer=act_layer) if attn_layer is not None else nn.Identity()
+        self.conv2 = ConvNormAct(
+            mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups,
+            drop_layer=drop_block, **ckwargs)
+        self.drop_path = DropPath(drop_path) if drop_path else nn.Identity()
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.conv2.bn.weight)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.attn(x)
+        x = self.conv2(x)
+        x = self.drop_path(x) + shortcut
+        return x
+
+
+class EdgeBlock(nn.Module):
+    """ EdgeResidual / Fused-MBConv / MobileNetV1-like 3x3 + 1x1 block (w/ activated output)
+    """
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            dilation=1,
+            bottle_ratio=0.5,
+            groups=1,
+            act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d,
+            attn_layer=None,
+            drop_block=None,
+            drop_path=0.
+    ):
+        super(EdgeBlock, self).__init__()
+        mid_chs = int(round(out_chs * bottle_ratio))
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+
+        self.conv1 = ConvNormAct(
+            in_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups,
+            drop_layer=drop_block, **ckwargs)
+        self.attn = attn_layer(mid_chs, act_layer=act_layer) if attn_layer is not None else nn.Identity()
+        self.conv2 = ConvNormAct(mid_chs, out_chs, kernel_size=1, **ckwargs)
+        self.drop_path = DropPath(drop_path) if drop_path else nn.Identity()
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.conv2.bn.weight)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.attn(x)
+        x = self.conv2(x)
+        x = self.drop_path(x) + shortcut
+        return x
+
+
+class CrossStage(nn.Module):
+    """Cross Stage."""
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            stride,
+            dilation,
+            depth,
+            block_ratio=1.,
+            bottle_ratio=1.,
+            expand_ratio=1.,
+            groups=1,
+            first_dilation=None,
+            avg_down=False,
+            down_growth=False,
+            cross_linear=False,
+            block_dpr=None,
+            block_fn=BottleneckBlock,
+            **block_kwargs
+    ):
+        super(CrossStage, self).__init__()
+        first_dilation = first_dilation or dilation
+        down_chs = out_chs if down_growth else in_chs  # grow downsample channels to output channels
+        self.expand_chs = exp_chs = int(round(out_chs * expand_ratio))
+        block_out_chs = int(round(out_chs * block_ratio))
+        conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'))
+        aa_layer = block_kwargs.pop('aa_layer', None)
+
+        if stride != 1 or first_dilation != dilation:
+            if avg_down:
+                self.conv_down = nn.Sequential(
+                    nn.AvgPool2d(2) if stride == 2 else nn.Identity(),  # FIXME dilation handling
+                    ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **conv_kwargs)
+                )
+            else:
+                self.conv_down = ConvNormActAa(
+                    in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
+                    aa_layer=aa_layer, **conv_kwargs)
+            prev_chs = down_chs
+        else:
+            self.conv_down = nn.Identity()
+            prev_chs = in_chs
+
+        # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
+        # there is also special case for the first stage for some of the model that results in uneven split
+        # across the two paths. I did it this way for simplicity for now.
+        self.conv_exp = ConvNormAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs)
+        prev_chs = exp_chs // 2  # output of conv_exp is always split in two
+
+        self.blocks = nn.Sequential()
+        for i in range(depth):
+            self.blocks.add_module(str(i), block_fn(
+                in_chs=prev_chs,
+                out_chs=block_out_chs,
+                dilation=dilation,
+                bottle_ratio=bottle_ratio,
+                groups=groups,
+                drop_path=block_dpr[i] if block_dpr is not None else 0.,
+                **block_kwargs
+            ))
+            prev_chs = block_out_chs
+
+        # transition convs
+        self.conv_transition_b = ConvNormAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
+        self.conv_transition = ConvNormAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
+
+    def forward(self, x):
+        x = self.conv_down(x)
+        x = self.conv_exp(x)
+        xs, xb = x.split(self.expand_chs // 2, dim=1)
+        xb = self.blocks(xb)
+        xb = self.conv_transition_b(xb).contiguous()
+        out = self.conv_transition(torch.cat([xs, xb], dim=1))
+        return out
+
+
+class CrossStage3(nn.Module):
+    """Cross Stage 3.
+    Similar to CrossStage, but with only one transition conv for the output.
+    """
+    def __init__(
+            self,
+            in_chs,  # channels of the stage input
+            out_chs,  # channels produced by the final transition conv
+            stride,  # stride of the optional downsample at the start of the stage
+            dilation,  # dilation handed to every block
+            depth,  # number of blocks in the stage
+            block_ratio=1.,  # block output channels = round(out_chs * block_ratio)
+            bottle_ratio=1.,  # forwarded to block_fn unchanged
+            expand_ratio=1.,  # expansion conv channels = round(out_chs * expand_ratio)
+            groups=1,
+            first_dilation=None,  # dilation of the downsample conv; falls back to `dilation`
+            avg_down=False,  # use AvgPool2d (when stride == 2) + 1x1 conv instead of a strided 3x3 conv
+            down_growth=False,  # downsample conv emits out_chs instead of in_chs
+            cross_linear=False,  # when True the expansion conv is built with apply_act=False
+            block_dpr=None,  # optional per-block drop path rates; 0. is used when None
+            block_fn=BottleneckBlock,
+            **block_kwargs
+    ):
+        super(CrossStage3, self).__init__()
+        first_dilation = first_dilation or dilation
+        down_chs = out_chs if down_growth else in_chs  # grow downsample channels to output channels
+        self.expand_chs = exp_chs = int(round(out_chs * expand_ratio))
+        block_out_chs = int(round(out_chs * block_ratio))
+        conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'))
+        aa_layer = block_kwargs.pop('aa_layer', None)
+
+        if stride != 1 or first_dilation != dilation:
+            if avg_down:
+                self.conv_down = nn.Sequential(
+                    nn.AvgPool2d(2) if stride == 2 else nn.Identity(),  # FIXME dilation handling
+                    ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **conv_kwargs)
+                )
+            else:
+                self.conv_down = ConvNormActAa(
+                    in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
+                    aa_layer=aa_layer, **conv_kwargs)
+            prev_chs = down_chs  # NOTE(review): the avg_down path emits out_chs, not down_chs — confirm intent
+        else:
+            self.conv_down = nn.Identity()  # no-op module (not None) so forward() can call it unconditionally
+            prev_chs = in_chs
+
+        # expansion conv
+        self.conv_exp = ConvNormAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs)
+        prev_chs = exp_chs // 2  # expanded output is split in 2 for blocks and cross stage
+
+        self.blocks = nn.Sequential()
+        for i in range(depth):
+            self.blocks.add_module(str(i), block_fn(
+                in_chs=prev_chs,
+                out_chs=block_out_chs,
+                dilation=dilation,
+                bottle_ratio=bottle_ratio,
+                groups=groups,
+                drop_path=block_dpr[i] if block_dpr is not None else 0.,
+                **block_kwargs
+            ))
+            prev_chs = block_out_chs
+
+        # single 1x1 transition conv applied to the concatenated halves
+        self.conv_transition = ConvNormAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
+
+    def forward(self, x):
+        x = self.conv_down(x)  # identity when the stage neither strides nor changes dilation
+        x = self.conv_exp(x)
+        x1, x2 = x.split(self.expand_chs // 2, dim=1)  # x1 goes through the blocks, x2 bypasses them
+        x1 = self.blocks(x1)
+        out = self.conv_transition(torch.cat([x1, x2], dim=1))
+        return out
+
+
+class DarkStage(nn.Module):
+    """DarkNet stage: a downsampling conv (`conv_down`) followed by `depth` sequential blocks."""
+
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            stride,
+            dilation,
+            depth,
+            block_ratio=1.,
+            bottle_ratio=1.,
+            groups=1,
+            first_dilation=None,
+            avg_down=False,
+            block_fn=BottleneckBlock,
+            block_dpr=None,
+            **block_kwargs
+    ):
+        super().__init__()
+        if not first_dilation:
+            first_dilation = dilation
+        norm_act_args = {key: block_kwargs.get(key) for key in ('act_layer', 'norm_layer')}
+        aa_layer = block_kwargs.pop('aa_layer', None)
+
+        if not avg_down:
+            self.conv_down = ConvNormActAa(
+                in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
+                aa_layer=aa_layer, **norm_act_args)
+        else:
+            pool = nn.AvgPool2d(2) if stride == 2 else nn.Identity()  # FIXME dilation handling
+            conv = ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **norm_act_args)
+            self.conv_down = nn.Sequential(pool, conv)
+
+        block_chs = int(round(out_chs * block_ratio))
+        drop_rates = [0.] * depth if block_dpr is None else block_dpr
+        stage_blocks = []
+        for idx in range(depth):
+            stage_blocks.append(block_fn(
+                in_chs=block_chs if idx else out_chs,
+                out_chs=block_chs,
+                dilation=dilation,
+                bottle_ratio=bottle_ratio,
+                groups=groups,
+                drop_path=drop_rates[idx],
+                **block_kwargs
+            ))
+        self.blocks = nn.Sequential(*stage_blocks)
+
+    def forward(self, x):
+        # downsample first, then run the block sequence
+        x = self.conv_down(x)
+        return self.blocks(x)
+
+
+def create_csp_stem(
+        in_chans=3,
+        out_chs=32,
+        kernel_size=3,
+        stride=2,
+        pool='',
+        padding='',
+        act_layer=nn.ReLU,
+        norm_layer=nn.BatchNorm2d,
+        aa_layer=None
+):
+    """Build the stem convs (plus optional max-pool); returns (nn.Sequential stem, feature_info list)."""
+    widths = out_chs if isinstance(out_chs, (tuple, list)) else [out_chs]
+    assert len(widths)
+    assert stride in (1, 2, 4)
+    stem = nn.Sequential()
+    feature_info = []
+    latest_feat = None
+    cur_chs = in_chans
+    cur_stride = 1
+    final_idx = len(widths) - 1
+    for idx, width in enumerate(widths):
+        name = f'conv{idx + 1}'
+        # first conv is strided when stride > 1; the last conv adds a second /2 only if no pool does it
+        step = 2 if (idx == 0 and stride > 1) or (idx == final_idx and stride > 2 and not pool) else 1
+        if step > 1 and latest_feat is not None:
+            feature_info.append(latest_feat)
+        conv = ConvNormAct(
+            cur_chs, width, kernel_size,
+            stride=step,
+            padding=padding if idx == 0 else '',
+            act_layer=act_layer,
+            norm_layer=norm_layer)
+        stem.add_module(name, conv)
+        cur_stride *= step
+        cur_chs = width
+        latest_feat = dict(num_chs=cur_chs, reduction=cur_stride, module=f'stem.{name}')
+    if pool:
+        assert stride > 2
+        if latest_feat is not None:
+            feature_info.append(latest_feat)
+        if aa_layer is None:
+            stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
+            pool_name = 'pool'
+        else:
+            stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
+            stem.add_module('aa', aa_layer(channels=cur_chs, stride=2))
+            pool_name = 'aa'
+        cur_stride *= 2
+        latest_feat = dict(num_chs=cur_chs, reduction=cur_stride, module=f'stem.{pool_name}')
+    feature_info.append(latest_feat)
+    return stem, feature_info
+
+
+def _get_stage_fn(stage_args):
+    """Pop 'stage_type' from stage_args and return (stage class, stage_args)."""
+    kind = stage_args.pop('stage_type')
+    assert kind in ('dark', 'csp', 'cs3')
+    if kind == 'dark':
+        # DarkStage's signature has none of these options; drop them here so they
+        # do not travel on to the blocks through **block_kwargs
+        for unused in ('expand_ratio', 'cross_linear', 'down_growth'):
+            stage_args.pop(unused, None)
+        return DarkStage, stage_args
+    if kind == 'csp':
+        return CrossStage, stage_args
+    return CrossStage3, stage_args
+
+
+def _get_block_fn(stage_args):
+    """Pop 'block_type' from stage_args and return (block class, stage_args)."""
+    kind = stage_args.pop('block_type')
+    assert kind in ('dark', 'edge', 'bottle')
+    if kind == 'dark':
+        block_cls = DarkBlock
+    else:
+        block_cls = EdgeBlock if kind == 'edge' else BottleneckBlock
+    return block_cls, stage_args
+
+
+def _get_attn_fn(stage_args):
+    """Pop 'attn_layer' / 'attn_kwargs' from stage_args; return (resolved attn layer or None, stage_args)."""
+    layer_spec = stage_args.pop('attn_layer')
+    extra = stage_args.pop('attn_kwargs', None)
+    if layer_spec is None:
+        return None, stage_args
+    attn_cls = get_attn(layer_spec)
+    return (partial(attn_cls, **extra) if extra else attn_cls), stage_args
+
+
+def create_csp_stages(
+        cfg: CspModelCfg,
+        drop_path_rate: float,
+        output_stride: int,
+        stem_feat: Dict[str, Any]
+):
+    """Build all stages from cfg.stages; returns (nn.Sequential of stages, feature_info list)."""
+    depths = cfg.stages.depth
+    stage_cfg = asdict(cfg.stages)
+    if drop_path_rate:
+        stage_cfg['block_dpr'] = [r.tolist() for r in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+    else:
+        stage_cfg['block_dpr'] = [None] * len(depths)
+    # transpose the dict of per-stage sequences into one kwargs dict per stage
+    per_stage = [dict(zip(stage_cfg.keys(), vals)) for vals in zip(*stage_cfg.values())]
+    common_kwargs = dict(act_layer=cfg.act_layer, norm_layer=cfg.norm_layer)
+
+    dilation = 1
+    net_stride = stem_feat['reduction']
+    prev_chs = stem_feat['num_chs']
+    prev_feat = stem_feat
+    feature_info = []
+    stages = []
+    for stage_idx, args in enumerate(per_stage):
+        stage_fn, args = _get_stage_fn(args)
+        block_fn, args = _get_block_fn(args)
+        attn_fn, args = _get_attn_fn(args)
+        stride = args.pop('stride')
+        if stride != 1 and prev_feat:
+            feature_info.append(prev_feat)
+        if net_stride >= output_stride and stride > 1:
+            dilation *= stride
+            stride = 1
+        net_stride *= stride
+        first_dilation = 1 if dilation in (1, 2) else 2
+
+        stages.append(stage_fn(
+            prev_chs,
+            **args,
+            stride=stride,
+            first_dilation=first_dilation,
+            dilation=dilation,
+            block_fn=block_fn,
+            aa_layer=cfg.aa_layer,
+            attn_layer=attn_fn,  # will be passed through stage as block_kwargs
+            **common_kwargs,
+        ))
+        prev_chs = args['out_chs']
+        prev_feat = dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}')
+    feature_info.append(prev_feat)
+    return nn.Sequential(*stages), feature_info
+
+
+class CspNet(nn.Module):
+    """Cross Stage Partial base model.
+
+    Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
+    Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks
+
+    NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the
+    darknet impl. I did it this way for simplicity and less special cases.
+    """
+
+    def __init__(
+            self,
+            cfg: CspModelCfg,  # supplies stem / stages sub-configs and act, norm, aa layers
+            in_chans=3,  # input image channels, passed to the stem
+            num_classes=1000,  # classifier output size
+            output_stride=32,  # must be 8, 16 or 32
+            global_pool='avg',  # pool_type handed to ClassifierHead
+            drop_rate=0.,  # drop_rate handed to ClassifierHead
+            drop_path_rate=0.,  # passed to create_csp_stages
+            zero_init_last=True  # forwarded to _init_weights
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert output_stride in (8, 16, 32)
+        layer_args = dict(
+            act_layer=cfg.act_layer,
+            norm_layer=cfg.norm_layer,
+            aa_layer=cfg.aa_layer
+        )
+        self.feature_info = []
+
+        # Construct the stem
+        self.stem, stem_feat_info = create_csp_stem(in_chans, **asdict(cfg.stem), **layer_args)
+        self.feature_info.extend(stem_feat_info[:-1])  # last stem feature is handed to the stages instead
+
+        # Construct the stages
+        self.stages, stage_feat_info = create_csp_stages(
+            cfg,
+            drop_path_rate=drop_path_rate,
+            output_stride=output_stride,
+            stem_feat=stem_feat_info[-1],
+        )
+        prev_chs = stage_feat_info[-1]['num_chs']
+        self.feature_info.extend(stage_feat_info)
+
+        # Construct the head
+        self.num_features = prev_chs
+        self.head = ClassifierHead(
+            in_chs=prev_chs, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
+                (r'^stages\.(\d+)\..*transition', MATCH_PREV_GROUP),  # map to last block in stage
+                (r'^stages\.(\d+)', (0,)),
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes  # keep the attribute in sync with the rebuilt head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        return self.forward_head(x)
+
+
+def _init_weights(module, name, zero_init_last=False):
+    """Per-module init: Kaiming-normal convs, N(0, 0.01) linears, zero biases, optional zero_init_last()."""
+    if isinstance(module, (nn.Conv2d, nn.Linear)):
+        if isinstance(module, nn.Conv2d):
+            nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+        else:
+            nn.init.normal_(module.weight, mean=0.0, std=0.01)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif zero_init_last and hasattr(module, 'zero_init_last'):
+        module.zero_init_last()
+
+
+def _create_cspnet(variant, pretrained=False, **kwargs):
+    """Build a CspNet for `variant` via build_model_with_cfg, using model_cfgs[variant] as its config."""
+    # NOTE: DarkNet is one of few models with stride==1 features w/ 6 out_indices [0..5]
+    num_feats = 6 if variant.startswith(('darknet', 'cspdarknet')) else 5
+    feature_cfg = dict(flatten_sequential=True, out_indices=kwargs.pop('out_indices', tuple(range(num_feats))))
+    return build_model_with_cfg(
+        CspNet,
+        variant,
+        pretrained,
+        model_cfg=model_cfgs[variant],
+        feature_cfg=feature_cfg,
+        **kwargs)
+
+
+@register_model
+def cspresnet50(pretrained=False, **kwargs):  # built from model_cfgs['cspresnet50']
+    return _create_cspnet(variant='cspresnet50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnet50d(pretrained=False, **kwargs):  # built from model_cfgs['cspresnet50d']
+    return _create_cspnet(variant='cspresnet50d', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnet50w(pretrained=False, **kwargs):  # built from model_cfgs['cspresnet50w']
+    return _create_cspnet(variant='cspresnet50w', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnext50(pretrained=False, **kwargs):  # built from model_cfgs['cspresnext50']
+    return _create_cspnet(variant='cspresnext50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspdarknet53(pretrained=False, **kwargs):  # built from model_cfgs['cspdarknet53']
+    return _create_cspnet(variant='cspdarknet53', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def darknet17(pretrained=False, **kwargs):  # built from model_cfgs['darknet17']
+    return _create_cspnet(variant='darknet17', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def darknet21(pretrained=False, **kwargs):  # built from model_cfgs['darknet21']
+    return _create_cspnet(variant='darknet21', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def sedarknet21(pretrained=False, **kwargs):  # built from model_cfgs['sedarknet21']
+    return _create_cspnet(variant='sedarknet21', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def darknet53(pretrained=False, **kwargs):  # built from model_cfgs['darknet53']
+    return _create_cspnet(variant='darknet53', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def darknetaa53(pretrained=False, **kwargs):  # built from model_cfgs['darknetaa53']
+    return _create_cspnet(variant='darknetaa53', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_s(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_s']
+    return _create_cspnet(variant='cs3darknet_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_m(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_m']
+    return _create_cspnet(variant='cs3darknet_m', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_l(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_l']
+    return _create_cspnet(variant='cs3darknet_l', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_x(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_x']
+    return _create_cspnet(variant='cs3darknet_x', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_focus_s(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_focus_s']
+    return _create_cspnet(variant='cs3darknet_focus_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_focus_m(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_focus_m']
+    return _create_cspnet(variant='cs3darknet_focus_m', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_focus_l(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_focus_l']
+    return _create_cspnet(variant='cs3darknet_focus_l', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3darknet_focus_x(pretrained=False, **kwargs):  # built from model_cfgs['cs3darknet_focus_x']
+    return _create_cspnet(variant='cs3darknet_focus_x', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3sedarknet_l(pretrained=False, **kwargs):  # built from model_cfgs['cs3sedarknet_l']
+    return _create_cspnet(variant='cs3sedarknet_l', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3sedarknet_x(pretrained=False, **kwargs):  # built from model_cfgs['cs3sedarknet_x']
+    return _create_cspnet(variant='cs3sedarknet_x', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3sedarknet_xdw(pretrained=False, **kwargs):  # built from model_cfgs['cs3sedarknet_xdw']
+    return _create_cspnet(variant='cs3sedarknet_xdw', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3edgenet_x(pretrained=False, **kwargs):  # built from model_cfgs['cs3edgenet_x']
+    return _create_cspnet(variant='cs3edgenet_x', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cs3se_edgenet_x(pretrained=False, **kwargs):  # built from model_cfgs['cs3se_edgenet_x']
+    return _create_cspnet(variant='cs3se_edgenet_x', pretrained=pretrained, **kwargs)
\ No newline at end of file
diff --git a/src/custom_timm/models/deit.py b/src/custom_timm/models/deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d9e14d1420b45383829cfe00c822216994b114
--- /dev/null
+++ b/src/custom_timm/models/deit.py
@@ -0,0 +1,449 @@
+""" DeiT - Data-efficient Image Transformers
+
+DeiT model defs and weights from https://github.com/facebookresearch/deit, original copyright below
+
+paper: `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877
+
+paper: `DeiT III: Revenge of the ViT` - https://arxiv.org/abs/2204.07118
+
+Modifications copyright 2021, Ross Wightman
+"""
+# Copyright (c) 2015-present, Facebook, Inc.
+# All rights reserved.
+from functools import partial
+
+import torch
+from torch import nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from custom_timm.models.vision_transformer import VisionTransformer, trunc_normal_, checkpoint_filter_fn
+
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict for `url`; keyword arguments override or extend it."""
+    cfg = dict(url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None)
+    cfg.update(crop_pct=.9, interpolation='bicubic', fixed_input_size=True)
+    cfg.update(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)
+    cfg.update(first_conv='patch_embed.proj', classifier='head')
+    # caller-supplied entries win over the defaults above
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # deit models (FB weights)
+    'deit_tiny_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'),
+    'deit_small_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'),
+    'deit_base_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth'),
+    'deit_base_patch16_384': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    # distilled deit models: two classifiers ('head', 'head_dist')
+    'deit_tiny_distilled_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth',
+        classifier=('head', 'head_dist')),
+    'deit_small_distilled_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth',
+        classifier=('head', 'head_dist')),
+    'deit_base_distilled_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth',
+        classifier=('head', 'head_dist')),
+    'deit_base_distilled_patch16_384': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth',
+        input_size=(3, 384, 384), crop_pct=1.0,
+        classifier=('head', 'head_dist')),
+    # deit3 models, weight files named *_1k
+    'deit3_small_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_small_224_1k.pth'),
+    'deit3_small_patch16_384': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_small_384_1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_medium_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_medium_224_1k.pth'),
+    'deit3_base_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_base_224_1k.pth'),
+    'deit3_base_patch16_384': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_base_384_1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_large_patch16_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_large_224_1k.pth'),
+    'deit3_large_patch16_384': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_large_384_1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_huge_patch14_224': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_huge_224_1k.pth'),
+    # deit3 models, weight files named *_21k (config keys suffixed _in21ft1k); all use crop_pct=1.0
+    'deit3_small_patch16_224_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_small_224_21k.pth',
+        crop_pct=1.0),
+    'deit3_small_patch16_384_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_small_384_21k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_medium_patch16_224_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_medium_224_21k.pth',
+        crop_pct=1.0),
+    'deit3_base_patch16_224_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_base_224_21k.pth',
+        crop_pct=1.0),
+    'deit3_base_patch16_384_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_base_384_21k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_large_patch16_224_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_large_224_21k.pth',
+        crop_pct=1.0),
+    'deit3_large_patch16_384_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_large_384_21k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'deit3_huge_patch14_224_in21ft1k': _cfg(
+        url='https://dl.fbaipublicfiles.com/deit/deit_3_huge_224_21k_v1.pth',
+        crop_pct=1.0),
+}
+
+
+class VisionTransformerDistilled(VisionTransformer):
+    """ Vision Transformer w/ Distillation Token and Head
+
+    Adds a second prefix token (`dist_token`) and a second classifier (`head_dist`), as used by
+    `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877
+    """
+
+    def __init__(self, *args, **kwargs):
+        requested_init = kwargs.pop('weight_init', '')
+        super().__init__(*args, **kwargs, weight_init='skip')  # requested mode is applied at the end instead
+        assert self.global_pool == 'token'
+
+        self.num_prefix_tokens = 2
+        self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
+        num_tokens = self.patch_embed.num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, self.embed_dim))
+        self.head_dist = nn.Identity() if self.num_classes <= 0 else nn.Linear(self.embed_dim, self.num_classes)
+        self.distilled_training = False  # must set this True to train w/ distillation token
+
+        self.init_weights(requested_init)
+
+    def init_weights(self, mode=''):
+        trunc_normal_(self.dist_token, std=.02)  # dist_token is created by this subclass, so init it here
+        super().init_weights(mode=mode)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        block_groups = [
+            (r'^blocks\.(\d+)', None),
+            (r'^norm', (99999,)),  # final norm w/ last block
+        ]
+        stem_pattern = r'^cls_token|pos_embed|patch_embed|dist_token'
+        return dict(stem=stem_pattern, blocks=block_groups)
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head, self.head_dist  # (class-token head, distillation-token head)
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        heads = [nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() for _ in range(2)]
+        self.head, self.head_dist = heads
+
+    @torch.jit.ignore
+    def set_distilled_training(self, enable=True):
+        self.distilled_training = enable  # read by forward_head to decide whether to return both outputs
+
+    def forward_features(self, x) -> torch.Tensor:
+        patches = self.patch_embed(x)
+        batch = patches.shape[0]
+        # prepend the class and distillation tokens (in that order) to the patch sequence
+        cls_tokens = self.cls_token.expand(batch, -1, -1)
+        dist_tokens = self.dist_token.expand(batch, -1, -1)
+        tokens = self.pos_drop(torch.cat((cls_tokens, dist_tokens, patches), dim=1) + self.pos_embed)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            tokens = checkpoint_seq(self.blocks, tokens)
+        else:
+            tokens = self.blocks(tokens)
+        return self.norm(tokens)
+
+    def forward_head(self, x, pre_logits: bool = False) -> torch.Tensor:
+        if pre_logits:
+            return (x[:, 0] + x[:, 1]) / 2
+        logits, logits_dist = self.head(x[:, 0]), self.head_dist(x[:, 1])
+        if self.distilled_training and self.training and not torch.jit.is_scripting():
+            # distilled training mode: hand both predictions back separately
+            return logits, logits_dist
+        else:
+            # standard train / finetune / inference: average the two classifier predictions
+            return (logits + logits_dist) / 2
+
+
+def _create_deit(variant, pretrained=False, distilled=False, **kwargs):
+    """Build a DeiT model for `variant` (distilled class when `distilled`) through build_model_with_cfg."""
+    if kwargs.get('features_only'):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    model_cls = VisionTransformer if not distilled else VisionTransformerDistilled
+    # checkpoints are passed through checkpoint_filter_fn with adapt_layer_scale=True before loading
+    filter_fn = partial(checkpoint_filter_fn, adapt_layer_scale=True)
+    return build_model_with_cfg(
+        model_cls, variant, pretrained, pretrained_filter_fn=filter_fn, **kwargs)
+
+
+@register_model
+def deit_tiny_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_deit('deit_tiny_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit_small_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_deit('deit_small_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit_base_patch16_224(pretrained=False, **kwargs):
+    """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_deit('deit_base_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit_base_patch16_384(pretrained=False, **kwargs):
+    """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_deit('deit_base_patch16_384', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit_tiny_distilled_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Built with distilled=True; caller kwargs are merged into the fixed architecture arguments.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_deit(
+        'deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **arch_kwargs)
+
+
+@register_model
+def deit_small_distilled_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Built with distilled=True; caller kwargs are merged into the fixed architecture arguments.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_deit(
+        'deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **arch_kwargs)
+
+
+@register_model
+def deit_base_distilled_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Built with distilled=True; caller kwargs are merged into the fixed architecture arguments.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_deit(
+        'deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **arch_kwargs)
+
+
+@register_model
+def deit_base_distilled_patch16_384(pretrained=False, **kwargs):
+    """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Built with distilled=True; caller kwargs are merged into the fixed architecture arguments.
+    """
+    arch_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_deit(
+        'deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **arch_kwargs)
+
+
+@register_model
+def deit3_small_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-3 small model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_small_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_small_patch16_384(pretrained=False, **kwargs):
+    """ DeiT-3 small model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_small_patch16_384', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_medium_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-3 medium model @ 224x224 (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_medium_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_base_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-3 base model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_base_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_base_patch16_384(pretrained=False, **kwargs):
+    """ DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_base_patch16_384', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_large_patch16_224(pretrained=False, **kwargs):
+    """ DeiT-3 large model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_large_patch16_224', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_large_patch16_384(pretrained=False, **kwargs):
+    """ DeiT-3 large model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
+    ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    Caller kwargs are merged into the fixed architecture arguments and passed to `_create_deit`.
+    """
+    arch_kwargs = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    return _create_deit('deit3_large_patch16_384', pretrained=pretrained, **arch_kwargs)
+
+
+@register_model
+def deit3_huge_patch14_224(pretrained=False, **kwargs):
+    """ DeiT-3 huge model (embed_dim=1280, depth=32, 16 heads, patch 14) @ 224x224, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-1k weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_huge_patch14_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_small_patch16_224_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 small model (embed_dim=384, depth=12, 6 heads, patch 16) @ 224x224, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_small_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_small_patch16_384_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 small model (embed_dim=384, depth=12, 6 heads, patch 16) @ 384x384, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_small_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_medium_patch16_224_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 medium model (embed_dim=512, depth=12, 8 heads, patch 16) @ 224x224 (https://arxiv.org/abs/2204.07118).
+    ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_medium_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_base_patch16_224_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 base model (embed_dim=768, depth=12, 12 heads, patch 16) @ 224x224, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_base_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_base_patch16_384_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 base model (embed_dim=768, depth=12, 12 heads, patch 16) @ 384x384, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_base_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_large_patch16_224_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 large model (embed_dim=1024, depth=24, 16 heads, patch 16) @ 224x224, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_large_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_large_patch16_384_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 large model (embed_dim=1024, depth=24, 16 heads, patch 16) @ 384x384, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_large_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def deit3_huge_patch14_224_in21ft1k(pretrained=False, **kwargs):
+    """ DeiT-3 huge model (embed_dim=1280, depth=32, 16 heads, patch 14) @ 224x224, from the paper
+    https://arxiv.org/abs/2204.07118. ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit.
+    """
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
+    model = _create_deit('deit3_huge_patch14_224_in21ft1k', pretrained=pretrained, **model_kwargs)
+    return model
diff --git a/src/custom_timm/models/densenet.py b/src/custom_timm/models/densenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..357afe0a341389787067efd66207108d15400a84
--- /dev/null
+++ b/src/custom_timm/models/densenet.py
@@ -0,0 +1,400 @@
+"""Pytorch Densenet implementation w/ tweaks
+This file is a copy of https://github.com/pytorch/vision 'densenet.py' (BSD-3-Clause) with
+fixed kwargs passthrough and addition of dynamic global avg/max pool.
+"""
+import re
+from collections import OrderedDict
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from torch.jit.annotations import List
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, MATCH_PREV_GROUP
+from .layers import BatchNormAct2d, create_norm_act_layer, BlurPool2d, create_classifier
+from .registry import register_model
+
+__all__ = ['DenseNet']
+
+
+def _cfg(url=''):  # build the pretrained-config dict shared by all DenseNet variants; only the weights URL varies
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'features.conv0', 'classifier': 'classifier',  # module names used by DenseNet below
+    }
+
+
+default_cfgs = {  # pretrained config per DenseNet variant, keyed by the variant name passed to _create_densenet
+    'densenet121': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenet121_ra-50efcf5c.pth'),
+    'densenet121d': _cfg(url=''),  # empty URL: no pretrained weights listed
+    'densenetblur121d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenetblur121d_ra-100dcfbc.pth'),
+    'densenet169': _cfg(url='https://download.pytorch.org/models/densenet169-b2777c0a.pth'),
+    'densenet201': _cfg(url='https://download.pytorch.org/models/densenet201-c1103571.pth'),
+    'densenet161': _cfg(url='https://download.pytorch.org/models/densenet161-8d451a50.pth'),
+    'densenet264': _cfg(url=''),  # empty URL: no pretrained weights listed
+    'densenet264d_iabn': _cfg(url=''),  # empty URL: no pretrained weights listed
+    'tv_densenet121': _cfg(url='https://download.pytorch.org/models/densenet121-a639ec97.pth'),
+}
+
+
+class DenseLayer(nn.Module):
+    def __init__(
+            self, num_input_features, growth_rate, bn_size, norm_layer=BatchNormAct2d,
+            drop_rate=0., memory_efficient=False):
+        super(DenseLayer, self).__init__()
+        self.add_module('norm1', norm_layer(num_input_features)),
+        self.add_module('conv1', nn.Conv2d(
+            num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
+        self.add_module('norm2', norm_layer(bn_size * growth_rate)),
+        self.add_module('conv2', nn.Conv2d(
+            bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)),
+        self.drop_rate = float(drop_rate)
+        self.memory_efficient = memory_efficient
+
+    def bottleneck_fn(self, xs):
+        # type: (List[torch.Tensor]) -> torch.Tensor
+        concated_features = torch.cat(xs, 1)
+        bottleneck_output = self.conv1(self.norm1(concated_features))  # noqa: T484
+        return bottleneck_output
+
+    # todo: rewrite when torchscript supports any
+    def any_requires_grad(self, x):
+        # type: (List[torch.Tensor]) -> bool
+        for tensor in x:
+            if tensor.requires_grad:
+                return True
+        return False
+
+    @torch.jit.unused  # noqa: T484
+    def call_checkpoint_bottleneck(self, x):
+        # type: (List[torch.Tensor]) -> torch.Tensor
+        def closure(*xs):
+            return self.bottleneck_fn(xs)
+
+        return cp.checkpoint(closure, *x)
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (List[torch.Tensor]) -> (torch.Tensor)
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (torch.Tensor) -> (torch.Tensor)
+        pass
+
+    # torchscript does not yet support *args, so we overload method
+    # allowing it to take either a List[Tensor] or single Tensor
+    def forward(self, x):  # noqa: F811
+        if isinstance(x, torch.Tensor):
+            prev_features = [x]
+        else:
+            prev_features = x
+
+        if self.memory_efficient and self.any_requires_grad(prev_features):
+            if torch.jit.is_scripting():
+                raise Exception("Memory Efficient not supported in JIT")
+            bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
+        else:
+            bottleneck_output = self.bottleneck_fn(prev_features)
+
+        new_features = self.conv2(self.norm2(bottleneck_output))
+        if self.drop_rate > 0:
+            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
+        return new_features
+
+
+class DenseBlock(nn.ModuleDict):  # ordered stack of DenseLayers; each layer receives every earlier feature map
+    _version = 2
+
+    def __init__(
+            self, num_layers, num_input_features, bn_size, growth_rate, norm_layer=BatchNormAct2d,
+            drop_rate=0., memory_efficient=False):
+        super(DenseBlock, self).__init__()
+        for i in range(num_layers):
+            layer = DenseLayer(
+                num_input_features + i * growth_rate,  # input width grows by growth_rate per preceding layer
+                growth_rate=growth_rate,
+                bn_size=bn_size,
+                norm_layer=norm_layer,
+                drop_rate=drop_rate,
+                memory_efficient=memory_efficient,
+            )
+            self.add_module('denselayer%d' % (i + 1), layer)  # layers are named denselayer1..denselayerN
+
+    def forward(self, init_features):
+        features = [init_features]
+        for name, layer in self.items():  # name is unused; iteration follows registration order
+            new_features = layer(features)  # each layer is handed the full list of features so far
+            features.append(new_features)
+        return torch.cat(features, 1)  # input plus every layer's output, concatenated along dim 1
+
+
+class DenseTransition(nn.Sequential):  # norm -> 1x1 conv -> stride-2 pool; DenseNet inserts it between dense blocks
+    def __init__(self, num_input_features, num_output_features, norm_layer=BatchNormAct2d, aa_layer=None):
+        super(DenseTransition, self).__init__()
+        self.add_module('norm', norm_layer(num_input_features))
+        self.add_module('conv', nn.Conv2d(
+            num_input_features, num_output_features, kernel_size=1, stride=1, bias=False))
+        if aa_layer is not None:  # use the supplied anti-aliasing layer for the stride-2 step
+            self.add_module('pool', aa_layer(num_output_features, stride=2))
+        else:  # otherwise plain 2x2 average pooling
+            self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
+
+
+class DenseNet(nn.Module):
+    r"""Densenet-BC model class, based on
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
+
+    Args:
+        growth_rate (int) - how many filters to add each layer (`k` in paper)
+        block_config (list of 4 ints) - how many layers in each pooling block
+        bn_size (int) - multiplicative factor for number of bottle neck layers
+          (i.e. bn_size * k features in the bottleneck layer)
+        drop_rate (float) - dropout rate after each dense layer
+        num_classes (int) - number of classification classes
+        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
+          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
+    """
+
+    def __init__(
+            self, growth_rate=32, block_config=(6, 12, 24, 16), num_classes=1000, in_chans=3, global_pool='avg',
+            bn_size=4, stem_type='', norm_layer=BatchNormAct2d, aa_layer=None, drop_rate=0,
+            memory_efficient=False, aa_stem_only=True):
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        super(DenseNet, self).__init__()
+
+        # Stem
+        deep_stem = 'deep' in stem_type  # 3x3 deep stem
+        num_init_features = growth_rate * 2
+        if aa_layer is None:
+            stem_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        else:
+            stem_pool = nn.Sequential(*[
+                nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
+                aa_layer(channels=num_init_features, stride=2)])
+        if deep_stem:
+            stem_chs_1 = stem_chs_2 = growth_rate
+            if 'tiered' in stem_type:
+                stem_chs_1 = 3 * (growth_rate // 4)
+                stem_chs_2 = num_init_features if 'narrow' in stem_type else 6 * (growth_rate // 4)
+            self.features = nn.Sequential(OrderedDict([
+                ('conv0', nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False)),
+                ('norm0', norm_layer(stem_chs_1)),
+                ('conv1', nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False)),
+                ('norm1', norm_layer(stem_chs_2)),
+                ('conv2', nn.Conv2d(stem_chs_2, num_init_features, 3, stride=1, padding=1, bias=False)),
+                ('norm2', norm_layer(num_init_features)),
+                ('pool0', stem_pool),
+            ]))
+        else:
+            self.features = nn.Sequential(OrderedDict([
+                ('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
+                ('norm0', norm_layer(num_init_features)),
+                ('pool0', stem_pool),
+            ]))
+        self.feature_info = [
+            dict(num_chs=num_init_features, reduction=2, module=f'features.norm{2 if deep_stem else 0}')]
+        current_stride = 4
+
+        # DenseBlocks
+        num_features = num_init_features
+        for i, num_layers in enumerate(block_config):
+            block = DenseBlock(
+                num_layers=num_layers,
+                num_input_features=num_features,
+                bn_size=bn_size,
+                growth_rate=growth_rate,
+                norm_layer=norm_layer,
+                drop_rate=drop_rate,
+                memory_efficient=memory_efficient
+            )
+            module_name = f'denseblock{(i + 1)}'
+            self.features.add_module(module_name, block)
+            num_features = num_features + num_layers * growth_rate
+            transition_aa_layer = None if aa_stem_only else aa_layer
+            if i != len(block_config) - 1:
+                self.feature_info += [
+                    dict(num_chs=num_features, reduction=current_stride, module='features.' + module_name)]
+                current_stride *= 2
+                trans = DenseTransition(
+                    num_input_features=num_features, num_output_features=num_features // 2,
+                    norm_layer=norm_layer, aa_layer=transition_aa_layer)
+                self.features.add_module(f'transition{i + 1}', trans)
+                num_features = num_features // 2
+
+        # Final batch norm
+        self.features.add_module('norm5', norm_layer(num_features))
+
+        self.feature_info += [dict(num_chs=num_features, reduction=current_stride, module='features.norm5')]
+        self.num_features = num_features
+
+        # Linear layer
+        self.global_pool, self.classifier = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+        # Official init from torch repo.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^features\.conv[012]|features\.norm[012]|features\.pool[012]',
+            blocks=r'^features\.(?:denseblock|transition)(\d+)' if coarse else [
+                (r'^features\.denseblock(\d+)\.denselayer(\d+)', None),
+                (r'^features\.transition(\d+)', MATCH_PREV_GROUP)  # FIXME combine with previous denselayer
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.classifier = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        return self.features(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.global_pool(x)
+        # both classifier and block drop?
+        # if self.drop_rate > 0.:
+        #     x = F.dropout(x, p=self.drop_rate, training=self.training)
+        x = self.classifier(x)
+        return x
+
+
+def _filter_torchvision_pretrained(state_dict):  # rename e.g. '...denselayer1.norm.1.weight' -> '...denselayer1.norm1.weight'
+    pattern = re.compile(
+        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
+
+    for key in list(state_dict.keys()):  # snapshot the keys because the dict is mutated inside the loop
+        res = pattern.match(key)
+        if res:
+            new_key = res.group(1) + res.group(2)  # joins the two groups, dropping the '.' between them
+            state_dict[new_key] = state_dict[key]
+            del state_dict[key]
+    return state_dict  # the same dict object, modified in place
+
+
+def _create_densenet(variant, growth_rate, block_config, pretrained, **kwargs):  # shared builder for all variants
+    kwargs['growth_rate'] = growth_rate  # folded into kwargs so they are forwarded with the rest
+    kwargs['block_config'] = block_config
+    return build_model_with_cfg(
+        DenseNet, variant, pretrained,
+        feature_cfg=dict(flatten_sequential=True), pretrained_filter_fn=_filter_torchvision_pretrained,
+        **kwargs)
+
+
+@register_model
+def densenet121(pretrained=False, **kwargs):
+    r"""Densenet-121 model (growth_rate=32, block_config=(6, 12, 24, 16)) from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenetblur121d(pretrained=False, **kwargs):
+    r"""Densenet-121 variant with a deep stem and BlurPool2d as the anti-aliasing layer, based on
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenetblur121d', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, stem_type='deep',
+        aa_layer=BlurPool2d, **kwargs)
+    return model
+
+
+@register_model
+def densenet121d(pretrained=False, **kwargs):
+    r"""Densenet-121 variant with a deep stem (stem_type='deep'), based on
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep',
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenet169(pretrained=False, **kwargs):
+    r"""Densenet-169 model (growth_rate=32, block_config=(6, 12, 32, 32)) from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet169', growth_rate=32, block_config=(6, 12, 32, 32), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenet201(pretrained=False, **kwargs):
+    r"""Densenet-201 model (growth_rate=32, block_config=(6, 12, 48, 32)) from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet201', growth_rate=32, block_config=(6, 12, 48, 32), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenet161(pretrained=False, **kwargs):
+    r"""Densenet-161 model (growth_rate=48, block_config=(6, 12, 36, 24)) from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet161', growth_rate=48, block_config=(6, 12, 36, 24), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenet264(pretrained=False, **kwargs):
+    r"""Densenet-264 model (growth_rate=48, block_config=(6, 12, 64, 48)) from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'densenet264', growth_rate=48, block_config=(6, 12, 64, 48), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def densenet264d_iabn(pretrained=False, **kwargs):
+    r"""Densenet-264 model with deep stem and Inplace-ABN ('iabn' norm-act layer with leaky_relu activation)
+    """
+    def norm_act_fn(num_features, **kwargs):  # this kwargs is the inner function's own, not the outer one
+        return create_norm_act_layer('iabn', num_features, act_layer='leaky_relu', **kwargs)
+    model = _create_densenet(
+        'densenet264d_iabn', growth_rate=48, block_config=(6, 12, 64, 48), stem_type='deep',
+        norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tv_densenet121(pretrained=False, **kwargs):
+    r"""Densenet-121 model (growth_rate=32, block_config=(6, 12, 24, 16)) with original Torchvision weights, from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
+    """
+    model = _create_densenet(
+        'tv_densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs)
+    return model
diff --git a/src/custom_timm/models/dla.py b/src/custom_timm/models/dla.py
new file mode 100644
index 0000000000000000000000000000000000000000..e61146e2449e6599f4e584578e0550493eb7111a
--- /dev/null
+++ b/src/custom_timm/models/dla.py
@@ -0,0 +1,474 @@
+""" Deep Layer Aggregation and DLA w/ Res2Net
+DLA original adapted from Official Pytorch impl at: https://github.com/ucbdrive/dla
+DLA Paper: `Deep Layer Aggregation` - https://arxiv.org/abs/1707.06484
+
+Res2Net additions from: https://github.com/gasvn/Res2Net/
+Res2Net Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169
+"""
+import math
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import create_classifier
+from .registry import register_model
+
+__all__ = ['DLA']
+
+
+def _cfg(url='', **kwargs):  # build the pretrained-config dict shared by all DLA variants
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'base_layer.0', 'classifier': 'fc',
+        **kwargs  # unpacked last, so caller-supplied keys override the defaults above
+    }
+
+
+default_cfgs = {  # pretrained config per DLA variant, keyed by model name; every entry only sets the weights URL
+    'dla34': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla34-2b83ff04.pth'),
+    'dla46_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla46_c-9b68d685.pth'),
+    'dla46x_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla46x_c-6bc5b5c8.pth'),
+    'dla60x_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60x_c-a38e054a.pth'),
+    'dla60': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60-9e91bd4d.pth'),
+    'dla60x': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60x-6818f6bb.pth'),
+    'dla102': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102-21f57b54.pth'),
+    'dla102x': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102x-7ec0aa2a.pth'),
+    'dla102x2': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102x2-ac4239c4.pth'),
+    'dla169': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla169-7c767967.pth'),
+    'dla60_res2net': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net_dla60_4s-d88db7f9.pth'),
+    'dla60_res2next': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next_dla60_4s-d327927b.pth'),
+}
+
+
+class DlaBasic(nn.Module):
+    """DLA basic residual block: two 3x3 conv + BatchNorm layers with an additive shortcut."""
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1, **_):  # extra keyword args are accepted and ignored
+        super(DlaBasic, self).__init__()
+        self.conv1 = nn.Conv2d(
+            inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.stride = stride
+
+    def forward(self, x, shortcut=None, children: Optional[List[torch.Tensor]] = None):  # children is unused here
+        if shortcut is None:  # default to an identity shortcut from the input
+            shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        out += shortcut  # in-place residual add
+        out = self.relu(out)
+
+        return out
+
+
+class DlaBottleneck(nn.Module):
+    """DLA/DLA-X bottleneck block: 1x1 -> (grouped) 3x3 -> 1x1 conv + BatchNorm with an additive shortcut."""
+    expansion = 2  # mid_planes is divided by this factor
+
+    def __init__(self, inplanes, outplanes, stride=1, dilation=1, cardinality=1, base_width=64):
+        super(DlaBottleneck, self).__init__()
+        self.stride = stride
+        mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality)  # width scales with base_width and groups
+        mid_planes = mid_planes // self.expansion
+
+        self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes)
+        self.conv2 = nn.Conv2d(
+            mid_planes, mid_planes, kernel_size=3, stride=stride, padding=dilation,
+            bias=False, dilation=dilation, groups=cardinality)  # the only strided conv in the block
+        self.bn2 = nn.BatchNorm2d(mid_planes)
+        self.conv3 = nn.Conv2d(mid_planes, outplanes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(outplanes)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None):
+        if shortcut is None:  # default to an identity shortcut from the input; children is unused here
+            shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += shortcut  # in-place residual add
+        out = self.relu(out)
+
+        return out
+
+
+class DlaBottle2neck(nn.Module):
+    """ Res2Net/Res2NeXT DLA Bottleneck: the 1x1 conv output is split into `scale` groups of `width` channels.
+    Adapted from https://github.com/gasvn/Res2Net/blob/master/dla.py
+    """
+    expansion = 2  # mid_planes is divided by this factor
+
+    def __init__(self, inplanes, outplanes, stride=1, dilation=1, scale=4, cardinality=8, base_width=4):
+        super(DlaBottle2neck, self).__init__()
+        self.is_first = stride > 1  # strided blocks do not chain the splits and pool the last one
+        self.scale = scale
+        mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality)
+        mid_planes = mid_planes // self.expansion
+        self.width = mid_planes  # channel count of each split
+
+        self.conv1 = nn.Conv2d(inplanes, mid_planes * scale, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes * scale)
+
+        num_scale_convs = max(1, scale - 1)  # one 3x3 conv per split except the last (at least one)
+        convs = []
+        bns = []
+        for _ in range(num_scale_convs):
+            convs.append(nn.Conv2d(
+                mid_planes, mid_planes, kernel_size=3, stride=stride,
+                padding=dilation, dilation=dilation, groups=cardinality, bias=False))
+            bns.append(nn.BatchNorm2d(mid_planes))
+        self.convs = nn.ModuleList(convs)
+        self.bns = nn.ModuleList(bns)
+        self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) if self.is_first else None
+
+        self.conv3 = nn.Conv2d(mid_planes * scale, outplanes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(outplanes)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None):
+        if shortcut is None:  # default to an identity shortcut from the input; children is unused here
+            shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        spx = torch.split(out, self.width, 1)  # chunks of self.width channels along dim 1
+        spo = []
+        sp = spx[0]  # redundant, for torchscript
+        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
+            if i == 0 or self.is_first:
+                sp = spx[i]  # first split, or strided block: use the split on its own
+            else:
+                sp = sp + spx[i]  # otherwise add the previous branch output to this split
+            sp = conv(sp)
+            sp = bn(sp)
+            sp = self.relu(sp)
+            spo.append(sp)
+        if self.scale > 1:  # the last split bypasses the 3x3 convs
+            if self.pool is not None:  # self.is_first == True, None check for torchscript
+                spo.append(self.pool(spx[-1]))
+            else:
+                spo.append(spx[-1])
+        out = torch.cat(spo, 1)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += shortcut  # in-place residual add
+        out = self.relu(out)
+
+        return out
+
+
+class DlaRoot(nn.Module):  # aggregation node: concat inputs -> conv -> BN -> optional shortcut add -> ReLU
+    def __init__(self, in_channels, out_channels, kernel_size, shortcut):
+        super(DlaRoot, self).__init__()
+        self.conv = nn.Conv2d(  # NOTE(review): kernel is fixed at 1 while padding derives from kernel_size - confirm
+            in_channels, out_channels, 1, stride=1, bias=False, padding=(kernel_size - 1) // 2)
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.shortcut = shortcut  # truthy: add the first input back after BN
+
+    def forward(self, x_children: List[torch.Tensor]):
+        x = self.conv(torch.cat(x_children, 1))  # concatenate all inputs along dim 1 before the conv
+        x = self.bn(x)
+        if self.shortcut:
+            x += x_children[0]  # in-place add of the first input
+        x = self.relu(x)
+
+        return x
+
+
+class DlaTree(nn.Module):  # recursive aggregation tree: levels == 1 holds two blocks plus a DlaRoot, deeper levels nest
+    def __init__(
+            self, levels, block, in_channels, out_channels, stride=1, dilation=1, cardinality=1,
+            base_width=64, level_root=False, root_dim=0, root_kernel_size=1, root_shortcut=False):
+        super(DlaTree, self).__init__()
+        if root_dim == 0:  # 0 means "derive it": the root sees the outputs of tree1 and tree2
+            root_dim = 2 * out_channels
+        if level_root:  # the downsampled input is also fed to the root
+            root_dim += in_channels
+        self.downsample = nn.MaxPool2d(stride, stride=stride) if stride > 1 else nn.Identity()
+        self.project = nn.Identity()
+        cargs = dict(dilation=dilation, cardinality=cardinality, base_width=base_width)
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride, **cargs)
+            self.tree2 = block(out_channels, out_channels, 1, **cargs)
+            if in_channels != out_channels:
+                # NOTE the official impl/weights have  project layers in levels > 1 case that are never
+                # used, I've moved the project layer here to avoid wasted params but old checkpoints will
+                # need strict=False while loading.
+                self.project = nn.Sequential(
+                    nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
+                    nn.BatchNorm2d(out_channels))
+            self.root = DlaRoot(root_dim, out_channels, root_kernel_size, root_shortcut)
+        else:
+            cargs.update(dict(root_kernel_size=root_kernel_size, root_shortcut=root_shortcut))
+            self.tree1 = DlaTree(
+                levels - 1, block, in_channels, out_channels, stride, root_dim=0, **cargs)
+            self.tree2 = DlaTree(  # tree2's root additionally receives tree1's output, hence + out_channels
+                levels - 1, block, out_channels, out_channels, root_dim=root_dim + out_channels, **cargs)
+            self.root = None  # only leaf-level trees own a root
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.levels = levels
+
+    def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None):
+        if children is None:
+            children = []
+        bottom = self.downsample(x)
+        shortcut = self.project(bottom)  # the incoming shortcut argument is overwritten here
+        if self.level_root:
+            children.append(bottom)  # note: appends to the list passed in by the caller
+        x1 = self.tree1(x, shortcut)
+        if self.root is not None:  # levels == 1
+            x2 = self.tree2(x1)
+            x = self.root([x2, x1] + children)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, None, children)  # the nested tree's root aggregates the accumulated children
+        return x
+
+
+class DLA(nn.Module):  # stem -> level0/level1 conv stacks -> level2..level5 DlaTree stages -> classifier
+    def __init__(
+            self, levels, channels, output_stride=32, num_classes=1000, in_chans=3, global_pool='avg',
+            cardinality=1, base_width=64, block=DlaBottle2neck, shortcut_root=False, drop_rate=0.0):
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.num_classes = num_classes
+        self.cardinality = cardinality
+        self.base_width = base_width
+        self.drop_rate = drop_rate
+        assert output_stride == 32  # FIXME support dilation
+        # ``levels`` and ``channels`` are each indexed 0..5 below, one entry per level.
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(in_chans, channels[0], kernel_size=7, stride=1, padding=3, bias=False),
+            nn.BatchNorm2d(channels[0]),
+            nn.ReLU(inplace=True))
+        self.level0 = self._make_conv_level(channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(channels[0], channels[1], levels[1], stride=2)
+        cargs = dict(cardinality=cardinality, base_width=base_width, root_shortcut=shortcut_root)
+        self.level2 = DlaTree(levels[2], block, channels[1], channels[2], 2, level_root=False, **cargs)
+        self.level3 = DlaTree(levels[3], block, channels[2], channels[3], 2, level_root=True, **cargs)
+        self.level4 = DlaTree(levels[4], block, channels[3], channels[4], 2, level_root=True, **cargs)
+        self.level5 = DlaTree(levels[5], block, channels[4], channels[5], 2, level_root=True, **cargs)
+        self.feature_info = [
+            dict(num_chs=channels[0], reduction=1, module='level0'),  # rare to have a meaningful stride 1 level
+            dict(num_chs=channels[1], reduction=2, module='level1'),
+            dict(num_chs=channels[2], reduction=4, module='level2'),
+            dict(num_chs=channels[3], reduction=8, module='level3'),
+            dict(num_chs=channels[4], reduction=16, module='level4'),
+            dict(num_chs=channels[5], reduction=32, module='level5'),
+        ]
+
+        self.num_features = channels[-1]
+        self.global_pool, self.fc = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # no flatten when global_pool is falsy
+        # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels))); batch norm weight = 1, bias = 0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):  # `convs` x (conv3x3, BN, ReLU)
+        modules = []
+        for i in range(convs):
+            modules.extend([
+                nn.Conv2d(
+                    inplanes, planes, kernel_size=3, stride=stride if i == 0 else 1,  # only the first conv strides
+                    padding=dilation, bias=False, dilation=dilation),
+                nn.BatchNorm2d(planes),
+                nn.ReLU(inplace=True)])
+            inplanes = planes  # later convs in the level map planes -> planes
+        return nn.Sequential(*modules)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):  # returns regex patterns keyed 'stem' and 'blocks'
+        matcher = dict(
+            stem=r'^base_layer',
+            blocks=r'^level(\d+)' if coarse else [
+                # an unusual arch, this achieves somewhat more granularity without getting super messy
+                (r'^level(\d+)\.tree(\d+)', None),
+                (r'^level(\d+)\.root', (2,)),
+                (r'^level(\d+)', (1,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):  # only enable=False is accepted
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):  # the classifier module created by create_classifier
+        return self.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):  # rebuilds pool, fc and flatten
+        self.num_classes = num_classes
+        self.global_pool, self.fc = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+
+    def forward_features(self, x):  # stem followed by level0..level5 in order
+        x = self.base_layer(x)
+        x = self.level0(x)
+        x = self.level1(x)
+        x = self.level2(x)
+        x = self.level3(x)
+        x = self.level4(x)
+        x = self.level5(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):  # pool -> optional dropout -> fc -> flatten
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        if pre_logits:
+            return x.flatten(1)  # pooled features; the classifier is skipped
+        else:
+            x = self.fc(x)
+            return self.flatten(x)
+
+    def forward(self, x):  # forward_features followed by forward_head
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_dla(variant, pretrained=False, **kwargs):
+    """Build the DLA ``variant`` through ``build_model_with_cfg`` with ``pretrained_strict=False``."""
+    # Feature config: out_indices 1..5.
+    feature_cfg = {'out_indices': (1, 2, 3, 4, 5)}
+    model = build_model_with_cfg(DLA, variant, pretrained, pretrained_strict=False, feature_cfg=feature_cfg, **kwargs)
+    return model
+
+
+@register_model
+def dla60_res2net(pretrained=False, **kwargs):
+    """Create ``dla60_res2net``: DlaBottle2neck blocks with cardinality=1, base_width=28."""
+    return _create_dla('dla60_res2net', pretrained, **dict(
+        levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024),
+        block=DlaBottle2neck, cardinality=1, base_width=28, **kwargs))
+
+
+@register_model
+def dla60_res2next(pretrained=False, **kwargs):
+    """Create ``dla60_res2next``: DlaBottle2neck blocks with cardinality=8, base_width=4."""
+    return _create_dla('dla60_res2next', pretrained, **dict(
+        levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024),
+        block=DlaBottle2neck, cardinality=8, base_width=4, **kwargs))
+
+
+@register_model
+def dla34(pretrained=False, **kwargs):
+    """Create DLA-34 (DlaBasic blocks)."""
+    return _create_dla('dla34', pretrained, **dict(
+        levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 128, 256, 512],
+        block=DlaBasic, **kwargs))
+
+
+@register_model
+def dla46_c(pretrained=False, **kwargs):
+    """Create DLA-46-C (DlaBottleneck blocks)."""
+    return _create_dla('dla46_c', pretrained, **dict(
+        levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 64, 128, 256],
+        block=DlaBottleneck, **kwargs))
+
+
+@register_model
+def dla46x_c(pretrained=False, **kwargs):
+    """Create DLA-X-46-C (DlaBottleneck blocks, cardinality=32, base_width=4)."""
+    return _create_dla('dla46x_c', pretrained, **dict(
+        levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 64, 128, 256],
+        block=DlaBottleneck, cardinality=32, base_width=4, **kwargs))
+
+
+@register_model
+def dla60x_c(pretrained=False, **kwargs):
+    """Create DLA-X-60-C (DlaBottleneck blocks, cardinality=32, base_width=4)."""
+    return _create_dla('dla60x_c', pretrained, **dict(
+        levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 64, 64, 128, 256],
+        block=DlaBottleneck, cardinality=32, base_width=4, **kwargs))
+
+
+@register_model
+def dla60(pretrained=False, **kwargs):
+    """Create DLA-60 (DlaBottleneck blocks)."""
+    return _create_dla('dla60', pretrained, **dict(
+        levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, **kwargs))
+
+
+@register_model
+def dla60x(pretrained=False, **kwargs):
+    """Create DLA-X-60 (DlaBottleneck blocks, cardinality=32, base_width=4)."""
+    return _create_dla('dla60x', pretrained, **dict(
+        levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, cardinality=32, base_width=4, **kwargs))
+
+
+@register_model
+def dla102(pretrained=False, **kwargs):
+    """Create DLA-102 (DlaBottleneck blocks, shortcut_root=True)."""
+    return _create_dla('dla102', pretrained, **dict(
+        levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, shortcut_root=True, **kwargs))
+
+
+@register_model
+def dla102x(pretrained=False, **kwargs):
+    """Create DLA-X-102 (DlaBottleneck blocks, cardinality=32, base_width=4, shortcut_root=True)."""
+    return _create_dla('dla102x', pretrained, **dict(
+        levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, cardinality=32, base_width=4, shortcut_root=True, **kwargs))
+
+
+@register_model
+def dla102x2(pretrained=False, **kwargs):
+    """Create DLA-X-102 64 (DlaBottleneck blocks, cardinality=64, base_width=4, shortcut_root=True)."""
+    return _create_dla('dla102x2', pretrained, **dict(
+        levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, cardinality=64, base_width=4, shortcut_root=True, **kwargs))
+
+
+@register_model
+def dla169(pretrained=False, **kwargs):
+    """Create DLA-169 (DlaBottleneck blocks, shortcut_root=True)."""
+    return _create_dla('dla169', pretrained, **dict(
+        levels=[1, 1, 2, 3, 5, 1], channels=[16, 32, 128, 256, 512, 1024],
+        block=DlaBottleneck, shortcut_root=True, **kwargs))
diff --git a/src/custom_timm/models/dpn.py b/src/custom_timm/models/dpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4231735672b682fffc0577fe16578950ff3b85bb
--- /dev/null
+++ b/src/custom_timm/models/dpn.py
@@ -0,0 +1,339 @@
+""" PyTorch implementation of DualPathNetworks
+Based on original MXNet implementation https://github.com/cypw/DPNs with
+many ideas from another PyTorch implementation https://github.com/oyam/pytorch-DPNs.
+
+This implementation is compatible with the pretrained weights from cypw's MXNet implementation.
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+from collections import OrderedDict
+from functools import partial
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import BatchNormAct2d, ConvNormAct, create_conv2d, create_classifier
+from .registry import register_model
+
+__all__ = ['DPN']
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict for a DPN model; ``kwargs`` override the defaults."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bicubic', mean=IMAGENET_DPN_MEAN, std=IMAGENET_DPN_STD,
+        first_conv='features.conv1_1.conv', classifier='classifier')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # variant name -> pretrained config built by _cfg (weights URL plus overrides)
+    'dpn68': _cfg(
+        url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn68-66bebafa7.pth'),
+    'dpn68b': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dpn68b_ra-a31ca160.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),  # overrides the DPN mean/std set by _cfg
+    'dpn92': _cfg(
+        url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn92_extra-b040e4a9b.pth'),
+    'dpn98': _cfg(
+        url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn98-5b90dec4d.pth'),
+    'dpn131': _cfg(
+        url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn131-71dfe43e0.pth'),
+    'dpn107': _cfg(
+        url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn107_extra-1ac7121e2.pth')
+}
+
+
+class CatBnAct(nn.Module):  # concatenates a tuple input along dim 1, then applies norm_layer
+    def __init__(self, in_chs, norm_layer=BatchNormAct2d):
+        super(CatBnAct, self).__init__()
+        self.bn = norm_layer(in_chs, eps=0.001)
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (Tuple[torch.Tensor, torch.Tensor]) -> (torch.Tensor)
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (torch.Tensor) -> (torch.Tensor)
+        pass
+
+    def forward(self, x):  # accepts a single tensor or a tuple of tensors (the two overloads above)
+        if isinstance(x, tuple):
+            x = torch.cat(x, dim=1)  # join the tuple members before normalising
+        return self.bn(x)
+
+
+class BnActConv2d(nn.Module):  # norm layer (built with eps=1e-3) applied first, then the convolution
+    def __init__(self, in_chs, out_chs, kernel_size, stride, groups=1, norm_layer=BatchNormAct2d):
+        super().__init__()
+        self.bn = norm_layer(in_chs, eps=1e-3)
+        self.conv = create_conv2d(in_chs, out_chs, kernel_size, groups=groups, stride=stride)
+
+    def forward(self, x):  # normalise, then convolve
+        return self.conv(self.bn(x))
+
+
+class DualPathBlock(nn.Module):  # one DPN block; forward returns a (resid, dense) tensor pair
+    def __init__(
+            self, in_chs, num_1x1_a, num_3x3_b, num_1x1_c, inc, groups, block_type='normal', b=False):
+        super(DualPathBlock, self).__init__()
+        self.num_1x1_c = num_1x1_c
+        self.inc = inc
+        self.b = b
+        if block_type == 'proj':  # projection shortcut, stride 1
+            self.key_stride = 1
+            self.has_proj = True
+        elif block_type == 'down':  # projection shortcut, stride 2
+            self.key_stride = 2
+            self.has_proj = True
+        else:  # 'normal': no projection, stride 1
+            assert block_type == 'normal'
+            self.key_stride = 1
+            self.has_proj = False
+
+        self.c1x1_w_s1 = None
+        self.c1x1_w_s2 = None
+        if self.has_proj:
+            # Using different member names here to allow easier parameter key matching for conversion
+            if self.key_stride == 2:
+                self.c1x1_w_s2 = BnActConv2d(
+                    in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=2)
+            else:
+                self.c1x1_w_s1 = BnActConv2d(
+                    in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=1)
+
+        self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1)
+        self.c3x3_b = BnActConv2d(
+            in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3, stride=self.key_stride, groups=groups)
+        if b:  # b variant: norm/act once, then separate 1x1 convs for the two outputs
+            self.c1x1_c = CatBnAct(in_chs=num_3x3_b)
+            self.c1x1_c1 = create_conv2d(num_3x3_b, num_1x1_c, kernel_size=1)
+            self.c1x1_c2 = create_conv2d(num_3x3_b, inc, kernel_size=1)
+        else:  # single conv producing num_1x1_c + inc channels, sliced in forward
+            self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1)
+            self.c1x1_c1 = None
+            self.c1x1_c2 = None
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
+        pass
+
+    def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor]:
+        if isinstance(x, tuple):
+            x_in = torch.cat(x, dim=1)  # tuple input is concatenated along dim 1 for the conv path
+        else:
+            x_in = x
+        if self.c1x1_w_s1 is None and self.c1x1_w_s2 is None:
+            # self.has_proj == False, torchscript requires condition on module == None
+            x_s1 = x[0]  # NOTE(review): this path indexes x, so it presumably expects the tuple form -- confirm
+            x_s2 = x[1]
+        else:
+            # self.has_proj == True
+            if self.c1x1_w_s1 is not None:
+                # self.key_stride = 1
+                x_s = self.c1x1_w_s1(x_in)
+            else:
+                # self.key_stride = 2
+                x_s = self.c1x1_w_s2(x_in)
+            x_s1 = x_s[:, :self.num_1x1_c, :, :]  # first num_1x1_c channels go to the residual sum
+            x_s2 = x_s[:, self.num_1x1_c:, :, :]  # remaining channels go to the dense concat
+        x_in = self.c1x1_a(x_in)
+        x_in = self.c3x3_b(x_in)
+        x_in = self.c1x1_c(x_in)
+        if self.c1x1_c1 is not None:
+            # self.b == True, using None check for torchscript compat
+            out1 = self.c1x1_c1(x_in)
+            out2 = self.c1x1_c2(x_in)
+        else:
+            out1 = x_in[:, :self.num_1x1_c, :, :]
+            out2 = x_in[:, self.num_1x1_c:, :, :]
+        resid = x_s1 + out1  # element-wise sum
+        dense = torch.cat([x_s2, out2], dim=1)  # channel-wise concatenation
+        return resid, dense
+
+
+class DPN(nn.Module):  # conv1 stem -> conv2..conv5 DualPathBlock stages -> norm/act -> pooled classifier
+    def __init__(
+            self, small=False, num_init_features=64, k_r=96, groups=32, global_pool='avg',
+            b=False, k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), output_stride=32,
+            num_classes=1000, in_chans=3, drop_rate=0., fc_act_layer=nn.ELU):
+        super(DPN, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.b = b
+        assert output_stride == 32  # FIXME look into dilation support
+        norm_layer = partial(BatchNormAct2d, eps=.001)
+        fc_norm_layer = partial(BatchNormAct2d, eps=.001, act_layer=fc_act_layer, inplace=False)
+        bw_factor = 1 if small else 4  # multiplies each stage's bw (64, 128, 256, 512)
+        blocks = OrderedDict()  # keys become the submodule names under ``features``
+
+        # conv1
+        blocks['conv1_1'] = ConvNormAct(
+            in_chans, num_init_features, kernel_size=3 if small else 7, stride=2, norm_layer=norm_layer)
+        blocks['conv1_pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.feature_info = [dict(num_chs=num_init_features, reduction=2, module='features.conv1_1')]
+
+        # conv2
+        bw = 64 * bw_factor
+        inc = inc_sec[0]
+        r = (k_r * bw) // (64 * bw_factor)  # passed as both num_1x1_a and num_3x3_b
+        blocks['conv2_1'] = DualPathBlock(num_init_features, r, r, bw, inc, groups, 'proj', b)
+        in_chs = bw + 3 * inc  # first block emits bw resid channels plus 2*inc + inc dense channels
+        for i in range(2, k_sec[0] + 1):
+            blocks['conv2_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b)
+            in_chs += inc  # each 'normal' block appends inc channels to the dense output
+        self.feature_info += [dict(num_chs=in_chs, reduction=4, module=f'features.conv2_{k_sec[0]}')]
+
+        # conv3
+        bw = 128 * bw_factor
+        inc = inc_sec[1]
+        r = (k_r * bw) // (64 * bw_factor)
+        blocks['conv3_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b)
+        in_chs = bw + 3 * inc
+        for i in range(2, k_sec[1] + 1):
+            blocks['conv3_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b)
+            in_chs += inc
+        self.feature_info += [dict(num_chs=in_chs, reduction=8, module=f'features.conv3_{k_sec[1]}')]
+
+        # conv4
+        bw = 256 * bw_factor
+        inc = inc_sec[2]
+        r = (k_r * bw) // (64 * bw_factor)
+        blocks['conv4_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b)
+        in_chs = bw + 3 * inc
+        for i in range(2, k_sec[2] + 1):
+            blocks['conv4_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b)
+            in_chs += inc
+        self.feature_info += [dict(num_chs=in_chs, reduction=16, module=f'features.conv4_{k_sec[2]}')]
+
+        # conv5
+        bw = 512 * bw_factor
+        inc = inc_sec[3]
+        r = (k_r * bw) // (64 * bw_factor)
+        blocks['conv5_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b)
+        in_chs = bw + 3 * inc
+        for i in range(2, k_sec[3] + 1):
+            blocks['conv5_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b)
+            in_chs += inc
+        self.feature_info += [dict(num_chs=in_chs, reduction=32, module=f'features.conv5_{k_sec[3]}')]
+
+        blocks['conv5_bn_ac'] = CatBnAct(in_chs, norm_layer=fc_norm_layer)  # uses fc_act_layer, not in-place
+
+        self.num_features = in_chs
+        self.features = nn.Sequential(blocks)
+
+        # Using 1x1 conv for the FC layer to allow the extra pooling scheme
+        self.global_pool, self.classifier = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # no flatten when global_pool is falsy
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):  # returns regex patterns keyed 'stem' and 'blocks'
+        matcher = dict(
+            stem=r'^features\.conv1',
+            blocks=[
+                (r'^features\.conv(\d+)' if coarse else r'^features\.conv(\d+)_(\d+)', None),
+                (r'^features\.conv5_bn_ac', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):  # only enable=False is accepted
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):  # the classifier module created by create_classifier
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):  # rebuilds pool, classifier and flatten
+        self.num_classes = num_classes
+        self.global_pool, self.classifier = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+
+    def forward_features(self, x):  # runs the whole ``features`` Sequential, including conv5_bn_ac
+        return self.features(x)
+
+    def forward_head(self, x, pre_logits: bool = False):  # pool -> optional dropout -> classifier -> flatten
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        if pre_logits:
+            return x.flatten(1)  # pooled features; the classifier is skipped
+        else:
+            x = self.classifier(x)
+            return self.flatten(x)
+
+    def forward(self, x):  # forward_features followed by forward_head
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_dpn(variant, pretrained=False, **kwargs):
+    """Build the DPN ``variant`` through ``build_model_with_cfg`` with the DPN feature config."""
+    feature_cfg = {'feature_concat': True, 'flatten_sequential': True}
+    model = build_model_with_cfg(DPN, variant, pretrained, feature_cfg=feature_cfg, **kwargs)
+    return model
+
+
+@register_model
+def dpn68(pretrained=False, **kwargs):
+    """Create ``dpn68`` (small=True, 10 initial features, k_r=128, groups=32)."""
+    return _create_dpn('dpn68', pretrained=pretrained, **dict(
+        small=True, num_init_features=10, k_r=128, groups=32,
+        k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs))
+
+
+@register_model
+def dpn68b(pretrained=False, **kwargs):
+    """Create ``dpn68b``: the ``dpn68`` arguments plus ``b=True``."""
+    return _create_dpn('dpn68b', pretrained=pretrained, **dict(
+        small=True, num_init_features=10, k_r=128, groups=32,
+        b=True, k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs))
+
+
+@register_model
+def dpn92(pretrained=False, **kwargs):
+    """Create ``dpn92`` (64 initial features, k_r=96, groups=32)."""
+    return _create_dpn('dpn92', pretrained=pretrained, **dict(
+        num_init_features=64, k_r=96, groups=32,
+        k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), **kwargs))
+
+
+@register_model
+def dpn98(pretrained=False, **kwargs):
+    """Create ``dpn98`` (96 initial features, k_r=160, groups=40)."""
+    return _create_dpn('dpn98', pretrained=pretrained, **dict(
+        num_init_features=96, k_r=160, groups=40,
+        k_sec=(3, 6, 20, 3), inc_sec=(16, 32, 32, 128), **kwargs))
+
+
+@register_model
+def dpn131(pretrained=False, **kwargs):
+    """Create ``dpn131`` (128 initial features, k_r=160, groups=40)."""
+    return _create_dpn('dpn131', pretrained=pretrained, **dict(
+        num_init_features=128, k_r=160, groups=40,
+        k_sec=(4, 8, 28, 3), inc_sec=(16, 32, 32, 128), **kwargs))
+
+
+@register_model
+def dpn107(pretrained=False, **kwargs):
+    """Create ``dpn107`` (128 initial features, k_r=200, groups=50)."""
+    return _create_dpn('dpn107', pretrained=pretrained, **dict(
+        num_init_features=128, k_r=200, groups=50,
+        k_sec=(4, 8, 20, 3), inc_sec=(20, 64, 64, 128), **kwargs))
diff --git a/src/custom_timm/models/edgenext.py b/src/custom_timm/models/edgenext.py
new file mode 100644
index 0000000000000000000000000000000000000000..202c89ba8a9cf6c15087efd441a437e85d0ce515
--- /dev/null
+++ b/src/custom_timm/models/edgenext.py
@@ -0,0 +1,572 @@
+""" EdgeNeXt
+
+Paper: `EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications`
+ - https://arxiv.org/abs/2206.10589
+
+Original code and weights from https://github.com/mmaaz60/EdgeNeXt
+
+Modifications and additions for timm by / Copyright 2022, Ross Wightman
+"""
+import math
+import torch
+from collections import OrderedDict
+from functools import partial
+from typing import Tuple
+
+from torch import nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_module
+from .layers import trunc_normal_tf_, DropPath, LayerNorm2d, Mlp, SelectAdaptivePool2d, create_conv2d
+from .helpers import named_apply, build_model_with_cfg, checkpoint_seq
+from .registry import register_model
+
+
+__all__ = ['EdgeNeXt']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict for an EdgeNeXt model; ``kwargs`` override defaults."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=0.9, interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.0', classifier='head.fc')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = dict(  # variant name -> pretrained config built by _cfg
+    edgenext_xx_small=_cfg(
+        url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_xx_small.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    edgenext_x_small=_cfg(
+        url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_x_small.pth",
+        test_input_size=(3, 288, 288), test_crop_pct=1.0),
+    # Disabled alternative for edgenext_small (non-USI weights); the USI entry below is the active one:
+    #     https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_small.pth
+    edgenext_small=_cfg(  # USI weights
+        url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.1/edgenext_small_usi.pth",
+        crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0,
+    ),
+    # NOTE(review): the disabled edgenext_base alternative that stood here pointed at the same
+    # v1.2 edgenext_base_usi.pth URL as the active entry below, so no second URL is recorded.
+    edgenext_base=_cfg(  # USI weights
+        url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.2/edgenext_base_usi.pth",
+        crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0,
+    ),
+
+    edgenext_small_rw=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/edgenext_small_rw-sw-b00041bb.pth',
+        test_input_size=(3, 320, 320), test_crop_pct=1.0,
+    ),
+)
+
+
+@register_notrace_module  # reason: FX can't symbolically trace torch.arange in forward method
+class PositionalEncodingFourier(nn.Module):
+    """Sine/cosine positional encoding over a 3-d ``shape``, projected to ``dim`` channels by a 1x1 conv."""
+
+    def __init__(self, hidden_dim=32, dim=768, temperature=10000):
+        super().__init__()
+        self.token_projection = nn.Conv2d(hidden_dim * 2, dim, kernel_size=1)
+        self.scale = 2 * math.pi
+        self.temperature = temperature
+        self.hidden_dim = hidden_dim
+        self.dim = dim
+
+    def forward(self, shape: Tuple[int, int, int]):
+        """Return the projected encoding for ``shape``, computed on the projection weight's device."""
+        # Allocate the all-True mask directly on the target device (no CPU float tensor + transfer).
+        inv_mask = torch.ones(shape, dtype=torch.bool, device=self.token_projection.weight.device)
+        y_embed = inv_mask.cumsum(1, dtype=torch.float32)  # 1, 2, ... along dim 1
+        x_embed = inv_mask.cumsum(2, dtype=torch.float32)  # 1, 2, ... along dim 2
+        eps = 1e-6
+        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale  # divide by the last index, times 2*pi
+        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=inv_mask.device)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.hidden_dim)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        # Interleave sin (even slots) and cos (odd slots) along the last axis.
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # channels moved to dim 1 for the conv
+        pos = self.token_projection(pos)
+        return pos
+
+
+class ConvBlock(nn.Module):
+    """Depthwise conv followed by a channels-last norm + MLP, with optional layer scale and drop path."""
+
+    def __init__(
+            self, dim, dim_out=None, kernel_size=7, stride=1, conv_bias=True, expand_ratio=4,
+            ls_init_value=1e-6, norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop_path=0.,
+    ):
+        super().__init__()
+        if not dim_out:
+            dim_out = dim
+        # When the depthwise conv strides or changes width, the residual is taken after it.
+        self.shortcut_after_dw = stride > 1 or dim != dim_out
+
+        self.conv_dw = create_conv2d(
+            dim, dim_out, kernel_size=kernel_size, stride=stride, depthwise=True, bias=conv_bias)
+        self.norm = norm_layer(dim_out)
+        hidden_features = int(expand_ratio * dim_out)
+        self.mlp = Mlp(dim_out, hidden_features, act_layer=act_layer)
+        if ls_init_value > 0:
+            self.gamma = nn.Parameter(ls_init_value * torch.ones(dim_out))
+        else:
+            self.gamma = None
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        """Apply the block to an ``(N, C, H, W)`` tensor and return the residual sum."""
+        conv_out = self.conv_dw(x)
+        residual = conv_out if self.shortcut_after_dw else x
+
+        y = conv_out.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        y = self.mlp(self.norm(y))
+        if self.gamma is not None:
+            y = self.gamma * y
+        y = y.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+
+        return residual + self.drop_path(y)
+
+
+class CrossCovarianceAttn(nn.Module):
+    """Multi-head attention computed across channels (a C x C map per head) rather than across tokens."""
+
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        # One learnable scale per head, multiplied into the attention logits.
+        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        """Map a ``(B, N, C)`` tensor to a tensor of the same shape."""
+        batch, tokens, channels = x.shape
+        # (B, N, 3C) -> (B, N, 3, heads, C // heads) -> (3, B, heads, C // heads, N)
+        qkv = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, -1)
+        query, key, value = qkv.permute(2, 0, 3, 4, 1).unbind(0)
+
+        # NOTE, this is NOT spatial attn, q, k, v are B, num_heads, C, L -->  C x C attn map
+        query = F.normalize(query, dim=-1)
+        key = F.normalize(key, dim=-1)
+        weights = (query @ key.transpose(-2, -1)) * self.temperature
+        weights = self.attn_drop(weights.softmax(dim=-1))
+
+        # (B, heads, C // heads, N) -> (B, N, heads, C // heads) -> (B, N, C)
+        out = (weights @ value).permute(0, 3, 1, 2).reshape(batch, tokens, channels)
+        out = self.proj(out)
+        out = self.proj_drop(out)
+        return out
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the parameter names this module reports as no-weight-decay (only the temperature)."""
+        return {'temperature'}
+
+
+class SplitTransposeBlock(nn.Module):
+    """Chunked depthwise convs, then cross-covariance attention, then an MLP, with a residual.
+
+    Layer scale is optional: when ``ls_init_value <= 0`` neither ``gamma_xca`` nor ``gamma`` is
+    created, and both the attention branch and the MLP branch are used unscaled.
+    """
+
+    def __init__(
+            self, dim, num_scales=1, num_heads=8, expand_ratio=4, use_pos_emb=True, conv_bias=True,
+            qkv_bias=True, ls_init_value=1e-6, norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            act_layer=nn.GELU, drop_path=0., attn_drop=0., proj_drop=0.
+    ):
+        super().__init__()
+        width = max(int(math.ceil(dim / num_scales)), int(math.floor(dim // num_scales)))
+        self.width = width
+        self.num_scales = max(1, num_scales - 1)
+
+        # One depthwise 3x3 conv per processed chunk; forward passes the last chunk through untouched.
+        convs = []
+        for _ in range(self.num_scales):
+            convs.append(create_conv2d(width, width, kernel_size=3, depthwise=True, bias=conv_bias))
+        self.convs = nn.ModuleList(convs)
+
+        self.pos_embd = None
+        if use_pos_emb:
+            self.pos_embd = PositionalEncodingFourier(dim=dim)
+        self.norm_xca = norm_layer(dim)
+        self.gamma_xca = nn.Parameter(ls_init_value * torch.ones(dim)) if ls_init_value > 0 else None
+        self.xca = CrossCovarianceAttn(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=proj_drop)
+
+        self.norm = norm_layer(dim, eps=1e-6)
+        self.mlp = Mlp(dim, int(expand_ratio * dim), act_layer=act_layer)
+        self.gamma = nn.Parameter(ls_init_value * torch.ones(dim)) if ls_init_value > 0 else None
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        """Apply the block to an ``(N, C, H, W)`` tensor; the result is added to the input."""
+        shortcut = x
+
+        # scales code re-written for torchscript as per my res2net fixes -rw
+        # NOTE torch.split(x, self.width, 1) causing issues with ONNX export
+        spx = x.chunk(len(self.convs) + 1, dim=1)
+        spo = []
+        sp = spx[0]
+        for i, conv in enumerate(self.convs):
+            if i > 0:
+                sp = sp + spx[i]
+            sp = conv(sp)
+            spo.append(sp)
+        spo.append(spx[-1])
+        x = torch.cat(spo, 1)
+
+        # XCA
+        B, C, H, W = x.shape
+        x = x.reshape(B, C, H * W).permute(0, 2, 1)
+        if self.pos_embd is not None:
+            pos_encoding = self.pos_embd((B, H, W)).reshape(B, -1, x.shape[1]).permute(0, 2, 1)
+            x = x + pos_encoding
+        xca_out = self.xca(self.norm_xca(x))
+        if self.gamma_xca is not None:  # gamma_xca is None when ls_init_value <= 0; skip the scaling then
+            xca_out = self.gamma_xca * xca_out
+        x = x + self.drop_path(xca_out)
+        x = x.reshape(B, H, W, C)
+
+        # Inverted Bottleneck
+        x = self.norm(x)
+        x = self.mlp(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+
+        x = shortcut + self.drop_path(x)
+        return x
+
+
+class EdgeNeXtStage(nn.Module):
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            stride=2,
+            depth=2,
+            num_global_blocks=1,
+            num_heads=4,
+            scales=2,
+            kernel_size=7,
+            expand_ratio=4,
+            use_pos_emb=False,
+            downsample_block=False,
+            conv_bias=True,
+            ls_init_value=1.0,
+            drop_path_rates=None,
+            norm_layer=LayerNorm2d,
+            norm_layer_cl=partial(nn.LayerNorm, eps=1e-6),
+            act_layer=nn.GELU
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        if downsample_block or stride == 1:
+            self.downsample = nn.Identity()
+        else:
+            self.downsample = nn.Sequential(
+                norm_layer(in_chs),
+                nn.Conv2d(in_chs, out_chs, kernel_size=2, stride=2, bias=conv_bias)
+            )
+            in_chs = out_chs
+
+        stage_blocks = []
+        for i in range(depth):
+            if i < depth - num_global_blocks:
+                stage_blocks.append(
+                    ConvBlock(
+                        dim=in_chs,
+                        dim_out=out_chs,
+                        stride=stride if downsample_block and i == 0 else 1,
+                        conv_bias=conv_bias,
+                        kernel_size=kernel_size,
+                        expand_ratio=expand_ratio,
+                        ls_init_value=ls_init_value,
+                        drop_path=drop_path_rates[i],
+                        norm_layer=norm_layer_cl,
+                        act_layer=act_layer,
+                    )
+                )
+            else:
+                stage_blocks.append(
+                    SplitTransposeBlock(
+                        dim=in_chs,
+                        num_scales=scales,
+                        num_heads=num_heads,
+                        expand_ratio=expand_ratio,
+                        use_pos_emb=use_pos_emb,
+                        conv_bias=conv_bias,
+                        ls_init_value=ls_init_value,
+                        drop_path=drop_path_rates[i],
+                        norm_layer=norm_layer_cl,
+                        act_layer=act_layer,
+                    )
+                )
+            in_chs = out_chs
+        self.blocks = nn.Sequential(*stage_blocks)
+
+    def forward(self, x):
+        x = self.downsample(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+
+
+class EdgeNeXt(nn.Module):
+    def __init__(
+            self,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            dims=(24, 48, 88, 168),
+            depths=(3, 3, 9, 3),
+            global_block_counts=(0, 1, 1, 1),
+            kernel_sizes=(3, 5, 7, 9),
+            heads=(8, 8, 8, 8),
+            d2_scales=(2, 2, 3, 4),
+            use_pos_emb=(False, True, False, False),
+            ls_init_value=1e-6,
+            head_init_scale=1.,
+            expand_ratio=4,
+            downsample_block=False,
+            conv_bias=True,
+            stem_type='patch',
+            head_norm_first=False,
+            act_layer=nn.GELU,
+            drop_path_rate=0.,
+            drop_rate=0.,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.drop_rate = drop_rate
+        norm_layer = partial(LayerNorm2d, eps=1e-6)
+        norm_layer_cl = partial(nn.LayerNorm, eps=1e-6)
+        self.feature_info = []
+
+        assert stem_type in ('patch', 'overlap')
+        if stem_type == 'patch':
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4, bias=conv_bias),
+                norm_layer(dims[0]),
+            )
+        else:
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, dims[0], kernel_size=9, stride=4, padding=9 // 2, bias=conv_bias),
+                norm_layer(dims[0]),
+            )
+
+        curr_stride = 4
+        stages = []
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        in_chs = dims[0]
+        for i in range(4):
+            stride = 2 if curr_stride == 2 or i > 0 else 1
+            # FIXME support dilation / output_stride
+            curr_stride *= stride
+            stages.append(EdgeNeXtStage(
+                in_chs=in_chs,
+                out_chs=dims[i],
+                stride=stride,
+                depth=depths[i],
+                num_global_blocks=global_block_counts[i],
+                num_heads=heads[i],
+                drop_path_rates=dp_rates[i],
+                scales=d2_scales[i],
+                expand_ratio=expand_ratio,
+                kernel_size=kernel_sizes[i],
+                use_pos_emb=use_pos_emb[i],
+                ls_init_value=ls_init_value,
+                downsample_block=downsample_block,
+                conv_bias=conv_bias,
+                norm_layer=norm_layer,
+                norm_layer_cl=norm_layer_cl,
+                act_layer=act_layer,
+            ))
+            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
+            in_chs = dims[i]
+            self.feature_info += [dict(num_chs=in_chs, reduction=curr_stride, module=f'stages.{i}')]
+
+        self.stages = nn.Sequential(*stages)
+
+        self.num_features = dims[-1]
+        self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity()
+        self.head = nn.Sequential(OrderedDict([
+                ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
+                ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)),
+                ('flatten', nn.Flatten(1) if global_pool else nn.Identity()),
+                ('drop', nn.Dropout(self.drop_rate)),
+                ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())]))
+
+        named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+)\.downsample', (0,)),  # blocks
+                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
+                (r'^norm_pre', (99999,))
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes=0, global_pool=None):
+        if global_pool is not None:
+            self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+            self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+        self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.norm_pre(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :(
+        x = self.head.global_pool(x)
+        x = self.head.norm(x)
+        x = self.head.flatten(x)
+        x = self.head.drop(x)
+        return x if pre_logits else self.head.fc(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_weights(module, name=None, head_init_scale=1.0):
+    if isinstance(module, nn.Conv2d):
+        trunc_normal_tf_(module.weight, std=.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Linear):
+        trunc_normal_tf_(module.weight, std=.02)
+        nn.init.zeros_(module.bias)
+        if name and 'head.' in name:
+            module.weight.data.mul_(head_init_scale)
+            module.bias.data.mul_(head_init_scale)
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ Remap FB checkpoints -> timm """
+    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
+        return state_dict  # non-FB checkpoint
+
+    # models were released as train checkpoints... :/
+    if 'model_ema' in state_dict:
+        state_dict = state_dict['model_ema']
+    elif 'model' in state_dict:
+        state_dict = state_dict['model']
+    elif 'state_dict' in state_dict:
+        state_dict = state_dict['state_dict']
+
+    out_dict = {}
+    import re
+    for k, v in state_dict.items():
+        k = k.replace('downsample_layers.0.', 'stem.')
+        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
+        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
+        k = k.replace('dwconv', 'conv_dw')
+        k = k.replace('pwconv', 'mlp.fc')
+        k = k.replace('head.', 'head.fc.')
+        if k.startswith('norm.'):
+            k = k.replace('norm', 'head.norm')
+        if v.ndim == 2 and 'head' not in k:
+            model_shape = model.state_dict()[k].shape
+            v = v.reshape(model_shape)
+        out_dict[k] = v
+    return out_dict
+
+
+def _create_edgenext(variant, pretrained=False, **kwargs):
+    model = build_model_with_cfg(
+        EdgeNeXt, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
+        **kwargs)
+    return model
+
+
+@register_model
+def edgenext_xx_small(pretrained=False, **kwargs):
+    # 1.33M & 260.58M @ 256 resolution
+    # 71.23% Top-1 accuracy
+    # No AA, Color Jitter=0.4, No Mixup & Cutmix, DropPath=0.0, BS=4096, lr=0.006, multi-scale-sampler
+    # Jetson FPS=51.66 versus 47.67 for MobileViT_XXS
+    # For A100: FPS @ BS=1: 212.13 & @ BS=256: 7042.06 versus FPS @ BS=1: 96.68 & @ BS=256: 4624.71 for MobileViT_XXS
+    model_kwargs = dict(depths=(2, 2, 6, 2), dims=(24, 48, 88, 168), heads=(4, 4, 4, 4), **kwargs)
+    return _create_edgenext('edgenext_xx_small', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def edgenext_x_small(pretrained=False, **kwargs):
+    # 2.34M & 538.0M @ 256 resolution
+    # 75.00% Top-1 accuracy
+    # No AA, No Mixup & Cutmix, DropPath=0.0, BS=4096, lr=0.006, multi-scale-sampler
+    # Jetson FPS=31.61 versus 28.49 for MobileViT_XS
+    # For A100: FPS @ BS=1: 179.55 & @ BS=256: 4404.95 versus FPS @ BS=1: 94.55 & @ BS=256: 2361.53 for MobileViT_XS
+    model_kwargs = dict(depths=(3, 3, 9, 3), dims=(32, 64, 100, 192), heads=(4, 4, 4, 4), **kwargs)
+    return _create_edgenext('edgenext_x_small', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def edgenext_small(pretrained=False, **kwargs):
+    # 5.59M & 1260.59M @ 256 resolution
+    # 79.43% Top-1 accuracy
+    # AA=True, No Mixup & Cutmix, DropPath=0.1, BS=4096, lr=0.006, multi-scale-sampler
+    # Jetson FPS=20.47 versus 18.86 for MobileViT_S
+    # For A100: FPS @ BS=1: 172.33 & @ BS=256: 3010.25 versus FPS @ BS=1: 93.84 & @ BS=256: 1785.92 for MobileViT_S
+    model_kwargs = dict(depths=(3, 3, 9, 3), dims=(48, 96, 160, 304),  **kwargs)
+    return _create_edgenext('edgenext_small', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def edgenext_base(pretrained=False, **kwargs):
+    # 18.51M & 3840.93M @ 256 resolution
+    # 82.5% (normal) 83.7% (USI) Top-1 accuracy
+    # AA=True, Mixup & Cutmix, DropPath=0.1, BS=4096, lr=0.006, multi-scale-sampler
+    # Jetson FPS=xx.xx versus xx.xx for MobileViT_S
+    # For A100: FPS @ BS=1: xxx.xx & @ BS=256: xxxx.xx
+    model_kwargs = dict(depths=[3, 3, 9, 3], dims=[80, 160, 288, 584], **kwargs)
+    return _create_edgenext('edgenext_base', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def edgenext_small_rw(pretrained=False, **kwargs):
+    model_kwargs = dict(
+        depths=(3, 3, 9, 3), dims=(48, 96, 192, 384),
+        downsample_block=True, conv_bias=False, stem_type='overlap', **kwargs)
+    return _create_edgenext('edgenext_small_rw', pretrained=pretrained, **model_kwargs)
+
diff --git a/src/custom_timm/models/efficientformer.py b/src/custom_timm/models/efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5c71ab8766892c10d8063df055883484dc04c4
--- /dev/null
+++ b/src/custom_timm/models/efficientformer.py
@@ -0,0 +1,551 @@
+""" EfficientFormer
+
+@article{li2022efficientformer,
+  title={EfficientFormer: Vision Transformers at MobileNet Speed},
+  author={Li, Yanyu and Yuan, Geng and Wen, Yang and Hu, Eric and Evangelidis, Georgios and Tulyakov,
+   Sergey and Wang, Yanzhi and Ren, Jian},
+  journal={arXiv preprint arXiv:2206.01191},
+  year={2022}
+}
+
+Based on Apache 2.0 licensed code at https://github.com/snap-research/EfficientFormer, Copyright (c) 2022 Snap Inc.
+
+Modifications and timm support by / Copyright 2022, Ross Wightman
+"""
+from typing import Dict
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import DropPath, trunc_normal_, to_2tuple, Mlp
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, 'fixed_input_size': True,
+        'crop_pct': .95, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.conv1', 'classifier': ('head', 'head_dist'),
+        **kwargs
+    }
+
+
+default_cfgs = dict(
+    efficientformer_l1=_cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l1_1000d_224-5b08fab0.pth",
+    ),
+    efficientformer_l3=_cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l3_300d_224-6816624f.pth",
+    ),
+    efficientformer_l7=_cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l7_300d_224-e957ab75.pth",
+    ),
+)
+
+EfficientFormer_width = {
+    'l1': (48, 96, 224, 448),
+    'l3': (64, 128, 320, 512),
+    'l7': (96, 192, 384, 768),
+}
+
+EfficientFormer_depth = {
+    'l1': (3, 2, 6, 4),
+    'l3': (4, 4, 12, 6),
+    'l7': (6, 6, 18, 8),
+}
+
+
+class Attention(torch.nn.Module):
+    attention_bias_cache: Dict[str, torch.Tensor]
+
+    def __init__(
+            self,
+            dim=384,
+            key_dim=32,
+            num_heads=8,
+            attn_ratio=4,
+            resolution=7
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = key_dim ** -0.5
+        self.key_dim = key_dim
+        self.key_attn_dim = key_dim * num_heads
+        self.val_dim = int(attn_ratio * key_dim)
+        self.val_attn_dim = self.val_dim * num_heads
+        self.attn_ratio = attn_ratio
+
+        self.qkv = nn.Linear(dim, self.key_attn_dim * 2 + self.val_attn_dim)
+        self.proj = nn.Linear(self.val_attn_dim, dim)
+
+        resolution = to_2tuple(resolution)
+        pos = torch.stack(torch.meshgrid(torch.arange(resolution[0]), torch.arange(resolution[1]))).flatten(1)
+        rel_pos = (pos[..., :, None] - pos[..., None, :]).abs()
+        rel_pos = (rel_pos[0] * resolution[1]) + rel_pos[1]
+        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, resolution[0] * resolution[1]))
+        self.register_buffer('attention_bias_idxs', torch.LongTensor(rel_pos))
+        self.attention_bias_cache = {}  # per-device attention_biases cache (data-parallel compat)
+
+    @torch.no_grad()
+    def train(self, mode=True):
+        super().train(mode)
+        if mode and self.attention_bias_cache:
+            self.attention_bias_cache = {}  # clear ab cache
+
+    def get_attention_biases(self, device: torch.device) -> torch.Tensor:
+        if self.training:
+            return self.attention_biases[:, self.attention_bias_idxs]
+        else:
+            device_key = str(device)
+            if device_key not in self.attention_bias_cache:
+                self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs]
+            return self.attention_bias_cache[device_key]
+
+    def forward(self, x):  # x (B,N,C)
+        B, N, C = x.shape
+        qkv = self.qkv(x)
+        qkv = qkv.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
+        q, k, v = qkv.split([self.key_dim, self.key_dim, self.val_dim], dim=3)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn + self.get_attention_biases(x.device)
+
+        attn = attn.softmax(dim=-1)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, self.val_attn_dim)
+        x = self.proj(x)
+        return x
+
+
+class Stem4(nn.Sequential):
+    def __init__(self, in_chs, out_chs, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+        super().__init__()
+        self.stride = 4
+
+        self.add_module('conv1', nn.Conv2d(in_chs, out_chs // 2, kernel_size=3, stride=2, padding=1))
+        self.add_module('norm1', norm_layer(out_chs // 2))
+        self.add_module('act1', act_layer())
+        self.add_module('conv2', nn.Conv2d(out_chs // 2, out_chs, kernel_size=3, stride=2, padding=1))
+        self.add_module('norm2', norm_layer(out_chs))
+        self.add_module('act2', act_layer())
+
+
+class Downsample(nn.Module):
+    """
+    Downsampling via strided conv w/ norm
+    Input: tensor in shape [B, C, H, W]
+    Output: tensor in shape [B, C, H/stride, W/stride]
+    """
+
+    def __init__(self, in_chs, out_chs, kernel_size=3, stride=2, padding=None, norm_layer=nn.BatchNorm2d):
+        super().__init__()
+        if padding is None:
+            padding = kernel_size // 2
+        self.conv = nn.Conv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.norm = norm_layer(out_chs)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+
+class Flat(nn.Module):
+
+    def __init__(self, ):
+        super().__init__()
+
+    def forward(self, x):
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+class Pooling(nn.Module):
+    """
+    Implementation of pooling for PoolFormer
+    --pool_size: pooling size
+    """
+
+    def __init__(self, pool_size=3):
+        super().__init__()
+        self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
+
+    def forward(self, x):
+        return self.pool(x) - x
+
+
+class ConvMlpWithNorm(nn.Module):
+    """
+    Implementation of MLP with 1*1 convolutions.
+    Input: tensor with shape [B, C, H, W]
+    """
+
+    def __init__(
+            self,
+            in_features,
+            hidden_features=None,
+            out_features=None,
+            act_layer=nn.GELU,
+            norm_layer=nn.BatchNorm2d,
+            drop=0.
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
+        self.norm1 = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
+        self.act = act_layer()
+        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
+        self.norm2 = norm_layer(out_features) if norm_layer is not None else nn.Identity()
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.norm1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.norm2(x)
+        x = self.drop(x)
+        return x
+
+
+class LayerScale(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class MetaBlock1d(nn.Module):
+
+    def __init__(
+            self,
+            dim,
+            mlp_ratio=4.,
+            act_layer=nn.GELU,
+            norm_layer=nn.LayerNorm,
+            drop=0.,
+            drop_path=0.,
+            layer_scale_init_value=1e-5
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.token_mixer = Attention(dim)
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.ls1 = LayerScale(dim, layer_scale_init_value)
+        self.ls2 = LayerScale(dim, layer_scale_init_value)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.ls1(self.token_mixer(self.norm1(x))))
+        x = x + self.drop_path(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class LayerScale2d(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        gamma = self.gamma.view(1, -1, 1, 1)
+        return x.mul_(gamma) if self.inplace else x * gamma
+
+
+class MetaBlock2d(nn.Module):
+
+    def __init__(
+            self,
+            dim,
+            pool_size=3,
+            mlp_ratio=4.,
+            act_layer=nn.GELU,
+            norm_layer=nn.BatchNorm2d,
+            drop=0.,
+            drop_path=0.,
+            layer_scale_init_value=1e-5
+    ):
+        super().__init__()
+        self.token_mixer = Pooling(pool_size=pool_size)
+        self.mlp = ConvMlpWithNorm(
+            dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, norm_layer=norm_layer, drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.ls1 = LayerScale2d(dim, layer_scale_init_value)
+        self.ls2 = LayerScale2d(dim, layer_scale_init_value)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.ls1(self.token_mixer(x)))
+        x = x + self.drop_path(self.ls2(self.mlp(x)))
+        return x
+
+
+class EfficientFormerStage(nn.Module):
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            depth,
+            downsample=True,
+            num_vit=1,
+            pool_size=3,
+            mlp_ratio=4.,
+            act_layer=nn.GELU,
+            norm_layer=nn.BatchNorm2d,
+            norm_layer_cl=nn.LayerNorm,
+            drop=.0,
+            drop_path=0.,
+            layer_scale_init_value=1e-5,
+):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        if downsample:
+            self.downsample = Downsample(in_chs=dim, out_chs=dim_out, norm_layer=norm_layer)
+            dim = dim_out
+        else:
+            assert dim == dim_out
+            self.downsample = nn.Identity()
+
+        blocks = []
+        if num_vit and num_vit >= depth:
+            blocks.append(Flat())
+
+        for block_idx in range(depth):
+            remain_idx = depth - block_idx - 1
+            if num_vit and num_vit > remain_idx:
+                blocks.append(
+                    MetaBlock1d(
+                        dim,
+                        mlp_ratio=mlp_ratio,
+                        act_layer=act_layer,
+                        norm_layer=norm_layer_cl,
+                        drop=drop,
+                        drop_path=drop_path[block_idx],
+                        layer_scale_init_value=layer_scale_init_value,
+                    ))
+            else:
+                blocks.append(
+                    MetaBlock2d(
+                        dim,
+                        pool_size=pool_size,
+                        mlp_ratio=mlp_ratio,
+                        act_layer=act_layer,
+                        norm_layer=norm_layer,
+                        drop=drop,
+                        drop_path=drop_path[block_idx],
+                        layer_scale_init_value=layer_scale_init_value,
+                    ))
+                if num_vit and num_vit == remain_idx:
+                    blocks.append(Flat())
+
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        x = self.downsample(x)
+        x = self.blocks(x)
+        return x
+
+
+class EfficientFormer(nn.Module):
+
+    def __init__(
+            self,
+            depths,
+            embed_dims=None,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            downsamples=None,
+            num_vit=0,
+            mlp_ratios=4,
+            pool_size=3,
+            layer_scale_init_value=1e-5,
+            act_layer=nn.GELU,
+            norm_layer=nn.BatchNorm2d,
+            norm_layer_cl=nn.LayerNorm,
+            drop_rate=0.,
+            drop_path_rate=0.,
+            **kwargs
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+
+        self.stem = Stem4(in_chans, embed_dims[0], norm_layer=norm_layer)
+        prev_dim = embed_dims[0]
+
+        # stochastic depth decay rule
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        downsamples = downsamples or (False,) + (True,) * (len(depths) - 1)
+        stages = []
+        for i in range(len(depths)):
+            stage = EfficientFormerStage(
+                prev_dim,
+                embed_dims[i],
+                depths[i],
+                downsample=downsamples[i],
+                num_vit=num_vit if i == 3 else 0,
+                pool_size=pool_size,
+                mlp_ratio=mlp_ratios,
+                act_layer=act_layer,
+                norm_layer_cl=norm_layer_cl,
+                norm_layer=norm_layer,
+                drop=drop_rate,
+                drop_path=dpr[i],
+                layer_scale_init_value=layer_scale_init_value,
+            )
+            prev_dim = embed_dims[i]
+            stages.append(stage)
+
+        self.stages = nn.Sequential(*stages)
+
+        # Classifier head
+        self.num_features = embed_dims[-1]
+        self.norm = norm_layer_cl(self.num_features)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        # assuming model is always distilled (valid for current checkpoints, will split def if that changes)
+        self.head_dist = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
+        self.distilled_training = False  # must set this True to train w/ distillation token
+
+        self.apply(self._init_weights)
+
+    # init for classification
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {k for k, _ in self.named_parameters() if 'attention_biases' in k}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^stem',  # stem and embed
+            blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head, self.head_dist
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        self.head_dist = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    @torch.jit.ignore
+    def set_distilled_training(self, enable=True):
+        self.distilled_training = enable
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool == 'avg':
+            x = x.mean(dim=1)
+        if pre_logits:
+            return x
+        x, x_dist = self.head(x), self.head_dist(x)
+        if self.distilled_training and self.training and not torch.jit.is_scripting():
+            # only return separate classification predictions when training in distilled mode
+            return x, x_dist
+        else:
+            # during standard train/finetune, inference average the classifier predictions
+            return (x + x_dist) / 2
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _checkpoint_filter_fn(state_dict, model):
+    """ Remap original checkpoints -> timm """
+    if 'stem.0.weight' in state_dict:
+        return state_dict  # non-original checkpoint, no remapping needed
+
+    out_dict = {}
+    import re
+    stage_idx = 0
+    for k, v in state_dict.items():
+        if k.startswith('patch_embed'):
+            k = k.replace('patch_embed.0', 'stem.conv1')
+            k = k.replace('patch_embed.1', 'stem.norm1')
+            k = k.replace('patch_embed.3', 'stem.conv2')
+            k = k.replace('patch_embed.4', 'stem.norm2')
+
+        if re.match(r'network\.(\d+)\.proj\.weight', k):
+            stage_idx += 1
+        k = re.sub(r'network.(\d+).(\d+)', f'stages.{stage_idx}.blocks.\\2', k)
+        k = re.sub(r'network.(\d+).proj', f'stages.{stage_idx}.downsample.conv', k)
+        k = re.sub(r'network.(\d+).norm', f'stages.{stage_idx}.downsample.norm', k)
+
+        k = re.sub(r'layer_scale_([0-9])', r'ls\1.gamma', k)
+        k = k.replace('dist_head', 'head_dist')
+        out_dict[k] = v
+    return out_dict
+
+
+def _create_efficientformer(variant, pretrained=False, **kwargs):
+    model = build_model_with_cfg(
+        EfficientFormer, variant, pretrained,
+        pretrained_filter_fn=_checkpoint_filter_fn,
+        **kwargs)
+    return model
+
+
+@register_model
+def efficientformer_l1(pretrained=False, **kwargs):
+    model_kwargs = dict(
+        depths=EfficientFormer_depth['l1'],
+        embed_dims=EfficientFormer_width['l1'],
+        num_vit=1,
+        **kwargs)
+    return _create_efficientformer('efficientformer_l1', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def efficientformer_l3(pretrained=False, **kwargs):
+    model_kwargs = dict(
+        depths=EfficientFormer_depth['l3'],
+        embed_dims=EfficientFormer_width['l3'],
+        num_vit=4,
+        **kwargs)
+    return _create_efficientformer('efficientformer_l3', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def efficientformer_l7(pretrained=False, **kwargs):
+    model_kwargs = dict(
+        depths=EfficientFormer_depth['l7'],
+        embed_dims=EfficientFormer_width['l7'],
+        num_vit=8,
+        **kwargs)
+    return _create_efficientformer('efficientformer_l7', pretrained=pretrained, **model_kwargs)
+
diff --git a/src/custom_timm/models/efficientnet.py b/src/custom_timm/models/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..90dd9eb85dfc6ab473e48df9aacdccd73bdff22b
--- /dev/null
+++ b/src/custom_timm/models/efficientnet.py
@@ -0,0 +1,2403 @@
+""" The EfficientNet Family in PyTorch
+
+An implementation of EfficientNet that covers a variety of related models with efficient architectures:
+
+* EfficientNet-V2
+  - `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+
+* EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent weight ports)
+  - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946
+  - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971
+  - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665
+  - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252
+
+* MixNet (Small, Medium, and Large)
+  - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595
+
+* MNasNet B1, A1 (SE), Small
+  - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626
+
+* FBNet-C
+  - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443
+
+* Single-Path NAS Pixel1
+  - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877
+
+* TinyNet
+    - Model Rubik's Cube: Twisting Resolution, Depth and Width for TinyNets - https://arxiv.org/abs/2010.14819
+    - Definitions & weights borrowed from https://github.com/huawei-noah/CV-Backbones/tree/master/tinynet_pytorch
+
+* And likely more...
+
+The majority of the above models (EfficientNet*, MixNet, MnasNet) and original weights were made available
+by Mingxing Tan, Quoc Le, and other members of their Google Brain team. Thanks for consistently releasing
+the models and weights open source!
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+from functools import partial
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from .efficientnet_blocks import SqueezeExcite
+from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights,\
+    round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
+from .features import FeatureInfo, FeatureHooks
+from .helpers import build_model_with_cfg, pretrained_cfg_for_features, checkpoint_seq
+from .layers import create_conv2d, create_classifier, get_norm_act_layer, EvoNorm2dS0, GroupNormAct
+from .registry import register_model
+
+__all__ = ['EfficientNet', 'EfficientNetFeatures']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv_stem', 'classifier': 'classifier',
+        **kwargs
+    }
+
+
+default_cfgs = {  # variant name -> cfg dict from _cfg(); url='' (or omitted) means no weights URL is listed
+    'mnasnet_050': _cfg(url=''),
+    'mnasnet_075': _cfg(url=''),
+    'mnasnet_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth'),
+    'mnasnet_140': _cfg(url=''),
+
+    'semnasnet_050': _cfg(url=''),
+    'semnasnet_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/semnasnet_075-18710866.pth'),
+    'semnasnet_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth'),
+    'semnasnet_140': _cfg(url=''),
+    'mnasnet_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_small_lamb-aff75073.pth'),
+
+    'mobilenetv2_035': _cfg(
+        url=''),
+    'mobilenetv2_050': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_050-3d30d450.pth',
+        interpolation='bicubic',
+    ),
+    'mobilenetv2_075': _cfg(
+        url=''),
+    'mobilenetv2_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth'),
+    'mobilenetv2_110d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth'),
+    'mobilenetv2_120d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth'),
+    'mobilenetv2_140': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth'),
+
+    'fbnetc_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth',
+        interpolation='bilinear'),
+    'spnasnet_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth',
+        interpolation='bilinear'),
+
+    # NOTE experimenting with alternate attention
+    'efficientnet_b0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'),
+    'efficientnet_b1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth',
+        test_input_size=(3, 256, 256), crop_pct=1.0),
+    'efficientnet_b2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), test_input_size=(3, 288, 288), crop_pct=1.0),
+    'efficientnet_b3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth',
+        input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0),
+    'efficientnet_b4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth',
+        input_size=(3, 320, 320), pool_size=(10, 10), test_input_size=(3, 384, 384), crop_pct=1.0),
+    'efficientnet_b5': _cfg(
+        url='', input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
+    'efficientnet_b6': _cfg(
+        url='', input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
+    'efficientnet_b7': _cfg(
+        url='', input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'efficientnet_b8': _cfg(
+        url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
+    'efficientnet_l2': _cfg(
+        url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961),
+
+    # FIXME experimental
+    'efficientnet_b0_gn': _cfg(
+        url=''),
+    'efficientnet_b0_g8_gn': _cfg(
+        url=''),
+    'efficientnet_b0_g16_evos': _cfg(
+        url=''),
+    'efficientnet_b3_gn': _cfg(
+        url='',
+        input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0),
+    'efficientnet_b3_g8_gn': _cfg(
+        url='',
+        input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0),
+
+    'efficientnet_es': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'),
+    'efficientnet_em': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_em_ra2-66250f76.pth',
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'efficientnet_el': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_el-3b455510.pth',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+
+    'efficientnet_es_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_pruned75-1b7248cf.pth'),
+    'efficientnet_el_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_el_pruned70-ef2a2ccf.pth',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+
+    'efficientnet_cc_b0_4e': _cfg(url=''),
+    'efficientnet_cc_b0_8e': _cfg(url=''),
+    'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+
+    'efficientnet_lite0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth'),
+    'efficientnet_lite1': _cfg(
+        url='',
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'efficientnet_lite2': _cfg(
+        url='',
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+    'efficientnet_lite3': _cfg(
+        url='',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'efficientnet_lite4': _cfg(
+        url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
+
+    'efficientnet_b1_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb1_pruned-bea43a3a.pth',
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'efficientnet_b2_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb2_pruned-08c1b27c.pth',
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'efficientnet_b3_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb3_pruned-59ecf72d.pth',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+
+    'efficientnetv2_rw_t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth',
+        input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0),
+    'gc_efficientnetv2_rw_t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gc_efficientnetv2_rw_t_agc-927a0bde.pth',
+        input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0),
+    'efficientnetv2_rw_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_v2s_ra2_288-a6477665.pth',
+        input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0),
+    'efficientnetv2_rw_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_rw_m_agc-3d90cb1e.pth',
+        input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0),
+
+    'efficientnetv2_s': _cfg(
+        url='',
+        input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0),
+    'efficientnetv2_m': _cfg(
+        url='',
+        input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0),
+    'efficientnetv2_l': _cfg(
+        url='',
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'efficientnetv2_xl': _cfg(
+        url='',
+        input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0),
+
+    'tf_efficientnet_b0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth',
+        input_size=(3, 224, 224)),
+    'tf_efficientnet_b1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth',
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'tf_efficientnet_b2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth',
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+    'tf_efficientnet_b3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'tf_efficientnet_b4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth',
+        input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
+    'tf_efficientnet_b5': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth',
+        input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
+    'tf_efficientnet_b6': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth',
+        input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
+    'tf_efficientnet_b7': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth',
+        input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'tf_efficientnet_b8': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth',
+        input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
+
+    'tf_efficientnet_b0_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, input_size=(3, 224, 224)),
+    'tf_efficientnet_b1_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'tf_efficientnet_b2_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+    'tf_efficientnet_b3_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'tf_efficientnet_b4_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
+    'tf_efficientnet_b5_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
+    'tf_efficientnet_b6_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
+    'tf_efficientnet_b7_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'tf_efficientnet_b8_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
+
+    'tf_efficientnet_b0_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth',
+        input_size=(3, 224, 224)),
+    'tf_efficientnet_b1_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth',
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'tf_efficientnet_b2_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth',
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+    'tf_efficientnet_b3_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth',
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'tf_efficientnet_b4_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth',
+        input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
+    'tf_efficientnet_b5_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth',
+        input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
+    'tf_efficientnet_b6_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth',
+        input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
+    'tf_efficientnet_b7_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth',
+        input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'tf_efficientnet_l2_ns_475': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth',
+        input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936),
+    'tf_efficientnet_l2_ns': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth',
+        input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96),
+
+    'tf_efficientnet_es': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 224, 224), ),
+    'tf_efficientnet_em': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'tf_efficientnet_el': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+
+    'tf_efficientnet_cc_b0_4e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_efficientnet_cc_b0_8e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_efficientnet_cc_b1_8e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+
+    'tf_efficientnet_lite0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        interpolation='bicubic',  # should be bilinear but bicubic better match for TF bilinear at low res
+    ),
+    'tf_efficientnet_lite1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882,
+        interpolation='bicubic',  # should be bilinear but bicubic better match for TF bilinear at low res
+    ),
+    'tf_efficientnet_lite2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890,
+        interpolation='bicubic',  # should be bilinear but bicubic better match for TF bilinear at low res
+    ),
+    'tf_efficientnet_lite3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, interpolation='bilinear'),
+    'tf_efficientnet_lite4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.920, interpolation='bilinear'),
+
+    'tf_efficientnetv2_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s-eb54923e.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0),
+    'tf_efficientnetv2_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'tf_efficientnetv2_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+
+    'tf_efficientnetv2_s_in21ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0),
+    'tf_efficientnetv2_m_in21ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'tf_efficientnetv2_l_in21ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'tf_efficientnetv2_xl_in21ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0),
+
+    'tf_efficientnetv2_s_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
+        input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0),
+    'tf_efficientnetv2_m_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'tf_efficientnetv2_l_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
+        input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0),
+    'tf_efficientnetv2_xl_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
+        input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0),
+
+    'tf_efficientnetv2_b0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth',
+        input_size=(3, 192, 192), test_input_size=(3, 224, 224), pool_size=(6, 6)),
+    'tf_efficientnetv2_b1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b1-be6e41b0.pth',
+        input_size=(3, 192, 192), test_input_size=(3, 240, 240), pool_size=(6, 6), crop_pct=0.882),
+    'tf_efficientnetv2_b2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b2-847de54e.pth',
+        input_size=(3, 208, 208), test_input_size=(3, 260, 260), pool_size=(7, 7), crop_pct=0.890),
+    'tf_efficientnetv2_b3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b3-57773f13.pth',
+        input_size=(3, 240, 240), test_input_size=(3, 300, 300), pool_size=(8, 8), crop_pct=0.904),
+
+    'mixnet_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth'),
+    'mixnet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth'),
+    'mixnet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth'),
+    'mixnet_xl': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth'),
+    'mixnet_xxl': _cfg(),
+
+    'tf_mixnet_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth'),
+    'tf_mixnet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth'),
+    'tf_mixnet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth'),
+
+    "tinynet_a": _cfg(
+        input_size=(3, 192, 192), pool_size=(6, 6),  # int(224 * 0.86)
+        url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_a.pth'),
+    "tinynet_b": _cfg(
+        input_size=(3, 188, 188), pool_size=(6, 6),  # int(224 * 0.84)
+        url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_b.pth'),
+    "tinynet_c": _cfg(
+        input_size=(3, 184, 184), pool_size=(6, 6),  # int(224 * 0.825)
+        url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_c.pth'),
+    "tinynet_d": _cfg(
+        input_size=(3, 152, 152), pool_size=(5, 5),  # int(224 * 0.68)
+        url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_d.pth'),
+    "tinynet_e": _cfg(
+        input_size=(3, 106, 106), pool_size=(4, 4),  # int(224 * 0.475)
+        url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_e.pth'),
+}
+
+
+class EfficientNet(nn.Module):
+    """ EfficientNet
+
+    A flexible and performant PyTorch implementation of efficient network architectures, including:
+      * EfficientNet-V2 Small, Medium, Large, XL & B0-B3
+      * EfficientNet B0-B8, L2
+      * EfficientNet-EdgeTPU
+      * EfficientNet-CondConv
+      * MixNet S, M, L, XL
+      * MnasNet A1, B1, and small
+      * MobileNet-V2
+      * FBNet C
+      * Single-Path NAS Pixel1
+      * TinyNet
+    """
+
+    def __init__(
+            self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32, fix_stem=False,
+            output_stride=32, pad_type='', round_chs_fn=round_channels, act_layer=None, norm_layer=None,
+            se_layer=None, drop_rate=0., drop_path_rate=0., global_pool='avg'):
+        super(EfficientNet, self).__init__()
+        act_layer = act_layer or nn.ReLU
+        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        se_layer = se_layer or SqueezeExcite
+        self.num_classes = num_classes
+        self.num_features = num_features
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        # Stem
+        if not fix_stem:
+            stem_size = round_chs_fn(stem_size)
+        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn,
+            act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate)
+        self.blocks = nn.Sequential(*builder(stem_size, block_args))
+        self.feature_info = builder.features
+        head_chs = builder.in_chs
+
+        # Head + Pooling
+        self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type)
+        self.bn2 = norm_act_layer(self.num_features, inplace=True)
+        self.global_pool, self.classifier = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+        efficientnet_init_weights(self)
+
+    def as_sequential(self):
+        layers = [self.conv_stem, self.bn1]
+        layers.extend(self.blocks)
+        layers.extend([self.conv_head, self.bn2, self.global_pool])
+        layers.extend([nn.Dropout(self.drop_rate), self.classifier])
+        return nn.Sequential(*layers)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):  # returns regex patterns that assign parameter names to stem / block groups
+        return dict(
+            stem=r'^conv_stem|bn1',
+            blocks=[
+                (r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)', None),  # coarse: stage index only; else (stage, block)
+                (r'conv_head|bn2', (99999,))  # NOTE(review): large fixed ordinal presumably orders the head after all blocks -- confirm
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable  # read by forward_features to choose checkpoint_seq over a plain blocks call
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.classifier  # the module that create_classifier returned as the classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.classifier = create_classifier(  # replaces both the pooling layer and the classifier
+            self.num_features, self.num_classes, pool_type=global_pool)  # self.num_features is reused unchanged
+
+    def forward_features(self, x):  # stem -> blocks -> head conv + norm/act; no pooling or classifier applied here
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():  # checkpointing path is bypassed under TorchScript
+            x = checkpoint_seq(self.blocks, x, flatten=True)
+        else:
+            x = self.blocks(x)
+        x = self.conv_head(x)
+        x = self.bn2(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:  # the dropout call is skipped entirely when the rate is zero
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x if pre_logits else self.classifier(x)  # pre_logits=True returns the pooled features without classifying
+
+    def forward(self, x):  # full pipeline: forward_features, then forward_head with its default pre_logits=False
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+class EfficientNetFeatures(nn.Module):
+    """ EfficientNet Feature Extractor
+
+    A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation
+    and object detection models. `forward` returns a list of feature tensors rather than class logits.
+    """
+
+    def __init__(
+            self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3,
+            stem_size=32, fix_stem=False, output_stride=32, pad_type='', round_chs_fn=round_channels,
+            act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.):
+        super(EfficientNetFeatures, self).__init__()
+        act_layer = act_layer or nn.ReLU
+        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        se_layer = se_layer or SqueezeExcite
+        self.drop_rate = drop_rate  # stored only; nothing in this class reads it
+
+        # Stem
+        if not fix_stem:  # fix_stem=True keeps stem_size exactly as given instead of rounding it
+            stem_size = round_chs_fn(stem_size)
+        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn,
+            act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate,
+            feature_location=feature_location)
+        self.blocks = nn.Sequential(*builder(stem_size, block_args))
+        self.feature_info = FeatureInfo(builder.features, out_indices)
+        self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices}  # stage -> index
+
+        efficientnet_init_weights(self)
+
+        # Register feature extraction hooks with FeatureHooks helper
+        self.feature_hooks = None  # None selects the direct (non-hook) collection path in forward
+        if feature_location != 'bottleneck':  # every other location collects features through hooks
+            hooks = self.feature_info.get_dicts(keys=('module', 'hook_type'))
+            self.feature_hooks = FeatureHooks(hooks, self.named_modules())
+
+    def forward(self, x) -> List[torch.Tensor]:
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        if self.feature_hooks is None:
+            features = []
+            if 0 in self._stage_out_idx:
+                features.append(x)  # add stem out
+            for i, b in enumerate(self.blocks):
+                x = b(x)
+                if i + 1 in self._stage_out_idx:  # block i maps to stage i + 1 because stage 0 is the stem output
+                    features.append(x)
+            return features
+        else:
+            self.blocks(x)  # return value discarded; presumably the registered hooks capture the features -- confirm
+            out = self.feature_hooks.get_output(x.device)
+            return list(out.values())
+
+
+def _create_effnet(variant, pretrained=False, **kwargs):  # builds EfficientNet, or EfficientNetFeatures if features_only
+    features_only = False
+    model_cls = EfficientNet
+    kwargs_filter = None
+    if kwargs.pop('features_only', False):  # popped so the flag itself is never forwarded to the model constructor
+        features_only = True
+        kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'global_pool')  # head options; not in EfficientNetFeatures.__init__
+        model_cls = EfficientNetFeatures
+    model = build_model_with_cfg(
+        model_cls, variant, pretrained,
+        pretrained_strict=not features_only,  # NOTE(review): presumably relaxed because the feature model has no head -- confirm
+        kwargs_filter=kwargs_filter,
+        **kwargs)
+    if features_only:
+        model.default_cfg = pretrained_cfg_for_features(model.default_cfg)
+    return model
+
+
+def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a mnasnet-a1 model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+    Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_e1_c16_noskip'],
+        # stage 1, 112x112 in
+        ['ir_r2_k3_s2_e6_c24'],
+        # stage 2, 56x56 in
+        ['ir_r3_k5_s2_e3_c40_se0.25'],
+        # stage 3, 28x28 in
+        ['ir_r4_k3_s2_e6_c80'],
+        # stage 4, 14x14 in
+        ['ir_r2_k3_s1_e6_c112_se0.25'],
+        # stage 5, 14x14 in
+        ['ir_r3_k5_s2_e6_c160_se0.25'],
+        # stage 6, 7x7 in
+        ['ir_r1_k3_s1_e6_c320'],
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=32,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a mnasnet-b1 model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+    Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_c16_noskip'],
+        # stage 1, 112x112 in
+        ['ir_r3_k3_s2_e3_c24'],
+        # stage 2, 56x56 in
+        ['ir_r3_k5_s2_e3_c40'],
+        # stage 3, 28x28 in
+        ['ir_r3_k5_s2_e6_c80'],
+        # stage 4, 14x14 in
+        ['ir_r2_k3_s1_e6_c96'],
+        # stage 5, 14x14 in
+        ['ir_r4_k5_s2_e6_c192'],
+        # stage 6, 7x7 in
+        ['ir_r1_k3_s1_e6_c320_noskip']
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=32,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a mnasnet-small model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+    Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_c8'],
+        ['ir_r1_k3_s2_e3_c16'],
+        ['ir_r2_k3_s2_e6_c16'],
+        ['ir_r4_k5_s2_e6_c32_se0.25'],
+        ['ir_r3_k3_s1_e6_c32_se0.25'],
+        ['ir_r3_k5_s2_e6_c88_se0.25'],
+        ['ir_r1_k3_s1_e6_c144']
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=8,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mobilenet_v2(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs):
+    """ Generate MobileNet-V2 network
+    Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
+    Paper: https://arxiv.org/abs/1801.04381
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_c16'],
+        ['ir_r2_k3_s2_e6_c24'],
+        ['ir_r3_k3_s2_e6_c32'],
+        ['ir_r4_k3_s2_e6_c64'],
+        ['ir_r3_k3_s1_e6_c96'],
+        ['ir_r3_k3_s2_e6_c160'],
+        ['ir_r1_k3_s1_e6_c320'],
+    ]
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head),
+        num_features=1280 if fix_stem_head else max(1280, round_chs_fn(1280)),  # scaled head width never drops below 1280
+        stem_size=32,
+        fix_stem=fix_stem_head,  # one flag controls both the stem width and the head width
+        round_chs_fn=round_chs_fn,
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'relu6'),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """ FBNet-C
+
+        Paper: https://arxiv.org/abs/1812.03443
+        Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py
+
+        NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper,
+        it was used to confirm some building block details
+    """
+    arch_def = [
+        ['ir_r1_k3_s1_e1_c16'],
+        ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'],
+        ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'],
+        ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'],
+        ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'],
+        ['ir_r4_k5_s2_e6_c184'],
+        ['ir_r1_k3_s1_e6_c352'],
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=16,
+        num_features=1984,  # paper suggests this, but is not 100% clear
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates the Single-Path NAS model from search targeted for Pixel1 phone.
+
+    Paper: https://arxiv.org/abs/1904.02877
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_c16_noskip'],
+        # stage 1, 112x112 in
+        ['ir_r3_k3_s2_e3_c24'],
+        # stage 2, 56x56 in
+        ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'],
+        # stage 3, 28x28 in
+        ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'],
+        # stage 4, 14x14 in
+        ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'],
+        # stage 5, 14x14 in
+        ['ir_r4_k5_s2_e6_c192'],
+        # stage 6, 7x7 in
+        ['ir_r1_k3_s1_e6_c320_noskip']
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=32,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnet(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, channel_divisor=8,
+        group_size=None, pretrained=False, **kwargs):
+    """Creates an EfficientNet model.
+
+    Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+    Paper: https://arxiv.org/abs/1905.11946
+
+    EfficientNet params
+    name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+    'efficientnet-b0': (1.0, 1.0, 224, 0.2),
+    'efficientnet-b1': (1.0, 1.1, 240, 0.2),
+    'efficientnet-b2': (1.1, 1.2, 260, 0.3),
+    'efficientnet-b3': (1.2, 1.4, 300, 0.3),
+    'efficientnet-b4': (1.4, 1.8, 380, 0.4),
+    'efficientnet-b5': (1.6, 2.2, 456, 0.4),
+    'efficientnet-b6': (1.8, 2.6, 528, 0.5),
+    'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+    'efficientnet-b8': (2.2, 3.6, 672, 0.5),
+    'efficientnet-l2': (4.3, 5.3, 800, 0.5),
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer
+      depth_multiplier: multiplier to number of repeats per stage
+      channel_divisor: divisor handed to round_channels when rounding channel counts
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_e1_c16_se0.25'],
+        ['ir_r2_k3_s2_e6_c24_se0.25'],
+        ['ir_r2_k5_s2_e6_c40_se0.25'],
+        ['ir_r3_k3_s2_e6_c80_se0.25'],
+        ['ir_r3_k5_s1_e6_c112_se0.25'],
+        ['ir_r4_k5_s2_e6_c192_se0.25'],
+        ['ir_r1_k3_s1_e6_c320_se0.25'],
+    ]
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier, divisor=channel_divisor)
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
+        num_features=round_chs_fn(1280),  # head width scales with channel_multiplier
+        stem_size=32,
+        round_chs_fn=round_chs_fn,
+        act_layer=resolve_act_layer(kwargs, 'swish'),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnet_edge(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-EdgeTPU model
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu
+    """
+
+    arch_def = [
+        # NOTE `fc` is present to override a mismatch between stem channels and in chs not
+        # present in other models
+        ['er_r1_k3_s1_e4_c24_fc24_noskip'],
+        ['er_r2_k3_s2_e8_c32'],
+        ['er_r4_k3_s2_e8_c48'],
+        ['ir_r5_k5_s2_e8_c96'],
+        ['ir_r4_k5_s1_e8_c144'],
+        ['ir_r2_k5_s2_e8_c192'],
+    ]
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
+        num_features=round_chs_fn(1280),  # head width scales with channel_multiplier
+        stem_size=32,
+        round_chs_fn=round_chs_fn,
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'relu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnet_condconv(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs):
+    """Creates an EfficientNet-CondConv model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_e1_c16_se0.25'],
+        ['ir_r2_k3_s2_e6_c24_se0.25'],
+        ['ir_r2_k5_s2_e6_c40_se0.25'],
+        ['ir_r3_k3_s2_e6_c80_se0.25'],
+        ['ir_r3_k5_s1_e6_c112_se0.25_cc4'],
+        ['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
+        ['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
+    ]
+    # NOTE unlike official impl, this one uses `cc<x>` option where x is the base number of experts for each stage and
+    # the experts_multiplier increases that on a per-model basis as with depth/channel multipliers
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
+        num_features=round_chs_fn(1280),  # head width scales with channel_multiplier
+        stem_size=32,
+        round_chs_fn=round_chs_fn,
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'swish'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates an EfficientNet-Lite model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite
+    Paper: https://arxiv.org/abs/1905.11946
+
+    EfficientNet params
+    name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+      'efficientnet-lite0': (1.0, 1.0, 224, 0.2),
+      'efficientnet-lite1': (1.0, 1.1, 240, 0.2),
+      'efficientnet-lite2': (1.1, 1.2, 260, 0.3),
+      'efficientnet-lite3': (1.2, 1.4, 280, 0.3),
+      'efficientnet-lite4': (1.4, 1.8, 300, 0.3),
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer
+      depth_multiplier: multiplier to number of repeats per stage
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_e1_c16'],
+        ['ir_r2_k3_s2_e6_c24'],
+        ['ir_r2_k5_s2_e6_c40'],
+        ['ir_r3_k3_s2_e6_c80'],
+        ['ir_r3_k5_s1_e6_c112'],
+        ['ir_r4_k5_s2_e6_c192'],
+        ['ir_r1_k3_s1_e6_c320'],
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True),
+        num_features=1280,  # head width is a constant here, not scaled by channel_multiplier
+        stem_size=32,
+        fix_stem=True,  # stem width is used as given instead of being passed through round_chs_fn
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        act_layer=resolve_act_layer(kwargs, 'relu6'),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnetv2_base(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-V2 base model
+
+    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
+    Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+    """
+    arch_def = [
+        ['cn_r1_k3_s1_e1_c16_skip'],
+        ['er_r2_k3_s2_e4_c32'],
+        ['er_r2_k3_s2_e4_c48'],
+        ['ir_r3_k3_s2_e4_c96_se0.25'],
+        ['ir_r5_k3_s1_e6_c112_se0.25'],
+        ['ir_r8_k3_s2_e6_c192_se0.25'],
+    ]
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.)  # NOTE(review): round_limit=0. differs from the other generators -- confirm intent
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=round_chs_fn(1280),  # head width scales with channel_multiplier
+        stem_size=32,
+        round_chs_fn=round_chs_fn,
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'silu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnetv2_s(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, rw=False, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-V2 Small model
+
+    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
+    Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+
+    NOTE: `rw` flag sets up 'small' variant to behave like my initial v2 small model,
+        before the ref impl was released.
+    """
+    arch_def = [
+        ['cn_r2_k3_s1_e1_c24_skip'],
+        ['er_r4_k3_s2_e4_c48'],
+        ['er_r4_k3_s2_e4_c64'],
+        ['ir_r6_k3_s2_e4_c128_se0.25'],
+        ['ir_r9_k3_s1_e6_c160_se0.25'],
+        ['ir_r15_k3_s2_e6_c256_se0.25'],
+    ]
+    num_features = 1280
+    if rw:
+        # my original variant, based on paper figure differs from the official release
+        arch_def[0] = ['er_r2_k3_s1_e1_c24']  # replaces the first stage definition
+        arch_def[-1] = ['ir_r15_k3_s2_e6_c272_se0.25']  # last stage output widened from 256 to 272 channels
+        num_features = 1792
+
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
+        num_features=round_chs_fn(num_features),
+        stem_size=24,
+        round_chs_fn=round_chs_fn,
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'silu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnetv2_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-V2 Medium model
+
+    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
+    Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+    """
+
+    arch_def = [
+        ['cn_r3_k3_s1_e1_c24_skip'],
+        ['er_r5_k3_s2_e4_c48'],
+        ['er_r5_k3_s2_e4_c80'],
+        ['ir_r7_k3_s2_e4_c160_se0.25'],
+        ['ir_r14_k3_s1_e6_c176_se0.25'],
+        ['ir_r18_k3_s2_e6_c304_se0.25'],
+        ['ir_r5_k3_s1_e6_c512_se0.25'],
+    ]
+
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=1280,  # constant head width; unlike the V2 base/small generators it is not scaled
+        stem_size=24,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'silu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnetv2_l(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-V2 Large model
+
+    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
+    Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+    """
+
+    arch_def = [
+        ['cn_r4_k3_s1_e1_c32_skip'],
+        ['er_r7_k3_s2_e4_c64'],
+        ['er_r7_k3_s2_e4_c96'],
+        ['ir_r10_k3_s2_e4_c192_se0.25'],
+        ['ir_r19_k3_s1_e6_c224_se0.25'],
+        ['ir_r25_k3_s2_e6_c384_se0.25'],
+        ['ir_r7_k3_s1_e6_c640_se0.25'],
+    ]
+
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=1280,  # constant head width; not passed through round_channels
+        stem_size=32,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'silu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_efficientnetv2_xl(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-V2 Xtra-Large model
+
+    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
+    Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298
+    """
+
+    arch_def = [
+        ['cn_r4_k3_s1_e1_c32_skip'],
+        ['er_r8_k3_s2_e4_c64'],
+        ['er_r8_k3_s2_e4_c96'],
+        ['ir_r16_k3_s2_e4_c192_se0.25'],
+        ['ir_r24_k3_s1_e6_c256_se0.25'],
+        ['ir_r32_k3_s2_e6_c512_se0.25'],
+        ['ir_r8_k3_s1_e6_c640_se0.25'],
+    ]
+
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=1280,  # constant head width; not passed through round_channels
+        stem_size=32,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'silu'),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MixNet Small model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+    Paper: https://arxiv.org/abs/1907.09595
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_e1_c16'],  # relu
+        # stage 1, 112x112 in
+        ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'],  # relu
+        # stage 2, 56x56 in
+        ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'],  # swish
+        # stage 3, 28x28 in
+        ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'],  # swish
+        # stage 4, 14x14 in
+        ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'],  # swish
+        # stage 5, 14x14 in
+        ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'],  # swish
+        # 7x7
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),  # no depth_multiplier parameter: repeats are taken as written above
+        num_features=1536,
+        stem_size=16,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MixNet Medium-Large model.
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+    Paper: https://arxiv.org/abs/1907.09595
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_e1_c24'],  # relu
+        # stage 1, 112x112 in
+        ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'],  # relu
+        # stage 2, 56x56 in
+        ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'],  # swish
+        # stage 3, 28x28 in
+        ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'],  # swish
+        # stage 4, 14x14 in
+        ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'],  # swish
+        # stage 5, 14x14 in
+        ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'],  # swish
+        # 7x7
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),  # NOTE(review): presumably rounds scaled repeats -- confirm
+        num_features=1536,
+        stem_size=24,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_tinynet(
+    variant, model_width=1.0, depth_multiplier=1.0, pretrained=False, **kwargs
+):
+    """Creates a TinyNet model; `model_width` is used as the channel multiplier.
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_e1_c16_se0.25'], ['ir_r2_k3_s2_e6_c24_se0.25'],
+        ['ir_r2_k5_s2_e6_c40_se0.25'], ['ir_r3_k3_s2_e6_c80_se0.25'],
+        ['ir_r3_k5_s1_e6_c112_se0.25'], ['ir_r4_k5_s2_e6_c192_se0.25'],
+        ['ir_r1_k3_s1_e6_c320_se0.25'],
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
+        num_features=max(1280, round_channels(1280, model_width, 8, None)),  # never below 1280; NOTE(review): confirm positional arg order
+        stem_size=32,
+        fix_stem=True,  # stem width is used as given instead of being passed through round_chs_fn
+        round_chs_fn=partial(round_channels, multiplier=model_width),
+        act_layer=resolve_act_layer(kwargs, 'swish'),
+        norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        **kwargs,  # norm_layer was popped above, so it is not passed twice
+    )
+    model = _create_effnet(variant, pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def mnasnet_050(pretrained=False, **kwargs):
+    """ MNASNet B1, channel multiplier of 0.5. """
+    model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs)  # 0.5 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mnasnet_075(pretrained=False, **kwargs):
+    """ MNASNet B1, channel multiplier of 0.75. """
+    model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs)  # 0.75 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mnasnet_100(pretrained=False, **kwargs):
+    """ MNASNet B1, channel multiplier of 1.0. """
+    model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs)  # 1.0 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mnasnet_b1(pretrained=False, **kwargs):
+    """ MNASNet B1, channel multiplier of 1.0 (alias that delegates to mnasnet_100). """
+    return mnasnet_100(pretrained, **kwargs)  # the model is therefore built under the 'mnasnet_100' variant name
+
+
+@register_model
+def mnasnet_140(pretrained=False, **kwargs):
+    """ MNASNet B1, channel multiplier of 1.4. """
+    model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs)  # 1.4 binds to channel_multiplier
+    return model
+
+
+@register_model
+def semnasnet_050(pretrained=False, **kwargs):
+    """ MNASNet A1 (w/ SE), channel multiplier of 0.5. """
+    model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs)  # 0.5 binds to channel_multiplier
+    return model
+
+
+@register_model
+def semnasnet_075(pretrained=False, **kwargs):
+    """ MNASNet A1 (w/ SE), channel multiplier of 0.75. """
+    model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs)  # 0.75 binds to channel_multiplier
+    return model
+
+
+@register_model
+def semnasnet_100(pretrained=False, **kwargs):
+    """ MNASNet A1 (w/ SE), channel multiplier of 1.0. """
+    model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs)  # 1.0 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mnasnet_a1(pretrained=False, **kwargs):
+    """ MNASNet A1 (w/ SE), channel multiplier of 1.0 (alias that delegates to semnasnet_100). """
+    return semnasnet_100(pretrained, **kwargs)  # the model is therefore built under the 'semnasnet_100' variant name
+
+
+@register_model
+def semnasnet_140(pretrained=False, **kwargs):
+    """ MNASNet A1 (w/ SE), channel multiplier of 1.4. """
+    model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs)  # 1.4 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mnasnet_small(pretrained=False, **kwargs):
+    """ MNASNet Small, channel multiplier of 1.0. """
+    model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs)  # 1.0 binds to channel_multiplier
+    return model
+
+
+@register_model
+def mobilenetv2_035(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 0.35 channel multiplier """
+    model = _gen_mobilenet_v2('mobilenetv2_035', 0.35, pretrained=pretrained, **kwargs)  # depth_multiplier stays at its default
+    return model
+
+
+@register_model
+def mobilenetv2_050(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 0.5 channel multiplier """
+    model = _gen_mobilenet_v2('mobilenetv2_050', 0.5, pretrained=pretrained, **kwargs)  # depth_multiplier stays at its default
+    return model
+
+
+@register_model
+def mobilenetv2_075(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 0.75 channel multiplier """
+    model = _gen_mobilenet_v2('mobilenetv2_075', 0.75, pretrained=pretrained, **kwargs)  # depth_multiplier stays at its default
+    return model
+
+
+@register_model
+def mobilenetv2_100(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 1.0 channel multiplier """
+    model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs)  # depth_multiplier stays at its default
+    return model
+
+
+@register_model
+def mobilenetv2_140(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 1.4 channel multiplier """
+    model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs)  # depth_multiplier stays at its default
+    return model
+
+
+@register_model
+def mobilenetv2_110d(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers and fix_stem_head=True """
+    model = _gen_mobilenet_v2(
+        'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv2_120d(pretrained=False, **kwargs):
+    """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers and fix_stem_head=True """
+    model = _gen_mobilenet_v2(
+        'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def fbnetc_100(pretrained=False, **kwargs):
+    """ FBNet-C """
+    if pretrained:
+        # pretrained model trained with non-default BN epsilon
+        kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def spnasnet_100(pretrained=False, **kwargs):
+    """ Single-Path NAS Pixel1"""
+    model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b0(pretrained=False, **kwargs):
+    """ EfficientNet-B0 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b1(pretrained=False, **kwargs):
+    """ EfficientNet-B1 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b2(pretrained=False, **kwargs):
+    """ EfficientNet-B2 """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b2a(pretrained=False, **kwargs):
+    """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop-pct """
+    # WARN this model def is deprecated, different train/test res + test crop handled by default_cfg now
+    return efficientnet_b2(pretrained=pretrained, **kwargs)
+
+
+@register_model
+def efficientnet_b3(pretrained=False, **kwargs):
+    """ EfficientNet-B3 """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b3a(pretrained=False, **kwargs):
+    """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """
+    # WARN this model def is deprecated, different train/test res + test crop handled by default_cfg now
+    return efficientnet_b3(pretrained=pretrained, **kwargs)
+
+
+@register_model
+def efficientnet_b4(pretrained=False, **kwargs):
+    """ EfficientNet-B4 """
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b5(pretrained=False, **kwargs):
+    """ EfficientNet-B5 """
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b6(pretrained=False, **kwargs):
+    """ EfficientNet-B6 """
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b7(pretrained=False, **kwargs):
+    """ EfficientNet-B7 """
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b8(pretrained=False, **kwargs):
+    """ EfficientNet-B8 """
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_l2(pretrained=False, **kwargs):
+    """ EfficientNet-L2."""
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+    return model
+
+
+# FIXME experimental group conv / GroupNorm / EvoNorm models
+@register_model
+def efficientnet_b0_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B0 + GroupNorm"""
+    model = _gen_efficientnet(
+        'efficientnet_b0_gn', norm_layer=partial(GroupNormAct, group_size=8), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b0_g8_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ group conv + GroupNorm"""
+    model = _gen_efficientnet(
+        'efficientnet_b0_g8_gn', group_size=8, norm_layer=partial(GroupNormAct, group_size=8),
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b0_g16_evos(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ group 16 conv. NOTE: the EvoNorm norm_layer override is commented out below."""
+    model = _gen_efficientnet(
+        'efficientnet_b0_g16_evos', group_size=16, channel_divisor=16,
+        pretrained=pretrained, **kwargs) #norm_layer=partial(EvoNorm2dS0, group_size=16),
+    return model
+
+
+@register_model
+def efficientnet_b3_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B3 w/ GroupNorm """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b3_gn', channel_multiplier=1.2, depth_multiplier=1.4, channel_divisor=16,
+        norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b3_g8_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B3 w/ group 8 conv + GroupNorm"""
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b3_g8_gn', channel_multiplier=1.2, depth_multiplier=1.4, group_size=8, channel_divisor=16,
+        norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_es(pretrained=False, **kwargs):
+    """ EfficientNet-Edge Small. """
+    model = _gen_efficientnet_edge(
+        'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_es_pruned(pretrained=False, **kwargs):
+    """ EfficientNet-Edge Small Pruned. For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0"""
+    model = _gen_efficientnet_edge(
+        'efficientnet_es_pruned', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+@register_model
+def efficientnet_em(pretrained=False, **kwargs):
+    """ EfficientNet-Edge-Medium. """
+    model = _gen_efficientnet_edge(
+        'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_el(pretrained=False, **kwargs):
+    """ EfficientNet-Edge-Large. """
+    model = _gen_efficientnet_edge(
+        'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+@register_model
+def efficientnet_el_pruned(pretrained=False, **kwargs):
+    """ EfficientNet-Edge-Large pruned. For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0"""
+    model = _gen_efficientnet_edge(
+        'efficientnet_el_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+@register_model
+def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 4 Experts """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 8 Experts """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B1 w/ 8 Experts """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_lite0(pretrained=False, **kwargs):
+    """ EfficientNet-Lite0 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_lite(
+        'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_lite1(pretrained=False, **kwargs):
+    """ EfficientNet-Lite1 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_lite(
+        'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_lite2(pretrained=False, **kwargs):
+    """ EfficientNet-Lite2 """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet_lite(
+        'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_lite3(pretrained=False, **kwargs):
+    """ EfficientNet-Lite3 """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet_lite(
+        'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_lite4(pretrained=False, **kwargs):
+    """ EfficientNet-Lite4 """
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
+    model = _gen_efficientnet_lite(
+        'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b1_pruned(pretrained=False, **kwargs):
+    """ EfficientNet-B1 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    variant = 'efficientnet_b1_pruned'
+    model = _gen_efficientnet(
+        variant, channel_multiplier=1.0, depth_multiplier=1.1, pruned=True, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b2_pruned(pretrained=False, **kwargs):
+    """ EfficientNet-B2 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'efficientnet_b2_pruned', channel_multiplier=1.1, depth_multiplier=1.2, pruned=True,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b3_pruned(pretrained=False, **kwargs):
+    """ EfficientNet-B3 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'efficientnet_b3_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pruned=True,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_rw_t(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Tiny (Custom variant, tiny not in paper). """
+    model = _gen_efficientnetv2_s(
+        'efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9, rw=False, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def gc_efficientnetv2_rw_t(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Tiny w/ Global Context Attn (Custom variant, tiny not in paper). """
+    model = _gen_efficientnetv2_s(
+        'gc_efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9,
+        rw=False, se_layer='gc', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_rw_s(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Small (RW variant).
+    NOTE: This is my initial (pre official code release) w/ some differences.
+    See efficientnetv2_s and tf_efficientnetv2_s for versions that match the official w/ PyTorch vs TF padding
+    """
+    model = _gen_efficientnetv2_s('efficientnetv2_rw_s', rw=True, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_rw_m(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Medium (RW variant).
+    """
+    model = _gen_efficientnetv2_s(
+        'efficientnetv2_rw_m', channel_multiplier=1.2, depth_multiplier=(1.2,) * 4 + (1.6,) * 2, rw=True,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_s(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Small. """
+    model = _gen_efficientnetv2_s('efficientnetv2_s', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_m(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Medium. """
+    model = _gen_efficientnetv2_m('efficientnetv2_m', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_l(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Large. """
+    model = _gen_efficientnetv2_l('efficientnetv2_l', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnetv2_xl(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Xtra-Large. """
+    model = _gen_efficientnetv2_xl('efficientnetv2_xl', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b0(pretrained=False, **kwargs):
+    """ EfficientNet-B0. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b1(pretrained=False, **kwargs):
+    """ EfficientNet-B1. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b2(pretrained=False, **kwargs):
+    """ EfficientNet-B2. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b3(pretrained=False, **kwargs):
+    """ EfficientNet-B3. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b4(pretrained=False, **kwargs):
+    """ EfficientNet-B4. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b5(pretrained=False, **kwargs):
+    """ EfficientNet-B5. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b6(pretrained=False, **kwargs):
+    """ EfficientNet-B6. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b7(pretrained=False, **kwargs):
+    """ EfficientNet-B7. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b8(pretrained=False, **kwargs):
+    """ EfficientNet-B8. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b0_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B0 AdvProp. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b1_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B1 AdvProp. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b2_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B2 AdvProp. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b3_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B3 AdvProp. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b4_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B4 AdvProp. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b5_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B5 AdvProp. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b6_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B6 AdvProp. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b7_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B7 AdvProp. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b8_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B8 AdvProp. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b0_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b1_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b2_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b3_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b4_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b5_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b6_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b7_ns(pretrained=False, **kwargs):
+    """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs):
+    """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_l2_ns(pretrained=False, **kwargs):
+    """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_es(pretrained=False, **kwargs):
+    """ EfficientNet-Edge Small. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_edge(
+        'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_em(pretrained=False, **kwargs):
+    """ EfficientNet-Edge-Medium. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_edge(
+        'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_el(pretrained=False, **kwargs):
+    """ EfficientNet-Edge-Large. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_edge(
+        'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_lite0(pretrained=False, **kwargs):
+    """ EfficientNet-Lite0. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_lite(
+        'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_lite1(pretrained=False, **kwargs):
+    """ EfficientNet-Lite1. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_lite(
+        'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_lite2(pretrained=False, **kwargs):
+    """ EfficientNet-Lite2. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_lite(
+        'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_lite3(pretrained=False, **kwargs):
+    """ EfficientNet-Lite3. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_lite(
+        'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_lite4(pretrained=False, **kwargs):
+    """ EfficientNet-Lite4. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_lite(
+        'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+
+@register_model
+def tf_efficientnetv2_s(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Small. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnetv2_s('tf_efficientnetv2_s', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnetv2_m(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Medium. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnetv2_m('tf_efficientnetv2_m', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnetv2_l(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Large. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnetv2_l('tf_efficientnetv2_l', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnetv2_s_in21ft1k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Small, Tensorflow compatible variant.
+    Registered under the name for weights pretrained on ImageNet-21k and fine-tuned on 1k.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_s.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_s('tf_efficientnetv2_s_in21ft1k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_m_in21ft1k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Medium, Tensorflow compatible variant.
+    Registered under the name for weights pretrained on ImageNet-21k and fine-tuned on 1k.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_m.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_m('tf_efficientnetv2_m_in21ft1k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_l_in21ft1k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Large, Tensorflow compatible variant.
+    Registered under the name for weights pretrained on ImageNet-21k and fine-tuned on 1k.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_l.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_l('tf_efficientnetv2_l_in21ft1k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_xl_in21ft1k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Xtra-Large, Tensorflow compatible variant.
+    Registered under the name for weights pretrained on ImageNet-21k and fine-tuned on 1k.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_xl.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_xl('tf_efficientnetv2_xl_in21ft1k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_s_in21k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Small, Tensorflow compatible variant.
+    Registered under the name for the ImageNet-21k pretrained weights.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_s.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_s('tf_efficientnetv2_s_in21k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_m_in21k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Medium, Tensorflow compatible variant.
+    Registered under the name for the ImageNet-21k pretrained weights.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_m.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_m('tf_efficientnetv2_m_in21k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_l_in21k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Large, Tensorflow compatible variant.
+    Registered under the name for the ImageNet-21k pretrained weights.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_l.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_l('tf_efficientnetv2_l_in21k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_xl_in21k(pretrained=False, **kwargs):
+    """ EfficientNet-V2 Xtra-Large, Tensorflow compatible variant.
+    Registered under the name for the ImageNet-21k pretrained weights.
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_efficientnetv2_xl.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_xl('tf_efficientnetv2_xl_in21k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_b0(pretrained=False, **kwargs):
+    """ EfficientNet-V2-B0. Tensorflow compatible variant
+    Always overwrites bn_eps / pad_type in kwargs; no channel / depth multipliers are passed explicitly.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_base('tf_efficientnetv2_b0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_b1(pretrained=False, **kwargs):
+    """ EfficientNet-V2-B1. Tensorflow compatible variant
+    Always overwrites bn_eps / pad_type in kwargs; passes channel_multiplier=1.0, depth_multiplier=1.1.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_base(
+        'tf_efficientnetv2_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_b2(pretrained=False, **kwargs):
+    """ EfficientNet-V2-B2. Tensorflow compatible variant
+    Always overwrites bn_eps / pad_type in kwargs; passes channel_multiplier=1.1, depth_multiplier=1.2.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_base(
+        'tf_efficientnetv2_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_efficientnetv2_b3(pretrained=False, **kwargs):
+    """ EfficientNet-V2-B3. Tensorflow compatible variant
+    Always overwrites bn_eps / pad_type in kwargs; passes channel_multiplier=1.2, depth_multiplier=1.4.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_efficientnetv2_base(
+        'tf_efficientnetv2_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mixnet_s(pretrained=False, **kwargs):
+    """Creates a MixNet Small model.
+    Thin wrapper over _gen_mixnet_s with channel_multiplier=1.0; kwargs pass through untouched.
+    """
+    return _gen_mixnet_s(
+        'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mixnet_m(pretrained=False, **kwargs):
+    """Creates a MixNet Medium model.
+    Thin wrapper over _gen_mixnet_m with channel_multiplier=1.0; kwargs pass through untouched.
+    """
+    return _gen_mixnet_m(
+        'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mixnet_l(pretrained=False, **kwargs):
+    """Creates a MixNet Large model.
+    Built from the medium generator (_gen_mixnet_m) with channel_multiplier=1.3.
+    """
+    return _gen_mixnet_m(
+        'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mixnet_xl(pretrained=False, **kwargs):
+    """Creates a MixNet Extra-Large model.
+    Not a paper spec, experimental def by RW w/ depth scaling.
+    Built from _gen_mixnet_m with channel_multiplier=1.6 and depth_multiplier=1.2.
+    """
+    return _gen_mixnet_m(
+        'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mixnet_xxl(pretrained=False, **kwargs):
+    """Creates a MixNet Double Extra Large model.
+    Not a paper spec, experimental def by RW w/ depth scaling.
+    Built from _gen_mixnet_m with channel_multiplier=2.4 and depth_multiplier=1.3.
+    """
+    return _gen_mixnet_m(
+        'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_mixnet_s(pretrained=False, **kwargs):
+    """Creates a MixNet Small model. Tensorflow compatible variant
+
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_mixnet_s.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_mixnet_s(
+        'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_mixnet_m(pretrained=False, **kwargs):
+    """Creates a MixNet Medium model. Tensorflow compatible variant
+
+    Always overwrites bn_eps / pad_type in kwargs before delegating to _gen_mixnet_m.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_mixnet_m(
+        'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tf_mixnet_l(pretrained=False, **kwargs):
+    """Creates a MixNet Large model. Tensorflow compatible variant
+
+    Always overwrites bn_eps / pad_type in kwargs; built from _gen_mixnet_m with channel_multiplier=1.3.
+    """
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same')
+    return _gen_mixnet_m(
+        'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tinynet_a(pretrained=False, **kwargs):
+    """ TinyNet-A entrypoint: _gen_tinynet('tinynet_a', 1.0, 1.2), forwarding pretrained and kwargs. """
+    return _gen_tinynet('tinynet_a', 1.0, 1.2, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tinynet_b(pretrained=False, **kwargs):
+    """ TinyNet-B entrypoint: _gen_tinynet('tinynet_b', 0.75, 1.1), forwarding pretrained and kwargs. """
+    return _gen_tinynet('tinynet_b', 0.75, 1.1, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tinynet_c(pretrained=False, **kwargs):
+    """ TinyNet-C entrypoint: _gen_tinynet('tinynet_c', 0.54, 0.85), forwarding pretrained and kwargs. """
+    return _gen_tinynet('tinynet_c', 0.54, 0.85, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tinynet_d(pretrained=False, **kwargs):
+    """ TinyNet-D entrypoint: _gen_tinynet('tinynet_d', 0.54, 0.695), forwarding pretrained and kwargs. """
+    return _gen_tinynet('tinynet_d', 0.54, 0.695, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def tinynet_e(pretrained=False, **kwargs):
+    """ TinyNet-E entrypoint: _gen_tinynet('tinynet_e', 0.51, 0.6), forwarding pretrained and kwargs. """
+    return _gen_tinynet('tinynet_e', 0.51, 0.6, pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/efficientnet_blocks.py b/src/custom_timm/models/efficientnet_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a317571c99132cbd6c00561f1eaf9699eabaff
--- /dev/null
+++ b/src/custom_timm/models/efficientnet_blocks.py
@@ -0,0 +1,281 @@
+""" EfficientNet, MobileNetV3, etc Blocks
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .layers import create_conv2d, DropPath, make_divisible, create_act_layer, get_norm_act_layer
+
+__all__ = [
+    'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual']
+
+
+def num_groups(group_size, channels):
+    """ Convert a per-group channel count into a conv `groups` value (1 when group_size is 0 or None). """
+    if group_size:
+        # NOTE group_size == 1 -> depthwise conv (groups == channels)
+        assert channels % group_size == 0
+        return channels // group_size
+    return 1  # falsy group_size: normal conv with a single group
+
+
+class SqueezeExcite(nn.Module):
+    """ Squeeze-and-Excitation gate tailored to the EfficientNet/MobileNet family
+
+    Args:
+        in_chs (int): channel count of the tensor being gated
+        rd_ratio (float): squeeze ratio, only used when rd_channels is None
+        rd_channels (int): explicit squeezed channel count, skips the rd_ratio calculation
+        act_layer (nn.Module): activation of the containing block, used between the two 1x1 convs
+        gate_layer (Callable): gate applied to the expanded tensor before it scales the input
+        force_act_layer (nn.Module): when truthy, used in place of act_layer
+        rd_round_fn (Callable): rounding fn for in_chs * rd_ratio, builtin round when falsy
+    """
+
+    def __init__(
+            self, in_chs, rd_ratio=0.25, rd_channels=None, act_layer=nn.ReLU,
+            gate_layer=nn.Sigmoid, force_act_layer=None, rd_round_fn=None):
+        super().__init__()
+        if rd_channels is None:
+            round_fn = rd_round_fn or round
+            rd_channels = round_fn(in_chs * rd_ratio)
+        self.conv_reduce = nn.Conv2d(in_chs, rd_channels, 1, bias=True)
+        self.act1 = create_act_layer(force_act_layer or act_layer, inplace=True)
+        self.conv_expand = nn.Conv2d(rd_channels, in_chs, 1, bias=True)
+        self.gate = create_act_layer(gate_layer)
+
+    def forward(self, x):
+        # squeeze: mean over dims 2 and 3, keeping them as size-1 dims
+        squeezed = x.mean((2, 3), keepdim=True)
+        # excite: reduce -> act -> expand, then gate the original input
+        excited = self.conv_expand(self.act1(self.conv_reduce(squeezed)))
+        return x * self.gate(excited)
+
+
+class ConvBnAct(nn.Module):
+    """ Single conv followed by a combined norm + activation layer, with an optional residual skip
+    (the skip is only active when requested, stride is 1 and in_chs equals out_chs). """
+    def __init__(
+            self, in_chs, out_chs, kernel_size, stride=1, dilation=1, group_size=0, pad_type='',
+            skip=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_path_rate=0.):
+        super().__init__()
+        self.has_skip = skip and stride == 1 and in_chs == out_chs
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+
+        self.conv = create_conv2d(
+            in_chs, out_chs, kernel_size, stride=stride, dilation=dilation,
+            groups=num_groups(group_size, in_chs), padding=pad_type)
+        self.bn1 = norm_act_layer(out_chs, inplace=True)
+        # DropPath sits on the residual branch only; plain Identity when drop_path_rate is falsy
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()
+
+    def feature_info(self, location):
+        # 'expansion' hooks the output of bn1 (conv after act, same as block output);
+        # anything else ('bottleneck') describes the block output with empty module / hook_type
+        at_expansion = location == 'expansion'
+        return dict(
+            module='bn1' if at_expansion else '', hook_type='forward' if at_expansion else '',
+            num_chs=self.conv.out_channels)
+
+    def forward(self, x):
+        out = self.bn1(self.conv(x))
+        # residual add only when the skip was enabled at construction time
+        return self.drop_path(out) + x if self.has_skip else out
+
+
+class DepthwiseSeparableConv(nn.Module):
+    """ Depthwise-separable conv block
+    Pipeline: grouped (dw) conv -> norm+act -> optional SE -> point-wise conv -> norm (+act if pw_act).
+    Covers the DS convs of MobileNet-V1 and stands in for IR blocks without expansion (factor of 1.0),
+    as an alternative to an IR block with an optional first pw conv.
+    """
+    def __init__(
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
+            noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            se_layer=None, drop_path_rate=0.):
+        super().__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip
+        self.has_pw_act = pw_act  # activation after point-wise conv
+
+        # Depth-wise (grouped) convolution, the only conv that receives stride / dilation
+        self.conv_dw = create_conv2d(
+            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation,
+            padding=pad_type, groups=num_groups(group_size, in_chs))
+        self.bn1 = norm_act_layer(in_chs, inplace=True)
+
+        # Squeeze-and-excitation (Identity when no se_layer is given)
+        self.se = nn.Identity() if not se_layer else se_layer(in_chs, act_layer=act_layer)
+
+        # Point-wise convolution, bn2 applies its activation only when pw_act is set
+        self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+        self.bn2 = norm_act_layer(out_chs, inplace=True, apply_act=self.has_pw_act)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()
+
+    def feature_info(self, location):
+        if location != 'expansion':  # location == 'bottleneck', block output
+            return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels)
+        # 'expansion': after SE, input to PW
+        return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels)
+
+    def forward(self, x):
+        out = self.se(self.bn1(self.conv_dw(x)))
+        out = self.bn2(self.conv_pw(out))
+        # residual add (through drop_path) only when has_skip was set at construction
+        if not self.has_skip:
+            return out
+        return self.drop_path(out) + x
+
+
+class InvertedResidual(nn.Module):
+    """ Inverted residual block w/ optional SE
+
+    Layout: pw expansion -> norm/act -> dw conv -> norm/act -> optional SE -> pw linear projection -> norm.
+    conv_kwargs (if given) is forwarded to all three create_conv2d calls.
+    Originally used in MobileNet-V2 - https://arxiv.org/abs/1801.04381v4, this layer is often
+    referred to as 'MBConv' for (Mobile inverted bottleneck conv) and is also used in
+      * MNasNet - https://arxiv.org/abs/1807.11626
+      * EfficientNet - https://arxiv.org/abs/1905.11946
+      * MobileNet-V3 - https://arxiv.org/abs/1905.02244
+    """
+
+    def __init__(
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
+            noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d, se_layer=None, conv_kwargs=None, drop_path_rate=0.):
+        super().__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        extra = conv_kwargs if conv_kwargs else {}
+        # expanded width: in_chs * exp_ratio passed through make_divisible
+        mid_chs = make_divisible(in_chs * exp_ratio)
+        groups = num_groups(group_size, mid_chs)
+        self.has_skip = (in_chs == out_chs and stride == 1) and not noskip
+
+        # Point-wise expansion
+        self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **extra)
+        self.bn1 = norm_act_layer(mid_chs, inplace=True)
+
+        # Depth-wise convolution
+        self.conv_dw = create_conv2d(
+            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
+            groups=groups, padding=pad_type, **extra)
+        self.bn2 = norm_act_layer(mid_chs, inplace=True)
+
+        # Squeeze-and-excitation (Identity when no se_layer is given)
+        self.se = nn.Identity() if not se_layer else se_layer(mid_chs, act_layer=act_layer)
+
+        # Point-wise linear projection (bn3 is built with apply_act=False)
+        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **extra)
+        self.bn3 = norm_act_layer(out_chs, apply_act=False)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()
+
+    def feature_info(self, location):
+        if location != 'expansion':  # location == 'bottleneck', block output
+            return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)
+        # 'expansion': after SE, input to PWL
+        return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
+
+    def forward(self, x):
+        out = self.bn1(self.conv_pw(x))
+        out = self.bn2(self.conv_dw(out))
+        out = self.se(out)
+        out = self.bn3(self.conv_pwl(out))
+        # residual add (through drop_path) only when has_skip was set at construction
+        if self.has_skip:
+            return self.drop_path(out) + x
+        return out
+
+
+class CondConvResidual(InvertedResidual):
+    """ Inverted residual block w/ CondConv routing
+    Routing weights are sigmoid(Linear(avg-pooled block input)) and are passed to all three convs. """
+
+    def __init__(
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
+            noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d, se_layer=None, num_experts=0, drop_path_rate=0.):
+
+        self.num_experts = num_experts
+        # num_experts reaches every create_conv2d call of the parent through conv_kwargs
+
+        super().__init__(
+            in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation,
+            group_size=group_size, pad_type=pad_type, noskip=noskip, exp_ratio=exp_ratio,
+            exp_kernel_size=exp_kernel_size, pw_kernel_size=pw_kernel_size, act_layer=act_layer,
+            norm_layer=norm_layer, se_layer=se_layer, conv_kwargs=dict(num_experts=self.num_experts),
+            drop_path_rate=drop_path_rate)
+
+        self.routing_fn = nn.Linear(in_chs, self.num_experts)
+
+    def forward(self, x):
+        # CondConv routing: pool the input to 1x1, flatten from dim 1, then sigmoid(Linear(.))
+        pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
+        routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs))
+        out = self.bn1(self.conv_pw(x, routing_weights))
+        out = self.bn2(self.conv_dw(out, routing_weights))
+        out = self.se(out)
+        out = self.bn3(self.conv_pwl(out, routing_weights))
+        # residual add (through drop_path) only when has_skip was set at construction
+        if self.has_skip:
+            return self.drop_path(out) + x
+        return out
+
+
+class EdgeResidual(nn.Module):
+    """ Residual block: expansion conv (carrying the stride) -> optional SE -> pointwise-linear projection
+
+    Originally introduced in `EfficientNet-EdgeTPU: Creating Accelerator-Optimized Neural Networks with AutoML`
+        - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html
+
+    This layer is also called FusedMBConv in the MobileDet, EfficientNet-X, and EfficientNet-V2 papers
+      * MobileDet - https://arxiv.org/abs/2004.14525
+      * EfficientNet-X - https://arxiv.org/abs/2102.05610
+      * EfficientNet-V2 - https://arxiv.org/abs/2104.00298
+
+    NOTE the conv groups are derived from in_chs (not the expanded width) and force_in_chs, when > 0,
+    only changes the expanded width; the expansion conv still takes in_chs input channels.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, exp_kernel_size=3, stride=1, dilation=1, group_size=0, pad_type='',
+            force_in_chs=0, noskip=False, exp_ratio=1.0, pw_kernel_size=1, act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.):
+        super().__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        # expanded width: base channels * exp_ratio passed through make_divisible
+        base_chs = force_in_chs if force_in_chs > 0 else in_chs
+        mid_chs = make_divisible(base_chs * exp_ratio)
+        groups = num_groups(group_size, in_chs)
+        self.has_skip = (in_chs == out_chs and stride == 1) and not noskip
+
+        # Expansion convolution
+        self.conv_exp = create_conv2d(
+            in_chs, mid_chs, exp_kernel_size, stride=stride, dilation=dilation, groups=groups, padding=pad_type)
+        self.bn1 = norm_act_layer(mid_chs, inplace=True)
+
+        # Squeeze-and-excitation (Identity when no se_layer is given)
+        self.se = nn.Identity() if not se_layer else se_layer(mid_chs, act_layer=act_layer)
+
+        # Point-wise linear projection (bn2 is built with apply_act=False)
+        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type)
+        self.bn2 = norm_act_layer(out_chs, apply_act=False)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()
+
+    def feature_info(self, location):
+        if location != 'expansion':  # location == 'bottleneck', block output
+            return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)
+        # 'expansion': after SE, before PWL
+        return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
+
+    def forward(self, x):
+        out = self.bn1(self.conv_exp(x))
+        out = self.se(out)
+        out = self.bn2(self.conv_pwl(out))
+        # residual add (through drop_path) only when has_skip was set at construction
+        if self.has_skip:
+            return self.drop_path(out) + x
+        return out
diff --git a/src/custom_timm/models/efficientnet_builder.py b/src/custom_timm/models/efficientnet_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..67d15a8692dc99d735c94b37505f3c01b2c29fea
--- /dev/null
+++ b/src/custom_timm/models/efficientnet_builder.py
@@ -0,0 +1,477 @@
+""" EfficientNet, MobileNetV3, etc Builder
+
+Assembles EfficientNet and related network feature blocks from string definitions.
+Handles stride, dilation calculations, and selects feature extraction points.
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+
+import logging
+import math
+import re
+from copy import deepcopy
+from functools import partial
+
+import torch.nn as nn
+
+from .efficientnet_blocks import *
+from .layers import CondConv2d, get_condconv_initializer, get_act_layer, get_attn, make_divisible
+
+__all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights",
+           'resolve_bn_args', 'resolve_act_layer', 'round_channels', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT']
+
+_logger = logging.getLogger(__name__)
+
+
+_DEBUG_BUILDER = False  # copied into EfficientNetBuilder.verbose at construction
+
+# Defaults used for Google/Tensorflow training of mobile networks w/ RMSprop as per
+# papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
+# NOTE: the TF decay varies between .99 and .9997 depending on source
+# .99 in official TF TPU impl
+# .9997 (w/ .999 in search space) for paper
+BN_MOMENTUM_TF_DEFAULT = 1 - 0.99  # i.e. the PT momentum for a TF decay of .99
+BN_EPS_TF_DEFAULT = 1e-3
+_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT)  # handed out as a copy by get_bn_args_tf()
+
+
+def get_bn_args_tf():
+    return dict(_BN_ARGS_TF)  # fresh shallow copy, so callers can mutate it without touching the defaults
+
+
+def resolve_bn_args(kwargs):
+    """ Pop 'bn_momentum' / 'bn_eps' out of kwargs (mutating it) and return the non-None ones
+    re-keyed as 'momentum' / 'eps'.
+    """
+    popped = (
+        ('momentum', kwargs.pop('bn_momentum', None)),
+        ('eps', kwargs.pop('bn_eps', None)),
+    )
+    return {name: value for name, value in popped if value is not None}
+
+
+def resolve_act_layer(kwargs, default='relu'):
+    return get_act_layer(kwargs.pop('act_layer', default))  # pops 'act_layer' from kwargs (mutates the caller's dict)
+
+
+def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None, round_limit=0.9):
+    """Scale channels by multiplier and round through make_divisible; a falsy multiplier returns channels as-is."""
+    if multiplier:
+        return make_divisible(channels * multiplier, divisor, channel_min, round_limit=round_limit)
+    return channels
+
+
+def _log_info_if(msg, condition):
+    if condition:  # falsy condition -> the logger is not called at all
+        _logger.info(msg)  # INFO level on this module's logger
+
+
+def _parse_ksize(ss):
+    """ Parse a kernel-size token: all-digit string -> int, otherwise a list of ints split on '.' """
+    if not ss.isdigit():
+        return list(map(int, ss.split('.')))
+    return int(ss)
+
+
+def _decode_block_str(block_str):
+    """ Decode block definition string
+
+    Turns one string of '_' separated options into a dict of block args plus a repeat count.
+    E.g. ir_r2_k3_s2_e1_c16_se0.25_noskip
+
+    All args can exist in any order with the exception of the leading string which
+    is assumed to indicate the block type.
+
+    leading string - block type (ir = InvertedResidual, ds = DepthwiseSep,
+      dsa = DepthwiseSep with pw act, er = edge residual args, cn = ConvBnAct)
+    r - number of repeat blocks, k - kernel size, s - strides, e - expansion ratio, c - output channels,
+    se - squeeze/excitation ratio, n - activation fn ('re', 'r6', 'hs', 'sw', or 'mi'),
+    a - expansion kernel size, p - pointwise kernel size, fc - forced input channels ('er' only),
+    cc - number of experts ('ir' only), gs - group size (int), skip / noskip - force skip on / off
+    Kernel sizes given via k (except for 'cn'), a and p may be '.'-separated lists, e.g. k3.5.7
+    r, c and s are required for every block type; unknown activation codes are silently ignored.
+
+    Args:
+        block_str: a string representation of block arguments.
+    Returns:
+        A tuple of (block args dict, number of repeats)
+    Raises:
+        KeyError: required option missing; AssertionError: non-str input / unknown type; ValueError: bad number
+    """
+    assert isinstance(block_str, str)
+    ops = block_str.split('_')
+    block_type = ops[0]  # take the block type off the front
+    ops = ops[1:]
+    options = {}
+    skip = None
+    for op in ops:
+        # string options being checked on individual basis, combine if they grow
+        if op == 'noskip':
+            skip = False  # force no skip connection
+        elif op == 'skip':
+            skip = True  # force a skip connection
+        elif op.startswith('n'):
+            # activation fn
+            key = op[0]
+            v = op[1:]
+            if v == 're':
+                value = get_act_layer('relu')
+            elif v == 'r6':
+                value = get_act_layer('relu6')
+            elif v == 'hs':
+                value = get_act_layer('hard_swish')
+            elif v == 'sw':
+                value = get_act_layer('swish')  # aka SiLU
+            elif v == 'mi':
+                value = get_act_layer('mish')
+            else:
+                continue  # unknown activation code: option is dropped, model default applies
+            options[key] = value
+        else:
+            # all numeric options, split into alpha key + remainder; values stay strings until used below
+            splits = re.split(r'(\d.*)', op)
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+    # if act_layer is None, the model default (passed to model init) will be used
+    act_layer = options['n'] if 'n' in options else None
+    exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
+    pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
+    force_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
+    num_repeat = int(options['r'])
+
+    # each type of block has different valid arguments, fill accordingly
+    block_args = dict(
+        block_type=block_type,
+        out_chs=int(options['c']),
+        stride=int(options['s']),
+        act_layer=act_layer,
+    )
+    if block_type == 'ir':
+        block_args.update(dict(
+            dw_kernel_size=_parse_ksize(options['k']),
+            exp_kernel_size=exp_kernel_size,
+            pw_kernel_size=pw_kernel_size,
+            exp_ratio=float(options['e']),
+            se_ratio=float(options['se']) if 'se' in options else 0.,
+            noskip=skip is False,
+        ))
+        if 'cc' in options:
+            block_args['num_experts'] = int(options['cc'])
+    elif block_type == 'ds' or block_type == 'dsa':
+        block_args.update(dict(
+            dw_kernel_size=_parse_ksize(options['k']),
+            pw_kernel_size=pw_kernel_size,
+            se_ratio=float(options['se']) if 'se' in options else 0.,
+            pw_act=block_type == 'dsa',
+            noskip=block_type == 'dsa' or skip is False,
+        ))
+    elif block_type == 'er':
+        block_args.update(dict(
+            exp_kernel_size=_parse_ksize(options['k']),
+            pw_kernel_size=pw_kernel_size,
+            exp_ratio=float(options['e']),
+            force_in_chs=force_in_chs,
+            se_ratio=float(options['se']) if 'se' in options else 0.,
+            noskip=skip is False,
+        ))
+    elif block_type == 'cn':
+        block_args.update(dict(
+            kernel_size=int(options['k']),
+            skip=skip is True,
+        ))
+    else:
+        assert False, 'Unknown block type (%s)' % block_type
+    if 'gs' in options:
+        block_args['group_size'] = int(options['gs'])  # parsed value is a str; num_groups() does int arithmetic
+
+    return block_args, num_repeat
+
+
+def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'):
+    """ Per-stage depth scaling
+    Scales the summed repeat count of one stage by depth_multiplier and spreads the scaled total back
+    over the stage's block definitions. Keeps compatibility with the EfficientNet scaling method while
+    still scaling sensibly for models that have multiple block arg definitions in each stage.
+
+    Args:
+        stack_args: block arg dicts of one stage, one per block definition
+        repeats: repeat count of each block definition, aligned with stack_args
+        depth_multiplier: factor applied to the summed repeats
+        depth_trunc: 'round' rounds the scaled total (min 1), anything else takes the ceiling
+    Returns:
+        flat list with a deep copy of each block arg dict for every scaled repeat
+    """
+    # A stage may hold several block arg defs, so the multiplier is applied to their summed repeats.
+    remaining = sum(repeats)
+    scaled_total = remaining * depth_multiplier
+    if depth_trunc == 'round':
+        # Rounding lets stages with few repeats stay proportionally smaller for longer, which suits
+        # arch definitions with single repeat stages that should stay that way as long as possible.
+        remaining_scaled = max(1, round(scaled_total))
+    else:
+        # EfficientNet default: truncate via 'ceil', so any multiplier > 1.0 deepens every stage.
+        remaining_scaled = int(math.ceil(scaled_total))
+
+    # Hand out the scaled repeats proportionally, walking the defs back to front: this makes the first
+    # block (which makes less sense to repeat in most arch definitions) the least likely to be scaled.
+    repeats_scaled = []
+    for r in reversed(repeats):
+        rs = max(1, round(r / remaining * remaining_scaled))
+        repeats_scaled.insert(0, rs)
+        remaining -= r
+        remaining_scaled -= rs
+
+    # Expand every block arg def into `rs` independent deep copies
+    return [deepcopy(ba) for ba, rs in zip(stack_args, repeats_scaled) for _ in range(rs)]
+
+
+def decode_arch_def(
+        arch_def,
+        depth_multiplier=1.0,
+        depth_trunc='ceil',
+        experts_multiplier=1,
+        fix_first_last=False,
+        group_size=None,
+):
+    """ Decode block architecture definition strings -> block kwargs
+
+    Args:
+        arch_def: architecture definition strings, list (stages) of list of block strings
+        depth_multiplier: network depth multiplier, either one value or a tuple with one value per stage
+        depth_trunc: network depth truncation mode when applying multiplier
+        experts_multiplier: experts multiplier, applied when > 1 to blocks with num_experts > 0
+        fix_first_last: keep first and last stage depths (multiplier 1.0) when multiplier is applied
+        group_size: group size default for all blocks that didn't set one in the arch string
+
+    Returns:
+        list (stages) of list of block kwargs, with the scaled repeats already expanded
+    """
+    num_stages = len(arch_def)
+    if not isinstance(depth_multiplier, tuple):
+        depth_multiplier = (depth_multiplier,) * num_stages
+    else:
+        assert len(depth_multiplier) == num_stages
+    arch_args = []
+    for stack_idx, (block_strings, multiplier) in enumerate(zip(arch_def, depth_multiplier)):
+        assert isinstance(block_strings, list)
+        stack_args, repeats = [], []
+        for block_str in block_strings:
+            assert isinstance(block_str, str)
+            ba, rep = _decode_block_str(block_str)
+            if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
+                ba['num_experts'] *= experts_multiplier
+            if group_size is not None:
+                ba.setdefault('group_size', group_size)
+            stack_args.append(ba)
+            repeats.append(rep)
+        # first / last stage are scaled by 1.0 instead of their multiplier when fix_first_last is set
+        if fix_first_last and stack_idx in (0, num_stages - 1):
+            multiplier = 1.0
+        arch_args.append(_scale_stage_depth(stack_args, repeats, multiplier, depth_trunc))
+    return arch_args
+
+
+class EfficientNetBuilder:
+    """ Build Trunk Blocks
+
+    This ended up being somewhat of a cross between
+    https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py
+    and
+    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py
+
+    """
+    def __init__(self, output_stride=32, pad_type='', round_chs_fn=round_channels, se_from_exp=False,
+                 act_layer=None, norm_layer=None, se_layer=None, drop_path_rate=0., feature_location=''):
+        self.output_stride = output_stride
+        self.pad_type = pad_type
+        self.round_chs_fn = round_chs_fn
+        self.se_from_exp = se_from_exp  # calculate se channel reduction from expanded (mid) chs
+        self.act_layer = act_layer
+        self.norm_layer = norm_layer
+        self.se_layer = get_attn(se_layer)
+        try:
+            self.se_layer(8, rd_ratio=1.0)  # test if attn layer accepts rd_ratio arg
+            self.se_has_ratio = True
+        except TypeError:
+            self.se_has_ratio = False
+        self.drop_path_rate = drop_path_rate
+        if feature_location == 'depthwise':
+            # old 'depthwise' mode renamed 'expansion' to match TF impl, old expansion mode didn't make sense
+            _logger.warning("feature_location=='depthwise' is deprecated, using 'expansion'")
+            feature_location = 'expansion'
+        self.feature_location = feature_location
+        assert feature_location in ('bottleneck', 'expansion', '')
+        self.verbose = _DEBUG_BUILDER
+
+        # state updated during build, consumed by model
+        self.in_chs = None
+        self.features = []
+
+    def _make_block(self, ba, block_idx, block_count):
+        drop_path_rate = self.drop_path_rate * block_idx / block_count
+        bt = ba.pop('block_type')
+        ba['in_chs'] = self.in_chs
+        ba['out_chs'] = self.round_chs_fn(ba['out_chs'])
+        if 'force_in_chs' in ba and ba['force_in_chs']:
+            # NOTE this is a hack to work around mismatch in TF EdgeEffNet impl
+            ba['force_in_chs'] = self.round_chs_fn(ba['force_in_chs'])
+        ba['pad_type'] = self.pad_type
+        # block act fn overrides the model default
+        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
+        assert ba['act_layer'] is not None
+        ba['norm_layer'] = self.norm_layer
+        ba['drop_path_rate'] = drop_path_rate
+        if bt != 'cn':
+            se_ratio = ba.pop('se_ratio')
+            if se_ratio and self.se_layer is not None:
+                if not self.se_from_exp:
+                    # adjust se_ratio by expansion ratio if calculating se channels from block input
+                    se_ratio /= ba.get('exp_ratio', 1.0)
+                if self.se_has_ratio:
+                    ba['se_layer'] = partial(self.se_layer, rd_ratio=se_ratio)
+                else:
+                    ba['se_layer'] = self.se_layer
+
+        if bt == 'ir':
+            _log_info_if('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose)
+            block = CondConvResidual(**ba) if ba.get('num_experts', 0) else InvertedResidual(**ba)
+        elif bt == 'ds' or bt == 'dsa':
+            _log_info_if('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)), self.verbose)
+            block = DepthwiseSeparableConv(**ba)
+        elif bt == 'er':
+            _log_info_if('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose)
+            block = EdgeResidual(**ba)
+        elif bt == 'cn':
+            _log_info_if('  ConvBnAct {}, Args: {}'.format(block_idx, str(ba)), self.verbose)
+            block = ConvBnAct(**ba)
+        else:
+            assert False, 'Uknkown block type (%s) while building model.' % bt
+
+        self.in_chs = ba['out_chs']  # update in_chs for arg of next block
+        return block
+
+    def __call__(self, in_chs, model_block_args):
+        """ Build the blocks
+        Args:
+            in_chs: Number of input-channels passed to first block
+            model_block_args: A list of lists, outer list defines stages, inner
+                list contains strings defining block configuration(s)
+        Return:
+             List of block stacks (each stack wrapped in nn.Sequential)
+        """
+        _log_info_if('Building model trunk with %d stages...' % len(model_block_args), self.verbose)
+        self.in_chs = in_chs
+        total_block_count = sum([len(x) for x in model_block_args])
+        total_block_idx = 0
+        current_stride = 2
+        current_dilation = 1
+        stages = []
+        if model_block_args[0][0]['stride'] > 1:
+            # if the first block starts with a stride, we need to extract first level feat from stem
+            feature_info = dict(
+                module='act1', num_chs=in_chs, stage=0, reduction=current_stride,
+                hook_type='forward' if self.feature_location != 'bottleneck' else '')
+            self.features.append(feature_info)
+
+        # outer list of block_args defines the stacks
+        for stack_idx, stack_args in enumerate(model_block_args):
+            last_stack = stack_idx + 1 == len(model_block_args)
+            _log_info_if('Stack: {}'.format(stack_idx), self.verbose)
+            assert isinstance(stack_args, list)
+
+            blocks = []
+            # each stack (stage of blocks) contains a list of block arguments
+            for block_idx, block_args in enumerate(stack_args):
+                last_block = block_idx + 1 == len(stack_args)
+                _log_info_if(' Block: {}'.format(block_idx), self.verbose)
+
+                assert block_args['stride'] in (1, 2)
+                if block_idx >= 1:   # only the first block in any stack can have a stride > 1
+                    block_args['stride'] = 1
+
+                extract_features = False
+                if last_block:
+                    next_stack_idx = stack_idx + 1
+                    extract_features = next_stack_idx >= len(model_block_args) or \
+                        model_block_args[next_stack_idx][0]['stride'] > 1
+
+                next_dilation = current_dilation
+                if block_args['stride'] > 1:
+                    next_output_stride = current_stride * block_args['stride']
+                    if next_output_stride > self.output_stride:
+                        next_dilation = current_dilation * block_args['stride']
+                        block_args['stride'] = 1
+                        _log_info_if('  Converting stride to dilation to maintain output_stride=={}'.format(
+                            self.output_stride), self.verbose)
+                    else:
+                        current_stride = next_output_stride
+                block_args['dilation'] = current_dilation
+                if next_dilation != current_dilation:
+                    current_dilation = next_dilation
+
+                # create the block
+                block = self._make_block(block_args, total_block_idx, total_block_count)
+                blocks.append(block)
+
+                # stash feature module name and channel info for model feature extraction
+                if extract_features:
+                    feature_info = dict(
+                        stage=stack_idx + 1, reduction=current_stride, **block.feature_info(self.feature_location))
+                    module_name = f'blocks.{stack_idx}.{block_idx}'
+                    leaf_name = feature_info.get('module', '')
+                    feature_info['module'] = '.'.join([module_name, leaf_name]) if leaf_name else module_name
+                    self.features.append(feature_info)
+
+                total_block_idx += 1  # incr global block idx (across all stacks)
+            stages.append(nn.Sequential(*blocks))
+        return stages
+
+
+def _init_weight_goog(m, n='', fix_group_fanout=True):
+    """ Weight initialization as per Tensorflow official implementations.
+
+    Args:
+        m (nn.Module): module to init
+        n (str): module name
+        fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs
+
+    Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc:
+    * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+    * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+    """
+    if isinstance(m, CondConv2d):
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        if fix_group_fanout:
+            fan_out //= m.groups
+        init_weight_fn = get_condconv_initializer(
+            lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape)
+        init_weight_fn(m.weight)
+        if m.bias is not None:
+            nn.init.zeros_(m.bias)
+    elif isinstance(m, nn.Conv2d):
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        if fix_group_fanout:
+            fan_out //= m.groups
+        nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out))
+        if m.bias is not None:
+            nn.init.zeros_(m.bias)
+    elif isinstance(m, nn.BatchNorm2d):
+        nn.init.ones_(m.weight)
+        nn.init.zeros_(m.bias)
+    elif isinstance(m, nn.Linear):
+        fan_out = m.weight.size(0)  # fan-out
+        fan_in = 0
+        if 'routing_fn' in n:
+            fan_in = m.weight.size(1)
+        init_range = 1.0 / math.sqrt(fan_in + fan_out)
+        nn.init.uniform_(m.weight, -init_range, init_range)
+        nn.init.zeros_(m.bias)
+
+
+def efficientnet_init_weights(model: nn.Module, init_fn=None):
+    init_fn = init_fn or _init_weight_goog
+    for n, m in model.named_modules():
+        init_fn(m, n)
+
diff --git a/src/custom_timm/models/factory.py b/src/custom_timm/models/factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7a8fd9cddf04633d6f5160dd1e2e96bab4737ad
--- /dev/null
+++ b/src/custom_timm/models/factory.py
@@ -0,0 +1,76 @@
+from urllib.parse import urlsplit, urlunsplit
+import os
+
+from .registry import is_model, is_model_in_modules, model_entrypoint
+from .helpers import load_checkpoint
+from .layers import set_layer_config
+from .hub import load_model_config_from_hf
+
+
+def parse_model_name(model_name):
+    model_name = model_name.replace('hf_hub', 'hf-hub')  # NOTE for backwards compat, to deprecate hf_hub use
+    parsed = urlsplit(model_name)
+    assert parsed.scheme in ('', 'timm', 'hf-hub')
+    if parsed.scheme == 'hf-hub':
+        # FIXME may use fragment as revision, currently `@` in URI path
+        return parsed.scheme, parsed.path
+    else:
+        model_name = os.path.split(parsed.path)[-1]
+        return 'timm', model_name
+
+
+def safe_model_name(model_name, remove_source=True):
+    def make_safe(name):
+        return ''.join(c if c.isalnum() else '_' for c in name).rstrip('_')
+    if remove_source:
+        model_name = parse_model_name(model_name)[-1]
+    return make_safe(model_name)
+
+
+def create_model(
+        model_name,
+        pretrained=False,
+        pretrained_cfg=None,
+        checkpoint_path='',
+        scriptable=None,
+        exportable=None,
+        no_jit=None,
+        **kwargs):
+    """Create a model
+
+    Args:
+        model_name (str): name of model to instantiate
+        pretrained (bool): load pretrained ImageNet-1k weights if true
+        checkpoint_path (str): path of checkpoint to load after model is initialized
+        scriptable (bool): set layer config so that model is jit scriptable (not working for all models yet)
+        exportable (bool): set layer config so that model is traceable / ONNX exportable (not fully impl/obeyed yet)
+        no_jit (bool): set layer config so that model doesn't utilize jit scripted layers (so far activations only)
+
+    Keyword Args:
+        drop_rate (float): dropout rate for training (default: 0.0)
+        global_pool (str): global pool type (default: 'avg')
+        **: other kwargs are model specific
+    """
+    # Parameters that aren't supported by all models or are intended to only override model defaults if set
+    # should default to None in command line args/cfg. Remove them if they are present and not set so that
+    # non-supporting models don't break and default args remain in effect.
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+
+    model_source, model_name = parse_model_name(model_name)
+    if model_source == 'hf-hub':
+        # FIXME hf-hub source overrides any passed in pretrained_cfg, warn?
+        # For model names specified in the form `hf-hub:path/architecture_name@revision`,
+        # load model weights + pretrained_cfg from Hugging Face hub.
+        pretrained_cfg, model_name = load_model_config_from_hf(model_name)
+
+    if not is_model(model_name):
+        raise RuntimeError('Unknown model (%s)' % model_name)
+
+    create_fn = model_entrypoint(model_name)
+    with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit):
+        model = create_fn(pretrained=pretrained, pretrained_cfg=pretrained_cfg, **kwargs)
+
+    if checkpoint_path:
+        load_checkpoint(model, checkpoint_path)
+
+    return model
diff --git a/src/custom_timm/models/features.py b/src/custom_timm/models/features.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc46419d16f9759221a39061f4eb34e76aa6efd
--- /dev/null
+++ b/src/custom_timm/models/features.py
@@ -0,0 +1,284 @@
+""" PyTorch Feature Extraction Helpers
+
+A collection of classes, functions, modules to help extract features from models
+and provide a common interface for describing them.
+
+The return_layers, module re-writing idea inspired by torchvision IntermediateLayerGetter
+https://github.com/pytorch/vision/blob/d88d8961ae51507d0cb680329d985b1488b1b76b/torchvision/models/_utils.py
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+from collections import OrderedDict, defaultdict
+from copy import deepcopy
+from functools import partial
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+
+
+class FeatureInfo:
+
+    def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]):
+        prev_reduction = 1
+        for fi in feature_info:
+            # sanity check the mandatory fields, there may be additional fields depending on the model
+            assert 'num_chs' in fi and fi['num_chs'] > 0
+            assert 'reduction' in fi and fi['reduction'] >= prev_reduction
+            prev_reduction = fi['reduction']
+            assert 'module' in fi
+        self.out_indices = out_indices
+        self.info = feature_info
+
+    def from_other(self, out_indices: Tuple[int]):
+        return FeatureInfo(deepcopy(self.info), out_indices)
+
+    def get(self, key, idx=None):
+        """ Get value by key at specified index (indices)
+        if idx == None, returns value for key at each output index
+        if idx is an integer, return value for that feature module index (ignoring output indices)
+        if idx is a list/tuple, return value for each module index (ignoring output indices)
+        """
+        if idx is None:
+            return [self.info[i][key] for i in self.out_indices]
+        if isinstance(idx, (tuple, list)):
+            return [self.info[i][key] for i in idx]
+        else:
+            return self.info[idx][key]
+
+    def get_dicts(self, keys=None, idx=None):
+        """ return info dicts for specified keys (or all if None) at specified indices (or out_indices if None)
+        """
+        if idx is None:
+            if keys is None:
+                return [self.info[i] for i in self.out_indices]
+            else:
+                return [{k: self.info[i][k] for k in keys} for i in self.out_indices]
+        if isinstance(idx, (tuple, list)):
+            return [self.info[i] if keys is None else {k: self.info[i][k] for k in keys} for i in idx]
+        else:
+            return self.info[idx] if keys is None else {k: self.info[idx][k] for k in keys}
+
+    def channels(self, idx=None):
+        """ feature channels accessor
+        """
+        return self.get('num_chs', idx)
+
+    def reduction(self, idx=None):
+        """ feature reduction (output stride) accessor
+        """
+        return self.get('reduction', idx)
+
+    def module_name(self, idx=None):
+        """ feature module name accessor
+        """
+        return self.get('module', idx)
+
+    def __getitem__(self, item):
+        return self.info[item]
+
+    def __len__(self):
+        return len(self.info)
+
+
+class FeatureHooks:
+    """ Feature Hook Helper
+
+    This module helps with the setup and extraction of hooks for extracting features from
+    internal nodes in a model by node name. This works quite well in eager Python but needs
+    redesign for torchscript.
+    """
+
+    def __init__(self, hooks, named_modules, out_map=None, default_hook_type='forward'):
+        # setup feature hooks
+        modules = {k: v for k, v in named_modules}
+        for i, h in enumerate(hooks):
+            hook_name = h['module']
+            m = modules[hook_name]
+            hook_id = out_map[i] if out_map else hook_name
+            hook_fn = partial(self._collect_output_hook, hook_id)
+            hook_type = h.get('hook_type', default_hook_type)
+            if hook_type == 'forward_pre':
+                m.register_forward_pre_hook(hook_fn)
+            elif hook_type == 'forward':
+                m.register_forward_hook(hook_fn)
+            else:
+                assert False, "Unsupported hook type"
+        self._feature_outputs = defaultdict(OrderedDict)
+
+    def _collect_output_hook(self, hook_id, *args):
+        x = args[-1]  # tensor we want is last argument, output for fwd, input for fwd_pre
+        if isinstance(x, tuple):
+            x = x[0]  # unwrap input tuple
+        self._feature_outputs[x.device][hook_id] = x
+
+    def get_output(self, device) -> Dict[str, torch.tensor]:
+        output = self._feature_outputs[device]
+        self._feature_outputs[device] = OrderedDict()  # clear after reading
+        return output
+
+
+def _module_list(module, flatten_sequential=False):
+    # a yield/iter would be better for this but wouldn't be compatible with torchscript
+    ml = []
+    for name, module in module.named_children():
+        if flatten_sequential and isinstance(module, nn.Sequential):
+            # first level of Sequential containers is flattened into containing model
+            for child_name, child_module in module.named_children():
+                combined = [name, child_name]
+                ml.append(('_'.join(combined), '.'.join(combined), child_module))
+        else:
+            ml.append((name, name, module))
+    return ml
+
+
+def _get_feature_info(net, out_indices):
+    feature_info = getattr(net, 'feature_info')
+    if isinstance(feature_info, FeatureInfo):
+        return feature_info.from_other(out_indices)
+    elif isinstance(feature_info, (list, tuple)):
+        return FeatureInfo(net.feature_info, out_indices)
+    else:
+        assert False, "Provided feature_info is not valid"
+
+
+def _get_return_layers(feature_info, out_map):
+    module_names = feature_info.module_name()
+    return_layers = {}
+    for i, name in enumerate(module_names):
+        return_layers[name] = out_map[i] if out_map is not None else feature_info.out_indices[i]
+    return return_layers
+
+
+class FeatureDictNet(nn.ModuleDict):
+    """ Feature extractor with OrderedDict return
+
+    Wrap a model and extract features as specified by the out indices, the network is
+    partially re-built from contained modules.
+
+    There is a strong assumption that the modules have been registered into the model in the same
+    order as they are used. There should be no reuse of the same nn.Module more than once, including
+    trivial modules like `self.relu = nn.ReLU`.
+
+    Only submodules that are directly assigned to the model class (`model.feature1`) or at most
+    one Sequential container deep (`model.features.1`, with flatten_sequential=True) can be captured.
+    All Sequential containers that are directly assigned to the original model will have their
+    modules assigned to this module with the name `model.features.1` being changed to `model.features_1`
+
+    Arguments:
+        model (nn.Module): model from which we will extract the features
+        out_indices (tuple[int]): model output indices to extract features for
+        out_map (sequence): list or tuple specifying desired return id for each out index,
+            otherwise str(index) is used
+        feature_concat (bool): whether to concatenate intermediate features that are lists or tuples
+            vs select element [0]
+        flatten_sequential (bool): whether to flatten sequential modules assigned to model
+    """
+    def __init__(
+            self, model,
+            out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False):
+        super(FeatureDictNet, self).__init__()
+        self.feature_info = _get_feature_info(model, out_indices)
+        self.concat = feature_concat
+        self.return_layers = {}
+        return_layers = _get_return_layers(self.feature_info, out_map)
+        modules = _module_list(model, flatten_sequential=flatten_sequential)
+        remaining = set(return_layers.keys())
+        layers = OrderedDict()
+        for new_name, old_name, module in modules:
+            layers[new_name] = module
+            if old_name in remaining:
+                # return id has to be consistently str type for torchscript
+                self.return_layers[new_name] = str(return_layers[old_name])
+                remaining.remove(old_name)
+            if not remaining:
+                break
+        assert not remaining and len(self.return_layers) == len(return_layers), \
+            f'Return layers ({remaining}) are not present in model'
+        self.update(layers)
+
+    def _collect(self, x) -> (Dict[str, torch.Tensor]):
+        out = OrderedDict()
+        for name, module in self.items():
+            x = module(x)
+            if name in self.return_layers:
+                out_id = self.return_layers[name]
+                if isinstance(x, (tuple, list)):
+                    # If model tap is a tuple or list, concat or select first element
+                    # FIXME this may need to be more generic / flexible for some nets
+                    out[out_id] = torch.cat(x, 1) if self.concat else x[0]
+                else:
+                    out[out_id] = x
+        return out
+
+    def forward(self, x) -> Dict[str, torch.Tensor]:
+        return self._collect(x)
+
+
+class FeatureListNet(FeatureDictNet):
+    """ Feature extractor with list return
+
+    See docstring for FeatureDictNet above, this class exists only to appease Torchscript typing constraints.
+    In eager Python we could have returned List[Tensor] vs Dict[id, Tensor] based on a member bool.
+    """
+    def __init__(
+            self, model,
+            out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False):
+        super(FeatureListNet, self).__init__(
+            model, out_indices=out_indices, out_map=out_map, feature_concat=feature_concat,
+            flatten_sequential=flatten_sequential)
+
+    def forward(self, x) -> (List[torch.Tensor]):
+        return list(self._collect(x).values())
+
+
+class FeatureHookNet(nn.ModuleDict):
+    """ FeatureHookNet
+
+    Wrap a model and extract features specified by the out indices using forward/forward-pre hooks.
+
+    If `no_rewrite` is True, features are extracted via hooks without modifying the underlying
+    network in any way.
+
+    If `no_rewrite` is False, the model will be re-written as in the
+    FeatureList/FeatureDict case by folding first to second (Sequential only) level modules into this one.
+
+    FIXME this does not currently work with Torchscript, see FeatureHooks class
+    """
+    def __init__(
+            self, model,
+            out_indices=(0, 1, 2, 3, 4), out_map=None, out_as_dict=False, no_rewrite=False,
+            feature_concat=False, flatten_sequential=False, default_hook_type='forward'):
+        super(FeatureHookNet, self).__init__()
+        assert not torch.jit.is_scripting()
+        self.feature_info = _get_feature_info(model, out_indices)
+        self.out_as_dict = out_as_dict
+        layers = OrderedDict()
+        hooks = []
+        if no_rewrite:
+            assert not flatten_sequential
+            if hasattr(model, 'reset_classifier'):  # make sure classifier is removed?
+                model.reset_classifier(0)
+            layers['body'] = model
+            hooks.extend(self.feature_info.get_dicts())
+        else:
+            modules = _module_list(model, flatten_sequential=flatten_sequential)
+            remaining = {f['module']: f['hook_type'] if 'hook_type' in f else default_hook_type
+                         for f in self.feature_info.get_dicts()}
+            for new_name, old_name, module in modules:
+                layers[new_name] = module
+                for fn, fm in module.named_modules(prefix=old_name):
+                    if fn in remaining:
+                        hooks.append(dict(module=fn, hook_type=remaining[fn]))
+                        del remaining[fn]
+                if not remaining:
+                    break
+            assert not remaining, f'Return layers ({remaining}) are not present in model'
+        self.update(layers)
+        self.hooks = FeatureHooks(hooks, model.named_modules(), out_map=out_map)
+
+    def forward(self, x):
+        for name, module in self.items():
+            x = module(x)
+        out = self.hooks.get_output(x.device)
+        return out if self.out_as_dict else list(out.values())
diff --git a/src/custom_timm/models/fx_features.py b/src/custom_timm/models/fx_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fadcbf2ed9447496c744db95af84e697e527a4b
--- /dev/null
+++ b/src/custom_timm/models/fx_features.py
@@ -0,0 +1,106 @@
+""" PyTorch FX Based Feature Extraction Helpers
+Using https://pytorch.org/vision/stable/feature_extraction.html
+"""
+from typing import Callable, List, Dict, Union, Type
+
+import torch
+from torch import nn
+
+from .features import _get_feature_info
+
+try:
+    from torchvision.models.feature_extraction import create_feature_extractor as _create_feature_extractor
+    has_fx_feature_extraction = True
+except ImportError:
+    has_fx_feature_extraction = False
+
+# Layers we want to treat as leaf modules
+from .layers import Conv2dSame, ScaledStdConv2dSame, CondConv2d, StdConv2dSame
+from .layers.non_local_attn import BilinearAttnTransform
+from .layers.pool2d_same import MaxPool2dSame, AvgPool2dSame
+
+# NOTE: By default, any modules from custom_timm.models.layers that we want to treat as leaf modules go here
+# BUT modules from custom_timm.models should use the registration mechanism below
+_leaf_modules = {
+    BilinearAttnTransform,  # reason: flow control t <= 1
+    # Reason: get_same_padding has a max which raises a control flow error
+    Conv2dSame, MaxPool2dSame, ScaledStdConv2dSame, StdConv2dSame, AvgPool2dSame,
+    CondConv2d,  # reason: TypeError: F.conv2d received Proxy in groups=self.groups * B (because B = x.shape[0])
+}
+
+try:
+    from .layers import InplaceAbn
+    _leaf_modules.add(InplaceAbn)
+except ImportError:
+    pass
+
+
+def register_notrace_module(module: Type[nn.Module]):
+    """
+    Any module not under custom_timm.models.layers should get this decorator if we don't want to trace through it.
+    """
+    _leaf_modules.add(module)
+    return module
+
+
+# Functions we want to autowrap (treat them as leaves)
+_autowrap_functions = set()
+
+
+def register_notrace_function(func: Callable):
+    """
+    Decorator for functions which ought not to be traced through
+    """
+    _autowrap_functions.add(func)
+    return func
+
+
+def create_feature_extractor(model: nn.Module, return_nodes: Union[Dict[str, str], List[str]]):
+    assert has_fx_feature_extraction, 'Please update to PyTorch 1.10+, torchvision 0.11+ for FX feature extraction'
+    return _create_feature_extractor(
+        model, return_nodes,
+        tracer_kwargs={'leaf_modules': list(_leaf_modules), 'autowrap_functions': list(_autowrap_functions)}
+    )
+
+
+class FeatureGraphNet(nn.Module):
+    """ A FX Graph based feature extractor that works with the model feature_info metadata
+    """
+    def __init__(self, model, out_indices, out_map=None):
+        super().__init__()
+        assert has_fx_feature_extraction, 'Please update to PyTorch 1.10+, torchvision 0.11+ for FX feature extraction'
+        self.feature_info = _get_feature_info(model, out_indices)
+        if out_map is not None:
+            assert len(out_map) == len(out_indices)
+        return_nodes = {
+            info['module']: out_map[i] if out_map is not None else info['module']
+            for i, info in enumerate(self.feature_info) if i in out_indices}
+        self.graph_module = create_feature_extractor(model, return_nodes)
+
+    def forward(self, x):
+        return list(self.graph_module(x).values())
+
+
+class GraphExtractNet(nn.Module):
+    """ A standalone feature extraction wrapper that maps dict -> list or single tensor
+    NOTE:
+      * one can use feature_extractor directly if dictionary output is desired
+      * unlike FeatureGraphNet, this is intended to be used standalone and not with model feature_info
+      metadata for builtin feature extraction mode
+      * create_feature_extractor can be used directly if dictionary output is acceptable
+
+    Args:
+        model: model to extract features from
+        return_nodes: node names to return features from (dict or list)
+        squeeze_out: if only one output, and output in list format, flatten to single tensor
+    """
+    def __init__(self, model, return_nodes: Union[Dict[str, str], List[str]], squeeze_out: bool = True):
+        super().__init__()
+        self.squeeze_out = squeeze_out
+        self.graph_module = create_feature_extractor(model, return_nodes)
+
+    def forward(self, x) -> Union[List[torch.Tensor], torch.Tensor]:
+        out = list(self.graph_module(x).values())
+        if self.squeeze_out and len(out) == 1:
+            return out[0]
+        return out
diff --git a/src/custom_timm/models/gcvit.py b/src/custom_timm/models/gcvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8984dfe2b60b1e574ed42458bce292ce8bf1fe2
--- /dev/null
+++ b/src/custom_timm/models/gcvit.py
@@ -0,0 +1,592 @@
+""" Global Context ViT
+
+From scratch implementation of GCViT in the style of timm swin_transformer_v2_cr.py
+
+Global Context Vision Transformers - https://arxiv.org/abs/2206.09959
+
+@article{hatamizadeh2022global,
+  title={Global Context Vision Transformers},
+  author={Hatamizadeh, Ali and Yin, Hongxu and Kautz, Jan and Molchanov, Pavlo},
+  journal={arXiv preprint arXiv:2206.09959},
+  year={2022}
+}
+
+Free of any code related to NVIDIA GCVit impl at https://github.com/NVlabs/GCVit.
+The license for this code release is Apache 2.0 with no commercial restrictions.
+
+However, weight files adapted from NVIDIA GCVit impl ARE under a non-commercial share-alike license
+(https://creativecommons.org/licenses/by-nc-sa/4.0/) until I have a chance to train new ones...
+
+Hacked together by / Copyright 2022, Ross Wightman
+"""
+import math
+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg, named_apply
+from .layers import DropPath, to_2tuple, to_ntuple, Mlp, ClassifierHead, LayerNorm2d,\
+    get_attn, get_act_layer, get_norm_layer, _assert
+from .registry import register_model
+from .vision_transformer_relpos import RelPosMlp, RelPosBias  # FIXME move to common location
+
+__all__ = ['GlobalContextVit']
+
+
+def _cfg(url='', **kwargs):
+    # shared pretrained-config defaults for the GCViT variants; **kwargs entries take precedence
+    cfg = {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bicubic', 'mean': IMAGENET_DEFAULT_MEAN,
+        'std': IMAGENET_DEFAULT_STD, 'first_conv': 'stem.conv1', 'classifier': 'head.fc',
+        'fixed_input_size': True,
+    }
+    return {**cfg, **kwargs}
+
+
+default_cfgs = {  # one pretrained config per registered variant; per the module docstring these NVIDIA-adapted weights are CC BY-NC-SA 4.0
+    'gcvit_xxtiny': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_xxtiny_224_nvidia-d1d86009.pth'),
+    'gcvit_xtiny': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_xtiny_224_nvidia-274b92b7.pth'),
+    'gcvit_tiny': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_tiny_224_nvidia-ac783954.pth'),
+    'gcvit_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_small_224_nvidia-4e98afa2.pth'),
+    'gcvit_base': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_base_224_nvidia-f009139b.pth'),
+}
+
+
+class MbConvBlock(nn.Module):
+    """ A depthwise separable / fused mbconv style residual block with SE, no norm.
+    """
+    def __init__(
+            self,
+            in_chs,
+            out_chs=None,
+            expand_ratio=1.0,
+            attn_layer='se',
+            bias=False,
+            act_layer=nn.GELU,
+    ):
+        super().__init__()
+        attn_kwargs = dict(act_layer=act_layer)
+        if isinstance(attn_layer, str) and attn_layer in ('se', 'eca'):  # explicit grouping; was `a and b or c`
+            attn_kwargs['rd_ratio'] = 0.25
+            attn_kwargs['bias'] = False
+        attn_layer = get_attn(attn_layer)  # resolve the name (or pass-through value) to a layer factory
+        out_chs = out_chs or in_chs  # output width falls back to the input width
+        mid_chs = int(expand_ratio * in_chs)
+
+        self.conv_dw = nn.Conv2d(in_chs, mid_chs, 3, 1, 1, groups=in_chs, bias=bias)  # 3x3 depthwise, stride 1
+        self.act = act_layer()
+        self.se = attn_layer(mid_chs, **attn_kwargs)
+        self.conv_pw = nn.Conv2d(mid_chs, out_chs, 1, 1, 0, bias=bias)  # 1x1 pointwise projection
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv_dw(x)
+        x = self.act(x)
+        x = self.se(x)
+        x = self.conv_pw(x)
+        x = x + shortcut  # residual add: conv_pw output must be shape-compatible with the input
+        return x
+
+
+class Downsample2d(nn.Module):  # norm1 -> MbConvBlock -> spatial reduction -> norm2
+    def __init__(
+            self,
+            dim,
+            dim_out=None,
+            reduction='conv',
+            act_layer=nn.GELU,
+            norm_layer=LayerNorm2d,  # NOTE in NCHW
+    ):
+        super().__init__()
+        dim_out = dim_out or dim  # output width falls back to the input width
+
+        self.norm1 = norm_layer(dim) if norm_layer is not None else nn.Identity()
+        self.conv_block = MbConvBlock(dim, act_layer=act_layer)
+        assert reduction in ('conv', 'max', 'avg')
+        if reduction == 'conv':
+            self.reduction = nn.Conv2d(dim, dim_out, 3, 2, 1, bias=False)  # 3x3, stride 2, pad 1; only mode that may change channels
+        elif reduction == 'max':
+            assert dim == dim_out  # pooling leaves the channel count untouched
+            self.reduction = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        else:
+            assert dim == dim_out  # pooling leaves the channel count untouched
+            self.reduction = nn.AvgPool2d(kernel_size=2)
+        self.norm2 = norm_layer(dim_out) if norm_layer is not None else nn.Identity()
+
+    def forward(self, x):
+        x = self.norm1(x)
+        x = self.conv_block(x)
+        x = self.reduction(x)
+        x = self.norm2(x)
+        return x
+
+
+class FeatureBlock(nn.Module):  # stack of MbConvBlocks, the first `levels` of them each followed by a pooling layer
+    def __init__(
+            self,
+            dim,
+            levels=0,
+            reduction='max',
+            act_layer=nn.GELU,
+    ):
+        super().__init__()
+        reductions = levels  # number of pooling layers still to be added
+        levels = max(1, levels)  # always build at least one conv block, even when no pooling is requested
+        if reduction == 'avg':
+            pool_fn = partial(nn.AvgPool2d, kernel_size=2)
+        else:
+            pool_fn = partial(nn.MaxPool2d, kernel_size=3, stride=2, padding=1)  # any value other than 'avg' selects max pooling
+        self.blocks = nn.Sequential()
+        for i in range(levels):
+            self.blocks.add_module(f'conv{i+1}', MbConvBlock(dim, act_layer=act_layer))
+            if reductions:
+                self.blocks.add_module(f'pool{i+1}', pool_fn())
+                reductions -= 1
+
+    def forward(self, x):
+        return self.blocks(x)
+
+
+class Stem(nn.Module):  # stride-2 conv followed by a Downsample2d using its default 'conv' (stride-2) reduction
+    def __init__(
+            self,
+            in_chs: int = 3,
+            out_chs: int = 96,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = LayerNorm2d,  # NOTE stem in NCHW
+    ):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_chs, out_chs, kernel_size=3, stride=2, padding=1)
+        self.down = Downsample2d(out_chs, act_layer=act_layer, norm_layer=norm_layer)  # dim_out unset, so channels stay at out_chs
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.down(x)
+        return x
+
+
+class WindowAttentionGlobal(nn.Module):
+    """ Windowed multi-head attention; with `use_global` set, queries come from `q_global` instead of `x`. """
+    def __init__(
+            self,
+            dim: int,
+            num_heads: int,
+            window_size: Tuple[int, int],
+            use_global: bool = True,
+            qkv_bias: bool = True,
+            attn_drop: float = 0.,
+            proj_drop: float = 0.,
+    ):
+        super().__init__()
+        window_size = to_2tuple(window_size)
+        self.window_size = window_size
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.use_global = use_global
+
+        self.rel_pos = RelPosBias(window_size=window_size, num_heads=num_heads)
+        if self.use_global:
+            self.qkv = nn.Linear(dim, dim * 2, bias=qkv_bias)  # projects k and v only; q is supplied via q_global
+        else:
+            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, q_global: Optional[torch.Tensor] = None):
+        B, N, C = x.shape
+        if self.use_global and q_global is not None:
+            _assert(x.shape[-1] == q_global.shape[-1], 'x and q_global channel dims should be equal')
+
+            kv = self.qkv(x)
+            kv = kv.reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+            k, v = kv.unbind(0)
+
+            q = q_global.repeat(B // q_global.shape[0], 1, 1, 1)  # tile the 4-d global query along dim 0 up to x's batch size
+            q = q.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        else:  # NOTE: a use_global=True module called without q_global lands here with a dim * 2 projection and the reshape fails
+            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+            q, k, v = qkv.unbind(0)
+        q = q * self.scale
+
+        attn = (q @ k.transpose(-2, -1))
+        attn = self.rel_pos(attn)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)  # merge heads back into the channel dim
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+def window_partition(x, window_size: Tuple[int, int]):
+    B, H, W, C = x.shape  # channels-last input; H and W are split by the window dims via view below
+    win_h, win_w = window_size
+    tiles = x.view(B, H // win_h, win_h, W // win_w, win_w, C).permute(0, 1, 3, 2, 4, 5)
+    return tiles.contiguous().view(-1, win_h, win_w, C)
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]):
+    H, W = img_size
+    B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1]))  # batch = total windows / windows per image
+    x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)  # interleave window rows/cols back into a (B, H, W, C) map
+    return x
+
+
+class LayerScale(nn.Module):  # multiplies its input by a learnable tensor `gamma`
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))  # every entry starts at init_values
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma  # inplace=True mutates the caller's tensor
+
+
+class GlobalContextVitBlock(nn.Module):  # pre-norm residual block: windowed attention, then MLP
+    def __init__(
+            self,
+            dim: int,
+            feat_size: Tuple[int, int],
+            num_heads: int,
+            window_size: int = 7,
+            mlp_ratio: float = 4.,
+            use_global: bool = True,
+            qkv_bias: bool = True,
+            layer_scale: Optional[float] = None,
+            proj_drop: float = 0.,
+            attn_drop: float = 0.,
+            drop_path: float = 0.,
+            attn_layer: Callable = WindowAttentionGlobal,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = nn.LayerNorm,
+    ):
+        super().__init__()
+        feat_size = to_2tuple(feat_size)
+        window_size = to_2tuple(window_size)
+        self.window_size = window_size
+        self.num_windows = int((feat_size[0] // window_size[0]) * (feat_size[1] // window_size[1]))  # windows per feature map
+
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_layer(
+            dim,
+            num_heads=num_heads,
+            window_size=window_size,
+            use_global=use_global,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=proj_drop,
+        )
+        self.ls1 = LayerScale(dim, layer_scale) if layer_scale is not None else nn.Identity()  # layer scale is opt-in
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=proj_drop)
+        self.ls2 = LayerScale(dim, layer_scale) if layer_scale is not None else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def _window_attn(self, x, q_global: Optional[torch.Tensor] = None):
+        B, H, W, C = x.shape  # channels-last feature map
+        x_win = window_partition(x, self.window_size)
+        x_win = x_win.view(-1, self.window_size[0] * self.window_size[1], C)  # flatten each window into a token sequence
+        attn_win = self.attn(x_win, q_global)
+        x = window_reverse(attn_win, self.window_size, (H, W))  # stitch the windows back into (B, H, W, C)
+        return x
+
+    def forward(self, x, q_global: Optional[torch.Tensor] = None):
+        x = x + self.drop_path1(self.ls1(self._window_attn(self.norm1(x), q_global)))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class GlobalContextVitStage(nn.Module):
+    """ One GCViT stage: optional 2x downsample, global query generation, then `depth` attention blocks. """
+    def __init__(
+            self,
+            dim,
+            depth: int,
+            num_heads: int,
+            feat_size: Tuple[int, int],
+            window_size: Tuple[int, int],
+            downsample: bool = True,
+            global_norm: bool = False,
+            stage_norm: bool = False,
+            mlp_ratio: float = 4.,
+            qkv_bias: bool = True,
+            layer_scale: Optional[float] = None,
+            proj_drop: float = 0.,
+            attn_drop: float = 0.,
+            drop_path: Union[List[float], float] = 0.0,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = nn.LayerNorm,  # NOTE(review): used on NCHW input in Downsample2d; default looks swapped with norm_layer_cl
+            norm_layer_cl: Callable = LayerNorm2d,  # NOTE(review): used on NHWC tensors below; GlobalContextVit always passes both explicitly
+    ):
+        super().__init__()
+        if downsample:
+            self.downsample = Downsample2d(
+                dim=dim,
+                dim_out=dim * 2,
+                norm_layer=norm_layer,
+            )
+            dim = dim * 2  # everything after the downsample runs at the doubled width
+            feat_size = (feat_size[0] // 2, feat_size[1] // 2)
+        else:
+            self.downsample = nn.Identity()
+        window_size = to_2tuple(window_size)
+
+        feat_levels = int(math.log2(min(feat_size) / min(window_size)))  # 2x pools needed to shrink the map to window size
+        self.global_block = FeatureBlock(dim, feat_levels)
+        self.global_norm = norm_layer_cl(dim) if global_norm else nn.Identity()
+
+        self.blocks = nn.ModuleList([
+            GlobalContextVitBlock(
+                dim=dim,
+                num_heads=num_heads,
+                feat_size=feat_size,
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                use_global=(i % 2 != 0),  # odd-indexed blocks attend with the global query
+                layer_scale=layer_scale,
+                proj_drop=proj_drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                act_layer=act_layer,
+                norm_layer=norm_layer_cl,
+            )
+            for i in range(depth)
+        ])
+        self.norm = norm_layer_cl(dim) if stage_norm else nn.Identity()
+        self.dim = dim
+        self.feat_size = feat_size
+        self.grad_checkpointing = False
+
+    def forward(self, x):
+        # input NCHW, downsample & global block are 2d conv + pooling
+        x = self.downsample(x)
+        global_query = self.global_block(x)
+
+        # reshape NCHW --> NHWC for transformer blocks
+        x = x.permute(0, 2, 3, 1)
+        global_query = self.global_norm(global_query.permute(0, 2, 3, 1))
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint.checkpoint(blk, x, global_query)  # global query must be forwarded, use_global blocks need it
+            else:
+                x = blk(x, global_query)
+        x = self.norm(x)
+        x = x.permute(0, 3, 1, 2).contiguous()  # back to NCHW
+        return x
+
+
+class GlobalContextVit(nn.Module):  # Stem -> sequence of GlobalContextVitStage -> ClassifierHead
+    def __init__(
+            self,
+            in_chans: int = 3,
+            num_classes: int = 1000,
+            global_pool: str = 'avg',
+            img_size: Tuple[int, int] = 224,  # an int is accepted too; it goes through to_2tuple below
+            window_ratio: Tuple[int, ...] = (32, 32, 16, 32),  # per-stage img_size // ratio gives the window size
+            window_size: Tuple[int, ...] = None,  # when given, takes priority over window_ratio
+            embed_dim: int = 64,
+            depths: Tuple[int, ...] = (3, 4, 19, 5),
+            num_heads: Tuple[int, ...] = (2, 4, 8, 16),
+            mlp_ratio: float = 3.0,
+            qkv_bias: bool = True,
+            layer_scale: Optional[float] = None,
+            drop_rate: float = 0.,
+            proj_drop_rate: float = 0.,
+            attn_drop_rate: float = 0.,
+            drop_path_rate: float = 0.,
+            weight_init='',  # empty string skips the custom init pass
+            act_layer: str = 'gelu',
+            norm_layer: str = 'layernorm2d',
+            norm_layer_cl: str = 'layernorm',
+            norm_eps: float = 1e-5,
+    ):
+        super().__init__()
+        act_layer = get_act_layer(act_layer)
+        norm_layer = partial(get_norm_layer(norm_layer), eps=norm_eps)
+        norm_layer_cl = partial(get_norm_layer(norm_layer_cl), eps=norm_eps)
+
+        img_size = to_2tuple(img_size)
+        feat_size = tuple(d // 4 for d in img_size)  # stem reduction by 4
+        self.global_pool = global_pool
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        num_stages = len(depths)
+        self.num_features = int(embed_dim * 2 ** (num_stages - 1))  # width doubles in every stage after the first
+        if window_size is not None:
+            window_size = to_ntuple(num_stages)(window_size)
+        else:
+            assert window_ratio is not None
+            window_size = tuple([(img_size[0] // r, img_size[1] // r) for r in to_ntuple(num_stages)(window_ratio)])
+
+        self.stem = Stem(
+            in_chs=in_chans,
+            out_chs=embed_dim,
+            act_layer=act_layer,
+            norm_layer=norm_layer
+        )
+
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]  # linear ramp, chunked per stage
+        stages = []
+        for i in range(num_stages):
+            last_stage = i == num_stages - 1
+            stage_scale = 2 ** max(i - 1, 0)  # scale of the stage's *input*; a downsampling stage doubles dim itself
+            stages.append(GlobalContextVitStage(
+                dim=embed_dim * stage_scale,
+                depth=depths[i],
+                num_heads=num_heads[i],
+                feat_size=(feat_size[0] // stage_scale, feat_size[1] // stage_scale),
+                window_size=window_size[i],
+                downsample=i != 0,  # the first stage works at stem resolution
+                stage_norm=last_stage,  # only the final stage applies an output norm
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                layer_scale=layer_scale,
+                proj_drop=proj_drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                norm_layer_cl=norm_layer_cl,
+            ))
+        self.stages = nn.Sequential(*stages)
+
+        # Classifier head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        if weight_init:
+            named_apply(partial(self._init_weights, scheme=weight_init), self)
+
+    def _init_weights(self, module, name, scheme='vit'):
+        # note Conv2d left as default init
+        if scheme == 'vit':
+            if isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    if 'mlp' in name:  # MLP biases get a tiny normal init, every other bias is zeroed
+                        nn.init.normal_(module.bias, std=1e-6)
+                    else:
+                        nn.init.zeros_(module.bias)
+        else:  # any scheme other than 'vit'
+            if isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, std=.02)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {  # names of parameters whose path contains one of the relative-position substrings
+            k for k, _ in self.named_parameters()
+            if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(  # `coarse` is accepted but not consulted here
+            stem=r'^stem',  # stem and embed
+            blocks=r'^stages\.(\d+)'
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:  # the flag lives on each stage, not on the model
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is None:  # keep the current head's pooling type
+            global_pool = self.head.global_pool.pool_type
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.stem(x)
+        x = self.stages(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_gcvit(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only', None):  # feature-extraction mode is rejected up front
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    model = build_model_with_cfg(GlobalContextVit, variant, pretrained, **kwargs)  # remaining kwargs go to the builder
+    return model
+
+
+@register_model
+def gcvit_xxtiny(pretrained=False, **kwargs):
+    """ GCViT-XXTiny: depths (2, 2, 6, 2), heads (2, 4, 8, 16); other settings come from kwargs or class defaults.
+    """
+    depths, num_heads = (2, 2, 6, 2), (2, 4, 8, 16)
+    variant_args = dict(depths=depths, num_heads=num_heads, **kwargs)
+    return _create_gcvit('gcvit_xxtiny', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def gcvit_xtiny(pretrained=False, **kwargs):
+    """ GCViT-XTiny: depths (3, 4, 6, 5), heads (2, 4, 8, 16); other settings come from kwargs or class defaults.
+    """
+    depths, num_heads = (3, 4, 6, 5), (2, 4, 8, 16)
+    variant_args = dict(depths=depths, num_heads=num_heads, **kwargs)
+    return _create_gcvit('gcvit_xtiny', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def gcvit_tiny(pretrained=False, **kwargs):
+    """ GCViT-Tiny: depths (3, 4, 19, 5), heads (2, 4, 8, 16); other settings come from kwargs or class defaults.
+    """
+    depths, num_heads = (3, 4, 19, 5), (2, 4, 8, 16)
+    variant_args = dict(depths=depths, num_heads=num_heads, **kwargs)
+    return _create_gcvit('gcvit_tiny', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def gcvit_small(pretrained=False, **kwargs):
+    """ GCViT-Small: embed_dim 96, depths (3, 4, 19, 5), heads (3, 6, 12, 24), mlp_ratio 2, layer scale 1e-5.
+    """
+    variant_args = dict(
+        depths=(3, 4, 19, 5), num_heads=(3, 6, 12, 24),
+        embed_dim=96, mlp_ratio=2, layer_scale=1e-5,
+        **kwargs,
+    )
+    return _create_gcvit('gcvit_small', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def gcvit_base(pretrained=False, **kwargs):
+    """ GCViT-Base: embed_dim 128, depths (3, 4, 19, 5), heads (4, 8, 16, 32), mlp_ratio 2, layer scale 1e-5.
+    """
+    variant_args = dict(
+        depths=(3, 4, 19, 5), num_heads=(4, 8, 16, 32),
+        embed_dim=128, mlp_ratio=2, layer_scale=1e-5,
+        **kwargs,
+    )
+    return _create_gcvit('gcvit_base', pretrained=pretrained, **variant_args)
diff --git a/src/custom_timm/models/ghostnet.py b/src/custom_timm/models/ghostnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f31127dd86409b5fe2e9b54036e72a0a938da09f
--- /dev/null
+++ b/src/custom_timm/models/ghostnet.py
@@ -0,0 +1,302 @@
+"""
+An implementation of GhostNet Model as defined in:
+GhostNet: More Features from Cheap Operations. https://arxiv.org/abs/1911.11907
+The train script of the model is similar to that of MobileNetV3
+Original model: https://github.com/huawei-noah/CV-backbones/tree/master/ghostnet_pytorch
+"""
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .layers import SelectAdaptivePool2d, Linear, make_divisible
+from .efficientnet_blocks import SqueezeExcite, ConvBnAct
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .registry import register_model
+
+
+__all__ = ['GhostNet']
+
+
+def _cfg(url='', **kwargs):
+    # default pretrained-config entries for GhostNet; **kwargs entries take precedence
+    cfg = {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear', 'mean': IMAGENET_DEFAULT_MEAN,
+        'std': IMAGENET_DEFAULT_STD, 'first_conv': 'conv_stem', 'classifier': 'classifier',
+    }
+    return {**cfg, **kwargs}
+
+
+default_cfgs = {  # one pretrained config per registered GhostNet variant
+    'ghostnet_050': _cfg(url=''),  # no weight URL configured
+    'ghostnet_100': _cfg(
+        url='https://github.com/huawei-noah/CV-backbones/releases/download/ghostnet_pth/ghostnet_1x.pth'),
+    'ghostnet_130': _cfg(url=''),  # no weight URL configured
+}
+
+
+_SE_LAYER = partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=partial(make_divisible, divisor=4))  # SE factory used by GhostBottleneck
+
+
+class GhostModule(nn.Module):  # primary conv makes part of the output channels, a depthwise "cheap" conv derives the rest
+    def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True):
+        super(GhostModule, self).__init__()
+        self.oup = oup
+        init_channels = math.ceil(oup / ratio)  # channels produced by the primary conv
+        new_channels = init_channels * (ratio - 1)  # channels produced by the cheap operation
+
+        self.primary_conv = nn.Sequential(
+            nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False),
+            nn.BatchNorm2d(init_channels),
+            nn.ReLU(inplace=True) if relu else nn.Sequential(),  # empty Sequential acts as a no-op
+        )
+
+        self.cheap_operation = nn.Sequential(
+            nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False),
+            nn.BatchNorm2d(new_channels),
+            nn.ReLU(inplace=True) if relu else nn.Sequential(),  # empty Sequential acts as a no-op
+        )
+
+    def forward(self, x):
+        x1 = self.primary_conv(x)
+        x2 = self.cheap_operation(x1)
+        out = torch.cat([x1, x2], dim=1)
+        return out[:, :self.oup, :, :]  # ceil() above can overshoot, so trim to exactly oup channels
+
+
+class GhostBottleneck(nn.Module):
+    """ Ghost bottleneck w/ optional SE"""
+
+    def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3,
+                 stride=1, act_layer=nn.ReLU, se_ratio=0.):  # act_layer is accepted but not used in this body
+        super(GhostBottleneck, self).__init__()
+        has_se = se_ratio is not None and se_ratio > 0.
+        self.stride = stride
+
+        # Point-wise expansion
+        self.ghost1 = GhostModule(in_chs, mid_chs, relu=True)
+
+        # Depth-wise convolution
+        if self.stride > 1:  # only strided blocks get the extra depthwise conv + BN
+            self.conv_dw = nn.Conv2d(
+                mid_chs, mid_chs, dw_kernel_size, stride=stride,
+                padding=(dw_kernel_size-1)//2, groups=mid_chs, bias=False)
+            self.bn_dw = nn.BatchNorm2d(mid_chs)
+        else:
+            self.conv_dw = None
+            self.bn_dw = None
+
+        # Squeeze-and-excitation
+        self.se = _SE_LAYER(mid_chs, rd_ratio=se_ratio) if has_se else None
+
+        # Point-wise linear projection
+        self.ghost2 = GhostModule(mid_chs, out_chs, relu=False)
+
+        # shortcut
+        if in_chs == out_chs and self.stride == 1:
+            self.shortcut = nn.Sequential()  # empty Sequential: identity shortcut
+        else:
+            self.shortcut = nn.Sequential(  # depthwise (strided) conv + BN, then 1x1 conv + BN to match shape
+                nn.Conv2d(
+                    in_chs, in_chs, dw_kernel_size, stride=stride,
+                    padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False),
+                nn.BatchNorm2d(in_chs),
+                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_chs),
+            )
+
+    def forward(self, x):
+        shortcut = x
+
+        # 1st ghost bottleneck
+        x = self.ghost1(x)
+
+        # Depth-wise convolution
+        if self.conv_dw is not None:
+            x = self.conv_dw(x)
+            x = self.bn_dw(x)
+
+        # Squeeze-and-excitation
+        if self.se is not None:
+            x = self.se(x)
+
+        # 2nd ghost bottleneck
+        x = self.ghost2(x)
+
+        x += self.shortcut(shortcut)  # in-place residual add onto the ghost2 output
+        return x
+
+
+class GhostNet(nn.Module):  # conv stem -> GhostBottleneck stages -> 1x1 ConvBnAct -> pool -> conv_head -> classifier
+    def __init__(
+            self, cfgs, num_classes=1000, width=1.0, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0.2):
+        super(GhostNet, self).__init__()
+        # setting of inverted residual blocks
+        assert output_stride == 32, 'only output_stride==32 is valid, dilation not supported'
+        self.cfgs = cfgs
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+        self.feature_info = []
+
+        # building first layer
+        stem_chs = make_divisible(16 * width, 4)
+        self.conv_stem = nn.Conv2d(in_chans, stem_chs, 3, 2, 1, bias=False)
+        self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module='conv_stem'))
+        self.bn1 = nn.BatchNorm2d(stem_chs)
+        self.act1 = nn.ReLU(inplace=True)
+        prev_chs = stem_chs
+
+        # building inverted residual blocks
+        stages = nn.ModuleList([])
+        block = GhostBottleneck
+        stage_idx = 0
+        net_stride = 2  # the stem conv already reduces by 2
+        for cfg in self.cfgs:
+            layers = []
+            s = 1
+            for k, exp_size, c, se_ratio, s in cfg:  # kernel, expansion chs, output chs, SE ratio, stride
+                out_chs = make_divisible(c * width, 4)
+                mid_chs = make_divisible(exp_size * width, 4)
+                layers.append(block(prev_chs, mid_chs, out_chs, k, s, se_ratio=se_ratio))
+                prev_chs = out_chs
+            if s > 1:  # `s` is the stride of the last block in this stage
+                net_stride *= 2
+                self.feature_info.append(dict(
+                    num_chs=prev_chs, reduction=net_stride, module=f'blocks.{stage_idx}'))
+            stages.append(nn.Sequential(*layers))
+            stage_idx += 1
+
+        out_chs = make_divisible(exp_size * width, 4)  # exp_size left over from the final cfg row
+        stages.append(nn.Sequential(ConvBnAct(prev_chs, out_chs, 1)))
+        self.pool_dim = prev_chs = out_chs
+
+        self.blocks = nn.Sequential(*stages)
+
+        # building last several layers
+        self.num_features = out_chs = 1280
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.conv_head = nn.Conv2d(prev_chs, out_chs, 1, 1, 0, bias=True)  # maps pool_dim channels to num_features
+        self.act2 = nn.ReLU(inplace=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
+        self.classifier = Linear(out_chs, num_classes) if num_classes > 0 else nn.Identity()
+
+        # FIXME init
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^conv_stem|bn1',
+            blocks=[
+                (r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)', None),
+                (r'conv_head', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        # cannot meaningfully change pooling of efficient head after creation
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
+        self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()  # fed by conv_head, not pool_dim
+
+    def forward_features(self, x):
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x, flatten=True)
+        else:
+            x = self.blocks(x)
+        return x
+
+    def forward_head(self, x):
+        x = self.global_pool(x)  # pooling happens before conv_head
+        x = self.conv_head(x)
+        x = self.act2(x)
+        x = self.flatten(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        x = self.classifier(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_ghostnet(variant, width=1.0, pretrained=False, **kwargs):
+    """
+    Constructs a GhostNet model for `variant`; `width` and any extra kwargs are passed through to the model builder
+    """
+    cfgs = [
+        # k (dw kernel), t (expansion chs), c (output chs), SE (se ratio), s (stride); each inner list is one stage
+        # stage1
+        [[3,  16,  16, 0, 1]],
+        # stage2
+        [[3,  48,  24, 0, 2]],
+        [[3,  72,  24, 0, 1]],
+        # stage3
+        [[5,  72,  40, 0.25, 2]],
+        [[5, 120,  40, 0.25, 1]],
+        # stage4
+        [[3, 240,  80, 0, 2]],
+        [[3, 200,  80, 0, 1],
+         [3, 184,  80, 0, 1],
+         [3, 184,  80, 0, 1],
+         [3, 480, 112, 0.25, 1],
+         [3, 672, 112, 0.25, 1]
+        ],
+        # stage5
+        [[5, 672, 160, 0.25, 2]],
+        [[5, 960, 160, 0, 1],
+         [5, 960, 160, 0.25, 1],
+         [5, 960, 160, 0, 1],
+         [5, 960, 160, 0.25, 1]
+        ]
+    ]
+    model_kwargs = dict(
+        cfgs=cfgs,
+        width=width,
+        **kwargs,
+    )
+    return build_model_with_cfg(
+        GhostNet, variant, pretrained,
+        feature_cfg=dict(flatten_sequential=True),
+        **model_kwargs)
+
+
+@register_model
+def ghostnet_050(pretrained=False, **kwargs):
+    """ GhostNet with a 0.5x width multiplier """
+    # extra kwargs travel untouched to _create_ghostnet
+    return _create_ghostnet('ghostnet_050', width=0.5, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ghostnet_100(pretrained=False, **kwargs):
+    """ GhostNet with a 1.0x width multiplier """
+    # extra kwargs travel untouched to _create_ghostnet
+    return _create_ghostnet('ghostnet_100', width=1.0, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ghostnet_130(pretrained=False, **kwargs):
+    """ GhostNet with a 1.3x width multiplier """
+    # extra kwargs travel untouched to _create_ghostnet
+    return _create_ghostnet('ghostnet_130', width=1.3, pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/maxxvit.py b/src/custom_timm/models/maxxvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..f01e0812e86cb6a205d0bb18adf7de1d03a3e318
--- /dev/null
+++ b/src/custom_timm/models/maxxvit.py
@@ -0,0 +1,1914 @@
+""" MaxVit and CoAtNet Vision Transformer - CNN Hybrids in PyTorch
+
+This is a from-scratch implementation of both CoAtNet and MaxVit in PyTorch.
+
+99% of the implementation was done from the papers; however, some last-minute adjustments were made
+based on the (as yet unfinished?) public code release https://github.com/google-research/maxvit
+
+There are multiple sets of models defined for both architectures. Typically, names with a
+ `_rw` suffix are my own original configs prior to referencing https://github.com/google-research/maxvit.
+These configs work well and appear to be a bit faster / lower resource than the paper.
+
+The models without an extra prefix / suffix (coatnet_0_224, maxvit_tiny_224, etc.) are intended to
+match the paper, BUT without any official pretrained weights it's difficult to confirm a 100% match.
+
+# FIXME / WARNING
+This impl remains a WIP, some configs and models may vanish or change...
+
+Papers:
+
+MaxViT: Multi-Axis Vision Transformer - https://arxiv.org/abs/2204.01697
+@article{tu2022maxvit,
+  title={MaxViT: Multi-Axis Vision Transformer},
+  author={Tu, Zhengzhong and Talebi, Hossein and Zhang, Han and Yang, Feng and Milanfar, Peyman and Bovik, Alan and Li, Yinxiao},
+  journal={ECCV},
+  year={2022},
+}
+
+CoAtNet: Marrying Convolution and Attention for All Data Sizes - https://arxiv.org/abs/2106.04803
+@article{DBLP:journals/corr/abs-2106-04803,
+  author    = {Zihang Dai and Hanxiao Liu and Quoc V. Le and Mingxing Tan},
+  title     = {CoAtNet: Marrying Convolution and Attention for All Data Sizes},
+  journal   = {CoRR},
+  volume    = {abs/2106.04803},
+  year      = {2021}
+}
+
+Hacked together by / Copyright 2022, Ross Wightman
+"""
+
+import math
+from collections import OrderedDict
+from dataclasses import dataclass, replace, field
+from functools import partial
+from typing import Callable, Optional, Union, Tuple, List
+
+import torch
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq, named_apply
+from .fx_features import register_notrace_function
+from .layers import Mlp, ConvMlp, DropPath, ClassifierHead, trunc_normal_tf_, LayerNorm2d, LayerNorm
+from .layers import create_attn, get_act_layer, get_norm_layer, get_norm_act_layer, create_conv2d
+from .layers import to_2tuple, extend_tuple, make_divisible, _assert
+from .registry import register_model
+from .vision_transformer_relpos import RelPosMlp, RelPosBias  # FIXME move these to common location
+
+__all__ = ['MaxxVitCfg', 'MaxxVitConvCfg', 'MaxxVitTransformerCfg', 'MaxxVit']
+
+
+def _cfg(url='', **kwargs):  # build one pretrained-config dict from the shared defaults below
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.95, 'interpolation': 'bicubic',
+        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
+        'first_conv': 'stem.conv1', 'classifier': 'head.fc',
+        'fixed_input_size': True,
+        **kwargs  # unpacked last, so any key passed by the caller overrides the default above
+    }
+
+
+default_cfgs = {
+    # Fiddling with configs / defaults / still pretraining
+    'coatnet_pico_rw_224': _cfg(url=''),
+    'coatnet_nano_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_nano_rw_224_sw-f53093b4.pth',
+        crop_pct=0.9),
+    'coatnet_0_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_0_rw_224_sw-a6439706.pth'),
+    'coatnet_1_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth'
+    ),
+    'coatnet_2_rw_224': _cfg(url=''),
+    'coatnet_3_rw_224': _cfg(url=''),
+
+    # Highly experimental configs
+    'coatnet_bn_0_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_bn_0_rw_224_sw-c228e218.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        crop_pct=0.95),
+    'coatnet_rmlp_nano_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_nano_rw_224_sw-bd1d51b3.pth',
+        crop_pct=0.9),
+    'coatnet_rmlp_0_rw_224': _cfg(url=''),
+    'coatnet_rmlp_1_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'),
+    'coatnet_rmlp_2_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_2_rw_224_sw-5ccfac55.pth'),
+    'coatnet_rmlp_3_rw_224': _cfg(url=''),
+    'coatnet_nano_cc_224': _cfg(url=''),
+    'coatnext_nano_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnext_nano_rw_224_ad-22cb71c2.pth',
+        crop_pct=0.9),
+
+    # Trying to be like the CoAtNet paper configs
+    'coatnet_0_224': _cfg(url=''),
+    'coatnet_1_224': _cfg(url=''),
+    'coatnet_2_224': _cfg(url=''),
+    'coatnet_3_224': _cfg(url=''),
+    'coatnet_4_224': _cfg(url=''),
+    'coatnet_5_224': _cfg(url=''),
+
+    # Experimental configs
+    'maxvit_pico_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_nano_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_nano_rw_256_sw-fb127241.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_tiny_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_tiny_rw_224_sw-7d0dffeb.pth'),
+    'maxvit_tiny_rw_256': _cfg(
+        url='',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_rmlp_pico_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_pico_rw_256_sw-8d82f2c6.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_rmlp_nano_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_rmlp_tiny_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_rmlp_small_rw_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_small_rw_224_sw-6ef0ae4f.pth',
+        crop_pct=0.9,
+    ),
+    'maxvit_rmlp_small_rw_256': _cfg(
+        url='',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    'maxxvit_rmlp_nano_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_nano_rw_256_sw-0325d459.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxxvit_rmlp_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxxvit_rmlp_small_rw_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_small_rw_256_sw-37e217ff.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    # Trying to be like the MaxViT paper configs
+    'maxvit_tiny_224': _cfg(url=''),
+    'maxvit_small_224': _cfg(url=''),
+    'maxvit_base_224': _cfg(url=''),
+    'maxvit_large_224': _cfg(url=''),
+    'maxvit_xlarge_224': _cfg(url=''),
+}
+
+
+@dataclass
+class MaxxVitTransformerCfg:  # options for the transformer / attention blocks (read e.g. by TransformerBlock2d)
+    dim_head: int = 32  # channels per attention head (num_heads = attention width // dim_head)
+    expand_ratio: float = 4.0  # MLP hidden width = dim_out * expand_ratio in TransformerBlock2d
+    expand_first: bool = True  # if True, attention q/k/v width follows dim_out instead of dim
+    shortcut_bias: bool = True  # bias flag passed to the Downsample2d shortcut of a stride-2 block
+    attn_bias: bool = True  # bias for the attention qkv and output projections
+    attn_drop: float = 0.  # dropout on the attention probabilities
+    proj_drop: float = 0.  # dropout after the attention projection; TransformerBlock2d also uses it as the MLP drop
+    pool_type: str = 'avg2'  # Downsample2d mode used by stride-2 blocks
+    rel_pos_type: str = 'bias'
+    rel_pos_dim: int = 512  # for relative position types w/ MLP
+    partition_ratio: int = 32
+    window_size: Optional[Tuple[int, int]] = None
+    grid_size: Optional[Tuple[int, int]] = None
+    init_values: Optional[float] = None  # LayerScale init value; None (or 0) disables LayerScale in TransformerBlock2d
+    act_layer: str = 'gelu'
+    norm_layer: str = 'layernorm2d'
+    norm_layer_cl: str = 'layernorm'
+    norm_eps: float = 1e-6  # eps handed to the norm layer constructor
+
+    def __post_init__(self):
+        if self.grid_size is not None:
+            self.grid_size = to_2tuple(self.grid_size)  # normalize via to_2tuple (presumably int -> (int, int))
+        if self.window_size is not None:
+            self.window_size = to_2tuple(self.window_size)
+            if self.grid_size is None:
+                self.grid_size = self.window_size  # grid size defaults to the window size when only the latter is given
+
+
+@dataclass
+class MaxxVitConvCfg:  # options for the conv blocks; empty / None fields are filled per block_type in __post_init__
+    block_type: str = 'mbconv'  # 'mbconv' or 'convnext' (checked in __post_init__)
+    expand_ratio: float = 4.0
+    expand_output: bool = True  # calculate expansion channels from output (vs input chs)
+    kernel_size: int = 3
+    group_size: int = 1  # 1 == depthwise
+    pre_norm_act: bool = False  # activation after pre-norm
+    output_bias: bool = True  # bias for shortcut + final 1x1 projection conv
+    stride_mode: str = 'dw'  # stride done via one of 'pool', '1x1', 'dw'
+    pool_type: str = 'avg2'
+    downsample_pool_type: str = 'avg2'  # '' / None -> falls back to pool_type in __post_init__
+    attn_early: bool = False  # apply attn between conv2 and norm2, instead of after norm2
+    attn_layer: str = 'se'
+    attn_act_layer: str = 'silu'
+    attn_ratio: float = 0.25
+    init_values: Optional[float] = 1e-6  # for ConvNeXt block, ignored by MBConv
+    act_layer: str = 'gelu'
+    norm_layer: str = ''  # '' -> 'batchnorm2d' for mbconv, 'layernorm2d' for convnext
+    norm_layer_cl: str = ''  # '' -> 'layernorm' for convnext; stays empty for mbconv
+    norm_eps: Optional[float] = None  # None -> 1e-5 for mbconv, 1e-6 for convnext
+
+    def __post_init__(self):
+        # mbconv vs convnext blocks have different defaults, set in post_init to avoid explicit config args
+        assert self.block_type in ('mbconv', 'convnext')
+        use_mbconv = self.block_type == 'mbconv'
+        if not self.norm_layer:
+            self.norm_layer = 'batchnorm2d' if use_mbconv else 'layernorm2d'
+        if not self.norm_layer_cl and not use_mbconv:
+            self.norm_layer_cl = 'layernorm'
+        if self.norm_eps is None:
+            self.norm_eps = 1e-5 if use_mbconv else 1e-6
+        self.downsample_pool_type = self.downsample_pool_type or self.pool_type  # NOTE(review): default 'avg2' is truthy, so this only fires for an explicit '' / None
+
+
+@dataclass
+class MaxxVitCfg:  # top-level model config: dims / depths / block types plus the conv and transformer sub-configs
+    embed_dim: Tuple[int, ...] = (96, 192, 384, 768)  # presumably one width per stage -- confirm against the model builder
+    depths: Tuple[int, ...] = (2, 3, 5, 2)  # presumably number of blocks per stage -- confirm against the model builder
+    block_type: Tuple[Union[str, Tuple[str, ...]], ...] = ('C', 'C', 'T', 'T')  # codes used in model_cfgs: 'C', 'T', 'M', 'PM'; an entry may be a tuple of codes
+    stem_width: Union[int, Tuple[int, int]] = 64  # a single int or a pair of ints (model_cfgs uses both forms)
+    stem_bias: bool = True
+    conv_cfg: MaxxVitConvCfg = field(default_factory=MaxxVitConvCfg)  # default_factory: each instance gets its own sub-config
+    transformer_cfg: MaxxVitTransformerCfg = field(default_factory=MaxxVitTransformerCfg)
+    weight_init: str = 'vit_eff'
+
+
+def _rw_coat_cfg(
+        stride_mode='pool',
+        pool_type='avg2',
+        conv_output_bias=False,
+        conv_attn_early=False,
+        conv_attn_act_layer='relu',
+        conv_norm_layer='',
+        transformer_shortcut_bias=True,
+        transformer_norm_layer='layernorm2d',
+        transformer_norm_layer_cl='layernorm',
+        init_values=None,
+        rel_pos_type='bias',
+        rel_pos_dim=512,
+):
+    # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit
+    # Common differences for initial timm models:
+    # - pre-norm layer in MBConv included an activation after norm
+    # - mbconv expansion calculated from input instead of output chs
+    # - mbconv shortcut and final 1x1 conv did not have a bias
+    # - SE act layer was relu, not silu
+    # - mbconv uses silu in timm, not gelu
+    # - expansion in attention block done via output proj, not input proj
+    # Variable differences (evolved over training initial models):
+    # - avg pool with kernel_size=2 favoured downsampling (instead of maxpool for coat)
+    # - SE attention was between conv2 and norm/act
+    # - default to avg pool for mbconv downsample instead of 1x1 or dw conv
+    # - transformer block shortcut has no bias
+    return dict(  # keys match MaxxVitCfg field names, so callers splat this with ** into MaxxVitCfg(...)
+        conv_cfg=MaxxVitConvCfg(
+            stride_mode=stride_mode,
+            pool_type=pool_type,
+            pre_norm_act=True,
+            expand_output=False,
+            output_bias=conv_output_bias,
+            attn_early=conv_attn_early,
+            attn_act_layer=conv_attn_act_layer,
+            act_layer='silu',
+            norm_layer=conv_norm_layer,
+        ),
+        transformer_cfg=MaxxVitTransformerCfg(
+            expand_first=False,
+            shortcut_bias=transformer_shortcut_bias,
+            pool_type=pool_type,  # the same pool_type is used for both the conv and transformer configs
+            init_values=init_values,  # only the transformer config receives init_values here
+            norm_layer=transformer_norm_layer,
+            norm_layer_cl=transformer_norm_layer_cl,
+            rel_pos_type=rel_pos_type,
+            rel_pos_dim=rel_pos_dim,
+        ),
+    )
+
+
+def _rw_max_cfg(
+        stride_mode='dw',
+        pool_type='avg2',
+        conv_output_bias=False,
+        conv_attn_ratio=1 / 16,
+        conv_norm_layer='',
+        transformer_norm_layer='layernorm2d',
+        transformer_norm_layer_cl='layernorm',
+        window_size=None,
+        dim_head=32,
+        init_values=None,
+        rel_pos_type='bias',
+        rel_pos_dim=512,
+):
+    # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit
+    # Differences of initial timm models:
+    # - mbconv expansion calculated from input instead of output chs
+    # - mbconv shortcut and final 1x1 conv did not have a bias
+    # - mbconv uses silu in timm, not gelu
+    # - expansion in attention block done via output proj, not input proj
+    return dict(  # keys match MaxxVitCfg field names, so callers splat this with ** into MaxxVitCfg(...)
+        conv_cfg=MaxxVitConvCfg(
+            stride_mode=stride_mode,
+            pool_type=pool_type,
+            expand_output=False,
+            output_bias=conv_output_bias,
+            attn_ratio=conv_attn_ratio,
+            act_layer='silu',
+            norm_layer=conv_norm_layer,
+        ),
+        transformer_cfg=MaxxVitTransformerCfg(
+            expand_first=False,
+            pool_type=pool_type,  # the same pool_type is used for both the conv and transformer configs
+            dim_head=dim_head,
+            window_size=window_size,
+            init_values=init_values,  # only the transformer config receives init_values here
+            norm_layer=transformer_norm_layer,
+            norm_layer_cl=transformer_norm_layer_cl,
+            rel_pos_type=rel_pos_type,
+            rel_pos_dim=rel_pos_dim,
+        ),
+    )
+
+
+def _next_cfg(
+        stride_mode='dw',
+        pool_type='avg2',
+        conv_norm_layer='layernorm2d',
+        conv_norm_layer_cl='layernorm',
+        transformer_norm_layer='layernorm2d',
+        transformer_norm_layer_cl='layernorm',
+        window_size=None,
+        init_values=1e-6,
+        rel_pos_type='mlp',  # MLP by default for maxxvit
+        rel_pos_dim=512,
+):
+    # For experimental models with convnext instead of mbconv
+    init_values = to_2tuple(init_values)  # [0] -> conv blocks, [1] -> transformer blocks; a scalar is presumably duplicated
+    return dict(  # keys match MaxxVitCfg field names, so callers splat this with ** into MaxxVitCfg(...)
+        conv_cfg=MaxxVitConvCfg(
+            block_type='convnext',
+            stride_mode=stride_mode,
+            pool_type=pool_type,
+            expand_output=False,
+            init_values=init_values[0],
+            norm_layer=conv_norm_layer,
+            norm_layer_cl=conv_norm_layer_cl,
+        ),
+        transformer_cfg=MaxxVitTransformerCfg(
+            expand_first=False,
+            pool_type=pool_type,  # the same pool_type is used for both the conv and transformer configs
+            window_size=window_size,
+            init_values=init_values[1],
+            norm_layer=transformer_norm_layer,
+            norm_layer_cl=transformer_norm_layer_cl,
+            rel_pos_type=rel_pos_type,
+            rel_pos_dim=rel_pos_dim,
+        ),
+    )
+
+
+model_cfgs = dict(
+    # Fiddling with configs / defaults / still pretraining
+    coatnet_pico_rw_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 3, 5, 2),
+        stem_width=(32, 64),
+        **_rw_max_cfg(  # using newer max defaults here
+            conv_output_bias=True,
+            conv_attn_ratio=0.25,
+        ),
+    ),
+    coatnet_nano_rw_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(3, 4, 6, 3),
+        stem_width=(32, 64),
+        **_rw_max_cfg(  # using newer max defaults here
+            stride_mode='pool',
+            conv_output_bias=True,
+            conv_attn_ratio=0.25,
+        ),
+    ),
+    coatnet_0_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 3, 7, 2),  # deeper than paper '0' model
+        stem_width=(32, 64),
+        **_rw_coat_cfg(
+            conv_attn_early=True,
+            transformer_shortcut_bias=False,
+        ),
+    ),
+    coatnet_1_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 6, 14, 2),
+        stem_width=(32, 64),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_early=True,
+            transformer_shortcut_bias=False,
+        )
+    ),
+    coatnet_2_rw_224=MaxxVitCfg(
+        embed_dim=(128, 256, 512, 1024),
+        depths=(2, 6, 14, 2),
+        stem_width=(64, 128),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+        ),
+    ),
+    coatnet_3_rw_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        stem_width=(96, 192),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+        ),
+    ),
+
+    # Highly experimental configs
+    coatnet_bn_0_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 3, 7, 2),  # deeper than paper '0' model
+        stem_width=(32, 64),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_early=True,
+            transformer_shortcut_bias=False,
+            transformer_norm_layer='batchnorm2d',
+        )
+    ),
+    coatnet_rmlp_nano_rw_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(3, 4, 6, 3),
+        stem_width=(32, 64),
+        **_rw_max_cfg(
+            conv_output_bias=True,
+            conv_attn_ratio=0.25,
+            rel_pos_type='mlp',
+            rel_pos_dim=384,
+        ),
+    ),
+    coatnet_rmlp_0_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 3, 7, 2),  # deeper than paper '0' model
+        stem_width=(32, 64),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            rel_pos_type='mlp',
+        ),
+    ),
+    coatnet_rmlp_1_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 6, 14, 2),
+        stem_width=(32, 64),
+        **_rw_coat_cfg(
+            pool_type='max',
+            conv_attn_early=True,
+            transformer_shortcut_bias=False,
+            rel_pos_type='mlp',
+            rel_pos_dim=384,  # was supposed to be 512, woops
+        ),
+    ),
+    coatnet_rmlp_2_rw_224=MaxxVitCfg(
+        embed_dim=(128, 256, 512, 1024),
+        depths=(2, 6, 14, 2),
+        stem_width=(64, 128),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+            rel_pos_type='mlp'
+        ),
+    ),
+    coatnet_rmlp_3_rw_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        stem_width=(96, 192),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+            rel_pos_type='mlp'
+        ),
+    ),
+
+    coatnet_nano_cc_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(3, 4, 6, 3),
+        stem_width=(32, 64),
+        block_type=('C', 'C', ('C', 'T'), ('C', 'T')),
+        **_rw_coat_cfg(),
+    ),
+    coatnext_nano_rw_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(3, 4, 6, 3),
+        stem_width=(32, 64),
+        weight_init='normal',
+        **_next_cfg(
+            rel_pos_type='bias',
+            init_values=(1e-5, None)
+        ),
+    ),
+
+    # Trying to be like the CoAtNet paper configs
+    coatnet_0_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 3, 5, 2),
+        stem_width=64,
+    ),
+    coatnet_1_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 6, 14, 2),
+        stem_width=64,
+    ),
+    coatnet_2_224=MaxxVitCfg(
+        embed_dim=(128, 256, 512, 1024),
+        depths=(2, 6, 14, 2),
+        stem_width=128,
+    ),
+    coatnet_3_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        stem_width=192,
+    ),
+    coatnet_4_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 12, 28, 2),
+        stem_width=192,
+    ),
+    coatnet_5_224=MaxxVitCfg(
+        embed_dim=(256, 512, 1280, 2048),
+        depths=(2, 12, 28, 2),
+        stem_width=192,
+    ),
+
+    # Experimental MaxVit configs
+    maxvit_pico_rw_256=MaxxVitCfg(
+        embed_dim=(32, 64, 128, 256),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(24, 32),
+        **_rw_max_cfg(),
+    ),
+    maxvit_nano_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(1, 2, 3, 1),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(),
+    ),
+    maxvit_tiny_rw_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(),
+    ),
+    maxvit_tiny_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(),
+    ),
+
+    maxvit_rmlp_pico_rw_256=MaxxVitCfg(
+        embed_dim=(32, 64, 128, 256),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(24, 32),
+        **_rw_max_cfg(rel_pos_type='mlp'),
+    ),
+    maxvit_rmlp_nano_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(1, 2, 3, 1),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(rel_pos_type='mlp'),
+    ),
+    maxvit_rmlp_tiny_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(rel_pos_type='mlp'),
+    ),
+    maxvit_rmlp_small_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(
+            rel_pos_type='mlp',
+            init_values=1e-6,
+        ),
+    ),
+    maxvit_rmlp_small_rw_256=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(
+            rel_pos_type='mlp',
+            init_values=1e-6,
+        ),
+    ),
+
+    maxvit_tiny_pm_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('PM',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(),
+    ),
+
+    maxxvit_rmlp_nano_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(1, 2, 3, 1),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        weight_init='normal',
+        **_next_cfg(),
+    ),
+    maxxvit_rmlp_tiny_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_next_cfg(),
+    ),
+    maxxvit_rmlp_small_rw_256=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(48, 96),
+        **_next_cfg(),
+    ),
+
+    # Trying to be like the MaxViT paper configs
+    maxvit_tiny_224=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=64,
+    ),
+    maxvit_small_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=64,
+    ),
+    maxvit_base_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 6, 14, 2),
+        block_type=('M',) * 4,
+        stem_width=64,
+    ),
+    maxvit_large_224=MaxxVitCfg(
+        embed_dim=(128, 256, 512, 1024),
+        depths=(2, 6, 14, 2),
+        block_type=('M',) * 4,
+        stem_width=128,
+    ),
+    maxvit_xlarge_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        block_type=('M',) * 4,
+        stem_width=192,
+    ),
+
+)
+
+
+class Attention2d(nn.Module):
+    """ Multi-head self-attention over the flattened H*W positions of a 2D NCHW tensor (1x1 convs for qkv / proj). """
+    def __init__(
+            self,
+            dim: int,
+            dim_out: Optional[int] = None,
+            dim_head: int = 32,
+            bias: bool = True,
+            expand_first: bool = True,
+            rel_pos_cls: Callable = None,
+            attn_drop: float = 0.,
+            proj_drop: float = 0.
+    ):
+        super().__init__()
+        dim_out = dim_out or dim  # output width defaults to the input width
+        dim_attn = dim_out if expand_first else dim  # q/k/v width: change width in qkv (expand_first) or later in proj
+        self.num_heads = dim_attn // dim_head  # NOTE(review): floor division with no divisibility check (AttentionCl asserts one)
+        self.dim_head = dim_head
+        self.scale = dim_head ** -0.5  # 1 / sqrt(dim_head) scaling of the q.k logits
+
+        self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias)  # one 1x1 conv producing q, k and v together
+        self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None  # optional block-local rel-pos module
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Conv2d(dim_attn, dim_out, 1, bias=bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        B, C, H, W = x.shape
+        # q, k, v: each (B, num_heads, dim_head, H*W) after the view and the chunk along dim 2
+        q, k, v = self.qkv(x).view(B, self.num_heads, self.dim_head * 3, -1).chunk(3, dim=2)
+        # scaled dot-product logits between every pair of positions: (B, num_heads, H*W, H*W)
+        attn = (q.transpose(-2, -1) @ k) * self.scale
+        if self.rel_pos is not None:
+            attn = self.rel_pos(attn)  # NOTE(review): shared_rel_pos is not forwarded here, unlike AttentionCl
+        elif shared_rel_pos is not None:
+            attn = attn + shared_rel_pos  # externally supplied bias, used only when no local rel_pos module exists
+        attn = attn.softmax(dim=-1)  # normalize over the key positions
+        attn = self.attn_drop(attn)
+
+        x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W)  # weighted sum of values, heads merged back into channels
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class AttentionCl(nn.Module):
+    """ Channels-last multi-head attention (B, ..., C); all dims between batch and channel are flattened into one sequence. """
+    def __init__(
+            self,
+            dim: int,
+            dim_out: Optional[int] = None,
+            dim_head: int = 32,
+            bias: bool = True,
+            expand_first: bool = True,
+            rel_pos_cls: Callable = None,
+            attn_drop: float = 0.,
+            proj_drop: float = 0.
+    ):
+        super().__init__()
+        dim_out = dim_out or dim  # output width defaults to the input width
+        dim_attn = dim_out if expand_first and dim_out > dim else dim  # widen q/k/v only when dim_out is larger than dim
+        assert dim_attn % dim_head == 0, 'attn dim should be divisible by head_dim'
+        self.num_heads = dim_attn // dim_head
+        self.dim_head = dim_head
+        self.scale = dim_head ** -0.5  # 1 / sqrt(dim_head) scaling of the q.k logits
+
+        self.qkv = nn.Linear(dim, dim_attn * 3, bias=bias)  # one linear layer producing q, k and v together
+        self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None  # optional block-local rel-pos module
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim_attn, dim_out, bias=bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        B = x.shape[0]
+        restore_shape = x.shape[:-1]  # every dim except channels; re-applied to the output below
+        # q, k, v: each (B, num_heads, N, dim_head), where N is all non-batch positions flattened
+        q, k, v = self.qkv(x).view(B, -1, self.num_heads, self.dim_head * 3).transpose(1, 2).chunk(3, dim=3)
+        # scaled dot-product logits: (B, num_heads, N, N)
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        if self.rel_pos is not None:
+            attn = self.rel_pos(attn, shared_rel_pos=shared_rel_pos)
+        elif shared_rel_pos is not None:
+            attn = attn + shared_rel_pos  # externally supplied bias, used only when no local rel_pos module exists
+        attn = attn.softmax(dim=-1)  # normalize over the key positions
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(restore_shape + (-1,))  # merge heads, restore the original leading dims
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class LayerScale(nn.Module):  # learnable scale: multiplies x by gamma of shape (dim,), broadcast against the last dim
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))  # every entry starts at init_values
+
+    def forward(self, x):
+        gamma = self.gamma
+        return x.mul_(gamma) if self.inplace else x * gamma  # inplace=True mutates and returns the input tensor
+
+
+class LayerScale2d(nn.Module):  # learnable scale for 4D input: gamma is broadcast along dim 1 (channels in NCHW)
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))  # every entry starts at init_values
+
+    def forward(self, x):
+        gamma = self.gamma.view(1, -1, 1, 1)  # (dim,) -> (1, dim, 1, 1) so it lines up with dim 1 of x
+        return x.mul_(gamma) if self.inplace else x * gamma  # inplace=True mutates and returns the input tensor
+
+
+class Downsample2d(nn.Module):
+    """ A downsample pooling module (pool, then a 1x1 conv if dim != dim_out) supporting several maxpool and avgpool modes
+    * 'max' - MaxPool2d w/ kernel_size 3, stride 2, padding 1
+    * 'max2' - MaxPool2d w/ kernel_size = stride = 2
+    * 'avg' - AvgPool2d w/ kernel_size 3, stride 2, padding 1
+    * 'avg2' - AvgPool2d w/ kernel_size = stride = 2
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            dim_out: int,
+            pool_type: str = 'avg2',
+            bias: bool = True,
+    ):
+        super().__init__()
+        assert pool_type in ('max', 'max2', 'avg', 'avg2')
+        if pool_type == 'max':
+            self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        elif pool_type == 'max2':
+            self.pool = nn.MaxPool2d(2)  # kernel_size == stride == 2
+        elif pool_type == 'avg':
+            self.pool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False)
+        else:
+            self.pool = nn.AvgPool2d(2)  # kernel_size == stride == 2
+
+        if dim != dim_out:
+            self.expand = nn.Conv2d(dim, dim_out, 1, bias=bias)  # 1x1 conv changes only the channel count
+        else:
+            self.expand = nn.Identity()  # same width in and out: nothing to project (bias is unused in this case)
+
+    def forward(self, x):
+        x = self.pool(x)  # spatial downsample
+        x = self.expand(x)  # expand chs
+        return x
+
+
+def _init_transformer(module, name, scheme=''):  # per-module init callback, applied via named_apply in init_weights
+    if isinstance(module, (nn.Conv2d, nn.Linear)):  # every other module type is left untouched
+        if scheme == 'normal':
+            nn.init.normal_(module.weight, std=.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif scheme == 'trunc_normal':
+            trunc_normal_tf_(module.weight, std=.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif scheme == 'xavier_normal':
+            nn.init.xavier_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        else:
+            # vit like (also the fallback for any unrecognized scheme string, including the default '')
+            nn.init.xavier_uniform_(module.weight)
+            if module.bias is not None:
+                if 'mlp' in name:  # modules whose name contains 'mlp' get a tiny random bias instead of zeros
+                    nn.init.normal_(module.bias, std=1e-6)
+                else:
+                    nn.init.zeros_(module.bias)
+
+
+class TransformerBlock2d(nn.Module):
+    """ Transformer block with 2D downsampling
+    '2D' NCHW tensor layout. forward: x = shortcut(x) + attn(norm1(x)), then x = x + mlp(norm2(x)),
+    with each branch passed through LayerScale (when cfg.init_values is set) and DropPath (when drop_path > 0).
+    Some gains can be seen on GPU using a 1D / CL block, BUT w/ the need to switch back/forth to NCHW
+    for spatial pooling, the benefit is minimal so ended up using just this variant for CoAt configs.
+
+    This impl was faster on TPU w/ PT XLA than the 1D experiment.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            dim_out: int,
+            stride: int = 1,
+            rel_pos_cls: Callable = None,
+            cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),  # NOTE(review): default built once at def time and shared; only read here
+            drop_path: float = 0.,
+    ):
+        super().__init__()
+        norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps)
+        act_layer = get_act_layer(cfg.act_layer)
+        # stride 2: the shortcut is pooled (plus 1x1 conv if widths differ); the attention input is normed, then pooled
+        if stride == 2:
+            self.shortcut = Downsample2d(dim, dim_out, pool_type=cfg.pool_type, bias=cfg.shortcut_bias)
+            self.norm1 = nn.Sequential(OrderedDict([
+                ('norm', norm_layer(dim)),
+                ('down', Downsample2d(dim, dim, pool_type=cfg.pool_type)),
+            ]))
+        else:
+            assert dim == dim_out  # without downsampling the residual add needs matching widths
+            self.shortcut = nn.Identity()
+            self.norm1 = norm_layer(dim)
+
+        self.attn = Attention2d(
+            dim,
+            dim_out,
+            dim_head=cfg.dim_head,
+            expand_first=cfg.expand_first,
+            bias=cfg.attn_bias,
+            rel_pos_cls=rel_pos_cls,
+            attn_drop=cfg.attn_drop,
+            proj_drop=cfg.proj_drop
+        )
+        self.ls1 = LayerScale2d(dim_out, init_values=cfg.init_values) if cfg.init_values else nn.Identity()  # None / 0 -> no LayerScale
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim_out)
+        self.mlp = ConvMlp(
+            in_features=dim_out,
+            hidden_features=int(dim_out * cfg.expand_ratio),
+            act_layer=act_layer,
+            drop=cfg.proj_drop)
+        self.ls2 = LayerScale2d(dim_out, init_values=cfg.init_values) if cfg.init_values else nn.Identity()  # None / 0 -> no LayerScale
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def init_weights(self, scheme=''):
+        named_apply(partial(_init_transformer, scheme=scheme), self)  # runs _init_transformer over this block's submodules
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        x = self.shortcut(x) + self.drop_path1(self.ls1(self.attn(self.norm1(x), shared_rel_pos=shared_rel_pos)))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+def _init_conv(module, name, scheme=''):
+    """ Initialise one Conv2d layer of a conv block.
+
+    Intended as a `named_apply` callback; anything that is not an `nn.Conv2d` is skipped and
+    `name` is not consulted.
+
+    Args:
+        module: candidate module to initialise.
+        name: module name (unused).
+        scheme: 'normal', 'trunc_normal' or 'xavier_normal'; any other value selects the
+            efficientnet-like fan-out scaled normal init.
+    """
+    if not isinstance(module, nn.Conv2d):
+        return
+
+    weight = module.weight
+    if scheme == 'normal':
+        nn.init.normal_(weight, std=.02)
+    elif scheme == 'trunc_normal':
+        trunc_normal_tf_(weight, std=.02)
+    elif scheme == 'xavier_normal':
+        nn.init.xavier_normal_(weight)
+    else:
+        # efficientnet like
+        kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
+        fan_out = (kernel_h * kernel_w * module.out_channels) // module.groups
+        nn.init.normal_(weight, 0, math.sqrt(2.0 / fan_out))
+
+    # all schemes zero the bias, when there is one
+    if module.bias is not None:
+        nn.init.zeros_(module.bias)
+
+
+def num_groups(group_size, channels):
+    """ Convert a per-group channel count into a number of conv groups.
+
+    Args:
+        group_size: channels per group; 0 or None selects a normal conv with a single group.
+        channels: total channel count, which must be divisible by `group_size`.
+
+    Returns:
+        1 when `group_size` is falsy, otherwise `channels // group_size`.
+    """
+    if group_size:
+        # NOTE group_size == 1 -> depthwise conv
+        assert channels % group_size == 0
+        return channels // group_size
+    return 1  # 0 or None -> normal conv with 1 group
+
+
+class MbConvBlock(nn.Module):
+    """ Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand)
+
+    The branch is: pre-norm -> optional pooling downsample -> 1x1 expand -> norm/act -> kxk grouped conv
+    -> channel attention + norm/act (order set by cfg.attn_early) -> 1x1 projection, added to the shortcut.
+    """
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: int,
+            stride: int = 1,
+            dilation: Tuple[int, int] = (1, 1),
+            cfg: MaxxVitConvCfg = MaxxVitConvCfg(),
+            drop_path: float = 0.
+    ):
+        """
+        Args:
+            in_chs: channel count of the block input.
+            out_chs: channel count of the block output.
+            stride: block stride; where it is applied depends on cfg.stride_mode ('pool', '1x1' or 'dw').
+            dilation: pair of dilations; which element feeds the kxk conv depends on cfg.stride_mode.
+            cfg: conv config supplying norm/act, expansion, grouping, attention and pooling settings.
+            drop_path: `DropPath` rate for the conv branch; `nn.Identity` is used when it is not > 0.
+        """
+        super(MbConvBlock, self).__init__()
+        norm_act_layer = partial(get_norm_act_layer(cfg.norm_layer, cfg.act_layer), eps=cfg.norm_eps)
+        # bottleneck width is expanded from out_chs or in_chs depending on cfg.expand_output
+        mid_chs = make_divisible((out_chs if cfg.expand_output else in_chs) * cfg.expand_ratio)
+        groups = num_groups(cfg.group_size, mid_chs)
+
+        if stride == 2:
+            self.shortcut = Downsample2d(in_chs, out_chs, pool_type=cfg.pool_type, bias=cfg.output_bias)
+        else:
+            # NOTE identity shortcut: the residual add in forward() then needs in_chs == out_chs
+            self.shortcut = nn.Identity()
+
+        assert cfg.stride_mode in ('pool', '1x1', 'dw')
+        # exactly one of these three picks up the block stride, the others stay at 1
+        stride_pool, stride_1, stride_2 = 1, 1, 1
+        if cfg.stride_mode == 'pool':
+            # NOTE this is not described in paper, experiment to find faster option that doesn't stride in 1x1
+            stride_pool, dilation_2 = stride, dilation[1]
+            # FIXME handle dilation of avg pool
+        elif cfg.stride_mode == '1x1':
+            # NOTE I don't like this option described in paper, 1x1 w/ stride throws info away
+            stride_1, dilation_2 = stride, dilation[1]
+        else:
+            # 'dw': the kxk conv carries the stride
+            stride_2, dilation_2 = stride, dilation[0]
+
+        self.pre_norm = norm_act_layer(in_chs, apply_act=cfg.pre_norm_act)
+        if stride_pool > 1:
+            self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type)
+        else:
+            self.down = nn.Identity()
+        self.conv1_1x1 = create_conv2d(in_chs, mid_chs, 1, stride=stride_1)
+        self.norm1 = norm_act_layer(mid_chs)
+
+        self.conv2_kxk = create_conv2d(
+            mid_chs, mid_chs, cfg.kernel_size, stride=stride_2, dilation=dilation_2, groups=groups)
+
+        # extra kwargs are only built when the attention layer is named 'se' or 'eca'
+        attn_kwargs = {}
+        if isinstance(cfg.attn_layer, str):
+            if cfg.attn_layer == 'se' or cfg.attn_layer == 'eca':
+                attn_kwargs['act_layer'] = cfg.attn_act_layer
+                attn_kwargs['rd_channels'] = int(cfg.attn_ratio * (out_chs if cfg.expand_output else mid_chs))
+
+        # two different orderings for SE and norm2 (due to some weights and trials using SE before norm2)
+        if cfg.attn_early:
+            self.se_early = create_attn(cfg.attn_layer, mid_chs, **attn_kwargs)
+            self.norm2 = norm_act_layer(mid_chs)
+            self.se = None
+        else:
+            self.se_early = None
+            self.norm2 = norm_act_layer(mid_chs)
+            self.se = create_attn(cfg.attn_layer, mid_chs, **attn_kwargs)
+
+        self.conv3_1x1 = create_conv2d(mid_chs, out_chs, 1, bias=cfg.output_bias)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def init_weights(self, scheme=''):
+        """ Apply `_init_conv` with the given scheme to every submodule of this block. """
+        named_apply(partial(_init_conv, scheme=scheme), self)
+
+    def forward(self, x):
+        """ Run the conv branch and add it (after drop-path) to the shortcut. """
+        shortcut = self.shortcut(x)
+        x = self.pre_norm(x)
+        x = self.down(x)
+
+        # 1x1 expansion conv & norm-act
+        x = self.conv1_1x1(x)
+        x = self.norm1(x)
+
+        # depthwise / grouped 3x3 conv w/ SE (or other) channel attention & norm-act
+        x = self.conv2_kxk(x)
+        # se_early / se are None when unused, so at most one of the two attention calls runs
+        if self.se_early is not None:
+            x = self.se_early(x)
+        x = self.norm2(x)
+        if self.se is not None:
+            x = self.se(x)
+
+        # 1x1 linear projection to output width
+        x = self.conv3_1x1(x)
+        x = self.drop_path(x) + shortcut
+        return x
+
+
+class ConvNeXtBlock(nn.Module):
+    """ ConvNeXt Block
+
+    Branch: optional pooling downsample -> depthwise conv -> norm -> MLP -> layer scale, added to the shortcut.
+    The norm / MLP / layer-scale part runs either directly on the conv output (conv_mlp=True) or on a
+    tensor permuted with (0, 2, 3, 1) and permuted back afterwards (conv_mlp=False).
+    """
+
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: Optional[int] = None,
+            kernel_size: int = 7,
+            stride: int = 1,
+            dilation: Tuple[int, int] = (1, 1),
+            cfg: MaxxVitConvCfg = MaxxVitConvCfg(),
+            conv_mlp: bool = True,
+            drop_path: float = 0.
+    ):
+        """
+        Args:
+            in_chs: channel count of the block input.
+            out_chs: channel count of the block output; falls back to in_chs when falsy.
+            kernel_size: kernel size of the depthwise conv.
+            stride: block stride, applied by pooling or by the depthwise conv depending on cfg.stride_mode.
+            dilation: pair of dilations; only dilation[1] is passed on (to the depthwise conv).
+            cfg: conv config supplying norm, activation, pooling, bias, expansion and layer-scale settings.
+            conv_mlp: use `ConvMlp` with the configured norm if True, else `LayerNorm` + `Mlp`.
+            drop_path: `DropPath` rate for the conv branch; `nn.Identity` is used when it is not > 0.
+        """
+        super().__init__()
+        out_chs = out_chs or in_chs
+        act_layer = get_act_layer(cfg.act_layer)
+        if conv_mlp:
+            norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps)
+            mlp_layer = ConvMlp
+        else:
+            # this path always uses LayerNorm, so the configured norm must be a layernorm variant
+            assert 'layernorm' in cfg.norm_layer
+            norm_layer = LayerNorm
+            mlp_layer = Mlp
+        self.use_conv_mlp = conv_mlp
+
+        # shortcut: pooled downsample for stride 2, 1x1 conv when only the width changes, identity otherwise
+        if stride == 2:
+            self.shortcut = Downsample2d(in_chs, out_chs)
+        elif in_chs != out_chs:
+            self.shortcut = nn.Conv2d(in_chs, out_chs, kernel_size=1, bias=cfg.output_bias)
+        else:
+            self.shortcut = nn.Identity()
+
+        assert cfg.stride_mode in ('pool', 'dw')
+        # only one of the two picks up the block stride
+        stride_pool, stride_dw = 1, 1
+        # FIXME handle dilation?
+        if cfg.stride_mode == 'pool':
+            stride_pool = stride
+        else:
+            stride_dw = stride
+
+        if stride_pool == 2:
+            self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type)
+        else:
+            self.down = nn.Identity()
+
+        self.conv_dw = create_conv2d(
+            in_chs, out_chs, kernel_size=kernel_size, stride=stride_dw, dilation=dilation[1],
+            depthwise=True, bias=cfg.output_bias)
+        self.norm = norm_layer(out_chs)
+        self.mlp = mlp_layer(out_chs, int(cfg.expand_ratio * out_chs), bias=cfg.output_bias, act_layer=act_layer)
+        # layer-scale flavour follows the MLP flavour; skipped entirely when cfg.init_values is falsy
+        if conv_mlp:
+            self.ls = LayerScale2d(out_chs, cfg.init_values) if cfg.init_values else nn.Identity()
+        else:
+            self.ls = LayerScale(out_chs, cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        """ Run the conv branch and add it (after drop-path) to the shortcut. """
+        shortcut = self.shortcut(x)
+        x = self.down(x)
+        x = self.conv_dw(x)
+        if self.use_conv_mlp:
+            x = self.norm(x)
+            x = self.mlp(x)
+            x = self.ls(x)
+        else:
+            # move dim 1 to the end for norm / mlp / layer scale, then restore the original order
+            x = x.permute(0, 2, 3, 1)
+            x = self.norm(x)
+            x = self.mlp(x)
+            x = self.ls(x)
+            x = x.permute(0, 3, 1, 2)
+
+        x = self.drop_path(x) + shortcut
+        return x
+
+
+def window_partition(x, window_size: List[int]):
+    """ Split a (B, H, W, C) tensor into non-overlapping windows.
+
+    Args:
+        x: tensor of shape (B, H, W, C).
+        window_size: (window height, window width); must divide H and W respectively.
+
+    Returns:
+        Tensor of shape (-1, window_size[0], window_size[1], C) holding every window of every batch element.
+    """
+    B, H, W, C = x.shape
+    _assert(H % window_size[0] == 0, f'height ({H}) must be divisible by window ({window_size[0]})')
+    # report a width mismatch as explicitly as a height mismatch
+    _assert(W % window_size[1] == 0, f'width ({W}) must be divisible by window ({window_size[1]})')
+    x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C)
+    # bring the two window-index dims together, then flatten them into the leading dim
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C)
+    return windows
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse(windows, window_size: List[int], img_size: List[int]):
+    """ Reassemble windows of shape (-1, window_size[0], window_size[1], C) into a (-1, H, W, C) tensor.
+
+    Args:
+        windows: window tensor with channels in the last dim.
+        window_size: (window height, window width) the windows were cut with.
+        img_size: (H, W) of the tensor to rebuild.
+    """
+    height, width = img_size
+    channels = windows.shape[-1]
+    rows = height // window_size[0]
+    cols = width // window_size[1]
+    tiled = windows.view(-1, rows, cols, window_size[0], window_size[1], channels)
+    # interleave window-row index with in-window row, and likewise for columns
+    return tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, channels)
+
+
+def grid_partition(x, grid_size: List[int]):
+    """ Split a (B, H, W, C) tensor into grid partitions.
+
+    Each output partition holds grid_size[0] x grid_size[1] positions taken at a stride of
+    H // grid_size[0] rows and W // grid_size[1] columns across the input.
+
+    Args:
+        x: tensor of shape (B, H, W, C).
+        grid_size: (grid height, grid width); must divide H and W respectively.
+
+    Returns:
+        Tensor of shape (-1, grid_size[0], grid_size[1], C).
+    """
+    B, H, W, C = x.shape
+    _assert(H % grid_size[0] == 0, f'height {H} must be divisible by grid {grid_size[0]}')
+    # report a width mismatch as explicitly as a height mismatch
+    _assert(W % grid_size[1] == 0, f'width {W} must be divisible by grid {grid_size[1]}')
+    x = x.view(B, grid_size[0], H // grid_size[0], grid_size[1], W // grid_size[1], C)
+    # move the two grid dims next to the channels, flatten everything else into the leading dim
+    windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, grid_size[0], grid_size[1], C)
+    return windows
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def grid_reverse(windows, grid_size: List[int], img_size: List[int]):
+    """ Reassemble grid partitions of shape (-1, grid_size[0], grid_size[1], C) into a (-1, H, W, C) tensor.
+
+    Args:
+        windows: partition tensor with channels in the last dim.
+        grid_size: (grid height, grid width) the partitions were cut with.
+        img_size: (H, W) of the tensor to rebuild.
+    """
+    height, width = img_size
+    channels = windows.shape[-1]
+    rows = height // grid_size[0]
+    cols = width // grid_size[1]
+    tiled = windows.view(-1, rows, cols, grid_size[0], grid_size[1], channels)
+    # put each grid dim back in front of its matching spatial remainder dim
+    return tiled.permute(0, 3, 1, 4, 2, 5).contiguous().view(-1, height, width, channels)
+
+
+def get_rel_pos_cls(cfg: MaxxVitTransformerCfg, window_size):
+    """ Pick the relative position constructor selected by `cfg.rel_pos_type`.
+
+    Returns:
+        A `partial` of `RelPosMlp` ('mlp') or `RelPosBias` ('bias') bound to `window_size`,
+        or None for any other `rel_pos_type`.
+    """
+    kind = cfg.rel_pos_type
+    if kind == 'mlp':
+        return partial(RelPosMlp, window_size=window_size, hidden_dim=cfg.rel_pos_dim)
+    if kind == 'bias':
+        return partial(RelPosBias, window_size=window_size)
+    return None
+
+
+class PartitionAttentionCl(nn.Module):
+    """ Grid or Block partition + Attn + FFN.
+    NxC 'channels last' tensor layout.
+
+    Attention runs independently inside each partition of the input; the partitions are then merged
+    back to the input's spatial size before the residual add.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            partition_type: str = 'block',
+            cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            drop_path: float = 0.,
+    ):
+        """
+        Args:
+            dim: channel count (input and output are the same width).
+            partition_type: 'block' selects window partitioning with cfg.window_size; any other value
+                selects grid partitioning with cfg.grid_size.
+            cfg: transformer config supplying norm, activation, attention and layer-scale settings.
+            drop_path: `DropPath` rate for both residual branches; `nn.Identity` is used when it is not > 0.
+        """
+        super().__init__()
+        norm_layer = partial(get_norm_layer(cfg.norm_layer_cl), eps=cfg.norm_eps)  # NOTE this block is channels-last
+        act_layer = get_act_layer(cfg.act_layer)
+
+        self.partition_block = partition_type == 'block'
+        self.partition_size = to_2tuple(cfg.window_size if self.partition_block else cfg.grid_size)
+        rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size)
+
+        self.norm1 = norm_layer(dim)
+        self.attn = AttentionCl(
+            dim,
+            dim,
+            dim_head=cfg.dim_head,
+            bias=cfg.attn_bias,
+            rel_pos_cls=rel_pos_cls,
+            attn_drop=cfg.attn_drop,
+            proj_drop=cfg.proj_drop,
+        )
+        # layer scale is only instantiated when cfg.init_values is truthy
+        self.ls1 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * cfg.expand_ratio),
+            act_layer=act_layer,
+            drop=cfg.proj_drop)
+        self.ls2 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def _partition_attn(self, x):
+        """ Partition x, attend within each partition, and merge back to the original (H, W). """
+        # dims 1 and 2 are the spatial dims in this layout
+        img_size = x.shape[1:3]
+        if self.partition_block:
+            partitioned = window_partition(x, self.partition_size)
+        else:
+            partitioned = grid_partition(x, self.partition_size)
+
+        partitioned = self.attn(partitioned)
+
+        if self.partition_block:
+            x = window_reverse(partitioned, self.partition_size, img_size)
+        else:
+            x = grid_reverse(partitioned, self.partition_size, img_size)
+        return x
+
+    def forward(self, x):
+        """ Pre-norm partition-attention residual followed by a pre-norm MLP residual. """
+        x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class ParallelPartitionAttention(nn.Module):
+    """ Experimental. Grid and Block partition + single FFN
+    NxC tensor layout.
+
+    Window attention and grid attention both read the same normed input, each producing dim // 2
+    channels; the two results are concatenated on the last dim to restore `dim` channels.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            drop_path: float = 0.,
+    ):
+        """
+        Args:
+            dim: channel count; must be even because each attention branch outputs dim // 2 channels.
+            cfg: transformer config; cfg.window_size must equal cfg.grid_size.
+            drop_path: `DropPath` rate for both residual branches; `nn.Identity` is used when it is not > 0.
+        """
+        super().__init__()
+        assert dim % 2 == 0
+        norm_layer = partial(get_norm_layer(cfg.norm_layer_cl), eps=cfg.norm_eps)  # NOTE this block is channels-last
+        act_layer = get_act_layer(cfg.act_layer)
+
+        # a single partition size is shared by the window and grid branches
+        assert cfg.window_size == cfg.grid_size
+        self.partition_size = to_2tuple(cfg.window_size)
+        rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size)
+
+        self.norm1 = norm_layer(dim)
+        self.attn_block = AttentionCl(
+            dim,
+            dim // 2,
+            dim_head=cfg.dim_head,
+            bias=cfg.attn_bias,
+            rel_pos_cls=rel_pos_cls,
+            attn_drop=cfg.attn_drop,
+            proj_drop=cfg.proj_drop,
+        )
+        self.attn_grid = AttentionCl(
+            dim,
+            dim // 2,
+            dim_head=cfg.dim_head,
+            bias=cfg.attn_bias,
+            rel_pos_cls=rel_pos_cls,
+            attn_drop=cfg.attn_drop,
+            proj_drop=cfg.proj_drop,
+        )
+        # layer scale is only instantiated when cfg.init_values is truthy
+        self.ls1 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * cfg.expand_ratio),
+            out_features=dim,
+            act_layer=act_layer,
+            drop=cfg.proj_drop)
+        self.ls2 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def _partition_attn(self, x):
+        """ Run window and grid attention on the same input and concatenate their outputs on the last dim. """
+        # dims 1 and 2 are the spatial dims in this layout
+        img_size = x.shape[1:3]
+
+        partitioned_block = window_partition(x, self.partition_size)
+        partitioned_block = self.attn_block(partitioned_block)
+        x_window = window_reverse(partitioned_block, self.partition_size, img_size)
+
+        partitioned_grid = grid_partition(x, self.partition_size)
+        partitioned_grid = self.attn_grid(partitioned_grid)
+        x_grid = grid_reverse(partitioned_grid, self.partition_size, img_size)
+
+        return torch.cat([x_window, x_grid], dim=-1)
+
+    def forward(self, x):
+        """ Pre-norm parallel-attention residual followed by a pre-norm MLP residual. """
+        x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+def window_partition_nchw(x, window_size: List[int]):
+    """ Split a (B, C, H, W) tensor into non-overlapping windows.
+
+    Args:
+        x: tensor of shape (B, C, H, W).
+        window_size: (window height, window width); must divide H and W respectively.
+
+    Returns:
+        Tensor of shape (-1, C, window_size[0], window_size[1]) holding every window of every batch element.
+    """
+    B, C, H, W = x.shape
+    _assert(H % window_size[0] == 0, f'height ({H}) must be divisible by window ({window_size[0]})')
+    # report a width mismatch as explicitly as a height mismatch
+    _assert(W % window_size[1] == 0, f'width ({W}) must be divisible by window ({window_size[1]})')
+    x = x.view(B, C, H // window_size[0], window_size[0], W // window_size[1], window_size[1])
+    # window-index dims go in front of the channels so they can be flattened into the leading dim
+    windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size[0], window_size[1])
+    return windows
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse_nchw(windows, window_size: List[int], img_size: List[int]):
+    """ Reassemble windows of shape (-1, C, window_size[0], window_size[1]) into a (-1, C, H, W) tensor.
+
+    Args:
+        windows: window tensor with channels in dim 1.
+        window_size: (window height, window width) the windows were cut with.
+        img_size: (H, W) of the tensor to rebuild.
+    """
+    height, width = img_size
+    channels = windows.shape[1]
+    rows = height // window_size[0]
+    cols = width // window_size[1]
+    tiled = windows.view(-1, rows, cols, channels, window_size[0], window_size[1])
+    # channels return to dim 1; each window index is paired with its in-window offset
+    return tiled.permute(0, 3, 1, 4, 2, 5).contiguous().view(-1, channels, height, width)
+
+
+def grid_partition_nchw(x, grid_size: List[int]):
+    """ Split a (B, C, H, W) tensor into grid partitions.
+
+    Each output partition holds grid_size[0] x grid_size[1] positions taken at a stride of
+    H // grid_size[0] rows and W // grid_size[1] columns across the input.
+
+    Args:
+        x: tensor of shape (B, C, H, W).
+        grid_size: (grid height, grid width); must divide H and W respectively.
+
+    Returns:
+        Tensor of shape (-1, C, grid_size[0], grid_size[1]).
+    """
+    B, C, H, W = x.shape
+    _assert(H % grid_size[0] == 0, f'height {H} must be divisible by grid {grid_size[0]}')
+    # report a width mismatch as explicitly as a height mismatch
+    _assert(W % grid_size[1] == 0, f'width {W} must be divisible by grid {grid_size[1]}')
+    x = x.view(B, C, grid_size[0], H // grid_size[0], grid_size[1], W // grid_size[1])
+    # spatial remainder dims go in front of the channels, the grid dims end up last
+    windows = x.permute(0, 3, 5, 1, 2, 4).contiguous().view(-1, C, grid_size[0], grid_size[1])
+    return windows
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def grid_reverse_nchw(windows, grid_size: List[int], img_size: List[int]):
+    """ Reassemble grid partitions of shape (-1, C, grid_size[0], grid_size[1]) into a (-1, C, H, W) tensor.
+
+    Args:
+        windows: partition tensor with channels in dim 1.
+        grid_size: (grid height, grid width) the partitions were cut with.
+        img_size: (H, W) of the tensor to rebuild.
+    """
+    height, width = img_size
+    channels = windows.shape[1]
+    rows = height // grid_size[0]
+    cols = width // grid_size[1]
+    tiled = windows.view(-1, rows, cols, channels, grid_size[0], grid_size[1])
+    # channels return to dim 1; each grid dim goes back in front of its spatial remainder dim
+    return tiled.permute(0, 3, 4, 1, 5, 2).contiguous().view(-1, channels, height, width)
+
+
+class PartitionAttention2d(nn.Module):
+    """ Grid or Block partition + Attn + FFN
+
+    '2D' NCHW tensor layout.
+
+    Attention runs independently inside each partition of the input; the partitions are then merged
+    back to the input's spatial size before the residual add.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            partition_type: str = 'block',
+            cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            drop_path: float = 0.,
+    ):
+        """
+        Args:
+            dim: channel count (input and output are the same width).
+            partition_type: 'block' selects window partitioning with cfg.window_size; any other value
+                selects grid partitioning with cfg.grid_size.
+            cfg: transformer config supplying norm, activation, attention and layer-scale settings.
+            drop_path: `DropPath` rate for both residual branches; `nn.Identity` is used when it is not > 0.
+        """
+        super().__init__()
+        norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps)  # NOTE this block is NCHW, so cfg.norm_layer (not norm_layer_cl) is used
+        act_layer = get_act_layer(cfg.act_layer)
+
+        self.partition_block = partition_type == 'block'
+        self.partition_size = to_2tuple(cfg.window_size if self.partition_block else cfg.grid_size)
+        rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size)
+
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention2d(
+            dim,
+            dim,
+            dim_head=cfg.dim_head,
+            bias=cfg.attn_bias,
+            rel_pos_cls=rel_pos_cls,
+            attn_drop=cfg.attn_drop,
+            proj_drop=cfg.proj_drop,
+        )
+        # layer scale is only instantiated when cfg.init_values is truthy
+        self.ls1 = LayerScale2d(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = ConvMlp(
+            in_features=dim,
+            hidden_features=int(dim * cfg.expand_ratio),
+            act_layer=act_layer,
+            drop=cfg.proj_drop)
+        self.ls2 = LayerScale2d(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def _partition_attn(self, x):
+        """ Partition x, attend within each partition, and merge back to the original (H, W). """
+        # the last two dims are the spatial dims in this layout
+        img_size = x.shape[-2:]
+        if self.partition_block:
+            partitioned = window_partition_nchw(x, self.partition_size)
+        else:
+            partitioned = grid_partition_nchw(x, self.partition_size)
+
+        partitioned = self.attn(partitioned)
+
+        if self.partition_block:
+            x = window_reverse_nchw(partitioned, self.partition_size, img_size)
+        else:
+            x = grid_reverse_nchw(partitioned, self.partition_size, img_size)
+        return x
+
+    def forward(self, x):
+        """ Pre-norm partition-attention residual followed by a pre-norm MLP residual. """
+        x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class MaxxVitBlock(nn.Module):
+    """ MaxVit conv, window partition + FFN , grid partition + FFN
+
+    A conv block (ConvNeXt or MbConv, per conv_cfg.block_type) followed by block-partition attention
+    and then grid-partition attention, in either NCHW or channels-last form.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            dim_out: int,
+            stride: int = 1,
+            conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(),
+            transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            use_nchw_attn: bool = False,  # FIXME move to cfg? True is ~20-30% faster on TPU, 5-10% slower on GPU
+            drop_path: float = 0.,
+    ):
+        super().__init__()
+        self.nchw_attn = use_nchw_attn
+
+        if conv_cfg.block_type == 'convnext':
+            conv_cls = ConvNeXtBlock
+        else:
+            conv_cls = MbConvBlock
+        self.conv = conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path)
+
+        if use_nchw_attn:
+            partition_layer = PartitionAttention2d
+        else:
+            partition_layer = PartitionAttentionCl
+        # first attention uses the layer's default partition type, the second is forced to 'grid'
+        self.attn_block = partition_layer(dim=dim_out, cfg=transformer_cfg, drop_path=drop_path)
+        self.attn_grid = partition_layer(
+            partition_type='grid', dim=dim_out, cfg=transformer_cfg, drop_path=drop_path)
+
+    def init_weights(self, scheme=''):
+        """ Transformer-style init for both attention layers, conv-style init for the conv block. """
+        transformer_init = partial(_init_transformer, scheme=scheme)
+        for attn_layer in (self.attn_block, self.attn_grid):
+            named_apply(transformer_init, attn_layer)
+        named_apply(partial(_init_conv, scheme=scheme), self.conv)
+
+    def forward(self, x):
+        # NCHW format
+        x = self.conv(x)
+
+        if self.nchw_attn:
+            # attention layers consume NCHW directly
+            return self.attn_grid(self.attn_block(x))
+
+        x = x.permute(0, 2, 3, 1)  # to NHWC (channels-last)
+        x = self.attn_grid(self.attn_block(x))
+        return x.permute(0, 3, 1, 2)  # back to NCHW
+
+
+class ParallelMaxxVitBlock(nn.Module):
+    """ MaxVit block with parallel cat(window + grid), one FF
+    Experimental timm block.
+
+    One or more conv blocks followed by a single `ParallelPartitionAttention`.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            stride=1,
+            num_conv=2,
+            conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(),
+            transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            drop_path=0.,
+    ):
+        """
+        Args:
+            dim: channel count of the block input.
+            dim_out: channel count of the block output.
+            stride: stride of the first conv block; any further conv blocks use their default stride.
+            num_conv: number of conv blocks placed before the attention layer.
+            conv_cfg: config for the conv blocks (block_type 'convnext' selects ConvNeXtBlock, else MbConvBlock).
+            transformer_cfg: config for the parallel partition attention.
+            drop_path: drop-path rate passed to every conv block and to the attention layer.
+        """
+        super().__init__()
+
+        conv_cls = ConvNeXtBlock if conv_cfg.block_type == 'convnext' else MbConvBlock
+        if num_conv > 1:
+            convs = [conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path)]
+            # build every extra conv separately: `[block] * n` would repeat ONE module instance,
+            # silently sharing its weights between all the extra positions when num_conv > 2
+            convs += [
+                conv_cls(dim_out, dim_out, cfg=conv_cfg, drop_path=drop_path)
+                for _ in range(num_conv - 1)
+            ]
+            self.conv = nn.Sequential(*convs)
+        else:
+            self.conv = conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path)
+        self.attn = ParallelPartitionAttention(dim=dim_out, cfg=transformer_cfg, drop_path=drop_path)
+
+    def init_weights(self, scheme=''):
+        """ Transformer-style init for the attention layer, conv-style init for the conv block(s). """
+        named_apply(partial(_init_transformer, scheme=scheme), self.attn)
+        named_apply(partial(_init_conv, scheme=scheme), self.conv)
+
+    def forward(self, x):
+        x = self.conv(x)
+        # the attention layer works channels-last: permute in, attend, permute back
+        x = x.permute(0, 2, 3, 1)
+        x = self.attn(x)
+        x = x.permute(0, 3, 1, 2)
+        return x
+
+
+class MaxxVitStage(nn.Module):
+    """ One network stage: a sequence of blocks in which only the first block applies the stage stride.
+
+    Block types: 'C' conv block (ConvNeXt or MbConv per conv_cfg.block_type), 'T' `TransformerBlock2d`,
+    'M' `MaxxVitBlock`, 'PM' `ParallelMaxxVitBlock`.
+    """
+
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: int,
+            stride: int = 2,
+            depth: int = 4,
+            feat_size: Tuple[int, int] = (14, 14),
+            block_types: Union[str, Tuple[str]] = 'C',
+            transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(),
+            conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(),
+            drop_path: Union[float, List[float]] = 0.,
+    ):
+        """
+        Args:
+            in_chs: channel count entering the stage.
+            out_chs: channel count produced by every block of the stage.
+            stride: stride of the first block; later blocks use stride 1.
+            depth: number of blocks; `block_types` is extended to this length via `extend_tuple`.
+            feat_size: passed to `get_rel_pos_cls` as the window size for 'T' blocks.
+            block_types: a block type code, or one code per block.
+            transformer_cfg: config for transformer / attention blocks.
+            conv_cfg: config for conv blocks.
+            drop_path: a single drop-path rate for every block, or one rate per block.
+        """
+        super().__init__()
+        self.grad_checkpointing = False
+
+        block_types = extend_tuple(block_types, depth)
+        # the signature accepts (and defaults to) a scalar rate; expand it so per-block indexing works
+        if not isinstance(drop_path, (list, tuple)):
+            drop_path = [drop_path] * len(block_types)
+        blocks = []
+        for i, t in enumerate(block_types):
+            block_stride = stride if i == 0 else 1
+            assert t in ('C', 'T', 'M', 'PM')
+            if t == 'C':
+                conv_cls = ConvNeXtBlock if conv_cfg.block_type == 'convnext' else MbConvBlock
+                blocks += [conv_cls(
+                    in_chs,
+                    out_chs,
+                    stride=block_stride,
+                    cfg=conv_cfg,
+                    drop_path=drop_path[i],
+                )]
+            elif t == 'T':
+                rel_pos_cls = get_rel_pos_cls(transformer_cfg, feat_size)
+                blocks += [TransformerBlock2d(
+                    in_chs,
+                    out_chs,
+                    stride=block_stride,
+                    rel_pos_cls=rel_pos_cls,
+                    cfg=transformer_cfg,
+                    drop_path=drop_path[i],
+                )]
+            elif t == 'M':
+                blocks += [MaxxVitBlock(
+                    in_chs,
+                    out_chs,
+                    stride=block_stride,
+                    conv_cfg=conv_cfg,
+                    transformer_cfg=transformer_cfg,
+                    drop_path=drop_path[i],
+                )]
+            elif t == 'PM':
+                blocks += [ParallelMaxxVitBlock(
+                    in_chs,
+                    out_chs,
+                    stride=block_stride,
+                    conv_cfg=conv_cfg,
+                    transformer_cfg=transformer_cfg,
+                    drop_path=drop_path[i],
+                )]
+            # every block after the first sees the stage's output width
+            in_chs = out_chs
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        # gradient checkpointing is skipped while scripting
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+
+
+class Stem(nn.Module):
+    """ Two-conv stem: stride-2 conv -> norm + act -> stride-1 conv.
+
+    `out_chs` may be a single width (expanded with `to_2tuple`) or a list / tuple of widths.
+    `self.out_chs` holds the last width and `self.stride` is fixed at 2.
+    """
+
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: int,
+            kernel_size: int = 3,
+            act_layer: str = 'gelu',
+            norm_layer: str = 'batchnorm2d',
+            norm_eps: float = 1e-5,
+    ):
+        super().__init__()
+        widths = out_chs if isinstance(out_chs, (list, tuple)) else to_2tuple(out_chs)
+        first_chs, second_chs = widths[0], widths[1]
+
+        self.stride = 2
+        self.out_chs = widths[-1]
+        norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps)
+
+        self.conv1 = create_conv2d(in_chs, first_chs, kernel_size, stride=2)
+        self.norm1 = norm_act_layer(first_chs)
+        self.conv2 = create_conv2d(first_chs, second_chs, kernel_size, stride=1)
+
+    def init_weights(self, scheme=''):
+        """ Apply `_init_conv` with the given scheme to every submodule of the stem. """
+        conv_init = partial(_init_conv, scheme=scheme)
+        named_apply(conv_init, self)
+
+    def forward(self, x):
+        return self.conv2(self.norm1(self.conv1(x)))
+
+
+def cfg_window_size(cfg: MaxxVitTransformerCfg, img_size: Tuple[int, int]):
+    """ Return a transformer cfg with window_size / grid_size filled in.
+
+    If `cfg.window_size` is already set the cfg is returned unchanged (its grid_size must be truthy).
+    Otherwise both sizes are set to `img_size // cfg.partition_ratio` on a copy made with `replace`.
+    """
+    if cfg.window_size is None:
+        ratio = cfg.partition_ratio
+        partition_size = (img_size[0] // ratio, img_size[1] // ratio)
+        return replace(cfg, window_size=partition_size, grid_size=partition_size)
+
+    assert cfg.grid_size
+    return cfg
+
+
+class MaxxVit(nn.Module):
+    """ CoaTNet + MaxVit base model.
+
+    Highly configurable for different block compositions, tensor layouts, pooling types.
+
+    Layout: `Stem` -> one `MaxxVitStage` per entry of cfg.embed_dim -> final norm -> `ClassifierHead`.
+    """
+
+    def __init__(
+            self,
+            cfg: MaxxVitCfg,
+            img_size: Union[int, Tuple[int, int]] = 224,
+            in_chans: int = 3,
+            num_classes: int = 1000,
+            global_pool: str = 'avg',
+            drop_rate: float = 0.,
+            drop_path_rate: float = 0.
+    ):
+        """
+        Args:
+            cfg: model config (stem width, per-stage embed dims / depths / block types, conv and transformer cfgs).
+            img_size: input size (int or pair); used to derive partition sizes and per-stage feature sizes.
+            in_chans: channel count of the input passed to the stem.
+            num_classes: number of classes for the classifier head.
+            global_pool: pooling type passed to the classifier head.
+            drop_rate: drop rate passed to the classifier head.
+            drop_path_rate: final drop-path rate; per-block rates ramp linearly from 0 to this value.
+        """
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        # fills in window / grid size from img_size when the cfg leaves window_size unset
+        transformer_cfg = cfg_window_size(cfg.transformer_cfg, img_size)
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = cfg.embed_dim[-1]
+        self.embed_dim = cfg.embed_dim
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        self.stem = Stem(
+            in_chs=in_chans,
+            out_chs=cfg.stem_width,
+            act_layer=cfg.conv_cfg.act_layer,
+            norm_layer=cfg.conv_cfg.norm_layer,
+            norm_eps=cfg.conv_cfg.norm_eps,
+        )
+
+        stride = self.stem.stride
+        # feature map size after the stem
+        feat_size = tuple([i // s for i, s in zip(img_size, to_2tuple(stride))])
+
+        num_stages = len(cfg.embed_dim)
+        assert len(cfg.depths) == num_stages
+        # one drop-path rate per block, grouped into one list per stage
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)]
+        in_chs = self.stem.out_chs
+        stages = []
+        for i in range(num_stages):
+            stage_stride = 2
+            out_chs = cfg.embed_dim[i]
+            # feature size after this stage's stride (ceil division)
+            feat_size = tuple([(r - 1) // stage_stride + 1 for r in feat_size])
+            stages += [MaxxVitStage(
+                in_chs,
+                out_chs,
+                depth=cfg.depths[i],
+                block_types=cfg.block_type[i],
+                conv_cfg=cfg.conv_cfg,
+                transformer_cfg=transformer_cfg,
+                feat_size=feat_size,
+                drop_path=dpr[i],
+            )]
+            # running overall stride; not read again within this constructor
+            stride *= stage_stride
+            in_chs = out_chs
+        self.stages = nn.Sequential(*stages)
+
+        final_norm_layer = get_norm_layer(cfg.transformer_cfg.norm_layer)
+        self.norm = final_norm_layer(self.num_features, eps=cfg.transformer_cfg.norm_eps)
+
+        # Classifier head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        # Weight init (default PyTorch init works well for AdamW if scheme not set)
+        assert cfg.weight_init in ('', 'normal', 'trunc_normal', 'xavier_normal', 'vit_eff')
+        if cfg.weight_init:
+            named_apply(partial(self._init_weights, scheme=cfg.weight_init), self)
+
+    def _init_weights(self, module, name, scheme=''):
+        """ `named_apply` callback: delegate to a submodule's own `init_weights` when it defines one. """
+        if hasattr(module, 'init_weights'):
+            try:
+                module.init_weights(scheme=scheme)
+            except TypeError:
+                # fall back to a no-argument call when the scheme-taking call raises TypeError
+                module.init_weights()
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """ Return the set of parameter names containing one of the relative-position substrings below. """
+        return {
+            k for k, _ in self.named_parameters()
+            if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """ Return regex patterns grouping parameters by stem / stage / final norm (`coarse` is not used). """
+        matcher = dict(
+            stem=r'^stem',  # stem and embed
+            blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """ Set the `grad_checkpointing` flag on every stage. """
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """ Return the `fc` layer of the classifier head. """
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """ Replace the head with a fresh `ClassifierHead` for `num_classes`.
+
+        When `global_pool` is None the current head's pool type is reused.
+        NOTE: `self.global_pool` is not updated here.
+        """
+        self.num_classes = num_classes
+        if global_pool is None:
+            global_pool = self.head.global_pool.pool_type
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        """ Stem, stages and final norm; no pooling or classification. """
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """ Apply the classifier head, passing `pre_logits` through to it. """
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_maxxvit(variant, cfg_variant=None, pretrained=False, **kwargs):
+    """ Build a `MaxxVit` through `build_model_with_cfg`.
+
+    The model config is looked up in `model_cfgs` under `cfg_variant` when it is truthy,
+    otherwise under `variant`. Remaining kwargs are forwarded unchanged.
+    """
+    cfg_key = cfg_variant if cfg_variant else variant
+    return build_model_with_cfg(
+        MaxxVit,
+        variant,
+        pretrained,
+        model_cfg=model_cfgs[cfg_key],
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs)
+
+
+@register_model
+def coatnet_pico_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_pico_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_pico_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_nano_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_nano_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_nano_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_0_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_0_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_0_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_1_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_1_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_1_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_2_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_2_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_2_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_3_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_3_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_3_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_bn_0_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_bn_0_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_bn_0_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_rmlp_nano_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_rmlp_nano_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_rmlp_nano_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_rmlp_0_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_rmlp_0_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_rmlp_0_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_rmlp_1_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_rmlp_1_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_rmlp_1_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_rmlp_2_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_rmlp_2_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_rmlp_2_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_rmlp_3_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_rmlp_3_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_rmlp_3_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_nano_cc_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_nano_cc_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_nano_cc_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnext_nano_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnext_nano_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnext_nano_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_0_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_0_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_0_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_1_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_1_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_1_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_2_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_2_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_2_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_3_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_3_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_3_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_4_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_4_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_4_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def coatnet_5_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'coatnet_5_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('coatnet_5_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_pico_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_pico_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_pico_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_nano_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_nano_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_nano_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_tiny_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_tiny_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_tiny_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_tiny_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_tiny_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_tiny_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_rmlp_pico_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_rmlp_pico_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_rmlp_pico_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_rmlp_nano_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_rmlp_nano_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_rmlp_nano_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_rmlp_tiny_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_rmlp_small_rw_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_rmlp_small_rw_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_rmlp_small_rw_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_rmlp_small_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_rmlp_small_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_tiny_pm_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_tiny_pm_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_tiny_pm_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxxvit_rmlp_nano_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxxvit_rmlp_nano_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxxvit_rmlp_nano_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxxvit_rmlp_tiny_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxxvit_rmlp_small_rw_256(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxxvit_rmlp_small_rw_256' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_tiny_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_tiny_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_tiny_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_small_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_small_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_small_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_base_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_base_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_base_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_large_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_large_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_large_224', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def maxvit_xlarge_224(pretrained=False, **kwargs):
+    """Registered entry point for the 'maxvit_xlarge_224' variant; arguments go to `_create_maxxvit`."""
+    model = _create_maxxvit('maxvit_xlarge_224', pretrained=pretrained, **kwargs)
+    return model
+
diff --git a/src/custom_timm/models/mlp_mixer.py b/src/custom_timm/models/mlp_mixer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b044244baa63476f32e63b63e7604748bbbf0360
--- /dev/null
+++ b/src/custom_timm/models/mlp_mixer.py
@@ -0,0 +1,681 @@
+""" MLP-Mixer, ResMLP, and gMLP in PyTorch
+
+This impl originally based on MLP-Mixer paper.
+
+Official JAX impl: https://github.com/google-research/vision_transformer/blob/linen/vit_jax/models_mixer.py
+
+Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+
+@article{tolstikhin2021,
+  title={MLP-Mixer: An all-MLP Architecture for Vision},
+  author={Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner,
+        Thomas and Yung, Jessica and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey},
+  journal={arXiv preprint arXiv:2105.01601},
+  year={2021}
+}
+
+Also supports ResMLP, and a preliminary (not verified) implementation of gMLP
+
+Code: https://github.com/facebookresearch/deit
+Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+@misc{touvron2021resmlp,
+      title={ResMLP: Feedforward networks for image classification with data-efficient training},
+      author={Hugo Touvron and Piotr Bojanowski and Mathilde Caron and Matthieu Cord and Alaaeldin El-Nouby and
+        Edouard Grave and Armand Joulin and Gabriel Synnaeve and Jakob Verbeek and Hervé Jégou},
+      year={2021},
+      eprint={2105.03404},
+}
+
+Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+@misc{liu2021pay,
+      title={Pay Attention to MLPs},
+      author={Hanxiao Liu and Zihang Dai and David R. So and Quoc V. Le},
+      year={2021},
+      eprint={2105.08050},
+}
+
+A thank you to paper authors for releasing code and weights.
+
+Hacked together by / Copyright 2021 Ross Wightman
+"""
+import math
+from copy import deepcopy
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, named_apply, checkpoint_seq
+from .layers import PatchEmbed, Mlp, GluMlp, GatedMlp, DropPath, lecun_normal_, to_2tuple
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    """Build a pretrained-config dict: the MLP-Mixer defaults below, overridden by ``kwargs``.
+
+    Args:
+        url: checkpoint URL stored under ``'url'`` (empty string when not given).
+        **kwargs: entries that replace defaults of the same name or are appended as new keys.
+    """
+    cfg = {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': 0.875, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
+        'first_conv': 'stem.proj', 'classifier': 'head',
+    }
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained configs keyed by model name. Every entry starts from the `_cfg()` defaults and
+# overrides only what differs; entries built with a bare `_cfg()` have an empty 'url'.
+default_cfgs = dict(
+    # MLP-Mixer variants (keep the `_cfg()` default mean/std of 0.5)
+    mixer_s32_224=_cfg(),
+    mixer_s16_224=_cfg(),
+    mixer_b32_224=_cfg(),
+    mixer_b16_224=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224-76587d61.pth',
+    ),
+    mixer_b16_224_in21k=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224_in21k-617b3de2.pth',
+        num_classes=21843
+    ),
+    mixer_l32_224=_cfg(),
+    mixer_l16_224=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_l16_224-92f9adc4.pth',
+    ),
+    mixer_l16_224_in21k=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_l16_224_in21k-846aa33c.pth',
+        num_classes=21843
+    ),
+
+    # Mixer ImageNet-21K-P pretraining
+    # (these two use mean 0 / std 1 and bilinear interpolation instead of the defaults)
+    mixer_b16_224_miil_in21k=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mixer_b16_224_miil_in21k-2a558a71.pth',
+        mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', num_classes=11221,
+    ),
+    mixer_b16_224_miil=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mixer_b16_224_miil-9229a591.pth',
+        mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear',
+    ),
+
+    # GLU-Mixer variants (ImageNet default mean/std)
+    gmixer_12_224=_cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    gmixer_24_224=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gmixer_24_224_raa-7daf7ae6.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+
+    # ResMLP variants (ImageNet default mean/std)
+    resmlp_12_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_12_no_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_24_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_24_no_dist.pth',
+        #url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resmlp_24_224_raa-a8256759.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_36_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_36_no_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_big_24_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_no_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+
+    # ResMLP checkpoints whose file names end in `_dist`
+    resmlp_12_distilled_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_12_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_24_distilled_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_24_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_36_distilled_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_36_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_big_24_distilled_224=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_dist.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+
+    resmlp_big_24_224_in22ft1k=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_22k.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+
+    # ResMLP checkpoints whose file names end in `_dino`
+    resmlp_12_224_dino=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_12_dino.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    resmlp_24_224_dino=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/resmlp_24_dino.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+
+    # gMLP variants (only gmlp_s16_224 has a checkpoint URL)
+    gmlp_ti16_224=_cfg(),
+    gmlp_s16_224=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gmlp_s16_224_raa-10536d42.pth',
+    ),
+    gmlp_b16_224=_cfg(),
+)
+
+
+class MixerBlock(nn.Module):
+    """ Residual block with a token-mixing MLP followed by a channel-mixing MLP
+
+    Based on: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+
+    ``mlp_ratio`` is expanded to a (tokens, channels) pair and each entry is multiplied by ``dim``
+    to give the hidden width of the corresponding MLP.
+    """
+    def __init__(
+            self, dim, seq_len, mlp_ratio=(0.5, 4.0), mlp_layer=Mlp,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop=0., drop_path=0.):
+        super().__init__()
+        token_ratio, channel_ratio = to_2tuple(mlp_ratio)
+        tokens_hidden = int(token_ratio * dim)
+        channels_hidden = int(channel_ratio * dim)
+        self.norm1 = norm_layer(dim)
+        self.mlp_tokens = mlp_layer(seq_len, tokens_hidden, act_layer=act_layer, drop=drop)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp_channels = mlp_layer(dim, channels_hidden, act_layer=act_layer, drop=drop)
+
+    def forward(self, x):
+        # mlp_tokens works on the last axis, so swap axes 1 and 2 around it
+        token_mixed = self.mlp_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2)
+        x = x + self.drop_path(token_mixed)
+        channel_mixed = self.mlp_channels(self.norm2(x))
+        return x + self.drop_path(channel_mixed)
+
+
+class Affine(nn.Module):
+    """Learnable element-wise scale (``alpha``, initialised to 1) and shift (``beta``, initialised to 0).
+
+    Both parameters have shape ``(1, 1, dim)``.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        param_shape = (1, 1, dim)
+        self.alpha = nn.Parameter(torch.ones(param_shape))
+        self.beta = nn.Parameter(torch.zeros(param_shape))
+
+    def forward(self, x):
+        # beta + alpha * x in a single call
+        shift, scale = self.beta, self.alpha
+        return torch.addcmul(shift, scale, x)
+
+
+class ResBlock(nn.Module):
+    """ Residual MLP block w/ LayerScale and Affine 'norm'
+
+    Based on: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+
+    Two residual branches: a single linear layer over axis 1 (applied via transposes) and an MLP over
+    the last axis. Each branch is multiplied by its own learnable vector (``ls1`` / ``ls2``), which
+    starts at ``init_values``.
+    """
+    def __init__(
+            self, dim, seq_len, mlp_ratio=4, mlp_layer=Mlp, norm_layer=Affine,
+            act_layer=nn.GELU, init_values=1e-4, drop=0., drop_path=0.):
+        super().__init__()
+        hidden_dim = int(dim * mlp_ratio)
+        self.norm1 = norm_layer(dim)
+        self.linear_tokens = nn.Linear(seq_len, seq_len)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp_channels = mlp_layer(dim, hidden_dim, act_layer=act_layer, drop=drop)
+        self.ls1 = nn.Parameter(init_values * torch.ones(dim))
+        self.ls2 = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        token_branch = self.linear_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2)
+        x = x + self.drop_path(self.ls1 * token_branch)
+        channel_branch = self.mlp_channels(self.norm2(x))
+        return x + self.drop_path(self.ls2 * channel_branch)
+
+
+class SpatialGatingUnit(nn.Module):
+    """ Spatial Gating Unit
+
+    Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+
+    Splits the last axis in two halves; the second half is normalised, passed through a linear
+    layer along the second-to-last axis and used to multiply the first half element-wise.
+    """
+    def __init__(self, dim, seq_len, norm_layer=nn.LayerNorm):
+        super().__init__()
+        half_dim = dim // 2
+        self.norm = norm_layer(half_dim)
+        self.proj = nn.Linear(seq_len, seq_len)
+
+    def init_weights(self):
+        # special init for the projection gate, called as override by base model init:
+        # near-zero weight and unit bias, so the projection initially outputs ~1
+        proj = self.proj
+        nn.init.normal_(proj.weight, std=1e-6)
+        nn.init.ones_(proj.bias)
+
+    def forward(self, x):
+        passthrough, gate = x.chunk(2, dim=-1)
+        gate = self.norm(gate).transpose(-1, -2)
+        gate = self.proj(gate).transpose(-1, -2)
+        return passthrough * gate
+
+
+class SpatialGatingBlock(nn.Module):
+    """ Residual Block w/ Spatial Gating
+
+    Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+
+    A single residual branch: norm -> ``mlp_layer`` whose gate is a ``SpatialGatingUnit`` bound to
+    ``seq_len`` -> drop-path.
+    """
+    def __init__(
+            self, dim, seq_len, mlp_ratio=4, mlp_layer=GatedMlp,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop=0., drop_path=0.):
+        super().__init__()
+        hidden_dim = int(dim * mlp_ratio)
+        self.norm = norm_layer(dim)
+        gate_factory = partial(SpatialGatingUnit, seq_len=seq_len)
+        self.mlp_channels = mlp_layer(
+            dim, hidden_dim, act_layer=act_layer, gate_layer=gate_factory, drop=drop)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        branch = self.mlp_channels(self.norm(x))
+        return x + self.drop_path(branch)
+
+
+class MlpMixer(nn.Module):
+    """ Patch-embedding stem, a stack of ``block_layer`` blocks, a final norm and a linear classifier head.
+
+    Args:
+        num_classes: classifier outputs; a value <= 0 replaces the head with ``nn.Identity``.
+        img_size, patch_size, in_chans: forwarded to the ``PatchEmbed`` stem.
+        num_blocks: number of ``block_layer`` instances in the stack.
+        embed_dim: stem embedding width; also exposed as ``num_features``.
+        mlp_ratio: passed through to every block.
+        block_layer: block class (or partial), built as ``block_layer(embed_dim, num_patches, mlp_ratio, ...)``.
+        mlp_layer, norm_layer, act_layer: layer factories handed to every block; ``norm_layer`` also
+            builds the final norm.
+        drop_rate: passed to every block as ``drop``.
+        drop_path_rate: passed unchanged to every block as ``drop_path``.
+        nlhb: if True, initialise the head bias to ``-log(num_classes)``.
+        stem_norm: if True, the stem receives ``norm_layer`` as its norm layer.
+        global_pool: ``'avg'`` averages over dim 1 before the head; any other value skips pooling.
+    """
+
+    def __init__(
+            self,
+            num_classes=1000,
+            img_size=224,
+            in_chans=3,
+            patch_size=16,
+            num_blocks=8,
+            embed_dim=512,
+            mlp_ratio=(0.5, 4.0),
+            block_layer=MixerBlock,
+            mlp_layer=Mlp,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            act_layer=nn.GELU,
+            drop_rate=0.,
+            drop_path_rate=0.,
+            nlhb=False,
+            stem_norm=False,
+            global_pool='avg',
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.grad_checkpointing = False
+
+        self.stem = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+            embed_dim=embed_dim, norm_layer=norm_layer if stem_norm else None)
+        # FIXME drop_path (stochastic depth scaling rule or all the same?)
+        self.blocks = nn.Sequential(*[
+            block_layer(
+                embed_dim, self.stem.num_patches, mlp_ratio, mlp_layer=mlp_layer, norm_layer=norm_layer,
+                act_layer=act_layer, drop=drop_rate, drop_path=drop_path_rate)
+            for _ in range(num_blocks)])
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+
+        self.init_weights(nlhb=nlhb)
+
+    @torch.jit.ignore
+    def init_weights(self, nlhb=False):
+        """Initialise every sub-module through ``_init_weights``.
+
+        Args:
+            nlhb: if True, the head bias is set to ``-log(num_classes)`` instead of zero.
+        """
+        # log() is undefined for num_classes <= 0, and in that case the head is nn.Identity
+        # (there is no bias to set), so fall back to 0 rather than raising a math domain error.
+        head_bias = -math.log(self.num_classes) if nlhb and self.num_classes > 0 else 0.
+        named_apply(partial(_init_weights, head_bias=head_bias), module=self)  # depth-first
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return the regex patterns used to group parameters by name (``coarse`` is not used)."""
+        return dict(
+            stem=r'^stem',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Toggle checkpointed execution of the block stack in ``forward_features``."""
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classifier head module."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """Replace the head with a fresh ``nn.Linear`` (or ``nn.Identity`` when ``num_classes <= 0``).
+
+        Args:
+            num_classes: new number of classifier outputs.
+            global_pool: optional new pooling mode; must be ``''`` or ``'avg'`` when given.
+        """
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Run the stem, the block stack (checkpointed if enabled and not scripting) and the final norm."""
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward(self, x):
+        """Compute features, optionally average over dim 1, then apply the head."""
+        x = self.forward_features(x)
+        if self.global_pool == 'avg':
+            # presumably dim 1 is the token axis produced by the stem -- TODO confirm against PatchEmbed
+            x = x.mean(dim=1)
+        x = self.head(x)
+        return x
+
+
+def _init_weights(module: nn.Module, name: str, head_bias: float = 0., flax=False):
+    """ Mixer weight initialization (trying to match Flax defaults)
+
+    Per-module callback; ``MlpMixer.init_weights`` passes it to ``named_apply``.
+
+    Args:
+        module: module to initialise in place.
+        name: module name; a name starting with 'head' selects the classifier init, and a name
+            containing 'mlp' selects the near-zero (std=1e-6) bias init for non-head linears.
+        head_bias: constant written to the classifier bias.
+        flax: if True, non-head linears get LeCun-normal weights and zero bias; otherwise
+            Xavier-uniform weights.
+
+    Parameters that do not exist (``bias=False`` layers, norm layers built without affine
+    parameters) are skipped instead of raising.
+    """
+    if isinstance(module, nn.Linear):
+        if name.startswith('head'):
+            nn.init.zeros_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, head_bias)
+        else:
+            if flax:
+                # Flax defaults
+                lecun_normal_(module.weight)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            else:
+                # like MLP init in vit (my original init)
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    if 'mlp' in name:
+                        nn.init.normal_(module.bias, std=1e-6)
+                    else:
+                        nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        lecun_normal_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
+        # weight/bias are None when the norm layer was built without affine parameters
+        if module.weight is not None:
+            nn.init.ones_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        # NOTE if a parent module contains init_weights method, it can override the init of the
+        # child modules as this will be called in depth-first order.
+        module.init_weights()
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ Remap checkpoints if needed
+
+    A state dict containing 'patch_embed.proj.weight' is treated as an FB ResMlp checkpoint: its keys
+    are renamed to the timm names and every '.alpha' / '.beta' tensor is reshaped to ``(1, 1, -1)``.
+    Any other state dict is returned as-is. ``model`` is not used.
+    """
+    if 'patch_embed.proj.weight' not in state_dict:
+        return state_dict
+
+    # Remap FB ResMlp models -> timm; substitutions are applied in this order
+    renames = (
+        ('patch_embed.', 'stem.'),
+        ('attn.', 'linear_tokens.'),
+        ('mlp.', 'mlp_channels.'),
+        ('gamma_', 'ls'),
+    )
+    remapped = {}
+    for key, value in state_dict.items():
+        for old, new in renames:
+            key = key.replace(old, new)
+        if key.endswith(('.alpha', '.beta')):
+            value = value.reshape(1, 1, -1)
+        remapped[key] = value
+    return remapped
+
+
+def _create_mixer(variant, pretrained=False, **kwargs):
+    """Build an ``MlpMixer`` through ``build_model_with_cfg``, using ``checkpoint_filter_fn`` for checkpoints.
+
+    Raises:
+        RuntimeError: if a truthy ``features_only`` is passed in ``kwargs``.
+    """
+    features_only = kwargs.get('features_only', None)
+    if features_only:
+        raise RuntimeError('features_only not implemented for MLP-Mixer models.')
+
+    return build_model_with_cfg(
+        MlpMixer, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        **kwargs)
+
+
+@register_model
+def mixer_s32_224(pretrained=False, **kwargs):
+    """ Mixer-S/32 224x224 (patch size 32, 8 blocks, embed dim 512)
+    Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=32, num_blocks=8, embed_dim=512, **kwargs)
+    return _create_mixer('mixer_s32_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_s16_224(pretrained=False, **kwargs):
+    """ Mixer-S/16 224x224 (patch size 16, 8 blocks, embed dim 512)
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=8, embed_dim=512, **kwargs)
+    return _create_mixer('mixer_s16_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_b32_224(pretrained=False, **kwargs):
+    """ Mixer-B/32 224x224 (patch size 32, 12 blocks, embed dim 768)
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=32, num_blocks=12, embed_dim=768, **kwargs)
+    return _create_mixer('mixer_b32_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_b16_224(pretrained=False, **kwargs):
+    """ Mixer-B/16 224x224 (patch size 16, 12 blocks, embed dim 768). ImageNet-1k pretrained weights.
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs)
+    return _create_mixer('mixer_b16_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_b16_224_in21k(pretrained=False, **kwargs):
+    """ Mixer-B/16 224x224 (patch size 16, 12 blocks, embed dim 768). ImageNet-21k pretrained weights.
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs)
+    return _create_mixer('mixer_b16_224_in21k', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_l32_224(pretrained=False, **kwargs):
+    """ Mixer-L/32 224x224 (patch size 32, 24 blocks, embed dim 1024).
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=32, num_blocks=24, embed_dim=1024, **kwargs)
+    return _create_mixer('mixer_l32_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_l16_224(pretrained=False, **kwargs):
+    """ Mixer-L/16 224x224 (patch size 16, 24 blocks, embed dim 1024). ImageNet-1k pretrained weights.
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=24, embed_dim=1024, **kwargs)
+    return _create_mixer('mixer_l16_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_l16_224_in21k(pretrained=False, **kwargs):
+    """ Mixer-L/16 224x224 (patch size 16, 24 blocks, embed dim 1024). ImageNet-21k pretrained weights.
+    Paper:  'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=24, embed_dim=1024, **kwargs)
+    return _create_mixer('mixer_l16_224_in21k', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_b16_224_miil(pretrained=False, **kwargs):
+    """ Mixer-B/16 224x224 (patch size 16, 12 blocks, embed dim 768), MIIL weights.
+    Listed under 'Mixer ImageNet-21K-P pretraining' in ``default_cfgs``; keeps the default 1000-class head.
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    mixer_kwargs = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs)
+    return _create_mixer('mixer_b16_224_miil', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def mixer_b16_224_miil_in21k(pretrained=False, **kwargs):
+    """ Mixer-B/16 224x224. ImageNet-21K-P pretrained weights (11221-class head per ``default_cfgs``).
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs)
+    model = _create_mixer('mixer_b16_224_miil_in21k', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def gmixer_12_224(pretrained=False, **kwargs):
+    """ Glu-Mixer-12 224x224 (12 blocks, embed dim 384, GluMlp with SiLU)
+    Experiment by Ross Wightman, adding (Si)GLU to MLP-Mixer
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=(1.0, 4.0),
+        mlp_layer=GluMlp, act_layer=nn.SiLU, **kwargs)
+    return _create_mixer('gmixer_12_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def gmixer_24_224(pretrained=False, **kwargs):
+    """ Glu-Mixer-24 224x224 (24 blocks, embed dim 384, GluMlp with SiLU)
+    Experiment by Ross Wightman, adding (Si)GLU to MLP-Mixer
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=(1.0, 4.0),
+        mlp_layer=GluMlp, act_layer=nn.SiLU, **kwargs)
+    return _create_mixer('gmixer_24_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_12_224(pretrained=False, **kwargs):
+    """ ResMLP-12 (12 ResBlocks, embed dim 384, Affine norm)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_12_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_24_224(pretrained=False, **kwargs):
+    """ ResMLP-24 (24 ResBlocks with init_values=1e-5, embed dim 384, Affine norm)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_24_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_36_224(pretrained=False, **kwargs):
+    """ ResMLP-36 (36 ResBlocks with init_values=1e-6, embed dim 384, Affine norm)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=36, embed_dim=384, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_36_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_big_24_224(pretrained=False, **kwargs):
+    """ ResMLP-B-24 (patch size 8, 24 ResBlocks with init_values=1e-6, embed dim 768, Affine norm)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_big_24_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_12_distilled_224(pretrained=False, **kwargs):
+    """ ResMLP-12, 'distilled' checkpoint variant (same architecture arguments as resmlp_12_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_12_distilled_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_24_distilled_224(pretrained=False, **kwargs):
+    """ ResMLP-24, 'distilled' checkpoint variant (same architecture arguments as resmlp_24_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_24_distilled_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_36_distilled_224(pretrained=False, **kwargs):
+    """ ResMLP-36, 'distilled' checkpoint variant (same architecture arguments as resmlp_36_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=36, embed_dim=384, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_36_distilled_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_big_24_distilled_224(pretrained=False, **kwargs):
+    """ ResMLP-B-24, 'distilled' checkpoint variant (same architecture arguments as resmlp_big_24_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_big_24_distilled_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_big_24_224_in22ft1k(pretrained=False, **kwargs):
+    """ ResMLP-B-24, 'in22ft1k' checkpoint variant (same architecture arguments as resmlp_big_24_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+    """
+    mixer_kwargs = dict(
+        patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_big_24_224_in22ft1k', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_12_224_dino(pretrained=False, **kwargs):
+    """ ResMLP-12 (same architecture arguments as resmlp_12_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+
+    Model pretrained via DINO (self-supervised) - https://arxiv.org/abs/2104.14294
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_12_224_dino', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def resmlp_24_224_dino(pretrained=False, **kwargs):
+    """ ResMLP-24 (same architecture arguments as resmlp_24_224)
+    Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404
+
+    Model pretrained via DINO (self-supervised) - https://arxiv.org/abs/2104.14294
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4,
+        block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs)
+    return _create_mixer('resmlp_24_224_dino', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def gmlp_ti16_224(pretrained=False, **kwargs):
+    """ gMLP-Tiny (patch size 16, 30 SpatialGatingBlocks, embed dim 128, mlp_ratio 6)
+    Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=30, embed_dim=128, mlp_ratio=6, block_layer=SpatialGatingBlock,
+        mlp_layer=GatedMlp, **kwargs)
+    return _create_mixer('gmlp_ti16_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def gmlp_s16_224(pretrained=False, **kwargs):
+    """ gMLP-Small (patch size 16, 30 SpatialGatingBlocks, embed dim 256, mlp_ratio 6)
+    Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=30, embed_dim=256, mlp_ratio=6, block_layer=SpatialGatingBlock,
+        mlp_layer=GatedMlp, **kwargs)
+    return _create_mixer('gmlp_s16_224', pretrained=pretrained, **mixer_kwargs)
+
+
+@register_model
+def gmlp_b16_224(pretrained=False, **kwargs):
+    """ gMLP-Base (patch size 16, 30 SpatialGatingBlocks, embed dim 512, mlp_ratio 6)
+    Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050
+    """
+    mixer_kwargs = dict(
+        patch_size=16, num_blocks=30, embed_dim=512, mlp_ratio=6, block_layer=SpatialGatingBlock,
+        mlp_layer=GatedMlp, **kwargs)
+    return _create_mixer('gmlp_b16_224', pretrained=pretrained, **mixer_kwargs)
diff --git a/src/custom_timm/models/mobilenetv3.py b/src/custom_timm/models/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..19dd8b5b4bf10ea2dc307fda75ed8d49bc312f82
--- /dev/null
+++ b/src/custom_timm/models/mobilenetv3.py
@@ -0,0 +1,739 @@
+""" MobileNet V3
+
+A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl.
+
+Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
+
+Hacked together by / Copyright 2019, Ross Wightman
+"""
+from functools import partial
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from .efficientnet_blocks import SqueezeExcite
+from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights,\
+    round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
+from .features import FeatureInfo, FeatureHooks
+from .helpers import build_model_with_cfg, pretrained_cfg_for_features, checkpoint_seq
+from .layers import SelectAdaptivePool2d, Linear, create_conv2d, get_act_fn, get_norm_act_layer
+from .registry import register_model
+
+__all__ = ['MobileNetV3', 'MobileNetV3Features']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv_stem', 'classifier': 'classifier',
+        **kwargs
+    }
+
+
+# Pretrained configs keyed by model variant name. Every entry is produced by `_cfg`, so it starts from
+# the defaults defined there and only lists the fields it overrides. Entries created with an empty url
+# ('mobilenetv3_large_075', 'lcnet_035', 'lcnet_150') specify no weight download location.
+default_cfgs = {
+    'mobilenetv3_large_075': _cfg(url=''),
+    'mobilenetv3_large_100': _cfg(
+        interpolation='bicubic',
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth'),
+    'mobilenetv3_large_100_miil': _cfg(
+        interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mobilenetv3_large_100_1k_miil_78_0-66471c13.pth'),
+    'mobilenetv3_large_100_miil_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mobilenetv3_large_100_in21k_miil-d71cc17b.pth',
+        interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.), num_classes=11221),
+
+    'mobilenetv3_small_050': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_050_lambc-4b7bbe87.pth',
+        interpolation='bicubic'),
+    'mobilenetv3_small_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_075_lambc-384766db.pth',
+        interpolation='bicubic'),
+    'mobilenetv3_small_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_100_lamb-266a294c.pth',
+        interpolation='bicubic'),
+
+    'mobilenetv3_rw': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
+        interpolation='bicubic'),
+
+    # TF-prefixed variants use the Inception mean/std constants instead of the ImageNet defaults.
+    'tf_mobilenetv3_large_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_large_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_large_minimal_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_100': _cfg(
+        url= 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_minimal_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+
+    # FBNet-V3 entries declare a larger `test_input_size` than the default/declared `input_size`.
+    'fbnetv3_b': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_b_224-ead5d2a1.pth',
+        test_input_size=(3, 256, 256), crop_pct=0.95),
+    'fbnetv3_d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_d_224-c98bce42.pth',
+        test_input_size=(3, 256, 256), crop_pct=0.95),
+    'fbnetv3_g': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_g_240-0b1df83b.pth',
+        input_size=(3, 240, 240), test_input_size=(3, 288, 288), crop_pct=0.95, pool_size=(8, 8)),
+
+    "lcnet_035": _cfg(),
+    "lcnet_050": _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_050-f447553b.pth',
+        interpolation='bicubic',
+    ),
+    "lcnet_075": _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_075-318cad2c.pth',
+        interpolation='bicubic',
+    ),
+    "lcnet_100": _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_100-a929038c.pth',
+        interpolation='bicubic',
+    ),
+    "lcnet_150": _cfg(),
+}
+
+
+class MobileNetV3(nn.Module):
+    """ MobileNet-V3
+
+    Based on my EfficientNet implementation and building blocks, this model utilizes the MobileNet-v3 specific
+    'efficient head', where global pooling is done before the head convolution without a final batch-norm
+    layer before the classifier.
+
+    Paper: `Searching for MobileNetV3` - https://arxiv.org/abs/1905.02244
+
+    Other architectures utilizing MobileNet-V3 efficient head that are supported by this impl include:
+      * HardCoRe-NAS - https://arxiv.org/abs/2102.11646 (defn in hardcorenas.py uses this class)
+      * FBNet-V3 - https://arxiv.org/abs/2006.02049
+      * LCNet - https://arxiv.org/abs/2109.15099
+    """
+
+    def __init__(
+            self, block_args, num_classes=1000, in_chans=3, stem_size=16, fix_stem=False, num_features=1280,
+            head_bias=True, pad_type='', act_layer=None, norm_layer=None, se_layer=None, se_from_exp=True,
+            round_chs_fn=round_channels, drop_rate=0., drop_path_rate=0., global_pool='avg'):
+        """ Assemble stem, block stages, and the pool -> conv -> act -> classifier head.
+
+        Args:
+          block_args: decoded block definitions handed to `EfficientNetBuilder` to create `self.blocks`.
+          num_classes: classifier output size; a value <= 0 replaces the classifier with `nn.Identity`.
+          in_chans: number of input channels of the stem convolution.
+          stem_size: stem output channels; passed through `round_chs_fn` unless `fix_stem` is set.
+          fix_stem: when True, use `stem_size` as given (no rounding/scaling).
+          num_features: output channels of `conv_head`, i.e. the classifier input width.
+          head_bias: whether `conv_head` has a bias term.
+          pad_type: padding spec forwarded to `create_conv2d` and to the block builder.
+          act_layer: activation layer class, defaults to `nn.ReLU`.
+          norm_layer: normalization layer class, defaults to `nn.BatchNorm2d`.
+          se_layer: squeeze-excite layer class, defaults to `SqueezeExcite`.
+          se_from_exp: forwarded unchanged to `EfficientNetBuilder`.
+          round_chs_fn: channel rounding function used for the stem and by the builder.
+          drop_rate: dropout probability applied before the classifier in `forward_head`.
+          drop_path_rate: forwarded unchanged to `EfficientNetBuilder`.
+          global_pool: pool type for `SelectAdaptivePool2d`; a falsy value also disables flattening.
+        """
+        super(MobileNetV3, self).__init__()
+        act_layer = act_layer or nn.ReLU
+        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        se_layer = se_layer or SqueezeExcite
+        self.num_classes = num_classes
+        self.num_features = num_features
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        # Stem
+        if not fix_stem:
+            stem_size = round_chs_fn(stem_size)
+        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            output_stride=32, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp,
+            act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate)
+        self.blocks = nn.Sequential(*builder(stem_size, block_args))
+        self.feature_info = builder.features
+        head_chs = builder.in_chs  # channel count the builder reports after the last stage
+
+        # Head + Pooling
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        num_pooled_chs = head_chs * self.global_pool.feat_mult()
+        self.conv_head = create_conv2d(num_pooled_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
+        self.act2 = act_layer(inplace=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
+        self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        efficientnet_init_weights(self)
+
+    def as_sequential(self):
+        """ Return the model as a flat `nn.Sequential`.
+
+        The existing stem, blocks, pool, head conv, activation and classifier modules are reused; a new
+        `nn.Flatten()` and `nn.Dropout(self.drop_rate)` are inserted before the classifier.
+        """
+        layers = [self.conv_stem, self.bn1]
+        layers.extend(self.blocks)
+        layers.extend([self.global_pool, self.conv_head, self.act2])
+        layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+        return nn.Sequential(*layers)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """ Return regex patterns for grouping parameters: one for the stem and one for the blocks.
+
+        With `coarse=True` blocks are matched per stage index only, otherwise per (stage, block) index pair.
+        """
+        return dict(
+            stem=r'^conv_stem|bn1',
+            blocks=r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)'
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """ Toggle the flag that makes `forward_features` run the blocks through `checkpoint_seq`. """
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """ Return the classifier module. """
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """ Replace the pooling, flatten and classifier modules for a new `num_classes` / `global_pool`.
+
+        `conv_head` is left untouched.
+        """
+        self.num_classes = num_classes
+        # cannot meaningfully change pooling of efficient head after creation
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
+        self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """ Run the stem and block stages and return their output (before pooling and head). """
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        # checkpointing path is skipped while scripting
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x, flatten=True)
+        else:
+            x = self.blocks(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """ Apply pool -> conv_head -> act2, then either return flattened features or class outputs.
+
+        Args:
+          x: output of `forward_features`.
+          pre_logits: if True, return `x.flatten(1)` right after the head activation, skipping
+            dropout and the classifier.
+        """
+        x = self.global_pool(x)
+        x = self.conv_head(x)
+        x = self.act2(x)
+        if pre_logits:
+            return x.flatten(1)
+        else:
+            x = self.flatten(x)
+            if self.drop_rate > 0.:
+                x = F.dropout(x, p=self.drop_rate, training=self.training)
+            return self.classifier(x)
+
+    def forward(self, x):
+        """ Full forward pass: `forward_features` followed by `forward_head`. """
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+class MobileNetV3Features(nn.Module):
+    """ MobileNetV3 Feature Extractor
+
+    A work-in-progress feature extraction module for MobileNet-V3 to use as a backbone for segmentation
+    and object detection models.
+
+    Only the stem and block stages are built (no pooling/head/classifier); `forward` returns a list of
+    intermediate feature tensors.
+    """
+
+    def __init__(
+            self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3,
+            stem_size=16, fix_stem=False, output_stride=32, pad_type='', round_chs_fn=round_channels,
+            se_from_exp=True, act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.):
+        """ Build stem and block stages and set up feature collection.
+
+        Args:
+          block_args: decoded block definitions handed to `EfficientNetBuilder`.
+          out_indices: indices into the builder's feature list selecting which features to return.
+          feature_location: forwarded to the builder; any value other than 'bottleneck' makes the
+            module collect features through `FeatureHooks` instead of directly in `forward`.
+          in_chans: number of input channels of the stem convolution.
+          stem_size: stem output channels; passed through `round_chs_fn` unless `fix_stem` is set.
+          fix_stem: when True, use `stem_size` as given.
+          output_stride: forwarded unchanged to `EfficientNetBuilder`.
+          pad_type: padding spec forwarded to `create_conv2d` and to the builder.
+          round_chs_fn: channel rounding function used for the stem and by the builder.
+          se_from_exp: forwarded unchanged to `EfficientNetBuilder`.
+          act_layer: activation layer class, defaults to `nn.ReLU`.
+          norm_layer: normalization layer class, defaults to `nn.BatchNorm2d`.
+          se_layer: squeeze-excite layer class, defaults to `SqueezeExcite`.
+          drop_rate: stored on the instance; not used elsewhere in this class.
+          drop_path_rate: forwarded unchanged to `EfficientNetBuilder`.
+        """
+        super(MobileNetV3Features, self).__init__()
+        act_layer = act_layer or nn.ReLU
+        norm_layer = norm_layer or nn.BatchNorm2d
+        se_layer = se_layer or SqueezeExcite
+        self.drop_rate = drop_rate
+
+        # Stem
+        if not fix_stem:
+            stem_size = round_chs_fn(stem_size)
+        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_layer(stem_size)
+        self.act1 = act_layer(inplace=True)
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp,
+            act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer,
+            drop_path_rate=drop_path_rate, feature_location=feature_location)
+        self.blocks = nn.Sequential(*builder(stem_size, block_args))
+        self.feature_info = FeatureInfo(builder.features, out_indices)
+        # maps the 'stage' value of each selected feature to its position in feature_info
+        self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices}
+
+        efficientnet_init_weights(self)
+
+        # Register feature extraction hooks with FeatureHooks helper
+        self.feature_hooks = None
+        if feature_location != 'bottleneck':
+            hooks = self.feature_info.get_dicts(keys=('module', 'hook_type'))
+            self.feature_hooks = FeatureHooks(hooks, self.named_modules())
+
+    def forward(self, x) -> List[torch.Tensor]:
+        """ Return the selected intermediate features as a list of tensors.
+
+        Without hooks, the stem output is included when stage 0 is selected and the output of
+        `self.blocks[i]` is included when stage `i + 1` is selected. With hooks, the blocks are run
+        for their side effect and the recorded outputs are read back from `self.feature_hooks`.
+        """
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        if self.feature_hooks is None:
+            features = []
+            if 0 in self._stage_out_idx:
+                features.append(x)  # add stem out
+            for i, b in enumerate(self.blocks):
+                x = b(x)
+                if i + 1 in self._stage_out_idx:
+                    features.append(x)
+            return features
+        else:
+            # return value of the blocks is discarded; hooks capture the features
+            self.blocks(x)
+            out = self.feature_hooks.get_output(x.device)
+            return list(out.values())
+
+
+def _create_mnv3(variant, pretrained=False, **kwargs):
+    features_only = False
+    model_cls = MobileNetV3
+    kwargs_filter = None
+    if kwargs.pop('features_only', False):
+        features_only = True
+        kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'head_bias', 'global_pool')
+        model_cls = MobileNetV3Features
+    model = build_model_with_cfg(
+        model_cls, variant, pretrained,
+        pretrained_strict=not features_only,
+        kwargs_filter=kwargs_filter,
+        **kwargs)
+    if features_only:
+        model.default_cfg = pretrained_cfg_for_features(model.default_cfg)
+    return model
+
+
+def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MobileNet-V3 model.
+
+    Ref impl: ?
+    Paper: https://arxiv.org/abs/1905.02244
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_e1_c16_nre_noskip'],  # relu
+        # stage 1, 112x112 in
+        ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
+        # stage 2, 56x56 in
+        ['ir_r3_k5_s2_e3_c40_se0.25_nre'],  # relu
+        # stage 3, 28x28 in
+        ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
+        # stage 4, 14x14in
+        ['ir_r2_k3_s1_e6_c112_se0.25'],  # hard-swish
+        # stage 5, 14x14in
+        ['ir_r3_k5_s2_e6_c160_se0.25'],  # hard-swish
+        # stage 6, 7x7 in
+        ['cn_r1_k1_s1_c960'],  # hard-swish
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        head_bias=False,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'hard_swish'),
+        se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid'),
+        **kwargs,
+    )
+    model = _create_mnv3(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MobileNet-V3 model.
+
+    The architecture is picked from the variant name: 'small' in the name selects the small layout
+    (1024 head features) instead of the large one (1280 head features), and 'minimal' selects the
+    layout without squeeze-excite / 5x5 kernels and with a 'relu' default activation instead of
+    'hard_swish'.
+
+    Ref impl: ?
+    Paper: https://arxiv.org/abs/1905.02244
+
+    Args:
+      variant: model/config name; also inspected for the 'small' and 'minimal' substrings.
+      channel_multiplier: multiplier to number of channels per layer.
+      pretrained: passed on to `_create_mnv3`.
+      **kwargs: extra model arguments forwarded to the model.
+    """
+    if 'small' in variant:
+        num_features = 1024
+        if 'minimal' in variant:
+            act_layer = resolve_act_layer(kwargs, 'relu')
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s2_e1_c16'],
+                # stage 1, 56x56 in
+                ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'],
+                # stage 2, 28x28 in
+                ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'],
+                # stage 3, 14x14 in
+                ['ir_r2_k3_s1_e3_c48'],
+                # stage 4, 14x14in
+                ['ir_r3_k3_s2_e6_c96'],
+                # stage 5, 7x7 in
+                ['cn_r1_k1_s1_c576'],
+            ]
+        else:
+            act_layer = resolve_act_layer(kwargs, 'hard_swish')
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s2_e1_c16_se0.25_nre'],  # relu
+                # stage 1, 56x56 in
+                ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'],  # relu
+                # stage 2, 28x28 in
+                ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'],  # hard-swish
+                # stage 3, 14x14 in
+                ['ir_r2_k5_s1_e3_c48_se0.25'],  # hard-swish
+                # stage 4, 14x14in
+                ['ir_r3_k5_s2_e6_c96_se0.25'],  # hard-swish
+                # stage 5, 7x7 in
+                ['cn_r1_k1_s1_c576'],  # hard-swish
+            ]
+    else:
+        num_features = 1280
+        if 'minimal' in variant:
+            act_layer = resolve_act_layer(kwargs, 'relu')
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s1_e1_c16'],
+                # stage 1, 112x112 in
+                ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'],
+                # stage 2, 56x56 in
+                ['ir_r3_k3_s2_e3_c40'],
+                # stage 3, 28x28 in
+                ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],
+                # stage 4, 14x14in
+                ['ir_r2_k3_s1_e6_c112'],
+                # stage 5, 14x14in
+                ['ir_r3_k3_s2_e6_c160'],
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c960'],
+            ]
+        else:
+            act_layer = resolve_act_layer(kwargs, 'hard_swish')
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s1_e1_c16_nre'],  # relu
+                # stage 1, 112x112 in
+                ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
+                # stage 2, 56x56 in
+                ['ir_r3_k5_s2_e3_c40_se0.25_nre'],  # relu
+                # stage 3, 28x28 in
+                ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
+                # stage 4, 14x14in
+                ['ir_r2_k3_s1_e6_c112_se0.25'],  # hard-swish
+                # stage 5, 14x14in
+                ['ir_r3_k5_s2_e6_c160_se0.25'],  # hard-swish
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c960'],  # hard-swish
+            ]
+    se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels)
+    # NOTE(review): resolve_act_layer (above) and resolve_bn_args (below) are given `kwargs` before it
+    # is expanded via `**kwargs`; presumably they consume their own keys - confirm in efficientnet_builder.
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        num_features=num_features,
+        stem_size=16,
+        # stem width is kept at 16 (not scaled) for multipliers below 0.75
+        fix_stem=channel_multiplier < 0.75,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=act_layer,
+        se_layer=se_layer,
+        **kwargs,
+    )
+    model = _create_mnv3(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_fbnetv3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """ FBNetV3
+    Paper: `FBNetV3: Joint Architecture-Recipe Search using Predictor Pretraining`
+        - https://arxiv.org/abs/2006.02049
+    FIXME untested, this is a preliminary impl of some FBNet-V3 variants.
+    """
+    vl = variant.split('_')[-1]
+    if vl in ('a', 'b'):
+        stem_size = 16
+        arch_def = [
+            ['ds_r2_k3_s1_e1_c16'],
+            ['ir_r1_k5_s2_e4_c24', 'ir_r3_k5_s1_e2_c24'],
+            ['ir_r1_k5_s2_e5_c40_se0.25', 'ir_r4_k5_s1_e3_c40_se0.25'],
+            ['ir_r1_k5_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'],
+            ['ir_r1_k3_s1_e5_c120_se0.25', 'ir_r5_k5_s1_e3_c120_se0.25'],
+            ['ir_r1_k3_s2_e6_c184_se0.25', 'ir_r5_k5_s1_e4_c184_se0.25', 'ir_r1_k5_s1_e6_c224_se0.25'],
+            ['cn_r1_k1_s1_c1344'],
+        ]
+    elif vl == 'd':
+        stem_size = 24
+        arch_def = [
+            ['ds_r2_k3_s1_e1_c16'],
+            ['ir_r1_k3_s2_e5_c24', 'ir_r5_k3_s1_e2_c24'],
+            ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r4_k3_s1_e3_c40_se0.25'],
+            ['ir_r1_k3_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'],
+            ['ir_r1_k3_s1_e5_c128_se0.25', 'ir_r6_k5_s1_e3_c128_se0.25'],
+            ['ir_r1_k3_s2_e6_c208_se0.25', 'ir_r5_k5_s1_e5_c208_se0.25', 'ir_r1_k5_s1_e6_c240_se0.25'],
+            ['cn_r1_k1_s1_c1440'],
+        ]
+    elif vl == 'g':
+        stem_size = 32
+        arch_def = [
+            ['ds_r3_k3_s1_e1_c24'],
+            ['ir_r1_k5_s2_e4_c40', 'ir_r4_k5_s1_e2_c40'],
+            ['ir_r1_k5_s2_e4_c56_se0.25', 'ir_r4_k5_s1_e3_c56_se0.25'],
+            ['ir_r1_k5_s2_e5_c104', 'ir_r4_k3_s1_e3_c104'],
+            ['ir_r1_k3_s1_e5_c160_se0.25', 'ir_r8_k5_s1_e3_c160_se0.25'],
+            ['ir_r1_k3_s2_e6_c264_se0.25', 'ir_r6_k5_s1_e5_c264_se0.25', 'ir_r2_k5_s1_e6_c288_se0.25'],
+            ['cn_r1_k1_s1_c1728'],
+        ]
+    else:
+        raise NotImplemented
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.95)
+    se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=round_chs_fn)
+    act_layer = resolve_act_layer(kwargs, 'hard_swish')
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        num_features=1984,
+        head_bias=False,
+        stem_size=stem_size,
+        round_chs_fn=round_chs_fn,
+        se_from_exp=False,
+        norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=act_layer,
+        se_layer=se_layer,
+        **kwargs,
+    )
+    model = _create_mnv3(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_lcnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """ LCNet
+    Essentially a MobileNet-V3 crossed with a MobileNet-V1
+
+    Paper: `PP-LCNet: A Lightweight CPU Convolutional Neural Network` - https://arxiv.org/abs/2109.15099
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['dsa_r1_k3_s1_c32'],
+        # stage 1, 112x112 in
+        ['dsa_r2_k3_s2_c64'],
+        # stage 2, 56x56 in
+        ['dsa_r2_k3_s2_c128'],
+        # stage 3, 28x28 in
+        ['dsa_r1_k3_s2_c256', 'dsa_r1_k5_s1_c256'],
+        # stage 4, 14x14in
+        ['dsa_r4_k5_s1_c256'],
+        # stage 5, 14x14in
+        ['dsa_r2_k5_s2_c512_se0.25'],
+        # 7x7
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=16,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'hard_swish'),
+        se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU),
+        num_features=1280,
+        **kwargs,
+    )
+    model = _create_mnv3(variant, pretrained, **model_kwargs)
+    return model
+
+
+def _gen_lcnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """ LCNet
+    Essentially a MobileNet-V3 crossed with a MobileNet-V1
+
+    Paper: `PP-LCNet: A Lightweight CPU Convolutional Neural Network` - https://arxiv.org/abs/2109.15099
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['dsa_r1_k3_s1_c32'],
+        # stage 1, 112x112 in
+        ['dsa_r2_k3_s2_c64'],
+        # stage 2, 56x56 in
+        ['dsa_r2_k3_s2_c128'],
+        # stage 3, 28x28 in
+        ['dsa_r1_k3_s2_c256', 'dsa_r1_k5_s1_c256'],
+        # stage 4, 14x14in
+        ['dsa_r4_k5_s1_c256'],
+        # stage 5, 14x14in
+        ['dsa_r2_k5_s2_c512_se0.25'],
+        # 7x7
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        stem_size=16,
+        round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
+        norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)),
+        act_layer=resolve_act_layer(kwargs, 'hard_swish'),
+        se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU),
+        num_features=1280,
+        **kwargs,
+    )
+    model = _create_mnv3(variant, pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_large_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_large_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_large_100_miil(pretrained=False, **kwargs):
+    """ MobileNet V3
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    model = _gen_mobilenet_v3('mobilenetv3_large_100_miil', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_large_100_miil_in21k(pretrained=False, **kwargs):
+    """ MobileNet V3, 21k pretraining
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    model = _gen_mobilenet_v3('mobilenetv3_large_100_miil_in21k', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_small_050(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3('mobilenetv3_small_050', 0.50, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_small_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_small_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_rw(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    if pretrained:
+        # pretrained model trained with non-default BN epsilon
+        kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_large_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_large_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def fbnetv3_b(pretrained=False, **kwargs):
+    """ FBNetV3-B """
+    model = _gen_fbnetv3('fbnetv3_b', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def fbnetv3_d(pretrained=False, **kwargs):
+    """ FBNetV3-D """
+    model = _gen_fbnetv3('fbnetv3_d', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def fbnetv3_g(pretrained=False, **kwargs):
+    """ FBNetV3-G """
+    model = _gen_fbnetv3('fbnetv3_g', pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lcnet_035(pretrained=False, **kwargs):
+    """ PP-LCNet 0.35"""
+    model = _gen_lcnet('lcnet_035', 0.35, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lcnet_050(pretrained=False, **kwargs):
+    """ PP-LCNet 0.5"""
+    model = _gen_lcnet('lcnet_050', 0.5, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lcnet_075(pretrained=False, **kwargs):
+    """ PP-LCNet 1.0"""
+    model = _gen_lcnet('lcnet_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lcnet_100(pretrained=False, **kwargs):
+    """ PP-LCNet 1.0"""
+    model = _gen_lcnet('lcnet_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def lcnet_150(pretrained=False, **kwargs):
+    """ PP-LCNet 1.5"""
+    model = _gen_lcnet('lcnet_150', 1.5, pretrained=pretrained, **kwargs)
+    return model
diff --git a/src/custom_timm/models/mobilevit.py b/src/custom_timm/models/mobilevit.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd5479a7cf9a379cc40e918a57980db6812be045
--- /dev/null
+++ b/src/custom_timm/models/mobilevit.py
@@ -0,0 +1,699 @@
+""" MobileViT
+
+Paper:
+V1: `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - https://arxiv.org/abs/2110.02178
+V2: `Separable Self-attention for Mobile Vision Transformers` - https://arxiv.org/abs/2206.02680
+
+MobileVitBlock and checkpoints adapted from https://github.com/apple/ml-cvnets (original copyright below)
+License: https://github.com/apple/ml-cvnets/blob/main/LICENSE (Apple open source)
+
+Rest of code, ByobNet, and Transformer block hacked together by / Copyright 2022, Ross Wightman
+"""
+#
+# For licensing see accompanying LICENSE file.
+# Copyright (C) 2020 Apple Inc. All Rights Reserved.
+#
+import math
+from typing import Union, Callable, Dict, Tuple, Optional, Sequence
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from .byobnet import register_block, ByoBlockCfg, ByoModelCfg, ByobNet, LayerFn, num_groups
+from .fx_features import register_notrace_module
+from .layers import to_2tuple, make_divisible, LayerNorm2d, GroupNorm1, ConvMlp, DropPath
+from .vision_transformer import Block as TransformerBlock
+from .helpers import build_model_with_cfg
+from .registry import register_model
+
+__all__ = []  # empty on purpose: a star-import of this module exports no names
+
+
+def _cfg(url='', **kwargs):
+    # Defaults for this file's pretrained configs; entries in `kwargs` override or extend them.
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=0.9, interpolation='bicubic', mean=(0., 0., 0.), std=(1., 1., 1.),
+        first_conv='stem.conv', classifier='head.fc', fixed_input_size=False,
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # variant name -> pretrained config dict produced by _cfg()
+    'mobilevit_xxs': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_xxs-ad385b40.pth'),
+    'mobilevit_xs': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_xs-8fbd6366.pth'),
+    'mobilevit_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_s-38a5a959.pth'),
+    'semobilevit_s': _cfg(),  # no weight URL given: _cfg() leaves 'url' as ''
+    # MobileViTv2 entries (suffix matches the multiplier used in model_cfgs); all override crop_pct to 0.888
+    'mobilevitv2_050': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_050-49951ee2.pth',
+        crop_pct=0.888),
+    'mobilevitv2_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_075-b5556ef6.pth',
+        crop_pct=0.888),
+    'mobilevitv2_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_100-e464ef3b.pth',
+        crop_pct=0.888),
+    'mobilevitv2_125': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_125-0ae35027.pth',
+        crop_pct=0.888),
+    'mobilevitv2_150': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150-737c5019.pth',
+        crop_pct=0.888),
+    'mobilevitv2_175': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175-16462ee2.pth',
+        crop_pct=0.888),
+    'mobilevitv2_200': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200-b3422f67.pth',
+        crop_pct=0.888),
+    # `_in22ft1k` entries: separate weight files, default 256x256 input size, crop_pct 0.888
+    'mobilevitv2_150_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150_in22ft1k-0b555d7b.pth',
+        crop_pct=0.888),
+    'mobilevitv2_175_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175_in22ft1k-4117fa1f.pth',
+        crop_pct=0.888),
+    'mobilevitv2_200_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200_in22ft1k-1d7c8927.pth',
+        crop_pct=0.888),
+    # `_384_in22ft1k` entries: 384x384 input, 12x12 pool size, crop_pct 1.0
+    'mobilevitv2_150_384_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150_384_in22ft1k-9e142854.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    'mobilevitv2_175_384_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175_384_in22ft1k-059cbe56.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+    'mobilevitv2_200_384_in22ft1k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200_384_in22ft1k-32c87503.pth',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
+}
+
+
+def _inverted_residual_block(d, c, s, br=4.0):
+    # An inverted residual is expressed as a 'bottle' block: bottle ratio br (> 1) applied to in_chs,
+    # a linear output, and gs=1 (depthwise).
+    extra = dict(bottle_in=True, linear_out=True)
+    return ByoBlockCfg(type='bottle', d=d, c=c, s=s, gs=1, br=br, block_kwargs=extra)
+
+
+def _mobilevit_block(d, c, s, transformer_dim, transformer_depth, patch_size=4, br=4.0):
+    # One MobileViT stage as a pair of block cfgs: an inverted residual (depth d, stride s)
+    # followed by a single stride-1 'mobilevit' block with the same channel count c.
+    ir_cfg = _inverted_residual_block(d=d, c=c, s=s, br=br)
+    vit_kwargs = dict(
+        transformer_dim=transformer_dim,
+        transformer_depth=transformer_depth,
+        patch_size=patch_size,
+    )
+    vit_cfg = ByoBlockCfg(type='mobilevit', d=1, c=c, s=1, block_kwargs=vit_kwargs)
+    # Returned as a 2-tuple: (inverted residual cfg, transformer block cfg).
+    return ir_cfg, vit_cfg
+
+
+def _mobilevitv2_block(d, c, s, transformer_depth, patch_size=2, br=2.0, transformer_br=0.5):
+    # One MobileViTv2 stage as a pair of block cfgs: an inverted residual (depth d, stride s)
+    # followed by a single stride-1 'mobilevit2' block (bottle ratio transformer_br, gs=1).
+    ir_cfg = _inverted_residual_block(d=d, c=c, s=s, br=br)
+    vit_kwargs = dict(transformer_depth=transformer_depth, patch_size=patch_size)
+    vit_cfg = ByoBlockCfg(
+        type='mobilevit2', d=1, c=c, s=1, br=transformer_br, gs=1,
+        block_kwargs=vit_kwargs,
+    )
+    # Returned as a 2-tuple: (inverted residual cfg, transformer block cfg).
+    return ir_cfg, vit_cfg
+
+
+def _mobilevitv2_cfg(multiplier=1.0):
+    # Build the MobileViTv2 ByoModelCfg; stage and stem channels are scaled by `multiplier` (int() truncates).
+    base_chs = (64, 128, 256, 384, 512)
+    chs = base_chs if multiplier == 1.0 else tuple(int(ch * multiplier) for ch in base_chs)
+    stages = (
+        _inverted_residual_block(d=1, c=chs[0], s=1, br=2.0),
+        _inverted_residual_block(d=2, c=chs[1], s=2, br=2.0),
+        _mobilevitv2_block(d=1, c=chs[2], s=2, transformer_depth=2),
+        _mobilevitv2_block(d=1, c=chs[3], s=2, transformer_depth=4),
+        _mobilevitv2_block(d=1, c=chs[4], s=2, transformer_depth=3),
+    )
+    return ByoModelCfg(
+        blocks=stages,
+        stem_chs=int(32 * multiplier),
+        stem_type='3x3',
+        stem_pool='',
+        downsample='',
+        act_layer='silu',
+    )
+
+
+model_cfgs = dict(  # architecture name -> ByoModelCfg; looked up by _create_mobilevit / _create_mobilevit2
+    mobilevit_xxs=ByoModelCfg(
+        blocks=(
+            _inverted_residual_block(d=1, c=16, s=1, br=2.0),
+            _inverted_residual_block(d=3, c=24, s=2, br=2.0),
+            _mobilevit_block(d=1, c=48, s=2, transformer_dim=64, transformer_depth=2, patch_size=2, br=2.0),
+            _mobilevit_block(d=1, c=64, s=2, transformer_dim=80, transformer_depth=4, patch_size=2, br=2.0),
+            _mobilevit_block(d=1, c=80, s=2, transformer_dim=96, transformer_depth=3, patch_size=2, br=2.0),
+        ),
+        stem_chs=16,
+        stem_type='3x3',
+        stem_pool='',
+        downsample='',
+        act_layer='silu',
+        num_features=320,
+    ),
+    # XS: wider channels than XXS; br is not passed, so the helpers' default br=4.0 applies
+    mobilevit_xs=ByoModelCfg(
+        blocks=(
+            _inverted_residual_block(d=1, c=32, s=1),
+            _inverted_residual_block(d=3, c=48, s=2),
+            _mobilevit_block(d=1, c=64, s=2, transformer_dim=96, transformer_depth=2, patch_size=2),
+            _mobilevit_block(d=1, c=80, s=2, transformer_dim=120, transformer_depth=4, patch_size=2),
+            _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=3, patch_size=2),
+        ),
+        stem_chs=16,
+        stem_type='3x3',
+        stem_pool='',
+        downsample='',
+        act_layer='silu',
+        num_features=384,
+    ),
+    # S: widest of the three plain MobileViT configs defined here
+    mobilevit_s=ByoModelCfg(
+        blocks=(
+            _inverted_residual_block(d=1, c=32, s=1),
+            _inverted_residual_block(d=3, c=64, s=2),
+            _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2),
+            _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2),
+            _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2),
+        ),
+        stem_chs=16,
+        stem_type='3x3',
+        stem_pool='',
+        downsample='',
+        act_layer='silu',
+        num_features=640,
+    ),
+    # SE variant of S: same blocks plus attn_layer='se'; NOTE act_layer is not set here, unlike the configs above
+    semobilevit_s=ByoModelCfg(
+        blocks=(
+            _inverted_residual_block(d=1, c=32, s=1),
+            _inverted_residual_block(d=3, c=64, s=2),
+            _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2),
+            _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2),
+            _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2),
+        ),
+        stem_chs=16,
+        stem_type='3x3',
+        stem_pool='',
+        downsample='',
+        attn_layer='se',
+        attn_kwargs=dict(rd_ratio=1/8),
+        num_features=640,
+    ),
+    # MobileViTv2 family: one shared layout from _mobilevitv2_cfg, scaled by the multiplier passed in
+    mobilevitv2_050=_mobilevitv2_cfg(.50),
+    mobilevitv2_075=_mobilevitv2_cfg(.75),
+    mobilevitv2_125=_mobilevitv2_cfg(1.25),
+    mobilevitv2_100=_mobilevitv2_cfg(1.0),
+    mobilevitv2_150=_mobilevitv2_cfg(1.5),
+    mobilevitv2_175=_mobilevitv2_cfg(1.75),
+    mobilevitv2_200=_mobilevitv2_cfg(2.0),
+)
+
+
+@register_notrace_module
+class MobileVitBlock(nn.Module):
+    """ MobileViT block: kxk conv + 1x1 projection, transformer over unfolded patches, 1x1 projection, optional fusion
+        Paper: https://arxiv.org/abs/2110.02178?context=cs.LG
+    """
+    def __init__(
+            self,
+            in_chs: int,  # channels of the input feature map
+            out_chs: Optional[int] = None,  # falls back to in_chs when not given
+            kernel_size: int = 3,  # kernel of conv_kxk and of the fusion conv
+            stride: int = 1,  # stride of conv_kxk
+            bottle_ratio: float = 1.0,  # only used to derive transformer_dim when that is not given
+            group_size: Optional[int] = None,  # turned into the conv_kxk group count by num_groups()
+            dilation: Tuple[int, int] = (1, 1),  # only dilation[0] is used
+            mlp_ratio: float = 2.0,
+            transformer_dim: Optional[int] = None,
+            transformer_depth: int = 2,  # number of stacked TransformerBlock modules
+            patch_size: int = 8,  # expanded to (patch_h, patch_w) with to_2tuple
+            num_heads: int = 4,
+            attn_drop: float = 0.,
+            drop: float = 0.,
+            no_fusion: bool = False,  # True skips the concat + fusion conv with the block input
+            drop_path_rate: float = 0.,
+            layers: LayerFn = None,
+            transformer_norm_layer: Callable = nn.LayerNorm,
+            **kwargs,  # eat unused args
+    ):
+        super(MobileVitBlock, self).__init__()
+
+        layers = layers or LayerFn()
+        groups = num_groups(group_size, in_chs)
+        out_chs = out_chs or in_chs
+        transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)
+        # Local representation: kxk conv-norm-act on in_chs, then a bias-free 1x1 conv to transformer_dim
+        self.conv_kxk = layers.conv_norm_act(
+            in_chs, in_chs, kernel_size=kernel_size,
+            stride=stride, groups=groups, dilation=dilation[0])
+        self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False)
+        # Global representation: transformer_depth identical blocks followed by a final norm
+        self.transformer = nn.Sequential(*[
+            TransformerBlock(
+                transformer_dim, mlp_ratio=mlp_ratio, num_heads=num_heads, qkv_bias=True,
+                attn_drop=attn_drop, drop=drop, drop_path=drop_path_rate,
+                act_layer=layers.act, norm_layer=transformer_norm_layer)
+            for _ in range(transformer_depth)
+        ])
+        self.norm = transformer_norm_layer(transformer_dim)
+
+        self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1)
+        # Fusion conv consumes cat(block input, projected output), hence in_chs + out_chs input channels
+        if no_fusion:
+            self.conv_fusion = None
+        else:
+            self.conv_fusion = layers.conv_norm_act(in_chs + out_chs, out_chs, kernel_size=kernel_size, stride=1)
+
+        self.patch_size = to_2tuple(patch_size)
+        self.patch_area = self.patch_size[0] * self.patch_size[1]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut = x
+
+        # Local representation
+        x = self.conv_kxk(x)
+        x = self.conv_1x1(x)
+
+        # Unfold (feature map -> patches)
+        patch_h, patch_w = self.patch_size
+        B, C, H, W = x.shape
+        new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
+        num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w  # n_h, n_w
+        num_patches = num_patch_h * num_patch_w  # N
+        interpolate = False
+        if new_h != H or new_w != W:
+            # Note: Padding can be done, but then it needs to be handled in attention function.
+            x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)
+            interpolate = True
+
+        # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] --> (transpose) [B * C * n_h, n_w, p_h, p_w]
+        x = x.reshape(B * C * num_patch_h, patch_h, num_patch_w, patch_w).transpose(1, 2)
+        # [B * C * n_h, n_w, p_h, p_w] --> [BP, N, C] where P = p_h * p_w and N = n_h * n_w
+        x = x.reshape(B, C, num_patches, self.patch_area).transpose(1, 3).reshape(B * self.patch_area, num_patches, -1)
+
+        # Global representations
+        x = self.transformer(x)
+        x = self.norm(x)
+
+        # Fold (patch -> feature map)
+        # [B, P, N, C] --> [B*C*n_h, n_w, p_h, p_w]
+        x = x.contiguous().view(B, self.patch_area, num_patches, -1)
+        x = x.transpose(1, 3).reshape(B * C * num_patch_h, num_patch_w, patch_h, patch_w)
+        # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W]
+        x = x.transpose(1, 2).reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)
+        if interpolate:
+            x = F.interpolate(x, size=(H, W), mode="bilinear", align_corners=False)
+
+        x = self.conv_proj(x)
+        if self.conv_fusion is not None:
+            x = self.conv_fusion(torch.cat((shortcut, x), dim=1))
+        return x
+
+
+class LinearSelfAttention(nn.Module):
+    """
+    Self-attention with linear complexity (`https://arxiv.org/abs/2206.02680`); works as self- or cross-attention.
+    Args:
+        embed_dim (int): :math:`C` from an expected input of size :math:`(B, C, P, N)`
+        attn_drop (float): Dropout value for context scores. Default: 0.0
+        proj_drop (float): Dropout applied after the output projection. Default: 0.0
+        bias (bool): Use bias in learnable layers. Default: True
+    Shape:
+        - Input: :math:`(B, C, P, N)` where :math:`B` is the batch size, :math:`C` is the input channels,
+        :math:`P` is the number of pixels in the patch, and :math:`N` is the number of patches
+        - Output: same as the input
+    .. note::
+        For MobileViTv2, we unfold the feature map [B, C, H, W] into [B, C, P, N] where P is the number of pixels
+        in a patch and N is the number of patches. Because channel is the first dimension in this unfolded tensor,
+        we use point-wise convolution (instead of a linear layer). This avoids a transpose operation (which may be
+        expensive on resource-constrained devices) that may be required to convert the unfolded tensor from
+        channel-first to channel-last format in case of a linear layer.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+
+        self.qkv_proj = nn.Conv2d(
+            in_channels=embed_dim,
+            out_channels=1 + (2 * embed_dim),
+            bias=bias,
+            kernel_size=1,
+        )
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.out_proj = nn.Conv2d(
+            in_channels=embed_dim,
+            out_channels=embed_dim,
+            bias=bias,
+            kernel_size=1,
+        )
+        self.out_drop = nn.Dropout(proj_drop)
+
+    def _forward_self_attn(self, x: torch.Tensor) -> torch.Tensor:
+        # [B, C, P, N] --> [B, 1 + 2d, P, N]
+        qkv = self.qkv_proj(x)
+
+        # Project x into query, key and value
+        # Query --> [B, 1, P, N]
+        # value, key --> [B, d, P, N]
+        query, key, value = qkv.split([1, self.embed_dim, self.embed_dim], dim=1)
+
+        # apply softmax along N dimension
+        context_scores = F.softmax(query, dim=-1)
+        context_scores = self.attn_drop(context_scores)
+
+        # Compute context vector
+        # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] --> [B, d, P, 1]
+        context_vector = (key * context_scores).sum(dim=-1, keepdim=True)
+
+        # combine context vector with values
+        # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
+        out = F.relu(value) * context_vector.expand_as(value)
+        out = self.out_proj(out)
+        out = self.out_drop(out)
+        return out
+
+    @torch.jit.ignore()
+    def _forward_cross_attn(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
+        # x --> [B, C, P, N]        (supplies the values; the output has this shape)
+        # x_prev = [B, C, P, M]     (supplies the query and the key)
+        _, _, kv_patch_area, _ = x.shape  # 4-way unpack also enforces a 4-D x
+        q_patch_area = x_prev.shape[-2]  # must be read from x_prev, otherwise the check below is vacuous
+
+        assert (
+            kv_patch_area == q_patch_area
+        ), "The number of pixels in a patch for query and key_value should be the same"
+
+        # compute query, key, and value
+        # [B, C, P, M] --> [B, 1 + d, P, M]   (first 1 + d output channels of qkv_proj; bias may be absent)
+        qk = F.conv2d(
+            x_prev,
+            weight=self.qkv_proj.weight[:self.embed_dim + 1],
+            bias=self.qkv_proj.bias[:self.embed_dim + 1] if self.qkv_proj.bias is not None else None,
+        )
+
+        # [B, 1 + d, P, M] --> [B, 1, P, M], [B, d, P, M]
+        query, key = qk.split([1, self.embed_dim], dim=1)
+        # [B, C, P, N] --> [B, d, P, N]   (remaining d output channels: slice, not a single row, keeps weight 4-D)
+        value = F.conv2d(
+            x,
+            weight=self.qkv_proj.weight[self.embed_dim + 1:],
+            bias=self.qkv_proj.bias[self.embed_dim + 1:] if self.qkv_proj.bias is not None else None,
+        )
+
+        # apply softmax along M dimension
+        context_scores = F.softmax(query, dim=-1)
+        context_scores = self.attn_drop(context_scores)
+
+        # compute context vector
+        # [B, d, P, M] * [B, 1, P, M] -> [B, d, P, M] --> [B, d, P, 1]
+        context_vector = (key * context_scores).sum(dim=-1, keepdim=True)
+
+        # combine context vector with values
+        # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
+        out = F.relu(value) * context_vector.expand_as(value)
+        out = self.out_proj(out)
+        out = self.out_drop(out)
+        return out
+
+    def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if x_prev is None:
+            return self._forward_self_attn(x)
+        else:
+            return self._forward_cross_attn(x, x_prev=x_prev)
+
+
+class LinearTransformerBlock(nn.Module):
+    """
+    Pre-norm transformer encoder block built on linear self-attention (MobileViTv2, https://arxiv.org/abs/2206.02680)
+    Args:
+        embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, P, N)`
+        mlp_ratio (float): Hidden width of the feed-forward MLP as a multiple of embed_dim. Default: 2.0
+        drop (float): Dropout rate passed to the attention output projection and to the MLP. Default: 0.0
+        attn_drop (float): Dropout rate for the attention context scores. Default: 0.0
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        act_layer / norm_layer (Callable): MLP activation / normalization layer. Defaults: nn.SiLU / GroupNorm1
+    Shape:
+        - Input: :math:`(B, C_{in}, P, N)` where :math:`B` is batch size, :math:`C_{in}` is input embedding dim,
+            :math:`P` is number of pixels in a patch, and :math:`N` is number of patches,
+        - Output: same shape as the input
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        mlp_ratio: float = 2.0,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        drop_path: float = 0.0,
+        act_layer=None,
+        norm_layer=None,
+    ) -> None:
+        super().__init__()
+        act_fn = act_layer or nn.SiLU
+        norm_fn = norm_layer or GroupNorm1
+        hidden_dim = int(embed_dim * mlp_ratio)
+
+        # Attention sub-block
+        self.norm1 = norm_fn(embed_dim)
+        self.attn = LinearSelfAttention(embed_dim=embed_dim, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path1 = DropPath(drop_path)
+
+        # Feed-forward sub-block
+        self.norm2 = norm_fn(embed_dim)
+        self.mlp = ConvMlp(
+            in_features=embed_dim, hidden_features=hidden_dim, act_layer=act_fn, drop=drop)
+        self.drop_path2 = DropPath(drop_path)
+
+    def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
+        # Attention sub-block: pre-norm, then self-attention when x_prev is None, cross-attention otherwise
+        normed = self.norm1(x)
+        if x_prev is None:
+            attn_out = self.attn(normed)
+        else:
+            attn_out = self.attn(normed, x_prev)
+        x = x + self.drop_path1(attn_out)
+
+        # Feed-forward sub-block: pre-norm -> MLP -> drop-path, added back onto the running tensor
+        ffn_out = self.mlp(self.norm2(x))
+        ffn_out = self.drop_path2(ffn_out)
+        x = x + ffn_out
+        return x
+
+
+@register_notrace_module
+class MobileVitV2Block(nn.Module):
+    """
+    MobileViTv2 block (https://arxiv.org/abs/2206.02680): kxk conv + 1x1 conv, linear-attention transformer, 1x1 proj
+    """
+
+    def __init__(
+        self,
+        in_chs: int,  # channels of the input feature map
+        out_chs: Optional[int] = None,  # falls back to in_chs when not given
+        kernel_size: int = 3,  # kernel of conv_kxk (its stride is fixed to 1)
+        bottle_ratio: float = 1.0,  # only used to derive transformer_dim when that is not given
+        group_size: Optional[int] = 1,  # turned into the conv_kxk group count by num_groups()
+        dilation: Tuple[int, int] = (1, 1),  # only dilation[0] is used
+        mlp_ratio: float = 2.0,
+        transformer_dim: Optional[int] = None,
+        transformer_depth: int = 2,  # number of stacked LinearTransformerBlock modules
+        patch_size: int = 8,  # expanded to (patch_h, patch_w) with to_2tuple
+        attn_drop: float = 0.,
+        drop: float = 0.,
+        drop_path_rate: float = 0.,
+        layers: LayerFn = None,
+        transformer_norm_layer: Callable = GroupNorm1,
+        **kwargs,  # eat unused args
+    ):
+        super(MobileVitV2Block, self).__init__()
+        layers = layers or LayerFn()
+        groups = num_groups(group_size, in_chs)
+        out_chs = out_chs or in_chs
+        transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)
+        # Local representation: kxk conv-norm-act on in_chs, then a bias-free 1x1 conv to transformer_dim
+        self.conv_kxk = layers.conv_norm_act(
+            in_chs, in_chs, kernel_size=kernel_size,
+            stride=1, groups=groups, dilation=dilation[0])
+        self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False)
+        # Global representation: transformer_depth identical blocks followed by a final norm
+        self.transformer = nn.Sequential(*[
+            LinearTransformerBlock(
+                transformer_dim,
+                mlp_ratio=mlp_ratio,
+                attn_drop=attn_drop,
+                drop=drop,
+                drop_path=drop_path_rate,
+                act_layer=layers.act,
+                norm_layer=transformer_norm_layer
+            )
+            for _ in range(transformer_depth)
+        ])
+        self.norm = transformer_norm_layer(transformer_dim)
+        # Output projection is built with apply_act=False
+        self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1, apply_act=False)
+
+        self.patch_size = to_2tuple(patch_size)
+        self.patch_area = self.patch_size[0] * self.patch_size[1]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, C, H, W = x.shape
+        patch_h, patch_w = self.patch_size
+        new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
+        num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w  # n_h, n_w
+        num_patches = num_patch_h * num_patch_w  # N
+        if new_h != H or new_w != W:
+            x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=True)
+        # NOTE: the resized (new_h, new_w) grid is kept through to the output; nothing below resizes back to (H, W)
+        # Local representation
+        x = self.conv_kxk(x)
+        x = self.conv_1x1(x)
+
+        # Unfold (feature map -> patches), [B, C, H, W] -> [B, C, P, N]
+        C = x.shape[1]  # channel count after conv_1x1
+        x = x.reshape(B, C, num_patch_h, patch_h, num_patch_w, patch_w).permute(0, 1, 3, 5, 2, 4)
+        x = x.reshape(B, C, -1, num_patches)
+
+        # Global representations
+        x = self.transformer(x)
+        x = self.norm(x)
+
+        # Fold (patches -> feature map), [B, C, P, N] --> [B, C, H, W]
+        x = x.reshape(B, C, patch_h, patch_w, num_patch_h, num_patch_w).permute(0, 1, 4, 2, 5, 3)
+        x = x.reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)
+
+        x = self.conv_proj(x)
+        return x
+
+
+register_block('mobilevit', MobileVitBlock)  # 'mobilevit' is the ByoBlockCfg type emitted by _mobilevit_block
+register_block('mobilevit2', MobileVitV2Block)  # 'mobilevit2' is the ByoBlockCfg type emitted by _mobilevitv2_block
+
+
+def _create_mobilevit(variant, cfg_variant=None, pretrained=False, **kwargs):
+    # Architecture cfg comes from `cfg_variant` when it is truthy, otherwise from `variant` itself.
+    arch_cfg = model_cfgs[cfg_variant] if cfg_variant else model_cfgs[variant]
+    return build_model_with_cfg(
+        ByobNet, variant, pretrained,
+        model_cfg=arch_cfg, feature_cfg=dict(flatten_sequential=True), **kwargs)
+
+
+def _create_mobilevit2(variant, cfg_variant=None, pretrained=False, **kwargs):
+    # Architecture cfg comes from `cfg_variant` when it is truthy, otherwise from `variant` itself.
+    arch_cfg = model_cfgs[cfg_variant] if cfg_variant else model_cfgs[variant]
+    return build_model_with_cfg(
+        ByobNet, variant, pretrained,
+        model_cfg=arch_cfg, feature_cfg=dict(flatten_sequential=True), **kwargs)
+
+
+@register_model
+def mobilevit_xxs(pretrained=False, **kwargs):  # builds model_cfgs['mobilevit_xxs']
+    return _create_mobilevit(variant='mobilevit_xxs', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevit_xs(pretrained=False, **kwargs):  # builds model_cfgs['mobilevit_xs']
+    return _create_mobilevit(variant='mobilevit_xs', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevit_s(pretrained=False, **kwargs):  # builds model_cfgs['mobilevit_s']
+    return _create_mobilevit(variant='mobilevit_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def semobilevit_s(pretrained=False, **kwargs):  # builds model_cfgs['semobilevit_s'] (attn_layer='se')
+    return _create_mobilevit(variant='semobilevit_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_050(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 0.50
+    return _create_mobilevit(variant='mobilevitv2_050', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_075(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 0.75
+    return _create_mobilevit(variant='mobilevitv2_075', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_100(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 1.0
+    return _create_mobilevit(variant='mobilevitv2_100', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_125(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 1.25
+    return _create_mobilevit(variant='mobilevitv2_125', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_150(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 1.5
+    return _create_mobilevit(variant='mobilevitv2_150', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_175(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 1.75
+    return _create_mobilevit(variant='mobilevitv2_175', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_200(pretrained=False, **kwargs):  # MobileViTv2 cfg with multiplier 2.0
+    return _create_mobilevit(variant='mobilevitv2_200', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_150_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_150'.
+    return _create_mobilevit('mobilevitv2_150_in22ft1k', 'mobilevitv2_150', pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_175_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_175'.
+    return _create_mobilevit('mobilevitv2_175_in22ft1k', 'mobilevitv2_175', pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_200_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_200'.
+    return _create_mobilevit('mobilevitv2_200_in22ft1k', 'mobilevitv2_200', pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_150_384_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own (384x384) default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_150'.
+    return _create_mobilevit('mobilevitv2_150_384_in22ft1k', 'mobilevitv2_150', pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_175_384_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own (384x384) default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_175'.
+    return _create_mobilevit('mobilevitv2_175_384_in22ft1k', 'mobilevitv2_175', pretrained, **kwargs)
+
+
+@register_model
+def mobilevitv2_200_384_in22ft1k(pretrained=False, **kwargs):
+    # Variant name has its own (384x384) default_cfgs entry; the architecture cfg is reused from 'mobilevitv2_200'.
+    return _create_mobilevit('mobilevitv2_200_384_in22ft1k', 'mobilevitv2_200', pretrained, **kwargs)
\ No newline at end of file
diff --git a/src/custom_timm/models/mvitv2.py b/src/custom_timm/models/mvitv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7ec58979f3b2f35393f4555abcb3342d055710b
--- /dev/null
+++ b/src/custom_timm/models/mvitv2.py
@@ -0,0 +1,1010 @@
+""" Multi-Scale Vision Transformer v2
+
+@inproceedings{li2021improved,
+  title={MViTv2: Improved multiscale vision transformers for classification and detection},
+  author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
+  booktitle={CVPR},
+  year={2022}
+}
+
+Code adapted from original Apache 2.0 licensed impl at https://github.com/facebookresearch/mvit
+Original copyright below.
+
+Modifications and timm support by / Copyright 2022, Ross Wightman
+"""
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
+import operator
+from collections import OrderedDict
+from dataclasses import dataclass
+from functools import partial, reduce
+from typing import Union, List, Tuple, Optional
+
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg
+from .layers import Mlp, DropPath, trunc_normal_tf_, get_norm_layer, to_2tuple
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    """Build a pretrained-config dict: the defaults below overlaid with `kwargs`.
+
+    Args:
+        url: value stored under the 'url' key.
+        **kwargs: extra entries; a key that matches a default replaces that default.
+
+    Returns:
+        A new dict holding the defaults plus every entry of `kwargs`.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=None,
+        crop_pct=.9,
+        interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='patch_embed.proj',
+        classifier='head.fc',
+        fixed_input_size=True,
+    )
+    # Applied last so caller-supplied values win over the defaults.
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained-config entries keyed by model name. Each value is the `_cfg` default dict with
+# the overrides given here.
+default_cfgs = {
+    'mvitv2_tiny': _cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth'),
+    'mvitv2_small': _cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth'),
+    'mvitv2_base': _cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth'),
+    'mvitv2_large': _cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth'),
+
+    # The *_in21k entries override num_classes (19168 instead of the default 1000).
+    'mvitv2_base_in21k': _cfg(
+        url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in21k.pyth',
+        num_classes=19168,
+    ),
+    'mvitv2_large_in21k': _cfg(
+        url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in21k.pyth',
+        num_classes=19168,
+    ),
+    'mvitv2_huge_in21k': _cfg(
+        url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_H_in21k.pyth',
+        num_classes=19168,
+    ),
+
+    # Entry with an empty URL.
+    'mvitv2_small_cls': _cfg(url=''),
+}
+
+
+@dataclass
+class MultiScaleVitCfg:
+    """Architecture hyper-parameters for a multi-scale ViT (MViTv2) model.
+
+    The number of stages is `len(depths)`. After construction, `__post_init__` guarantees:
+
+    * `embed_dim` and `num_heads` hold one value per stage (a scalar is expanded by
+      doubling it at every successive stage);
+    * `stride_kv` holds one stride pair per stage whenever it was left as `None` and
+      `stride_kv_adaptive` is set.
+    """
+    depths: Tuple[int, ...] = (2, 3, 16, 3)
+    embed_dim: Union[int, Tuple[int, ...]] = 96
+    num_heads: Union[int, Tuple[int, ...]] = 1
+    mlp_ratio: float = 4.
+    pool_first: bool = False
+    expand_attn: bool = True
+    qkv_bias: bool = True
+    use_cls_token: bool = False
+    use_abs_pos: bool = False
+    residual_pooling: bool = True
+    mode: str = 'conv'
+    kernel_qkv: Tuple[int, int] = (3, 3)
+    # One (h, w) stride pair per stage; the annotation is variadic to match the defaults.
+    stride_q: Optional[Tuple[Tuple[int, int], ...]] = ((1, 1), (2, 2), (2, 2), (2, 2))
+    stride_kv: Optional[Tuple[Tuple[int, int], ...]] = None
+    stride_kv_adaptive: Optional[Tuple[int, int]] = (4, 4)
+    patch_kernel: Tuple[int, int] = (7, 7)
+    patch_stride: Tuple[int, int] = (4, 4)
+    patch_padding: Tuple[int, int] = (3, 3)
+    pool_type: str = 'max'
+    rel_pos_type: str = 'spatial'
+    act_layer: Union[str, Tuple[str, str]] = 'gelu'
+    norm_layer: Union[str, Tuple[str, str]] = 'layernorm'
+    norm_eps: float = 1e-6
+
+    def __post_init__(self):
+        """Expand scalar settings to per-stage tuples and derive `stride_kv` when unset.
+
+        Raises:
+            AssertionError: if `embed_dim` or `num_heads` is a sequence whose length differs
+                from the number of stages.
+        """
+        num_stages = len(self.depths)
+        if not isinstance(self.embed_dim, (tuple, list)):
+            self.embed_dim = tuple(self.embed_dim * 2 ** i for i in range(num_stages))
+        assert len(self.embed_dim) == num_stages, 'embed_dim needs one entry per stage'
+
+        if not isinstance(self.num_heads, (tuple, list)):
+            self.num_heads = tuple(self.num_heads * 2 ** i for i in range(num_stages))
+        assert len(self.num_heads) == num_stages, 'num_heads needs one entry per stage'
+
+        if self.stride_kv_adaptive is not None and self.stride_kv is None:
+            # Start from the adaptive stride. At every stage whose query stride is > 1 in
+            # all dimensions, divide the running k/v stride by that query stride, never
+            # letting it fall below 1.
+            running_stride = self.stride_kv_adaptive
+            per_stage_stride = []
+            for i in range(num_stages):
+                stage_stride_q = self.stride_q[i]
+                if min(stage_stride_q) > 1:
+                    running_stride = [
+                        max(kv // stage_stride_q[d], 1)
+                        for d, kv in enumerate(running_stride)
+                    ]
+                per_stage_stride.append(tuple(running_stride))
+            self.stride_kv = tuple(per_stage_stride)
+
+
+# Architecture configs keyed by model name. Fields not listed keep the `MultiScaleVitCfg`
+# defaults. Every entry is its own instance, even where the settings coincide.
+model_cfgs = {
+    'mvitv2_tiny': MultiScaleVitCfg(depths=(1, 2, 5, 2)),
+    'mvitv2_small': MultiScaleVitCfg(depths=(1, 2, 11, 2)),
+    'mvitv2_base': MultiScaleVitCfg(depths=(2, 3, 16, 3)),
+    'mvitv2_large': MultiScaleVitCfg(
+        depths=(2, 6, 36, 4),
+        embed_dim=144,
+        num_heads=2,
+        expand_attn=False,
+    ),
+
+    'mvitv2_base_in21k': MultiScaleVitCfg(depths=(2, 3, 16, 3)),
+    'mvitv2_large_in21k': MultiScaleVitCfg(
+        depths=(2, 6, 36, 4),
+        embed_dim=144,
+        num_heads=2,
+        expand_attn=False,
+    ),
+
+    'mvitv2_small_cls': MultiScaleVitCfg(
+        depths=(1, 2, 11, 2),
+        use_cls_token=True,
+    ),
+}
+
+
+def prod(iterable):
+    """Return the product of all items in `iterable` (1 when it is empty)."""
+    result = 1
+    for factor in iterable:
+        result = result * factor
+    return result
+
+
+class PatchEmbed(nn.Module):
+    """
+    Convolutional patch embedding.
+
+    A single `nn.Conv2d` (`proj`) maps a `B x dim_in x H x W` input to a `dim_out`-channel
+    feature map, which is then flattened into a token sequence.
+    """
+
+    def __init__(
+            self,
+            dim_in=3,
+            dim_out=768,
+            kernel=(7, 7),
+            stride=(4, 4),
+            padding=(3, 3),
+    ):
+        super().__init__()
+        self.proj = nn.Conv2d(dim_in, dim_out, kernel_size=kernel, stride=stride, padding=padding)
+
+    def forward(self, x) -> Tuple[torch.Tensor, List[int]]:
+        """Return the `B x (H'*W') x dim_out` tokens and the `(H', W')` size of the conv output."""
+        feat = self.proj(x)
+        feat_size = feat.shape[-2:]
+        # B C H W -> B HW C
+        tokens = feat.flatten(2).transpose(1, 2)
+        return tokens, feat_size
+
+
+@register_notrace_function
+def reshape_pre_pool(
+        x,
+        feat_size: List[int],
+        has_cls_token: bool = True
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    """Turn a 4D token tensor into a channels-first 2D map ready for pooling.
+
+    Args:
+        x: 4D tensor with the token axis at dim 2 and channels last.
+        feat_size: `(H, W)` of the token grid (excluding any class token).
+        has_cls_token: when True, the first token along dim 2 is split off.
+
+    Returns:
+        `(x, cls_tok)`: `x` reshaped to `(-1, C, H, W)` and made contiguous, and the
+        split-off class token (`None` when `has_cls_token` is False).
+    """
+    height, width = feat_size
+    cls_tok: Optional[torch.Tensor] = None
+    if has_cls_token:
+        cls_tok = x[:, :, :1, :]
+        x = x[:, :, 1:, :]
+    channels = x.shape[-1]
+    grid = x.reshape(-1, height, width, channels)
+    # channels-last -> channels-first for the 2D pooling ops
+    return grid.permute(0, 3, 1, 2).contiguous(), cls_tok
+
+
+@register_notrace_function
+def reshape_post_pool(
+        x,
+        num_heads: int,
+        cls_tok: Optional[torch.Tensor] = None
+) -> Tuple[torch.Tensor, List[int]]:
+    """Flatten a pooled channels-first map back into a per-head token tensor.
+
+    Args:
+        x: 4D tensor whose dims 2 and 3 are the pooled height and width.
+        num_heads: size of the head axis in the reshaped result.
+        cls_tok: optional class token that is prepended along the token axis (dim 2).
+
+    Returns:
+        `(x, feat_size)`: `x` reshaped to `(-1, num_heads, L, C)` with `L = H * W`
+        (plus the class token if given), and `feat_size = [H, W]` after pooling.
+    """
+    channels, pooled_h, pooled_w = x.shape[1], x.shape[2], x.shape[3]
+    feat_size = [pooled_h, pooled_w]
+    x = x.reshape(-1, num_heads, channels, pooled_h * pooled_w)
+    # (.., C, L) -> (.., L, C)
+    x = x.transpose(2, 3)
+    if cls_tok is not None:
+        x = torch.cat((cls_tok, x), dim=2)
+    return x, feat_size
+
+
+@register_notrace_function
+def cal_rel_pos_type(
+        attn: torch.Tensor,
+        q: torch.Tensor,
+        has_cls_token: bool,
+        q_size: List[int],
+        k_size: List[int],
+        rel_pos_h: torch.Tensor,
+        rel_pos_w: torch.Tensor,
+):
+    """
+    Spatial Relative Positional Embeddings.
+
+    Adds separate height and width relative-position terms to the attention logits
+    between spatial tokens. A leading class token (if any) is left untouched.
+
+    Args:
+        attn: attention logits; the spatial part `attn[:, :, sp_idx:, sp_idx:]` is updated
+            IN PLACE.
+        q: queries of shape `(B, n_head, q_N, dim)`.
+        has_cls_token: whether index 0 of the token axes is a class token.
+        q_size: `(q_h, q_w)` grid size of the query tokens.
+        k_size: `(k_h, k_w)` grid size of the key tokens.
+        rel_pos_h: table indexed along dim 0 by the shifted vertical distance.
+        rel_pos_w: table indexed along dim 0 by the shifted horizontal distance.
+
+    Returns:
+        The same `attn` tensor that was passed in, after the in-place update.
+    """
+    sp_idx = 1 if has_cls_token else 0
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    # Build the index tensors on q's device so the table lookups below do not mix devices.
+    device = q.device
+
+    # Scale up rel pos if shapes for q and k are different.
+    q_h_ratio = max(k_h / q_h, 1.0)
+    k_h_ratio = max(q_h / k_h, 1.0)
+    dist_h = (
+        torch.arange(q_h, device=device)[:, None] * q_h_ratio
+        - torch.arange(k_h, device=device)[None, :] * k_h_ratio
+    )
+    # Shift so that the smallest distance maps to index 0.
+    dist_h += (k_h - 1) * k_h_ratio
+    q_w_ratio = max(k_w / q_w, 1.0)
+    k_w_ratio = max(q_w / k_w, 1.0)
+    dist_w = (
+        torch.arange(q_w, device=device)[:, None] * q_w_ratio
+        - torch.arange(k_w, device=device)[None, :] * k_w_ratio
+    )
+    dist_w += (k_w - 1) * k_w_ratio
+
+    # Look up one embedding per (query row, key row) and (query col, key col) pair.
+    Rh = rel_pos_h[dist_h.long()]
+    Rw = rel_pos_w[dist_w.long()]
+
+    B, n_head, q_N, dim = q.shape
+
+    # Spatial queries laid out on their 2D grid.
+    r_q = q[:, :, sp_idx:].reshape(B, n_head, q_h, q_w, dim)
+    rel_h = torch.einsum("byhwc,hkc->byhwk", r_q, Rh)
+    rel_w = torch.einsum("byhwc,wkc->byhwk", r_q, Rw)
+
+    # Broadcast the height term over key columns and the width term over key rows.
+    attn[:, :, sp_idx:, sp_idx:] = (
+        attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_h, q_w, k_h, k_w)
+        + rel_h[:, :, :, :, :, None]
+        + rel_w[:, :, :, :, None, :]
+    ).view(B, -1, q_h * q_w, k_h * k_w)
+
+    return attn
+
+
+class MultiScaleAttentionPoolFirst(nn.Module):
+    """Multi-head attention whose q/k/v pooling runs BEFORE the linear q/k/v projections.
+
+    The input tokens are folded into channel groups, optionally pooled on the 2D token
+    grid (max / avg pooling or a depthwise conv), and only then projected by the separate
+    `q`, `k` and `v` linear layers.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            feat_size,
+            num_heads=8,
+            qkv_bias=True,
+            mode="conv",
+            kernel_q=(1, 1),
+            kernel_kv=(1, 1),
+            stride_q=(1, 1),
+            stride_kv=(1, 1),
+            has_cls_token=True,
+            rel_pos_type='spatial',
+            residual_pooling=True,
+            norm_layer=nn.LayerNorm,
+    ):
+        """
+        Args:
+            dim: input width of the q/k/v linear layers.
+            dim_out: output width of the q/k/v linear layers and of the final projection.
+            feat_size: (H, W) of the incoming token grid; only used to size the relative
+                position tables and must be square when `rel_pos_type == 'spatial'`.
+            num_heads: number of attention heads.
+            qkv_bias: whether the q/k/v linear layers have a bias.
+            mode: pooling type, one of 'avg', 'max', 'conv', 'conv_unshared'.
+            kernel_q, kernel_kv: pooling kernel sizes for q and for k/v.
+            stride_q, stride_kv: pooling strides for q and for k/v.
+            has_cls_token: whether the first token is a class token (kept out of pooling).
+            rel_pos_type: 'spatial' creates and applies relative position tables; any other
+                value disables them.
+            residual_pooling: if True, the (pooled, projected) q is added to the attention output.
+            norm_layer: norm constructor applied after conv pooling.
+
+        Raises:
+            NotImplementedError: if `mode` is not one of the supported values.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        self.dim_out = dim_out
+        self.head_dim = dim_out // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.has_cls_token = has_cls_token
+        # Half-kernel padding for the pooling ops.
+        padding_q = tuple([int(q // 2) for q in kernel_q])
+        padding_kv = tuple([int(kv // 2) for kv in kernel_kv])
+
+        self.q = nn.Linear(dim, dim_out, bias=qkv_bias)
+        self.k = nn.Linear(dim, dim_out, bias=qkv_bias)
+        self.v = nn.Linear(dim, dim_out, bias=qkv_bias)
+        self.proj = nn.Linear(dim_out, dim_out)
+
+        # Skip pooling entirely when both kernel and stride are all ones.
+        if prod(kernel_q) == 1 and prod(stride_q) == 1:
+            kernel_q = None
+        if prod(kernel_kv) == 1 and prod(stride_kv) == 1:
+            kernel_kv = None
+        self.mode = mode
+        self.unshared = mode == 'conv_unshared'
+        self.pool_q, self.pool_k, self.pool_v = None, None, None
+        self.norm_q, self.norm_k, self.norm_v = None, None, None
+        if mode in ("avg", "max"):
+            # Parameter-free pooling; no norm layers are created in this mode.
+            pool_op = nn.MaxPool2d if mode == "max" else nn.AvgPool2d
+            if kernel_q:
+                self.pool_q = pool_op(kernel_q, stride_q, padding_q)
+            if kernel_kv:
+                self.pool_k = pool_op(kernel_kv, stride_kv, padding_kv)
+                self.pool_v = pool_op(kernel_kv, stride_kv, padding_kv)
+        elif mode == "conv" or mode == "conv_unshared":
+            # Depthwise conv pooling (groups == channels). 'conv' sizes the conv for one
+            # head's share of the INPUT width, 'conv_unshared' for the full input width.
+            dim_conv = dim // num_heads if mode == "conv" else dim
+            if kernel_q:
+                self.pool_q = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_q,
+                    stride=stride_q,
+                    padding=padding_q,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_q = norm_layer(dim_conv)
+            if kernel_kv:
+                self.pool_k = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_k = norm_layer(dim_conv)
+                self.pool_v = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_v = norm_layer(dim_conv)
+        else:
+            raise NotImplementedError(f"Unsupported model {mode}")
+
+        # relative pos embedding
+        self.rel_pos_type = rel_pos_type
+        if self.rel_pos_type == 'spatial':
+            assert feat_size[0] == feat_size[1]
+            size = feat_size[0]
+            q_size = size // stride_q[1] if len(stride_q) > 0 else size
+            kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size
+            # One table row per possible relative offset along an axis.
+            rel_sp_dim = 2 * max(q_size, kv_size) - 1
+
+            self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim))
+            trunc_normal_tf_(self.rel_pos_h, std=0.02)
+            trunc_normal_tf_(self.rel_pos_w, std=0.02)
+
+        self.residual_pooling = residual_pooling
+
+    def forward(self, x, feat_size: List[int]):
+        """Apply pooled attention to `x` of shape (B, N, C).
+
+        Args:
+            x: input tokens, (B, N, C).
+            feat_size: (H, W) of the spatial token grid of `x`.
+
+        Returns:
+            `(x, q_size)`: the projected output tokens and the (H, W) grid size of the
+            queries after pooling (equal to `feat_size` when q is not pooled).
+        """
+        B, N, _ = x.shape
+
+        # Fold channels into `fold_dim` groups: (B, N, C) -> (B, fold_dim, N, C / fold_dim).
+        # NOTE(review): in 'conv_unshared' mode fold_dim is 1, yet reshape_post_pool below is
+        # still called with self.num_heads — confirm this mode works with num_heads > 1.
+        fold_dim = 1 if self.unshared else self.num_heads
+        x = x.reshape(B, N, fold_dim, -1).permute(0, 2, 1, 3)
+        # q, k and v all start from the same (unprojected) input.
+        q = k = v = x
+
+        if self.pool_q is not None:
+            q, q_tok = reshape_pre_pool(q, feat_size, self.has_cls_token)
+            q = self.pool_q(q)
+            q, q_size = reshape_post_pool(q, self.num_heads, q_tok)
+        else:
+            q_size = feat_size
+        if self.norm_q is not None:
+            q = self.norm_q(q)
+
+        if self.pool_k is not None:
+            k, k_tok = reshape_pre_pool(k, feat_size, self.has_cls_token)
+            k = self.pool_k(k)
+            k, k_size = reshape_post_pool(k, self.num_heads, k_tok)
+        else:
+            k_size = feat_size
+        if self.norm_k is not None:
+            k = self.norm_k(k)
+
+        if self.pool_v is not None:
+            v, v_tok = reshape_pre_pool(v, feat_size, self.has_cls_token)
+            v = self.pool_v(v)
+            v, v_size = reshape_post_pool(v, self.num_heads, v_tok)
+        else:
+            v_size = feat_size
+        if self.norm_v is not None:
+            v = self.norm_v(v)
+
+        # Merge the groups back to (B, tokens, C), apply the linear projection, then split
+        # into heads: (B, num_heads, tokens, head_dim). Token counts include the class token.
+        q_N = q_size[0] * q_size[1] + int(self.has_cls_token)
+        q = q.permute(0, 2, 1, 3).reshape(B, q_N, -1)
+        q = self.q(q).reshape(B, q_N, self.num_heads, -1).permute(0, 2, 1, 3)
+
+        k_N = k_size[0] * k_size[1] + int(self.has_cls_token)
+        k = k.permute(0, 2, 1, 3).reshape(B, k_N, -1)
+        k = self.k(k).reshape(B, k_N, self.num_heads, -1).permute(0, 2, 1, 3)
+
+        v_N = v_size[0] * v_size[1] + int(self.has_cls_token)
+        v = v.permute(0, 2, 1, 3).reshape(B, v_N, -1)
+        v = self.v(v).reshape(B, v_N, self.num_heads, -1).permute(0, 2, 1, 3)
+
+        # Scaled dot-product attention, with optional relative position terms on the logits.
+        attn = (q * self.scale) @ k.transpose(-2, -1)
+        if self.rel_pos_type == 'spatial':
+            attn = cal_rel_pos_type(
+                attn,
+                q,
+                self.has_cls_token,
+                q_size,
+                k_size,
+                self.rel_pos_h,
+                self.rel_pos_w,
+            )
+        attn = attn.softmax(dim=-1)
+        x = attn @ v
+
+        # Residual connection from the pooled queries.
+        if self.residual_pooling:
+            x = x + q
+
+        # (B, heads, tokens, head_dim) -> (B, tokens, dim_out)
+        x = x.transpose(1, 2).reshape(B, -1, self.dim_out)
+        x = self.proj(x)
+
+        return x, q_size
+
+
+class MultiScaleAttention(nn.Module):
+    """Multi-head attention with optional pooling of q/k/v AFTER a fused qkv projection.
+
+    A single linear layer produces q, k and v; each may then be pooled on the 2D token
+    grid (max / avg pooling or a depthwise conv) before the attention product.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            feat_size,
+            num_heads=8,
+            qkv_bias=True,
+            mode="conv",
+            kernel_q=(1, 1),
+            kernel_kv=(1, 1),
+            stride_q=(1, 1),
+            stride_kv=(1, 1),
+            has_cls_token=True,
+            rel_pos_type='spatial',
+            residual_pooling=True,
+            norm_layer=nn.LayerNorm,
+    ):
+        """
+        Args:
+            dim: input width of the fused qkv linear layer.
+            dim_out: width of each of q, k, v and of the output projection.
+            feat_size: (H, W) of the incoming token grid; only used to size the relative
+                position tables and must be square when `rel_pos_type == 'spatial'`.
+            num_heads: number of attention heads.
+            qkv_bias: whether the qkv linear layer has a bias.
+            mode: pooling type, one of 'avg', 'max', 'conv', 'conv_unshared'.
+            kernel_q, kernel_kv: pooling kernel sizes for q and for k/v.
+            stride_q, stride_kv: pooling strides for q and for k/v.
+            has_cls_token: whether the first token is a class token (kept out of pooling).
+            rel_pos_type: 'spatial' creates and applies relative position tables; any other
+                value disables them.
+            residual_pooling: if True, the pooled q is added to the attention output.
+            norm_layer: norm constructor applied after conv pooling.
+
+        Raises:
+            NotImplementedError: if `mode` is not one of the supported values.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        self.dim_out = dim_out
+        self.head_dim = dim_out // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.has_cls_token = has_cls_token
+        # Half-kernel padding for the pooling ops.
+        padding_q = tuple([int(q // 2) for q in kernel_q])
+        padding_kv = tuple([int(kv // 2) for kv in kernel_kv])
+
+        self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim_out, dim_out)
+
+        # Skip pooling entirely when both kernel and stride are all ones.
+        if prod(kernel_q) == 1 and prod(stride_q) == 1:
+            kernel_q = None
+        if prod(kernel_kv) == 1 and prod(stride_kv) == 1:
+            kernel_kv = None
+        self.mode = mode
+        self.unshared = mode == 'conv_unshared'
+        self.norm_q, self.norm_k, self.norm_v = None, None, None
+        self.pool_q, self.pool_k, self.pool_v = None, None, None
+        if mode in ("avg", "max"):
+            # Parameter-free pooling; no norm layers are created in this mode.
+            pool_op = nn.MaxPool2d if mode == "max" else nn.AvgPool2d
+            if kernel_q:
+                self.pool_q = pool_op(kernel_q, stride_q, padding_q)
+            if kernel_kv:
+                self.pool_k = pool_op(kernel_kv, stride_kv, padding_kv)
+                self.pool_v = pool_op(kernel_kv, stride_kv, padding_kv)
+        elif mode == "conv" or mode == "conv_unshared":
+            # Depthwise conv pooling (groups == channels), sized from the OUTPUT width.
+            # NOTE(review): forward() always splits qkv into num_heads groups, so the pooled
+            # tensors carry dim_out // num_heads channels, while 'conv_unshared' sizes the
+            # conv for dim_out channels — confirm that mode is usable with num_heads > 1.
+            dim_conv = dim_out // num_heads if mode == "conv" else dim_out
+            if kernel_q:
+                self.pool_q = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_q,
+                    stride=stride_q,
+                    padding=padding_q,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_q = norm_layer(dim_conv)
+            if kernel_kv:
+                self.pool_k = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_k = norm_layer(dim_conv)
+                self.pool_v = nn.Conv2d(
+                    dim_conv,
+                    dim_conv,
+                    kernel_kv,
+                    stride=stride_kv,
+                    padding=padding_kv,
+                    groups=dim_conv,
+                    bias=False,
+                )
+                self.norm_v = norm_layer(dim_conv)
+        else:
+            raise NotImplementedError(f"Unsupported model {mode}")
+
+        # relative pos embedding
+        self.rel_pos_type = rel_pos_type
+        if self.rel_pos_type == 'spatial':
+            assert feat_size[0] == feat_size[1]
+            size = feat_size[0]
+            q_size = size // stride_q[1] if len(stride_q) > 0 else size
+            kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size
+            # One table row per possible relative offset along an axis.
+            rel_sp_dim = 2 * max(q_size, kv_size) - 1
+
+            self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim))
+            trunc_normal_tf_(self.rel_pos_h, std=0.02)
+            trunc_normal_tf_(self.rel_pos_w, std=0.02)
+
+        self.residual_pooling = residual_pooling
+
+    def forward(self, x, feat_size: List[int]):
+        """Apply pooled attention to `x` of shape (B, N, C).
+
+        Args:
+            x: input tokens, (B, N, C).
+            feat_size: (H, W) of the spatial token grid of `x`.
+
+        Returns:
+            `(x, q_size)`: the projected output tokens and the (H, W) grid size of the
+            queries after pooling (equal to `feat_size` when q is not pooled).
+        """
+        B, N, _ = x.shape
+
+        # Fused projection, then split: (B, N, 3 * dim_out) -> 3 x (B, num_heads, N, head_dim).
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(dim=0)
+
+        if self.pool_q is not None:
+            q, q_tok = reshape_pre_pool(q, feat_size, self.has_cls_token)
+            q = self.pool_q(q)
+            q, q_size = reshape_post_pool(q, self.num_heads, q_tok)
+        else:
+            q_size = feat_size
+        if self.norm_q is not None:
+            q = self.norm_q(q)
+
+        if self.pool_k is not None:
+            k, k_tok = reshape_pre_pool(k, feat_size, self.has_cls_token)
+            k = self.pool_k(k)
+            k, k_size = reshape_post_pool(k, self.num_heads, k_tok)
+        else:
+            k_size = feat_size
+        if self.norm_k is not None:
+            k = self.norm_k(k)
+
+        # The pooled size of v is not needed afterwards, so it is discarded.
+        if self.pool_v is not None:
+            v, v_tok = reshape_pre_pool(v, feat_size, self.has_cls_token)
+            v = self.pool_v(v)
+            v, _ = reshape_post_pool(v, self.num_heads, v_tok)
+        if self.norm_v is not None:
+            v = self.norm_v(v)
+
+        # Scaled dot-product attention, with optional relative position terms on the logits.
+        attn = (q * self.scale) @ k.transpose(-2, -1)
+        if self.rel_pos_type == 'spatial':
+            attn = cal_rel_pos_type(
+                attn,
+                q,
+                self.has_cls_token,
+                q_size,
+                k_size,
+                self.rel_pos_h,
+                self.rel_pos_w,
+            )
+        attn = attn.softmax(dim=-1)
+        x = attn @ v
+
+        # Residual connection from the pooled queries.
+        if self.residual_pooling:
+            x = x + q
+
+        # (B, heads, tokens, head_dim) -> (B, tokens, dim_out)
+        x = x.transpose(1, 2).reshape(B, -1, self.dim_out)
+        x = self.proj(x)
+
+        return x, q_size
+
+
+class MultiScaleBlock(nn.Module):
+    """Pre-norm transformer block: pooled attention followed by an MLP, each with a shortcut.
+
+    The block can change the channel width (`dim` -> `dim_out`) and, through `stride_q`,
+    the spatial resolution of the token grid. The shortcut paths are projected / pooled
+    as needed so they match the main path.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            num_heads,
+            feat_size,
+            mlp_ratio=4.0,
+            qkv_bias=True,
+            drop_path=0.0,
+            norm_layer=nn.LayerNorm,
+            kernel_q=(1, 1),
+            kernel_kv=(1, 1),
+            stride_q=(1, 1),
+            stride_kv=(1, 1),
+            mode="conv",
+            has_cls_token=True,
+            expand_attn=False,
+            pool_first=False,
+            rel_pos_type='spatial',
+            residual_pooling=True,
+    ):
+        """
+        Args:
+            dim: input channel width.
+            dim_out: output channel width of the block.
+            num_heads: number of attention heads.
+            feat_size: (H, W) of the incoming token grid, forwarded to the attention layer.
+            mlp_ratio: MLP hidden width as a multiple of the attention output width.
+            qkv_bias: forwarded to the attention layer.
+            drop_path: drop-path rate for both residual branches (identity when 0).
+            norm_layer: norm constructor for `norm1`, `norm2` and the attention layer.
+            kernel_q, kernel_kv, stride_q, stride_kv, mode: pooling settings forwarded to
+                the attention layer; `stride_q` also drives the shortcut max-pool.
+            has_cls_token: whether the first token is a class token.
+            expand_attn: if True the width change happens in the attention layer,
+                otherwise in the MLP.
+            pool_first: selects `MultiScaleAttentionPoolFirst` instead of `MultiScaleAttention`.
+            rel_pos_type, residual_pooling: forwarded to the attention layer.
+        """
+        super().__init__()
+        proj_needed = dim != dim_out
+        self.dim = dim
+        self.dim_out = dim_out
+        self.has_cls_token = has_cls_token
+
+        self.norm1 = norm_layer(dim)
+
+        # Shortcut projection around attention, only when attention changes the width.
+        self.shortcut_proj_attn = nn.Linear(dim, dim_out) if proj_needed and expand_attn else None
+        if stride_q and prod(stride_q) > 1:
+            # Max-pool the shortcut with the query stride so it matches the pooled attention
+            # output; the kernel is stride + 1 along every strided axis.
+            kernel_skip = [s + 1 if s > 1 else s for s in stride_q]
+            stride_skip = stride_q
+            padding_skip = [int(skip // 2) for skip in kernel_skip]
+            self.shortcut_pool_attn = nn.MaxPool2d(kernel_skip, stride_skip, padding_skip)
+        else:
+            self.shortcut_pool_attn = None
+
+        # Width produced by the attention layer.
+        att_dim = dim_out if expand_attn else dim
+        attn_layer = MultiScaleAttentionPoolFirst if pool_first else MultiScaleAttention
+        self.attn = attn_layer(
+            dim,
+            att_dim,
+            num_heads=num_heads,
+            feat_size=feat_size,
+            qkv_bias=qkv_bias,
+            kernel_q=kernel_q,
+            kernel_kv=kernel_kv,
+            stride_q=stride_q,
+            stride_kv=stride_kv,
+            norm_layer=norm_layer,
+            has_cls_token=has_cls_token,
+            mode=mode,
+            rel_pos_type=rel_pos_type,
+            residual_pooling=residual_pooling,
+        )
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.norm2 = norm_layer(att_dim)
+        mlp_dim_out = dim_out
+        # Shortcut projection around the MLP, only when the MLP changes the width.
+        self.shortcut_proj_mlp = nn.Linear(dim, dim_out) if proj_needed and not expand_attn else None
+        self.mlp = Mlp(
+            in_features=att_dim,
+            hidden_features=int(att_dim * mlp_ratio),
+            out_features=mlp_dim_out,
+        )
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+    def _shortcut_pool(self, x, feat_size: List[int]):
+        """Max-pool the spatial tokens of the shortcut; returns `x` unchanged if no pool is set.
+
+        `x` is (B, L, C) with an optional leading class token, which bypasses the pooling
+        and is re-attached afterwards. `feat_size` is the (H, W) grid of the spatial tokens.
+        """
+        if self.shortcut_pool_attn is None:
+            return x
+        if self.has_cls_token:
+            cls_tok, x = x[:, :1, :], x[:, 1:, :]
+        else:
+            cls_tok = None
+        B, L, C = x.shape
+        H, W = feat_size
+        # tokens -> channels-first 2D map, pool, then back to tokens
+        x = x.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous()
+        x = self.shortcut_pool_attn(x)
+        x = x.reshape(B, C, -1).transpose(1, 2)
+        if cls_tok is not None:
+            x = torch.cat((cls_tok, x), dim=1)
+        return x
+
+    def forward(self, x, feat_size: List[int]):
+        """Run the block on tokens `x` with grid size `feat_size`.
+
+        Returns:
+            `(x, feat_size_new)`: output tokens and the grid size reported by the attention
+            layer.
+        """
+        x_norm = self.norm1(x)
+        # NOTE as per the original impl, this seems odd, but shortcut uses un-normalized input if no proj
+        x_shortcut = x if self.shortcut_proj_attn is None else self.shortcut_proj_attn(x_norm)
+        x_shortcut = self._shortcut_pool(x_shortcut, feat_size)
+        x, feat_size_new = self.attn(x_norm, feat_size)
+        x = x_shortcut + self.drop_path1(x)
+
+        # MLP branch; its shortcut follows the same projected-vs-raw rule as above.
+        x_norm = self.norm2(x)
+        x_shortcut = x if self.shortcut_proj_mlp is None else self.shortcut_proj_mlp(x_norm)
+        x = x_shortcut + self.drop_path2(self.mlp(x_norm))
+        return x, feat_size_new
+
+
+class MultiScaleVitStage(nn.Module):
+    """A stack of `depth` `MultiScaleBlock`s forming one stage of the network.
+
+    Only the first block applies `stride_q`; the remaining blocks use a query stride of
+    (1, 1). `self.feat_size` records the token grid size after that first block.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out,
+            depth,
+            num_heads,
+            feat_size,
+            mlp_ratio=4.0,
+            qkv_bias=True,
+            mode="conv",
+            kernel_q=(1, 1),
+            kernel_kv=(1, 1),
+            stride_q=(1, 1),
+            stride_kv=(1, 1),
+            has_cls_token=True,
+            expand_attn=False,
+            pool_first=False,
+            rel_pos_type='spatial',
+            residual_pooling=True,
+            norm_layer=nn.LayerNorm,
+            drop_path=0.0,
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        # Output width of every block: with expand_attn all blocks already emit dim_out,
+        # otherwise only the last block switches from dim to dim_out.
+        out_dims = (dim_out,) * depth if expand_attn else (dim,) * (depth - 1) + (dim_out,)
+        # drop_path may be one rate for the whole stage or one rate per block.
+        per_block_drop = isinstance(drop_path, (list, tuple))
+
+        self.blocks = nn.ModuleList()
+        block_dim = dim
+        for idx in range(depth):
+            is_first = idx == 0
+            self.blocks.append(MultiScaleBlock(
+                dim=block_dim,
+                dim_out=out_dims[idx],
+                num_heads=num_heads,
+                feat_size=feat_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                kernel_q=kernel_q,
+                kernel_kv=kernel_kv,
+                stride_q=stride_q if is_first else (1, 1),
+                stride_kv=stride_kv,
+                mode=mode,
+                has_cls_token=has_cls_token,
+                pool_first=pool_first,
+                rel_pos_type=rel_pos_type,
+                residual_pooling=residual_pooling,
+                expand_attn=expand_attn,
+                norm_layer=norm_layer,
+                drop_path=drop_path[idx] if per_block_drop else drop_path,
+            ))
+            block_dim = out_dims[idx]
+            if is_first:
+                # Later blocks (and the next stage) see the grid reduced by stride_q.
+                feat_size = tuple(size // stride for size, stride in zip(feat_size, stride_q))
+
+        self.feat_size = feat_size
+
+    def forward(self, x, feat_size: List[int]):
+        """Run all blocks in order; returns the tokens and the final grid size."""
+        for block in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x, feat_size = checkpoint.checkpoint(block, x, feat_size)
+            else:
+                x, feat_size = block(x, feat_size)
+        return x, feat_size
+
+
+class MultiScaleVit(nn.Module):
+    """
+    Improved Multiscale Vision Transformers for Classification and Detection
+    Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik,
+        Christoph Feichtenhofer*
+    https://arxiv.org/abs/2112.01526
+
+    Multiscale Vision Transformers
+    Haoqi Fan*, Bo Xiong*, Karttikeya Mangalam*, Yanghao Li*, Zhicheng Yan, Jitendra Malik,
+        Christoph Feichtenhofer*
+    https://arxiv.org/abs/2104.11227
+    """
+
+    def __init__(
+            self,
+            cfg: MultiScaleVitCfg,
+            img_size: Tuple[int, int] = (224, 224),
+            in_chans: int = 3,
+            global_pool: str = 'avg',
+            num_classes: int = 1000,
+            drop_path_rate: float = 0.,
+            drop_rate: float = 0.,
+    ):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps)
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.global_pool = global_pool
+        self.depths = tuple(cfg.depths)
+        self.expand_attn = cfg.expand_attn
+
+        embed_dim = cfg.embed_dim[0]
+        self.patch_embed = PatchEmbed(
+            dim_in=in_chans,
+            dim_out=embed_dim,
+            kernel=cfg.patch_kernel,
+            stride=cfg.patch_stride,
+            padding=cfg.patch_padding,
+        )
+        patch_dims = (img_size[0] // cfg.patch_stride[0], img_size[1] // cfg.patch_stride[1])
+        num_patches = prod(patch_dims)
+
+        if cfg.use_cls_token:
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+            self.num_prefix_tokens = 1
+            pos_embed_dim = num_patches + 1
+        else:
+            self.num_prefix_tokens = 0
+            self.cls_token = None
+            pos_embed_dim = num_patches
+
+        if cfg.use_abs_pos:
+            self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_dim, embed_dim))
+        else:
+            self.pos_embed = None
+
+        num_stages = len(cfg.embed_dim)
+        feat_size = patch_dims
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)]
+        self.stages = nn.ModuleList()
+        for i in range(num_stages):
+            if cfg.expand_attn:
+                dim_out = cfg.embed_dim[i]
+            else:
+                dim_out = cfg.embed_dim[min(i + 1, num_stages - 1)]
+            stage = MultiScaleVitStage(
+                dim=embed_dim,
+                dim_out=dim_out,
+                depth=cfg.depths[i],
+                num_heads=cfg.num_heads[i],
+                feat_size=feat_size,
+                mlp_ratio=cfg.mlp_ratio,
+                qkv_bias=cfg.qkv_bias,
+                mode=cfg.mode,
+                pool_first=cfg.pool_first,
+                expand_attn=cfg.expand_attn,
+                kernel_q=cfg.kernel_qkv,
+                kernel_kv=cfg.kernel_qkv,
+                stride_q=cfg.stride_q[i],
+                stride_kv=cfg.stride_kv[i],
+                has_cls_token=cfg.use_cls_token,
+                rel_pos_type=cfg.rel_pos_type,
+                residual_pooling=cfg.residual_pooling,
+                norm_layer=norm_layer,
+                drop_path=dpr[i],
+            )
+            embed_dim = dim_out
+            feat_size = stage.feat_size
+            self.stages.append(stage)
+
+        self.num_features = embed_dim
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Sequential(OrderedDict([
+            ('drop', nn.Dropout(self.drop_rate)),
+            ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())
+        ]))
+
+        if self.pos_embed is not None:
+            trunc_normal_tf_(self.pos_embed, std=0.02)
+        if self.cls_token is not None:
+            trunc_normal_tf_(self.cls_token, std=0.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_tf_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {k for k, _ in self.named_parameters()
+                if any(n in k for n in ["pos_embed", "rel_pos_h", "rel_pos_w", "cls_token"])}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^patch_embed',  # stem and embed
+            blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            self.global_pool = global_pool
+        self.head = nn.Sequential(OrderedDict([
+            ('drop', nn.Dropout(self.drop_rate)),
+            ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())
+        ]))
+
+    def forward_features(self, x):
+        x, feat_size = self.patch_embed(x)
+        B, N, C = x.shape
+
+        if self.cls_token is not None:
+            cls_tokens = self.cls_token.expand(B, -1, -1)
+            x = torch.cat((cls_tokens, x), dim=1)
+
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+
+        for stage in self.stages:
+            x, feat_size = stage(x, feat_size)
+
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Pool the token sequence and, unless ``pre_logits`` is set, apply the head.
+
+        With ``global_pool == 'avg'`` the tokens after the first
+        ``num_prefix_tokens`` are averaged; any other truthy value takes the
+        token at index 0; a falsy value skips pooling.
+        """
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens:].mean(1) if self.global_pool == 'avg' else x[:, 0]
+        if pre_logits:
+            return x
+        return self.head(x)
+
+    def forward(self, x):
+        """Apply ``forward_features`` followed by ``forward_head``."""
+        return self.forward_head(self.forward_features(x))
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """Remap a checkpoint with flat ``blocks.<i>`` keys onto the staged layout.
+
+    Keys containing ``blocks.<i>`` are rewritten to ``stages.<s>.blocks.<j>``
+    using ``model.depths``. A block-level ``proj`` becomes
+    ``shortcut_proj_attn`` (or ``shortcut_proj_mlp`` when ``model.expand_attn``
+    is false) and ``head.projection`` becomes ``head.fc``.
+
+    Args:
+        state_dict: checkpoint mapping; when it has a ``'model_state'`` entry,
+            that nested mapping is remapped instead.
+        model: object providing ``depths`` (number of blocks per stage) and,
+            optionally, ``expand_attn`` (treated as ``True`` when absent).
+
+    Returns:
+        ``state_dict`` itself when it already contains
+        ``'stages.0.blocks.0.norm1.weight'``; otherwise a new dict with the
+        remapped keys and the original values.
+
+    Raises:
+        AssertionError: if ``model`` has no ``depths`` attribute.
+        KeyError: if a ``blocks.<i>`` index is not covered by ``model.depths``.
+    """
+    if 'stages.0.blocks.0.norm1.weight' in state_dict:
+        return state_dict
+
+    import re
+    if 'model_state' in state_dict:
+        state_dict = state_dict['model_state']
+
+    depths = getattr(model, 'depths', None)
+    expand_attn = getattr(model, 'expand_attn', True)
+    assert depths is not None, 'model requires depth attribute to remap checkpoints'
+
+    # Global block index -> (stage index, block index inside that stage).
+    depth_map = {}
+    block_idx = 0
+    for stage_idx, d in enumerate(depths):
+        for local_idx in range(d):
+            depth_map[block_idx + local_idx] = (stage_idx, local_idx)
+        block_idx += d
+
+    def _to_staged(match):
+        stage, local = depth_map[int(match.group(1))]
+        return f'stages.{stage}.blocks.{local}'
+
+    # Built once instead of per key. Every dot is escaped so that it matches a
+    # literal '.' only, not an arbitrary character.
+    block_pattern = re.compile(r'blocks\.(\d+)')
+    proj_pattern = re.compile(r'stages\.(\d+)\.blocks\.(\d+)\.proj')
+    shortcut_name = 'shortcut_proj_attn' if expand_attn else 'shortcut_proj_mlp'
+    proj_replacement = rf'stages.\1.blocks.\2.{shortcut_name}'
+
+    out_dict = {}
+    for k, v in state_dict.items():
+        k = block_pattern.sub(_to_staged, k)
+        k = proj_pattern.sub(proj_replacement, k)
+        k = k.replace('head.projection', 'head.fc')
+        out_dict[k] = v
+
+    # NOTE: values are copied through unchanged; in particular no positional
+    # embedding resizing is performed for a model built at a different size.
+    return out_dict
+
+
+def _create_mvitv2(variant, cfg_variant=None, pretrained=False, **kwargs):
+    """Build a ``MultiScaleVit`` through ``build_model_with_cfg``.
+
+    The architecture config is looked up in ``model_cfgs`` under ``cfg_variant``
+    when that is truthy, otherwise under ``variant``.
+    """
+    cfg_key = cfg_variant if cfg_variant else variant
+    return build_model_with_cfg(
+        MultiScaleVit,
+        variant,
+        pretrained,
+        model_cfg=model_cfgs[cfg_key],
+        pretrained_filter_fn=checkpoint_filter_fn,
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs,
+    )
+
+
+@register_model
+def mvitv2_tiny(pretrained=False, **kwargs):
+    """Create the ``mvitv2_tiny`` variant via ``_create_mvitv2``."""
+    variant = 'mvitv2_tiny'
+    return _create_mvitv2(variant, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mvitv2_small(pretrained=False, **kwargs):
+    """Create the ``mvitv2_small`` variant via ``_create_mvitv2``."""
+    variant = 'mvitv2_small'
+    return _create_mvitv2(variant, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mvitv2_base(pretrained=False, **kwargs):
+    """Create the ``mvitv2_base`` variant via ``_create_mvitv2``."""
+    variant = 'mvitv2_base'
+    return _create_mvitv2(variant, pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mvitv2_large(pretrained=False, **kwargs):
+    """Create the ``mvitv2_large`` variant via ``_create_mvitv2``."""
+    variant = 'mvitv2_large'
+    return _create_mvitv2(variant, pretrained=pretrained, **kwargs)
+
+
+# @register_model
+# def mvitv2_base_in21k(pretrained=False, **kwargs):
+#     return _create_mvitv2('mvitv2_base_in21k', pretrained=pretrained, **kwargs)
+#
+#
+# @register_model
+# def mvitv2_large_in21k(pretrained=False, **kwargs):
+#     return _create_mvitv2('mvitv2_large_in21k', pretrained=pretrained, **kwargs)
+#
+#
+# @register_model
+# def mvitv2_huge_in21k(pretrained=False, **kwargs):
+#     return _create_mvitv2('mvitv2_huge_in21k', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def mvitv2_small_cls(pretrained=False, **kwargs):
+    """Create the ``mvitv2_small_cls`` variant via ``_create_mvitv2``."""
+    variant = 'mvitv2_small_cls'
+    return _create_mvitv2(variant, pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/nasnet.py b/src/custom_timm/models/nasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..50db1a3d351db0e9caa2002e16b8003b561050f8
--- /dev/null
+++ b/src/custom_timm/models/nasnet.py
@@ -0,0 +1,588 @@
+""" NasNet-A (Large)
+ nasnetalarge implementation grabbed from Cadene's pretrained models
+ https://github.com/Cadene/pretrained-models.pytorch
+"""
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .helpers import build_model_with_cfg
+from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier
+from .registry import register_model
+
+__all__ = ['NASNetALarge']
+
+# Per-variant pretrained configuration, keyed by variant name: weight URL,
+# input / pooling sizes, preprocessing values, and the attribute names of the
+# first convolution and the classifier on the model.
+default_cfgs = {
+    'nasnetalarge': {
+        'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nasnetalarge-dc4a7b8b.pth',
+        'input_size': (3, 331, 331),
+        'pool_size': (11, 11),
+        'crop_pct': 0.911,
+        'interpolation': 'bicubic',
+        'mean': (0.5, 0.5, 0.5),
+        'std': (0.5, 0.5, 0.5),
+        'num_classes': 1000,
+        'first_conv': 'conv0.conv',
+        'classifier': 'last_linear',
+        'label_offset': 1,  # 1001 classes in pretrained weights
+    },
+}
+
+
+class ActConvBn(nn.Module):
+    """ReLU, then a convolution from ``create_conv2d``, then ``BatchNorm2d``."""
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''):
+        super().__init__()
+        conv_kwargs = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        self.act = nn.ReLU()
+        self.conv = create_conv2d(in_channels, out_channels, **conv_kwargs)
+        self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.1)
+
+    def forward(self, x):
+        """Apply activation, convolution and normalisation in that order."""
+        return self.bn(self.conv(self.act(x)))
+
+
+class SeparableConv2d(nn.Module):
+    """Depthwise convolution (``groups=in_channels``) followed by a 1x1 pointwise convolution."""
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''):
+        super().__init__()
+        depthwise_kwargs = dict(
+            kernel_size=kernel_size, stride=stride, padding=padding, groups=in_channels)
+        self.depthwise_conv2d = create_conv2d(in_channels, in_channels, **depthwise_kwargs)
+        self.pointwise_conv2d = create_conv2d(in_channels, out_channels, kernel_size=1, padding=0)
+
+    def forward(self, x):
+        """Run the depthwise convolution, then the pointwise one."""
+        return self.pointwise_conv2d(self.depthwise_conv2d(x))
+
+
+class BranchSeparables(nn.Module):
+    """Two stacked (ReLU, separable conv, BatchNorm) units.
+
+    Only the first separable convolution uses ``stride``; the second always uses
+    stride 1. The width between the two units is ``out_channels`` when
+    ``stem_cell`` is true and ``in_channels`` otherwise.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_type='', stem_cell=False):
+        super().__init__()
+        if stem_cell:
+            middle_channels = out_channels
+        else:
+            middle_channels = in_channels
+        bn_kwargs = dict(eps=0.001, momentum=0.1)
+
+        self.act_1 = nn.ReLU()
+        self.separable_1 = SeparableConv2d(
+            in_channels, middle_channels, kernel_size, stride=stride, padding=pad_type)
+        self.bn_sep_1 = nn.BatchNorm2d(middle_channels, **bn_kwargs)
+
+        self.act_2 = nn.ReLU(inplace=True)
+        self.separable_2 = SeparableConv2d(
+            middle_channels, out_channels, kernel_size, stride=1, padding=pad_type)
+        self.bn_sep_2 = nn.BatchNorm2d(out_channels, **bn_kwargs)
+
+    def forward(self, x):
+        """Apply the first unit and then the second."""
+        hidden = self.bn_sep_1(self.separable_1(self.act_1(x)))
+        return self.bn_sep_2(self.separable_2(self.act_2(hidden)))
+
+
+class CellStem0(nn.Module):
+    """First stem cell.
+
+    Takes a single input tensor, forms five two-branch sums ("comb_iter" 0-4)
+    and returns the concatenation of combinations 1-4 along dim 1.
+    """
+    def __init__(self, stem_size, num_channels=42, pad_type=''):
+        super(CellStem0, self).__init__()
+        self.num_channels = num_channels
+        self.stem_size = stem_size
+        # 1x1 projection from stem_size to num_channels; feeds the *_left branches.
+        self.conv_1x1 = ActConvBn(self.stem_size, self.num_channels, 1, stride=1)
+
+        # Right-hand separable branches read the unprojected input, hence
+        # stem_size input channels and stem_cell=True.
+        self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type)
+        self.comb_iter_0_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True)
+
+        self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type)
+        self.comb_iter_1_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(self.stem_size, self.num_channels, 5, 2, pad_type, stem_cell=True)
+
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type)
+        self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type)
+
+    def forward(self, x):
+        """Return the concatenation of combinations 1-4 computed from ``x``."""
+        x1 = self.conv_1x1(x)
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x1)
+        x_comb_iter_0_right = self.comb_iter_0_right(x)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x1)
+        x_comb_iter_1_right = self.comb_iter_1_right(x)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        x_comb_iter_2_left = self.comb_iter_2_left(x1)
+        x_comb_iter_2_right = self.comb_iter_2_right(x)
+        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
+
+        # Combinations 3 and 4 build on combination 0 rather than on the inputs.
+        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
+        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
+        x_comb_iter_4_right = self.comb_iter_4_right(x1)
+        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
+
+        # x_comb_iter_0 itself is not part of the output.
+        x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class CellStem1(nn.Module):
+    """Second stem cell.
+
+    ``forward`` takes the stem convolution output and the ``CellStem0`` output,
+    forms five two-branch sums and returns the concatenation of combinations
+    1-4 along dim 1.
+    """
+
+    def __init__(self, stem_size, num_channels, pad_type=''):
+        super(CellStem1, self).__init__()
+        self.num_channels = num_channels
+        self.stem_size = stem_size
+        # Projects the previous cell output (2 * num_channels wide) to num_channels.
+        self.conv_1x1 = ActConvBn(2 * self.num_channels, self.num_channels, 1, stride=1)
+
+        # Two parallel stride-2 paths over the stem conv output, each producing
+        # num_channels // 2 channels; their concatenation goes through final_path_bn.
+        self.act = nn.ReLU()
+        self.path_1 = nn.Sequential()
+        self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
+        self.path_1.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False))
+       
+        self.path_2 = nn.Sequential()
+        # Negative left/top padding with +1 right/bottom: path 2 subsamples a
+        # grid shifted by one pixel relative to path 1.
+        self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1)))
+        self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
+        self.path_2.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False))
+
+        self.final_path_bn = nn.BatchNorm2d(self.num_channels, eps=0.001, momentum=0.1)
+
+        self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type)
+        self.comb_iter_0_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type)
+
+        self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type)
+        self.comb_iter_1_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type)
+
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type)
+        self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type)
+
+    def forward(self, x_conv0, x_stem_0):
+        """Combine ``x_stem_0`` (left input) with the downsampled ``x_conv0`` (right input)."""
+        x_left = self.conv_1x1(x_stem_0)
+
+        x_relu = self.act(x_conv0)
+        # path 1
+        x_path1 = self.path_1(x_relu)
+        # path 2
+        x_path2 = self.path_2(x_relu)
+        # final path
+        x_right = self.final_path_bn(torch.cat([x_path1, x_path2], 1))
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x_left)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_right)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_right)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        x_comb_iter_2_left = self.comb_iter_2_left(x_left)
+        x_comb_iter_2_right = self.comb_iter_2_right(x_right)
+        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
+
+        # Combinations 3 and 4 build on combination 0 rather than on the inputs.
+        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
+        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
+        x_comb_iter_4_right = self.comb_iter_4_right(x_left)
+        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
+
+        # x_comb_iter_0 itself is not part of the output.
+        x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class FirstCell(nn.Module):
+    """Cell whose previous-cell input is downsampled by two stride-2 paths.
+
+    ``forward(x, x_prev)`` returns the concatenation (dim 1) of the processed
+    ``x_prev`` and five two-branch sums.
+    """
+
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''):
+        super(FirstCell, self).__init__()
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1)
+
+        # Two parallel stride-2 paths over x_prev, each out_chs_left wide; their
+        # concatenation (2 * out_chs_left channels) goes through final_path_bn.
+        self.act = nn.ReLU()
+        self.path_1 = nn.Sequential()
+        self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
+        self.path_1.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False))
+
+        self.path_2 = nn.Sequential()
+        # Negative left/top padding with +1 right/bottom: path 2 subsamples a
+        # grid shifted by one pixel relative to path 1.
+        self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1)))
+        self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
+        self.path_2.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False))
+
+        self.final_path_bn = nn.BatchNorm2d(out_chs_left * 2, eps=0.001, momentum=0.1)
+
+        # NOTE(review): every branch below is sized with out_chs_right, yet some
+        # are applied to x_left (2 * out_chs_left channels) in forward, so this
+        # presumably requires out_chs_right == 2 * out_chs_left - confirm.
+        self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type)
+        self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+
+        self.comb_iter_1_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type)
+        self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+
+    def forward(self, x, x_prev):
+        """Combine the current input ``x`` (right) with the downsampled ``x_prev`` (left)."""
+        x_relu = self.act(x_prev)
+        x_path1 = self.path_1(x_relu)
+        x_path2 = self.path_2(x_relu)
+        x_left = self.final_path_bn(torch.cat([x_path1, x_path2], 1))
+        x_right = self.conv_1x1(x)
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        # Combinations 2 and 4 add an unmodified input (x_left / x_right) to one branch.
+        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
+        x_comb_iter_2 = x_comb_iter_2_left + x_left
+
+        x_comb_iter_3_left = self.comb_iter_3_left(x_left)
+        x_comb_iter_3_right = self.comb_iter_3_right(x_left)
+        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_right)
+        x_comb_iter_4 = x_comb_iter_4_left + x_right
+
+        x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class NormalCell(nn.Module):
+    """Cell that keeps spatial size: every branch is built with stride 1.
+
+    ``forward(x, x_prev)`` projects both inputs with 1x1 ``ActConvBn`` layers and
+    returns the concatenation (dim 1) of the projected ``x_prev`` and five
+    two-branch sums.
+    """
+
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''):
+        super(NormalCell, self).__init__()
+        # conv_prev_1x1 projects x_prev (left); conv_1x1 projects x (right).
+        self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type)
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type)
+
+        # NOTE(review): left and right results are summed element-wise in
+        # forward, which presumably requires out_chs_left == out_chs_right - confirm.
+        self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type)
+        self.comb_iter_0_right = BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type)
+
+        self.comb_iter_1_left = BranchSeparables(out_chs_left, out_chs_left, 5, 1, pad_type)
+        self.comb_iter_1_right = BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+
+    def forward(self, x, x_prev):
+        """Combine the current input ``x`` (right) with ``x_prev`` (left)."""
+        x_left = self.conv_prev_1x1(x_prev)
+        x_right = self.conv_1x1(x)
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        # Combinations 2 and 4 add an unmodified input (x_left / x_right) to one branch.
+        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
+        x_comb_iter_2 = x_comb_iter_2_left + x_left
+
+        x_comb_iter_3_left = self.comb_iter_3_left(x_left)
+        x_comb_iter_3_right = self.comb_iter_3_right(x_left)
+        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_right)
+        x_comb_iter_4 = x_comb_iter_4_left + x_right
+
+        x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class ReductionCell0(nn.Module):
+    """Cell whose input-facing branches are built with stride 2.
+
+    ``forward(x, x_prev)`` projects both inputs with 1x1 ``ActConvBn`` layers,
+    forms five two-branch sums and returns the concatenation of combinations
+    1-4 along dim 1.
+    """
+
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''):
+        super(ReductionCell0, self).__init__()
+        # conv_prev_1x1 projects x_prev (left); conv_1x1 projects x (right).
+        self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type)
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type)
+
+        # NOTE(review): all branches are sized with out_chs_right although the
+        # *_right ones receive x_left in forward, so this presumably requires
+        # out_chs_left == out_chs_right - confirm.
+        self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type)
+        self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type)
+
+        self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type)
+        self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type)
+
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+        self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type)
+
+    def forward(self, x, x_prev):
+        """Combine the current input ``x`` (right) with ``x_prev`` (left)."""
+        x_left = self.conv_prev_1x1(x_prev)
+        x_right = self.conv_1x1(x)
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_right)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
+        x_comb_iter_2_right = self.comb_iter_2_right(x_left)
+        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
+
+        # Combinations 3 and 4 build on combination 0 rather than on the inputs.
+        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
+        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
+        x_comb_iter_4_right = self.comb_iter_4_right(x_right)
+        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
+
+        # x_comb_iter_0 itself is not part of the output.
+        x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class ReductionCell1(nn.Module):
+    """Cell whose input-facing branches are built with stride 2.
+
+    The layer definitions and ``forward`` wiring are the same as those of
+    ``ReductionCell0`` in this file; only the class name differs.
+    ``forward(x, x_prev)`` returns the concatenation of combinations 1-4 along
+    dim 1.
+    """
+
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''):
+        super(ReductionCell1, self).__init__()
+        # conv_prev_1x1 projects x_prev (left); conv_1x1 projects x (right).
+        self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type)
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type)
+
+        # NOTE(review): all branches are sized with out_chs_right although the
+        # *_right ones receive x_left in forward, so this presumably requires
+        # out_chs_left == out_chs_right - confirm.
+        self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type)
+        self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type)
+
+        self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type)
+        self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type)
+
+        self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type)
+
+        self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type)
+        self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type)
+
+    def forward(self, x, x_prev):
+        """Combine the current input ``x`` (right) with ``x_prev`` (left)."""
+        x_left = self.conv_prev_1x1(x_prev)
+        x_right = self.conv_1x1(x)
+
+        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_right)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
+        x_comb_iter_2_right = self.comb_iter_2_right(x_left)
+        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
+
+        # Combinations 3 and 4 build on combination 0 rather than on the inputs.
+        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
+        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
+        x_comb_iter_4_right = self.comb_iter_4_right(x_right)
+        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
+
+        # x_comb_iter_0 itself is not part of the output.
+        x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class NASNetALarge(nn.Module):
+    """NASNetALarge (6 @ 4032)
+
+    Layout: a stride-2 stem convolution, two stem cells, then three groups of
+    one ``FirstCell`` plus five ``NormalCell`` modules, with a reduction cell
+    in front of the second and third group, followed by ReLU, pooling and a
+    linear classifier.
+    """
+
+    def __init__(
+            self, num_classes=1000, in_chans=3, stem_size=96, channel_multiplier=2,
+            num_features=4032, output_stride=32, drop_rate=0., global_pool='avg', pad_type='same'):
+        super(NASNetALarge, self).__init__()
+        self.num_classes = num_classes
+        self.stem_size = stem_size
+        self.num_features = num_features
+        self.channel_multiplier = channel_multiplier
+        self.drop_rate = drop_rate
+        # Only an output stride of 32 is accepted.
+        assert output_stride == 32
+
+        channels = self.num_features // 24
+        # 24 is default value for the architecture
+
+        self.conv0 = ConvNormAct(
+            in_channels=in_chans, out_channels=self.stem_size, kernel_size=3, padding=0, stride=2,
+            norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False)
+
+        self.cell_stem_0 = CellStem0(
+            self.stem_size, num_channels=channels // (channel_multiplier ** 2), pad_type=pad_type)
+        self.cell_stem_1 = CellStem1(
+            self.stem_size, num_channels=channels // channel_multiplier, pad_type=pad_type)
+
+        # First group: cells 0-5, each branch `channels` wide.
+        self.cell_0 = FirstCell(
+            in_chs_left=channels, out_chs_left=channels // 2,
+            in_chs_right=2 * channels, out_chs_right=channels, pad_type=pad_type)
+        self.cell_1 = NormalCell(
+            in_chs_left=2 * channels, out_chs_left=channels,
+            in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type)
+        self.cell_2 = NormalCell(
+            in_chs_left=6 * channels, out_chs_left=channels,
+            in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type)
+        self.cell_3 = NormalCell(
+            in_chs_left=6 * channels, out_chs_left=channels,
+            in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type)
+        self.cell_4 = NormalCell(
+            in_chs_left=6 * channels, out_chs_left=channels,
+            in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type)
+        self.cell_5 = NormalCell(
+            in_chs_left=6 * channels, out_chs_left=channels,
+            in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type)
+
+        # Second group: reduction cell, then cells 6-11 with `2 * channels` per branch.
+        self.reduction_cell_0 = ReductionCell0(
+            in_chs_left=6 * channels, out_chs_left=2 * channels,
+            in_chs_right=6 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_6 = FirstCell(
+            in_chs_left=6 * channels, out_chs_left=channels,
+            in_chs_right=8 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_7 = NormalCell(
+            in_chs_left=8 * channels, out_chs_left=2 * channels,
+            in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_8 = NormalCell(
+            in_chs_left=12 * channels, out_chs_left=2 * channels,
+            in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_9 = NormalCell(
+            in_chs_left=12 * channels, out_chs_left=2 * channels,
+            in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_10 = NormalCell(
+            in_chs_left=12 * channels, out_chs_left=2 * channels,
+            in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+        self.cell_11 = NormalCell(
+            in_chs_left=12 * channels, out_chs_left=2 * channels,
+            in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type)
+
+        # Third group: reduction cell, then cells 12-17 with `4 * channels` per branch.
+        self.reduction_cell_1 = ReductionCell1(
+            in_chs_left=12 * channels, out_chs_left=4 * channels,
+            in_chs_right=12 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_12 = FirstCell(
+            in_chs_left=12 * channels, out_chs_left=2 * channels,
+            in_chs_right=16 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_13 = NormalCell(
+            in_chs_left=16 * channels, out_chs_left=4 * channels,
+            in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_14 = NormalCell(
+            in_chs_left=24 * channels, out_chs_left=4 * channels,
+            in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_15 = NormalCell(
+            in_chs_left=24 * channels, out_chs_left=4 * channels,
+            in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_16 = NormalCell(
+            in_chs_left=24 * channels, out_chs_left=4 * channels,
+            in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.cell_17 = NormalCell(
+            in_chs_left=24 * channels, out_chs_left=4 * channels,
+            in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type)
+        self.act = nn.ReLU(inplace=True)
+        # Feature taps (channel count, reduction factor, module path). The
+        # num_chs values are literals and are not recomputed from stem_size or
+        # num_features; they line up with the default arguments above.
+        self.feature_info = [
+            dict(num_chs=96, reduction=2, module='conv0'),
+            dict(num_chs=168, reduction=4, module='cell_stem_1.conv_1x1.act'),
+            dict(num_chs=1008, reduction=8, module='reduction_cell_0.conv_1x1.act'),
+            dict(num_chs=2016, reduction=16, module='reduction_cell_1.conv_1x1.act'),
+            dict(num_chs=4032, reduction=32, module='act'),
+        ]
+
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns grouping parameters into stem and blocks.
+
+        ``coarse`` is accepted but not used here.
+        """
+        matcher = dict(
+            stem=r'^conv0|cell_stem_[01]',
+            blocks=[
+                (r'^cell_(\d+)', None),
+                (r'^reduction_cell_0', (6,)),
+                (r'^reduction_cell_1', (12,)),
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Fail with ``AssertionError`` when ``enable`` is truthy; otherwise do nothing."""
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classifier module ``last_linear``."""
+        return self.last_linear
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """Rebuild ``global_pool`` and ``last_linear`` for ``num_classes`` outputs."""
+        self.num_classes = num_classes
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        """Run the stem, all cells and the final ReLU; returns the unpooled feature map.
+
+        Each cell is called as ``cell(current, previous)``.
+        """
+        x_conv0 = self.conv0(x)
+
+        x_stem_0 = self.cell_stem_0(x_conv0)
+        x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
+
+        x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
+        x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
+        x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
+        x_cell_3 = self.cell_3(x_cell_2, x_cell_1)
+        x_cell_4 = self.cell_4(x_cell_3, x_cell_2)
+        x_cell_5 = self.cell_5(x_cell_4, x_cell_3)
+
+        # The FirstCell after a reduction takes its "previous" input from the
+        # same tensor the reduction cell used as previous (x_cell_4 here,
+        # x_cell_10 below).
+        x_reduction_cell_0 = self.reduction_cell_0(x_cell_5, x_cell_4)
+        x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_4)
+        x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
+        x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
+        x_cell_9 = self.cell_9(x_cell_8, x_cell_7)
+        x_cell_10 = self.cell_10(x_cell_9, x_cell_8)
+        x_cell_11 = self.cell_11(x_cell_10, x_cell_9)
+
+        x_reduction_cell_1 = self.reduction_cell_1(x_cell_11, x_cell_10)
+        x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_10)
+        x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
+        x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
+        x_cell_15 = self.cell_15(x_cell_14, x_cell_13)
+        x_cell_16 = self.cell_16(x_cell_15, x_cell_14)
+        x_cell_17 = self.cell_17(x_cell_16, x_cell_15)
+        x = self.act(x_cell_17)
+        return x
+
+    def forward_head(self, x):
+        """Pool the features, apply dropout when ``drop_rate > 0``, then classify."""
+        x = self.global_pool(x)
+        if self.drop_rate > 0:
+            x = F.dropout(x, self.drop_rate, training=self.training)
+        x = self.last_linear(x)
+        return x
+
+    def forward(self, x):
+        """Apply ``forward_features`` followed by ``forward_head``."""
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_nasnet(variant, pretrained=False, **kwargs):
+    """Build a ``NASNetALarge`` through ``build_model_with_cfg`` with hook-based feature extraction."""
+    # not possible to re-write this model
+    hook_feature_cfg = dict(feature_cls='hook', no_rewrite=True)
+    return build_model_with_cfg(
+        NASNetALarge,
+        variant,
+        pretrained,
+        feature_cfg=hook_feature_cfg,
+        **kwargs,
+    )
+
+
+@register_model
+def nasnetalarge(pretrained=False, **kwargs):
+    """NASNet-A large model architecture.
+
+    ``pad_type='same'`` is always supplied; passing ``pad_type`` in ``kwargs`` as
+    well raises ``TypeError`` because of the duplicate keyword.
+    """
+    variant = 'nasnetalarge'
+    model_kwargs = dict(pad_type='same', **kwargs)
+    return _create_nasnet(variant, pretrained, **model_kwargs)
diff --git a/src/custom_timm/models/nest.py b/src/custom_timm/models/nest.py
new file mode 100644
index 0000000000000000000000000000000000000000..f626a2e61b5b6137170f42e7b8bf8f1f62d7e48f
--- /dev/null
+++ b/src/custom_timm/models/nest.py
@@ -0,0 +1,486 @@
+""" Nested Transformer (NesT) in PyTorch
+
+A PyTorch implementation of Aggregating Nested Transformers as described in:
+
+'Aggregating Nested Transformers'
+    - https://arxiv.org/abs/2105.12723
+
+The official Jax code is released and available at https://github.com/google-research/nested-transformer. The weights
+have been converted with convert/convert_nest_flax.py
+
+Acknowledgments:
+* The paper authors for sharing their research, code, and model weights
+* Ross Wightman's existing code off which I based this
+
+Copyright 2021 Alexander Soare
+"""
+
+import collections.abc
+import logging
+import math
+from functools import partial
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg, named_apply, checkpoint_seq
+from .layers import PatchEmbed, Mlp, DropPath, create_classifier, trunc_normal_
+from .layers import _assert
+from .layers import create_conv2d, create_pool2d, to_ntuple
+from .registry import register_model
+
+# Module-level logger, named after this module.
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': [14, 14],
+        'crop_pct': .875, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    # (weights from official Google JAX impl)
+    'nest_base': _cfg(),
+    'nest_small': _cfg(),
+    'nest_tiny': _cfg(),
+    'jx_nest_base': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_base-8bc41011.pth'),
+    'jx_nest_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_small-422eaded.pth'),
+    'jx_nest_tiny': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_tiny-e3428fb9.pth'),
+}
+
+
+class Attention(nn.Module):
+    """
+    This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
+     an extra "image block" dim
+    """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, 3*dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        """
+        x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
+        """ 
+        B, T, N, C = x.shape
+        # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head)
+        qkv = self.qkv(x).reshape(B, T, N, 3, self.num_heads, C // self.num_heads).permute(3, 0, 4, 1, 2, 5)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale # (B, H, T, N, N)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        # (B, H, T, N, C'), permute -> (B, T, N, C', H)
+        x = (attn @ v).permute(0, 2, 3, 4, 1).reshape(B, T, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x  # (B, T, N, C)
+
+
+class TransformerLayer(nn.Module):
+    """
+    This is much like `.vision_transformer.Block` but:
+        - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
+        - Uses modified Attention layer that handles the "block" dimension
+    """
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x):
+        y = self.norm1(x)
+        x = x + self.drop_path(self.attn(y))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class ConvPool(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer, pad_type=''):
+        super().__init__()
+        self.conv = create_conv2d(in_channels, out_channels, kernel_size=3, padding=pad_type, bias=True)
+        self.norm = norm_layer(out_channels)
+        self.pool = create_pool2d('max', kernel_size=3, stride=2, padding=pad_type)
+
+    def forward(self, x):
+        """
+        x is expected to have shape (B, C, H, W)
+        """
+        _assert(x.shape[-2] % 2 == 0, 'BlockAggregation requires even input spatial dims')
+        _assert(x.shape[-1] % 2 == 0, 'BlockAggregation requires even input spatial dims')
+        x = self.conv(x)
+        # Layer norm done over channel dim only
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        x = self.pool(x)
+        return x  # (B, C, H//2, W//2)
+
+
+def blockify(x, block_size: int):
+    """image to blocks
+    Args:
+        x (Tensor): with shape (B, H, W, C)
+        block_size (int): edge length of a single square block in units of H, W
+    """
+    B, H, W, C  = x.shape
+    _assert(H % block_size == 0, '`block_size` must divide input height evenly')
+    _assert(W % block_size == 0, '`block_size` must divide input width evenly')
+    grid_height = H // block_size
+    grid_width = W // block_size
+    x = x.reshape(B, grid_height, block_size, grid_width, block_size, C)
+    x = x.transpose(2, 3).reshape(B, grid_height * grid_width, -1, C)
+    return x  # (B, T, N, C)
+
+
+@register_notrace_function  # reason: int receives Proxy
+def deblockify(x, block_size: int):
+    """blocks to image
+    Args:
+        x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
+        block_size (int): edge length of a single square block in units of desired H, W
+    """
+    B, T, _, C = x.shape
+    grid_size = int(math.sqrt(T))
+    height = width = grid_size * block_size
+    x = x.reshape(B, grid_size, grid_size, block_size, block_size, C)
+    x = x.transpose(2, 3).reshape(B, height, width, C)
+    return x  # (B, H, W, C)
+
+
+class NestLevel(nn.Module):
+    """ Single hierarchical level of a Nested Transformer
+    """
+    def __init__(
+            self, num_blocks, block_size, seq_length, num_heads, depth, embed_dim, prev_embed_dim=None,
+            mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rates=[],
+            norm_layer=None, act_layer=None, pad_type=''):
+        super().__init__()
+        self.block_size = block_size
+        self.grad_checkpointing = False
+
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_blocks, seq_length, embed_dim))
+
+        if prev_embed_dim is not None:
+            self.pool = ConvPool(prev_embed_dim, embed_dim, norm_layer=norm_layer, pad_type=pad_type)
+        else:
+            self.pool = nn.Identity()
+
+        # Transformer encoder
+        if len(drop_path_rates):
+            assert len(drop_path_rates) == depth, 'Must provide as many drop path rates as there are transformer layers'
+        self.transformer_encoder = nn.Sequential(*[
+            TransformerLayer(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=drop_path_rates[i],
+                norm_layer=norm_layer, act_layer=act_layer)
+            for i in range(depth)])
+
+    def forward(self, x):
+        """
+        expects x as (B, C, H, W)
+        """
+        x = self.pool(x)
+        x = x.permute(0, 2, 3, 1)  # (B, H', W', C), switch to channels last for transformer
+        x = blockify(x, self.block_size)  # (B, T, N, C')
+        x = x + self.pos_embed
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.transformer_encoder, x)
+        else:
+            x = self.transformer_encoder(x)  # (B, T, N, C')
+        x = deblockify(x, self.block_size)  # (B, H', W', C')
+        # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage
+        return x.permute(0, 3, 1, 2)  # (B, C, H', W')
+
+
+class Nest(nn.Module):
+    """ Nested Transformer (NesT)
+
+    A PyTorch impl of : `Aggregating Nested Transformers`
+        - https://arxiv.org/abs/2105.12723
+    """
+
+    def __init__(
+            self, img_size=224, in_chans=3, patch_size=4, num_levels=3, embed_dims=(128, 256, 512),
+            num_heads=(4, 8, 16), depths=(2, 2, 20), num_classes=1000, mlp_ratio=4., qkv_bias=True,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0.5, norm_layer=None, act_layer=None,
+            pad_type='', weight_init='', global_pool='avg'
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            in_chans (int): number of input channels
+            patch_size (int): patch size
+            num_levels (int): number of block hierarchies (T_d in the paper)
+            embed_dims (int, tuple): embedding dimensions of each level
+            num_heads (int, tuple): number of attention heads for each level
+            depths (int, tuple): number of transformer layers for each level
+            num_classes (int): number of classes for classification head
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
+            qkv_bias (bool): enable bias for qkv if True
+            drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            norm_layer: (nn.Module): normalization layer for transformer layers
+            act_layer: (nn.Module): activation layer in MLP of transformer layers
+            pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME
+            weight_init: (str): weight init scheme
+            global_pool: (str): type of pooling operation to apply to final feature map
+
+        Notes:
+            - Default values follow NesT-B from the original Jax code.
+            - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
+            - For those following the paper, Table A1 may have errors!
+                - https://github.com/google-research/nested-transformer/issues/2
+        """
+        super().__init__()
+
+        for param_name in ['embed_dims', 'num_heads', 'depths']:
+            param_value = locals()[param_name]
+            if isinstance(param_value, collections.abc.Sequence):
+                assert len(param_value) == num_levels, f'Require `len({param_name}) == num_levels`'
+
+        embed_dims = to_ntuple(num_levels)(embed_dims)
+        num_heads = to_ntuple(num_levels)(num_heads)
+        depths = to_ntuple(num_levels)(depths)
+        self.num_classes = num_classes
+        self.num_features = embed_dims[-1]
+        self.feature_info = []
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+        self.drop_rate = drop_rate
+        self.num_levels = num_levels
+        if isinstance(img_size, collections.abc.Sequence):
+            assert img_size[0] == img_size[1], 'Model only handles square inputs'
+            img_size = img_size[0]
+        assert img_size % patch_size == 0, '`patch_size` must divide `img_size` evenly'
+        self.patch_size = patch_size
+
+        # Number of blocks at each level
+        self.num_blocks = (4 ** torch.arange(num_levels)).flip(0).tolist()
+        assert (img_size // patch_size) % math.sqrt(self.num_blocks[0]) == 0, \
+            'First level blocks don\'t fit evenly. Check `img_size`, `patch_size`, and `num_levels`'
+
+        # Block edge size in units of patches
+        # Hint: (img_size // patch_size) gives number of patches along edge of image. sqrt(self.num_blocks[0]) is the
+        #  number of blocks along edge of image
+        self.block_size = int((img_size // patch_size) // math.sqrt(self.num_blocks[0]))
+        
+        # Patch embedding
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dims[0], flatten=False)
+        self.num_patches = self.patch_embed.num_patches
+        self.seq_length = self.num_patches // self.num_blocks[0]
+
+        # Build up each hierarchical level
+        levels = []
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        prev_dim = None
+        curr_stride = 4
+        for i in range(len(self.num_blocks)):
+            dim = embed_dims[i]
+            levels.append(NestLevel(
+                self.num_blocks[i], self.block_size, self.seq_length, num_heads[i], depths[i], dim, prev_dim,
+                mlp_ratio, qkv_bias, drop_rate, attn_drop_rate, dp_rates[i], norm_layer, act_layer, pad_type=pad_type))
+            self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f'levels.{i}')]
+            prev_dim = dim
+            curr_stride *= 2
+        self.levels = nn.Sequential(*levels)
+
+        # Final normalization layer
+        self.norm = norm_layer(embed_dims[-1])
+
+        # Classifier
+        self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+        self.init_weights(weight_init)
+
+    @torch.jit.ignore
+    def init_weights(self, mode=''):
+        assert mode in ('nlhb', '')
+        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
+        for level in self.levels:
+            trunc_normal_(level.pos_embed, std=.02, a=-2, b=2)
+        named_apply(partial(_init_nest_weights, head_bias=head_bias), self)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {f'level.{i}.pos_embed' for i in range(len(self.levels))}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^patch_embed',  # stem and embed
+            blocks=[
+                (r'^levels\.(\d+)' if coarse else r'^levels\.(\d+)\.transformer_encoder\.(\d+)', None),
+                (r'^levels\.(\d+)\.(?:pool|pos_embed)', (0,)),
+                (r'^norm', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for l in self.levels:
+            l.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.head = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        x = self.levels(x)
+        # Layer norm done over channel dim only (to NHWC and back)
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_nest_weights(module: nn.Module, name: str = '', head_bias: float = 0.):
+    """ NesT weight initialization
+    Can replicate Jax implementation. Otherwise follows vision_transformer.py
+    """
+    if isinstance(module, nn.Linear):
+        if name.startswith('head'):
+            trunc_normal_(module.weight, std=.02, a=-2, b=2)
+            nn.init.constant_(module.bias, head_bias)
+        else:
+            trunc_normal_(module.weight, std=.02, a=-2, b=2)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        trunc_normal_(module.weight, std=.02, a=-2, b=2)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+
+
+def resize_pos_embed(posemb, posemb_new):
+    """
+    Rescale the grid of position embeddings when loading from state_dict
+    Expected shape of position embeddings is (1, T, N, C), and considers only square images
+    """
+    _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
+    seq_length_old = posemb.shape[2]
+    num_blocks_new, seq_length_new = posemb_new.shape[1:3]
+    size_new = int(math.sqrt(num_blocks_new*seq_length_new))
+    # First change to (1, C, H, W)
+    posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2)
+    posemb = F.interpolate(posemb, size=[size_new, size_new], mode='bicubic', align_corners=False)
+    # Now change to new (1, T, N, C)
+    posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new)))
+    return posemb
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ resize positional embeddings of pretrained weights """
+    pos_embed_keys = [k for k in state_dict.keys() if k.startswith('pos_embed_')]
+    for k in pos_embed_keys:
+        if state_dict[k].shape != getattr(model, k).shape:
+            state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k))
+    return state_dict
+
+
+def _create_nest(variant, pretrained=False, **kwargs):
+    model = build_model_with_cfg(
+        Nest, variant, pretrained,
+        feature_cfg=dict(out_indices=(0, 1, 2), flatten_sequential=True),
+        pretrained_filter_fn=checkpoint_filter_fn,
+        **kwargs)
+
+    return model
+
+
+@register_model
+def nest_base(pretrained=False, **kwargs):
+    """ Nest-B @ 224x224
+    """
+    model_kwargs = dict(
+        embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
+    model = _create_nest('nest_base', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def nest_small(pretrained=False, **kwargs):
+    """ Nest-S @ 224x224
+    """
+    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
+    model = _create_nest('nest_small', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def nest_tiny(pretrained=False, **kwargs):
+    """ Nest-T @ 224x224
+    """
+    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
+    model = _create_nest('nest_tiny', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def jx_nest_base(pretrained=False, **kwargs):
+    """ Nest-B @ 224x224, Pretrained weights converted from official Jax impl.
+    """
+    kwargs['pad_type'] = 'same'
+    model_kwargs = dict(embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
+    model = _create_nest('jx_nest_base', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def jx_nest_small(pretrained=False, **kwargs):
+    """ Nest-S @ 224x224, Pretrained weights converted from official Jax impl.
+    """
+    kwargs['pad_type'] = 'same'
+    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
+    model = _create_nest('jx_nest_small', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def jx_nest_tiny(pretrained=False, **kwargs):
+    """ Nest-T @ 224x224, Pretrained weights converted from official Jax impl.
+    """
+    kwargs['pad_type'] = 'same'
+    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
+    model = _create_nest('jx_nest_tiny', pretrained=pretrained, **model_kwargs)
+    return model
diff --git a/src/custom_timm/models/nfnet.py b/src/custom_timm/models/nfnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e65151f4b9108ba19143cba01ac282b4c3f3c973
--- /dev/null
+++ b/src/custom_timm/models/nfnet.py
@@ -0,0 +1,893 @@
+""" Normalization Free Nets. NFNet, NF-RegNet, NF-ResNet (pre-activation) Models
+
+Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    - https://arxiv.org/abs/2101.08692
+
+Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    - https://arxiv.org/abs/2102.06171
+
+Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets
+
+Status:
+* These models are a work in progress, experiments ongoing.
+* Pretrained weights for two models so far, more to come.
+* Model details updated to closer match official JAX code now that it's released
+* NF-ResNet, NF-RegNet-B, and NFNet-F models supported
+
+Hacked together by / copyright Ross Wightman, 2021.
+"""
+import math
+from dataclasses import dataclass, field
+from collections import OrderedDict
+from typing import Tuple, Optional
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_module
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .registry import register_model
+from .layers import ClassifierHead, DropPath, AvgPool2dSame, ScaledStdConv2d, ScaledStdConv2dSame,\
+    get_act_layer, get_act_fn, get_attn, make_divisible
+
+
+def _dcfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.9, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.conv1', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+default_cfgs = dict(
+    dm_nfnet_f0=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f0-604f9c3a.pth',
+        pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), crop_pct=.9),
+    dm_nfnet_f1=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f1-fc540f82.pth',
+        pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320), crop_pct=0.91),
+    dm_nfnet_f2=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f2-89875923.pth',
+        pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352), crop_pct=0.92),
+    dm_nfnet_f3=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f3-d74ab3aa.pth',
+        pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416), crop_pct=0.94),
+    dm_nfnet_f4=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f4-0ac5b10b.pth',
+        pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512), crop_pct=0.951),
+    dm_nfnet_f5=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f5-ecb20ab1.pth',
+        pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544), crop_pct=0.954),
+    dm_nfnet_f6=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f6-e0f12116.pth',
+        pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576), crop_pct=0.956),
+
+    nfnet_f0=_dcfg(
+        url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256)),
+    nfnet_f1=_dcfg(
+        url='', pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320)),
+    nfnet_f2=_dcfg(
+        url='', pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352)),
+    nfnet_f3=_dcfg(
+        url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416)),
+    nfnet_f4=_dcfg(
+        url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512)),
+    nfnet_f5=_dcfg(
+        url='', pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544)),
+    nfnet_f6=_dcfg(
+        url='', pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576)),
+    nfnet_f7=_dcfg(
+        url='', pool_size=(15, 15), input_size=(3, 480, 480), test_input_size=(3, 608, 608)),
+
+    nfnet_l0=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nfnet_l0_ra2-45c6688d.pth',
+        pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0),
+    eca_nfnet_l0=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l0_ra2-e3e9ac50.pth',
+        hf_hub_id='timm/eca_nfnet_l0',
+        pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0),
+    eca_nfnet_l1=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l1_ra2-7dce93cd.pth',
+        pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 320, 320), crop_pct=1.0),
+    eca_nfnet_l2=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l2_ra3-da781a61.pth',
+        pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 384, 384), crop_pct=1.0),
+    eca_nfnet_l3=_dcfg(
+        url='',
+        pool_size=(11, 11), input_size=(3, 352, 352), test_input_size=(3, 448, 448), crop_pct=1.0),
+
+    nf_regnet_b0=_dcfg(
+        url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), first_conv='stem.conv'),
+    nf_regnet_b1=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_regnet_b1_256_ra2-ad85cfef.pth',
+        pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), first_conv='stem.conv'),  # NOT to paper spec
+    nf_regnet_b2=_dcfg(
+        url='', pool_size=(8, 8), input_size=(3, 240, 240), test_input_size=(3, 272, 272), first_conv='stem.conv'),
+    nf_regnet_b3=_dcfg(
+        url='', pool_size=(9, 9), input_size=(3, 288, 288), test_input_size=(3, 320, 320), first_conv='stem.conv'),
+    nf_regnet_b4=_dcfg(
+        url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 384, 384), first_conv='stem.conv'),
+    nf_regnet_b5=_dcfg(
+        url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 456, 456), first_conv='stem.conv'),
+
+    nf_resnet26=_dcfg(url='', first_conv='stem.conv'),
+    nf_resnet50=_dcfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_resnet50_ra2-9f236009.pth',
+        pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), crop_pct=0.94, first_conv='stem.conv'),
+    nf_resnet101=_dcfg(url='', first_conv='stem.conv'),
+
+    nf_seresnet26=_dcfg(url='', first_conv='stem.conv'),
+    nf_seresnet50=_dcfg(url='', first_conv='stem.conv'),
+    nf_seresnet101=_dcfg(url='', first_conv='stem.conv'),
+
+    nf_ecaresnet26=_dcfg(url='', first_conv='stem.conv'),
+    nf_ecaresnet50=_dcfg(url='', first_conv='stem.conv'),
+    nf_ecaresnet101=_dcfg(url='', first_conv='stem.conv'),
+)
+
+
+@dataclass
+class NfCfg:
+    """Configuration record for one normalization-free model variant, filled in by the `_nf*_cfg` helpers."""
+    # Two required 4-tuples; NOTE(review): presumably one entry per network stage — confirm against the model code
+    depths: Tuple[int, int, int, int]
+    channels: Tuple[int, int, int, int]
+    alpha: float = 0.2
+    stem_type: str = '3x3'
+    stem_chs: Optional[int] = None
+    group_size: Optional[int] = None
+    attn_layer: Optional[str] = None
+    # Defaults to None (not an empty dict); the `_nf*_cfg` helpers in this file always pass a dict
+    attn_kwargs: Optional[dict] = None
+    attn_gain: float = 2.0  # NF correction gain to apply if attn layer is used
+    width_factor: float = 1.0
+    bottle_ratio: float = 0.5
+    num_features: int = 0  # num out_channels for final conv, no final_conv if 0
+    ch_div: int = 8  # round channels % 8 == 0 to keep tensor-core use optimal
+    reg: bool = False  # enables EfficientNet-like options used in RegNet variants, expand from in_chs, se in middle
+    extra_conv: bool = False  # extra 3x3 bottleneck convolution for NFNet models
+    gamma_in_act: bool = False
+    same_padding: bool = False
+    std_conv_eps: float = 1e-5
+    skipinit: bool = False  # disabled by default, non-trivial performance impact
+    zero_init_fc: bool = False
+    act_layer: str = 'silu'
+
+
+def _nfres_cfg(
+        depths, channels=(256, 512, 1024, 2048), group_size=None, act_layer='relu', attn_layer=None, attn_kwargs=None):
+    attn_kwargs = attn_kwargs or {}
+    cfg = NfCfg(
+        depths=depths, channels=channels, stem_type='7x7_pool', stem_chs=64, bottle_ratio=0.25,
+        group_size=group_size, act_layer=act_layer, attn_layer=attn_layer, attn_kwargs=attn_kwargs)
+    return cfg
+
+
+def _nfreg_cfg(depths, channels=(48, 104, 208, 440)):
+    num_features = 1280 * channels[-1] // 440
+    attn_kwargs = dict(rd_ratio=0.5)
+    cfg = NfCfg(
+        depths=depths, channels=channels, stem_type='3x3', group_size=8, width_factor=0.75, bottle_ratio=2.25,
+        num_features=num_features, reg=True, attn_layer='se', attn_kwargs=attn_kwargs)
+    return cfg
+
+
+def _nfnet_cfg(
+        depths, channels=(256, 512, 1536, 1536), group_size=128, bottle_ratio=0.5, feat_mult=2.,
+        act_layer='gelu', attn_layer='se', attn_kwargs=None):
+    num_features = int(channels[-1] * feat_mult)
+    attn_kwargs = attn_kwargs if attn_kwargs is not None else dict(rd_ratio=0.5)
+    cfg = NfCfg(
+        depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=group_size,
+        bottle_ratio=bottle_ratio, extra_conv=True, num_features=num_features, act_layer=act_layer,
+        attn_layer=attn_layer, attn_kwargs=attn_kwargs)
+    return cfg
+
+
+def _dm_nfnet_cfg(depths, channels=(256, 512, 1536, 1536), act_layer='gelu', skipinit=True):
+    cfg = NfCfg(
+        depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=128,
+        bottle_ratio=0.5, extra_conv=True, gamma_in_act=True, same_padding=True, skipinit=skipinit,
+        num_features=int(channels[-1] * 2.0), act_layer=act_layer, attn_layer='se', attn_kwargs=dict(rd_ratio=0.5))
+    return cfg
+
+
+model_cfgs = {
+    # NFNet-F models w/ GELU, configs compatible with DeepMind weights
+    'dm_nfnet_f0': _dm_nfnet_cfg(depths=(1, 2, 6, 3)),
+    'dm_nfnet_f1': _dm_nfnet_cfg(depths=(2, 4, 12, 6)),
+    'dm_nfnet_f2': _dm_nfnet_cfg(depths=(3, 6, 18, 9)),
+    'dm_nfnet_f3': _dm_nfnet_cfg(depths=(4, 8, 24, 12)),
+    'dm_nfnet_f4': _dm_nfnet_cfg(depths=(5, 10, 30, 15)),
+    'dm_nfnet_f5': _dm_nfnet_cfg(depths=(6, 12, 36, 18)),
+    'dm_nfnet_f6': _dm_nfnet_cfg(depths=(7, 14, 42, 21)),
+
+    # NFNet-F models w/ GELU
+    'nfnet_f0': _nfnet_cfg(depths=(1, 2, 6, 3)),
+    'nfnet_f1': _nfnet_cfg(depths=(2, 4, 12, 6)),
+    'nfnet_f2': _nfnet_cfg(depths=(3, 6, 18, 9)),
+    'nfnet_f3': _nfnet_cfg(depths=(4, 8, 24, 12)),
+    'nfnet_f4': _nfnet_cfg(depths=(5, 10, 30, 15)),
+    'nfnet_f5': _nfnet_cfg(depths=(6, 12, 36, 18)),
+    'nfnet_f6': _nfnet_cfg(depths=(7, 14, 42, 21)),
+    'nfnet_f7': _nfnet_cfg(depths=(8, 16, 48, 24)),
+
+    # Experimental 'light' versions of NFNet-F that are a little leaner
+    'nfnet_l0': _nfnet_cfg(
+        depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25,
+        attn_kwargs={'rd_ratio': 0.25, 'rd_divisor': 8}, act_layer='silu'),
+    'eca_nfnet_l0': _nfnet_cfg(
+        depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25,
+        attn_layer='eca', attn_kwargs={}, act_layer='silu'),
+    'eca_nfnet_l1': _nfnet_cfg(
+        depths=(2, 4, 12, 6), feat_mult=2, group_size=64, bottle_ratio=0.25,
+        attn_layer='eca', attn_kwargs={}, act_layer='silu'),
+    'eca_nfnet_l2': _nfnet_cfg(
+        depths=(3, 6, 18, 9), feat_mult=2, group_size=64, bottle_ratio=0.25,
+        attn_layer='eca', attn_kwargs={}, act_layer='silu'),
+    'eca_nfnet_l3': _nfnet_cfg(
+        depths=(4, 8, 24, 12), feat_mult=2, group_size=64, bottle_ratio=0.25,
+        attn_layer='eca', attn_kwargs={}, act_layer='silu'),
+
+    # EffNet influenced RegNet defs.
+    # NOTE: These aren't quite the official ver, ch_div=1 must be set for exact ch counts. I round to ch_div=8.
+    'nf_regnet_b0': _nfreg_cfg(depths=(1, 3, 6, 6)),
+    'nf_regnet_b1': _nfreg_cfg(depths=(2, 4, 7, 7)),
+    'nf_regnet_b2': _nfreg_cfg(depths=(2, 4, 8, 8), channels=(56, 112, 232, 488)),
+    'nf_regnet_b3': _nfreg_cfg(depths=(2, 5, 9, 9), channels=(56, 128, 248, 528)),
+    'nf_regnet_b4': _nfreg_cfg(depths=(2, 6, 11, 11), channels=(64, 144, 288, 616)),
+    'nf_regnet_b5': _nfreg_cfg(depths=(3, 7, 14, 14), channels=(80, 168, 336, 704)),
+    # FIXME add B6-B8
+
+    # ResNet (preact, D style deep stem/avg down) defs
+    'nf_resnet26': _nfres_cfg(depths=(2, 2, 2, 2)),
+    'nf_resnet50': _nfres_cfg(depths=(3, 4, 6, 3)),
+    'nf_resnet101': _nfres_cfg(depths=(3, 4, 23, 3)),
+
+    'nf_seresnet26': _nfres_cfg(depths=(2, 2, 2, 2), attn_layer='se', attn_kwargs={'rd_ratio': 1 / 16}),
+    'nf_seresnet50': _nfres_cfg(depths=(3, 4, 6, 3), attn_layer='se', attn_kwargs={'rd_ratio': 1 / 16}),
+    'nf_seresnet101': _nfres_cfg(depths=(3, 4, 23, 3), attn_layer='se', attn_kwargs={'rd_ratio': 1 / 16}),
+
+    'nf_ecaresnet26': _nfres_cfg(depths=(2, 2, 2, 2), attn_layer='eca', attn_kwargs={}),
+    'nf_ecaresnet50': _nfres_cfg(depths=(3, 4, 6, 3), attn_layer='eca', attn_kwargs={}),
+    'nf_ecaresnet101': _nfres_cfg(depths=(3, 4, 23, 3), attn_layer='eca', attn_kwargs={}),
+
+}
+
+
+class GammaAct(nn.Module):
+    """ Applies the activation named by `act_type`, then scales its output in place (`mul_`) by `gamma`. """
+    def __init__(self, act_type='relu', gamma: float = 1.0, inplace=False):
+        super(GammaAct, self).__init__()
+        self.act_fn, self.gamma, self.inplace = get_act_fn(act_type), gamma, inplace
+
+    def forward(self, x):
+        activated = self.act_fn(x, inplace=self.inplace)
+        return activated.mul_(self.gamma)
+
+
+def act_with_gamma(act_type, gamma: float = 1.):
+    """ Return a factory that builds `GammaAct(act_type, gamma=gamma)` for a requested `inplace` setting. """
+    # the returned callable only takes `inplace`; act_type and gamma are fixed by the closure
+    return lambda inplace=False: GammaAct(act_type, gamma=gamma, inplace=inplace)
+
+
+class DownsampleAvg(nn.Module):
+    """ AvgPool + 1x1 conv downsampling as in 'D' ResNet variants, w/ dilation support (`first_dilation` unused). """
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, conv_layer=ScaledStdConv2d):
+        super().__init__()
+        if stride <= 1 and dilation <= 1:
+            self.pool = nn.Identity()
+        else:
+            pool_stride = 1 if dilation != 1 else stride  # a dilated block pools w/ stride 1
+            pool_layer = AvgPool2dSame if pool_stride == 1 and dilation > 1 else nn.AvgPool2d
+            self.pool = pool_layer(2, pool_stride, ceil_mode=True, count_include_pad=False)
+        self.conv = conv_layer(in_chs, out_chs, 1, stride=1)
+
+    def forward(self, x):
+        return self.conv(self.pool(x))
+
+
+@register_notrace_module  # reason: mul_ causes FX to drop a relevant node. https://github.com/pytorch/pytorch/issues/68301
+class NormFreeBlock(nn.Module):
+    """ Normalization-Free pre-activation block.
+
+    The input is activated and scaled by `beta`; that tensor feeds the residual convs and, when a
+    DownsampleAvg shortcut exists, the shortcut as well (otherwise the raw input is the shortcut).
+    The residual output is scaled by `alpha` before being added to the shortcut.
+    """
+
+    def __init__(
+            self, in_chs, out_chs=None, stride=1, dilation=1, first_dilation=None,
+            alpha=1.0, beta=1.0, bottle_ratio=0.25, group_size=None, ch_div=1, reg=True, extra_conv=False,
+            skipinit=False, attn_layer=None, attn_gain=2.0, act_layer=None, conv_layer=None, drop_path_rate=0.):
+        super().__init__()
+        out_chs = out_chs or in_chs
+        first_dilation = first_dilation or dilation
+        # bottleneck width is derived from in_chs for RegNet style blocks (reg=True), from out_chs otherwise
+        mid_chs = make_divisible((in_chs if reg else out_chs) * bottle_ratio, ch_div)
+        groups = mid_chs // group_size if group_size else 1
+        if group_size and group_size % ch_div == 0:
+            # snap mid_chs to a whole number of groups, only done when group_size is divisible by ch_div
+            mid_chs = group_size * groups
+        self.alpha, self.beta, self.attn_gain = alpha, beta, attn_gain
+
+        needs_downsample = in_chs != out_chs or stride != 1 or dilation != first_dilation
+        self.downsample = DownsampleAvg(
+            in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation,
+            conv_layer=conv_layer) if needs_downsample else None
+
+        has_attn = attn_layer is not None
+        self.act1 = act_layer()
+        self.conv1 = conv_layer(in_chs, mid_chs, 1)
+        self.act2 = act_layer(inplace=True)
+        self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups)
+        # optional second 3x3 conv, enabled by extra_conv
+        self.act2b = act_layer(inplace=True) if extra_conv else None
+        self.conv2b = conv_layer(
+            mid_chs, mid_chs, 3, stride=1, dilation=dilation, groups=groups) if extra_conv else None
+        self.attn = attn_layer(mid_chs) if reg and has_attn else None  # RegNet blocks apply attn btw conv2 & 3
+        self.act3 = act_layer()
+        self.conv3 = conv_layer(mid_chs, out_chs, 1, gain_init=1. if skipinit else 0.)
+        self.attn_last = attn_layer(out_chs) if not reg and has_attn else None  # ResNet blocks: attn after conv3
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.skipinit_gain = nn.Parameter(torch.tensor(0.)) if skipinit else None
+
+    def forward(self, x):
+        preact = self.act1(x) * self.beta
+
+        # shortcut branch, the downsample path consumes the activated & scaled input
+        shortcut = x
+        if self.downsample is not None:
+            shortcut = self.downsample(preact)
+
+        # residual branch
+        res = self.conv1(preact)
+        res = self.act2(res)
+        res = self.conv2(res)
+        if self.conv2b is not None:
+            res = self.act2b(res)
+            res = self.conv2b(res)
+        if self.attn is not None:
+            res = self.attn_gain * self.attn(res)
+        res = self.act3(res)
+        res = self.conv3(res)
+        if self.attn_last is not None:
+            res = self.attn_gain * self.attn_last(res)
+        res = self.drop_path(res)
+
+        if self.skipinit_gain is not None:
+            res.mul_(self.skipinit_gain)  # this slows things down more than expected, TBD
+        return res * self.alpha + shortcut
+
+
+def create_stem(in_chs, out_chs, stem_type='', conv_layer=None, act_layer=None, preact_feature=True):
+    """ Build the network stem as an nn.Sequential.
+
+    Returns a tuple of (stem module, total stem stride, feature info dict describing the stem feature).
+    `preact_feature` is accepted but not used by this function.
+    """
+    assert stem_type in ('', 'deep', 'deep_tiered', 'deep_quad', '3x3', '7x7', 'deep_pool', '3x3_pool', '7x7_pool')
+    has_pool = 'pool' in stem_type
+    stem_stride = 2
+    stem_feature = dict(num_chs=out_chs, reduction=2, module='stem.conv')
+    stem = OrderedDict()
+    if 'deep' in stem_type:
+        if 'quad' in stem_type:
+            # 4 deep conv stack as in NFNet-F models
+            assert not has_pool
+            stem_chs = (out_chs // 8, out_chs // 4, out_chs // 2, out_chs)
+            strides = (2, 1, 1, 2)
+            stem_stride = 4
+            stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv3')
+        else:
+            # 3 conv stack: 'T' resnets in resnet.py when tiered, otherwise 'D' ResNets
+            first_chs = 3 * out_chs // 8 if 'tiered' in stem_type else out_chs // 2
+            stem_chs = (first_chs, out_chs // 2, out_chs)
+            strides = (2, 1, 1)
+            stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv2')
+        num_convs = len(stem_chs)
+        for idx, (chs, conv_stride) in enumerate(zip(stem_chs, strides), start=1):
+            stem[f'conv{idx}'] = conv_layer(in_chs, chs, kernel_size=3, stride=conv_stride)
+            if idx < num_convs:
+                # every conv but the last is followed by an activation
+                stem[f'act{idx + 1}'] = act_layer(inplace=True)
+            in_chs = chs
+    else:
+        # single conv stem: 3x3 as in RegNet, otherwise 7x7 as in ResNet
+        kernel_size = 3 if '3x3' in stem_type else 7
+        stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=kernel_size, stride=2)
+
+    if has_pool:
+        stem['pool'] = nn.MaxPool2d(3, stride=2, padding=1)
+        stem_stride = 4
+
+    return nn.Sequential(stem), stem_stride, stem_feature
+
+
+# per-activation gamma constants, values from https://github.com/deepmind/deepmind-research/tree/master/nfnets
+_nonlin_gamma = {
+    'identity': 1.0,
+    'celu': 1.270926833152771,
+    'elu': 1.2716004848480225,
+    'gelu': 1.7015043497085571,
+    'leaky_relu': 1.70590341091156,
+    'log_sigmoid': 1.9193484783172607,
+    'log_softmax': 1.0002083778381348,
+    'relu': 1.7139588594436646,
+    'relu6': 1.7131484746932983,
+    'selu': 1.0008515119552612,
+    'sigmoid': 4.803835391998291,
+    'silu': 1.7881293296813965,
+    'softsign': 2.338853120803833,
+    'softplus': 1.9203323125839233,
+    'tanh': 1.5939117670059204,
+}
+
+
+class NormFreeNet(nn.Module):
+    """ Normalization-Free Network
+
+    As described in :
+    `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+        - https://arxiv.org/abs/2101.08692
+    and
+    `High-Performance Large-Scale Image Recognition Without Normalization` - https://arxiv.org/abs/2102.06171
+
+    This model aims to cover both the NFRegNet-Bx models as detailed in the paper's code snippets and
+    the (preact) ResNet models described earlier in the paper.
+
+    There are a few differences:
+        * channels are rounded to be divisible by 8 by default (keep tensor core kernels happy),
+            this changes channel dim and param counts slightly from the paper models
+        * activation correcting gamma constants are moved into the ScaledStdConv as it has less performance
+            impact in PyTorch when done with the weight scaling there. This likely wasn't a concern in the JAX impl.
+        * a config option `gamma_in_act` can be enabled to not apply gamma in StdConv as described above, but
+            apply it in each activation. This is slightly slower, numerically different, but matches official impl.
+        * skipinit is disabled by default, it seems to have a rather drastic impact on GPU memory use and throughput
+            for what it is/does. Approx 8-10% throughput loss.
+    """
+    def __init__(
+            self, cfg: NfCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32,
+            drop_rate=0., drop_path_rate=0.
+    ):
+        """
+        Args:
+            cfg: NfCfg describing the architecture (depths, channels, stem, activation, attention, ...)
+            num_classes: number of classes passed to the ClassifierHead
+            in_chans: number of input channels of the stem
+            global_pool: pool_type passed to the ClassifierHead
+            output_stride: once the network stride reaches this value, further stage strides become dilation
+            drop_rate: drop_rate passed to the ClassifierHead
+            drop_path_rate: upper end of the per-block drop path rates, spaced linearly from 0 over all blocks
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        assert cfg.act_layer in _nonlin_gamma, f"Please add non-linearity constants for activation ({cfg.act_layer})."
+        conv_layer = ScaledStdConv2dSame if cfg.same_padding else ScaledStdConv2d
+        if cfg.gamma_in_act:
+            # gamma is applied by the activation wrapper instead of the conv
+            act_layer = act_with_gamma(cfg.act_layer, gamma=_nonlin_gamma[cfg.act_layer])
+            conv_layer = partial(conv_layer, eps=cfg.std_conv_eps)
+        else:
+            # gamma is passed to the conv layer
+            act_layer = get_act_layer(cfg.act_layer)
+            conv_layer = partial(conv_layer, gamma=_nonlin_gamma[cfg.act_layer], eps=cfg.std_conv_eps)
+        attn_layer = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None
+
+        stem_chs = make_divisible((cfg.stem_chs or cfg.channels[0]) * cfg.width_factor, cfg.ch_div)
+        self.stem, stem_stride, stem_feat = create_stem(
+            in_chans, stem_chs, cfg.stem_type, conv_layer=conv_layer, act_layer=act_layer)
+
+        self.feature_info = [stem_feat]
+        # one list of drop path rates per stage, rates increase linearly over all blocks of the network
+        drop_path_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)]
+        prev_chs = stem_chs
+        net_stride = stem_stride
+        dilation = 1
+        expected_var = 1.0
+        stages = []
+        for stage_idx, stage_depth in enumerate(cfg.depths):
+            # the first stage is not strided when the stem already reduced by more than 2
+            stride = 1 if stage_idx == 0 and stem_stride > 2 else 2
+            if net_stride >= output_stride and stride > 1:
+                # target output stride reached, trade the stride for dilation
+                dilation *= stride
+                stride = 1
+            net_stride *= stride
+            first_dilation = 1 if dilation in (1, 2) else 2
+
+            blocks = []
+            for block_idx in range(stage_depth):
+                first_block = block_idx == 0 and stage_idx == 0
+                out_chs = make_divisible(cfg.channels[stage_idx] * cfg.width_factor, cfg.ch_div)
+                blocks += [NormFreeBlock(
+                    in_chs=prev_chs, out_chs=out_chs,
+                    alpha=cfg.alpha,
+                    beta=1. / expected_var ** 0.5,
+                    stride=stride if block_idx == 0 else 1,
+                    dilation=dilation,
+                    first_dilation=first_dilation,
+                    group_size=cfg.group_size,
+                    bottle_ratio=1. if cfg.reg and first_block else cfg.bottle_ratio,
+                    ch_div=cfg.ch_div,
+                    reg=cfg.reg,
+                    extra_conv=cfg.extra_conv,
+                    skipinit=cfg.skipinit,
+                    attn_layer=attn_layer,
+                    attn_gain=cfg.attn_gain,
+                    act_layer=act_layer,
+                    conv_layer=conv_layer,
+                    drop_path_rate=drop_path_rates[stage_idx][block_idx],
+                )]
+                if block_idx == 0:
+                    expected_var = 1.  # expected var is reset after first block of each stage
+                expected_var += cfg.alpha ** 2   # Even if reset occurs, increment expected variance
+                first_dilation = dilation
+                prev_chs = out_chs
+            self.feature_info += [dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}')]
+            stages += [nn.Sequential(*blocks)]
+        self.stages = nn.Sequential(*stages)
+
+        if cfg.num_features:
+            # The paper NFRegNet models have an EfficientNet-like final head convolution.
+            self.num_features = make_divisible(cfg.width_factor * cfg.num_features, cfg.ch_div)
+            self.final_conv = conv_layer(prev_chs, self.num_features, 1)
+            # the last stage feature is replaced by the final conv output
+            self.feature_info[-1] = dict(num_chs=self.num_features, reduction=net_stride, module='final_conv')
+        else:
+            self.num_features = prev_chs
+            self.final_conv = nn.Identity()
+        self.final_act = act_layer(inplace=cfg.num_features > 0)
+
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+        # weight init: Linear layers w/ 'fc' in their module name and every nn.Conv2d instance
+        for n, m in self.named_modules():
+            if 'fc' in n and isinstance(m, nn.Linear):
+                if cfg.zero_init_fc:
+                    nn.init.zeros_(m.weight)
+                else:
+                    nn.init.normal_(m.weight, 0., .01)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """ Return regex patterns that group module names: per stage if `coarse`, else per block in each stage. """
+        matcher = dict(
+            stem=r'^stem',
+            blocks=[
+                (r'^stages\.(\d+)' if coarse else r'^stages\.(\d+)\.(\d+)', None),
+                (r'^final_conv', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """ Toggle the flag that makes `forward_features` run the stages through `checkpoint_seq`. """
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """ Return the `fc` module of the classifier head. """
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """ Replace the head with a new ClassifierHead for `num_classes`, using `global_pool` as pool_type. """
+        # keep the attribute set in __init__ in sync with the rebuilt head
+        self.num_classes = num_classes
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        """ Run stem, stages, final conv and final activation; the classifier head is not applied. """
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.stages, x)
+        else:
+            x = self.stages(x)
+        x = self.final_conv(x)
+        x = self.final_act(x)
+        return x
+
+    def forward_head(self, x):
+        """ Apply the classifier head to the output of `forward_features`. """
+        return self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_normfreenet(variant, pretrained=False, **kwargs):
+    """ Build a NormFreeNet for `variant` via `build_model_with_cfg`.
+
+    `variant` must be a key of `model_cfgs`; extra `kwargs` are forwarded to `build_model_with_cfg`.
+    """
+    arch_cfg = model_cfgs[variant]
+    return build_model_with_cfg(
+        NormFreeNet, variant, pretrained, model_cfg=arch_cfg, feature_cfg=dict(flatten_sequential=True), **kwargs)
+
+
+@register_model
+def dm_nfnet_f0(pretrained=False, **kwargs):
+    """ NFNet-F0 w/ GELU, DeepMind weight compatible config, stage depths (1, 2, 6, 3).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f1(pretrained=False, **kwargs):
+    """ NFNet-F1 w/ GELU, DeepMind weight compatible config, stage depths (2, 4, 12, 6).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f2(pretrained=False, **kwargs):
+    """ NFNet-F2 w/ GELU, DeepMind weight compatible config, stage depths (3, 6, 18, 9).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f3(pretrained=False, **kwargs):
+    """ NFNet-F3 w/ GELU, DeepMind weight compatible config, stage depths (4, 8, 24, 12).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f4(pretrained=False, **kwargs):
+    """ NFNet-F4 w/ GELU, DeepMind weight compatible config, stage depths (5, 10, 30, 15).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f5(pretrained=False, **kwargs):
+    """ NFNet-F5 w/ GELU, DeepMind weight compatible config, stage depths (6, 12, 36, 18).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f5', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def dm_nfnet_f6(pretrained=False, **kwargs):
+    """ NFNet-F6 w/ GELU, DeepMind weight compatible config, stage depths (7, 14, 42, 21).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='dm_nfnet_f6', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f0(pretrained=False, **kwargs):
+    """ NFNet-F0 w/ GELU, stage depths (1, 2, 6, 3).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f1(pretrained=False, **kwargs):
+    """ NFNet-F1 w/ GELU, stage depths (2, 4, 12, 6).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f2(pretrained=False, **kwargs):
+    """ NFNet-F2 w/ GELU, stage depths (3, 6, 18, 9).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f3(pretrained=False, **kwargs):
+    """ NFNet-F3 w/ GELU, stage depths (4, 8, 24, 12).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f4(pretrained=False, **kwargs):
+    """ NFNet-F4 w/ GELU, stage depths (5, 10, 30, 15).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f5(pretrained=False, **kwargs):
+    """ NFNet-F5 w/ GELU, stage depths (6, 12, 36, 18).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f5', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f6(pretrained=False, **kwargs):
+    """ NFNet-F6 w/ GELU, stage depths (7, 14, 42, 21).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f6', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_f7(pretrained=False, **kwargs):
+    """ NFNet-F7 w/ GELU, stage depths (8, 16, 48, 24).
+    Paper: `High-Performance Large-Scale Image Recognition Without Normalization`
+    Link: https://arxiv.org/abs/2102.06171
+    """
+    return _create_normfreenet(variant='nfnet_f7', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nfnet_l0(pretrained=False, **kwargs):
+    """ NFNet-L0b w/ SiLU, an experimental 'light' NFNet.
+    Uses F0 stage depths, 1.5x final_conv multiplier, group_size 64, 0.25 bottleneck ratio & 0.25 SE rd_ratio.
+    """
+    return _create_normfreenet(variant='nfnet_l0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_nfnet_l0(pretrained=False, **kwargs):
+    """ ECA-NFNet-L0 w/ SiLU, an experimental 'light' NFNet.
+    Uses F0 stage depths, 1.5x final_conv multiplier, group_size 64, 0.25 bottleneck ratio & ECA attention.
+    """
+    return _create_normfreenet(variant='eca_nfnet_l0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_nfnet_l1(pretrained=False, **kwargs):
+    """ ECA-NFNet-L1 w/ SiLU, an experimental 'light' NFNet.
+    Uses F1 stage depths, 2.0x final_conv multiplier, group_size 64, 0.25 bottleneck ratio & ECA attention.
+    """
+    return _create_normfreenet(variant='eca_nfnet_l1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_nfnet_l2(pretrained=False, **kwargs):
+    """ ECA-NFNet-L2 w/ SiLU, an experimental 'light' NFNet.
+    Uses F2 stage depths, 2.0x final_conv multiplier, group_size 64, 0.25 bottleneck ratio & ECA attention.
+    """
+    return _create_normfreenet(variant='eca_nfnet_l2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_nfnet_l3(pretrained=False, **kwargs):
+    """ ECA-NFNet-L3 w/ SiLU, an experimental 'light' NFNet.
+    Uses F3 stage depths, 2.0x final_conv multiplier, group_size 64, 0.25 bottleneck ratio & ECA attention.
+    """
+    return _create_normfreenet(variant='eca_nfnet_l3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b0(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B0, stage depths (1, 3, 6, 6).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b1(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B1, stage depths (2, 4, 7, 7).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b2(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B2, stage depths (2, 4, 8, 8), channels (56, 112, 232, 488).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b3(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B3, stage depths (2, 5, 9, 9), channels (56, 128, 248, 528).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b4(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B4, stage depths (2, 6, 11, 11), channels (64, 144, 288, 616).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_regnet_b5(pretrained=False, **kwargs):
+    """ Normalization-Free RegNet-B5, stage depths (3, 7, 14, 14), channels (80, 168, 336, 704).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_regnet_b5', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_resnet26(pretrained=False, **kwargs):
+    """ Normalization-Free ResNet-26, stage depths (2, 2, 2, 2).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_resnet26', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_resnet50(pretrained=False, **kwargs):
+    """ Normalization-Free ResNet-50, stage depths (3, 4, 6, 3).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_resnet50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_resnet101(pretrained=False, **kwargs):
+    """ Normalization-Free ResNet-101, stage depths (3, 4, 23, 3).
+    Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    Link: https://arxiv.org/abs/2101.08692
+    """
+    return _create_normfreenet(variant='nf_resnet101', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_seresnet26(pretrained=False, **kwargs):
+    """ Normalization-Free SE-ResNet26: stage depths (2, 2, 2, 2), SE attention w/ rd_ratio=1/16.
+    """
+    return _create_normfreenet(variant='nf_seresnet26', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_seresnet50(pretrained=False, **kwargs):
+    """ Normalization-Free SE-ResNet50: stage depths (3, 4, 6, 3), SE attention w/ rd_ratio=1/16.
+    """
+    return _create_normfreenet(variant='nf_seresnet50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_seresnet101(pretrained=False, **kwargs):
+    """ Normalization-Free SE-ResNet101: stage depths (3, 4, 23, 3), SE attention w/ rd_ratio=1/16.
+    """
+    return _create_normfreenet(variant='nf_seresnet101', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_ecaresnet26(pretrained=False, **kwargs):
+    """ Normalization-Free ECA-ResNet26: stage depths (2, 2, 2, 2), ECA attention.
+    """
+    return _create_normfreenet(variant='nf_ecaresnet26', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_ecaresnet50(pretrained=False, **kwargs):
+    """ Normalization-Free ECA-ResNet50: stage depths (3, 4, 6, 3), ECA attention.
+    """
+    return _create_normfreenet(variant='nf_ecaresnet50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def nf_ecaresnet101(pretrained=False, **kwargs):
+    """ Normalization-Free ECA-ResNet101: stage depths (3, 4, 23, 3), ECA attention.
+    """
+    return _create_normfreenet(variant='nf_ecaresnet101', pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/pit.py b/src/custom_timm/models/pit.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dd79c0638fccbe52b91eab348f5abf61bdac67e
--- /dev/null
+++ b/src/custom_timm/models/pit.py
@@ -0,0 +1,404 @@
+""" Pooling-based Vision Transformer (PiT) in PyTorch
+
+A PyTorch implementation of Pooling-based Vision Transformers as described in
+'Rethinking Spatial Dimensions of Vision Transformers' - https://arxiv.org/abs/2103.16302
+
+This code was adapted from the original version at https://github.com/naver-ai/pit, original copyright below.
+
+Modifications for timm by / Copyright 2020 Ross Wightman
+"""
+# PiT
+# Copyright 2021-present NAVER Corp.
+# Apache License v2.0
+
+import math
+import re
+from copy import deepcopy
+from functools import partial
+from typing import Tuple
+
+import torch
+from torch import nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import trunc_normal_, to_2tuple
+from .registry import register_model
+from .vision_transformer import Block
+
+
+def _cfg(url='', **kwargs):
+    """ Baseline pretrained-config dict for PiT variants; keyword arguments override the defaults below. """
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='patch_embed.conv', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # PiT models (weights hosted under the timm v0.1-pit-weights release)
+    'pit_ti_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_730.pth'),
+    'pit_xs_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_781.pth'),
+    'pit_s_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_809.pth'),
+    'pit_b_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_820.pth'),
+    'pit_ti_distilled_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_distill_746.pth',
+        classifier=('head', 'head_dist')),
+    'pit_xs_distilled_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_distill_791.pth',
+        classifier=('head', 'head_dist')),
+    'pit_s_distilled_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_distill_819.pth',
+        classifier=('head', 'head_dist')),
+    'pit_b_distilled_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_distill_840.pth',
+        classifier=('head', 'head_dist')),
+}
+
+
+class SequentialTuple(nn.Sequential):
+    """ nn.Sequential whose forward is typed Tuple[Tensor, Tensor] -> Tuple[Tensor, Tensor] (torchscript typing workaround). """
+    def __init__(self, *args):
+        super(SequentialTuple, self).__init__(*args)
+
+    def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        for module in self:
+            x = module(x)  # each child consumes and returns the whole tuple
+        return x
+
+
+class Transformer(nn.Module):
+    """ One PiT stage: `depth` Blocks over [cls tokens + flattened spatial tokens], then an optional `pool` module. """
+    def __init__(
+            self, base_dim, depth, heads, mlp_ratio, pool=None, drop_rate=.0, attn_drop_rate=.0, drop_path_prob=None):
+        super(Transformer, self).__init__()
+        self.layers = nn.ModuleList([])  # never used in forward; kept so the module tree is unchanged
+        embed_dim = base_dim * heads  # token width of this stage
+        self.blocks = nn.Sequential(*[
+            Block(
+                dim=embed_dim,
+                num_heads=heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=True,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=drop_path_prob[i] if drop_path_prob is not None else 0.,  # None (the default) -> rate 0
+                norm_layer=partial(nn.LayerNorm, eps=1e-6)
+            )
+            for i in range(depth)])
+
+        self.pool = pool  # called as pool(x, cls_tokens) after the blocks when not None
+
+    def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        x, cls_tokens = x
+        B, C, H, W = x.shape
+        token_length = cls_tokens.shape[1]  # number of cls tokens prepended below
+
+        x = x.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
+        x = torch.cat((cls_tokens, x), dim=1)  # cls tokens first, so they can be sliced back off
+
+        x = self.blocks(x)
+
+        cls_tokens = x[:, :token_length]
+        x = x[:, token_length:]
+        x = x.transpose(1, 2).reshape(B, C, H, W)  # restore the spatial layout for the pooling module
+
+        if self.pool is not None:
+            x, cls_tokens = self.pool(x, cls_tokens)
+        return x, cls_tokens
+
+
+class ConvHeadPooling(nn.Module):
+    """ Stage-transition pooling: strided grouped conv on the spatial map, linear projection on the cls token(s). """
+    def __init__(self, in_feature, out_feature, stride, padding_mode='zeros'):
+        super(ConvHeadPooling, self).__init__()
+        self.conv = nn.Conv2d(
+            in_feature, out_feature, kernel_size=stride + 1, padding=stride // 2, stride=stride,
+            padding_mode=padding_mode, groups=in_feature)  # groups=in_feature: one group per input channel
+        self.fc = nn.Linear(in_feature, out_feature)  # same in/out widths as the conv, applied to cls_token
+
+    def forward(self, x, cls_token) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = self.conv(x)
+        cls_token = self.fc(cls_token)
+        return x, cls_token
+
+
+class ConvEmbedding(nn.Module):
+    """ Patch embedding implemented as a single biased Conv2d (kernel=patch_size, given stride and padding). """
+    def __init__(self, in_channels, out_channels, patch_size, stride, padding):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, kernel_size=patch_size, stride=stride, padding=padding, bias=True)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class PoolingVisionTransformer(nn.Module):
+    """ Pooling-based Vision Transformer
+
+    A PyTorch implementation of 'Rethinking Spatial Dimensions of Vision Transformers'
+        - https://arxiv.org/abs/2103.16302
+    """
+    def __init__(
+            self, img_size, patch_size, stride, base_dims, depth, heads,
+            mlp_ratio, num_classes=1000, in_chans=3, global_pool='token',
+            distilled=False, attn_drop_rate=.0, drop_rate=.0, drop_path_rate=.0):
+        super(PoolingVisionTransformer, self).__init__()
+        assert global_pool in ('token',)  # 'token' is the only accepted pooling mode
+
+        padding = 0
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        # conv output-size formula: spatial grid produced by the patch embedding
+        height = math.floor((img_size[0] + 2 * padding - patch_size[0]) / stride + 1)
+        width = math.floor((img_size[1] + 2 * padding - patch_size[1]) / stride + 1)
+        self.base_dims = base_dims
+        self.heads = heads
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_tokens = 2 if distilled else 1  # one class token, plus a second token when distilled
+
+        self.patch_size = patch_size
+        self.pos_embed = nn.Parameter(torch.randn(1, base_dims[0] * heads[0], height, width))  # sized for img_size
+        self.patch_embed = ConvEmbedding(in_chans, base_dims[0] * heads[0], patch_size, stride, padding)
+
+        self.cls_token = nn.Parameter(torch.randn(1, self.num_tokens, base_dims[0] * heads[0]))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        transformers = []
+        # stochastic depth decay rule: rates rise linearly from 0 to drop_path_rate, split into per-stage lists
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depth)).split(depth)]
+        for stage in range(len(depth)):
+            pool = None
+            if stage < len(heads) - 1:  # every stage except the last pools into the next stage's width
+                pool = ConvHeadPooling(
+                    base_dims[stage] * heads[stage], base_dims[stage + 1] * heads[stage + 1], stride=2)
+            transformers += [Transformer(
+                base_dims[stage], depth[stage], heads[stage], mlp_ratio, pool=pool,
+                drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_prob=dpr[stage])
+            ]
+        self.transformers = SequentialTuple(*transformers)
+        self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], eps=1e-6)
+        self.num_features = self.embed_dim = base_dims[-1] * heads[-1]  # width of the last stage
+
+        # Classifier head
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.head_dist = None
+        if distilled:
+            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+        self.distilled_training = False  # must set this True to train w/ distillation token
+
+        trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.LayerNorm):  # only LayerNorm is re-initialised here; other modules keep their init
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+
+    @torch.jit.ignore
+    def set_distilled_training(self, enable=True):
+        self.distilled_training = enable
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    def get_classifier(self):
+        if self.head_dist is not None:
+            return self.head, self.head_dist  # distilled models expose both heads as a tuple
+        else:
+            return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        # NOTE: `global_pool` is accepted but not used by this method
+        self.num_classes = num_classes
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        if self.head_dist is not None:
+            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        x = self.pos_drop(x + self.pos_embed)
+        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)  # broadcast the learned token(s) over the batch
+        x, cls_tokens = self.transformers((x, cls_tokens))
+        cls_tokens = self.norm(cls_tokens)
+        return cls_tokens  # only the normalised cls tokens are returned; the spatial map is dropped
+
+    def forward_head(self, x, pre_logits: bool = False) -> torch.Tensor:
+        if self.head_dist is not None:
+            assert self.global_pool == 'token'
+            x, x_dist = x[:, 0], x[:, 1]  # token 0 feeds `head`, token 1 feeds `head_dist`
+            if not pre_logits:
+                x = self.head(x)
+                x_dist = self.head_dist(x_dist)
+            if self.distilled_training and self.training and not torch.jit.is_scripting():
+                # only return separate classification predictions when training in distilled mode
+                return x, x_dist
+            else:
+                # during standard train / finetune, inference average the classifier predictions
+                return (x + x_dist) / 2
+        else:
+            if self.global_pool == 'token':
+                x = x[:, 0]
+            if not pre_logits:
+                x = self.head(x)
+            return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ Rename checkpoint keys: each `pools.<d>.` segment becomes `transformers.<d>.pool.`; values are untouched. """
+    pool_key = re.compile(r'pools\.(\d)\.')
+
+    def _rename(key):
+        return pool_key.sub(lambda exp: f'transformers.{int(exp.group(1))}.pool.', key)
+
+    # FIXME need to update resize for PiT impl
+    # if k == 'pos_embed' and v.shape != model.pos_embed.shape:
+    #     # To resize pos embedding when using model at different size from pretrained weights
+    #     v = resize_pos_embed(v, model.pos_embed)
+    return {_rename(k): v for k, v in state_dict.items()}
+
+
+def _create_pit(variant, pretrained=False, **kwargs):
+    """ Build a PoolingVisionTransformer for `variant` via build_model_with_cfg; `features_only` is rejected. """
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    return build_model_with_cfg(
+        PoolingVisionTransformer, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        **kwargs)
+
+
+@register_model
+def pit_b_224(pretrained=False, **kwargs):
+    """ PiT-B: patch 14 / stride 7, base dim 64, stage depths (3, 6, 4), heads (4, 8, 16). Extra kwargs reach the model. """
+    model_kwargs = dict(
+        patch_size=14, stride=7,
+        base_dims=[64, 64, 64],
+        depth=[3, 6, 4],
+        heads=[4, 8, 16],
+        mlp_ratio=4,
+        **kwargs
+    )
+    return _create_pit('pit_b_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_s_224(pretrained=False, **kwargs):
+    """ PiT-S: patch 16 / stride 8, base dim 48, stage depths (2, 6, 4), heads (3, 6, 12). Extra kwargs reach the model. """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[48, 48, 48],
+        depth=[2, 6, 4],
+        heads=[3, 6, 12],
+        mlp_ratio=4,
+        **kwargs
+    )
+    return _create_pit('pit_s_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_xs_224(pretrained=False, **kwargs):
+    """ PiT-XS: patch 16 / stride 8, base dim 48, stage depths (2, 6, 4), heads (2, 4, 8). Extra kwargs reach the model. """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[48, 48, 48],
+        depth=[2, 6, 4],
+        heads=[2, 4, 8],
+        mlp_ratio=4,
+        **kwargs
+    )
+    return _create_pit('pit_xs_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_ti_224(pretrained=False, **kwargs):
+    """ PiT-Ti: patch 16 / stride 8, base dim 32, stage depths (2, 6, 4), heads (2, 4, 8). Extra kwargs reach the model. """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[32, 32, 32],
+        depth=[2, 6, 4],
+        heads=[2, 4, 8],
+        mlp_ratio=4,
+        **kwargs
+    )
+    return _create_pit('pit_ti_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_b_distilled_224(pretrained=False, **kwargs):
+    """ PiT-B built with `distilled=True`: patch 14 / stride 7, base dim 64, depths (3, 6, 4), heads (4, 8, 16). """
+    model_kwargs = dict(
+        patch_size=14, stride=7,
+        base_dims=[64, 64, 64],
+        depth=[3, 6, 4],
+        heads=[4, 8, 16],
+        mlp_ratio=4,
+        distilled=True,
+        **kwargs
+    )
+    return _create_pit('pit_b_distilled_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_s_distilled_224(pretrained=False, **kwargs):
+    """ PiT-S built with `distilled=True`: patch 16 / stride 8, base dim 48, depths (2, 6, 4), heads (3, 6, 12). """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[48, 48, 48],
+        depth=[2, 6, 4],
+        heads=[3, 6, 12],
+        mlp_ratio=4,
+        distilled=True,
+        **kwargs
+    )
+    return _create_pit('pit_s_distilled_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_xs_distilled_224(pretrained=False, **kwargs):
+    """ PiT-XS built with `distilled=True`: patch 16 / stride 8, base dim 48, depths (2, 6, 4), heads (2, 4, 8). """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[48, 48, 48],
+        depth=[2, 6, 4],
+        heads=[2, 4, 8],
+        mlp_ratio=4,
+        distilled=True,
+        **kwargs
+    )
+    return _create_pit('pit_xs_distilled_224', pretrained, **model_kwargs)
+
+
+@register_model
+def pit_ti_distilled_224(pretrained=False, **kwargs):
+    """ PiT-Ti built with `distilled=True`: patch 16 / stride 8, base dim 32, depths (2, 6, 4), heads (2, 4, 8). """
+    model_kwargs = dict(
+        patch_size=16, stride=8,
+        base_dims=[32, 32, 32],
+        depth=[2, 6, 4],
+        heads=[2, 4, 8],
+        mlp_ratio=4,
+        distilled=True,
+        **kwargs
+    )
+    return _create_pit('pit_ti_distilled_224', pretrained, **model_kwargs)
\ No newline at end of file
diff --git a/src/custom_timm/models/pnasnet.py b/src/custom_timm/models/pnasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..81067845befcfaf5436d112af73359ae4128c2d5
--- /dev/null
+++ b/src/custom_timm/models/pnasnet.py
@@ -0,0 +1,361 @@
+"""
+ pnasnet5large implementation grabbed from Cadene's pretrained models
+ Additional credit to https://github.com/creafz
+
+ https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/pnasnet.py
+
+"""
+from collections import OrderedDict
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .helpers import build_model_with_cfg
+from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier
+from .registry import register_model
+
+__all__ = ['PNASNet5Large']
+
+default_cfgs = {  # pretrained-config entries keyed by model name
+    'pnasnet5large': {
+        'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/pnasnet5large-bf079911.pth',
+        'input_size': (3, 331, 331),
+        'pool_size': (11, 11),
+        'crop_pct': 0.911,
+        'interpolation': 'bicubic',
+        'mean': (0.5, 0.5, 0.5),
+        'std': (0.5, 0.5, 0.5),
+        'num_classes': 1000,
+        'first_conv': 'conv_0.conv',  # matches PNASNet5Large.conv_0
+        'classifier': 'last_linear',  # matches PNASNet5Large.last_linear
+        'label_offset': 1,  # 1001 classes in pretrained weights
+    },
+}
+
+
+class SeparableConv2d(nn.Module):
+    """ Depthwise conv (groups=in_channels) followed by a 1x1 pointwise conv; both are built by create_conv2d. """
+    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''):
+        super(SeparableConv2d, self).__init__()
+        self.depthwise_conv2d = create_conv2d(
+            in_channels, in_channels, kernel_size=kernel_size,
+            stride=stride, padding=padding, groups=in_channels)  # the stride is applied here only
+        self.pointwise_conv2d = create_conv2d(
+            in_channels, out_channels, kernel_size=1, padding=padding)  # changes the channel count
+
+    def forward(self, x):
+        x = self.depthwise_conv2d(x)
+        x = self.pointwise_conv2d(x)
+        return x
+
+
+class BranchSeparables(nn.Module):
+    """ Two rounds of ReLU -> SeparableConv2d -> BatchNorm2d; only the first separable conv uses `stride`. """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, stem_cell=False, padding=''):
+        super(BranchSeparables, self).__init__()
+        middle_channels = out_channels if stem_cell else in_channels  # stem cells change width in the first conv
+        self.act_1 = nn.ReLU()
+        self.separable_1 = SeparableConv2d(
+            in_channels, middle_channels, kernel_size, stride=stride, padding=padding)
+        self.bn_sep_1 = nn.BatchNorm2d(middle_channels, eps=0.001)
+        self.act_2 = nn.ReLU()
+        self.separable_2 = SeparableConv2d(
+            middle_channels, out_channels, kernel_size, stride=1, padding=padding)
+        self.bn_sep_2 = nn.BatchNorm2d(out_channels, eps=0.001)
+
+    def forward(self, x):
+        x = self.act_1(x)
+        x = self.separable_1(x)
+        x = self.bn_sep_1(x)
+        x = self.act_2(x)
+        x = self.separable_2(x)
+        x = self.bn_sep_2(x)
+        return x
+
+
+class ActConvBn(nn.Module):
+    """ Pre-activation conv block: ReLU -> conv (create_conv2d) -> BatchNorm2d(eps=0.001). """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''):
+        super(ActConvBn, self).__init__()
+        self.act = nn.ReLU()
+        self.conv = create_conv2d(
+            in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
+
+    def forward(self, x):
+        x = self.act(x)  # activation is applied before the conv, not after
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class FactorizedReduction(nn.Module):
+    """ ReLU, then two stride-2 paths of `out_channels // 2` channels each, concatenated on dim 1 and batch-normed. """
+    def __init__(self, in_channels, out_channels, padding=''):
+        super(FactorizedReduction, self).__init__()
+        self.act = nn.ReLU()
+        self.path_1 = nn.Sequential(OrderedDict([
+            ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)),  # kernel 1, stride 2: pure subsampling
+            ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)),
+        ]))
+        self.path_2 = nn.Sequential(OrderedDict([
+            ('pad', nn.ZeroPad2d((-1, 1, -1, 1))),  # shift
+            ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)),
+            ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)),
+        ]))
+        self.final_path_bn = nn.BatchNorm2d(out_channels, eps=0.001)  # NOTE(review): assumes even out_channels
+
+    def forward(self, x):
+        x = self.act(x)
+        x_path1 = self.path_1(x)
+        x_path2 = self.path_2(x)  # same input as path 1, offset by the 'pad' shift before subsampling
+        out = self.final_path_bn(torch.cat([x_path1, x_path2], 1))
+        return out
+
+
+class CellBase(nn.Module):
+    """ Shared branch wiring for cells; subclasses define comb_iter_{0..4}_{left,right} (4_right may be None). """
+    def cell_forward(self, x_left, x_right):
+        x_comb_iter_0_left = self.comb_iter_0_left(x_left)
+        x_comb_iter_0_right = self.comb_iter_0_right(x_left)  # branch 0 reads x_left on both sides
+        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right
+
+        x_comb_iter_1_left = self.comb_iter_1_left(x_right)
+        x_comb_iter_1_right = self.comb_iter_1_right(x_right)
+        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right
+
+        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
+        x_comb_iter_2_right = self.comb_iter_2_right(x_right)
+        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right
+
+        x_comb_iter_3_left = self.comb_iter_3_left(x_comb_iter_2)  # the only branch fed by another branch's output
+        x_comb_iter_3_right = self.comb_iter_3_right(x_right)
+        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
+
+        x_comb_iter_4_left = self.comb_iter_4_left(x_left)
+        if self.comb_iter_4_right is not None:
+            x_comb_iter_4_right = self.comb_iter_4_right(x_right)
+        else:
+            x_comb_iter_4_right = x_right  # identity skip when no module is configured
+        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
+
+        x_out = torch.cat([x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
+        return x_out
+
+
+class CellStem0(CellBase):
+    """ First stem cell: takes one input, derives x_right from it with a 1x1 ActConvBn; all branches use stride 2. """
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''):
+        super(CellStem0, self).__init__()
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type)
+
+        self.comb_iter_0_left = BranchSeparables(
+            in_chs_left, out_chs_left, kernel_size=5, stride=2, stem_cell=True, padding=pad_type)
+        self.comb_iter_0_right = nn.Sequential(OrderedDict([
+            ('max_pool', create_pool2d('max', 3, stride=2, padding=pad_type)),
+            ('conv', create_conv2d(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type)),
+            ('bn', nn.BatchNorm2d(out_chs_left, eps=0.001)),
+        ]))
+
+        self.comb_iter_1_left = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=7, stride=2, padding=pad_type)
+        self.comb_iter_1_right = create_pool2d('max', 3, stride=2, padding=pad_type)
+
+        self.comb_iter_2_left = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=5, stride=2, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=3, stride=2, padding=pad_type)
+
+        self.comb_iter_3_left = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=3, padding=pad_type)  # stride 1: input is comb_iter_2's output
+        self.comb_iter_3_right = create_pool2d('max', 3, stride=2, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(
+            in_chs_right, out_chs_right, kernel_size=3, stride=2, stem_cell=True, padding=pad_type)
+        self.comb_iter_4_right = ActConvBn(
+            out_chs_right, out_chs_right, kernel_size=1, stride=2, padding=pad_type)
+
+    def forward(self, x_left):
+        x_right = self.conv_1x1(x_left)  # the right input is a projection of the single (left) input
+        x_out = self.cell_forward(x_left, x_right)
+        return x_out
+
+
+class Cell(CellBase):
+    """ Cell with two inputs (x_left, x_right), each projected before the shared CellBase.cell_forward wiring. """
+    def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type='',
+                 is_reduction=False, match_prev_layer_dims=False):
+        super(Cell, self).__init__()
+
+        # If `is_reduction` is set to `True` stride 2 is used for
+        # convolution and pooling layers to reduce the spatial size of
+        # the output of a cell approximately by a factor of 2.
+        stride = 2 if is_reduction else 1
+
+        # If `match_prev_layer_dims` is set to `True`
+        # `FactorizedReduction` is used to reduce the spatial size
+        # of the left input of a cell approximately by a factor of 2.
+        self.match_prev_layer_dimensions = match_prev_layer_dims
+        if match_prev_layer_dims:
+            self.conv_prev_1x1 = FactorizedReduction(in_chs_left, out_chs_left, padding=pad_type)
+        else:
+            self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type)
+        self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type)
+
+        self.comb_iter_0_left = BranchSeparables(
+            out_chs_left, out_chs_left, kernel_size=5, stride=stride, padding=pad_type)
+        self.comb_iter_0_right = create_pool2d('max', 3, stride=stride, padding=pad_type)
+
+        self.comb_iter_1_left = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=7, stride=stride, padding=pad_type)
+        self.comb_iter_1_right = create_pool2d('max', 3, stride=stride, padding=pad_type)
+
+        self.comb_iter_2_left = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=5, stride=stride, padding=pad_type)
+        self.comb_iter_2_right = BranchSeparables(
+            out_chs_right, out_chs_right, kernel_size=3, stride=stride, padding=pad_type)
+
+        self.comb_iter_3_left = BranchSeparables(out_chs_right, out_chs_right, kernel_size=3)  # stride 1, default pad
+        self.comb_iter_3_right = create_pool2d('max', 3, stride=stride, padding=pad_type)
+
+        self.comb_iter_4_left = BranchSeparables(
+            out_chs_left, out_chs_left, kernel_size=3, stride=stride, padding=pad_type)
+        if is_reduction:
+            self.comb_iter_4_right = ActConvBn(
+                out_chs_right, out_chs_right, kernel_size=1, stride=stride, padding=pad_type)
+        else:
+            self.comb_iter_4_right = None  # cell_forward then adds x_right unchanged
+
+    def forward(self, x_left, x_right):
+        x_left = self.conv_prev_1x1(x_left)
+        x_right = self.conv_1x1(x_right)
+        x_out = self.cell_forward(x_left, x_right)
+        return x_out
+
+
+class PNASNet5Large(nn.Module):
+    """ PNASNet-5 Large: conv stem, two stem cells, cells cell_0..cell_11, final ReLU, then pooled classifier. """
+    def __init__(self, num_classes=1000, in_chans=3, output_stride=32, drop_rate=0., global_pool='avg', pad_type=''):
+        super(PNASNet5Large, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.num_features = 4320  # channel count reported for the final 'act' feature below
+        assert output_stride == 32  # no other output stride is accepted
+        self.conv_0 = ConvNormAct(
+            in_chans, 96, kernel_size=3, stride=2, padding=0,
+            norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False)
+
+        self.cell_stem_0 = CellStem0(
+            in_chs_left=96, out_chs_left=54, in_chs_right=96, out_chs_right=54, pad_type=pad_type)
+
+        self.cell_stem_1 = Cell(
+            in_chs_left=96, out_chs_left=108, in_chs_right=270, out_chs_right=108, pad_type=pad_type,
+            match_prev_layer_dims=True, is_reduction=True)
+        self.cell_0 = Cell(
+            in_chs_left=270, out_chs_left=216, in_chs_right=540, out_chs_right=216, pad_type=pad_type,
+            match_prev_layer_dims=True)
+        self.cell_1 = Cell(
+            in_chs_left=540, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type)
+        self.cell_2 = Cell(
+            in_chs_left=1080, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type)
+        self.cell_3 = Cell(
+            in_chs_left=1080, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type)
+
+        self.cell_4 = Cell(
+            in_chs_left=1080, out_chs_left=432, in_chs_right=1080, out_chs_right=432, pad_type=pad_type,
+            is_reduction=True)
+        self.cell_5 = Cell(
+            in_chs_left=1080, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type,
+            match_prev_layer_dims=True)
+        self.cell_6 = Cell(
+            in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type)
+        self.cell_7 = Cell(
+            in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type)
+
+        self.cell_8 = Cell(
+            in_chs_left=2160, out_chs_left=864, in_chs_right=2160, out_chs_right=864, pad_type=pad_type,
+            is_reduction=True)
+        self.cell_9 = Cell(
+            in_chs_left=2160, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type,
+            match_prev_layer_dims=True)
+        self.cell_10 = Cell(
+            in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type)
+        self.cell_11 = Cell(
+            in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type)
+        self.act = nn.ReLU()
+        self.feature_info = [  # per-stage channel count, reduction factor and the module path to tap
+            dict(num_chs=96, reduction=2, module='conv_0'),
+            dict(num_chs=270, reduction=4, module='cell_stem_1.conv_1x1.act'),
+            dict(num_chs=1080, reduction=8, module='cell_4.conv_1x1.act'),
+            dict(num_chs=2160, reduction=16, module='cell_8.conv_1x1.act'),
+            dict(num_chs=4320, reduction=32, module='act'),
+        ]
+
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(stem=r'^conv_0|cell_stem_[01]', blocks=r'^cell_(\d+)')
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.last_linear
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        # every cell after the stem receives (output from two steps back, previous output)
+        x_conv_0 = self.conv_0(x)
+        x_stem_0 = self.cell_stem_0(x_conv_0)
+        x_stem_1 = self.cell_stem_1(x_conv_0, x_stem_0)
+        x_cell_0 = self.cell_0(x_stem_0, x_stem_1)
+        x_cell_1 = self.cell_1(x_stem_1, x_cell_0)
+        x_cell_2 = self.cell_2(x_cell_0, x_cell_1)
+        x_cell_3 = self.cell_3(x_cell_1, x_cell_2)
+        x_cell_4 = self.cell_4(x_cell_2, x_cell_3)
+        x_cell_5 = self.cell_5(x_cell_3, x_cell_4)
+        x_cell_6 = self.cell_6(x_cell_4, x_cell_5)
+        x_cell_7 = self.cell_7(x_cell_5, x_cell_6)
+        x_cell_8 = self.cell_8(x_cell_6, x_cell_7)
+        x_cell_9 = self.cell_9(x_cell_7, x_cell_8)
+        x_cell_10 = self.cell_10(x_cell_8, x_cell_9)
+        x_cell_11 = self.cell_11(x_cell_9, x_cell_10)
+        x = self.act(x_cell_11)
+        return x
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate > 0:
+            x = F.dropout(x, self.drop_rate, training=self.training)  # active only in training mode
+        return x if pre_logits else self.last_linear(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_pnasnet(variant, pretrained=False, **kwargs):
+    """ Build PNASNet5Large for `variant` via build_model_with_cfg, with a hook-based, no-rewrite feature_cfg. """
+    # not possible to re-write this model
+    hook_cfg = dict(feature_cls='hook', no_rewrite=True)
+    return build_model_with_cfg(PNASNet5Large, variant, pretrained, feature_cfg=hook_cfg, **kwargs)
+
+
+@register_model
+def pnasnet5large(pretrained=False, **kwargs):
+    r"""PNASNet-5 model architecture from the
+    `"Progressive Neural Architecture Search"
+    <https://arxiv.org/abs/1712.00559>`_ paper.
+    Always built with ``pad_type='same'``; the remaining kwargs are forwarded to `_create_pnasnet`. """
+    model_kwargs = dict(pad_type='same', **kwargs)
+    return _create_pnasnet('pnasnet5large', pretrained, **model_kwargs)
diff --git a/src/custom_timm/models/poolformer.py b/src/custom_timm/models/poolformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee7167af586b63ae7ee03c8bb609061cf9244c08
--- /dev/null
+++ b/src/custom_timm/models/poolformer.py
@@ -0,0 +1,313 @@
+""" PoolFormer implementation
+
+Paper: `PoolFormer: MetaFormer is Actually What You Need for Vision` - https://arxiv.org/abs/2111.11418
+
+Code adapted from official impl at https://github.com/sail-sg/poolformer, original copyright in comment below
+
+Modifications and additions for timm by / Copyright 2022, Ross Wightman
+"""
+# Copyright 2021 Garena Online Private Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import copy
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import DropPath, trunc_normal_, to_2tuple, ConvMlp, GroupNorm1
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained config dict for a PoolFormer variant.
+
+    Entries in ``kwargs`` override or extend the defaults.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=None,
+        crop_pct=.95,
+        interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='patch_embed.proj',
+        classifier='head',
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained weight configs keyed by model entrypoint name.
+default_cfgs = {
+    'poolformer_s12': _cfg(
+        url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s12.pth.tar',
+        crop_pct=0.9,
+    ),
+    'poolformer_s24': _cfg(
+        url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s24.pth.tar',
+        crop_pct=0.9,
+    ),
+    'poolformer_s36': _cfg(
+        url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s36.pth.tar',
+        crop_pct=0.9,
+    ),
+    'poolformer_m36': _cfg(
+        url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m36.pth.tar',
+        crop_pct=0.95,
+    ),
+    'poolformer_m48': _cfg(
+        url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m48.pth.tar',
+        crop_pct=0.95,
+    ),
+}
+
+
+class PatchEmbed(nn.Module):
+    """ Patch embedding implemented with a single convolution layer.
+
+    The input is projected by ``nn.Conv2d`` and then passed through an optional
+    normalization layer (identity when ``norm_layer`` is falsy).
+
+    Input: tensor in shape [B, C, H, W]
+    Output: tensor in shape [B, embed_dim, H', W'], where H' and W' follow from
+    the convolution's kernel size, stride and padding.
+    """
+
+    def __init__(self, in_chs=3, embed_dim=768, patch_size=16, stride=16, padding=0, norm_layer=None):
+        super().__init__()
+        self.proj = nn.Conv2d(
+            in_chs,
+            embed_dim,
+            kernel_size=to_2tuple(patch_size),
+            stride=to_2tuple(stride),
+            padding=to_2tuple(padding),
+        )
+        if norm_layer:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = nn.Identity()
+
+    def forward(self, x):
+        """Project ``x`` with the conv, then normalize."""
+        return self.norm(self.proj(x))
+
+
+class Pooling(nn.Module):
+    """Token mixer that returns (stride-1 average pool of x) minus x.
+
+    The pool uses ``padding=pool_size // 2`` and ``count_include_pad=False``.
+    """
+
+    def __init__(self, pool_size=3):
+        super().__init__()
+        self.pool = nn.AvgPool2d(
+            pool_size,
+            stride=1,
+            padding=pool_size // 2,
+            count_include_pad=False,
+        )
+
+    def forward(self, x):
+        pooled = self.pool(x)
+        # subtract the input itself so only the pooled difference is returned
+        return pooled - x
+
+
+class PoolFormerBlock(nn.Module):
+    """PoolFormer block: a pooling token mixer and a conv MLP, each on a residual branch.
+
+    Args:
+        dim: embedding dim
+        pool_size: pooling size
+        mlp_ratio: mlp expansion ratio
+        act_layer: activation
+        norm_layer: normalization
+        drop: dropout rate
+        drop_path: Stochastic Depth, refer to https://arxiv.org/abs/1603.09382
+        layer_scale_init_value: LayerScale init, refer to https://arxiv.org/abs/2103.17239;
+            a falsy value disables LayerScale.
+    """
+
+    def __init__(
+            self, dim, pool_size=3, mlp_ratio=4.,
+            act_layer=nn.GELU, norm_layer=GroupNorm1,
+            drop=0., drop_path=0., layer_scale_init_value=1e-5):
+        super().__init__()
+
+        hidden_dim = int(dim * mlp_ratio)
+        stochastic_depth = drop_path > 0.
+
+        self.norm1 = norm_layer(dim)
+        self.token_mixer = Pooling(pool_size=pool_size)
+        self.drop_path1 = DropPath(drop_path) if stochastic_depth else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = ConvMlp(dim, hidden_features=hidden_dim, act_layer=act_layer, drop=drop)
+        self.drop_path2 = DropPath(drop_path) if stochastic_depth else nn.Identity()
+
+        if not layer_scale_init_value:
+            self.layer_scale_1 = None
+            self.layer_scale_2 = None
+        else:
+            self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones(dim))
+            self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones(dim))
+
+    def forward(self, x):
+        # Both scales are created together in __init__, so layer_scale_1 gates both branches.
+        token_branch = self.token_mixer(self.norm1(x))
+        if self.layer_scale_1 is not None:
+            token_branch = self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * token_branch
+        x = x + self.drop_path1(token_branch)
+
+        mlp_branch = self.mlp(self.norm2(x))
+        if self.layer_scale_1 is not None:
+            mlp_branch = self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * mlp_branch
+        x = x + self.drop_path2(mlp_branch)
+        return x
+
+
+def basic_blocks(
+        dim, index, layers,
+        pool_size=3, mlp_ratio=4.,
+        act_layer=nn.GELU, norm_layer=GroupNorm1,
+        drop_rate=.0, drop_path_rate=0.,
+        layer_scale_init_value=1e-5,
+):
+    """ Generate the PoolFormer blocks for one stage.
+
+    Args:
+        dim: embedding dim of every block in this stage
+        index: position of this stage within ``layers``
+        layers: number of blocks per stage, for the whole network
+        pool_size, mlp_ratio, act_layer, norm_layer, layer_scale_init_value:
+            forwarded unchanged to each ``PoolFormerBlock``
+        drop_rate: forwarded to each block as ``drop``
+        drop_path_rate: maximum stochastic depth rate; each block receives a rate
+            that grows linearly with its index across the whole network
+
+    Returns:
+        ``nn.Sequential`` holding ``layers[index]`` blocks.
+    """
+    blocks_before = sum(layers[:index])
+    # A network with a single block in total would make ``sum(layers) - 1`` zero;
+    # clamp the denominator so that block simply gets a drop path rate of 0.
+    dpr_denom = max(sum(layers) - 1, 1)
+    blocks = [
+        PoolFormerBlock(
+            dim, pool_size=pool_size, mlp_ratio=mlp_ratio,
+            act_layer=act_layer, norm_layer=norm_layer,
+            drop=drop_rate,
+            drop_path=drop_path_rate * (block_idx + blocks_before) / dpr_denom,
+            layer_scale_init_value=layer_scale_init_value,
+        )
+        for block_idx in range(layers[index])
+    ]
+    return nn.Sequential(*blocks)
+
+
+class PoolFormer(nn.Module):
+    """ PoolFormer
+
+    A stem ``PatchEmbed`` feeds a sequence of stages built by ``basic_blocks``.
+    ``PatchEmbed`` layers are inserted between stages for downsampling, and the
+    result goes through a norm layer and a linear (or identity) head.
+    """
+
+    def __init__(
+            self,
+            layers,
+            embed_dims=(64, 128, 320, 512),
+            mlp_ratios=(4, 4, 4, 4),
+            downsamples=(True, True, True, True),
+            pool_size=3,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            norm_layer=GroupNorm1,
+            act_layer=nn.GELU,
+            in_patch_size=7,
+            in_stride=4,
+            in_pad=2,
+            down_patch_size=3,
+            down_stride=2,
+            down_pad=1,
+            drop_rate=0., drop_path_rate=0.,
+            layer_scale_init_value=1e-5,
+            **kwargs):
+
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = embed_dims[-1]
+        self.grad_checkpointing = False
+
+        self.patch_embed = PatchEmbed(
+            patch_size=in_patch_size, stride=in_stride, padding=in_pad,
+            in_chs=in_chans, embed_dim=embed_dims[0])
+
+        # assemble the stages, with a downsampling embed between consecutive stages
+        num_stages = len(layers)
+        stage_modules = []
+        for stage_idx in range(num_stages):
+            stage_modules.append(basic_blocks(
+                embed_dims[stage_idx], stage_idx, layers,
+                pool_size=pool_size, mlp_ratio=mlp_ratios[stage_idx],
+                act_layer=act_layer, norm_layer=norm_layer,
+                drop_rate=drop_rate, drop_path_rate=drop_path_rate,
+                layer_scale_init_value=layer_scale_init_value,
+            ))
+            if stage_idx >= num_stages - 1:
+                continue  # nothing follows the last stage
+            if downsamples[stage_idx] or embed_dims[stage_idx] != embed_dims[stage_idx + 1]:
+                stage_modules.append(PatchEmbed(
+                    in_chs=embed_dims[stage_idx], embed_dim=embed_dims[stage_idx + 1],
+                    patch_size=down_patch_size, stride=down_stride, padding=down_pad,
+                ))
+
+        self.network = nn.Sequential(*stage_modules)
+        self.norm = norm_layer(self.num_features)
+        if num_classes > 0:
+            self.head = nn.Linear(self.num_features, num_classes)
+        else:
+            self.head = nn.Identity()
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Init for classification: truncated-normal Linear weights, zero Linear biases."""
+        if not isinstance(m, nn.Linear):
+            return
+        trunc_normal_(m.weight, std=.02)
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns that group parameters into the stem and blocks."""
+        block_pattern = r'^network\.(\d+)' if coarse else r'^network\.(\d+)\.(\d+)'
+        return dict(
+            stem=r'^patch_embed',  # stem and embed
+            blocks=[
+                (r'^network\.(\d+).*\.proj', (99999,)),
+                (block_pattern, None),
+                (r'^norm', (99999,)),
+            ],
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Record the gradient checkpointing flag on the model."""
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classification head module."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """Replace the head for ``num_classes`` outputs (identity when ``num_classes`` <= 0)."""
+        self.num_classes = num_classes
+        if global_pool is not None:
+            self.global_pool = global_pool
+        if num_classes > 0:
+            self.head = nn.Linear(self.num_features, num_classes)
+        else:
+            self.head = nn.Identity()
+
+    def forward_features(self, x):
+        """Stem -> stages -> final norm."""
+        return self.norm(self.network(self.patch_embed(x)))
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Average over the last two dims when ``global_pool == 'avg'``, then classify."""
+        if self.global_pool == 'avg':
+            x = x.mean([-2, -1])
+        if pre_logits:
+            return x
+        return self.head(x)
+
+    def forward(self, x):
+        return self.forward_head(self.forward_features(x))
+
+
+def _create_poolformer(variant, pretrained=False, **kwargs):
+    """Build a ``PoolFormer`` via ``build_model_with_cfg``.
+
+    Raises:
+        RuntimeError: if a truthy ``features_only`` is passed in ``kwargs``.
+    """
+    features_only = kwargs.get('features_only')
+    if features_only:
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(PoolFormer, variant, pretrained, **kwargs)
+
+
+@register_model
+def poolformer_s12(pretrained=False, **kwargs):
+    """ PoolFormer-S12 model (Params: 12M), stage depths (2, 2, 6, 2) """
+    stage_depths = (2, 2, 6, 2)
+    return _create_poolformer('poolformer_s12', pretrained=pretrained, layers=stage_depths, **kwargs)
+
+
+@register_model
+def poolformer_s24(pretrained=False, **kwargs):
+    """ PoolFormer-S24 model (Params: 21M), stage depths (4, 4, 12, 4) """
+    stage_depths = (4, 4, 12, 4)
+    return _create_poolformer('poolformer_s24', pretrained=pretrained, layers=stage_depths, **kwargs)
+
+
+@register_model
+def poolformer_s36(pretrained=False, **kwargs):
+    """ PoolFormer-S36 model (Params: 31M), stage depths (6, 6, 18, 6), layer scale init 1e-6 """
+    return _create_poolformer(
+        'poolformer_s36',
+        pretrained=pretrained,
+        layers=(6, 6, 18, 6),
+        layer_scale_init_value=1e-6,
+        **kwargs,
+    )
+
+
+@register_model
+def poolformer_m36(pretrained=False, **kwargs):
+    """ PoolFormer-M36 model (Params: 56M), wider embed dims, layer scale init 1e-6 """
+    return _create_poolformer(
+        'poolformer_m36',
+        pretrained=pretrained,
+        layers=(6, 6, 18, 6),
+        embed_dims=(96, 192, 384, 768),
+        layer_scale_init_value=1e-6,
+        **kwargs,
+    )
+
+
+@register_model
+def poolformer_m48(pretrained=False, **kwargs):
+    """ PoolFormer-M48 model (Params: 73M), wider embed dims, layer scale init 1e-6 """
+    return _create_poolformer(
+        'poolformer_m48',
+        pretrained=pretrained,
+        layers=(8, 8, 24, 8),
+        embed_dims=(96, 192, 384, 768),
+        layer_scale_init_value=1e-6,
+        **kwargs,
+    )
diff --git a/src/custom_timm/models/pvt_v2.py b/src/custom_timm/models/pvt_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e024f43c05c624fada3b682b7efedbf41e51008
--- /dev/null
+++ b/src/custom_timm/models/pvt_v2.py
@@ -0,0 +1,476 @@
+""" Pyramid Vision Transformer v2
+
+@misc{wang2021pvtv2,
+      title={PVTv2: Improved Baselines with Pyramid Vision Transformer},
+      author={Wenhai Wang and Enze Xie and Xiang Li and Deng-Ping Fan and Kaitao Song and Ding Liang and
+        Tong Lu and Ping Luo and Ling Shao},
+      year={2021},
+      eprint={2106.13797},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+Based on Apache 2.0 licensed code at https://github.com/whai362/PVT
+
+Modifications and timm support by / Copyright 2022, Ross Wightman
+"""
+
+import math
+from functools import partial
+from typing import Tuple, List, Callable, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import DropPath, to_2tuple, to_ntuple, trunc_normal_
+from .registry import register_model
+
+__all__ = ['PyramidVisionTransformerV2']
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained config dict for a PVTv2 variant.
+
+    Entries in ``kwargs`` override or extend the defaults.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=(7, 7),
+        crop_pct=0.9,
+        interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        first_conv='patch_embed.proj',
+        classifier='head',
+        fixed_input_size=False,
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained weight configs keyed by model entrypoint name.
+default_cfgs = dict(
+    pvt_v2_b0=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b0.pth'),
+    pvt_v2_b1=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b1.pth'),
+    pvt_v2_b2=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'),
+    pvt_v2_b3=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b3.pth'),
+    pvt_v2_b4=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b4.pth'),
+    pvt_v2_b5=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b5.pth'),
+    pvt_v2_b2_li=_cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2_li.pth'),
+)
+
+
+class MlpWithDepthwiseConv(nn.Module):
+    """Token MLP with a 3x3 depthwise convolution between its two linear layers.
+
+    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
+    falsy. With ``extra_relu=True`` a ReLU is applied before the depthwise conv.
+    """
+
+    def __init__(
+            self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
+            drop=0., extra_relu=False):
+        super().__init__()
+        hidden_dim = hidden_features if hidden_features else in_features
+        out_dim = out_features if out_features else in_features
+
+        self.fc1 = nn.Linear(in_features, hidden_dim)
+        self.relu = nn.ReLU() if extra_relu else nn.Identity()
+        # groups == channels makes the convolution depthwise
+        self.dwconv = nn.Conv2d(
+            hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=True, groups=hidden_dim)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_dim, out_dim)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x, feat_size: List[int]):
+        """Apply the MLP to tokens ``x`` of shape (B, N, C), with N == feat_size[0] * feat_size[1]."""
+        hidden = self.fc1(x)
+        batch, _, channels = hidden.shape
+        # tokens -> feature map, so the depthwise conv can act spatially
+        fmap = hidden.transpose(1, 2).view(batch, channels, feat_size[0], feat_size[1])
+        fmap = self.dwconv(self.relu(fmap))
+        # feature map -> tokens
+        tokens = fmap.flatten(2).transpose(1, 2)
+        tokens = self.drop(self.act(tokens))
+        return self.drop(self.fc2(tokens))
+
+
+class Attention(nn.Module):
+    """Multi-head attention whose keys/values may come from a reduced token set.
+
+    * ``linear_attn=True``: the feature map is adaptively average pooled to 7x7,
+      passed through a 1x1 conv, LayerNorm and GELU before the key/value projection.
+    * ``linear_attn=False`` and ``sr_ratio > 1``: the feature map is reduced by a
+      strided conv (kernel == stride == ``sr_ratio``) followed by LayerNorm.
+    * otherwise keys/values are projected from the input tokens directly.
+    """
+
+    def __init__(
+            self,
+            dim,
+            num_heads=8,
+            sr_ratio=1,
+            linear_attn=False,
+            qkv_bias=True,
+            attn_drop=0.,
+            proj_drop=0.
+    ):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        if linear_attn:
+            self.pool = nn.AdaptiveAvgPool2d(7)
+            self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
+            self.norm = nn.LayerNorm(dim)
+            self.act = nn.GELU()
+        else:
+            self.pool = None
+            if sr_ratio > 1:
+                self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+                self.norm = nn.LayerNorm(dim)
+            else:
+                self.sr = None
+                self.norm = None
+            self.act = None
+
+    def forward(self, x, feat_size: List[int]):
+        """Attend over tokens ``x`` of shape (B, N, C), with N == H * W from ``feat_size``."""
+        B, N, C = x.shape
+        H, W = feat_size
+        q = self.q(x).reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
+
+        # pick the tokens that keys and values are projected from
+        if self.pool is not None:
+            fmap = x.permute(0, 2, 1).reshape(B, C, H, W)
+            kv_tokens = self.sr(self.pool(fmap)).reshape(B, C, -1).permute(0, 2, 1)
+            kv_tokens = self.act(self.norm(kv_tokens))
+        elif self.sr is not None:
+            fmap = x.permute(0, 2, 1).reshape(B, C, H, W)
+            kv_tokens = self.sr(fmap).reshape(B, C, -1).permute(0, 2, 1)
+            kv_tokens = self.norm(kv_tokens)
+        else:
+            kv_tokens = x
+        kv = self.kv(kv_tokens).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        k, v = kv.unbind(0)
+
+        scores = (q @ k.transpose(-2, -1)) * self.scale
+        weights = self.attn_drop(scores.softmax(dim=-1))
+
+        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
+        return self.proj_drop(self.proj(out))
+
+
+class Block(nn.Module):
+    """Transformer block: pre-norm ``Attention`` and pre-norm ``MlpWithDepthwiseConv``,
+    each added back through a shared drop-path on a residual connection.
+
+    ``linear_attn`` also switches on the MLP's extra ReLU.
+    """
+
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., sr_ratio=1, linear_attn=False, qkv_bias=False,
+            drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        hidden_dim = int(dim * mlp_ratio)
+
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, sr_ratio=sr_ratio, linear_attn=linear_attn,
+            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = MlpWithDepthwiseConv(
+            in_features=dim, hidden_features=hidden_dim,
+            act_layer=act_layer, drop=drop, extra_relu=linear_attn)
+
+    def forward(self, x, feat_size: List[int]):
+        attn_out = self.attn(self.norm1(x), feat_size)
+        x = x + self.drop_path(attn_out)
+        mlp_out = self.mlp(self.norm2(x), feat_size)
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class OverlapPatchEmbed(nn.Module):
+    """ Image to Patch Embedding with overlapping patches
+
+    A conv with kernel larger than its stride (padding = kernel // 2) projects the
+    input; the result is flattened to tokens and layer-normalized.
+    """
+    def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        assert max(patch_size) > stride, "Set larger patch_size than stride"
+        self.patch_size = patch_size
+        half_kernel = (patch_size[0] // 2, patch_size[1] // 2)
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=half_kernel)
+        self.norm = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        """Return ``(tokens, feat_size)`` where ``feat_size`` is the conv output's last two dims."""
+        fmap = self.proj(x)
+        feat_size = fmap.shape[-2:]
+        tokens = self.norm(fmap.flatten(2).transpose(1, 2))
+        return tokens, feat_size
+
+
+class PyramidVisionTransformerStage(nn.Module):
+    """One PVTv2 stage: optional overlapping downsample, ``depth`` blocks, final norm.
+
+    ``drop_path`` may be a single rate or a list with one rate per block. Without
+    downsampling, ``dim`` must equal ``dim_out``.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            dim_out: int,
+            depth: int,
+            downsample: bool = True,
+            num_heads: int = 8,
+            sr_ratio: int = 1,
+            linear_attn: bool = False,
+            mlp_ratio: float = 4.0,
+            qkv_bias: bool = True,
+            drop: float = 0.,
+            attn_drop: float = 0.,
+            drop_path: Union[List[float], float] = 0.0,
+            norm_layer: Callable = nn.LayerNorm,
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        if not downsample:
+            assert dim == dim_out
+            self.downsample = None
+        else:
+            self.downsample = OverlapPatchEmbed(
+                patch_size=3, stride=2, in_chans=dim, embed_dim=dim_out)
+
+        stage_blocks = []
+        for i in range(depth):
+            stage_blocks.append(Block(
+                dim=dim_out,
+                num_heads=num_heads,
+                sr_ratio=sr_ratio,
+                linear_attn=linear_attn,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer,
+            ))
+        self.blocks = nn.ModuleList(stage_blocks)
+
+        self.norm = norm_layer(dim_out)
+
+    def forward(self, x, feat_size: List[int]) -> Tuple[torch.Tensor, List[int]]:
+        """Run the stage and return ``(feature map in (B, C, H, W) layout, feat_size)``."""
+        if self.downsample is not None:
+            # the downsample embed returns tokens together with the new feature size
+            x, feat_size = self.downsample(x)
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint.checkpoint(blk, x, feat_size)
+            else:
+                x = blk(x, feat_size)
+        x = self.norm(x)
+        # tokens (B, H*W, C) -> feature map (B, C, H, W)
+        batch = x.shape[0]
+        x = x.reshape(batch, feat_size[0], feat_size[1], -1)
+        x = x.permute(0, 3, 1, 2).contiguous()
+        return x, feat_size
+
+
+class PyramidVisionTransformerV2(nn.Module):
+    """Pyramid Vision Transformer v2 classifier.
+
+    An ``OverlapPatchEmbed`` stem (7x7 patches, stride 4) feeds a list of
+    ``PyramidVisionTransformerStage`` modules; every stage after the first
+    downsamples. The final feature map is optionally mean-pooled and passed to a
+    linear (or identity) head.
+
+    Args:
+        img_size: accepted but not referenced in this constructor.
+        in_chans: number of input channels for the stem.
+        num_classes: head output size; ``<= 0`` gives an identity head.
+        global_pool: ``'avg'`` for mean pooling in ``forward_head``, ``''`` for none.
+        depths: number of blocks per stage; its length sets the number of stages.
+        embed_dims: embedding dim per stage (must have one entry per stage).
+        num_heads, sr_ratios, mlp_ratios: per-stage values, expanded with ``to_ntuple``.
+        qkv_bias: forwarded to every stage.
+        linear: forwarded to every stage as ``linear_attn``.
+        drop_rate, attn_drop_rate: forwarded to every stage as ``drop`` / ``attn_drop``.
+        drop_path_rate: upper end of the linearly spaced per-block drop path rates.
+        norm_layer: normalization layer constructor forwarded to every stage.
+    """
+
+    def __init__(
+            self,
+            img_size=None,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            depths=(3, 4, 6, 3),
+            embed_dims=(64, 128, 256, 512),
+            num_heads=(1, 2, 4, 8),
+            sr_ratios=(8, 4, 2, 1),
+            mlp_ratios=(8., 8., 4., 4.),
+            qkv_bias=True,
+            linear=False,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.,
+            norm_layer=nn.LayerNorm,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        assert global_pool in ('avg', '')
+        self.global_pool = global_pool
+        self.depths = depths
+        num_stages = len(depths)
+        mlp_ratios = to_ntuple(num_stages)(mlp_ratios)
+        num_heads = to_ntuple(num_stages)(num_heads)
+        sr_ratios = to_ntuple(num_stages)(sr_ratios)
+        assert(len(embed_dims)) == num_stages
+
+        self.patch_embed = OverlapPatchEmbed(
+            patch_size=7,
+            stride=4,
+            in_chans=in_chans,
+            embed_dim=embed_dims[0])
+
+        # one list of drop path rates per stage, increasing linearly over all blocks
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        prev_dim = embed_dims[0]
+        self.stages = nn.ModuleList()
+        for i in range(num_stages):
+            self.stages.append(PyramidVisionTransformerStage(
+                dim=prev_dim,
+                dim_out=embed_dims[i],
+                depth=depths[i],
+                downsample=i > 0,  # the stem already embedded the input for stage 0
+                num_heads=num_heads[i],
+                sr_ratio=sr_ratios[i],
+                mlp_ratio=mlp_ratios[i],
+                linear_attn=linear,
+                qkv_bias=qkv_bias,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer
+            ))
+            prev_dim = embed_dims[i]
+
+        # classification head
+        self.num_features = embed_dims[-1]
+        self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Truncated-normal init for Linear layers, fan-out normal init for Conv2d layers."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+
+    def freeze_patch_emb(self):
+        """Stop gradient updates for every parameter of the stem patch embedding."""
+        # Assigning a plain ``requires_grad`` attribute on a Module does not touch its
+        # parameters; ``requires_grad_`` is the call that actually freezes them.
+        self.patch_embed.requires_grad_(False)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the (empty) collection of parameter names excluded from weight decay."""
+        return {}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns grouping parameters into stem and stages (``coarse`` is unused)."""
+        matcher = dict(
+            stem=r'^patch_embed',  # stem and embed
+            blocks=r'^stages\.(\d+)'
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Set the gradient checkpointing flag on every stage."""
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    def get_classifier(self):
+        """Return the classification head module."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        """Replace the head for ``num_classes`` outputs (identity when ``num_classes`` <= 0).
+
+        ``global_pool``, when given, must be ``'avg'`` or ``''`` and replaces the
+        current pooling mode.
+        """
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('avg', '')
+            self.global_pool = global_pool
+        # The head input width is stored as ``num_features``; this class never sets ``embed_dim``.
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Embed the input and run it through all stages; returns the last stage's feature map."""
+        x, feat_size = self.patch_embed(x)
+        for stage in self.stages:
+            x, feat_size = stage(x, feat_size=feat_size)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Mean over the last two dims when pooling is enabled, then apply the head unless ``pre_logits``."""
+        if self.global_pool:
+            x = x.mean(dim=(-1, -2))
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _checkpoint_filter_fn(state_dict, model):
+    """ Remap original checkpoints -> timm
+
+    A state dict that already contains 'patch_embed.proj.weight' is returned
+    untouched. Otherwise a new dict is built with renamed keys; ``model`` is unused.
+    """
+    if 'patch_embed.proj.weight' in state_dict:
+        return state_dict  # non-original checkpoint, no remapping needed
+
+    import re
+
+    # applied in order, only to keys that start with 'patch_embed'
+    patch_embed_renames = (
+        ('patch_embed1', 'patch_embed'),
+        ('patch_embed2', 'stages.1.downsample'),
+        ('patch_embed3', 'stages.2.downsample'),
+        ('patch_embed4', 'stages.3.downsample'),
+    )
+
+    def _block_repl(match):
+        # original block/norm indices are 1-based, stage indices are 0-based
+        return f'stages.{int(match.group(1)) - 1}.blocks.{match.group(2)}'
+
+    def _norm_repl(match):
+        return f'stages.{int(match.group(1)) - 1}.norm'
+
+    remapped = {}
+    for old_key, value in state_dict.items():
+        new_key = old_key
+        if new_key.startswith('patch_embed'):
+            for src, dst in patch_embed_renames:
+                new_key = new_key.replace(src, dst)
+        new_key = new_key.replace('dwconv.dwconv', 'dwconv')
+        new_key = re.sub(r'block(\d+).(\d+)', _block_repl, new_key)
+        new_key = re.sub(r'^norm(\d+)', _norm_repl, new_key)
+        remapped[new_key] = value
+    return remapped
+
+
+def _create_pvt2(variant, pretrained=False, **kwargs):
+    """Build a ``PyramidVisionTransformerV2`` via ``build_model_with_cfg``.
+
+    Pretrained checkpoints are passed through ``_checkpoint_filter_fn``.
+
+    Raises:
+        RuntimeError: if a truthy ``features_only`` is passed in ``kwargs``.
+    """
+    features_only = kwargs.get('features_only')
+    if features_only:
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(
+        PyramidVisionTransformerV2,
+        variant,
+        pretrained,
+        pretrained_filter_fn=_checkpoint_filter_fn,
+        **kwargs,
+    )
+
+
+@register_model
+def pvt_v2_b0(pretrained=False, **kwargs):
+    """PVTv2-B0: depths (2, 2, 2, 2), embed dims (32, 64, 160, 256)."""
+    variant_kwargs = dict(
+        depths=(2, 2, 2, 2),
+        embed_dims=(32, 64, 160, 256),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b0', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b1(pretrained=False, **kwargs):
+    """PVTv2-B1: depths (2, 2, 2, 2), embed dims (64, 128, 320, 512)."""
+    variant_kwargs = dict(
+        depths=(2, 2, 2, 2),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b1', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b2(pretrained=False, **kwargs):
+    """PVTv2-B2: depths (3, 4, 6, 3), embed dims (64, 128, 320, 512)."""
+    variant_kwargs = dict(
+        depths=(3, 4, 6, 3),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b2', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b3(pretrained=False, **kwargs):
+    """PVTv2-B3: depths (3, 4, 18, 3), embed dims (64, 128, 320, 512)."""
+    variant_kwargs = dict(
+        depths=(3, 4, 18, 3),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b3', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b4(pretrained=False, **kwargs):
+    """PVTv2-B4: depths (3, 8, 27, 3), embed dims (64, 128, 320, 512)."""
+    variant_kwargs = dict(
+        depths=(3, 8, 27, 3),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b4', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b5(pretrained=False, **kwargs):
+    """PVTv2-B5: depths (3, 6, 40, 3), embed dims (64, 128, 320, 512), MLP ratio 4 in every stage."""
+    variant_kwargs = dict(
+        depths=(3, 6, 40, 3),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        mlp_ratios=(4, 4, 4, 4),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b5', pretrained=pretrained, **variant_kwargs)
+
+
+@register_model
+def pvt_v2_b2_li(pretrained=False, **kwargs):
+    """PVTv2-B2 with ``linear=True``: depths (3, 4, 6, 3), embed dims (64, 128, 320, 512)."""
+    variant_kwargs = dict(
+        depths=(3, 4, 6, 3),
+        embed_dims=(64, 128, 320, 512),
+        num_heads=(1, 2, 5, 8),
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        linear=True,
+        **kwargs,
+    )
+    return _create_pvt2('pvt_v2_b2_li', pretrained=pretrained, **variant_kwargs)
+
diff --git a/src/custom_timm/models/registry.py b/src/custom_timm/models/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f58060fd0fdf1a2b3256327d479efd0bba77fc0
--- /dev/null
+++ b/src/custom_timm/models/registry.py
@@ -0,0 +1,159 @@
+""" Model Registry
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+
+import sys
+import re
+import fnmatch
+from collections import defaultdict
+from copy import deepcopy
+
+# Names exported by ``from .registry import *``.
+# NOTE(review): ``register_model`` and ``get_pretrained_cfg`` are defined in this module but
+# are not listed here, so they have to be imported by name -- confirm this is intentional.
+__all__ = ['list_models', 'is_model', 'model_entrypoint', 'list_modules', 'is_model_in_modules',
+           'is_pretrained_cfg_key', 'has_pretrained_cfg_key', 'get_pretrained_cfg_value', 'is_model_pretrained']
+
+# Module-level registries; all of them are filled in by ``register_model``.
+_module_to_models = defaultdict(set)  # dict of sets to check membership of model in module
+_model_to_module = {}  # mapping of model names to module names
+_model_entrypoints = {}  # mapping of model names to entrypoint fns
+_model_has_pretrained = set()  # set of model names that have pretrained weight url present
+_model_pretrained_cfgs = dict()  # central repo for model default_cfgs
+
+
+def register_model(fn):
+    """Record a model entrypoint in the module-level registries and return it unchanged.
+
+    Meant to be applied as a decorator. The entrypoint is keyed by ``fn.__name__``; its
+    module is keyed by the last dotted component of ``fn.__module__``. The name is also
+    appended to the defining module's ``__all__`` (created if missing).
+
+    If the defining module has a ``default_cfgs`` mapping containing the model name, that
+    cfg is stored, and the model is flagged as pretrained when the cfg has a ``url``
+    containing ``'http'``, or a truthy ``file`` or ``hf_hub_id``.
+    """
+    model_name = fn.__name__
+    module_path = fn.__module__
+    mod = sys.modules[module_path]
+    # str.split always yields at least one piece, so the last piece is always available
+    module_name = module_path.split('.')[-1]
+
+    # expose the entrypoint through the defining module's __all__
+    if not hasattr(mod, '__all__'):
+        mod.__all__ = [model_name]
+    else:
+        mod.__all__.append(model_name)
+
+    # fill the lookup tables
+    _model_entrypoints[model_name] = fn
+    _model_to_module[model_name] = module_name
+    _module_to_models[module_name].add(model_name)
+
+    # only entrypoints whose name matches a default_cfgs key are picked up here;
+    # aliasing entrypoints or non-matching combos are missed
+    has_valid_pretrained = False
+    if hasattr(mod, 'default_cfgs') and model_name in mod.default_cfgs:
+        cfg = mod.default_cfgs[model_name]
+        has_valid_pretrained = any((
+            'url' in cfg and 'http' in cfg['url'],
+            'file' in cfg and cfg['file'],
+            'hf_hub_id' in cfg and cfg['hf_hub_id'],
+        ))
+        _model_pretrained_cfgs[model_name] = cfg
+    if has_valid_pretrained:
+        _model_has_pretrained.add(model_name)
+    return fn
+
+
+def _natural_key(string_):
+    """Return a sort key that orders embedded digit runs numerically.
+
+    The lower-cased string is split on runs of digits; digit pieces become ints and
+    everything else stays a string (so e.g. ``'resnet50'`` -> ``['resnet', 50, '']``).
+    """
+    key = []
+    for piece in re.split(r'(\d+)', string_.lower()):
+        if piece.isdigit():
+            key.append(int(piece))
+        else:
+            key.append(piece)
+    return key
+
+
+def list_models(filter='', module='', pretrained=False, exclude_filters='', name_matches_cfg=False):
+    """ Return list of available model names, sorted in natural (digit-aware) order
+
+    Args:
+        filter (str or list[str]) - Wildcard filter string(s) that work with fnmatch; models matching
+            any of them are included (the name shadows the builtin but is part of the public signature)
+        module (str) - Limit model selection to a specific sub-module (ie 'gen_efficientnet')
+        pretrained (bool) - Include only models with pretrained weights if True
+        exclude_filters (str or list[str]) - Wildcard filters to exclude models after including them with filter
+        name_matches_cfg (bool) - Include only models w/ model_name matching default_cfg name (excludes some aliases)
+
+    Returns:
+        list[str] of model names; empty if nothing matches or `module` is unknown.
+
+    Example:
+        list_models('gluon_resnet*') -- returns all models starting with 'gluon_resnet'
+        list_models('*resnext*', 'resnet') -- returns all models with 'resnext' in 'resnet' module
+    """
+    if module:
+        # Use .get() instead of indexing: _module_to_models is a defaultdict, so indexing an
+        # unknown module name would insert an empty entry that list_modules() then reports.
+        all_models = list(_module_to_models.get(module, ()))
+    else:
+        all_models = _model_entrypoints.keys()
+    if filter:
+        models = []
+        include_filters = filter if isinstance(filter, (tuple, list)) else [filter]
+        for f in include_filters:
+            include_models = fnmatch.filter(all_models, f)  # include these models
+            if len(include_models):
+                models = set(models).union(include_models)
+    else:
+        models = all_models
+    if exclude_filters:
+        if not isinstance(exclude_filters, (tuple, list)):
+            exclude_filters = [exclude_filters]
+        for xf in exclude_filters:
+            exclude_models = fnmatch.filter(models, xf)  # exclude these models
+            if len(exclude_models):
+                models = set(models).difference(exclude_models)
+    if pretrained:
+        models = _model_has_pretrained.intersection(models)
+    if name_matches_cfg:
+        models = set(_model_pretrained_cfgs).intersection(models)
+    # sorted() already returns a new list
+    return sorted(models, key=_natural_key)
+
+
+def is_model(model_name):
+    """ Return True if an entrypoint is registered under `model_name`, else False
+    """
+    registered = model_name in _model_entrypoints
+    return registered
+
+
+def model_entrypoint(model_name):
+    """Return the entrypoint function registered under `model_name`
+
+    Raises KeyError if no such model has been registered.
+    """
+    entrypoint = _model_entrypoints[model_name]
+    return entrypoint
+
+
+def list_modules():
+    """ Return the sorted list of module names that have at least one registry entry
+    """
+    # iterating a dict yields its keys; sorted() returns a fresh list
+    return sorted(_module_to_models)
+
+
+def is_model_in_modules(model_name, module_names):
+    """Check if a model exists within a subset of modules
+    Args:
+        model_name (str) - name of model to check
+        module_names (tuple, list, set) - names of modules to search in
+    Returns:
+        True if `model_name` is registered under any of `module_names`, else False.
+        Unknown module names simply do not match.
+    """
+    assert isinstance(module_names, (tuple, list, set))
+    # Use .get() instead of indexing: _module_to_models is a defaultdict, so indexing would
+    # insert an empty entry for every unknown name probed and pollute list_modules().
+    return any(model_name in _module_to_models.get(n, ()) for n in module_names)
+
+
+def is_model_pretrained(model_name):
+    """Return True if `model_name` was flagged as having pretrained weights at registration."""
+    flagged = model_name in _model_has_pretrained
+    return flagged
+
+
+def get_pretrained_cfg(model_name):
+    """Return a deep copy of the stored cfg for `model_name`, or an empty dict if none is stored.
+
+    The copy means callers can mutate the result without touching the registry.
+    """
+    try:
+        stored_cfg = _model_pretrained_cfgs[model_name]
+    except KeyError:
+        return {}
+    return deepcopy(stored_cfg)
+
+
+def has_pretrained_cfg_key(model_name, cfg_key):
+    """ Return True if a cfg is stored for `model_name` and it contains `cfg_key` (any value).
+    """
+    if model_name not in _model_pretrained_cfgs:
+        return False
+    return cfg_key in _model_pretrained_cfgs[model_name]
+
+
+def is_pretrained_cfg_key(model_name, cfg_key):
+    """ Return True if the stored cfg for `model_name` has a truthy value under `cfg_key`.
+
+    False when no cfg is stored, the key is absent, or its value is falsy.
+    """
+    if model_name not in _model_pretrained_cfgs:
+        return False
+    return bool(_model_pretrained_cfgs[model_name].get(cfg_key, False))
+
+
+def get_pretrained_cfg_value(model_name, cfg_key):
+    """ Return the stored cfg value for `cfg_key`, or None if the model or the key is unknown.
+
+    The value is returned as stored (not copied).
+    """
+    if model_name not in _model_pretrained_cfgs:
+        return None
+    return _model_pretrained_cfgs[model_name].get(cfg_key)
\ No newline at end of file
diff --git a/src/custom_timm/models/regnet.py b/src/custom_timm/models/regnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ead5d9e9fe6a060e1559c2affed4698e3a4b57f
--- /dev/null
+++ b/src/custom_timm/models/regnet.py
@@ -0,0 +1,711 @@
+"""RegNet
+
+Paper: `Designing Network Design Spaces` - https://arxiv.org/abs/2003.13678
+Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py
+
+Based on original PyTorch impl linked above, but re-wrote to use my own blocks (adapted from ResNet here)
+and cleaned up with more descriptive variable names.
+
+Weights from original impl have been modified
+* first layer from BGR -> RGB as most PyTorch models are
+* removed training specific dict entries from checkpoints and keep model state_dict only
+* remap names to match the ones here
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import math
+from dataclasses import dataclass
+from functools import partial
+from typing import Optional, Union, Callable
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, named_apply, checkpoint_seq
+from .layers import ClassifierHead, AvgPool2dSame, ConvNormAct, SEModule, DropPath, GroupNormAct
+from .layers import get_act_layer, get_norm_act_layer, create_conv2d
+from .registry import register_model
+
+
+@dataclass
+class RegNetCfg:
+    """Hyper-parameters describing one RegNet variant; consumed by ``RegNet``."""
+    depth: int = 21  # total number of blocks (passed to generate_regnet as `depth`)
+    w0: int = 80  # initial width (`width_initial`); generate_regnet asserts it is a multiple of q (default 8)
+    wa: float = 42.63  # width slope (`width_slope`); generate_regnet asserts >= 0
+    wm: float = 2.66  # width multiplier (`width_mult`); generate_regnet asserts > 1
+    group_size: int = 24  # channels per group of each block's 3x3 conv (groups = bottleneck_chs // group_size)
+    bottle_ratio: float = 1.  # bottleneck channels = round(out_chs * bottle_ratio)
+    se_ratio: float = 0.  # SE reduction channels = round(block in_chs * se_ratio); falsy disables SE
+    stem_width: int = 32  # output channels of the stem conv
+    downsample: Optional[str] = 'conv1x1'  # shortcut type: 'conv1x1', 'avg', or ''/None (see create_shortcut)
+    linear_out: bool = False  # Bottleneck: no activation after the residual add
+    preact: bool = False  # use PreBottleneck blocks and a plain conv stem
+    num_features: int = 0  # if non-zero, RegNet adds a final 1x1 ConvNormAct with this many channels
+    act_layer: Union[str, Callable] = 'relu'  # activation, by name or as a callable
+    norm_layer: Union[str, Callable] = 'batchnorm'  # normalization, by name or as a callable
+
+
+# Architecture hyper-parameters for every variant, keyed by variant name
+# (``_create_regnet`` looks an entry up with ``model_cfgs[variant]``).
+# Model FLOPS = three trailing digits * 10^8
+model_cfgs = dict(
+    # RegNet-X
+    regnetx_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13),
+    regnetx_004=RegNetCfg(w0=24, wa=24.48, wm=2.54, group_size=16, depth=22),
+    regnetx_006=RegNetCfg(w0=48, wa=36.97, wm=2.24, group_size=24, depth=16),
+    regnetx_008=RegNetCfg(w0=56, wa=35.73, wm=2.28, group_size=16, depth=16),
+    regnetx_016=RegNetCfg(w0=80, wa=34.01, wm=2.25, group_size=24, depth=18),
+    regnetx_032=RegNetCfg(w0=88, wa=26.31, wm=2.25, group_size=48, depth=25),
+    regnetx_040=RegNetCfg(w0=96, wa=38.65, wm=2.43, group_size=40, depth=23),
+    regnetx_064=RegNetCfg(w0=184, wa=60.83, wm=2.07, group_size=56, depth=17),
+    regnetx_080=RegNetCfg(w0=80, wa=49.56, wm=2.88, group_size=120, depth=23),
+    regnetx_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19),
+    regnetx_160=RegNetCfg(w0=216, wa=55.59, wm=2.1, group_size=128, depth=22),
+    regnetx_320=RegNetCfg(w0=320, wa=69.86, wm=2.0, group_size=168, depth=23),
+
+    # RegNet-Y (every entry in this group sets se_ratio=0.25)
+    regnety_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13, se_ratio=0.25),
+    regnety_004=RegNetCfg(w0=48, wa=27.89, wm=2.09, group_size=8, depth=16, se_ratio=0.25),
+    regnety_006=RegNetCfg(w0=48, wa=32.54, wm=2.32, group_size=16, depth=15, se_ratio=0.25),
+    regnety_008=RegNetCfg(w0=56, wa=38.84, wm=2.4, group_size=16, depth=14, se_ratio=0.25),
+    regnety_016=RegNetCfg(w0=48, wa=20.71, wm=2.65, group_size=24, depth=27, se_ratio=0.25),
+    regnety_032=RegNetCfg(w0=80, wa=42.63, wm=2.66, group_size=24, depth=21, se_ratio=0.25),
+    regnety_040=RegNetCfg(w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25),
+    regnety_064=RegNetCfg(w0=112, wa=33.22, wm=2.27, group_size=72, depth=25, se_ratio=0.25),
+    regnety_080=RegNetCfg(w0=192, wa=76.82, wm=2.19, group_size=56, depth=17, se_ratio=0.25),
+    regnety_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19, se_ratio=0.25),
+    regnety_160=RegNetCfg(w0=200, wa=106.23, wm=2.48, group_size=112, depth=18, se_ratio=0.25),
+    regnety_320=RegNetCfg(w0=232, wa=115.89, wm=2.53, group_size=232, depth=20, se_ratio=0.25),
+
+    # Experimental (same widths/depth as regnety_040, with SiLU and GroupNormAct)
+    regnety_040s_gn=RegNetCfg(
+        w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25,
+        act_layer='silu', norm_layer=partial(GroupNormAct, group_size=16)),
+
+    # regnetv = 'preact regnet y'
+    regnetv_040=RegNetCfg(
+        depth=22, w0=96, wa=31.41, wm=2.24, group_size=64, se_ratio=0.25, preact=True, act_layer='silu'),
+    regnetv_064=RegNetCfg(
+        depth=25, w0=112, wa=33.22, wm=2.27, group_size=72, se_ratio=0.25, preact=True, act_layer='silu',
+        downsample='avg'),
+
+    # RegNet-Z (unverified)
+    regnetz_005=RegNetCfg(
+        depth=21, w0=16, wa=10.7, wm=2.51, group_size=4, bottle_ratio=4.0, se_ratio=0.25,
+        downsample=None, linear_out=True, num_features=1024, act_layer='silu',
+    ),
+    regnetz_040=RegNetCfg(
+        depth=28, w0=48, wa=14.5, wm=2.226, group_size=8, bottle_ratio=4.0, se_ratio=0.25,
+        downsample=None, linear_out=True, num_features=0, act_layer='silu',
+    ),
+    regnetz_040h=RegNetCfg(
+        depth=28, w0=48, wa=14.5, wm=2.226, group_size=8, bottle_ratio=4.0, se_ratio=0.25,
+        downsample=None, linear_out=True, num_features=1536, act_layer='silu',
+    ),
+)
+
+
+def _cfg(url='', **kwargs):
+    """Build a pretrained-cfg dict: the defaults below, overridden/extended by ``kwargs``.
+
+    Args:
+        url: Weight URL stored under ``'url'`` (empty string when there is none).
+        **kwargs: Entries that replace a default of the same key or add new keys.
+    """
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.conv', classifier='head.fc')
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained-weight / preprocessing metadata per variant, built with ``_cfg``.
+# Entries created with ``url=''`` carry no weight URL.
+default_cfgs = dict(
+    regnetx_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_002-e7e85e5c.pth'),
+    regnetx_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_004-7d0e9424.pth'),
+    regnetx_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_006-85ec1baa.pth'),
+    regnetx_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_008-d8b470eb.pth'),
+    regnetx_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_016-65ca972a.pth'),
+    regnetx_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_032-ed0c7f7e.pth'),
+    regnetx_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_040-73c2a654.pth'),
+    regnetx_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_064-29278baa.pth'),
+    regnetx_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_080-7c7fcab1.pth'),
+    regnetx_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'),
+    regnetx_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'),
+    regnetx_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'),
+
+    regnety_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'),
+    regnety_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'),
+    regnety_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'),
+    regnety_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_008-dc900dbe.pth'),
+    regnety_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_016-54367f74.pth'),
+    regnety_032=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/regnety_032_ra-7f2439f9.pth',
+        crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnety_040=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_040_ra3-670e1166.pth',
+        crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnety_064=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_064_ra3-aa26dc7d.pth',
+        crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnety_080=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_080_ra3-1fdc4344.pth',
+        crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnety_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_120-721ba79a.pth'),
+    regnety_160=_cfg(
+        url='https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth',  # from Facebook DeiT GitHub repository
+        crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnety_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'),
+
+    regnety_040s_gn=_cfg(url=''),
+    # the pre-activation variants override first_conv to 'stem' (their stem is a bare conv, see RegNet.__init__)
+    regnetv_040=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetv_040_ra3-c248f51f.pth',
+        first_conv='stem', crop_pct=1.0, test_input_size=(3, 288, 288)),
+    regnetv_064=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetv_064_ra3-530616c2.pth',
+        first_conv='stem', crop_pct=1.0, test_input_size=(3, 288, 288)),
+
+    regnetz_005=_cfg(url=''),
+    regnetz_040=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_040_ra3-9007edf5.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320)),
+    regnetz_040h=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_040h_ra3-f594343b.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320)),
+)
+
+
+def quantize_float(f, q):
+    """Round ``f`` to the nearest multiple of ``q`` and return it as an int.
+
+    Uses Python's ``round`` (ties go to the even multiple count, e.g. ``(10, 4) -> 8``).
+    The result is 0 when ``f / q`` rounds to 0; nothing here forces a non-zero value.
+    """
+    multiples = round(f / q)
+    return int(multiples * q)
+
+
+def adjust_widths_groups_comp(widths, bottle_ratios, groups):
+    """Make per-stage widths and group sizes mutually compatible.
+
+    For each stage: the group size is capped at the bottleneck width, the bottleneck
+    width is rounded to a multiple of that group size, and the stage width is
+    recomputed from the adjusted bottleneck width.
+
+    Returns:
+        Tuple ``(widths, groups)`` of lists, one entry per stage.
+    """
+    adjusted_widths = []
+    adjusted_groups = []
+    for width, ratio, group in zip(widths, bottle_ratios, groups):
+        bottleneck = int(width * ratio)
+        group = min(group, bottleneck)  # a group cannot be wider than the bottleneck
+        bottleneck = quantize_float(bottleneck, group)
+        adjusted_widths.append(int(bottleneck / ratio))
+        adjusted_groups.append(group)
+    return adjusted_widths, adjusted_groups
+
+
+def generate_regnet(width_slope, width_initial, width_mult, depth, group_size, q=8):
+    """Generates per block widths from RegNet parameters.
+
+    Args:
+        width_slope: Increase of the continuous width per block index (must be >= 0).
+        width_initial: Width of the first block (must be > 0 and a multiple of ``q``).
+        width_mult: Ratio between successive quantized widths (must be > 1).
+        depth: Number of blocks.
+        group_size: Group size repeated for every stage.
+        q: Widths are rounded to a multiple of this value.
+
+    Returns:
+        Tuple ``(widths, num_stages, groups)``: one int width per block, the number of
+        distinct widths, and a list with ``group_size`` repeated ``num_stages`` times.
+    """
+    assert width_slope >= 0 and width_initial > 0 and width_mult > 1 and width_initial % q == 0
+    # TODO dWr scaling?
+    # depth = int(depth * (scale ** 0.1))
+    # width_scale = scale ** 0.4  # dWr scale, exp 0.8 / 2, applied to both group and layer widths
+    # continuous (linear) width for every block index
+    widths_cont = np.arange(depth) * width_slope + width_initial
+    # snap each width to width_initial * width_mult ** k for the nearest integer k ...
+    width_exps = np.round(np.log(widths_cont / width_initial) / np.log(width_mult))
+    widths = width_initial * np.power(width_mult, width_exps)
+    # ... and then to the nearest multiple of q
+    widths = np.round(np.divide(widths, q)) * q
+    # (the previously computed but never used `max_stage` local has been dropped)
+    num_stages = len(np.unique(widths))
+    groups = np.array([group_size for _ in range(num_stages)])
+    return widths.astype(int).tolist(), num_stages, groups.astype(int).tolist()
+
+
+def downsample_conv(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None, preact=False):
+    """Build a convolutional shortcut.
+
+    Returns a bare conv when ``preact`` is true, otherwise a ``ConvNormAct`` without
+    activation. ``norm_layer`` falls back to ``nn.BatchNorm2d`` when falsy.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+    # with no striding and no dilation a 1x1 kernel is always used
+    if stride == 1 and dilation == 1:
+        kernel_size = 1
+    # dilation is only kept for kernels larger than 1
+    if not kernel_size > 1:
+        dilation = 1
+    if preact:
+        return create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation)
+    return ConvNormAct(
+        in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, norm_layer=norm_layer, apply_act=False)
+
+
+def downsample_avg(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None, preact=False):
+    """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment.
+
+    Returns ``nn.Sequential(pool, conv)``: ``pool`` is a 2x2 average pool when ``stride`` or
+    ``dilation`` exceeds 1 (identity otherwise) and ``conv`` is a 1x1 projection, a bare conv
+    when ``preact`` is true and a ``ConvNormAct`` without activation otherwise.
+    ``kernel_size`` is accepted but not used by this function.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+    # when dilating, the pool itself does not stride
+    pool_stride = stride if dilation == 1 else 1
+    if stride > 1 or dilation > 1:
+        if pool_stride == 1 and dilation > 1:
+            pool_cls = AvgPool2dSame
+        else:
+            pool_cls = nn.AvgPool2d
+        pool = pool_cls(2, pool_stride, ceil_mode=True, count_include_pad=False)
+    else:
+        pool = nn.Identity()
+    if preact:
+        proj = create_conv2d(in_chs, out_chs, 1, stride=1)
+    else:
+        proj = ConvNormAct(in_chs, out_chs, 1, stride=1, norm_layer=norm_layer, apply_act=False)
+    return nn.Sequential(pool, proj)
+
+
+def create_shortcut(
+        downsample_type, in_chs, out_chs, kernel_size, stride, dilation=(1, 1), norm_layer=None, preact=False):
+    """Create the shortcut branch for a residual block.
+
+    Returns:
+        ``nn.Identity()`` when channels match, ``stride`` is 1 and both dilations are equal;
+        otherwise ``None`` (no shortcut) for a falsy ``downsample_type``, an average-pool
+        shortcut for ``'avg'``, or a conv shortcut for ``'conv1x1'``.
+    """
+    assert downsample_type in ('avg', 'conv1x1', '', None)
+    shape_unchanged = in_chs == out_chs and stride == 1 and dilation[0] == dilation[1]
+    if shape_unchanged:
+        return nn.Identity()  # identity shortcut (no downsample)
+    if not downsample_type:
+        return None  # no shortcut, no downsample
+    dargs = dict(stride=stride, dilation=dilation[0], norm_layer=norm_layer, preact=preact)
+    if downsample_type == 'avg':
+        return downsample_avg(in_chs, out_chs, **dargs)
+    return downsample_conv(in_chs, out_chs, kernel_size=kernel_size, **dargs)
+
+
+class Bottleneck(nn.Module):
+    """ RegNet Bottleneck
+
+    Layout: 1x1 ConvNormAct -> grouped 3x3 ConvNormAct -> optional SE -> 1x1 ConvNormAct (no act),
+    then the (drop-path'd) result is added to the shortcut and the final activation is applied.
+    Compared with a ResNet bottleneck the SE block sits after conv2 rather than after conv3.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=(1, 1), bottle_ratio=1, group_size=1, se_ratio=0.25,
+            downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            drop_block=None, drop_path_rate=0.):
+        super().__init__()
+        act_layer = get_act_layer(act_layer)
+        mid_chs = int(round(out_chs * bottle_ratio))
+        num_groups = mid_chs // group_size
+        conv_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+
+        # NOTE sub-module attribute names and creation order are deliberately left untouched
+        self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **conv_kwargs)
+        self.conv2 = ConvNormAct(
+            mid_chs, mid_chs, kernel_size=3, stride=stride, dilation=dilation[0],
+            groups=num_groups, drop_layer=drop_block, **conv_kwargs)
+        if not se_ratio:
+            self.se = nn.Identity()
+        else:
+            # SE reduction width is derived from the block's input channels
+            self.se = SEModule(mid_chs, rd_channels=int(round(in_chs * se_ratio)), act_layer=act_layer)
+        self.conv3 = ConvNormAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **conv_kwargs)
+        if linear_out:
+            self.act3 = nn.Identity()
+        else:
+            self.act3 = act_layer()
+        self.downsample = create_shortcut(downsample, in_chs, out_chs, 1, stride, dilation, norm_layer=norm_layer)
+        if drop_path_rate > 0:
+            self.drop_path = DropPath(drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+
+    def zero_init_last(self):
+        """Zero the weight of the norm layer inside conv3."""
+        nn.init.zeros_(self.conv3.bn.weight)
+
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out = self.se(out)
+        out = self.conv3(out)
+        # NOTE stuck with downsample as the attr name due to weight compatibility
+        # now represents the shortcut, no shortcut if None, and non-downsample shortcut == nn.Identity()
+        if self.downsample is not None:
+            out = self.drop_path(out) + self.downsample(residual)
+        return self.act3(out)
+
+
+class PreBottleneck(nn.Module):
+    """ RegNet pre-activation Bottleneck
+
+    Each conv is preceded by a norm+act layer: norm1 -> 1x1 conv -> norm2 -> grouped 3x3 conv ->
+    optional SE -> norm3 -> 1x1 conv. The shortcut branches off after norm1 and no activation is
+    applied after the residual add. ``linear_out`` and ``drop_block`` are accepted but not used
+    by this block.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=(1, 1), bottle_ratio=1, group_size=1, se_ratio=0.25,
+            downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            drop_block=None, drop_path_rate=0.):
+        super().__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        mid_chs = int(round(out_chs * bottle_ratio))
+        num_groups = mid_chs // group_size
+
+        # NOTE sub-module attribute names and creation order are deliberately left untouched
+        self.norm1 = norm_act_layer(in_chs)
+        self.conv1 = create_conv2d(in_chs, mid_chs, kernel_size=1)
+        self.norm2 = norm_act_layer(mid_chs)
+        self.conv2 = create_conv2d(
+            mid_chs, mid_chs, kernel_size=3, stride=stride, dilation=dilation[0], groups=num_groups)
+        if not se_ratio:
+            self.se = nn.Identity()
+        else:
+            # SE reduction width is derived from the block's input channels
+            self.se = SEModule(mid_chs, rd_channels=int(round(in_chs * se_ratio)), act_layer=act_layer)
+        self.norm3 = norm_act_layer(mid_chs)
+        self.conv3 = create_conv2d(mid_chs, out_chs, kernel_size=1)
+        self.downsample = create_shortcut(downsample, in_chs, out_chs, 1, stride, dilation, preact=True)
+        if drop_path_rate > 0:
+            self.drop_path = DropPath(drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+
+    def zero_init_last(self):
+        """No-op for this block."""
+        pass
+
+    def forward(self, x):
+        x = self.norm1(x)
+        residual = x  # the shortcut sees the normalized input
+        out = self.conv1(x)
+        out = self.norm2(out)
+        out = self.conv2(out)
+        out = self.se(out)
+        out = self.norm3(out)
+        out = self.conv3(out)
+        # NOTE stuck with downsample as the attr name due to weight compatibility
+        # now represents the shortcut, no shortcut if None, and non-downsample shortcut == nn.Identity()
+        if self.downsample is not None:
+            out = self.drop_path(out) + self.downsample(residual)
+        return out
+
+
+class RegStage(nn.Module):
+    """Stage (sequence of blocks w/ the same output shape).
+
+    Blocks are registered as ``b1`` .. ``b<depth>``. Only the first block receives the stage
+    ``stride`` and ``in_chs``; later blocks use stride 1 and ``out_chs`` as their input width.
+    """
+
+    def __init__(
+            self, depth, in_chs, out_chs, stride, dilation,
+            drop_path_rates=None, block_fn=Bottleneck, **block_kwargs):
+        super().__init__()
+        self.grad_checkpointing = False
+
+        # dilation seen by the first block's conv; later blocks use the stage dilation
+        prev_dilation = 1 if dilation in (1, 2) else 2
+        for idx in range(depth):
+            is_first = idx == 0
+            block = block_fn(
+                in_chs if is_first else out_chs,
+                out_chs,
+                stride=stride if is_first else 1,
+                dilation=(prev_dilation, dilation),
+                drop_path_rate=0. if drop_path_rates is None else drop_path_rates[idx],
+                **block_kwargs)
+            self.add_module("b{}".format(idx + 1), block)
+            prev_dilation = dilation
+
+    def forward(self, x):
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.children(), x)
+        else:
+            # run the blocks in registration order
+            for blk in self.children():
+                x = blk(x)
+        return x
+
+
+class RegNet(nn.Module):
+    """RegNet-X, Y, and Z Models
+
+    Paper: https://arxiv.org/abs/2003.13678
+    Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py
+
+    The network is a stem conv, four ``RegStage`` modules registered as ``s1`` .. ``s4``,
+    a ``final_conv`` module and a ``ClassifierHead`` stored as ``head``.
+    """
+
+    def __init__(
+            self, cfg: RegNetCfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg',
+            drop_rate=0., drop_path_rate=0., zero_init_last=True):
+        """
+        Args:
+            cfg: Architecture hyper-parameters.
+            in_chans: Number of input channels of the stem conv.
+            num_classes: Number of classes passed to ``ClassifierHead``.
+            output_stride: Target overall stride, one of 8, 16 or 32. Once reached, later
+                stages use stride 1 and increase dilation instead.
+            global_pool: Pooling type passed to ``ClassifierHead``.
+            drop_rate: Dropout rate passed to ``ClassifierHead``.
+            drop_path_rate: Final drop-path rate; per-block rates ramp linearly from 0 to it.
+            zero_init_last: If True, modules exposing ``zero_init_last()`` have it called
+                during weight init.
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert output_stride in (8, 16, 32)
+
+        # Construct the stem
+        stem_width = cfg.stem_width
+        na_args = dict(act_layer=cfg.act_layer, norm_layer=cfg.norm_layer)
+        if cfg.preact:
+            # pre-activation blocks apply their own norm+act first, so the stem is a bare conv
+            self.stem = create_conv2d(in_chans, stem_width, 3, stride=2)
+        else:
+            self.stem = ConvNormAct(in_chans, stem_width, 3, stride=2, **na_args)
+        self.feature_info = [dict(num_chs=stem_width, reduction=2, module='stem')]
+
+        # Construct the stages
+        prev_width = stem_width
+        curr_stride = 2
+        per_stage_args, common_args = self._get_stage_args(
+            cfg, output_stride=output_stride, drop_path_rate=drop_path_rate)
+        # forward_features hard-codes s1..s4, so exactly four stages are required
+        assert len(per_stage_args) == 4
+        block_fn = PreBottleneck if cfg.preact else Bottleneck
+        for i, stage_args in enumerate(per_stage_args):
+            stage_name = "s{}".format(i + 1)
+            self.add_module(stage_name, RegStage(in_chs=prev_width, block_fn=block_fn, **stage_args, **common_args))
+            prev_width = stage_args['out_chs']
+            curr_stride *= stage_args['stride']
+            self.feature_info += [dict(num_chs=prev_width, reduction=curr_stride, module=stage_name)]
+
+        # Construct the head
+        if cfg.num_features:
+            self.final_conv = ConvNormAct(prev_width, cfg.num_features, kernel_size=1, **na_args)
+            self.num_features = cfg.num_features
+        else:
+            # linear_out / preact blocks end without an activation, so one is applied here instead
+            final_act = cfg.linear_out or cfg.preact
+            self.final_conv = get_act_layer(cfg.act_layer)() if final_act else nn.Identity()
+            self.num_features = prev_width
+        self.head = ClassifierHead(
+            in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
+
+    def _get_stage_args(self, cfg: RegNetCfg, default_stride=2, output_stride=32, drop_path_rate=0.):
+        """Translate ``cfg`` into per-stage and shared keyword arguments for ``RegStage``.
+
+        Returns:
+            Tuple ``(per_stage_args, common_args)``: a list with one dict per stage (keys
+            ``out_chs``, ``stride``, ``dilation``, ``depth``, ``bottle_ratio``, ``group_size``,
+            ``drop_path_rates``) and a dict of arguments shared by all stages.
+        """
+        # Generate RegNet ws per block
+        widths, num_stages, stage_gs = generate_regnet(cfg.wa, cfg.w0, cfg.wm, cfg.depth, cfg.group_size)
+
+        # Convert to per stage format
+        stage_widths, stage_depths = np.unique(widths, return_counts=True)
+        stage_br = [cfg.bottle_ratio for _ in range(num_stages)]
+        stage_strides = []
+        stage_dilations = []
+        net_stride = 2  # the stem already downsamples by 2
+        dilation = 1
+        for _ in range(num_stages):
+            if net_stride >= output_stride:
+                # target stride reached: stop striding and dilate instead
+                dilation *= default_stride
+                stride = 1
+            else:
+                stride = default_stride
+                net_stride *= stride
+            stage_strides.append(stride)
+            stage_dilations.append(dilation)
+        # linear ramp of per-block drop-path rates, split into one chunk per stage
+        stage_dpr = np.split(np.linspace(0, drop_path_rate, sum(stage_depths)), np.cumsum(stage_depths[:-1]))
+
+        # Adjust the compatibility of ws and gws
+        stage_widths, stage_gs = adjust_widths_groups_comp(stage_widths, stage_br, stage_gs)
+        arg_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_size', 'drop_path_rates']
+        per_stage_args = [
+            dict(zip(arg_names, params)) for params in
+            zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_br, stage_gs, stage_dpr)]
+        common_args = dict(
+            downsample=cfg.downsample, se_ratio=cfg.se_ratio, linear_out=cfg.linear_out,
+            act_layer=cfg.act_layer, norm_layer=cfg.norm_layer)
+        return per_stage_args, common_args
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns grouping parameters by stem and by stage (coarse) or block."""
+        return dict(
+            stem=r'^stem',
+            blocks=r'^s(\d+)' if coarse else r'^s(\d+)\.b(\d+)',
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Set the ``grad_checkpointing`` flag on every child except the first and the last."""
+        # children are registered as stem, s1..s4, final_conv, head, so this slice covers the
+        # stages and also final_conv (only RegStage.forward reads the flag in this file)
+        for s in list(self.children())[1:-1]:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the ``fc`` module of the classifier head."""
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """Replace the classifier head with a fresh ``ClassifierHead`` for ``num_classes``."""
+        # keep the public attribute in sync with the new head; it used to be left stale
+        self.num_classes = num_classes
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        """Run the stem, the four stages and ``final_conv``; the head is not applied."""
+        x = self.stem(x)
+        x = self.s1(x)
+        x = self.s2(x)
+        x = self.s3(x)
+        x = self.s4(x)
+        x = self.final_conv(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Apply the classifier head, forwarding ``pre_logits`` to it."""
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_weights(module, name='', zero_init_last=False):
+    """Initialise one module in place (used with ``named_apply``).
+
+    * ``nn.Conv2d``: weight ~ N(0, sqrt(2 / fan_out)) with
+      ``fan_out = kh * kw * out_channels // groups``; bias zeroed.
+    * ``nn.Linear``: weight ~ N(0, 0.01); bias zeroed.
+    * anything else: if ``zero_init_last`` is true and the module defines
+      ``zero_init_last()``, that method is called.
+
+    ``name`` is accepted but not used.
+    """
+    if isinstance(module, nn.Conv2d):
+        kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
+        fan_out = (kernel_h * kernel_w * module.out_channels) // module.groups
+        std = math.sqrt(2.0 / fan_out)
+        module.weight.data.normal_(0, std)
+        if module.bias is not None:
+            module.bias.data.zero_()
+        return
+    if isinstance(module, nn.Linear):
+        nn.init.normal_(module.weight, mean=0.0, std=0.01)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+        return
+    if zero_init_last and hasattr(module, 'zero_init_last'):
+        module.zero_init_last()
+
+
+def _filter_fn(state_dict):
+    """ Unwrap a checkpoint that nests the weights under a 'model' key.
+
+    Returns ``state_dict['model']`` when that key exists (e.g. the DeiT-trained regnety_160
+    pretrained checkpoint), otherwise ``state_dict`` unchanged.
+    """
+    return state_dict.get('model', state_dict)
+
+
+def _create_regnet(variant, pretrained, **kwargs):
+    """Build a ``RegNet`` for ``variant`` through ``build_model_with_cfg``.
+
+    The architecture comes from ``model_cfgs[variant]`` (``KeyError`` for an unknown
+    variant) and ``_filter_fn`` is installed as the pretrained-checkpoint filter.
+    """
+    arch_cfg = model_cfgs[variant]
+    return build_model_with_cfg(
+        RegNet, variant, pretrained, model_cfg=arch_cfg, pretrained_filter_fn=_filter_fn, **kwargs)
+
+
+@register_model
+def regnetx_002(pretrained=False, **kwargs):
+    """RegNetX-200MF"""
+    return _create_regnet('regnetx_002', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_004(pretrained=False, **kwargs):
+    """RegNetX-400MF"""
+    return _create_regnet('regnetx_004', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_006(pretrained=False, **kwargs):
+    """RegNetX-600MF"""
+    return _create_regnet('regnetx_006', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_008(pretrained=False, **kwargs):
+    """RegNetX-800MF"""
+    return _create_regnet('regnetx_008', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_016(pretrained=False, **kwargs):
+    """RegNetX-1.6GF"""
+    return _create_regnet('regnetx_016', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_032(pretrained=False, **kwargs):
+    """RegNetX-3.2GF"""
+    return _create_regnet('regnetx_032', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_040(pretrained=False, **kwargs):
+    """RegNetX-4.0GF"""
+    return _create_regnet('regnetx_040', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_064(pretrained=False, **kwargs):
+    """RegNetX-6.4GF"""
+    return _create_regnet('regnetx_064', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_080(pretrained=False, **kwargs):
+    """RegNetX-8.0GF"""
+    return _create_regnet('regnetx_080', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_120(pretrained=False, **kwargs):
+    """RegNetX-12GF"""
+    return _create_regnet('regnetx_120', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_160(pretrained=False, **kwargs):
+    """RegNetX-16GF"""
+    return _create_regnet('regnetx_160', pretrained, **kwargs)
+
+
+@register_model
+def regnetx_320(pretrained=False, **kwargs):
+    """RegNetX-32GF"""
+    return _create_regnet('regnetx_320', pretrained, **kwargs)
+
+
+@register_model
+def regnety_002(pretrained=False, **kwargs):
+    """RegNetY-200MF"""
+    return _create_regnet('regnety_002', pretrained, **kwargs)
+
+
+@register_model
+def regnety_004(pretrained=False, **kwargs):
+    """RegNetY-400MF"""
+    return _create_regnet('regnety_004', pretrained, **kwargs)
+
+
+@register_model
+def regnety_006(pretrained=False, **kwargs):
+    """RegNetY-600MF"""
+    return _create_regnet('regnety_006', pretrained, **kwargs)
+
+
+@register_model
+def regnety_008(pretrained=False, **kwargs):
+    """RegNetY-800MF"""
+    return _create_regnet('regnety_008', pretrained, **kwargs)
+
+
+@register_model
+def regnety_016(pretrained=False, **kwargs):
+    """RegNetY-1.6GF"""
+    return _create_regnet('regnety_016', pretrained, **kwargs)
+
+
+@register_model
+def regnety_032(pretrained=False, **kwargs):
+    """RegNetY-3.2GF"""
+    return _create_regnet('regnety_032', pretrained, **kwargs)
+
+
+@register_model
+def regnety_040(pretrained=False, **kwargs):
+    """RegNetY-4.0GF"""
+    return _create_regnet('regnety_040', pretrained, **kwargs)
+
+
+@register_model
+def regnety_064(pretrained=False, **kwargs):
+    """RegNetY-6.4GF"""
+    return _create_regnet('regnety_064', pretrained, **kwargs)
+
+
+@register_model
+def regnety_080(pretrained=False, **kwargs):
+    """RegNetY-8.0GF"""
+    return _create_regnet('regnety_080', pretrained, **kwargs)
+
+
+@register_model
+def regnety_120(pretrained=False, **kwargs):
+    """RegNetY-12GF"""
+    return _create_regnet('regnety_120', pretrained, **kwargs)
+
+
+@register_model
+def regnety_160(pretrained=False, **kwargs):
+    """RegNetY-16GF"""
+    return _create_regnet('regnety_160', pretrained, **kwargs)
+
+
+@register_model
+def regnety_320(pretrained=False, **kwargs):
+    """RegNetY-32GF"""
+    return _create_regnet('regnety_320', pretrained, **kwargs)
+
+
+@register_model
+def regnety_040s_gn(pretrained=False, **kwargs):
+    """RegNetY-4.0GF w/ GroupNorm """
+    return _create_regnet('regnety_040s_gn', pretrained, **kwargs)
+
+
+@register_model
+def regnetv_040(pretrained=False, **kwargs):
+    """Build the model described by the 'regnetv_040' entry of model_cfgs."""
+    return _create_regnet('regnetv_040', pretrained, **kwargs)
+
+
+@register_model
+def regnetv_064(pretrained=False, **kwargs):
+    """Build the model described by the 'regnetv_064' entry of model_cfgs."""
+    return _create_regnet('regnetv_064', pretrained, **kwargs)
+
+
+@register_model
+def regnetz_005(pretrained=False, **kwargs):
+    """RegNetZ-500MF
+    NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py
+    but it's not clear it is equivalent to paper model as not detailed in the paper.
+    """
+    return _create_regnet('regnetz_005', pretrained, zero_init_last=False, **kwargs)
+
+
+@register_model
+def regnetz_040(pretrained=False, **kwargs):
+    """RegNetZ-4.0GF
+    NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py
+    but it's not clear it is equivalent to paper model as not detailed in the paper.
+    """
+    return _create_regnet('regnetz_040', pretrained, zero_init_last=False, **kwargs)
+
+
+@register_model
+def regnetz_040h(pretrained=False, **kwargs):
+    """RegNetZ-4.0GF, 'h' variant (built from the 'regnetz_040h' config rather than 'regnetz_040')
+    NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py
+    but it's not clear it is equivalent to paper model as not detailed in the paper.
+    """
+    return _create_regnet('regnetz_040h', pretrained, zero_init_last=False, **kwargs)
diff --git a/src/custom_timm/models/res2net.py b/src/custom_timm/models/res2net.py
new file mode 100644
index 0000000000000000000000000000000000000000..01899c6438bb88e907fb879abf27895b7d9ca970
--- /dev/null
+++ b/src/custom_timm/models/res2net.py
@@ -0,0 +1,213 @@
+""" Res2Net and Res2NeXt
+Adapted from Official Pytorch impl at: https://github.com/gasvn/Res2Net/
+Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169
+"""
+import math
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .registry import register_model
+from .resnet import ResNet
+
+__all__ = []
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1', 'classifier': 'fc',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'res2net50_26w_4s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_4s-06e79181.pth'),
+    'res2net50_48w_2s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_48w_2s-afed724a.pth'),
+    'res2net50_14w_8s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_14w_8s-6527dddc.pth'),
+    'res2net50_26w_6s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_6s-19041792.pth'),
+    'res2net50_26w_8s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_8s-2c7c9f12.pth'),
+    'res2net101_26w_4s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net101_26w_4s-02a759a1.pth'),
+    'res2next50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next50_4s-6ef7e7bf.pth'),
+}
+
+
+class Bottle2neck(nn.Module):
+    """ Res2Net/Res2NeXT Bottleneck
+    Adapted from https://github.com/gasvn/Res2Net/blob/master/res2net.py
+    """
+    expansion = 4
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None,
+            cardinality=1, base_width=26, scale=4, dilation=1, first_dilation=None,
+            act_layer=nn.ReLU, norm_layer=None, attn_layer=None, **_):
+        super(Bottle2neck, self).__init__()
+        self.scale = scale
+        self.is_first = stride > 1 or downsample is not None
+        self.num_scales = max(1, scale - 1)
+        width = int(math.floor(planes * (base_width / 64.0))) * cardinality
+        self.width = width
+        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation
+
+        self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False)
+        self.bn1 = norm_layer(width * scale)
+
+        convs = []
+        bns = []
+        for i in range(self.num_scales):
+            convs.append(nn.Conv2d(
+                width, width, kernel_size=3, stride=stride, padding=first_dilation,
+                dilation=first_dilation, groups=cardinality, bias=False))
+            bns.append(norm_layer(width))
+        self.convs = nn.ModuleList(convs)
+        self.bns = nn.ModuleList(bns)
+        if self.is_first:
+            # FIXME this should probably have count_include_pad=False, but hurts original weights
+            self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)
+        else:
+            self.pool = None
+
+        self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False)
+        self.bn3 = norm_layer(outplanes)
+        self.se = attn_layer(outplanes) if attn_layer is not None else None
+
+        self.relu = act_layer(inplace=True)
+        self.downsample = downsample
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.bn3.weight)
+
+    def forward(self, x):
+        shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        spx = torch.split(out, self.width, 1)
+        spo = []
+        sp = spx[0]  # redundant, for torchscript
+        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
+            if i == 0 or self.is_first:
+                sp = spx[i]
+            else:
+                sp = sp + spx[i]
+            sp = conv(sp)
+            sp = bn(sp)
+            sp = self.relu(sp)
+            spo.append(sp)
+        if self.scale > 1:
+            if self.pool is not None:  # self.is_first == True, None check for torchscript
+                spo.append(self.pool(spx[-1]))
+            else:
+                spo.append(spx[-1])
+        out = torch.cat(spo, 1)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.se is not None:
+            out = self.se(out)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        out += shortcut
+        out = self.relu(out)
+
+        return out
+
+
+def _create_res2net(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(ResNet, variant, pretrained, **kwargs)
+
+
+@register_model
+def res2net50_26w_4s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-50 26w4s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=4), **kwargs)
+    return _create_res2net('res2net50_26w_4s', pretrained, **model_args)
+
+
+@register_model
+def res2net101_26w_4s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-101 26w4s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 23, 3], base_width=26, block_args=dict(scale=4), **kwargs)
+    return _create_res2net('res2net101_26w_4s', pretrained, **model_args)
+
+
+@register_model
+def res2net50_26w_6s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-50 26w6s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=6), **kwargs)
+    return _create_res2net('res2net50_26w_6s', pretrained, **model_args)
+
+
+@register_model
+def res2net50_26w_8s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-50 26w8s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=8), **kwargs)
+    return _create_res2net('res2net50_26w_8s', pretrained, **model_args)
+
+
+@register_model
+def res2net50_48w_2s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-50 48w2s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=48, block_args=dict(scale=2), **kwargs)
+    return _create_res2net('res2net50_48w_2s', pretrained, **model_args)
+
+
+@register_model
+def res2net50_14w_8s(pretrained=False, **kwargs):
+    """Constructs a Res2Net-50 14w8s model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=14, block_args=dict(scale=8), **kwargs)
+    return _create_res2net('res2net50_14w_8s', pretrained, **model_args)
+
+
+@register_model
+def res2next50(pretrained=False, **kwargs):
+    """Construct Res2NeXt-50 4s
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model_args = dict(
+        block=Bottle2neck, layers=[3, 4, 6, 3], base_width=4, cardinality=8, block_args=dict(scale=4), **kwargs)
+    return _create_res2net('res2next50', pretrained, **model_args)
diff --git a/src/custom_timm/models/resnest.py b/src/custom_timm/models/resnest.py
new file mode 100644
index 0000000000000000000000000000000000000000..84f329d9551c600c321fea4e3858520466f334df
--- /dev/null
+++ b/src/custom_timm/models/resnest.py
@@ -0,0 +1,231 @@
+""" ResNeSt Models
+
+Paper: `ResNeSt: Split-Attention Networks` - https://arxiv.org/abs/2004.08955
+
+Adapted from original PyTorch impl w/ weights at https://github.com/zhanghang1989/ResNeSt by Hang Zhang
+
+Modified for torchscript compat, and consistency with timm by Ross Wightman
+"""
+import torch
+from torch import nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import SplitAttn
+from .registry import register_model
+from .resnet import ResNet
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1.0', 'classifier': 'fc',
+        **kwargs
+    }
+
+default_cfgs = {
+    'resnest14d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest14-9c8fe254.pth'),
+    'resnest26d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest26-50eb607c.pth'),
+    'resnest50d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50-528c19ca.pth'),
+    'resnest101e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest101-22405ba7.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+    'resnest200e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest200-75117900.pth',
+        input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=0.909, interpolation='bicubic'),
+    'resnest269e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest269-0cc87c48.pth',
+        input_size=(3, 416, 416), pool_size=(13, 13), crop_pct=0.928, interpolation='bicubic'),
+    'resnest50d_4s2x40d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_4s2x40d-41d14ed0.pth',
+        interpolation='bicubic'),
+    'resnest50d_1s4x24d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_1s4x24d-d4a4f76f.pth',
+        interpolation='bicubic')
+}
+
+
+class ResNestBottleneck(nn.Module):
+    """ResNeSt Bottleneck: 1x1 conv -> 3x3 SplitAttn (plain 3x3 conv when radix < 1) -> 1x1 conv + shortcut
+    """
+    # pylint: disable=unused-argument
+    expansion = 4
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None,
+            radix=1, cardinality=1, base_width=64, avd=False, avd_first=False, is_first=False,
+            reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super(ResNestBottleneck, self).__init__()
+        assert reduce_first == 1  # not supported
+        assert attn_layer is None  # not supported
+        assert aa_layer is None  # TODO not yet supported
+        assert drop_path is None  # TODO not yet supported
+
+        group_width = int(planes * (base_width / 64.)) * cardinality
+        first_dilation = first_dilation or dilation
+        if avd and (stride > 1 or is_first):
+            avd_stride = stride  # the stride is applied by an avg-pool (avd) layer ...
+            stride = 1  # ... so conv2 itself runs unstrided
+        else:
+            avd_stride = 0  # 0 disables both avd pools below
+        self.radix = radix
+
+        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False)
+        self.bn1 = norm_layer(group_width)
+        self.act1 = act_layer(inplace=True)
+        self.avd_first = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and avd_first else None
+
+        if self.radix >= 1:
+            self.conv2 = SplitAttn(
+                group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation,
+                dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_layer=drop_block)
+            self.bn2 = nn.Identity()  # norm_layer and drop_block are handed to SplitAttn above instead
+            self.drop_block = nn.Identity()
+            self.act2 = nn.Identity()
+        else:
+            self.conv2 = nn.Conv2d(
+                group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation,
+                dilation=first_dilation, groups=cardinality, bias=False)
+            self.bn2 = norm_layer(group_width)
+            self.drop_block = drop_block() if drop_block is not None else nn.Identity()
+            self.act2 = act_layer(inplace=True)
+        self.avd_last = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and not avd_first else None
+
+        self.conv3 = nn.Conv2d(group_width, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = norm_layer(planes*4)
+        self.act3 = act_layer(inplace=True)
+        self.downsample = downsample
+
+    def zero_init_last(self):  # zero the last norm's weight (bn3)
+        nn.init.zeros_(self.bn3.weight)
+
+    def forward(self, x):  # returns act3(bottleneck(x) + shortcut)
+        shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.act1(out)
+
+        if self.avd_first is not None:  # avg-pool before conv2 (avd_first=True)
+            out = self.avd_first(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.drop_block(out)
+        out = self.act2(out)
+
+        if self.avd_last is not None:  # avg-pool after conv2 (avd_first=False)
+            out = self.avd_last(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        out += shortcut
+        out = self.act3(out)
+        return out
+
+
+def _create_resnest(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(ResNet, variant, pretrained, **kwargs)
+
+
+@register_model
+def resnest14d(pretrained=False, **kwargs):
+    """ ResNeSt-14d model. Weights ported from GluonCV.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[1, 1, 1, 1],
+        stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest14d', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest26d(pretrained=False, **kwargs):
+    """ ResNeSt-26d model. Weights ported from GluonCV.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[2, 2, 2, 2],
+        stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest26d', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest50d(pretrained=False, **kwargs):
+    """ ResNeSt-50d model. Matches paper ResNeSt-50 model, https://arxiv.org/abs/2004.08955
+    Since this codebase supports all possible variations, 'd' for deep stem, stem_width 32, avg in downsample.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 4, 6, 3],
+        stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest50d', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest101e(pretrained=False, **kwargs):
+    """ ResNeSt-101e model. Matches paper ResNeSt-101 model, https://arxiv.org/abs/2004.08955
+     Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 4, 23, 3],
+        stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest101e', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest200e(pretrained=False, **kwargs):
+    """ ResNeSt-200e model. Matches paper ResNeSt-200 model, https://arxiv.org/abs/2004.08955
+    Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 24, 36, 3],
+        stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest200e', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest269e(pretrained=False, **kwargs):
+    """ ResNeSt-269e model. Matches paper ResNeSt-269 model, https://arxiv.org/abs/2004.08955
+    Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample.
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 30, 48, 8],
+        stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1,
+        block_args=dict(radix=2, avd=True, avd_first=False), **kwargs)
+    return _create_resnest('resnest269e', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest50d_4s2x40d(pretrained=False, **kwargs):
+    """ResNeSt-50 4s2x40d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 4, 6, 3],
+        stem_type='deep', stem_width=32, avg_down=True, base_width=40, cardinality=2,
+        block_args=dict(radix=4, avd=True, avd_first=True), **kwargs)
+    return _create_resnest('resnest50d_4s2x40d', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def resnest50d_1s4x24d(pretrained=False, **kwargs):
+    """ResNeSt-50 1s4x24d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md
+    """
+    model_kwargs = dict(
+        block=ResNestBottleneck, layers=[3, 4, 6, 3],
+        stem_type='deep', stem_width=32, avg_down=True, base_width=24, cardinality=4,
+        block_args=dict(radix=1, avd=True, avd_first=True), **kwargs)
+    return _create_resnest('resnest50d_1s4x24d', pretrained=pretrained, **model_kwargs)
diff --git a/src/custom_timm/models/resnet.py b/src/custom_timm/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3b2a9ce02bd223a17be44765bc094390b32811
--- /dev/null
+++ b/src/custom_timm/models/resnet.py
@@ -0,0 +1,1608 @@
+"""PyTorch ResNet
+
+This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with
+additional dropout and dynamic global avg/max pool.
+
+ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman
+
+Copyright 2019, Ross Wightman
+"""
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import DropBlock2d, DropPath, AvgPool2dSame, BlurPool2d, GroupNorm, create_attn, get_attn, create_classifier
+from .registry import register_model
+
+__all__ = ['ResNet', 'BasicBlock', 'Bottleneck']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1', 'classifier': 'fc',
+        **kwargs
+    }
+
+
+# Default configuration for each model variant, keyed by model name. Every entry is built by
+# `_cfg`, so it starts from that function's defaults and lists only the fields it overrides.
+# Entries with an empty `url` carry no weight URL here.
+default_cfgs = {
+    # ResNet and Wide ResNet
+    'resnet10t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet10t_176_c3-f3215ab1.pth',
+        input_size=(3, 176, 176), pool_size=(6, 6),
+        test_crop_pct=0.95, test_input_size=(3, 224, 224),
+        first_conv='conv1.0'),
+    'resnet14t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet14t_176_c3-c4ed2c37.pth',
+        input_size=(3, 176, 176), pool_size=(6, 6),
+        test_crop_pct=0.95, test_input_size=(3, 224, 224),
+        first_conv='conv1.0'),
+    'resnet18': _cfg(url='https://download.pytorch.org/models/resnet18-5c106cde.pth'),
+    'resnet18d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet18d_ra2-48a79e06.pth',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnet34': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34-43635321.pth'),
+    'resnet34d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34d_ra2-f8dcfcaf.pth',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnet26': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26-9aa10e23.pth',
+        interpolation='bicubic'),
+    'resnet26d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26d-69e92c46.pth',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnet26t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet26t_256_ra2-6f6fa748.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94),
+    'resnet50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnet50d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnet50t': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnet101': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet101_a1h-36d3f2aa.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnet101d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet101d_ra2-2803ffab.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=1.0, test_input_size=(3, 320, 320)),
+    'resnet152': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet152_a1h-dc400468.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnet152d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet152d_ra2-5cac0439.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=1.0, test_input_size=(3, 320, 320)),
+    'resnet200': _cfg(url='', interpolation='bicubic'),
+    'resnet200d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet200d_ra2-bdba9bf9.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=1.0, test_input_size=(3, 320, 320)),
+    'tv_resnet34': _cfg(url='https://download.pytorch.org/models/resnet34-333f7ec4.pth'),
+    'tv_resnet50': _cfg(url='https://download.pytorch.org/models/resnet50-19c8e357.pth'),
+    'tv_resnet101': _cfg(url='https://download.pytorch.org/models/resnet101-5d3b4d8f.pth'),
+    'tv_resnet152': _cfg(url='https://download.pytorch.org/models/resnet152-b121ed2d.pth'),
+    'wide_resnet50_2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/wide_resnet50_racm-8234f177.pth',
+        interpolation='bicubic'),
+    'wide_resnet101_2': _cfg(url='https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth'),
+
+    # ResNets w/ alternative norm layers
+    'resnet50_gn': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_gn_a1h2-8fe6c4d0.pth',
+        crop_pct=0.94, interpolation='bicubic'),
+
+    # ResNeXt
+    'resnext50_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnext50_32x4d_a1h-0146ab0a.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnext50d_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnext50d_32x4d-103e99f8.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'resnext101_32x4d': _cfg(url=''),
+    'resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth'),
+    'resnext101_64x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnext101_64x4d_c-0d0e0cc0.pth',
+        interpolation='bicubic', crop_pct=1.0,  test_input_size=(3, 288, 288)),
+    'tv_resnext50_32x4d': _cfg(url='https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth'),
+
+    #  ResNeXt models - Weakly Supervised Pretraining on Instagram Hashtags
+    #  from https://github.com/facebookresearch/WSL-Images
+    #  Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+    'ig_resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x8-c38310e5.pth'),
+    'ig_resnext101_32x16d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x16-c6f796b0.pth'),
+    'ig_resnext101_32x32d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x32-e4b90b00.pth'),
+    'ig_resnext101_32x48d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x48-3e41cc8a.pth'),
+
+    #  Semi-Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models
+    #  Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+    'ssl_resnet18':  _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet18-d92f0530.pth'),
+    'ssl_resnet50':  _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet50-08389792.pth'),
+    'ssl_resnext50_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext50_32x4-ddb3e555.pth'),
+    'ssl_resnext101_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x4-dc43570a.pth'),
+    'ssl_resnext101_32x8d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x8-2cfe2f8b.pth'),
+    'ssl_resnext101_32x16d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x16-15fffa57.pth'),
+
+    #  Semi-Weakly Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models
+    #  Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+    'swsl_resnet18': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet18-118f1556.pth'),
+    'swsl_resnet50': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet50-16a12f1b.pth'),
+    'swsl_resnext50_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext50_32x4-72679e44.pth'),
+    'swsl_resnext101_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x4-3f87e46b.pth'),
+    'swsl_resnext101_32x8d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x8-b4712904.pth'),
+    'swsl_resnext101_32x16d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x16-f3559a9c.pth'),
+
+    #  Efficient Channel Attention ResNets
+    'ecaresnet26t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet26t_ra2-46609757.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=0.95, test_input_size=(3, 320, 320)),
+    'ecaresnetlight': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnetlight-75a9c627.pth',
+        interpolation='bicubic'),
+    'ecaresnet50d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet50d-93c81e3b.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'ecaresnet50d_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet50d_p-e4fa23c2.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'ecaresnet50t': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet50t_ra2-f7ac63c4.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=0.95, test_input_size=(3, 320, 320)),
+    'ecaresnet101d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet101d-153dad65.pth',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'ecaresnet101d_pruned': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet101d_p-9e74cb91.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'ecaresnet200d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)),
+    'ecaresnet269d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet269d_320_ra2-7baa55cb.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 320, 320), pool_size=(10, 10),
+        crop_pct=1.0, test_input_size=(3, 352, 352)),
+
+    #  Efficient Channel Attention ResNeXts
+    'ecaresnext26t_32x4d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'ecaresnext50t_32x4d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+
+    #  Squeeze-Excitation ResNets, to eventually replace the models in senet.py
+    'seresnet18': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'seresnet34': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'seresnet50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet50_ra_224-8efdb4bb.pth',
+        interpolation='bicubic'),
+    'seresnet50t': _cfg(
+        url='',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'seresnet101': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'seresnet152': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'seresnet152d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet152d_ra2-04464dd2.pth',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8),
+        crop_pct=1.0, test_input_size=(3, 320, 320)
+    ),
+    'seresnet200d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)),
+    'seresnet269d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)),
+
+    #  Squeeze-Excitation ResNeXts, to eventually replace the models in senet.py
+    'seresnext26d_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26d_32x4d-80fa48a3.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'seresnext26t_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26tn_32x4d-569cb627.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'seresnext50_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext50_32x4d_racm-a304a460.pth',
+        interpolation='bicubic'),
+    'seresnext101_32x4d': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'seresnext101_32x8d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnext101_32x8d_ah-e6bc4c0a.pth',
+        interpolation='bicubic', test_input_size=(3, 288, 288), crop_pct=1.0),
+    'seresnext101d_32x8d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnext101d_32x8d_ah-191d7b94.pth',
+        interpolation='bicubic', first_conv='conv1.0', test_input_size=(3, 288, 288), crop_pct=1.0),
+
+    'senet154': _cfg(
+        url='',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+
+    # ResNets with anti-aliasing / blur pool
+    'resnetblur18': _cfg(
+        interpolation='bicubic'),
+    'resnetblur50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnetblur50-84f4748f.pth',
+        interpolation='bicubic'),
+    'resnetblur50d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetblur101d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetaa50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetaa50_a1h-4cf422b3.pth',
+        test_input_size=(3, 288, 288), test_crop_pct=1.0, interpolation='bicubic'),
+    'resnetaa50d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetaa101d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'seresnetaa50d': _cfg(
+        url='',
+        interpolation='bicubic', first_conv='conv1.0'),
+    'seresnextaa101d_32x8d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnextaa101d_32x8d_ah-83c8ae12.pth',
+        interpolation='bicubic', first_conv='conv1.0', test_input_size=(3, 288, 288), crop_pct=1.0),
+
+    # ResNet-RS models
+    'resnetrs50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs50_ema-6b53758b.pth',
+        input_size=(3, 160, 160), pool_size=(5, 5), crop_pct=0.91, test_input_size=(3, 224, 224),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs101': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs101_i192_ema-1509bbf6.pth',
+        input_size=(3, 192, 192), pool_size=(6, 6), crop_pct=0.94, test_input_size=(3, 288, 288),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs152': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs152_i256_ema-a9aff7f9.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs200': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetrs200_c-6b698b88.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs270': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs270_ema-b40e674c.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 352, 352),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs350': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs350_i256_ema-5a1aa8f1.pth',
+        input_size=(3, 288, 288), pool_size=(9, 9), crop_pct=1.0, test_input_size=(3, 384, 384),
+        interpolation='bicubic', first_conv='conv1.0'),
+    'resnetrs420': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs420_ema-972dee69.pth',
+        input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, test_input_size=(3, 416, 416),
+        interpolation='bicubic', first_conv='conv1.0'),
+}
+
+
+def get_padding(kernel_size, stride, dilation=1):
+    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+    return padding
+
+
+def create_aa(aa_layer, channels, stride=2, enable=True):
+    if not aa_layer or not enable:
+        return nn.Identity()
+    return aa_layer(stride) if issubclass(aa_layer, nn.AvgPool2d) else aa_layer(channels=channels, stride=stride)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+            reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super(BasicBlock, self).__init__()
+
+        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
+        assert base_width == 64, 'BasicBlock does not support changing base width'
+        first_planes = planes // reduce_first
+        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation
+        use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation)
+
+        self.conv1 = nn.Conv2d(
+            inplanes, first_planes, kernel_size=3, stride=1 if use_aa else stride, padding=first_dilation,
+            dilation=first_dilation, bias=False)
+        self.bn1 = norm_layer(first_planes)
+        self.drop_block = drop_block() if drop_block is not None else nn.Identity()
+        self.act1 = act_layer(inplace=True)
+        self.aa = create_aa(aa_layer, channels=first_planes, stride=stride, enable=use_aa)
+
+        self.conv2 = nn.Conv2d(
+            first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False)
+        self.bn2 = norm_layer(outplanes)
+
+        self.se = create_attn(attn_layer, outplanes)
+
+        self.act2 = act_layer(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.drop_path = drop_path
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.bn2.weight)
+
+    def forward(self, x):
+        shortcut = x
+
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.drop_block(x)
+        x = self.act1(x)
+        x = self.aa(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+
+        if self.se is not None:
+            x = self.se(x)
+
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(shortcut)
+        x += shortcut
+        x = self.act2(x)
+
+        return x
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+            reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super(Bottleneck, self).__init__()
+
+        width = int(math.floor(planes * (base_width / 64)) * cardinality)
+        first_planes = width // reduce_first
+        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation
+        use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation)
+
+        self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
+        self.bn1 = norm_layer(first_planes)
+        self.act1 = act_layer(inplace=True)
+
+        self.conv2 = nn.Conv2d(
+            first_planes, width, kernel_size=3, stride=1 if use_aa else stride,
+            padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False)
+        self.bn2 = norm_layer(width)
+        self.drop_block = drop_block() if drop_block is not None else nn.Identity()
+        self.act2 = act_layer(inplace=True)
+        self.aa = create_aa(aa_layer, channels=width, stride=stride, enable=use_aa)
+
+        self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
+        self.bn3 = norm_layer(outplanes)
+
+        self.se = create_attn(attn_layer, outplanes)
+
+        self.act3 = act_layer(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.drop_path = drop_path
+
+    def zero_init_last(self):
+        nn.init.zeros_(self.bn3.weight)
+
+    def forward(self, x):
+        shortcut = x
+
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.drop_block(x)
+        x = self.act2(x)
+        x = self.aa(x)
+
+        x = self.conv3(x)
+        x = self.bn3(x)
+
+        if self.se is not None:
+            x = self.se(x)
+
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(shortcut)
+        x += shortcut
+        x = self.act3(x)
+
+        return x
+
+
+def downsample_conv(
+        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
+    norm_layer = norm_layer or nn.BatchNorm2d
+    kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size
+    first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1
+    p = get_padding(kernel_size, stride, first_dilation)
+
+    return nn.Sequential(*[
+        nn.Conv2d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False),
+        norm_layer(out_channels)
+    ])
+
+
+def downsample_avg(
+        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
+    norm_layer = norm_layer or nn.BatchNorm2d
+    avg_stride = stride if dilation == 1 else 1
+    if stride == 1 and dilation == 1:
+        pool = nn.Identity()
+    else:
+        avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
+        pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
+
+    return nn.Sequential(*[
+        pool,
+        nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False),
+        norm_layer(out_channels)
+    ])
+
+
+def drop_blocks(drop_prob=0.):
+    return [
+        None, None,
+        partial(DropBlock2d, drop_prob=drop_prob, block_size=5, gamma_scale=0.25) if drop_prob else None,
+        partial(DropBlock2d, drop_prob=drop_prob, block_size=3, gamma_scale=1.00) if drop_prob else None]
+
+
+def make_blocks(
+        block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32,
+        down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs):
+    stages = []
+    feature_info = []
+    net_num_blocks = sum(block_repeats)
+    net_block_idx = 0
+    net_stride = 4
+    dilation = prev_dilation = 1
+    for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))):
+        stage_name = f'layer{stage_idx + 1}'  # never liked this name, but weight compat requires it
+        stride = 1 if stage_idx == 0 else 2
+        if net_stride >= output_stride:
+            dilation *= stride
+            stride = 1
+        else:
+            net_stride *= stride
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block_fn.expansion:
+            down_kwargs = dict(
+                in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size,
+                stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer'))
+            downsample = downsample_avg(**down_kwargs) if avg_down else downsample_conv(**down_kwargs)
+
+        block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs)
+        blocks = []
+        for block_idx in range(num_blocks):
+            downsample = downsample if block_idx == 0 else None
+            stride = stride if block_idx == 0 else 1
+            block_dpr = drop_path_rate * net_block_idx / (net_num_blocks - 1)  # stochastic depth linear decay rule
+            blocks.append(block_fn(
+                inplanes, planes, stride, downsample, first_dilation=prev_dilation,
+                drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs))
+            prev_dilation = dilation
+            inplanes = planes * block_fn.expansion
+            net_block_idx += 1
+
+        stages.append((stage_name, nn.Sequential(*blocks)))
+        feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name))
+
+    return stages, feature_info
+
+
+class ResNet(nn.Module):
+    """ResNet / ResNeXt / SE-ResNeXt / SE-Net
+
+    This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that
+      * have > 1 stride in the 3x3 conv layer of bottleneck
+      * have conv-bn-act ordering
+
+    This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s
+    variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the
+    'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default.
+
+    ResNet variants (the same modifications can be used in SE/ResNeXt models as well):
+      * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b
+      * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64)
+      * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample
+      * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample
+      * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128)
+      * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample
+      * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample
+
+    ResNeXt
+      * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths
+      * same c,d, e, s variants as ResNet can be enabled
+
+    SE-ResNeXt
+      * normal - 7x7 stem, stem_width = 64
+      * same c, d, e, s variants as ResNet can be enabled
+
+    SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64,
+        reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block
+
+    Parameters
+    ----------
+    block : Block, class for the residual block. Options are BasicBlockGl, BottleneckGl.
+    layers : list of int, number of layers in each block
+    num_classes : int, default 1000, number of classification classes.
+    in_chans : int, default 3, number of input (color) channels.
+    output_stride : int, default 32, output stride of the network, 32, 16, or 8.
+    global_pool : str, Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
+    cardinality : int, default 1, number of convolution groups for 3x3 conv in Bottleneck.
+    base_width : int, default 64, factor determining bottleneck channels. `planes * base_width / 64 * cardinality`
+    stem_width : int, default 64, number of channels in stem convolutions
+    stem_type : str, default ''
+        The type of stem:
+          * '', default - a single 7x7 conv with a width of stem_width
+          * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2
+          * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2
+    block_reduce_first : int, default 1
+        Reduction factor for first convolution output width of residual blocks, 1 for all archs except senets, where 2
+    down_kernel_size : int, default 1, kernel size of residual block downsample path, 1x1 for most, 3x3 for senets
+    avg_down : bool, default False, use average pooling for projection skip connection between stages/downsample.
+    act_layer : nn.Module, activation layer
+    norm_layer : nn.Module, normalization layer
+    aa_layer : nn.Module, anti-aliasing layer
+    drop_rate : float, default 0. Dropout probability before classifier, for training
+    """
+
+    def __init__(
+            self, block, layers, num_classes=1000, in_chans=3, output_stride=32, global_pool='avg',
+            cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, block_reduce_first=1,
+            down_kernel_size=1, avg_down=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None,
+            drop_rate=0.0, drop_path_rate=0., drop_block_rate=0., zero_init_last=True, block_args=None):
+        """Build the stem, the stem pooling, the residual stages and the pooling/classifier head.
+
+        Notes on arguments, as they are used below:
+
+        replace_stem_pool : bool, replace the stem max-pool with a 3x3 conv + norm + activation.
+        drop_path_rate, drop_block_rate : float, forwarded unchanged to `make_blocks`.
+        zero_init_last : bool, forwarded to `init_weights`.
+        block_args : dict or None, extra keyword arguments forwarded to `make_blocks`.
+        """
+        super(ResNet, self).__init__()
+        # None means "no extra block keyword arguments".
+        block_args = block_args or dict()
+        # Only total strides of 8, 16 or 32 are accepted.
+        assert output_stride in (8, 16, 32)
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        # Stem
+        # Substring test, so both 'deep' and 'deep_tiered' select the three-conv stem.
+        deep_stem = 'deep' in stem_type
+        # stem_width only matters for deep stems; the single-conv stem always outputs 64 channels.
+        inplanes = stem_width * 2 if deep_stem else 64
+        if deep_stem:
+            stem_chs = (stem_width, stem_width)
+            if 'tiered' in stem_type:
+                stem_chs = (3 * (stem_width // 4), stem_width)
+            # The last conv has no norm/act inside the Sequential: bn1/act1 below follow it.
+            self.conv1 = nn.Sequential(*[
+                nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False),
+                norm_layer(stem_chs[0]),
+                act_layer(inplace=True),
+                nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False),
+                norm_layer(stem_chs[1]),
+                act_layer(inplace=True),
+                nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)])
+        else:
+            self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = norm_layer(inplanes)
+        self.act1 = act_layer(inplace=True)
+        # First feature_info entry describes the stem output (module 'act1', reduction 2).
+        self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')]
+
+        # Stem pooling. The name 'maxpool' remains for weight compatibility.
+        if replace_stem_pool:
+            # With an aa_layer the conv keeps stride 1 and the aa layer is created with stride=2;
+            # without one the conv itself uses stride 2. filter(None, ...) drops the None placeholder.
+            self.maxpool = nn.Sequential(*filter(None, [
+                nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False),
+                create_aa(aa_layer, channels=inplanes, stride=2) if aa_layer is not None else None,
+                norm_layer(inplanes),
+                act_layer(inplace=True)
+            ]))
+        else:
+            if aa_layer is not None:
+                if issubclass(aa_layer, nn.AvgPool2d):
+                    # An AvgPool2d subclass is used directly as the pooling layer, built with argument 2.
+                    self.maxpool = aa_layer(2)
+                else:
+                    # Stride-1 max-pool followed by the aa layer, which is created with stride=2.
+                    self.maxpool = nn.Sequential(*[
+                        nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
+                        aa_layer(channels=inplanes, stride=2)])
+            else:
+                self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        # Feature Blocks
+        channels = [64, 128, 256, 512]
+        stage_modules, stage_feature_info = make_blocks(
+            block, channels, layers, inplanes, cardinality=cardinality, base_width=base_width,
+            output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down,
+            down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer,
+            drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args)
+        # Each entry is unpacked into add_module — presumably (name, module) pairs; confirm in make_blocks.
+        for stage in stage_modules:
+            self.add_module(*stage)  # layer1, layer2, etc
+        self.feature_info.extend(stage_feature_info)
+
+        # Head (Pooling and Classifier)
+        self.num_features = 512 * block.expansion
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+        # Runs last, after every submodule has been registered.
+        self.init_weights(zero_init_last=zero_init_last)
+
+    @torch.jit.ignore
+    def init_weights(self, zero_init_last=True):
+        for n, m in self.named_modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+        if zero_init_last:
+            for m in self.modules():
+                if hasattr(m, 'zero_init_last'):
+                    m.zero_init_last()
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(stem=r'^conv1|bn1|maxpool', blocks=r'^layer(\d+)' if coarse else r'^layer(\d+)\.(\d+)')
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Set the `grad_checkpointing` flag that `forward_features` consults."""
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self, name_only=False):
+        return 'fc' if name_only else self.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        """Run the stem and the four residual stages; returns the output of `layer4` (no pooling, no classifier)."""
+        # Stem: conv1 -> bn1 -> act1 -> maxpool.
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        x = self.maxpool(x)
+
+        # The checkpointed path is only taken when the flag is set and the model is not being scripted.
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            # NOTE(review): semantics of flatten=True are defined by checkpoint_seq — confirm there.
+            x = checkpoint_seq([self.layer1, self.layer2, self.layer3, self.layer4], x, flatten=True)
+        else:
+            x = self.layer1(x)
+            x = self.layer2(x)
+            x = self.layer3(x)
+            x = self.layer4(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate:
+            x = F.dropout(x, p=float(self.drop_rate), training=self.training)
+        return x if pre_logits else self.fc(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_resnet(variant, pretrained=False, **kwargs):
+    """Forward `ResNet`, the variant name, the pretrained flag and all keyword arguments to `build_model_with_cfg`."""
+    return build_model_with_cfg(ResNet, variant, pretrained, **kwargs)
+
+
+@register_model
+def resnet10t(pretrained=False, **kwargs):
+    """Constructs a ResNet-10-T model.
+    """
+    model_args = dict(
+        block=BasicBlock, layers=[1, 1, 1, 1], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs)
+    return _create_resnet('resnet10t', pretrained, **model_args)
+
+
+@register_model
+def resnet14t(pretrained=False, **kwargs):
+    """Constructs a ResNet-14-T model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[1, 1, 1, 1], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs)
+    return _create_resnet('resnet14t', pretrained, **model_args)
+
+
+@register_model
+def resnet18(pretrained=False, **kwargs):
+    """Constructs a ResNet-18 model.
+    """
+    model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs)
+    return _create_resnet('resnet18', pretrained, **model_args)
+
+
+@register_model
+def resnet18d(pretrained=False, **kwargs):
+    """Constructs a ResNet-18-D model.
+    """
+    model_args = dict(
+        block=BasicBlock, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet18d', pretrained, **model_args)
+
+
+@register_model
+def resnet34(pretrained=False, **kwargs):
+    """Constructs a ResNet-34 model.
+    """
+    model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs)
+    return _create_resnet('resnet34', pretrained, **model_args)
+
+
+@register_model
+def resnet34d(pretrained=False, **kwargs):
+    """Constructs a ResNet-34-D model.
+    """
+    model_args = dict(
+        block=BasicBlock, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet34d', pretrained, **model_args)
+
+
+@register_model
+def resnet26(pretrained=False, **kwargs):
+    """Constructs a ResNet-26 model.
+    """
+    model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], **kwargs)
+    return _create_resnet('resnet26', pretrained, **model_args)
+
+
+@register_model
+def resnet26t(pretrained=False, **kwargs):
+    """Constructs a ResNet-26-T model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs)
+    return _create_resnet('resnet26t', pretrained, **model_args)
+
+
+@register_model
+def resnet26d(pretrained=False, **kwargs):
+    """Constructs a ResNet-26-D model.
+    """
+    model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet26d', pretrained, **model_args)
+
+
+@register_model
+def resnet50(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3],  **kwargs)
+    return _create_resnet('resnet50', pretrained, **model_args)
+
+
+@register_model
+def resnet50d(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet50d', pretrained, **model_args)
+
+
+@register_model
+def resnet50t(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-T model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs)
+    return _create_resnet('resnet50t', pretrained, **model_args)
+
+
+@register_model
+def resnet101(pretrained=False, **kwargs):
+    """Constructs a ResNet-101 model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs)
+    return _create_resnet('resnet101', pretrained, **model_args)
+
+
+@register_model
+def resnet101d(pretrained=False, **kwargs):
+    """Constructs a ResNet-101-D model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet101d', pretrained, **model_args)
+
+
+@register_model
+def resnet152(pretrained=False, **kwargs):
+    """Constructs a ResNet-152 model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs)
+    return _create_resnet('resnet152', pretrained, **model_args)
+
+
+@register_model
+def resnet152d(pretrained=False, **kwargs):
+    """Constructs a ResNet-152-D model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet152d', pretrained, **model_args)
+
+
+@register_model
+def resnet200(pretrained=False, **kwargs):
+    """Constructs a ResNet-200 model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 24, 36, 3], **kwargs)
+    return _create_resnet('resnet200', pretrained, **model_args)
+
+
+@register_model
+def resnet200d(pretrained=False, **kwargs):
+    """Constructs a ResNet-200-D model.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnet200d', pretrained, **model_args)
+
+
+@register_model
+def tv_resnet34(pretrained=False, **kwargs):
+    """Constructs a ResNet-34 model with original Torchvision weights.
+    """
+    model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs)
+    return _create_resnet('tv_resnet34', pretrained, **model_args)
+
+
+@register_model
+def tv_resnet50(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model with original Torchvision weights.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3],  **kwargs)
+    return _create_resnet('tv_resnet50', pretrained, **model_args)
+
+
+@register_model
+def tv_resnet101(pretrained=False, **kwargs):
+    """Constructs a ResNet-101 model w/ Torchvision pretrained weights.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs)
+    return _create_resnet('tv_resnet101', pretrained, **model_args)
+
+
+@register_model
+def tv_resnet152(pretrained=False, **kwargs):
+    """Constructs a ResNet-152 model w/ Torchvision pretrained weights.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs)
+    return _create_resnet('tv_resnet152', pretrained, **model_args)
+
+
+@register_model
+def wide_resnet50_2(pretrained=False, **kwargs):
+    """Constructs a Wide ResNet-50-2 model.
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], base_width=128, **kwargs)
+    return _create_resnet('wide_resnet50_2', pretrained, **model_args)
+
+
+@register_model
+def wide_resnet101_2(pretrained=False, **kwargs):
+    """Constructs a Wide ResNet-101-2 model.
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], base_width=128, **kwargs)
+    return _create_resnet('wide_resnet101_2', pretrained, **model_args)
+
+
+@register_model
+def resnet50_gn(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model w/ GroupNorm
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3],  **kwargs)
+    return _create_resnet('resnet50_gn', pretrained, norm_layer=GroupNorm, **model_args)
+
+
+@register_model
+def resnext50_32x4d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt50-32x4d model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('resnext50_32x4d', pretrained, **model_args)
+
+
+@register_model
+def resnext50d_32x4d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt50d-32x4d model. ResNext50 w/ deep stem & avg pool downsample
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3],  cardinality=32, base_width=4,
+        stem_width=32, stem_type='deep', avg_down=True, **kwargs)
+    return _create_resnet('resnext50d_32x4d', pretrained, **model_args)
+
+
+@register_model
+def resnext101_32x4d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x4d model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('resnext101_32x4d', pretrained, **model_args)
+
+
+@register_model
+def resnext101_32x8d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x8d model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    return _create_resnet('resnext101_32x8d', pretrained, **model_args)
+
+
+@register_model
+def resnext101_64x4d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt101-64x4d model.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, **kwargs)
+    return _create_resnet('resnext101_64x4d', pretrained, **model_args)
+
+
+@register_model
+def tv_resnext50_32x4d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt50-32x4d model with original Torchvision weights.
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('tv_resnext50_32x4d', pretrained, **model_args)
+
+
+@register_model
+def ig_resnext101_32x8d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x8 model pre-trained on weakly-supervised data
+    and finetuned on ImageNet from Figure 5 in
+    `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
+    Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    return _create_resnet('ig_resnext101_32x8d', pretrained, **model_args)
+
+
+@register_model
+def ig_resnext101_32x16d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x16 model pre-trained on weakly-supervised data
+    and finetuned on ImageNet from Figure 5 in
+    `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
+    Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    return _create_resnet('ig_resnext101_32x16d', pretrained, **model_args)
+
+
+@register_model
+def ig_resnext101_32x32d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x32 model pre-trained on weakly-supervised data
+    and finetuned on ImageNet from Figure 5 in
+    `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
+    Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=32, **kwargs)
+    return _create_resnet('ig_resnext101_32x32d', pretrained, **model_args)
+
+
+@register_model
+def ig_resnext101_32x48d(pretrained=False, **kwargs):
+    """Constructs a ResNeXt-101 32x48 model pre-trained on weakly-supervised data
+    and finetuned on ImageNet from Figure 5 in
+    `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
+    Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=48, **kwargs)
+    return _create_resnet('ig_resnext101_32x48d', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnet18(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNet-18 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs)
+    return _create_resnet('ssl_resnet18', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnet50(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNet-50 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3],  **kwargs)
+    return _create_resnet('ssl_resnet50', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnext50_32x4d(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNeXt-50 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('ssl_resnext50_32x4d', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnext101_32x4d(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('ssl_resnext101_32x4d', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnext101_32x8d(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x8 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    return _create_resnet('ssl_resnext101_32x8d', pretrained, **model_args)
+
+
+@register_model
+def ssl_resnext101_32x16d(pretrained=False, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x16 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    return _create_resnet('ssl_resnext101_32x16d', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnet18(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised Resnet-18 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs)
+    return _create_resnet('swsl_resnet18', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnet50(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised ResNet-50 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3],  **kwargs)
+    return _create_resnet('swsl_resnet50', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnext50_32x4d(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-50 32x4 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('swsl_resnext50_32x4d', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnext101_32x4d(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x4 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs)
+    return _create_resnet('swsl_resnext101_32x4d', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnext101_32x8d(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x8 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    return _create_resnet('swsl_resnext101_32x8d', pretrained, **model_args)
+
+
+@register_model
+def swsl_resnext101_32x16d(pretrained=False, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x16 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    return _create_resnet('swsl_resnext101_32x16d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet26t(pretrained=False, **kwargs):
+    """Constructs an ECA-ResNeXt-26-T model.
+    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels
+    in the deep stem and ECA attn.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32,
+        stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet26t', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet50d(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D model with eca.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet50d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet50d_pruned(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D model pruned with eca.
+        The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet50d_pruned', pretrained, pruned=True, **model_args)
+
+
+@register_model
+def ecaresnet50t(pretrained=False, **kwargs):
+    """Constructs an ECA-ResNet-50-T model.
+    Like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels in the deep stem and ECA attn.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32,
+        stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet50t', pretrained, **model_args)
+
+
+@register_model
+def ecaresnetlight(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D light model with eca.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[1, 1, 11, 3], stem_width=32, avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnetlight', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet101d(pretrained=False, **kwargs):
+    """Constructs a ResNet-101-D model with eca.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet101d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet101d_pruned(pretrained=False, **kwargs):
+    """Constructs a ResNet-101-D model pruned with eca.
+       The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet101d_pruned', pretrained, pruned=True, **model_args)
+
+
+@register_model
+def ecaresnet200d(pretrained=False, **kwargs):
+    """Constructs a ResNet-200-D model with ECA.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet200d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnet269d(pretrained=False, **kwargs):
+    """Constructs a ResNet-269-D model with ECA.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnet269d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnext26t_32x4d(pretrained=False, **kwargs):
+    """Constructs an ECA-ResNeXt-26-T model.
+    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels
+    in the deep stem. This model replaces SE module with the ECA module
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32,
+        stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnext26t_32x4d', pretrained, **model_args)
+
+
+@register_model
+def ecaresnext50t_32x4d(pretrained=False, **kwargs):
+    """Constructs an ECA-ResNeXt-50-T model.
+    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels
+    in the deep stem. This model replaces SE module with the ECA module
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32,
+        stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs)
+    return _create_resnet('ecaresnext50t_32x4d', pretrained, **model_args)
+
+
+@register_model
+def seresnet18(pretrained=False, **kwargs):
+    model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet18', pretrained, **model_args)
+
+
+@register_model
+def seresnet34(pretrained=False, **kwargs):
+    model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet34', pretrained, **model_args)
+
+
+@register_model
+def seresnet50(pretrained=False, **kwargs):
+    model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet50', pretrained, **model_args)
+
+
+@register_model
+def seresnet50t(pretrained=False, **kwargs):
+    model_args = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3],  stem_width=32, stem_type='deep_tiered', avg_down=True,
+        block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet50t', pretrained, **model_args)
+
+
+@register_model
+def seresnet101(pretrained=False, **kwargs):
+    model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet101', pretrained, **model_args)
+
+
+@register_model
+def seresnet152(pretrained=False, **kwargs):
+    model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet152', pretrained, **model_args)
+
+
+@register_model
+def seresnet152d(pretrained=False, **kwargs):
+    model_args = dict(
+        block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet152d', pretrained, **model_args)
+
+
+@register_model
+def seresnet200d(pretrained=False, **kwargs):
+    """Constructs a ResNet-200-D model with SE attn.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet200d', pretrained, **model_args)
+
+
+@register_model
+def seresnet269d(pretrained=False, **kwargs):
+    """Constructs a ResNet-269-D model with SE attn.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnet269d', pretrained, **model_args)
+
+
+@register_model
+def seresnext26d_32x4d(pretrained=False, **kwargs):
+    """Constructs a SE-ResNeXt-26-D model.`
+    This is technically a 28 layer ResNet, using the 'D' modifier from Gluon / bag-of-tricks for
+    combination of deep stem and avg_pool in downsample.
+    """
+    model_args = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32,
+        stem_type='deep', avg_down=True, block_args=dict(attn_layer='se'), **kwargs)
+    return _create_resnet('seresnext26d_32x4d', pretrained, **model_args)
+
+
+@register_model
+def seresnext26t_32x4d(pretrained=False, **kwargs):
+    """Constructs a SE-ResNeXt-26-T model.
+
+    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels
+    in the deep stem.
+
+    Caller kwargs are merged with the fixed settings below and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32,
+        stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnext26t_32x4d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnext26tn_32x4d(pretrained=False, **kwargs):
+    """Constructs a SE-ResNeXt-26-T model.
+
+    NOTE I deprecated previous 't' model defs and replaced 't' with 'tn', this was the only tn model of note
+    so keeping this def for backwards compat with any uses out there. Old 't' model is lost.
+
+    Pure alias: every argument is handed unchanged to `seresnext26t_32x4d`.
+    """
+    # 'pretrained' is a named parameter, so it can never already be present in kwargs.
+    forwarded = dict(kwargs, pretrained=pretrained)
+    return seresnext26t_32x4d(**forwarded)
+
+
+@register_model
+def seresnext50_32x4d(pretrained=False, **kwargs):
+    """Build the 'seresnext50_32x4d' variant.
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], cardinality 32, base width 4, 'se' block
+    attention. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4,
+        block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnext50_32x4d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnext101_32x4d(pretrained=False, **kwargs):
+    """Build the 'seresnext101_32x4d' variant.
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], cardinality 32, base width 4, 'se' block
+    attention. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4,
+        block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnext101_32x4d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnext101_32x8d(pretrained=False, **kwargs):
+    """Build the 'seresnext101_32x8d' variant.
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], cardinality 32, base width 8, 'se' block
+    attention. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8,
+        block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnext101_32x8d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnext101d_32x8d(pretrained=False, **kwargs):
+    """Build the 'seresnext101d_32x8d' variant.
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], cardinality 32, base width 8, deep stem of
+    width 32, avg_down, 'se' block attention. Caller kwargs are merged with them and forwarded to
+    `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8,
+        stem_width=32, stem_type='deep', avg_down=True,
+        block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnext101d_32x8d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def senet154(pretrained=False, **kwargs):
+    """Build the 'senet154' variant.
+
+    Fixed settings: Bottleneck blocks, layers [3, 8, 36, 3], cardinality 64, base width 4, deep stem,
+    down_kernel_size 3, block_reduce_first 2, 'se' block attention. Caller kwargs are merged with them
+    and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep',
+        down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer='se'))
+    return _create_resnet('senet154', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetblur18(pretrained=False, **kwargs):
+    """Constructs a ResNet-18 model with blur anti-aliasing
+
+    Fixed settings: BasicBlock blocks, layers [2, 2, 2, 2], BlurPool2d as the anti-aliasing layer.
+    Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(block=BasicBlock, layers=[2, 2, 2, 2], aa_layer=BlurPool2d)
+    return _create_resnet('resnetblur18', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetblur50(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model with blur anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], BlurPool2d as the anti-aliasing layer.
+    Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=BlurPool2d)
+    return _create_resnet('resnetblur50', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetblur50d(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D model with blur anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], BlurPool2d anti-aliasing, deep stem of
+    width 32, avg_down. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=BlurPool2d,
+        stem_width=32, stem_type='deep', avg_down=True)
+    return _create_resnet('resnetblur50d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetblur101d(pretrained=False, **kwargs):
+    """Constructs a ResNet-101-D model with blur anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], BlurPool2d anti-aliasing, deep stem of
+    width 32, avg_down. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], aa_layer=BlurPool2d,
+        stem_width=32, stem_type='deep', avg_down=True)
+    return _create_resnet('resnetblur101d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetaa50(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model with avgpool anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], nn.AvgPool2d as the anti-aliasing layer.
+    Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d)
+    return _create_resnet('resnetaa50', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetaa50d(pretrained=False, **kwargs):
+    """Constructs a ResNet-50-D model with avgpool anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], nn.AvgPool2d anti-aliasing, deep stem of
+    width 32, avg_down. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d,
+        stem_width=32, stem_type='deep', avg_down=True)
+    return _create_resnet('resnetaa50d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetaa101d(pretrained=False, **kwargs):
+    """Constructs a ResNet-101-D model with avgpool anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], nn.AvgPool2d anti-aliasing, deep stem of
+    width 32, avg_down. Caller kwargs are merged with them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], aa_layer=nn.AvgPool2d,
+        stem_width=32, stem_type='deep', avg_down=True)
+    return _create_resnet('resnetaa101d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnetaa50d(pretrained=False, **kwargs):
+    """Constructs a SE-ResNet-50-D model with avgpool anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], nn.AvgPool2d anti-aliasing, deep stem of
+    width 32, avg_down, 'se' block attention. Caller kwargs are merged with them and forwarded to
+    `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d,
+        stem_width=32, stem_type='deep', avg_down=True, block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnetaa50d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def seresnextaa101d_32x8d(pretrained=False, **kwargs):
+    """Constructs a SE-ResNeXt-101-D 32x8d model with avgpool anti-aliasing
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], cardinality 32, base width 8, deep stem of
+    width 32, avg_down, nn.AvgPool2d anti-aliasing, 'se' block attention. Caller kwargs are merged with
+    them and forwarded to `_create_resnet`.
+    """
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8,
+        stem_width=32, stem_type='deep', avg_down=True, aa_layer=nn.AvgPool2d,
+        block_args=dict(attn_layer='se'))
+    return _create_resnet('seresnextaa101d_32x8d', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs50(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-50 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 6, 3], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs50', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs101(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-101 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [3, 4, 23, 3], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs101', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs152(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-152 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [3, 8, 36, 3], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs152', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs200(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-200 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [3, 24, 36, 3], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs200', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs270(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-270 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [4, 29, 53, 4], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs270', pretrained, **dict(**preset, **kwargs))
+
+
+
+@register_model
+def resnetrs350(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-350 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [4, 36, 72, 4], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs350', pretrained, **dict(**preset, **kwargs))
+
+
+@register_model
+def resnetrs420(pretrained=False, **kwargs):
+    """Constructs a ResNet-RS-420 model.
+
+    Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579
+    Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs
+
+    Fixed settings: Bottleneck blocks, layers [4, 44, 87, 4], deep stem of width 32 with the stem pool
+    replaced, avg_down, and 'se' attention bound to rd_ratio=0.25. Caller kwargs are merged in.
+    """
+    se_quarter = partial(get_attn('se'), rd_ratio=0.25)
+    preset = dict(
+        block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', replace_stem_pool=True,
+        avg_down=True, block_args=dict(attn_layer=se_quarter))
+    return _create_resnet('resnetrs420', pretrained, **dict(**preset, **kwargs))
diff --git a/src/custom_timm/models/resnetv2.py b/src/custom_timm/models/resnetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85677a479f75779da8edb2d112a29fd744b6e7b
--- /dev/null
+++ b/src/custom_timm/models/resnetv2.py
@@ -0,0 +1,708 @@
+"""Pre-Activation ResNet v2 with GroupNorm and Weight Standardization.
+
+A PyTorch implementation of ResNetV2 adapted from the Google Big-Transfer (BiT) source code
+at https://github.com/google-research/big_transfer to match timm interfaces. The BiT weights have
+been included here as pretrained models from their original .NPZ checkpoints.
+
+Additionally, supports non pre-activation bottleneck for use as a backbone for Vision Transformers (ViT) and
+extra padding support to allow porting of official Hybrid ResNet pretrained weights from
+https://github.com/google-research/vision_transformer
+
+Thanks to the Google team for the above two repositories and associated papers:
+* Big Transfer (BiT): General Visual Representation Learning - https://arxiv.org/abs/1912.11370
+* An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929
+* Knowledge distillation: A good teacher is patient and consistent - https://arxiv.org/abs/2106.05237
+
+Original copyright of Google code below, modifications by Ross Wightman, Copyright 2020.
+"""
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict  # pylint: disable=g-importing-member
+
+import torch
+import torch.nn as nn
+from functools import partial
+
+from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from .helpers import build_model_with_cfg, named_apply, adapt_input_conv, checkpoint_seq
+from .registry import register_model
+from .layers import GroupNormAct, BatchNormAct2d, EvoNorm2dB0, EvoNorm2dS0, EvoNorm2dS1, FilterResponseNormTlu2d,\
+    ClassifierHead, DropPath, AvgPool2dSame, create_pool2d, StdConv2d, create_conv2d
+
+
+def _cfg(url='', **kwargs):
+    """Return a pretrained-config dict: the defaults below, with any `kwargs` overriding or extending them.
+
+    Overridden keys keep their default position in the dict; new keys are appended after the defaults.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bilinear',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        first_conv='stem.conv', classifier='head.fc',
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained-config table keyed by model variant name. Each entry starts from the _cfg() defaults and
+# overrides only what differs; entries without a `url` list no weight file here.
+default_cfgs = {
+    # pretrained on imagenet21k, finetuned on imagenet1k
+    'resnetv2_50x1_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R50x1-ILSVRC2012.npz',
+        input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0),
+    'resnetv2_50x3_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R50x3-ILSVRC2012.npz',
+        input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0),
+    'resnetv2_101x1_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R101x1-ILSVRC2012.npz',
+        input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0),
+    'resnetv2_101x3_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R101x3-ILSVRC2012.npz',
+        input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0),
+    'resnetv2_152x2_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R152x2-ILSVRC2012.npz',
+        input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0),
+    'resnetv2_152x4_bitm': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R152x4-ILSVRC2012.npz',
+        input_size=(3, 480, 480), pool_size=(15, 15), crop_pct=1.0),  # only one at 480x480?
+
+    # trained on imagenet-21k
+    # (default 224x224 input; only the classifier size is overridden)
+    'resnetv2_50x1_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R50x1.npz',
+        num_classes=21843),
+    'resnetv2_50x3_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R50x3.npz',
+        num_classes=21843),
+    'resnetv2_101x1_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R101x1.npz',
+        num_classes=21843),
+    'resnetv2_101x3_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R101x3.npz',
+        num_classes=21843),
+    'resnetv2_152x2_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R152x2.npz',
+        num_classes=21843),
+    'resnetv2_152x4_bitm_in21k': _cfg(
+        url='https://storage.googleapis.com/bit_models/BiT-M-R152x4.npz',
+        num_classes=21843),
+
+    # checkpoints under the bit_models 'distill/' path; all use bicubic interpolation
+    'resnetv2_50x1_bit_distilled': _cfg(
+        url='https://storage.googleapis.com/bit_models/distill/R50x1_224.npz',
+        interpolation='bicubic'),
+    'resnetv2_152x2_bit_teacher': _cfg(
+        url='https://storage.googleapis.com/bit_models/distill/R152x2_T_224.npz',
+        interpolation='bicubic'),
+    'resnetv2_152x2_bit_teacher_384': _cfg(
+        url='https://storage.googleapis.com/bit_models/distill/R152x2_T_384.npz',
+        input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, interpolation='bicubic'),
+
+    # non-BiT variants; the 'd' / 't' entries name 'stem.conv1' as their first conv
+    'resnetv2_50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetv2_50_a1h-000cdf49.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnetv2_50d': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+    'resnetv2_50t': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+    'resnetv2_101': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetv2_101_a1h-5d01f016.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnetv2_101d': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+    'resnetv2_152': _cfg(
+        interpolation='bicubic'),
+    'resnetv2_152d': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+
+    # 50d variants; the suffixes (gn / evob / evos / frn) presumably name the norm layer used by the
+    # matching model definitions -- NOTE(review): confirm against those definitions
+    'resnetv2_50d_gn': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetv2_50d_gn_ah-c415c11a.pth',
+        interpolation='bicubic', first_conv='stem.conv1', test_input_size=(3, 288, 288), crop_pct=0.95),
+    'resnetv2_50d_evob': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+    'resnetv2_50d_evos': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetv2_50d_evos_ah-7c4dd548.pth',
+        interpolation='bicubic', first_conv='stem.conv1', test_input_size=(3, 288, 288), crop_pct=0.95),
+    'resnetv2_50d_frn': _cfg(
+        interpolation='bicubic', first_conv='stem.conv1'),
+}
+
+
+def make_div(v, divisor=8):
+    """Round `v` to the nearest multiple of `divisor`, never returning less than `divisor`.
+
+    If rounding lands more than 10% below `v`, one extra `divisor` is added.
+    """
+    floor = divisor
+    snapped = int(v + divisor / 2) // divisor * divisor
+    snapped = max(floor, snapped)
+    return snapped + divisor if snapped < 0.9 * v else snapped
+
+
+class PreActBottleneck(nn.Module):
+    """Pre-activation (v2) bottleneck block.
+
+    Follows the implementation of "Identity Mappings in Deep Residual Networks":
+    https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua
+
+    Except it puts the stride on 3x3 conv when available.
+
+    Layout: norm1 -> conv1 (1x1) -> norm2 -> conv2 (3x3, strided) -> norm3 -> conv3 (1x1). The result is
+    added to a shortcut that is the raw input, or `downsample(norm1(x))` when a `proj_layer` is given.
+    """
+
+    def __init__(
+            self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1,
+            act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.):
+        super().__init__()
+        # Resolve unset arguments to their defaults (act_layer is accepted but not used by this block).
+        if not first_dilation:
+            first_dilation = dilation
+        if not conv_layer:
+            conv_layer = StdConv2d
+        if not norm_layer:
+            norm_layer = partial(GroupNormAct, num_groups=32)
+        if not out_chs:
+            out_chs = in_chs
+        bottleneck_chs = make_div(out_chs * bottle_ratio)
+
+        # A projection shortcut exists only when the caller supplies a proj_layer factory.
+        if proj_layer is None:
+            self.downsample = None
+        else:
+            self.downsample = proj_layer(
+                in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation, preact=True,
+                conv_layer=conv_layer, norm_layer=norm_layer)
+
+        self.norm1 = norm_layer(in_chs)
+        self.conv1 = conv_layer(in_chs, bottleneck_chs, 1)
+        self.norm2 = norm_layer(bottleneck_chs)
+        self.conv2 = conv_layer(
+            bottleneck_chs, bottleneck_chs, 3, stride=stride, dilation=first_dilation, groups=groups)
+        self.norm3 = norm_layer(bottleneck_chs)
+        self.conv3 = conv_layer(bottleneck_chs, out_chs, 1)
+        if drop_path_rate > 0:
+            self.drop_path = DropPath(drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+
+    def zero_init_last(self):
+        """Zero the weight of the final 1x1 conv."""
+        nn.init.zeros_(self.conv3.weight)
+
+    def forward(self, x):
+        preact = self.norm1(x)
+
+        # shortcut: identity, or a projection of the pre-activated input
+        identity = x
+        if self.downsample is not None:
+            identity = self.downsample(preact)
+
+        # residual: each later conv consumes the output of its own norm
+        out = self.conv1(preact)
+        out = self.norm2(out)
+        out = self.conv2(out)
+        out = self.norm3(out)
+        out = self.conv3(out)
+        out = self.drop_path(out)
+        return out + identity
+
+
+class Bottleneck(nn.Module):
+    """Non Pre-activation bottleneck block, equiv to V1.5/V1b Bottleneck. Used for ViT.
+
+    Layout: conv1 (1x1) -> norm1 -> conv2 (3x3, strided) -> norm2 -> conv3 (1x1) -> norm3 (no activation),
+    then the shortcut is added and the final activation applied.
+    """
+    def __init__(
+            self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1,
+            act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.):
+        super().__init__()
+        # Resolve unset arguments to their defaults.
+        if not first_dilation:
+            first_dilation = dilation
+        if not act_layer:
+            act_layer = nn.ReLU
+        if not conv_layer:
+            conv_layer = StdConv2d
+        if not norm_layer:
+            norm_layer = partial(GroupNormAct, num_groups=32)
+        if not out_chs:
+            out_chs = in_chs
+        bottleneck_chs = make_div(out_chs * bottle_ratio)
+
+        # A projection shortcut exists only when the caller supplies a proj_layer factory.
+        if proj_layer is None:
+            self.downsample = None
+        else:
+            self.downsample = proj_layer(
+                in_chs, out_chs, stride=stride, dilation=dilation, preact=False,
+                conv_layer=conv_layer, norm_layer=norm_layer)
+
+        self.conv1 = conv_layer(in_chs, bottleneck_chs, 1)
+        self.norm1 = norm_layer(bottleneck_chs)
+        self.conv2 = conv_layer(
+            bottleneck_chs, bottleneck_chs, 3, stride=stride, dilation=first_dilation, groups=groups)
+        self.norm2 = norm_layer(bottleneck_chs)
+        self.conv3 = conv_layer(bottleneck_chs, out_chs, 1)
+        self.norm3 = norm_layer(out_chs, apply_act=False)
+        if drop_path_rate > 0:
+            self.drop_path = DropPath(drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+        self.act3 = act_layer(inplace=True)
+
+    def zero_init_last(self):
+        """Zero the weight of the last norm layer."""
+        nn.init.zeros_(self.norm3.weight)
+
+    def forward(self, x):
+        # shortcut: identity, or a projection of the raw input
+        identity = x
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        # residual
+        out = self.norm1(self.conv1(x))
+        out = self.norm2(self.conv2(out))
+        out = self.norm3(self.conv3(out))
+        out = self.drop_path(out)
+        return self.act3(out + identity)
+
+
+class DownsampleConv(nn.Module):
+    """Shortcut projection: a 1x1 conv carrying the stride, followed by a norm without activation.
+
+    With `preact=True` the norm is replaced by an identity. `dilation` and `first_dilation` are accepted
+    but not used by this layer.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, preact=True,
+            conv_layer=None, norm_layer=None):
+        super().__init__()
+        self.conv = conv_layer(in_chs, out_chs, 1, stride=stride)
+        if preact:
+            self.norm = nn.Identity()
+        else:
+            self.norm = norm_layer(out_chs, apply_act=False)
+
+    def forward(self, x):
+        projected = self.conv(x)
+        return self.norm(projected)
+
+
+class DownsampleAvg(nn.Module):
+    """Shortcut projection that pools before a stride-1 1x1 conv, then applies a norm without activation.
+
+    With `preact=True` the norm is replaced by an identity; with stride 1 and dilation 1 the pool is too.
+    """
+
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None,
+            preact=True, conv_layer=None, norm_layer=None):
+        """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment."""
+        super().__init__()
+        # When dilated, the pool keeps stride 1 so resolution is not reduced here.
+        pool_stride = 1 if dilation != 1 else stride
+        needs_pool = stride > 1 or dilation > 1
+        if not needs_pool:
+            self.pool = nn.Identity()
+        else:
+            use_same_padding = pool_stride == 1 and dilation > 1
+            pool_cls = AvgPool2dSame if use_same_padding else nn.AvgPool2d
+            self.pool = pool_cls(2, pool_stride, ceil_mode=True, count_include_pad=False)
+        self.conv = conv_layer(in_chs, out_chs, 1, stride=1)
+        if preact:
+            self.norm = nn.Identity()
+        else:
+            self.norm = norm_layer(out_chs, apply_act=False)
+
+    def forward(self, x):
+        pooled = self.pool(x)
+        return self.norm(self.conv(pooled))
+
+
+class ResNetStage(nn.Module):
+    """ResNet Stage.
+
+    A sequence of `depth` blocks built with `block_fn`. Only the first block gets the stage stride, the
+    input channel count and a projection shortcut (DownsampleAvg if `avg_down` else DownsampleConv);
+    every later block runs at stride 1 on `out_chs` channels with no projection.
+    """
+    def __init__(
+            self, in_chs, out_chs, stride, dilation, depth, bottle_ratio=0.25, groups=1,
+            avg_down=False, block_dpr=None, block_fn=PreActBottleneck,
+            act_layer=None, conv_layer=None, norm_layer=None, **block_kwargs):
+        super().__init__()
+        lead_dilation = 1 if dilation in (1, 2) else 2
+        shortcut_cls = DownsampleAvg if avg_down else DownsampleConv
+        shared_layers = dict(act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer)
+
+        self.blocks = nn.Sequential()
+        for idx in range(depth):
+            is_first = idx == 0
+            block = block_fn(
+                in_chs if is_first else out_chs, out_chs,
+                stride=stride if is_first else 1, dilation=dilation, bottle_ratio=bottle_ratio, groups=groups,
+                first_dilation=lead_dilation if is_first else dilation,
+                proj_layer=shortcut_cls if is_first else None,
+                drop_path_rate=block_dpr[idx] if block_dpr else 0.,
+                **shared_layers, **block_kwargs)
+            self.blocks.add_module(str(idx), block)
+
+    def forward(self, x):
+        return self.blocks(x)
+
+
+def is_stem_deep(stem_type):
+    """Return True when `stem_type` contains 'deep' or 'tiered'."""
+    return 'deep' in stem_type or 'tiered' in stem_type
+
+
+def create_resnetv2_stem(
+        in_chs, out_chs=64, stem_type='', preact=True,
+        conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32)):
+    """Build the stem as an `nn.Sequential` of named layers.
+
+    Deep stem types stack three 3x3 convs (`conv1`..`conv3`, only the first strided); other types use one
+    strided 7x7 conv (`conv`). A trailing norm is added only when `preact` is False. The stem always ends
+    in a stride-2 3x3 max pool whose padding scheme depends on `stem_type`.
+    """
+    modules = OrderedDict()
+    assert stem_type in ('', 'fixed', 'same', 'deep', 'deep_fixed', 'deep_same', 'tiered')
+
+    # NOTE conv padding mode can be changed by overriding the conv_layer def
+    if not is_stem_deep(stem_type):
+        # The usual 7x7 stem conv
+        modules['conv'] = conv_layer(in_chs, out_chs, kernel_size=7, stride=2)
+        if not preact:
+            modules['norm'] = norm_layer(out_chs)
+    else:
+        # A 3 deep 3x3  conv stack as in ResNet V1D models
+        half_chs = out_chs // 2
+        # 'T' resnets in resnet.py use a narrower first conv; 'D' ResNets use half width for both
+        first_chs = 3 * out_chs // 8 if 'tiered' in stem_type else half_chs
+        modules['conv1'] = conv_layer(in_chs, first_chs, kernel_size=3, stride=2)
+        modules['norm1'] = norm_layer(first_chs)
+        modules['conv2'] = conv_layer(first_chs, half_chs, kernel_size=3, stride=1)
+        modules['norm2'] = norm_layer(half_chs)
+        modules['conv3'] = conv_layer(half_chs, out_chs, kernel_size=3, stride=1)
+        if not preact:
+            modules['norm3'] = norm_layer(out_chs)
+
+    if 'fixed' in stem_type:
+        # 'fixed' SAME padding approximation that is used in BiT models
+        modules['pad'] = nn.ConstantPad2d(1, 0.)
+        modules['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=0)
+    elif 'same' in stem_type:
+        # full, input size based 'SAME' padding, used in ViT Hybrid model
+        modules['pool'] = create_pool2d('max', kernel_size=3, stride=2, padding='same')
+    else:
+        # the usual PyTorch symmetric padding
+        modules['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    return nn.Sequential(modules)
+
+
+class ResNetV2(nn.Module):
+    """Implementation of Pre-activation (v2) ResNet model.
+
+    Structure: stem -> stages (one ResNetStage per entry of `layers`) -> final norm (pre-activation
+    only) -> ClassifierHead with a conv classifier. `feature_info` records the channel count, reduction
+    and module name of the stem and of every stage.
+    """
+
+    def __init__(
+            self, layers, channels=(256, 512, 1024, 2048),
+            num_classes=1000, in_chans=3, global_pool='avg', output_stride=32,
+            width_factor=1, stem_chs=64, stem_type='', avg_down=False, preact=True,
+            act_layer=nn.ReLU, conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32),
+            drop_rate=0., drop_path_rate=0., zero_init_last=False):
+        """
+        Args:
+            layers: number of blocks in each stage.
+            channels: per-stage output channels before scaling by `width_factor`.
+            num_classes: classifier output size passed to the head.
+            in_chans: number of input channels for the stem.
+            global_pool: pooling type passed to the head.
+            output_stride: once the running stride reaches this value, later stages dilate instead of stride.
+            width_factor: multiplier applied (through `make_div`) to stem and stage channel counts.
+            stem_chs: stem output channels before scaling by `width_factor`.
+            stem_type: stem layout, see `create_resnetv2_stem`.
+            avg_down: forwarded to every ResNetStage.
+            preact: use PreActBottleneck blocks and a final norm if True, else Bottleneck and no final norm.
+            act_layer, conv_layer, norm_layer: layer factories forwarded to the stem and stages.
+            drop_rate: dropout rate passed to the head.
+            drop_path_rate: final drop-path rate; per-block rates are spaced linearly from 0 to this value.
+            zero_init_last: forwarded to `init_weights`.
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        wf = width_factor
+
+        self.feature_info = []
+        stem_chs = make_div(stem_chs * wf)
+        self.stem = create_resnetv2_stem(
+            in_chans, stem_chs, stem_type, preact, conv_layer=conv_layer, norm_layer=norm_layer)
+        # Name the last stem layer before the pool. A non-preact stem ends in a norm, which the deep
+        # stem registers as 'norm3' and the 7x7 stem as 'norm'.
+        deep_stem = is_stem_deep(stem_type)
+        if preact:
+            stem_feat = 'stem.conv3' if deep_stem else 'stem.conv'
+        else:
+            stem_feat = 'stem.norm3' if deep_stem else 'stem.norm'
+        self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=stem_feat))
+
+        prev_chs = stem_chs
+        curr_stride = 4
+        dilation = 1
+        # one list of drop-path rates per stage, increasing linearly over all blocks
+        block_dprs = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(layers)).split(layers)]
+        block_fn = PreActBottleneck if preact else Bottleneck
+        self.stages = nn.Sequential()
+        for stage_idx, (d, c, bdpr) in enumerate(zip(layers, channels, block_dprs)):
+            out_chs = make_div(c * wf)
+            stride = 1 if stage_idx == 0 else 2
+            if curr_stride >= output_stride:
+                # target stride reached: trade the stride for dilation so resolution is kept
+                dilation *= stride
+                stride = 1
+            stage = ResNetStage(
+                prev_chs, out_chs, stride=stride, dilation=dilation, depth=d, avg_down=avg_down,
+                act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer, block_dpr=bdpr, block_fn=block_fn)
+            prev_chs = out_chs
+            curr_stride *= stride
+            self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{stage_idx}')]
+            self.stages.add_module(str(stage_idx), stage)
+
+        self.num_features = prev_chs
+        self.norm = norm_layer(self.num_features) if preact else nn.Identity()
+        self.head = ClassifierHead(
+            self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True)
+
+        self.init_weights(zero_init_last=zero_init_last)
+        self.grad_checkpointing = False
+
+    @torch.jit.ignore
+    def init_weights(self, zero_init_last=True):
+        """Apply `_init_weights` to every submodule of the model."""
+        named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
+
+    @torch.jit.ignore()
+    def load_pretrained(self, checkpoint_path, prefix='resnet/'):
+        """Load weights from `checkpoint_path` via `_load_weights`, using `prefix` for checkpoint keys."""
+        _load_weights(self, checkpoint_path, prefix)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns that group parameters by stem and by stage (coarse) or block."""
+        matcher = dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
+                (r'^norm', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Turn gradient checkpointing of the stages on or off."""
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classifier layer of the head."""
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """Replace the head with a new ClassifierHead for `num_classes` outputs."""
+        self.num_classes = num_classes
+        self.head = ClassifierHead(
+            self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True)
+
+    def forward_features(self, x):
+        """Run stem, stages and final norm; returns the feature map fed to the head."""
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.stages, x, flatten=True)
+        else:
+            x = self.stages(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Run the classifier head; `pre_logits` is passed straight through to it."""
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _init_weights(module: nn.Module, name: str = '', zero_init_last=True):
+    """Initialise one module in place, chosen by its type.
+
+    * Linear, or a Conv2d whose name contains 'head.fc': normal(0, 0.01) weight, zero bias.
+    * any other Conv2d: Kaiming-normal weight (fan_out, relu), zero bias when present.
+    * BatchNorm2d / LayerNorm / GroupNorm: weight of ones, zero bias.
+    * anything else exposing `zero_init_last`: that hook is called when `zero_init_last` is True.
+    """
+    is_conv = isinstance(module, nn.Conv2d)
+
+    if isinstance(module, nn.Linear) or ('head.fc' in name and is_conv):
+        nn.init.normal_(module.weight, mean=0.0, std=0.01)
+        nn.init.zeros_(module.bias)
+        return
+
+    if is_conv:
+        nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+        return
+
+    if isinstance(module, (nn.BatchNorm2d, nn.LayerNorm, nn.GroupNorm)):
+        nn.init.ones_(module.weight)
+        nn.init.zeros_(module.bias)
+        return
+
+    if zero_init_last and hasattr(module, 'zero_init_last'):
+        module.zero_init_last()
+
+
+@torch.no_grad()
+def _load_weights(model: nn.Module, checkpoint_path: str, prefix: str = 'resnet/'):
+    """Copy the arrays of a numpy checkpoint at `checkpoint_path` into `model` in place.
+
+    Checkpoint keys are looked up under `prefix`. The stem conv, final norm, every block's convs and
+    norms, and any block projection conv are always copied. The classifier is copied only when the
+    head has a Conv2d `fc` whose output size matches the checkpoint's head kernel.
+
+    The checkpoint archive is closed before returning.
+    """
+    import numpy as np
+
+    def t2p(conv_weights):
+        """Possibly convert HWIO to OIHW."""
+        if conv_weights.ndim == 4:
+            conv_weights = conv_weights.transpose([3, 2, 0, 1])
+        return torch.from_numpy(conv_weights)
+
+    # np.load on an archive returns an NpzFile holding an open file handle; the context manager
+    # releases it once every tensor has been copied into the model.
+    with np.load(checkpoint_path) as weights:
+        stem_conv_w = adapt_input_conv(
+            model.stem.conv.weight.shape[1], t2p(weights[f'{prefix}root_block/standardized_conv2d/kernel']))
+        model.stem.conv.weight.copy_(stem_conv_w)
+        model.norm.weight.copy_(t2p(weights[f'{prefix}group_norm/gamma']))
+        model.norm.bias.copy_(t2p(weights[f'{prefix}group_norm/beta']))
+        # Skip the classifier when its size does not match the checkpoint.
+        if isinstance(getattr(model.head, 'fc', None), nn.Conv2d) and \
+                model.head.fc.weight.shape[0] == weights[f'{prefix}head/conv2d/kernel'].shape[-1]:
+            model.head.fc.weight.copy_(t2p(weights[f'{prefix}head/conv2d/kernel']))
+            model.head.fc.bias.copy_(t2p(weights[f'{prefix}head/conv2d/bias']))
+        for i, stage in enumerate(model.stages.children()):
+            for j, block in enumerate(stage.blocks.children()):
+                cname = 'standardized_conv2d'
+                # checkpoint blocks/units are 1-based, units zero-padded to two digits
+                block_prefix = f'{prefix}block{i + 1}/unit{j + 1:02d}/'
+                block.conv1.weight.copy_(t2p(weights[f'{block_prefix}a/{cname}/kernel']))
+                block.conv2.weight.copy_(t2p(weights[f'{block_prefix}b/{cname}/kernel']))
+                block.conv3.weight.copy_(t2p(weights[f'{block_prefix}c/{cname}/kernel']))
+                block.norm1.weight.copy_(t2p(weights[f'{block_prefix}a/group_norm/gamma']))
+                block.norm2.weight.copy_(t2p(weights[f'{block_prefix}b/group_norm/gamma']))
+                block.norm3.weight.copy_(t2p(weights[f'{block_prefix}c/group_norm/gamma']))
+                block.norm1.bias.copy_(t2p(weights[f'{block_prefix}a/group_norm/beta']))
+                block.norm2.bias.copy_(t2p(weights[f'{block_prefix}b/group_norm/beta']))
+                block.norm3.bias.copy_(t2p(weights[f'{block_prefix}c/group_norm/beta']))
+                if block.downsample is not None:
+                    w = weights[f'{block_prefix}a/proj/{cname}/kernel']
+                    block.downsample.conv.weight.copy_(t2p(w))
+
+
+def _create_resnetv2(variant, pretrained=False, **kwargs):
+    # Variants whose name contains '_bit' are flagged for the custom pretrained-loading path.
+    return build_model_with_cfg(
+        ResNetV2, variant, pretrained,
+        feature_cfg=dict(flatten_sequential=True),
+        pretrained_custom_load='_bit' in variant,
+        **kwargs)
+
+
+def _create_resnetv2_bit(variant, pretrained=False, **kwargs):
+    bit_args = dict(stem_type='fixed', conv_layer=partial(StdConv2d, eps=1e-8))  # applied to every model built here
+    return _create_resnetv2(variant, pretrained=pretrained, **bit_args, **kwargs)
+
+
+@register_model
+def resnetv2_50x1_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 4, 6, 3], width_factor=1)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_50x1_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50x3_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 4, 6, 3], width_factor=3)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_50x3_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_101x1_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 4, 23, 3], width_factor=1)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_101x1_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_101x3_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 4, 23, 3], width_factor=3)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_101x3_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x2_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 8, 36, 3], width_factor=2)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_152x2_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x4_bitm(pretrained=False, **kwargs):
+    model_args = dict(layers=[3, 8, 36, 3], width_factor=4)  # fixed architecture arguments for this variant
+    return _create_resnetv2_bit('resnetv2_152x4_bitm', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50x1_bitm_in21k(pretrained=False, **kwargs):
+    num_classes = kwargs.pop('num_classes', 21843)  # 21843 unless the caller supplies num_classes
+    model_args = dict(num_classes=num_classes, layers=[3, 4, 6, 3], width_factor=1)
+    return _create_resnetv2_bit('resnetv2_50x1_bitm_in21k', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50x3_bitm_in21k(pretrained=False, **kwargs):
+    num_classes = kwargs.pop('num_classes', 21843)  # 21843 unless the caller supplies num_classes
+    model_args = dict(num_classes=num_classes, layers=[3, 4, 6, 3], width_factor=3)
+    return _create_resnetv2_bit('resnetv2_50x3_bitm_in21k', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_101x1_bitm_in21k(pretrained=False, **kwargs):
+    return _create_resnetv2_bit(  # BiT helper (fixed stem, StdConv2d eps=1e-8), matching the sibling bitm variants
+        'resnetv2_101x1_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843),
+        layers=[3, 4, 23, 3], width_factor=1, **kwargs)
+
+
+@register_model
+def resnetv2_101x3_bitm_in21k(pretrained=False, **kwargs):
+    num_classes = kwargs.pop('num_classes', 21843)  # 21843 unless the caller supplies num_classes
+    model_args = dict(num_classes=num_classes, layers=[3, 4, 23, 3], width_factor=3)
+    return _create_resnetv2_bit('resnetv2_101x3_bitm_in21k', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x2_bitm_in21k(pretrained=False, **kwargs):
+    num_classes = kwargs.pop('num_classes', 21843)  # 21843 unless the caller supplies num_classes
+    model_args = dict(num_classes=num_classes, layers=[3, 8, 36, 3], width_factor=2)
+    return _create_resnetv2_bit('resnetv2_152x2_bitm_in21k', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x4_bitm_in21k(pretrained=False, **kwargs):
+    num_classes = kwargs.pop('num_classes', 21843)  # 21843 unless the caller supplies num_classes
+    model_args = dict(num_classes=num_classes, layers=[3, 8, 36, 3], width_factor=4)
+    return _create_resnetv2_bit('resnetv2_152x4_bitm_in21k', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50x1_bit_distilled(pretrained=False, **kwargs):
+    """ResNetV2-50x1 BiT, distilled variant.
+    Paper: `Knowledge distillation: A good teacher is patient and consistent` - https://arxiv.org/abs/2106.05237
+    """
+    model_args = dict(layers=[3, 4, 6, 3], width_factor=1)
+    return _create_resnetv2_bit('resnetv2_50x1_bit_distilled', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x2_bit_teacher(pretrained=False, **kwargs):
+    """ResNetV2-152x2 BiT, teacher variant.
+    Paper: `Knowledge distillation: A good teacher is patient and consistent` - https://arxiv.org/abs/2106.05237
+    """
+    model_args = dict(layers=[3, 8, 36, 3], width_factor=2)
+    return _create_resnetv2_bit('resnetv2_152x2_bit_teacher', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152x2_bit_teacher_384(pretrained=False, **kwargs):
+    """ResNetV2-152x2 BiT, teacher variant @ 384x384.
+    Paper: `Knowledge distillation: A good teacher is patient and consistent` - https://arxiv.org/abs/2106.05237
+    """
+    model_args = dict(layers=[3, 8, 36, 3], width_factor=2)
+    return _create_resnetv2_bit('resnetv2_152x2_bit_teacher_384', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50(pretrained=False, **kwargs):
+    """ResNetV2-50 using create_conv2d convolutions and BatchNormAct2d norm layers."""
+    model_args = dict(layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d)
+    return _create_resnetv2('resnetv2_50', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50d(pretrained=False, **kwargs):
+    """ResNetV2-50d: BatchNormAct2d norm layers, stem_type='deep', avg_down=True."""
+    model_args = dict(
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_50d', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50t(pretrained=False, **kwargs):
+    """ResNetV2-50t: BatchNormAct2d norm layers, stem_type='tiered', avg_down=True."""
+    model_args = dict(
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, stem_type='tiered', avg_down=True)
+    return _create_resnetv2('resnetv2_50t', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_101(pretrained=False, **kwargs):
+    """ResNetV2-101 using create_conv2d convolutions and BatchNormAct2d norm layers."""
+    model_args = dict(layers=[3, 4, 23, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d)
+    return _create_resnetv2('resnetv2_101', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_101d(pretrained=False, **kwargs):
+    """ResNetV2-101d: BatchNormAct2d norm layers, stem_type='deep', avg_down=True."""
+    model_args = dict(
+        layers=[3, 4, 23, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_101d', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152(pretrained=False, **kwargs):
+    """ResNetV2-152 using create_conv2d convolutions and BatchNormAct2d norm layers."""
+    model_args = dict(layers=[3, 8, 36, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d)
+    return _create_resnetv2('resnetv2_152', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_152d(pretrained=False, **kwargs):
+    """ResNetV2-152d: BatchNormAct2d norm layers, stem_type='deep', avg_down=True."""
+    model_args = dict(
+        layers=[3, 8, 36, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_152d', pretrained=pretrained, **model_args, **kwargs)
+
+
+# Experimental configs (may change / be removed)
+
+@register_model
+def resnetv2_50d_gn(pretrained=False, **kwargs):
+    model_args = dict(  # 50d layout with GroupNormAct as the norm layer
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=GroupNormAct,
+        stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_50d_gn', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50d_evob(pretrained=False, **kwargs):
+    model_args = dict(  # 50d layout with EvoNorm2dB0; the only variant here passing zero_init_last explicitly
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=EvoNorm2dB0,
+        stem_type='deep', avg_down=True, zero_init_last=True)
+    return _create_resnetv2('resnetv2_50d_evob', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50d_evos(pretrained=False, **kwargs):
+    model_args = dict(  # 50d layout with EvoNorm2dS0 as the norm layer
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=EvoNorm2dS0,
+        stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_50d_evos', pretrained=pretrained, **model_args, **kwargs)
+
+
+@register_model
+def resnetv2_50d_frn(pretrained=False, **kwargs):
+    model_args = dict(  # 50d layout with FilterResponseNormTlu2d as the norm layer
+        layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=FilterResponseNormTlu2d,
+        stem_type='deep', avg_down=True)
+    return _create_resnetv2('resnetv2_50d_frn', pretrained=pretrained, **model_args, **kwargs)
diff --git a/src/custom_timm/models/rexnet.py b/src/custom_timm/models/rexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7077ea6e996c624ef85052b1a6114ea681142b9
--- /dev/null
+++ b/src/custom_timm/models/rexnet.py
@@ -0,0 +1,261 @@
+""" ReXNet
+
+A PyTorch impl of `ReXNet: Diminishing Representational Bottleneck on Convolutional Neural Network` -
+https://arxiv.org/abs/2007.00992
+
+Adapted from original impl at https://github.com/clovaai/rexnet
+Copyright (c) 2020-present NAVER Corp. MIT license
+
+Changes for timm, feature extraction, and rounded channel variant hacked together by Ross Wightman
+Copyright 2020 Ross Wightman
+"""
+
+import torch
+import torch.nn as nn
+from functools import partial
+from math import ceil
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import ClassifierHead, create_act_layer, ConvNormAct, DropPath, make_divisible, SEModule
+from .registry import register_model
+from .efficientnet_builder import efficientnet_init_weights
+
+
+def _cfg(url=''):
+    # Default pretrained config for this file's variants; only the weight URL is a parameter.
+    return dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='stem.conv', classifier='head.fc')
+
+
+default_cfgs = dict(  # variant name -> pretrained config built by _cfg()
+    rexnet_100=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_100-1b4dddf4.pth'),
+    rexnet_130=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_130-590d768e.pth'),
+    rexnet_150=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_150-bd1a6aa8.pth'),
+    rexnet_200=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_200-8c0b7f2d.pth'),
+    rexnetr_100=_cfg(
+        url=''),  # the four rexnetr_* entries carry an empty weight URL
+    rexnetr_130=_cfg(
+        url=''),
+    rexnetr_150=_cfg(
+        url=''),
+    rexnetr_200=_cfg(
+        url=''),
+)
+
+SEWithNorm = partial(SEModule, norm_layer=nn.BatchNorm2d)  # SEModule pre-bound to BatchNorm2d as its norm_layer
+
+
+class LinearBottleneck(nn.Module):
+    """ReXNet bottleneck: optional expansion conv, 3x3 depthwise conv, optional SE, activation-free 1x1 projection."""
+
+    def __init__(
+            self, in_chs, out_chs, stride, exp_ratio=1.0, se_ratio=0., ch_div=1,
+            act_layer='swish', dw_act_layer='relu6', drop_path=None):
+        super().__init__()
+        self.use_shortcut = stride == 1 and in_chs <= out_chs
+        self.in_channels = in_chs
+        self.out_channels = out_chs
+
+        expand = exp_ratio != 1.  # with exp_ratio == 1 there is no expansion conv and dw_chs stays at in_chs
+        dw_chs = make_divisible(round(in_chs * exp_ratio), divisor=ch_div) if expand else in_chs
+        self.conv_exp = ConvNormAct(in_chs, dw_chs, act_layer=act_layer) if expand else None
+
+        self.conv_dw = ConvNormAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False)
+        if se_ratio > 0:
+            self.se = SEWithNorm(dw_chs, rd_channels=make_divisible(int(dw_chs * se_ratio), ch_div))
+        else:
+            self.se = None
+        self.act_dw = create_act_layer(dw_act_layer)
+
+        self.conv_pwl = ConvNormAct(dw_chs, out_chs, 1, apply_act=False)
+        self.drop_path = drop_path
+
+    def feat_channels(self, exp=False):
+        return self.conv_dw.out_channels if exp else self.out_channels
+
+    def forward(self, x):
+        shortcut = x
+        if self.conv_exp is not None:
+            x = self.conv_exp(x)
+        x = self.conv_dw(x)
+        if self.se is not None:
+            x = self.se(x)
+        x = self.conv_pwl(self.act_dw(x))
+        if not self.use_shortcut:
+            return x
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+        # Partial residual: only the leading in_channels output channels receive the shortcut.
+        head, tail = x[:, 0:self.in_channels], x[:, self.in_channels:]
+        return torch.cat([head + shortcut, tail], dim=1)
+
+
+def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, se_ratio=0., ch_div=1):
+    """Return one (out_chs, exp_ratio, stride, se_ratio) tuple per block across the six stages."""
+    stage_depths = [ceil(base_depth * depth_mult) for base_depth in (1, 2, 2, 3, 3, 5)]
+    num_blocks = sum(stage_depths)
+    strides = []
+    for stage_stride, stage_depth in zip((1, 2, 2, 2, 1, 2), stage_depths):  # stage stride on its first block only
+        strides.extend([stage_stride] + [1] * (stage_depth - 1))
+    exp_ratios = [1] * stage_depths[0] + [6] * sum(stage_depths[1:])
+    se_ratios = [0.] * (stage_depths[0] + stage_depths[1]) + [se_ratio] * sum(stage_depths[2:])
+
+    # Channels grow linearly: every block adds final_chs / num_blocks to the base before width_mult is applied.
+    base_chs = initial_chs / width_mult if width_mult < 1.0 else initial_chs
+    out_chs_list = []
+    for _ in range(num_blocks):
+        out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div))
+        base_chs += final_chs / float(num_blocks)
+
+    return list(zip(out_chs_list, exp_ratios, strides, se_ratios))
+
+
+def _build_blocks(
+        block_cfg, prev_chs, width_mult, ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_path_rate=0.):
+    """Build the LinearBottleneck stack plus a final ConvNormAct; returns (modules, feature_info)."""
+    feat_chs = [prev_chs]
+    feature_info, features = [], []
+    curr_stride = 2
+    num_blocks = len(block_cfg)
+    for block_idx, (chs, exp_ratio, stride, se_ratio) in enumerate(block_cfg):
+        if stride > 1:  # the module just before each strided block is recorded as a feature tap
+            fname = f'features.{block_idx - 1}' if block_idx != 0 else 'stem'
+            feature_info.append(dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname))
+            curr_stride *= stride
+        block_dpr = drop_path_rate * block_idx / (num_blocks - 1)  # stochastic depth linear decay rule
+        drop_path = DropPath(block_dpr) if block_dpr > 0. else None
+        features.append(LinearBottleneck(
+            in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, se_ratio=se_ratio,
+            ch_div=ch_div, act_layer=act_layer, dw_act_layer=dw_act_layer, drop_path=drop_path))
+        prev_chs = chs
+        feat_chs.append(features[-1].feat_channels())
+    feature_info.append(dict(num_chs=feat_chs[-1], reduction=curr_stride, module=f'features.{len(features) - 1}'))
+    pen_chs = make_divisible(1280 * width_mult, divisor=ch_div)
+    features.append(ConvNormAct(prev_chs, pen_chs, act_layer=act_layer))
+    return features, feature_info
+
+
+class ReXNetV1(nn.Module):
+    """ReXNet V1: ConvNormAct stem, feature stack assembled by _build_blocks, ClassifierHead on top."""
+
+    def __init__(
+            self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32,
+            initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, se_ratio=1/12.,
+            ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_rate=0.2, drop_path_rate=0.
+    ):
+        super(ReXNetV1, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        assert output_stride == 32  # FIXME support dilation
+        stem_base_chs = 32 / width_mult if width_mult < 1.0 else 32  # for width_mult < 1 the scaling below cancels
+        stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div)
+        self.stem = ConvNormAct(in_chans, stem_chs, 3, stride=2, act_layer=act_layer)
+
+        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div)
+        features, self.feature_info = _build_blocks(
+            block_cfg, stem_chs, width_mult, ch_div, act_layer, dw_act_layer, drop_path_rate)
+        self.num_features = features[-1].out_channels  # width of the final ConvNormAct appended by _build_blocks
+        self.features = nn.Sequential(*features)
+
+        self.head = ClassifierHead(self.num_features, num_classes, global_pool, drop_rate)
+
+        efficientnet_init_weights(self)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(stem=r'^stem', blocks=r'^features\.(\d+)')
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        # Keep the public num_classes attribute in step with the rebuilt head.
+        self.num_classes = num_classes
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.features, x, flatten=True)
+        else:
+            x = self.features(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_rexnet(variant, pretrained, **kwargs):
+    # flatten_sequential is the only feature option set here; all other kwargs are forwarded unchanged.
+    return build_model_with_cfg(
+        ReXNetV1, variant, pretrained,
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs)
+
+
+@register_model
+def rexnet_100(pretrained=False, **kwargs):
+    """ReXNet V1, 1.0x width (no width_mult override is passed)."""
+    return _create_rexnet('rexnet_100', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def rexnet_130(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=1.3."""
+    return _create_rexnet('rexnet_130', pretrained=pretrained, width_mult=1.3, **kwargs)
+
+
+@register_model
+def rexnet_150(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=1.5."""
+    return _create_rexnet('rexnet_150', pretrained=pretrained, width_mult=1.5, **kwargs)
+
+
+@register_model
+def rexnet_200(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=2.0."""
+    return _create_rexnet('rexnet_200', pretrained=pretrained, width_mult=2.0, **kwargs)
+
+
+@register_model
+def rexnetr_100(pretrained=False, **kwargs):
+    """ReXNet V1, 1.0x width, channels rounded with ch_div=8."""
+    return _create_rexnet('rexnetr_100', pretrained=pretrained, ch_div=8, **kwargs)
+
+
+@register_model
+def rexnetr_130(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=1.3, channels rounded with ch_div=8."""
+    return _create_rexnet('rexnetr_130', pretrained=pretrained, width_mult=1.3, ch_div=8, **kwargs)
+
+
+@register_model
+def rexnetr_150(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=1.5, channels rounded with ch_div=8."""
+    return _create_rexnet('rexnetr_150', pretrained=pretrained, width_mult=1.5, ch_div=8, **kwargs)
+
+
+@register_model
+def rexnetr_200(pretrained=False, **kwargs):
+    """ReXNet V1 with width_mult=2.0, channels rounded with ch_div=8."""
+    return _create_rexnet('rexnetr_200', pretrained=pretrained, width_mult=2.0, ch_div=8, **kwargs)
diff --git a/src/custom_timm/models/selecsls.py b/src/custom_timm/models/selecsls.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb9e1f6dc9647e1c5071300ff030f760fba3984
--- /dev/null
+++ b/src/custom_timm/models/selecsls.py
@@ -0,0 +1,377 @@
+"""PyTorch SelecSLS Net example for ImageNet Classification
+License: CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/legalcode)
+Author: Dushyant Mehta (@mehtadushy)
+
+SelecSLS (core) Network Architecture as proposed in "XNect: Real-time Multi-person 3D
+Human Pose Estimation with a Single RGB Camera, Mehta et al."
+https://arxiv.org/abs/1907.00837
+
+Based on ResNet implementation in https://github.com/rwightman/pytorch-image-models
+and SelecSLS Net implementation in https://github.com/mehtadushy/SelecSLS-Pytorch
+"""
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import create_classifier
+from .registry import register_model
+
+__all__ = ['SelecSLS']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    cfg = {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (4, 4),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.0', 'classifier': 'fc',
+    }
+    cfg.update(kwargs)  # caller-supplied entries override or extend the defaults above
+    return cfg
+
+
+default_cfgs = {  # variant name -> pretrained config; every entry overrides interpolation to 'bicubic'
+    'selecsls42': _cfg(
+        url='',  # empty weight URL
+        interpolation='bicubic'),
+    'selecsls42b': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls42b-8af30141.pth',
+        interpolation='bicubic'),
+    'selecsls60': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60-bbf87526.pth',
+        interpolation='bicubic'),
+    'selecsls60b': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60b-94e619b5.pth',
+        interpolation='bicubic'),
+    'selecsls84': _cfg(
+        url='',  # empty weight URL
+        interpolation='bicubic'),
+}
+
+
+class SequentialList(nn.Sequential):  # nn.Sequential whose forward is declared to return List[Tensor]
+
+    def __init__(self, *args):
+        super(SequentialList, self).__init__(*args)
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (List[torch.Tensor]) -> (List[torch.Tensor])
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (torch.Tensor) -> (List[torch.Tensor])
+        pass
+
+    def forward(self, x) -> List[torch.Tensor]:  # actual implementation; the two stubs above only declare overloads
+        for module in self:
+            x = module(x)  # each child receives the previous child's output
+        return x
+
+
+class SelectSeq(nn.Module):  # reduces a sequence of tensors to one tensor, by indexing or by concatenation
+    def __init__(self, mode='index', index=0):
+        super(SelectSeq, self).__init__()
+        self.mode = mode  # 'index' selects x[index]; any other value concatenates
+        self.index = index
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (List[torch.Tensor]) -> (torch.Tensor)
+        pass
+
+    @torch.jit._overload_method  # noqa: F811
+    def forward(self, x):
+        # type: (Tuple[torch.Tensor]) -> (torch.Tensor)
+        pass
+
+    def forward(self, x) -> torch.Tensor:  # actual implementation; the two stubs above only declare overloads
+        if self.mode == 'index':
+            return x[self.index]
+        else:
+            return torch.cat(x, dim=1)  # join all entries along dim 1
+
+
+def conv_bn(in_chs, out_chs, k=3, stride=1, padding=None, dilation=1):
+    """Return Sequential(Conv2d without bias, BatchNorm2d, in-place ReLU).
+
+    When padding is None it is derived as ((stride - 1) + dilation * (k - 1)) // 2.
+    """
+    pad = ((stride - 1) + dilation * (k - 1)) // 2 if padding is None else padding
+    conv = nn.Conv2d(in_chs, out_chs, k, stride, padding=pad, dilation=dilation, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(out_chs), nn.ReLU(inplace=True))
+
+
+class SelecSLSBlock(nn.Module):
+    """Three chained conv branches (d1, d2, d3) concatenated, plus a skip tensor unless is_first, fused by conv6."""
+    def __init__(self, in_chs, skip_chs, mid_chs, out_chs, is_first, stride, dilation=1):
+        super().__init__()
+        self.stride = stride
+        self.is_first = is_first
+        assert stride in [1, 2]
+
+        # conv1 carries the stride/dilation; conv2-conv5 alternate 1x1 and 3x3; conv6 (1x1) fuses the concatenation
+        self.conv1 = conv_bn(in_chs, mid_chs, 3, stride, dilation=dilation)
+        self.conv2 = conv_bn(mid_chs, mid_chs, 1)
+        self.conv3 = conv_bn(mid_chs, mid_chs // 2, 3)
+        self.conv4 = conv_bn(mid_chs // 2, mid_chs, 1)
+        self.conv5 = conv_bn(mid_chs, mid_chs // 2, 3)
+        self.conv6 = conv_bn(2 * mid_chs + (0 if is_first else skip_chs), out_chs, 1)
+
+    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
+        if not isinstance(x, list):
+            x = [x]
+        assert len(x) in [1, 2]
+
+        d1 = self.conv1(x[0])
+        d2 = self.conv3(self.conv2(d1))
+        d3 = self.conv5(self.conv4(d2))
+        if self.is_first:
+            out = self.conv6(torch.cat([d1, d2, d3], 1))
+            return [out, out]  # a first block's output also becomes the skip tensor for later blocks
+        return [self.conv6(torch.cat([d1, d2, d3, x[1]], 1)), x[1]]
+
+
+class SelecSLS(nn.Module):
+    """SelecSLS42 / SelecSLS60 / SelecSLS84 classification network.
+
+    Parameters
+    ----------
+    cfg : dict with the keys 'block', 'features', 'head', 'num_features' and 'feature_info'
+    num_classes : int, default 1000
+        Size of the classifier output.
+    in_chans : int, default 3
+        Channel count of the input image.
+    drop_rate : float, default 0.
+        Dropout probability applied to the pooled features, active in training mode only
+    global_pool : str, default 'avg'
+        Pooling type handed to create_classifier, e.g. 'avg', 'max', 'avgmax', 'catavgmax'
+    """
+
+    def __init__(self, cfg, num_classes=1000, in_chans=3, drop_rate=0.0, global_pool='avg'):
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+
+        self.stem = conv_bn(in_chans, 32, stride=2)
+        self.features = SequentialList(*(cfg['block'](*block_args) for block_args in cfg['features']))
+        self.from_seq = SelectSeq()  # from List[tensor] -> Tensor in module compatible way
+        self.head = nn.Sequential(*(conv_bn(*conv_args) for conv_args in cfg['head']))
+        self.num_features = cfg['num_features']
+        self.feature_info = cfg['feature_info']
+
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1.)
+                nn.init.constant_(m.bias, 0.)
+            elif isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return {
+            'stem': r'^stem',
+            'blocks': r'^features\.(\d+)',
+            'blocks_head': r'^head',
+        }
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes  # kept in step with the rebuilt pooling/classifier pair
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.features(x)  # list of tensors from here on
+        x = self.head(self.from_seq(x))  # from_seq picks a single tensor for the head
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x if pre_logits else self.fc(x)  # pre_logits=True returns the features without applying fc
+
+    def forward(self, x):
+        # Feature extractor first, then the pooled classifier head with default arguments.
+        features = self.forward_features(x)
+        return self.forward_head(features)
+
+
+def _create_selecsls(variant, pretrained, **kwargs):  # assemble the per-variant cfg dict, then build via the factory
+    cfg = {}
+    feature_info = [dict(num_chs=32, reduction=2, module='stem.2')]  # first tap is the stem for every variant
+    if variant.startswith('selecsls42'):  # covers 'selecsls42' and 'selecsls42b'
+        cfg['block'] = SelecSLSBlock
+        # Define configuration of the network after the initial neck
+        cfg['features'] = [
+            # in_chs, skip_chs, mid_chs, out_chs, is_first, stride
+            (32, 0, 64, 64, True, 2),
+            (64, 64, 64, 128, False, 1),
+            (128, 0, 144, 144, True, 2),
+            (144, 144, 144, 288, False, 1),
+            (288, 0, 304, 304, True, 2),
+            (304, 304, 304, 480, False, 1),
+        ]
+        feature_info.extend([
+            dict(num_chs=128, reduction=4, module='features.1'),
+            dict(num_chs=288, reduction=8, module='features.3'),
+            dict(num_chs=480, reduction=16, module='features.5'),
+        ])
+        # Head can be replaced with alternative configurations depending on the problem
+        feature_info.append(dict(num_chs=1024, reduction=32, module='head.1'))
+        if variant == 'selecsls42b':
+            cfg['head'] = [  # each tuple is passed positionally to conv_bn: (in_chs, out_chs, k, stride)
+                (480, 960, 3, 2),
+                (960, 1024, 3, 1),
+                (1024, 1280, 3, 2),
+                (1280, 1024, 1, 1),
+            ]
+            feature_info.append(dict(num_chs=1024, reduction=64, module='head.3'))
+            cfg['num_features'] = 1024
+        else:
+            cfg['head'] = [
+                (480, 960, 3, 2),
+                (960, 1024, 3, 1),
+                (1024, 1024, 3, 2),
+                (1024, 1280, 1, 1),
+            ]
+            feature_info.append(dict(num_chs=1280, reduction=64, module='head.3'))
+            cfg['num_features'] = 1280
+
+    elif variant.startswith('selecsls60'):  # covers 'selecsls60' and 'selecsls60b'
+        cfg['block'] = SelecSLSBlock
+        # Define configuration of the network after the initial neck
+        cfg['features'] = [
+            # in_chs, skip_chs, mid_chs, out_chs, is_first, stride
+            (32, 0, 64, 64, True, 2),
+            (64, 64, 64, 128, False, 1),
+            (128, 0, 128, 128, True, 2),
+            (128, 128, 128, 128, False, 1),
+            (128, 128, 128, 288, False, 1),
+            (288, 0, 288, 288, True, 2),
+            (288, 288, 288, 288, False, 1),
+            (288, 288, 288, 288, False, 1),
+            (288, 288, 288, 416, False, 1),
+        ]
+        feature_info.extend([
+            dict(num_chs=128, reduction=4, module='features.1'),
+            dict(num_chs=288, reduction=8, module='features.4'),
+            dict(num_chs=416, reduction=16, module='features.8'),
+        ])
+        # Head can be replaced with alternative configurations depending on the problem
+        feature_info.append(dict(num_chs=1024, reduction=32, module='head.1'))
+        if variant == 'selecsls60b':
+            cfg['head'] = [
+                (416, 756, 3, 2),
+                (756, 1024, 3, 1),
+                (1024, 1280, 3, 2),
+                (1280, 1024, 1, 1),
+            ]
+            feature_info.append(dict(num_chs=1024, reduction=64, module='head.3'))
+            cfg['num_features'] = 1024
+        else:
+            cfg['head'] = [
+                (416, 756, 3, 2),
+                (756, 1024, 3, 1),
+                (1024, 1024, 3, 2),
+                (1024, 1280, 1, 1),
+            ]
+            feature_info.append(dict(num_chs=1280, reduction=64, module='head.3'))
+            cfg['num_features'] = 1280
+
+    elif variant == 'selecsls84':
+        cfg['block'] = SelecSLSBlock
+        # Define configuration of the network after the initial neck
+        cfg['features'] = [
+            # in_chs, skip_chs, mid_chs, out_chs, is_first, stride
+            (32, 0, 64, 64, True, 2),
+            (64, 64, 64, 144, False, 1),
+            (144, 0, 144, 144, True, 2),
+            (144, 144, 144, 144, False, 1),
+            (144, 144, 144, 144, False, 1),
+            (144, 144, 144, 144, False, 1),
+            (144, 144, 144, 304, False, 1),
+            (304, 0, 304, 304, True, 2),
+            (304, 304, 304, 304, False, 1),
+            (304, 304, 304, 304, False, 1),
+            (304, 304, 304, 304, False, 1),
+            (304, 304, 304, 304, False, 1),
+            (304, 304, 304, 512, False, 1),
+        ]
+        feature_info.extend([
+            dict(num_chs=144, reduction=4, module='features.1'),
+            dict(num_chs=304, reduction=8, module='features.6'),
+            dict(num_chs=512, reduction=16, module='features.12'),
+        ])
+        # Head can be replaced with alternative configurations depending on the problem
+        cfg['head'] = [
+            (512, 960, 3, 2),
+            (960, 1024, 3, 1),
+            (1024, 1024, 3, 2),
+            (1024, 1280, 3, 1),
+        ]
+        cfg['num_features'] = 1280
+        feature_info.extend([
+            dict(num_chs=1024, reduction=32, module='head.1'),
+            dict(num_chs=1280, reduction=64, module='head.3')
+        ])
+    else:
+        raise ValueError('Invalid net configuration ' + variant + ' !!!')
+    cfg['feature_info'] = feature_info  # six taps in total: the stem, three feature stages, and two head levels
+
+    # this model can do 6 feature levels by default, unlike most others, leave as 0-4 to avoid surprises?
+    return build_model_with_cfg(
+        SelecSLS, variant, pretrained,
+        model_cfg=cfg,
+        feature_cfg=dict(out_indices=(0, 1, 2, 3, 4), flatten_sequential=True),
+        **kwargs)
+
+
+@register_model
+def selecsls42(pretrained=False, **kwargs):
+    """Build a SelecSLS42 model; remaining keyword arguments are forwarded
+    unchanged to the shared factory."""
+    return _create_selecsls('selecsls42', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def selecsls42b(pretrained=False, **kwargs):
+    """Build a SelecSLS42_B model; remaining keyword arguments are forwarded
+    unchanged to the shared factory."""
+    return _create_selecsls('selecsls42b', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def selecsls60(pretrained=False, **kwargs):
+    """Build a SelecSLS60 model; remaining keyword arguments are forwarded
+    unchanged to the shared factory."""
+    return _create_selecsls('selecsls60', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def selecsls60b(pretrained=False, **kwargs):
+    """Build a SelecSLS60_B model; remaining keyword arguments are forwarded
+    unchanged to the shared factory."""
+    return _create_selecsls('selecsls60b', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def selecsls84(pretrained=False, **kwargs):
+    """Build a SelecSLS84 model; remaining keyword arguments are forwarded
+    unchanged to the shared factory."""
+    return _create_selecsls('selecsls84', pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/senet.py b/src/custom_timm/models/senet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5611479f82bef79df4913c6bf0e56b35e0630651
--- /dev/null
+++ b/src/custom_timm/models/senet.py
@@ -0,0 +1,465 @@
+"""
+SEResNet implementation from Cadene's pretrained models
+https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/senet.py
+Additional credit to https://github.com/creafz
+
+Original model: https://github.com/hujie-frank/SENet
+
+ResNet code gently borrowed from
+https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+
+FIXME I'm deprecating this model and moving them to ResNet as I don't want to maintain duplicate
+support for extras like dilation, switchable BN/activations, feature extraction, etc that don't exist here.
+"""
+import math
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import create_classifier
+from .registry import register_model
+
+__all__ = ['SENet']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'layer0.conv1', 'classifier': 'last_linear',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'legacy_senet154': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_senet154-e9eb9fe6.pth'),
+    'legacy_seresnet18': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet18-4bb0ce65.pth',
+        interpolation='bicubic'),
+    'legacy_seresnet34': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet34-a4004e63.pth'),
+    'legacy_seresnet50': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet50-ce0d4300.pth'),
+    'legacy_seresnet101': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet101-7e38fcc6.pth'),
+    'legacy_seresnet152': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet152-d17c99b7.pth'),
+    'legacy_seresnext26_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26_32x4d-65ebdb501.pth',
+        interpolation='bicubic'),
+    'legacy_seresnext50_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_se_resnext50_32x4d-f3651bad.pth'),
+    'legacy_seresnext101_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_se_resnext101_32x4d-37725eac.pth'),
+}
+
+
+def _weight_init(m):
+    if isinstance(m, nn.Conv2d):
+        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+    elif isinstance(m, nn.BatchNorm2d):
+        nn.init.constant_(m.weight, 1.)
+        nn.init.constant_(m.bias, 0.)
+
+
+class SEModule(nn.Module):
+    """Squeeze-and-Excitation gate: rescales each channel of the input by a learned sigmoid weight."""
+    def __init__(self, channels, reduction):
+        super(SEModule, self).__init__()
+        self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1)  # squeeze to channels // reduction
+        self.relu = nn.ReLU(inplace=True)
+        self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1)  # expand back to channels
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        module_input = x  # keep the unpooled input; it is what gets rescaled at the end
+        x = x.mean((2, 3), keepdim=True)  # average over dims 2 and 3, kept as size-1 dims
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = self.sigmoid(x)  # one gate value per channel, between 0 and 1
+        return module_input * x  # the size-1 dims broadcast the gate over dims 2 and 3
+
+
+class Bottleneck(nn.Module):
+    """
+    Base class for bottlenecks that implements `forward()` method.
+    """
+
+    def forward(self, x):
+        shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        out = self.se_module(out) + shortcut
+        out = self.relu(out)
+
+        return out
+
+
+class SEBottleneck(Bottleneck):
+    """
+    Bottleneck for SENet154.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None):
+        super(SEBottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes * 2)
+        self.conv2 = nn.Conv2d(
+            planes * 2, planes * 4, kernel_size=3, stride=stride,
+            padding=1, groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes * 4)
+        self.conv3 = nn.Conv2d(planes * 4, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(planes * 4, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNetBottleneck(Bottleneck):
+    """
+    ResNet bottleneck with a Squeeze-and-Excitation module. It follows Caffe
+    implementation and uses `stride=stride` in `conv1` and not in `conv2`
+    (the latter is used in the torchvision implementation of ResNet).
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None):
+        super(SEResNetBottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, stride=stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(planes * 4, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNeXtBottleneck(Bottleneck):
+    """
+    ResNeXt bottleneck type C with a Squeeze-and-Excitation module.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None, base_width=4):
+        super(SEResNeXtBottleneck, self).__init__()
+        width = math.floor(planes * (base_width / 64)) * groups
+        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False, stride=1)
+        self.bn1 = nn.BatchNorm2d(width)
+        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1, groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(width)
+        self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(planes * 4, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNetBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None):
+        super(SEResNetBlock, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, padding=1, stride=stride, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        shortcut = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        if self.downsample is not None:
+            shortcut = self.downsample(x)
+
+        out = self.se_module(out) + shortcut
+        out = self.relu(out)
+
+        return out
+
+
+class SENet(nn.Module):
+
+    def __init__(
+            self, block, layers, groups, reduction, drop_rate=0.2,
+            in_chans=3, inplanes=64, input_3x3=False, downsample_kernel_size=1,
+            downsample_padding=0, num_classes=1000, global_pool='avg'):
+        """
+        Parameters
+        ----------
+        block (nn.Module): Bottleneck class.
+            - For SENet154: SEBottleneck
+            - For SE-ResNet models: SEResNetBottleneck
+            - For SE-ResNeXt models:  SEResNeXtBottleneck
+        layers (list of ints): Number of residual blocks for 4 layers of the
+            network (layer1...layer4).
+        groups (int): Number of groups for the 3x3 convolution in each
+            bottleneck block.
+            - For SENet154: 64
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models:  32
+        reduction (int): Reduction ratio for Squeeze-and-Excitation modules.
+            - For all models: 16
+        drop_rate (float): Drop probability for the dropout applied in `forward_head`
+            after global pooling. Dropout is skipped unless `drop_rate > 0`.
+            - Default: 0.2
+            - Must be a number: `None` would fail the `drop_rate > 0.` comparison
+            - Pass 0. to disable dropout
+        inplanes (int):  Number of input channels for layer1.
+            - For SENet154: 128
+            - For SE-ResNet models: 64
+            - For SE-ResNeXt models: 64
+        input_3x3 (bool): If `True`, use three 3x3 convolutions instead of
+            a single 7x7 convolution in layer0.
+            - For SENet154: True
+            - For SE-ResNet models: False
+            - For SE-ResNeXt models: False
+        downsample_kernel_size (int): Kernel size for downsampling convolutions
+            in layer2, layer3 and layer4.
+            - For SENet154: 3
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models: 1
+        downsample_padding (int): Padding for downsampling convolutions in
+            layer2, layer3 and layer4.
+            - For SENet154: 1
+            - For SE-ResNet models: 0
+            - For SE-ResNeXt models: 0
+        num_classes (int): Number of outputs in `last_linear` layer.
+            - For all models: 1000
+        """
+        super(SENet, self).__init__()
+        self.inplanes = inplanes
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        if input_3x3:
+            layer0_modules = [
+                ('conv1', nn.Conv2d(in_chans, 64, 3, stride=2, padding=1, bias=False)),
+                ('bn1', nn.BatchNorm2d(64)),
+                ('relu1', nn.ReLU(inplace=True)),
+                ('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)),
+                ('bn2', nn.BatchNorm2d(64)),
+                ('relu2', nn.ReLU(inplace=True)),
+                ('conv3', nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)),
+                ('bn3', nn.BatchNorm2d(inplanes)),
+                ('relu3', nn.ReLU(inplace=True)),
+            ]
+        else:
+            layer0_modules = [
+                ('conv1', nn.Conv2d(
+                    in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False)),
+                ('bn1', nn.BatchNorm2d(inplanes)),
+                ('relu1', nn.ReLU(inplace=True)),
+            ]
+        self.layer0 = nn.Sequential(OrderedDict(layer0_modules))
+        # To preserve compatibility with Caffe weights `ceil_mode=True` is used instead of `padding=1`.
+        self.pool0 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
+        self.feature_info = [dict(num_chs=inplanes, reduction=2, module='layer0')]
+        self.layer1 = self._make_layer(
+            block,
+            planes=64,
+            blocks=layers[0],
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=1,
+            downsample_padding=0
+        )
+        self.feature_info += [dict(num_chs=64 * block.expansion, reduction=4, module='layer1')]
+        self.layer2 = self._make_layer(
+            block,
+            planes=128,
+            blocks=layers[1],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding
+        )
+        self.feature_info += [dict(num_chs=128 * block.expansion, reduction=8, module='layer2')]
+        self.layer3 = self._make_layer(
+            block,
+            planes=256,
+            blocks=layers[2],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding
+        )
+        self.feature_info += [dict(num_chs=256 * block.expansion, reduction=16, module='layer3')]
+        self.layer4 = self._make_layer(
+            block,
+            planes=512,
+            blocks=layers[3],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding
+        )
+        self.feature_info += [dict(num_chs=512 * block.expansion, reduction=32, module='layer4')]
+        self.num_features = 512 * block.expansion
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+        for m in self.modules():
+            _weight_init(m)
+
+    def _make_layer(self, block, planes, blocks, groups, reduction, stride=1,
+                    downsample_kernel_size=1, downsample_padding=0):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes, planes * block.expansion, kernel_size=downsample_kernel_size,
+                    stride=stride, padding=downsample_padding, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = [block(self.inplanes, planes, groups, reduction, stride, downsample)]
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes, groups, reduction))
+
+        return nn.Sequential(*layers)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(stem=r'^layer0', blocks=r'^layer(\d+)' if coarse else r'^layer(\d+)\.(\d+)')
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.last_linear
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.last_linear = create_classifier(
+            self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        x = self.layer0(x)
+        x = self.pool0(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x if pre_logits else self.last_linear(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_senet(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(SENet, variant, pretrained, **kwargs)
+
+
+@register_model
+def legacy_seresnet18(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNetBlock, layers=[2, 2, 2, 2], groups=1, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnet18', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnet34(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNetBlock, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnet34', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnet50(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNetBottleneck, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnet50', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnet101(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNetBottleneck, layers=[3, 4, 23, 3], groups=1, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnet101', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnet152(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNetBottleneck, layers=[3, 8, 36, 3], groups=1, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnet152', pretrained, **model_args)
+
+
+@register_model
+def legacy_senet154(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEBottleneck, layers=[3, 8, 36, 3], groups=64, reduction=16,
+        downsample_kernel_size=3, downsample_padding=1,  inplanes=128, input_3x3=True, **kwargs)
+    return _create_senet('legacy_senet154', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnext26_32x4d(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNeXtBottleneck, layers=[2, 2, 2, 2], groups=32, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnext26_32x4d', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnext50_32x4d(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNeXtBottleneck, layers=[3, 4, 6, 3], groups=32, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnext50_32x4d', pretrained, **model_args)
+
+
+@register_model
+def legacy_seresnext101_32x4d(pretrained=False, **kwargs):
+    model_args = dict(
+        block=SEResNeXtBottleneck, layers=[3, 4, 23, 3], groups=32, reduction=16, **kwargs)
+    return _create_senet('legacy_seresnext101_32x4d', pretrained, **model_args)
diff --git a/src/custom_timm/models/sequencer.py b/src/custom_timm/models/sequencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..48240d1d8625f4c0cb3c497a5c49058d722c2549
--- /dev/null
+++ b/src/custom_timm/models/sequencer.py
@@ -0,0 +1,417 @@
+""" Sequencer
+
+Paper: `Sequencer: Deep LSTM for Image Classification` - https://arxiv.org/pdf/2205.01972.pdf
+
+"""
+#  Copyright (c) 2022. Yuki Tatsunami
+#  Licensed under the Apache License, Version 2.0 (the "License");
+
+
+import math
+from functools import partial
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
+from .helpers import build_model_with_cfg, named_apply
+from .layers import lecun_normal_, DropPath, Mlp, PatchEmbed as TimmPatchEmbed
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': DEFAULT_CROP_PCT, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.proj', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = dict(
+    sequencer2d_s=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_s.pth"),
+    sequencer2d_m=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_m.pth"),
+    sequencer2d_l=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_l.pth"),
+)
+
+
+def _init_weights(module: nn.Module, name: str, head_bias: float = 0., flax=False):
+    if isinstance(module, nn.Linear):
+        if name.startswith('head'):
+            nn.init.zeros_(module.weight)
+            nn.init.constant_(module.bias, head_bias)
+        else:
+            if flax:
+                # Flax defaults
+                lecun_normal_(module.weight)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            else:
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    if 'mlp' in name:
+                        nn.init.normal_(module.bias, std=1e-6)
+                    else:
+                        nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        lecun_normal_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
+        nn.init.ones_(module.weight)
+        nn.init.zeros_(module.bias)
+    elif isinstance(module, (nn.RNN, nn.GRU, nn.LSTM)):
+        stdv = 1.0 / math.sqrt(module.hidden_size)
+        for weight in module.parameters():
+            nn.init.uniform_(weight, -stdv, stdv)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def get_stage(
+        index, layers, patch_sizes, embed_dims, hidden_sizes, mlp_ratios, block_layer, rnn_layer, mlp_layer,
+        norm_layer, act_layer, num_layers, bidirectional, union,
+        with_fc, drop=0., drop_path_rate=0., **kwargs):
+    """Build stage `index`: `layers[index]` blocks plus, for every stage but the last, a Downsample2D.
+    Per-block drop-path grows linearly with the global block index, up to `drop_path_rate`."""
+    assert len(layers) == len(patch_sizes) == len(embed_dims) == len(hidden_sizes) == len(mlp_ratios)
+    first_block = sum(layers[:index])  # global index of this stage's first block
+    last_block = max(sum(layers) - 1, 1)  # clamp so a single-block network does not divide by zero
+    blocks = [
+        block_layer(
+            embed_dims[index], hidden_sizes[index], mlp_ratio=mlp_ratios[index],
+            rnn_layer=rnn_layer, mlp_layer=mlp_layer, norm_layer=norm_layer, act_layer=act_layer,
+            num_layers=num_layers, bidirectional=bidirectional, union=union, with_fc=with_fc,
+            drop=drop, drop_path=drop_path_rate * (first_block + block_idx) / last_block)
+        for block_idx in range(layers[index])]
+    if index < len(embed_dims) - 1:
+        blocks.append(Downsample2D(embed_dims[index], embed_dims[index + 1], patch_sizes[index + 1]))
+    return nn.Sequential(*blocks)
+
+
+class RNNIdentity(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super(RNNIdentity, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]:
+        return x, None
+
+
+class RNN2DBase(nn.Module):
+
+    def __init__(
+            self, input_size: int, hidden_size: int,
+            num_layers: int = 1, bias: bool = True, bidirectional: bool = True,
+            union="cat", with_fc=True):
+        super().__init__()
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = 2 * hidden_size if bidirectional else hidden_size
+        self.union = union
+
+        self.with_vertical = True
+        self.with_horizontal = True
+        self.with_fc = with_fc
+
+        self.fc = None
+        if with_fc:
+            if union == "cat":
+                self.fc = nn.Linear(2 * self.output_size, input_size)
+            elif union == "add":
+                self.fc = nn.Linear(self.output_size, input_size)
+            elif union == "vertical":
+                self.fc = nn.Linear(self.output_size, input_size)
+                self.with_horizontal = False
+            elif union == "horizontal":
+                self.fc = nn.Linear(self.output_size, input_size)
+                self.with_vertical = False
+            else:
+                raise ValueError("Unrecognized union: " + union)
+        elif union == "cat":
+            pass
+            if 2 * self.output_size != input_size:
+                raise ValueError(f"The output channel {2 * self.output_size} is different from the input channel {input_size}.")
+        elif union == "add":
+            pass
+            if self.output_size != input_size:
+                raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.")
+        elif union == "vertical":
+            if self.output_size != input_size:
+                raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.")
+            self.with_horizontal = False
+        elif union == "horizontal":
+            if self.output_size != input_size:
+                raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.")
+            self.with_vertical = False
+        else:
+            raise ValueError("Unrecognized union: " + union)
+
+        self.rnn_v = RNNIdentity()
+        self.rnn_h = RNNIdentity()
+
+    def forward(self, x):
+        B, H, W, C = x.shape
+
+        if self.with_vertical:
+            v = x.permute(0, 2, 1, 3)
+            v = v.reshape(-1, H, C)
+            v, _ = self.rnn_v(v)
+            v = v.reshape(B, W, H, -1)
+            v = v.permute(0, 2, 1, 3)
+        else:
+            v = None
+
+        if self.with_horizontal:
+            h = x.reshape(-1, W, C)
+            h, _ = self.rnn_h(h)
+            h = h.reshape(B, H, W, -1)
+        else:
+            h = None
+
+        if v is not None and h is not None:
+            if self.union == "cat":
+                x = torch.cat([v, h], dim=-1)
+            else:
+                x = v + h
+        elif v is not None:
+            x = v
+        elif h is not None:
+            x = h
+
+        if self.fc is not None:
+            x = self.fc(x)
+
+        return x
+
+
+class LSTM2D(RNN2DBase):
+    """RNN2DBase whose vertical/horizontal sequence modules are batch-first nn.LSTM layers."""
+    def __init__(
+            self, input_size: int, hidden_size: int,
+            num_layers: int = 1, bias: bool = True, bidirectional: bool = True,
+            union="cat", with_fc=True):
+        super().__init__(input_size, hidden_size, num_layers, bias, bidirectional, union, with_fc)
+        if self.with_vertical:  # replace the RNNIdentity placeholder set by the base class
+            self.rnn_v = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bias=bias, bidirectional=bidirectional)
+        if self.with_horizontal:  # a direction disabled by `union` keeps its identity placeholder
+            self.rnn_h = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bias=bias, bidirectional=bidirectional)
+
+
+class Sequencer2DBlock(nn.Module):
+    def __init__(
+            self, dim, hidden_size, mlp_ratio=3.0, rnn_layer=LSTM2D, mlp_layer=Mlp,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU,
+            num_layers=1, bidirectional=True, union="cat", with_fc=True, drop=0., drop_path=0.):
+        super().__init__()
+        channels_dim = int(mlp_ratio * dim)
+        self.norm1 = norm_layer(dim)
+        self.rnn_tokens = rnn_layer(dim, hidden_size, num_layers=num_layers, bidirectional=bidirectional,
+                                    union=union, with_fc=with_fc)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp_channels = mlp_layer(dim, channels_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.rnn_tokens(self.norm1(x)))
+        x = x + self.drop_path(self.mlp_channels(self.norm2(x)))
+        return x
+
+
+class PatchEmbed(TimmPatchEmbed):
+    def forward(self, x):
+        x = self.proj(x)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        else:
+            x = x.permute(0, 2, 3, 1)  # BCHW -> BHWC
+        x = self.norm(x)
+        return x
+
+
+class Shuffle(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        if self.training:
+            B, H, W, C = x.shape
+            r = torch.randperm(H * W)
+            x = x.reshape(B, -1, C)
+            x = x[:, r, :].reshape(B, H, W, -1)
+        return x
+
+
+class Downsample2D(nn.Module):
+    def __init__(self, input_dim, output_dim, patch_size):
+        super().__init__()
+        self.down = nn.Conv2d(input_dim, output_dim, kernel_size=patch_size, stride=patch_size)  # kernel == stride: non-overlapping patches
+
+    def forward(self, x):
+        x = x.permute(0, 3, 1, 2)  # move the last dim to position 1, where Conv2d expects channels
+        x = self.down(x)
+        x = x.permute(0, 2, 3, 1)  # put channels back in the last position
+        return x
+
+
+class Sequencer2D(nn.Module):
+    def __init__(
+            self,
+            num_classes=1000,
+            img_size=224,
+            in_chans=3,
+            global_pool='avg',
+            layers=[4, 3, 8, 3],
+            patch_sizes=[7, 2, 1, 1],
+            embed_dims=[192, 384, 384, 384],
+            hidden_sizes=[48, 96, 96, 96],
+            mlp_ratios=[3.0, 3.0, 3.0, 3.0],
+            block_layer=Sequencer2DBlock,
+            rnn_layer=LSTM2D,
+            mlp_layer=Mlp,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            act_layer=nn.GELU,
+            num_rnn_layers=1,
+            bidirectional=True,
+            union="cat",
+            with_fc=True,
+            drop_rate=0.,
+            drop_path_rate=0.,
+            nlhb=False,
+            stem_norm=False,
+    ):
+        super().__init__()
+        assert global_pool in ('', 'avg')
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = embed_dims[-1]  # num_features for consistency with other models
+        self.feature_dim = -1  # channel dim index for feature outputs (rank 4, NHWC)
+        self.embed_dims = embed_dims
+        self.stem = PatchEmbed(
+            img_size=img_size, patch_size=patch_sizes[0], in_chans=in_chans,
+            embed_dim=embed_dims[0], norm_layer=norm_layer if stem_norm else None,
+            flatten=False)
+
+        self.blocks = nn.Sequential(*[
+            get_stage(
+                i, layers, patch_sizes, embed_dims, hidden_sizes, mlp_ratios, block_layer=block_layer,
+                rnn_layer=rnn_layer, mlp_layer=mlp_layer, norm_layer=norm_layer, act_layer=act_layer,
+                num_layers=num_rnn_layers, bidirectional=bidirectional,
+                union=union, with_fc=with_fc, drop=drop_rate, drop_path_rate=drop_path_rate,
+            )
+            for i, _ in enumerate(embed_dims)])
+
+        self.norm = norm_layer(embed_dims[-1])
+        self.head = nn.Linear(embed_dims[-1], self.num_classes) if num_classes > 0 else nn.Identity()
+
+        self.init_weights(nlhb=nlhb)
+
+    def init_weights(self, nlhb=False):
+        head_bias = -math.log(self.num_classes) if nlhb else 0.
+        named_apply(partial(_init_weights, head_bias=head_bias), module=self)  # depth-first
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^stem',
+            blocks=[
+                (r'^blocks\.(\d+)\..*\.down', (99999,)),
+                (r'^blocks\.(\d+)', None) if coarse else (r'^blocks\.(\d+)\.(\d+)', None),
+                (r'^norm', (99999,))
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):  # swap in a freshly initialised head
+        self.num_classes = num_classes  # keep the recorded class count in sync with the new head
+        if global_pool is not None:  # None leaves the current pooling mode untouched
+            assert global_pool in ('', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()  # num_features == embed_dims[-1]
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool == 'avg':
+            x = x.mean(dim=(1, 2))
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_sequencer2d(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Sequencer2D models.')
+
+    model = build_model_with_cfg(Sequencer2D, variant, pretrained, **kwargs)
+    return model
+
+
+# main
+
+@register_model
+def sequencer2d_s(pretrained=False, **kwargs):
+    model_args = dict(
+        layers=[4, 3, 8, 3],
+        patch_sizes=[7, 2, 1, 1],
+        embed_dims=[192, 384, 384, 384],
+        hidden_sizes=[48, 96, 96, 96],
+        mlp_ratios=[3.0, 3.0, 3.0, 3.0],
+        rnn_layer=LSTM2D,
+        bidirectional=True,
+        union="cat",
+        with_fc=True,
+        **kwargs)
+    model = _create_sequencer2d('sequencer2d_s', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def sequencer2d_m(pretrained=False, **kwargs):
+    model_args = dict(
+        layers=[4, 3, 14, 3],
+        patch_sizes=[7, 2, 1, 1],
+        embed_dims=[192, 384, 384, 384],
+        hidden_sizes=[48, 96, 96, 96],
+        mlp_ratios=[3.0, 3.0, 3.0, 3.0],
+        rnn_layer=LSTM2D,
+        bidirectional=True,
+        union="cat",
+        with_fc=True,
+        **kwargs)
+    model = _create_sequencer2d('sequencer2d_m', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def sequencer2d_l(pretrained=False, **kwargs):
+    model_args = dict(
+        layers=[8, 8, 16, 4],
+        patch_sizes=[7, 2, 1, 1],
+        embed_dims=[192, 384, 384, 384],
+        hidden_sizes=[48, 96, 96, 96],
+        mlp_ratios=[3.0, 3.0, 3.0, 3.0],
+        rnn_layer=LSTM2D,
+        bidirectional=True,
+        union="cat",
+        with_fc=True,
+        **kwargs)
+    model = _create_sequencer2d('sequencer2d_l', pretrained=pretrained, **model_args)
+    return model
diff --git a/src/custom_timm/models/sknet.py b/src/custom_timm/models/sknet.py
new file mode 100644
index 0000000000000000000000000000000000000000..342a7901325780809a3213d6188e87ea111a9a11
--- /dev/null
+++ b/src/custom_timm/models/sknet.py
@@ -0,0 +1,206 @@
+""" Selective Kernel Networks (ResNet base)
+
+Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
+
+This was inspired by reading 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268)
+and a streamlined impl at https://github.com/clovaai/assembled-cnn but I ended up building something closer
+to the original paper with some modifications of my own to better balance param count vs accuracy.
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import math
+
+from torch import nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import SelectiveKernel, ConvNormAct, ConvNormActAa, create_attn
+from .registry import register_model
+from .resnet import ResNet
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict used by the entries in ``default_cfgs``.
+
+    ``kwargs`` override matching default keys and add any new ones.
+    """
+    cfg = {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        'pool_size': (7, 7),
+        'crop_pct': 0.875,
+        'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN,
+        'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1',
+        'classifier': 'fc',
+    }
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained configs keyed by model variant name, each built from the `_cfg` defaults.
+# 'skresnet50' and 'skresnet50d' keep the default empty url, so no weight location is listed for them.
+default_cfgs = {
+    'skresnet18': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet18_ra-4eec2804.pth'),
+    'skresnet34': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet34_ra-bdc0ccde.pth'),
+    'skresnet50': _cfg(),
+    # the 'd' variant overrides first_conv to 'conv1.0'
+    'skresnet50d': _cfg(
+        first_conv='conv1.0'),
+    'skresnext50_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnext50_ra-f40e40bf.pth'),
+}
+
+
+class SelectiveKernelBasic(nn.Module):
+    """Basic residual block (expansion 1) whose first convolution is a SelectiveKernel.
+
+    Layout: SelectiveKernel -> 3x3 ConvNormAct without activation -> optional attention ->
+    optional drop path -> residual add -> activation. Only cardinality == 1 and
+    base_width == 64 are accepted.
+    """
+    expansion = 1
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+            sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU,
+            norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super().__init__()
+
+        if not sk_kwargs:
+            sk_kwargs = {}
+        shared_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
+        assert base_width == 64, 'BasicBlock doest not support changing base width'
+        mid_planes = planes // reduce_first
+        out_planes = planes * self.expansion
+        if not first_dilation:
+            first_dilation = dilation
+
+        # stride (and optional anti-aliasing / drop block) is applied by the selective kernel conv
+        self.conv1 = SelectiveKernel(
+            inplanes, mid_planes, stride=stride, dilation=first_dilation,
+            aa_layer=aa_layer, drop_layer=drop_block, **shared_kwargs, **sk_kwargs)
+        # no activation here: it is applied after the residual add in forward()
+        self.conv2 = ConvNormAct(
+            mid_planes, out_planes, kernel_size=3, dilation=dilation, apply_act=False, **shared_kwargs)
+        self.se = create_attn(attn_layer, out_planes)
+        self.act = act_layer(inplace=True)
+        self.downsample = downsample
+        self.drop_path = drop_path
+
+    def zero_init_last(self):
+        """Zero the norm weight of the last conv in the residual branch."""
+        nn.init.zeros_(self.conv2.bn.weight)
+
+    def forward(self, x):
+        identity = x
+        out = self.conv2(self.conv1(x))
+        if self.se is not None:
+            out = self.se(out)
+        if self.drop_path is not None:
+            out = self.drop_path(out)
+        if self.downsample is not None:
+            identity = self.downsample(identity)
+        out += identity
+        return self.act(out)
+
+
+class SelectiveKernelBottleneck(nn.Module):
+    """Bottleneck residual block (expansion 4) whose middle convolution is a SelectiveKernel.
+
+    Layout: 1x1 ConvNormAct -> SelectiveKernel (grouped by ``cardinality``) -> 1x1 ConvNormAct
+    without activation -> optional attention -> optional drop path -> residual add -> activation.
+    """
+    expansion = 4
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, sk_kwargs=None,
+            reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super().__init__()
+
+        if not sk_kwargs:
+            sk_kwargs = {}
+        shared_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+        # inner width scales with base_width / 64 and with the group count
+        width = int(math.floor(planes * (base_width / 64)) * cardinality)
+        mid_planes = width // reduce_first
+        out_planes = planes * self.expansion
+        if not first_dilation:
+            first_dilation = dilation
+
+        self.conv1 = ConvNormAct(inplanes, mid_planes, kernel_size=1, **shared_kwargs)
+        # stride (and optional anti-aliasing / drop block) is applied by the selective kernel conv
+        self.conv2 = SelectiveKernel(
+            mid_planes, width, stride=stride, dilation=first_dilation, groups=cardinality,
+            aa_layer=aa_layer, drop_layer=drop_block, **shared_kwargs, **sk_kwargs)
+        # no activation here: it is applied after the residual add in forward()
+        self.conv3 = ConvNormAct(width, out_planes, kernel_size=1, apply_act=False, **shared_kwargs)
+        self.se = create_attn(attn_layer, out_planes)
+        self.act = act_layer(inplace=True)
+        self.downsample = downsample
+        self.drop_path = drop_path
+
+    def zero_init_last(self):
+        """Zero the norm weight of the last conv in the residual branch."""
+        nn.init.zeros_(self.conv3.bn.weight)
+
+    def forward(self, x):
+        identity = x
+        out = self.conv3(self.conv2(self.conv1(x)))
+        if self.se is not None:
+            out = self.se(out)
+        if self.drop_path is not None:
+            out = self.drop_path(out)
+        if self.downsample is not None:
+            identity = self.downsample(identity)
+        out += identity
+        return self.act(out)
+
+
+def _create_skresnet(variant, pretrained=False, **kwargs):
+    """Build a ResNet for ``variant`` through build_model_with_cfg, forwarding ``kwargs``."""
+    model = build_model_with_cfg(ResNet, variant, pretrained, **kwargs)
+    return model
+
+
+@register_model
+def skresnet18(pretrained=False, **kwargs):
+    """Selective Kernel ResNet-18: SelectiveKernelBasic blocks with layers [2, 2, 2, 2].
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variation splits the input channels to the selective
+    convolutions (split_input=True) to keep the param count down.
+    """
+    selective_kernel_cfg = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True)
+    resnet_kwargs = dict(
+        block=SelectiveKernelBasic,
+        layers=[2, 2, 2, 2],
+        block_args=dict(sk_kwargs=selective_kernel_cfg),
+        zero_init_last=False,
+        **kwargs)
+    return _create_skresnet('skresnet18', pretrained, **resnet_kwargs)
+
+
+@register_model
+def skresnet34(pretrained=False, **kwargs):
+    """Selective Kernel ResNet-34: SelectiveKernelBasic blocks with layers [3, 4, 6, 3].
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variation splits the input channels to the selective
+    convolutions (split_input=True) to keep the param count down.
+    """
+    selective_kernel_cfg = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True)
+    resnet_kwargs = dict(
+        block=SelectiveKernelBasic,
+        layers=[3, 4, 6, 3],
+        block_args=dict(sk_kwargs=selective_kernel_cfg),
+        zero_init_last=False,
+        **kwargs)
+    return _create_skresnet('skresnet34', pretrained, **resnet_kwargs)
+
+
+@register_model
+def skresnet50(pretrained=False, **kwargs):
+    """Selective Kernel ResNet-50: SelectiveKernelBottleneck blocks with layers [3, 4, 6, 3].
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variation splits the input channels to the selective
+    convolutions (split_input=True) to keep the param count down.
+    """
+    selective_kernel_cfg = dict(split_input=True)
+    resnet_kwargs = dict(
+        block=SelectiveKernelBottleneck,
+        layers=[3, 4, 6, 3],
+        block_args=dict(sk_kwargs=selective_kernel_cfg),
+        zero_init_last=False,
+        **kwargs)
+    return _create_skresnet('skresnet50', pretrained, **resnet_kwargs)
+
+
+@register_model
+def skresnet50d(pretrained=False, **kwargs):
+    """Selective Kernel ResNet-50-D: skresnet50 layout with a deep stem (stem_width=32) and avg_down.
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variation splits the input channels to the selective
+    convolutions (split_input=True) to keep the param count down.
+    """
+    selective_kernel_cfg = dict(split_input=True)
+    resnet_kwargs = dict(
+        block=SelectiveKernelBottleneck,
+        layers=[3, 4, 6, 3],
+        stem_width=32,
+        stem_type='deep',
+        avg_down=True,
+        block_args=dict(sk_kwargs=selective_kernel_cfg),
+        zero_init_last=False,
+        **kwargs)
+    return _create_skresnet('skresnet50d', pretrained, **resnet_kwargs)
+
+
+@register_model
+def skresnext50_32x4d(pretrained=False, **kwargs):
+    """Selective Kernel ResNeXt50-32x4d (cardinality=32, base_width=4, no input splitting).
+
+    This should be equivalent to the SKNet-50 model in the Selective Kernel paper.
+    """
+    selective_kernel_cfg = dict(rd_ratio=1 / 16, rd_divisor=32, split_input=False)
+    resnet_kwargs = dict(
+        block=SelectiveKernelBottleneck,
+        layers=[3, 4, 6, 3],
+        cardinality=32,
+        base_width=4,
+        block_args=dict(sk_kwargs=selective_kernel_cfg),
+        zero_init_last=False,
+        **kwargs)
+    return _create_skresnet('skresnext50_32x4d', pretrained, **resnet_kwargs)
+
diff --git a/src/custom_timm/models/swin_transformer.py b/src/custom_timm/models/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2e215dc8d98ba91ced0f381096c2be8c3f8163
--- /dev/null
+++ b/src/custom_timm/models/swin_transformer.py
@@ -0,0 +1,700 @@
+""" Swin Transformer
+A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`
+    - https://arxiv.org/pdf/2103.14030
+
+Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below
+
+S3 (AutoFormerV2, https://arxiv.org/abs/2111.14725) Swin weights from
+    - https://github.com/microsoft/Cream/tree/main/AutoFormerV2
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+import logging
+import math
+from functools import partial
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg, named_apply, checkpoint_seq
+from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, to_ntuple, trunc_normal_, _assert
+from .registry import register_model
+from .vision_transformer import checkpoint_filter_fn, get_init_weights_vit
+
+
+# Module-level logger, named after this module.
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict used by the entries in ``default_cfgs``.
+
+    ``kwargs`` override matching default keys and add any new ones.
+    """
+    cfg = {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        'pool_size': None,
+        'crop_pct': .9,
+        'interpolation': 'bicubic',
+        'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN,
+        'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj',
+        'classifier': 'head',
+    }
+    cfg.update(kwargs)
+    return cfg
+
+
+# Pretrained configs keyed by model variant name, each built from the `_cfg` defaults.
+# The 384-input entries override input_size and crop_pct; the `_in22k` entries override num_classes.
+default_cfgs = {
+    # entries using the default num_classes (1000)
+    'swin_base_patch4_window12_384': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+
+    'swin_base_patch4_window7_224': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth',
+    ),
+
+    'swin_large_patch4_window12_384': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+
+    'swin_large_patch4_window7_224': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pth',
+    ),
+
+    'swin_small_patch4_window7_224': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth',
+    ),
+
+    'swin_tiny_patch4_window7_224': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth',
+    ),
+
+    # `_in22k` entries: num_classes=21841
+    'swin_base_patch4_window12_384_in22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841),
+
+    'swin_base_patch4_window7_224_in22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pth',
+        num_classes=21841),
+
+    'swin_large_patch4_window12_384_in22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth',
+        input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841),
+
+    'swin_large_patch4_window7_224_in22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pth',
+        num_classes=21841),
+
+    # S3 (AutoFormerV2) entries
+    'swin_s3_tiny_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_t-1d53f6a8.pth'
+    ),
+    'swin_s3_small_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_s-3bb4c69d.pth'
+    ),
+    'swin_s3_base_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_b-a1e95db4.pth'
+    )
+}
+
+
+def window_partition(x, window_size: int):
+    """Split a (B, H, W, C) feature map into non-overlapping square windows.
+
+    Args:
+        x: tensor of shape (B, H, W, C)
+        window_size (int): side length of each window
+
+    Returns:
+        windows: tensor of shape (num_windows*B, window_size, window_size, C), ordered by
+        batch, then window row, then window column
+    """
+    batch, height, width, channels = x.shape
+    # expose the window grid: (B, rows, window_size, cols, window_size, C)
+    grid = x.view(batch, height // window_size, window_size, width // window_size, window_size, channels)
+    # bring the two grid axes together so each window becomes one contiguous block
+    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
+    return grid.view(-1, window_size, window_size, channels)
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse(windows, window_size: int, H: int, W: int):
+    """Reassemble windows produced by ``window_partition`` into a (B, H, W, C) feature map.
+
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+
+    Returns:
+        x: (B, H, W, C)
+    """
+    num_win_h = H // window_size
+    num_win_w = W // window_size
+    # exact integer arithmetic for the batch size instead of float division + int() truncation
+    B = windows.shape[0] // (num_win_h * num_win_w)
+    x = windows.view(B, num_win_h, num_win_w, window_size, window_size, -1)
+    # interleave window rows/cols back with their in-window offsets, then merge
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+def get_relative_position_index(win_h, win_w):
+    """Return a (Wh*Ww, Wh*Ww) table of flattened relative-offset indices for a window.
+
+    Entry [i, j] encodes the (row, col) offset from token j to token i as
+    (d_row + win_h - 1) * (2 * win_w - 1) + (d_col + win_w - 1), so values lie in
+    [0, (2*win_h - 1) * (2*win_w - 1) - 1].
+    """
+    grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))  # 2, Wh, Ww
+    flat = grid.flatten(1)  # 2, Wh*Ww
+    delta = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
+    # shift both offsets to start from 0, then scale the row offset by the column range
+    row_term = (delta[0] + (win_h - 1)) * (2 * win_w - 1)
+    col_term = delta[1] + (win_w - 1)
+    return row_term + col_term  # Wh*Ww, Wh*Ww
+
+
+class WindowAttention(nn.Module):
+    r""" Multi-head self attention over fixed-size windows with a learned relative position bias.
+
+    The same module serves shifted and non-shifted windows; the difference is expressed only
+    through the optional additive ``mask`` passed to ``forward``.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        head_dim (int): Number of channels per head (dim // num_heads if not set)
+        window_size (tuple[int]): The height and width of the window.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self, dim, num_heads, head_dim=None, window_size=7, qkv_bias=True, attn_drop=0., proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = to_2tuple(window_size)  # (Wh, Ww)
+        win_h, win_w = self.window_size
+        self.window_area = win_h * win_w
+        self.num_heads = num_heads
+        if not head_dim:
+            head_dim = dim // num_heads
+        attn_dim = head_dim * num_heads
+        self.scale = head_dim ** -0.5
+
+        # one learnable bias per head for each of the (2*Wh-1) * (2*Ww-1) possible relative offsets
+        num_offsets = (2 * win_h - 1) * (2 * win_w - 1)
+        self.relative_position_bias_table = nn.Parameter(torch.zeros(num_offsets, num_heads))
+
+        # lookup from each (query token, key token) pair inside the window to a row of the bias table
+        self.register_buffer("relative_position_index", get_relative_position_index(win_h, win_w))
+
+        self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(attn_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def _get_rel_pos_bias(self) -> torch.Tensor:
+        """Gather the bias table into a (1, nH, Wh*Ww, Wh*Ww) tensor that broadcasts over windows."""
+        flat_index = self.relative_position_index.view(-1)
+        bias = self.relative_position_bias_table[flat_index]
+        bias = bias.view(self.window_area, self.window_area, -1)  # Wh*Ww, Wh*Ww, nH
+        bias = bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        return bias.unsqueeze(0)
+
+    def forward(self, x, mask: Optional[torch.Tensor] = None):
+        """
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, _ = x.shape
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+
+        scores = (q * self.scale) @ k.transpose(-2, -1)
+        scores = scores + self._get_rel_pos_bias()
+
+        if mask is not None:
+            # add the per-window mask, broadcasting over the batch and head dimensions
+            num_win = mask.shape[0]
+            scores = scores.view(B_ // num_win, num_win, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            scores = scores.view(-1, self.num_heads, N, N)
+
+        weights = self.attn_drop(self.softmax(scores))
+
+        out = (weights @ v).transpose(1, 2).reshape(B_, N, -1)
+        out = self.proj(out)
+        return self.proj_drop(out)
+
+
+class SwinTransformerBlock(nn.Module):
+    r""" Swin Transformer Block.
+
+    Applies window attention (optionally on a cyclically shifted feature map) followed by an
+    MLP, each wrapped in a pre-norm residual connection with stochastic depth.
+
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution.
+        window_size (int): Window size.
+        num_heads (int): Number of attention heads.
+        head_dim (int): Enforce the number of channels per head
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(
+            self, dim, input_resolution, num_heads=4, head_dim=None, window_size=7, shift_size=0,
+            mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
+            act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        if min(self.input_resolution) <= self.window_size:
+            # if window size is larger than input resolution, we don't partition windows
+            # (the window is shrunk to the smaller input side and shifting is disabled)
+            self.shift_size = 0
+            self.window_size = min(self.input_resolution)
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim, num_heads=num_heads, head_dim=head_dim, window_size=to_2tuple(self.window_size),
+            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        if self.shift_size > 0:
+            # calculate attention mask for SW-MSA
+            H, W = self.input_resolution
+            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
+            # label the 3 x 3 regions cut by the window/shift boundaries with ids 0..8
+            cnt = 0
+            for h in (
+                    slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None)):
+                for w in (
+                        slice(0, -self.window_size),
+                        slice(-self.window_size, -self.shift_size),
+                        slice(-self.shift_size, None)):
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+            mask_windows = window_partition(img_mask, self.window_size)  # num_win, window_size, window_size, 1
+            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+            # pairwise id difference inside each window: non-zero means the two tokens are in different regions
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            # token pairs from different regions get -100.0 added to their logits, same-region pairs get 0.0
+            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        else:
+            attn_mask = None
+
+        # registered as a buffer (None when no shift is used)
+        self.register_buffer("attn_mask", attn_mask)
+
+    def forward(self, x):
+        # x: (B, L, C) with L == H * W of the configured input_resolution (checked below)
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        _assert(L == H * W, "input feature has wrong size")
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+        else:
+            shifted_x = x
+
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)  # num_win*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # num_win*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # num_win*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+        x = x.view(B, H * W, C)
+
+        # FFN
+        # first residual adds the attention output; second adds the MLP applied to the normed result
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    r""" Patch Merging Layer.
+
+    Concatenates every 2x2 neighbourhood of tokens along the channel axis (4 * dim channels),
+    normalizes, and linearly projects to ``out_dim`` channels.
+
+    Args:
+        input_resolution (tuple[int]): Resolution of input feature.
+        dim (int): Number of input channels.
+        out_dim (int, optional): Number of output channels. Default: 2 * dim
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, input_resolution, dim, out_dim=None, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.out_dim = out_dim or 2 * dim
+        merged_dim = 4 * dim
+        self.norm = norm_layer(merged_dim)
+        self.reduction = nn.Linear(merged_dim, self.out_dim, bias=False)
+
+    def forward(self, x):
+        """
+        x: B, H*W, C
+        """
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        _assert(L == H * W, "input feature has wrong size")
+        _assert(H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even.")
+
+        x = x.view(B, H, W, C)
+
+        # the four members of every 2x2 cell, each B H/2 W/2 C
+        even_row_even_col = x[:, 0::2, 0::2, :]
+        odd_row_even_col = x[:, 1::2, 0::2, :]
+        even_row_odd_col = x[:, 0::2, 1::2, :]
+        odd_row_odd_col = x[:, 1::2, 1::2, :]
+        merged = torch.cat([even_row_even_col, odd_row_even_col, even_row_odd_col, odd_row_odd_col], -1)
+        merged = merged.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        return self.reduction(self.norm(merged))
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+
+    Stacks ``depth`` SwinTransformerBlocks (even-indexed blocks unshifted, odd-indexed blocks
+    shifted by window_size // 2), optionally followed by a downsample layer.
+
+    Args:
+        dim (int): Number of input channels.
+        out_dim (int | None): Output channels passed to the downsample layer.
+        input_resolution (tuple[int]): Input resolution.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        head_dim (int): Channels per head (dim // num_heads if not set)
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate, either one
+            value shared by all blocks or one value per block. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(
+            self, dim, out_dim, input_resolution, depth, num_heads=4, head_dim=None,
+            window_size=7, mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
+            drop_path=0., norm_layer=nn.LayerNorm, downsample=None):
+
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.grad_checkpointing = False
+
+        # A tuple of per-block rates is documented as valid, so index it like a list instead of
+        # handing the whole sequence to every block (which fails at `drop_path > 0.`).
+        per_block_drop_path = isinstance(drop_path, (list, tuple))
+
+        # build blocks
+        self.blocks = nn.Sequential(*[
+            SwinTransformerBlock(
+                dim=dim, input_resolution=input_resolution, num_heads=num_heads, head_dim=head_dim,
+                window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop, attn_drop=attn_drop,
+                drop_path=drop_path[i] if per_block_drop_path else drop_path, norm_layer=norm_layer)
+            for i in range(depth)])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(input_resolution, dim=dim, out_dim=out_dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        # checkpointing is skipped while scripting
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return x
+
+
+class SwinTransformer(nn.Module):
+    r""" Swin Transformer
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+
+    Args:
+        img_size (int | tuple(int)): Input image size. Default 224
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input image channels. Default: 3
+        num_classes (int): Number of classes for classification head. Default: 1000
+        embed_dim (int): Patch embedding dimension. Default: 96
+        depths (tuple(int)): Depth of each Swin Transformer layer.
+        num_heads (tuple(int)): Number of attention heads in different layers.
+        head_dim (int, tuple(int)):
+        window_size (int): Window size. Default: 7
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        drop_rate (float): Dropout rate. Default: 0
+        attn_drop_rate (float): Attention dropout rate. Default: 0
+        drop_path_rate (float): Stochastic depth rate. Default: 0.1
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True
+    """
+
+    def __init__(
+            self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg',
+            embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), head_dim=None,
+            window_size=7, mlp_ratio=4., qkv_bias=True,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
+            norm_layer=nn.LayerNorm, ape=False, patch_norm=True, weight_init='', **kwargs):
+        super().__init__()
+        assert global_pool in ('', 'avg')
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if patch_norm else None)
+        num_patches = self.patch_embed.num_patches
+        self.patch_grid = self.patch_embed.grid_size
+
+        # absolute position embedding
+        self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) if ape else None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # build layers
+        if not isinstance(embed_dim, (tuple, list)):
+            embed_dim = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        embed_out_dim = embed_dim[1:] + [None]
+        head_dim = to_ntuple(self.num_layers)(head_dim)
+        window_size = to_ntuple(self.num_layers)(window_size)
+        mlp_ratio = to_ntuple(self.num_layers)(mlp_ratio)
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        layers = []
+        for i in range(self.num_layers):
+            layers += [BasicLayer(
+                dim=embed_dim[i],
+                out_dim=embed_out_dim[i],
+                input_resolution=(self.patch_grid[0] // (2 ** i), self.patch_grid[1] // (2 ** i)),
+                depth=depths[i],
+                num_heads=num_heads[i],
+                head_dim=head_dim[i],
+                window_size=window_size[i],
+                mlp_ratio=mlp_ratio[i],
+                qkv_bias=qkv_bias,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if (i < self.num_layers - 1) else None
+            )]
+        self.layers = nn.Sequential(*layers)
+
+        self.norm = norm_layer(self.num_features)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        if weight_init != 'skip':
+            self.init_weights(weight_init)
+
+    @torch.jit.ignore
+    def init_weights(self, mode=''):
+        assert mode in ('jax', 'jax_nlhb', 'moco', '')
+        if self.absolute_pos_embed is not None:
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
+        named_apply(get_init_weights_vit(mode, head_bias=head_bias), self)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        nwd = {'absolute_pos_embed'}
+        for n, _ in self.named_parameters():
+            if 'relative_position_bias_table' in n:
+                nwd.add(n)
+        return nwd
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return 'stem'/'blocks' name-matching regexes; with coarse=True 'blocks' is one per-stage pattern."""
+        return dict(
+            stem=r'^absolute_pos_embed|patch_embed',  # stem and embed
+            blocks=r'^layers\.(\d+)' if coarse else [
+                (r'^layers\.(\d+)\.downsample', (0,)),  # dot escaped: only a literal '.downsample' matches
+                (r'^layers\.(\d+)\.\w+\.(\d+)', None),
+                (r'^norm', (99999,)),
+            ])
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for stage in self.layers:  # every entry of `self.layers` carries its own flag
+            stage.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head  # the classifier module stored on `self.head`
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:  # pooling mode is only replaced when explicitly given
+            assert global_pool in ('', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Identity() if num_classes <= 0 else nn.Linear(self.num_features, num_classes)
+
+    def forward_features(self, x):
+        # Patch embedding -> optional absolute position embedding -> dropout -> stages -> final norm.
+        tokens = self.patch_embed(x)
+        if self.absolute_pos_embed is not None:
+            tokens = tokens + self.absolute_pos_embed
+        tokens = self.pos_drop(tokens)
+        tokens = self.layers(tokens)
+        return self.norm(tokens)  # B L C
+
+    def forward_head(self, x, pre_logits: bool = False):
+        # Mean over dim 1 when global_pool is 'avg'; the head is skipped when pre_logits is set.
+        pooled = x.mean(dim=1) if self.global_pool == 'avg' else x
+        return pooled if pre_logits else self.head(pooled)
+
+    def forward(self, x):
+        # Feature extraction first, then the pooling/classifier head.
+        features = self.forward_features(x)
+        return self.forward_head(features)
+
+
+def _create_swin_transformer(variant, pretrained=False, **kwargs):
+    """Build the SwinTransformer `variant` through build_model_with_cfg, passing checkpoint_filter_fn."""
+    return build_model_with_cfg(
+        SwinTransformer,
+        variant,
+        pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn, **kwargs)
+
+
+@register_model
+def swin_base_patch4_window12_384(pretrained=False, **kwargs):
+    """Swin-B at 384x384 (patch 4, window 12); pretrained on ImageNet-22k, fine-tuned on 1k."""
+    variant_args = dict(
+        patch_size=4, window_size=12, embed_dim=128,
+        depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer('swin_base_patch4_window12_384', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_base_patch4_window7_224(pretrained=False, **kwargs):
+    """Swin-B at 224x224 (patch 4, window 7); pretrained on ImageNet-22k, fine-tuned on 1k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=128,
+        depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer('swin_base_patch4_window7_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_large_patch4_window12_384(pretrained=False, **kwargs):
+    """Swin-L at 384x384 (patch 4, window 12); pretrained on ImageNet-22k, fine-tuned on 1k."""
+    variant_args = dict(
+        patch_size=4, window_size=12, embed_dim=192,
+        depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
+    return _create_swin_transformer('swin_large_patch4_window12_384', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_large_patch4_window7_224(pretrained=False, **kwargs):
+    """Swin-L at 224x224 (patch 4, window 7); pretrained on ImageNet-22k, fine-tuned on 1k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=192,
+        depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
+    return _create_swin_transformer('swin_large_patch4_window7_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_small_patch4_window7_224(pretrained=False, **kwargs):
+    """Swin-S at 224x224 (patch 4, window 7); trained on ImageNet-1k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=96,
+        depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer('swin_small_patch4_window7_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_tiny_patch4_window7_224(pretrained=False, **kwargs):
+    """Swin-T at 224x224 (patch 4, window 7); trained on ImageNet-1k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=96,
+        depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer('swin_tiny_patch4_window7_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_base_patch4_window12_384_in22k(pretrained=False, **kwargs):
+    """Swin-B at 384x384 (patch 4, window 12); trained on ImageNet-22k."""
+    variant_args = dict(
+        patch_size=4, window_size=12, embed_dim=128,
+        depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer('swin_base_patch4_window12_384_in22k', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_base_patch4_window7_224_in22k(pretrained=False, **kwargs):
+    """Swin-B at 224x224 (patch 4, window 7); trained on ImageNet-22k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=128,
+        depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer('swin_base_patch4_window7_224_in22k', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_large_patch4_window12_384_in22k(pretrained=False, **kwargs):
+    """Swin-L at 384x384 (patch 4, window 12); trained on ImageNet-22k."""
+    variant_args = dict(
+        patch_size=4, window_size=12, embed_dim=192,
+        depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
+    return _create_swin_transformer('swin_large_patch4_window12_384_in22k', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_large_patch4_window7_224_in22k(pretrained=False, **kwargs):
+    """Swin-L at 224x224 (patch 4, window 7); trained on ImageNet-22k."""
+    variant_args = dict(
+        patch_size=4, window_size=7, embed_dim=192,
+        depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
+    return _create_swin_transformer('swin_large_patch4_window7_224_in22k', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_s3_tiny_224(pretrained=False, **kwargs):
+    """Swin-S3-T at 224x224, ImageNet-1k; per-stage window sizes (7, 7, 14, 7).
+    Paper: https://arxiv.org/abs/2111.14725
+    """
+    variant_args = dict(patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96,
+                        depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer('swin_s3_tiny_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_s3_small_224(pretrained=False, **kwargs):
+    """Swin-S3-S at 224x224, trained on ImageNet-1k; per-stage window sizes (14, 14, 14, 7).
+    Paper: https://arxiv.org/abs/2111.14725
+    """
+    variant_args = dict(patch_size=4, window_size=(14, 14, 14, 7), embed_dim=96,
+                        depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer('swin_s3_small_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def swin_s3_base_224(pretrained=False, **kwargs):
+    """Swin-S3-B at 224x224, trained on ImageNet-1k; per-stage window sizes (7, 7, 14, 7).
+    Paper: https://arxiv.org/abs/2111.14725
+    """
+    variant_args = dict(patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96,
+                        depths=(2, 2, 30, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer('swin_s3_base_224', pretrained=pretrained, **variant_args)
+
diff --git a/src/custom_timm/models/swin_transformer_v2.py b/src/custom_timm/models/swin_transformer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ade2b050a956fe6f30811736d196d3f33e4dcc7c
--- /dev/null
+++ b/src/custom_timm/models/swin_transformer_v2.py
@@ -0,0 +1,753 @@
+""" Swin Transformer V2
+A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
+    - https://arxiv.org/abs/2111.09883
+
+Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below
+
+Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
+"""
+# --------------------------------------------------------
+# Swin Transformer V2
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+import math
+from typing import Tuple, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg, named_apply
+from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, to_ntuple, trunc_normal_, _assert
+from .registry import register_model
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained-config dict for `url`; entries in `kwargs` override the defaults."""
+    defaults = {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head'}
+    return {**defaults, **kwargs}
+
+
+default_cfgs = {  # pretrained configs built by _cfg(); presumably keyed by model variant name (TODO confirm)
+    'swinv2_tiny_window8_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_tiny_window16_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_small_window8_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_small_window16_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_base_window8_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_base_window16_256': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pth',
+        input_size=(3, 256, 256)
+    ),
+    # Entries below use 22k checkpoints (num_classes=21841) or their 22k->1k fine-tuned variants.
+    'swinv2_base_window12_192_22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pth',
+        num_classes=21841, input_size=(3, 192, 192)
+    ),
+    'swinv2_base_window12to16_192to256_22kft1k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_base_window12to24_192to384_22kft1k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth',
+        input_size=(3, 384, 384), crop_pct=1.0,
+    ),
+    'swinv2_large_window12_192_22k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth',
+        num_classes=21841, input_size=(3, 192, 192)
+    ),
+    'swinv2_large_window12to16_192to256_22kft1k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pth',
+        input_size=(3, 256, 256)
+    ),
+    'swinv2_large_window12to24_192to384_22kft1k': _cfg(
+        url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pth',
+        input_size=(3, 384, 384), crop_pct=1.0,
+    ),
+}
+
+
+def window_partition(x, window_size: Tuple[int, int]):
+    """Split a (B, H, W, C) feature map into non-overlapping windows.
+
+    Args:
+        x: tensor of shape (B, H, W, C)
+        window_size (Tuple[int, int]): window height and width
+
+    Returns: tensor of shape (num_windows * B, window_size[0], window_size[1], C)
+    """
+    win_h, win_w = window_size
+    batch, height, width, channels = x.shape
+    grid = x.view(batch, height // win_h, win_h, width // win_w, win_w, channels)
+    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, win_h, win_w, channels)
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]):
+    """Reassemble windows into a (B, H, W, C) feature map.
+
+    Args:
+        windows: (num_windows * B, window_size[0], window_size[1], C)
+        window_size (Tuple[int, int]): Window size
+        img_size (Tuple[int, int]): Image size
+
+    Returns: tensor of shape (B, H, W, C)
+    """
+    H, W = img_size
+    win_h, win_w = window_size[0], window_size[1]
+    B = int(windows.shape[0] / (H * W / win_h / win_w))  # batch size recovered from the window count
+    grid = windows.view(B, H // win_h, W // win_w, win_h, win_w, -1)
+    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+
+
+class WindowAttention(nn.Module):
+    r""" Window based multi-head self attention (W-MSA) using cosine attention with a learned per-head
+    logit scale and an MLP-generated relative position bias. Supports shifted and non-shifted windows.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query and value (key bias is a zero buffer).
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+        pretrained_window_size (tuple[int]): Pre-training window (H, W); a leading 0 means use window_size.
+    """
+
+    def __init__(
+            self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.,
+            pretrained_window_size=(0, 0)):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.pretrained_window_size = pretrained_window_size
+        self.num_heads = num_heads
+
+        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))  # per head, log space
+
+        # mlp to generate continuous relative position bias
+        self.cpb_mlp = nn.Sequential(
+            nn.Linear(2, 512, bias=True),
+            nn.ReLU(inplace=True),
+            nn.Linear(512, num_heads, bias=False)
+        )
+
+        # get relative_coords_table
+        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
+        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
+        relative_coords_table = torch.stack(torch.meshgrid([
+            relative_coords_h,
+            relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
+        if pretrained_window_size[0] > 0:
+            relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
+            relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
+        else:
+            relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
+            relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
+        relative_coords_table *= 8  # normalize to -8, 8
+        relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
+            torch.abs(relative_coords_table) + 1.0) / math.log2(8)
+
+        self.register_buffer("relative_coords_table", relative_coords_table, persistent=False)
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index, persistent=False)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(dim))
+            self.register_buffer('k_bias', torch.zeros(dim), persistent=False)  # fixed zeros, not learned
+            self.v_bias = nn.Parameter(torch.zeros(dim))
+        else:
+            self.q_bias = None
+            self.k_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask: Optional[torch.Tensor] = None):
+        """
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive mask (0 / large negative) with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias))
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+
+        # cosine attention
+        attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
+        logit_scale = torch.clamp(self.logit_scale, max=math.log(1. / 0.01)).exp()  # scale capped at 100
+        attn = attn * logit_scale
+
+        relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
+        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)  # bias bounded to (0, 16)
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    r""" Swin Transformer Block: windowed attention then MLP, each normalised before its residual add.
+
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution (H, W); an int is expanded by to_2tuple.
+        num_heads (int): Number of attention heads.
+        window_size (int): Target window size; clamped per axis to the input resolution.
+        shift_size (int): Shift size for SW-MSA; forced to 0 on an axis the window fully covers.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+        pretrained_window_size (int): Window size in pretraining.
+    """
+
+    def __init__(
+            self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
+            mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
+            act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = to_2tuple(input_resolution)
+        self.num_heads = num_heads
+        ws, ss = self._calc_window_shift(window_size, shift_size)  # reads self.input_resolution set above
+        self.window_size: Tuple[int, int] = ws
+        self.shift_size: Tuple[int, int] = ss
+        self.window_area = self.window_size[0] * self.window_size[1]
+        self.mlp_ratio = mlp_ratio
+
+        self.attn = WindowAttention(
+            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
+            pretrained_window_size=to_2tuple(pretrained_window_size))
+        self.norm1 = norm_layer(dim)
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+        self.norm2 = norm_layer(dim)
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        if any(self.shift_size):
+            # calculate attention mask for SW-MSA: label up to 3x3 regions, then mask pairs with different labels
+            H, W = self.input_resolution
+            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
+            cnt = 0
+            for h in (
+                    slice(0, -self.window_size[0]),
+                    slice(-self.window_size[0], -self.shift_size[0]),
+                    slice(-self.shift_size[0], None)):
+                for w in (
+                        slice(0, -self.window_size[1]),
+                        slice(-self.window_size[1], -self.shift_size[1]),
+                        slice(-self.shift_size[1], None)):
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+            mask_windows = mask_windows.view(-1, self.window_area)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)  # nonzero where region labels differ
+            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        else:
+            attn_mask = None
+
+        self.register_buffer("attn_mask", attn_mask)  # None when no axis is shifted
+
+    def _calc_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
+        target_window_size = to_2tuple(target_window_size)
+        target_shift_size = to_2tuple(target_shift_size)
+        window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]  # min(r, w)
+        shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)]
+        return tuple(window_size), tuple(shift_size)
+
+    def _attn(self, x):
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        _assert(L == H * W, "input feature has wrong size")
+        x = x.view(B, H, W, C)
+
+        # cyclic shift
+        has_shift = any(self.shift_size)
+        if has_shift:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2))
+        else:
+            shifted_x = x
+
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_area, C)  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C)
+        shifted_x = window_reverse(attn_windows, self.window_size, self.input_resolution)  # B H' W' C
+
+        # reverse cyclic shift
+        if has_shift:
+            x = torch.roll(shifted_x, shifts=self.shift_size, dims=(1, 2))
+        else:
+            x = shifted_x
+        x = x.view(B, H * W, C)
+        return x
+
+    def forward(self, x):
+        x = x + self.drop_path1(self.norm1(self._attn(x)))  # norm is applied to the attention output
+        x = x + self.drop_path2(self.norm2(self.mlp(x)))  # norm is applied to the MLP output
+        return x
+
+
+class PatchMerging(nn.Module):
+    r"""Downsample by merging each 2x2 token neighbourhood, then reduce 4 * dim -> 2 * dim channels.
+
+    Args:
+        input_resolution (tuple[int]): (H, W) of the incoming feature map.
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer class.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.input_resolution, self.dim = input_resolution, dim
+        merged_dim, reduced_dim = 4 * dim, 2 * dim
+        self.reduction = nn.Linear(merged_dim, reduced_dim, bias=False)
+        self.norm = norm_layer(reduced_dim)
+
+    def forward(self, x):
+        """Merge 2x2 neighbourhoods of a flattened feature map.
+
+        x: B, H*W, C  ->  B, H/2*W/2, 2*dim
+        """
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        _assert(L == H * W, "input feature has wrong size")
+        _assert(H % 2 == 0, f"x size ({H}*{W}) are not even.")
+        _assert(W % 2 == 0, f"x size ({H}*{W}) are not even.")
+
+        grid = x.view(B, H, W, C)
+        # The concat order below fixes which input channels of `reduction` each corner feeds.
+        corners = [
+            grid[:, 0::2, 0::2, :],  # even rows, even cols: B H/2 W/2 C
+            grid[:, 1::2, 0::2, :],  # odd rows, even cols
+            grid[:, 0::2, 1::2, :],  # even rows, odd cols
+            grid[:, 1::2, 1::2, :],  # odd rows, odd cols
+        ]
+        merged = torch.cat(corners, -1)  # B H/2 W/2 4*C
+        merged = merged.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        return self.norm(self.reduction(merged))
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage: `depth` blocks followed by an optional downsample.
+
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size; odd-indexed blocks use a shift of window_size // 2.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float], optional): Stochastic depth rate; a list is indexed per block.
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        pretrained_window_size (int): Local window size in pre-training.
+    """
+
+    def __init__(
+            self, dim, input_resolution, depth, num_heads, window_size,
+            mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
+            norm_layer=nn.LayerNorm, downsample=None, pretrained_window_size=0):
+
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.grad_checkpointing = False  # read in forward()
+
+        # build blocks: even-indexed blocks are unshifted, odd-indexed blocks are shifted
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim, input_resolution=input_resolution,
+                num_heads=num_heads, window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop=drop, attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer,
+                pretrained_window_size=pretrained_window_size)
+            for i in range(depth)])
+
+        # patch merging layer (identity when no downsample class is given)
+        if downsample is not None:
+            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = nn.Identity()
+
+    def forward(self, x):
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():  # checkpoint path is skipped when scripting
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        x = self.downsample(x)
+        return x
+
+    def _init_respostnorm(self):
+        for blk in self.blocks:  # zero both norms of every block (weight and bias)
+            nn.init.constant_(blk.norm1.bias, 0)
+            nn.init.constant_(blk.norm1.weight, 0)
+            nn.init.constant_(blk.norm2.bias, 0)
+            nn.init.constant_(blk.norm2.weight, 0)
+
+
+class SwinTransformerV2(nn.Module):
+    r""" Swin Transformer V2
+        A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
+            - https://arxiv.org/abs/2111.09883
+    Args:
+        img_size (int | tuple(int)): Input image size. Default 224
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input image channels. Default: 3
+        num_classes (int): Number of classes for classification head. Default: 1000
+        embed_dim (int): Patch embedding dimension. Default: 96
+        depths (tuple(int)): Depth of each Swin Transformer layer.
+        num_heads (tuple(int)): Number of attention heads in different layers.
+        window_size (int): Window size. Default: 7
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        drop_rate (float): Dropout rate. Default: 0
+        attn_drop_rate (float): Attention dropout rate. Default: 0
+        drop_path_rate (float): Stochastic depth rate. Default: 0.1
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True
+        use_checkpoint (bool): Not an explicit constructor argument here; use set_grad_checkpointing() instead.
+        pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer.
+    """
+
+    def __init__(
+            self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg',
+            embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24),
+            window_size=7, mlp_ratio=4., qkv_bias=True,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
+            norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
+            pretrained_window_sizes=(0, 0, 0, 0), **kwargs):
+        super().__init__()
+
+        self.num_classes = num_classes
+        assert global_pool in ('', 'avg')
+        self.global_pool = global_pool
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))  # channel width of the last stage
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+        num_patches = self.patch_embed.num_patches
+
+        # absolute position embedding (only created when ape=True)
+        if ape:
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+        else:
+            self.absolute_pos_embed = None
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth: one rate per block, rising linearly from 0 to drop_path_rate
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+
+        # build layers: each stage doubles the channel width and halves the resolution of the previous one
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2 ** i_layer),
+                input_resolution=(
+                    self.patch_embed.grid_size[0] // (2 ** i_layer),
+                    self.patch_embed.grid_size[1] // (2 ** i_layer)),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop=drop_rate, attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                pretrained_window_size=pretrained_window_sizes[i_layer]
+            )
+            self.layers.append(layer)
+
+        self.norm = norm_layer(self.num_features)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        self.apply(self._init_weights)  # re-initialises every nn.Linear built above
+        for bly in self.layers:
+            bly._init_respostnorm()  # then zero each block's norm1/norm2 weight and bias
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):  # only nn.Linear modules are touched; every other module is left as-is
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:  # m is already known to be nn.Linear, no second isinstance check needed
+                nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        nod = {'absolute_pos_embed'}  # presumably the names an optimizer factory exempts from weight decay
+        for n, m in self.named_modules():  # NOTE(review): names here are module paths, not parameter names — confirm
+            if any([kw in n for kw in ("cpb_mlp", "logit_scale", 'relative_position_bias_table')]):
+                nod.add(n)
+        return nod
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(  # regex groups keyed by 'stem' and 'blocks'; coarse=True uses one per-layer pattern
+            stem=r'^absolute_pos_embed|patch_embed',  # stem and embed
+            blocks=r'^layers\.(\d+)' if coarse else [
+                (r'^layers\.(\d+)\.downsample', (0,)),  # dot before 'downsample' is escaped so it matches literally
+                (r'^layers\.(\d+)\.\w+\.(\d+)', None),
+                (r'^norm', (99999,)),
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for stage in self.layers:  # propagate the flag to every entry of self.layers
+            stage.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head  # nn.Linear when num_classes > 0, otherwise nn.Identity (see where self.head is assigned)
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:  # None keeps the current pooling mode
+            assert global_pool in ('', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()  # fresh head
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        if self.absolute_pos_embed is not None:  # only set when the model was built with ape=True
+            x = x + self.absolute_pos_embed
+        x = self.pos_drop(x)
+
+        for layer in self.layers:  # run the stages in order
+            x = layer(x)
+
+        x = self.norm(x)  # B L C
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool == 'avg':  # '' skips pooling entirely
+            x = x.mean(dim=1)  # average over dim 1 (the L axis of the B L C features from forward_features)
+        return x if pre_logits else self.head(x)  # pre_logits=True returns the features without applying the head
+
+    def forward(self, x):
+        x = self.forward_features(x)  # patch embedding, stages and final norm
+        x = self.forward_head(x)  # optional average pooling, then the classifier head
+        return x
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """Return a copy of ``state_dict`` without the relative-position buffers that should not be persistent.
+
+    When the checkpoint nests its weights under a ``'model'`` key, that inner dict is used. ``model`` is unused.
+    """
+    skipped = ('relative_position_index', 'relative_coords_table')  # buffers that should not be persistent
+    weights = state_dict['model'] if 'model' in state_dict else state_dict  # For deit models
+    return {
+        key: value for key, value in weights.items() if not any(name in key for name in skipped)
+    }
+
+
+def _create_swin_transformer_v2(variant, pretrained=False, **kwargs):
+    """Build the ``SwinTransformerV2`` named ``variant`` through ``build_model_with_cfg``."""
+    # checkpoint_filter_fn is handed over as the pretrained_filter_fn hook
+    return build_model_with_cfg(
+        SwinTransformerV2, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn, **kwargs)
+
+
+@register_model
+def swinv2_tiny_window16_256(pretrained=False, **kwargs):
+    """Build ``swinv2_tiny_window16_256``: window_size=16, embed_dim=96, depths=(2, 2, 6, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=16, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer_v2('swinv2_tiny_window16_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_tiny_window8_256(pretrained=False, **kwargs):
+    """Build ``swinv2_tiny_window8_256``: window_size=8, embed_dim=96, depths=(2, 2, 6, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=8, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer_v2('swinv2_tiny_window8_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_small_window16_256(pretrained=False, **kwargs):
+    """Build ``swinv2_small_window16_256``: window_size=16, embed_dim=96, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=16, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer_v2('swinv2_small_window16_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_small_window8_256(pretrained=False, **kwargs):
+    """Build ``swinv2_small_window8_256``: window_size=8, embed_dim=96, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=8, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
+    return _create_swin_transformer_v2('swinv2_small_window8_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_base_window16_256(pretrained=False, **kwargs):
+    """Build ``swinv2_base_window16_256``: window_size=16, embed_dim=128, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer_v2('swinv2_base_window16_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_base_window8_256(pretrained=False, **kwargs):
+    """Build ``swinv2_base_window8_256``: window_size=8, embed_dim=128, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=8, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer_v2('swinv2_base_window8_256', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_base_window12_192_22k(pretrained=False, **kwargs):
+    """Build ``swinv2_base_window12_192_22k``: window_size=12, embed_dim=128, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
+    return _create_swin_transformer_v2('swinv2_base_window12_192_22k', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_base_window12to16_192to256_22kft1k(pretrained=False, **kwargs):
+    """Build ``swinv2_base_window12to16_192to256_22kft1k``: window_size=16, embed_dim=128,
+    pretrained_window_sizes=(12, 12, 12, 6). Extra ``kwargs`` are merged into the model kwargs."""
+    model_kwargs = dict(
+        window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32),
+        pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
+    return _create_swin_transformer_v2(
+        'swinv2_base_window12to16_192to256_22kft1k', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_base_window12to24_192to384_22kft1k(pretrained=False, **kwargs):
+    """Build ``swinv2_base_window12to24_192to384_22kft1k``: window_size=24, embed_dim=128,
+    pretrained_window_sizes=(12, 12, 12, 6). Extra ``kwargs`` are merged into the model kwargs."""
+    model_kwargs = dict(
+        window_size=24, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32),
+        pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
+    return _create_swin_transformer_v2(
+        'swinv2_base_window12to24_192to384_22kft1k', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_large_window12_192_22k(pretrained=False, **kwargs):
+    """Build ``swinv2_large_window12_192_22k``: window_size=12, embed_dim=192, depths=(2, 2, 18, 2).
+    Extra ``kwargs`` are merged into the model kwargs; ``pretrained`` is passed through unchanged."""
+    model_kwargs = dict(
+        window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
+    return _create_swin_transformer_v2('swinv2_large_window12_192_22k', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_large_window12to16_192to256_22kft1k(pretrained=False, **kwargs):
+    """Build ``swinv2_large_window12to16_192to256_22kft1k``: window_size=16, embed_dim=192,
+    pretrained_window_sizes=(12, 12, 12, 6). Extra ``kwargs`` are merged into the model kwargs."""
+    model_kwargs = dict(
+        window_size=16, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48),
+        pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
+    return _create_swin_transformer_v2(
+        'swinv2_large_window12to16_192to256_22kft1k', pretrained=pretrained, **model_kwargs)
+
+
+@register_model
+def swinv2_large_window12to24_192to384_22kft1k(pretrained=False, **kwargs):
+    """Build ``swinv2_large_window12to24_192to384_22kft1k``: window_size=24, embed_dim=192,
+    pretrained_window_sizes=(12, 12, 12, 6). Extra ``kwargs`` are merged into the model kwargs."""
+    model_kwargs = dict(
+        window_size=24, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48),
+        pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
+    return _create_swin_transformer_v2(
+        'swinv2_large_window12to24_192to384_22kft1k', pretrained=pretrained, **model_kwargs)
diff --git a/src/custom_timm/models/swin_transformer_v2_cr.py b/src/custom_timm/models/swin_transformer_v2_cr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ac4ac572d0b55bc1abf278f34fa9e3bd7bcb7a
--- /dev/null
+++ b/src/custom_timm/models/swin_transformer_v2_cr.py
@@ -0,0 +1,1029 @@
+""" Swin Transformer V2
+
+A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
+    - https://arxiv.org/pdf/2111.09883
+
+Code adapted from https://github.com/ChristophReich1996/Swin-Transformer-V2, original copyright/license info below
+
+This implementation is experimental and subject to change in manners that will break weight compat:
+* Size of the pos embed MLP is not spelled out in the paper in terms of dim: fixed for all models? Varying with num_heads?
+  * currently dim is fixed, I feel it may make sense to scale with num_heads (dim per head)
+* The specifics of the memory saving 'sequential attention' are not detailed, Christoph Reich has an impl at
+  GitHub link above. It needs further investigation as throughput vs mem tradeoff doesn't appear beneficial.
+* num_heads per stage is not detailed for Huge and Giant model variants
+* 'Giant' is 3B params in paper but ~2.6B here despite matching paper dim + block counts
+* experiments are ongoing w.r.t. 'main branch' norm layer use and weight init scheme
+
+Noteworthy additions over official Swin v1:
+* MLP relative position embedding is looking promising and adapts to different image/window sizes
+* This impl has been designed to allow easy change of image size with matching window size changes
+* Non-square image size and window size are supported
+
+Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
+"""
+# --------------------------------------------------------
+# Swin Transformer V2 reimplementation
+# Copyright (c) 2021 Christoph Reich
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Christoph Reich
+# --------------------------------------------------------
+import logging
+import math
+from copy import deepcopy
+from typing import Tuple, Optional, List, Union, Any, Type
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .fx_features import register_notrace_function
+from .helpers import build_model_with_cfg, named_apply
+from .layers import DropPath, Mlp, to_2tuple, _assert
+from .registry import register_model
+
+
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    """Return the default pretrained config for ``url``; any ``kwargs`` override the matching default entries."""
+    cfg = {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        'pool_size': (7, 7),
+        'crop_pct': 0.9,
+        'interpolation': 'bicubic',
+        'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+    }
+    cfg.update(kwargs)  # caller-supplied entries win; new keys are appended after the defaults
+    return cfg
+
+
+default_cfgs = {  # per-variant configs built by _cfg; url is "" where no weight file is listed
+    'swinv2_cr_tiny_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_tiny_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_tiny_ns_224': _cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_tiny_ns_224-ba8166c6.pth",
+        input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_small_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_small_224': _cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_224-0813c165.pth",
+        input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_small_ns_224': _cfg(
+        url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_ns_224_iv-2ce90f8e.pth",
+        input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_base_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_base_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_base_ns_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_large_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_large_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_huge_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_huge_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+    'swinv2_cr_giant_384': _cfg(
+        url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
+    'swinv2_cr_giant_224': _cfg(
+        url="", input_size=(3, 224, 224), crop_pct=0.9),
+}
+
+
+def bchw_to_bhwc(x: torch.Tensor) -> torch.Tensor:
+    """Permute a tensor of shape (B, C, H, W) to channels-last (B, H, W, C)."""
+    return x.permute(0, 2, 3, 1)
+
+
+def bhwc_to_bchw(x: torch.Tensor) -> torch.Tensor:
+    """Permute a tensor of shape (B, H, W, C) to channels-first (B, C, H, W)."""
+    return x.permute(0, 3, 1, 2)
+
+
+def window_partition(x, window_size: Tuple[int, int]):
+    """Split a (B, H, W, C) tensor into non-overlapping windows.
+    Args:
+        x: (B, H, W, C)
+        window_size (Tuple[int, int]): window size along H and W; the view below needs them to divide H and W
+
+    Returns:
+        windows: (num_windows*B, window_size[0], window_size[1], C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C)
+    return windows
+
+
+@register_notrace_function  # reason: int argument is a Proxy
+def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]):
+    """Merge windows back into a (B, H, W, C) feature map (the inverse of window_partition).
+    Args:
+        windows: (num_windows * B, window_size[0], window_size[1], C)
+        window_size (Tuple[int, int]): Window size
+        img_size (Tuple[int, int]): Image size
+
+    Returns:
+        x: (B, H, W, C)
+    """
+    H, W = img_size
+    B = windows.shape[0] // ((H // window_size[0]) * (W // window_size[1]))  # integer math, no float round-trip
+    x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowMultiHeadAttention(nn.Module):
+    r"""This class implements window-based Multi-Head-Attention with log-spaced continuous position bias.
+
+    Args:
+        dim (int): Number of input features
+        num_heads (int): Number of attention heads
+        window_size (Tuple[int, int]): Window size
+        drop_attn (float): Dropout rate of attention map
+        drop_proj (float): Dropout rate after projection
+        meta_hidden_dim (int): Number of hidden features in the two layer MLP meta network
+        sequential_attn (bool): If true sequential self-attention is performed
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: Tuple[int, int],
+        drop_attn: float = 0.0,
+        drop_proj: float = 0.0,
+        meta_hidden_dim: int = 384,  # FIXME what's the optimal value?
+        sequential_attn: bool = False,
+    ) -> None:
+        super(WindowMultiHeadAttention, self).__init__()
+        assert dim % num_heads == 0, \
+            "The number of input features (in_features) are not divisible by the number of heads (num_heads)."
+        self.in_features: int = dim
+        self.window_size: Tuple[int, int] = window_size
+        self.num_heads: int = num_heads
+        self.sequential_attn: bool = sequential_attn
+
+        self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias=True)
+        self.attn_drop = nn.Dropout(drop_attn)
+        self.proj = nn.Linear(in_features=dim, out_features=dim, bias=True)
+        self.proj_drop = nn.Dropout(drop_proj)
+        # meta network for positional encodings
+        self.meta_mlp = Mlp(
+            2,  # x, y
+            hidden_features=meta_hidden_dim,
+            out_features=num_heads,
+            act_layer=nn.ReLU,
+            drop=(0.125, 0.)  # FIXME should there be stochasticity, appears to 'overfit' without?
+        )
+        # NOTE old checkpoints used inverse of logit_scale ('tau') following the paper, see conversion fn
+        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones(num_heads)))
+        self._make_pair_wise_relative_positions()
+
+    def _make_pair_wise_relative_positions(self) -> None:
+        """Method initializes the pair-wise relative positions to compute the positional biases."""
+        device = self.logit_scale.device
+        coordinates = torch.stack(torch.meshgrid([
+            torch.arange(self.window_size[0], device=device),
+            torch.arange(self.window_size[1], device=device)]), dim=0).flatten(1)
+        relative_coordinates = coordinates[:, :, None] - coordinates[:, None, :]
+        relative_coordinates = relative_coordinates.permute(1, 2, 0).reshape(-1, 2).float()
+        relative_coordinates_log = torch.sign(relative_coordinates) * torch.log(
+            1.0 + relative_coordinates.abs())
+        self.register_buffer("relative_coordinates_log", relative_coordinates_log, persistent=False)
+
+    def update_input_size(self, new_window_size: Tuple[int, int], **kwargs: Any) -> None:
+        """Method updates the window size and so the pair-wise relative positions
+
+        Args:
+            new_window_size (Tuple[int, int]): New window size (indexed as [0] and [1] elsewhere in this class)
+            kwargs (Any): Unused
+        """
+        # Set new window size and new pair-wise relative positions
+        self.window_size: Tuple[int, int] = new_window_size
+        self._make_pair_wise_relative_positions()
+
+    def _relative_positional_encodings(self) -> torch.Tensor:
+        """Method computes the relative positional encodings
+
+        Returns:
+            relative_position_bias (torch.Tensor): Relative positional encodings
+            (1, number of heads, window area, window area), window area = window_size[0] * window_size[1]
+        """
+        window_area = self.window_size[0] * self.window_size[1]
+        relative_position_bias = self.meta_mlp(self.relative_coordinates_log)
+        relative_position_bias = relative_position_bias.transpose(1, 0).reshape(
+            self.num_heads, window_area, window_area
+        )
+        relative_position_bias = relative_position_bias.unsqueeze(0)
+        return relative_position_bias
+
+    def _forward_sequential(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Placeholder for the 'sequential' attention path; it is not implemented and only trips the assert below.
+        """
+        # FIXME TODO figure out 'sequential' attention mentioned in paper (should reduce GPU memory)
+        assert False, "not implemented"
+
+    def _forward_batch(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """This function performs standard (non-sequential) scaled cosine self-attention.
+        """
+        Bw, L, C = x.shape
+
+        qkv = self.qkv(x).view(Bw, L, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        query, key, value = qkv.unbind(0)
+
+        # compute attention map with scaled cosine attention
+        attn = (F.normalize(query, dim=-1) @ F.normalize(key, dim=-1).transpose(-2, -1))
+        logit_scale = torch.clamp(self.logit_scale.reshape(1, self.num_heads, 1, 1), max=math.log(1. / 0.01)).exp()
+        attn = attn * logit_scale
+        attn = attn + self._relative_positional_encodings()
+
+        if mask is not None:
+            # Apply mask if utilized
+            num_win: int = mask.shape[0]
+            attn = attn.view(Bw // num_win, num_win, self.num_heads, L, L)
+            attn = attn + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, L, L)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ value).transpose(1, 2).reshape(Bw, L, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """ Forward pass.
+        Args:
+            x (torch.Tensor): Input tensor of the shape (B * windows, N, C)
+            mask (Optional[torch.Tensor]): Attention mask for the shift case
+
+        Returns:
+            Output tensor of the shape [B * windows, N, C]
+        """
+        if self.sequential_attn:
+            return self._forward_sequential(x, mask)
+        else:
+            return self._forward_batch(x, mask)
+
+
+class SwinTransformerBlock(nn.Module):
+    r"""This class implements the Swin transformer block.
+
+    Args:
+        dim (int): Number of input channels
+        num_heads (int): Number of attention heads to be utilized
+        feat_size (Tuple[int, int]): Input resolution
+        window_size (Tuple[int, int]): Window size to be utilized
+        shift_size (Tuple[int, int]): Shifting size to be used
+        mlp_ratio (float): Ratio of the hidden dimension in the FFN to the input channels
+        init_values (Optional[float]): If not None, constant written to the norm1 / norm2 weights at init
+        drop (float): Dropout in input mapping
+        drop_attn (float): Dropout rate of attention map
+        drop_path (float): Dropout in main path
+        extra_norm (bool): Insert extra norm on 'main' branch if True
+        sequential_attn (bool): If true sequential self-attention is performed
+        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        feat_size: Tuple[int, int],
+        window_size: Tuple[int, int],
+        shift_size: Tuple[int, int] = (0, 0),
+        mlp_ratio: float = 4.0,
+        init_values: Optional[float] = 0,
+        drop: float = 0.0,
+        drop_attn: float = 0.0,
+        drop_path: float = 0.0,
+        extra_norm: bool = False,
+        sequential_attn: bool = False,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+    ) -> None:
+        super(SwinTransformerBlock, self).__init__()
+        self.dim: int = dim
+        self.feat_size: Tuple[int, int] = feat_size
+        self.target_shift_size: Tuple[int, int] = to_2tuple(shift_size)
+        self.window_size, self.shift_size = self._calc_window_shift(to_2tuple(window_size))
+        self.window_area = self.window_size[0] * self.window_size[1]
+        self.init_values: Optional[float] = init_values
+
+        # attn branch
+        self.attn = WindowMultiHeadAttention(
+            dim=dim,
+            num_heads=num_heads,
+            window_size=self.window_size,
+            drop_attn=drop_attn,
+            drop_proj=drop,
+            sequential_attn=sequential_attn,
+        )
+        self.norm1 = norm_layer(dim)
+        self.drop_path1 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity()
+
+        # mlp branch
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            drop=drop,
+            out_features=dim,
+        )
+        self.norm2 = norm_layer(dim)
+        self.drop_path2 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity()
+
+        # Extra main branch norm layer mentioned for Huge/Giant models in V2 paper.
+        # Also being used as final network norm and optional stage ending norm while still in a C-last format.
+        self.norm3 = norm_layer(dim) if extra_norm else nn.Identity()
+
+        self._make_attention_mask()
+        self.init_weights()
+
+    def _calc_window_shift(self, target_window_size):
+        window_size = [f if f <= w else w for f, w in zip(self.feat_size, target_window_size)]
+        shift_size = [0 if f <= w else s for f, w, s in zip(self.feat_size, window_size, self.target_shift_size)]
+        return tuple(window_size), tuple(shift_size)
+
+    def _make_attention_mask(self) -> None:
+        """Method generates the attention mask used in shift case."""
+        if any(self.shift_size):
+            # calculate attention mask for SW-MSA on the module's current device (matters when re-run after .to())
+            H, W = self.feat_size
+            img_mask = torch.zeros((1, H, W, 1), device=self.attn.logit_scale.device)  # 1 H W 1
+            cnt = 0
+            for h in (
+                    slice(0, -self.window_size[0]),
+                    slice(-self.window_size[0], -self.shift_size[0]),
+                    slice(-self.shift_size[0], None)):
+                for w in (
+                        slice(0, -self.window_size[1]),
+                        slice(-self.window_size[1], -self.shift_size[1]),
+                        slice(-self.shift_size[1], None)):
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+            mask_windows = window_partition(img_mask, self.window_size)  # num_windows, window_size, window_size, 1
+            mask_windows = mask_windows.view(-1, self.window_area)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        else:
+            attn_mask = None
+        self.register_buffer("attn_mask", attn_mask, persistent=False)
+
+    def init_weights(self):
+        # extra, module specific weight init
+        if self.init_values is not None:
+            nn.init.constant_(self.norm1.weight, self.init_values)
+            nn.init.constant_(self.norm2.weight, self.init_values)
+
+    def update_input_size(self, new_window_size: Tuple[int, int], new_feat_size: Tuple[int, int]) -> None:
+        """Method updates the image resolution to be processed and window size and so the pair-wise relative positions.
+
+        Args:
+            new_window_size (Tuple[int, int]): New window size
+            new_feat_size (Tuple[int, int]): New input resolution
+        """
+        # Update input resolution
+        self.feat_size: Tuple[int, int] = new_feat_size
+        self.window_size, self.shift_size = self._calc_window_shift(to_2tuple(new_window_size))
+        self.window_area = self.window_size[0] * self.window_size[1]
+        self.attn.update_input_size(new_window_size=self.window_size)
+        self._make_attention_mask()
+
+    def _shifted_window_attn(self, x):
+        H, W = self.feat_size
+        B, L, C = x.shape
+        x = x.view(B, H, W, C)
+
+        # cyclic shift
+        sh, sw = self.shift_size
+        do_shift: bool = any(self.shift_size)
+        if do_shift:
+            # FIXME PyTorch XLA needs cat impl, roll not lowered
+            # x = torch.cat([x[:, sh:], x[:, :sh]], dim=1)
+            # x = torch.cat([x[:, :, sw:], x[:, :, :sw]], dim=2)
+            x = torch.roll(x, shifts=(-sh, -sw), dims=(1, 2))
+
+        # partition windows
+        x_windows = window_partition(x, self.window_size)  # num_windows * B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size[0] * self.window_size[1], C)
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # num_windows * B, window_size * window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C)
+        x = window_reverse(attn_windows, self.window_size, self.feat_size)  # B H' W' C
+
+        # reverse cyclic shift
+        if do_shift:
+            # FIXME PyTorch XLA needs cat impl, roll not lowered
+            # x = torch.cat([x[:, -sh:], x[:, :-sh]], dim=1)
+            # x = torch.cat([x[:, :, -sw:], x[:, :, :-sw]], dim=2)
+            x = torch.roll(x, shifts=(sh, sw), dims=(1, 2))
+
+        x = x.view(B, L, C)
+        return x
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass.
+
+        Args:
+            x (torch.Tensor): Input tensor of the shape [B, L, C], with L = H * W of feat_size
+
+        Returns:
+            output (torch.Tensor): Output tensor of the shape [B, L, C]
+        """
+        # post-norm branches (op -> norm -> drop)
+        x = x + self.drop_path1(self.norm1(self._shifted_window_attn(x)))
+        x = x + self.drop_path2(self.norm2(self.mlp(x)))
+        x = self.norm3(x)  # main-branch norm enabled for some blocks / stages (every 6 for Huge/Giant)
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch merging: 2x2 neighbourhoods are stacked on the channel axis, normalized, then reduced by a linear layer.
+    Args:
+        dim (int): Number of input channels
+        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized.
+    """
+
+    def __init__(self, dim: int, norm_layer: Type[nn.Module] = nn.LayerNorm) -> None:
+        super(PatchMerging, self).__init__()
+        self.norm = norm_layer(4 * dim)
+        self.reduction = nn.Linear(in_features=4 * dim, out_features=2 * dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """ Forward pass.
+        Args:
+            x (torch.Tensor): Input tensor of the shape [B, C, H, W]
+        Returns:
+            output (torch.Tensor): Output tensor of the shape [B, 2 * C, H // 2, W // 2]
+        """
+        B, C, H, W = x.shape
+        # unfold + BCHW -> BHWC together
+        # ordering, 5, 3, 1 instead of 3, 5, 1 maintains compat with original swin v1 merge
+        x = x.reshape(B, C, H // 2, 2, W // 2, 2).permute(0, 2, 4, 5, 3, 1).flatten(3)
+        x = self.norm(x)
+        x = bhwc_to_bchw(self.reduction(x))
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """ 2D Image to Patch Embedding """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])  # patches per axis (floored)
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
+        _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
+        x = self.proj(x)
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)  # norm runs channels-last, result is permuted back
+        return x
+
+
+class SwinTransformerStage(nn.Module):
+    r"""This class implements a stage of the Swin transformer including multiple layers.
+
+    Args:
+        embed_dim (int): Number of input channels
+        depth (int): Depth of the stage (number of layers)
+        downscale (bool): If true input is downsampled (see Fig. 3 or V1 paper)
+        feat_size (Tuple[int, int]): input feature map size (H, W)
+        num_heads (int): Number of attention heads to be utilized
+        window_size (Tuple[int, int]): Window size to be utilized
+        mlp_ratio (float): Ratio of the hidden dimension in the FFN to the input channels
+        init_values (Optional[float]): Passed unchanged to every SwinTransformerBlock
+        drop (float): Dropout in input mapping
+        drop_attn (float): Dropout rate of attention map
+        drop_path (Union[List[float], float]): Dropout in main path, a list supplies one rate per block
+        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. Default: nn.LayerNorm
+        extra_norm_period (int): Insert extra norm layer on main branch every N (period) blocks
+        extra_norm_stage (bool): End each stage with an extra norm layer in main branch
+        sequential_attn (bool): If true sequential self-attention is performed
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        depth: int,
+        downscale: bool,
+        num_heads: int,
+        feat_size: Tuple[int, int],
+        window_size: Tuple[int, int],
+        mlp_ratio: float = 4.0,
+        init_values: Optional[float] = 0.0,
+        drop: float = 0.0,
+        drop_attn: float = 0.0,
+        drop_path: Union[List[float], float] = 0.0,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        extra_norm_period: int = 0,
+        extra_norm_stage: bool = False,
+        sequential_attn: bool = False,
+    ) -> None:
+        super(SwinTransformerStage, self).__init__()
+        self.downscale: bool = downscale
+        self.grad_checkpointing: bool = False
+        self.feat_size: Tuple[int, int] = (feat_size[0] // 2, feat_size[1] // 2) if downscale else feat_size
+
+        self.downsample = PatchMerging(embed_dim, norm_layer=norm_layer) if downscale else nn.Identity()
+
+        def _extra_norm(index):
+            i = index + 1  # 1-based position of the block inside this stage
+            if extra_norm_period and i % extra_norm_period == 0:
+                return True
+            return i == depth if extra_norm_stage else False  # otherwise only the last block, and only on request
+
+        embed_dim = embed_dim * 2 if downscale else embed_dim  # the blocks are twice as wide when downscaling
+        self.blocks = nn.Sequential(*[
+            SwinTransformerBlock(  # even-indexed blocks get shift 0, odd-indexed blocks half the window size
+                dim=embed_dim,
+                num_heads=num_heads,
+                feat_size=self.feat_size,
+                window_size=window_size,
+                shift_size=tuple([0 if ((index % 2) == 0) else w // 2 for w in window_size]),
+                mlp_ratio=mlp_ratio,
+                init_values=init_values,
+                drop=drop,
+                drop_attn=drop_attn,
+                drop_path=drop_path[index] if isinstance(drop_path, list) else drop_path,
+                extra_norm=_extra_norm(index),
+                sequential_attn=sequential_attn,
+                norm_layer=norm_layer,
+            )
+            for index in range(depth)]
+        )
+
+    def update_input_size(self, new_window_size: int, new_feat_size: Tuple[int, int]) -> None:
+        """Method updates the resolution to utilize and the window size and so the pair-wise relative positions.
+
+        Args:
+            new_window_size (int): New window size, handed to every block as is
+            new_feat_size (Tuple[int, int]): New input resolution, halved first when this stage downscales
+        """
+        self.feat_size: Tuple[int, int] = (
+            (new_feat_size[0] // 2, new_feat_size[1] // 2) if self.downscale else new_feat_size
+        )
+        for block in self.blocks:
+            block.update_input_size(new_window_size=new_window_size, new_feat_size=self.feat_size)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass.
+        Args:
+            x (torch.Tensor): Input tensor, the result of self.downsample(x) must be 4-D [B, C, H, W]
+        Returns:
+            output (torch.Tensor): [B, C', H, W] with H, W taken after self.downsample and C' set by the blocks
+        """
+        x = self.downsample(x)
+        B, C, H, W = x.shape
+        L = H * W
+        x = bchw_to_bhwc(x).reshape(B, L, C)
+        for block in self.blocks:
+            # Perform checkpointing if utilized
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint.checkpoint(block, x)
+            else:
+                x = block(x)
+        x = bhwc_to_bchw(x.reshape(B, H, W, -1))
+        return x
+
+
+class SwinTransformerV2Cr(nn.Module):
+    r""" Swin Transformer V2
+        A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`  -
+          https://arxiv.org/pdf/2111.09883
+
+    Args:
+        img_size (Tuple[int, int]): Input resolution.
+        window_size (Optional[int]): Window size. If None, img_size // img_window_ratio. Default: None
+        img_window_ratio (int): Window size to image size ratio. Default: 32
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input channels.
+        depths (Tuple[int, ...]): Number of blocks in each stage.
+        num_heads (Tuple[int, ...]): Number of attention heads in each stage.
+        embed_dim (int): Patch embedding dimension. Default: 96
+        num_classes (int): Number of output classes. Default: 1000
+        mlp_ratio (float):  Ratio of the hidden dimension in the FFN to the input channels. Default: 4
+        init_values (Optional[float]): Passed unchanged to every stage. Default: 0.
+        drop_rate (float): Dropout rate. Default: 0.0
+        attn_drop_rate (float): Dropout rate of attention map. Default: 0.0
+        drop_path_rate (float): Final stochastic depth rate, ramped linearly from 0 over all blocks. Default: 0.0
+        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. Default: nn.LayerNorm
+        extra_norm_period (int): Insert extra norm layer on main branch every N (period) blocks in stage
+        extra_norm_stage (bool): End each stage with an extra norm layer in main branch
+        sequential_attn (bool): If true sequential self-attention is performed. Default: False
+        global_pool (str): 'avg' mean-pools the feature map before the head, other values skip pooling. Default: 'avg'
+        weight_init (str): Any value other than 'skip' applies init_weights to all modules. Default: 'skip'
+    """
+
+    def __init__(
+        self,
+        img_size: Tuple[int, int] = (224, 224),
+        patch_size: int = 4,
+        window_size: Optional[int] = None,
+        img_window_ratio: int = 32,
+        in_chans: int = 3,
+        num_classes: int = 1000,
+        embed_dim: int = 96,
+        depths: Tuple[int, ...] = (2, 2, 6, 2),
+        num_heads: Tuple[int, ...] = (3, 6, 12, 24),
+        mlp_ratio: float = 4.0,
+        init_values: Optional[float] = 0.,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        extra_norm_period: int = 0,
+        extra_norm_stage: bool = False,
+        sequential_attn: bool = False,
+        global_pool: str = 'avg',
+        weight_init='skip',
+        **kwargs: Any
+    ) -> None:
+        super(SwinTransformerV2Cr, self).__init__()
+        img_size = to_2tuple(img_size)
+        window_size = tuple([
+            s // img_window_ratio for s in img_size]) if window_size is None else to_2tuple(window_size)
+
+        self.num_classes: int = num_classes
+        self.patch_size: int = patch_size
+        self.img_size: Tuple[int, int] = img_size
+        self.window_size: Tuple[int, int] = window_size
+        self.num_features: int = int(embed_dim * 2 ** (len(depths) - 1))  # width doubles at every stage after the first
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+            embed_dim=embed_dim, norm_layer=norm_layer)
+        patch_grid_size: Tuple[int, int] = self.patch_embed.grid_size
+
+        drop_path_rate = torch.linspace(0.0, drop_path_rate, sum(depths)).tolist()  # one rate per block
+        stages = []
+        for index, (depth, stage_heads) in enumerate(zip(depths, num_heads)):
+            stage_scale = 2 ** max(index - 1, 0)  # scale of the stage input, a downscaling stage halves once more itself
+            stages.append(
+                SwinTransformerStage(
+                    embed_dim=embed_dim * stage_scale,
+                    depth=depth,
+                    downscale=index != 0,
+                    feat_size=(patch_grid_size[0] // stage_scale, patch_grid_size[1] // stage_scale),
+                    num_heads=stage_heads,
+                    window_size=window_size,
+                    mlp_ratio=mlp_ratio,
+                    init_values=init_values,
+                    drop=drop_rate,
+                    drop_attn=attn_drop_rate,
+                    drop_path=drop_path_rate[sum(depths[:index]):sum(depths[:index + 1])],
+                    extra_norm_period=extra_norm_period,
+                    extra_norm_stage=extra_norm_stage or (index + 1) == len(depths),  # last stage ends w/ norm
+                    sequential_attn=sequential_attn,
+                    norm_layer=norm_layer,
+                )
+            )
+        self.stages = nn.Sequential(*stages)
+
+        self.global_pool: str = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes else nn.Identity()
+
+        # current weight init skips custom init and uses pytorch layer defaults, seems to work well
+        # FIXME more experiments needed
+        if weight_init != 'skip':
+            named_apply(init_weights, self)
+
+    def update_input_size(
+            self,
+            new_img_size: Optional[Tuple[int, int]] = None,
+            new_window_size: Optional[int] = None,
+            img_window_ratio: int = 32,
+    ) -> None:
+        """Method updates the image resolution to be processed and window size and so the pair-wise relative positions.
+
+        Args:
+            new_window_size (Optional[int]): New window size, if None based on new_img_size // img_window_ratio
+            new_img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used
+            img_window_ratio (int): divisor for calculating window size from image size
+        """
+        # Fall back to the stored image size / a ratio-derived window, always as 2-tuples like the constructor
+        new_img_size = self.img_size if new_img_size is None else to_2tuple(new_img_size)
+        new_window_size = tuple([
+            s // img_window_ratio for s in new_img_size]) if new_window_size is None else to_2tuple(new_window_size)
+        # Compute new patch grid & update each stage (the stage method's keyword is new_feat_size)
+        new_patch_grid_size = (new_img_size[0] // self.patch_size, new_img_size[1] // self.patch_size)
+        for index, stage in enumerate(self.stages):
+            stage_scale = 2 ** max(index - 1, 0)
+            stage.update_input_size(
+                new_window_size=new_window_size,
+                new_feat_size=(new_patch_grid_size[0] // stage_scale, new_patch_grid_size[1] // stage_scale),
+            )
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^patch_embed',  # stem and embed
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+).downsample', (0,)),
+                (r'^stages\.(\d+)\.\w+\.(\d+)', None),
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable
+
+    @torch.jit.ignore()
+    def get_classifier(self) -> nn.Module:
+        """Method returns the classification head of the model.
+        Returns:
+            head (nn.Module): Current classification head
+        """
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None) -> None:
+        """Method resets the classification head
+
+        Args:
+            num_classes (int): Number of classes to be predicted, values <= 0 install nn.Identity
+            global_pool (Optional[str]): New pooling mode, if None the current mode is kept
+        """
+        self.num_classes: int = num_classes
+        if global_pool is not None:
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        x = self.stages(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool == 'avg':
+            x = x.mean(dim=(2, 3))  # average over the two trailing (spatial) dims of the stage output
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def init_weights(module: nn.Module, name: str = ''):
+    """Initialise one module: nn.Linear by name ('qkv' / 'head' / other), other modules via their own init_weights()."""
+    # FIXME WIP determining if there's a better weight init
+    linear = isinstance(module, nn.Linear)
+    if linear and 'qkv' in name:
+        # treat the weights of Q, K, V separately: only a third of the fused output rows counts as fan-out
+        fan_out, fan_in = module.weight.shape[0] // 3, module.weight.shape[1]
+        bound = math.sqrt(6. / float(fan_out + fan_in))
+        nn.init.uniform_(module.weight, -bound, bound)
+    elif linear:
+        (nn.init.zeros_ if 'head' in name else nn.init.xavier_uniform_)(module.weight)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+    if linear and module.bias is not None:
+        nn.init.zeros_(module.bias)
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ unwrap a nested 'model' entry and store legacy 'tau' entries as 'logit_scale' = log(1 / tau)"""
+    # For deit models the weights sit under a 'model' key
+    weights = state_dict['model'] if 'model' in state_dict else state_dict
+    out_dict = {}
+    for name, value in weights.items():
+        if 'tau' not in name:
+            out_dict[name] = value
+            continue
+        # convert old tau based checkpoints -> logit_scale (inverse)
+        converted = torch.log(1 / value)
+        out_dict[name.replace('tau', 'logit_scale')] = converted
+    return out_dict
+
+
+def _create_swin_transformer_v2_cr(variant, pretrained=False, **kwargs):
+    """Build SwinTransformerV2Cr for `variant` through build_model_with_cfg; a truthy `features_only` is rejected."""
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(
+        SwinTransformerV2Cr, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        **kwargs
+    )
+
+
+@register_model
+def swinv2_cr_tiny_384(pretrained=False, **kwargs):
+    """Swin-T V2 CR, 384x384 / ImageNet-1k variant ('swinv2_cr_tiny_384')."""
+    variant = 'swinv2_cr_tiny_384'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 6, 2),
+        num_heads=(3, 6, 12, 24),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_tiny_224(pretrained=False, **kwargs):
+    """Swin-T V2 CR, 224x224 / ImageNet-1k variant ('swinv2_cr_tiny_224')."""
+    variant = 'swinv2_cr_tiny_224'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 6, 2),
+        num_heads=(3, 6, 12, 24),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_tiny_ns_224(pretrained=False, **kwargs):
+    """Swin-T V2 CR, 224x224 / ImageNet-1k variant built with extra_norm_stage=True (extra stage norms).
+    ** Experimental, may make default if results are improved. **
+    """
+    variant = 'swinv2_cr_tiny_ns_224'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 6, 2),
+        num_heads=(3, 6, 12, 24),
+        extra_norm_stage=True,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_small_384(pretrained=False, **kwargs):
+    """Swin-S V2 CR, 384x384 / ImageNet-1k variant ('swinv2_cr_small_384')."""
+    variant = 'swinv2_cr_small_384'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 18, 2),
+        num_heads=(3, 6, 12, 24),
+        **kwargs
+    )
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_small_224(pretrained=False, **kwargs):
+    """Swin-S V2 CR, 224x224 / ImageNet-1k variant ('swinv2_cr_small_224')."""
+    variant = 'swinv2_cr_small_224'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 18, 2),
+        num_heads=(3, 6, 12, 24),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_small_ns_224(pretrained=False, **kwargs):
+    """Swin-S V2 CR, 224x224 / ImageNet-1k variant built with extra_norm_stage=True (extra stage norms)."""
+    variant = 'swinv2_cr_small_ns_224'
+    arch = dict(
+        embed_dim=96,
+        depths=(2, 2, 18, 2),
+        num_heads=(3, 6, 12, 24),
+        extra_norm_stage=True,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_base_384(pretrained=False, **kwargs):
+    """Swin-B V2 CR, 384x384 / ImageNet-1k variant ('swinv2_cr_base_384')."""
+    variant = 'swinv2_cr_base_384'
+    arch = dict(
+        embed_dim=128,
+        depths=(2, 2, 18, 2),
+        num_heads=(4, 8, 16, 32),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_base_224(pretrained=False, **kwargs):
+    """Swin-B V2 CR, 224x224 / ImageNet-1k variant ('swinv2_cr_base_224')."""
+    variant = 'swinv2_cr_base_224'
+    arch = dict(
+        embed_dim=128,
+        depths=(2, 2, 18, 2),
+        num_heads=(4, 8, 16, 32),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_base_ns_224(pretrained=False, **kwargs):
+    """Swin-B V2 CR, 224x224 / ImageNet-1k variant built with extra_norm_stage=True (extra stage norms)."""
+    variant = 'swinv2_cr_base_ns_224'
+    arch = dict(
+        embed_dim=128,
+        depths=(2, 2, 18, 2),
+        num_heads=(4, 8, 16, 32),
+        extra_norm_stage=True,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_large_384(pretrained=False, **kwargs):
+    """Swin-L V2 CR, 384x384 / ImageNet-1k variant ('swinv2_cr_large_384')."""
+    variant = 'swinv2_cr_large_384'
+    arch = dict(
+        embed_dim=192,
+        depths=(2, 2, 18, 2),
+        num_heads=(6, 12, 24, 48),
+        **kwargs
+    )
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_large_224(pretrained=False, **kwargs):
+    """Swin-L V2 CR, 224x224 / ImageNet-1k variant ('swinv2_cr_large_224')."""
+    variant = 'swinv2_cr_large_224'
+    arch = dict(
+        embed_dim=192,
+        depths=(2, 2, 18, 2),
+        num_heads=(6, 12, 24, 48),
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_huge_384(pretrained=False, **kwargs):
+    """Swin-H V2 CR, 384x384 / ImageNet-1k variant with an extra norm every 6 blocks ('swinv2_cr_huge_384')."""
+    variant = 'swinv2_cr_huge_384'
+    arch = dict(
+        embed_dim=352,
+        depths=(2, 2, 18, 2),
+        num_heads=(11, 22, 44, 88),  # head count not certain for Huge, 384 & 224 trying diff values
+        extra_norm_period=6,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_huge_224(pretrained=False, **kwargs):
+    """Swin-H V2 CR, 224x224 / ImageNet-1k variant with an extra norm every 6 blocks ('swinv2_cr_huge_224')."""
+    variant = 'swinv2_cr_huge_224'
+    arch = dict(
+        embed_dim=352,
+        depths=(2, 2, 18, 2),
+        num_heads=(8, 16, 32, 64),  # head count not certain for Huge, 384 & 224 trying diff values
+        extra_norm_period=6,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_giant_384(pretrained=False, **kwargs):
+    """Swin-G V2 CR, 384x384 / ImageNet-1k variant with an extra norm every 6 blocks ('swinv2_cr_giant_384')."""
+    variant = 'swinv2_cr_giant_384'
+    arch = dict(
+        embed_dim=512,
+        depths=(2, 2, 42, 2),
+        num_heads=(16, 32, 64, 128),
+        extra_norm_period=6,
+        **kwargs
+    )
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
+
+
+@register_model
+def swinv2_cr_giant_224(pretrained=False, **kwargs):
+    """Swin-G V2 CR, 224x224 / ImageNet-1k variant with an extra norm every 6 blocks ('swinv2_cr_giant_224')."""
+    variant = 'swinv2_cr_giant_224'
+    arch = dict(
+        embed_dim=512,
+        depths=(2, 2, 42, 2),
+        num_heads=(16, 32, 64, 128),
+        extra_norm_period=6,
+        **kwargs)
+    return _create_swin_transformer_v2_cr(variant, pretrained=pretrained, **arch)
diff --git a/src/custom_timm/models/tnt.py b/src/custom_timm/models/tnt.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73bb4b252c47158177d0fb8345fa38c1104542a
--- /dev/null
+++ b/src/custom_timm/models/tnt.py
@@ -0,0 +1,304 @@
+""" Transformer in Transformer (TNT) in PyTorch
+
+A PyTorch implementation of TNT as described in
+'Transformer in Transformer' - https://arxiv.org/abs/2103.00112
+
+The official mindspore code is released and available at
+https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/TNT
+"""
+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from custom_timm.models.helpers import build_model_with_cfg
+from custom_timm.models.layers import Mlp, DropPath, trunc_normal_
+from custom_timm.models.layers.helpers import to_2tuple
+from custom_timm.models.layers import _assert
+from custom_timm.models.registry import register_model
+from custom_timm.models.vision_transformer import resize_pos_embed
+
+
+def _cfg(url='', **kwargs):
+    """Default TNT pretrained-config dict for `url`; entries in kwargs override or extend the defaults."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='pixel_embed.proj', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # pretrained configs, keyed by the variant names the tnt_* entry points pass to _create_tnt
+    'tnt_s_patch16_224': _cfg(
+        url='https://github.com/contrastive/pytorch-image-models/releases/download/TNT/tnt_s_patch16_224.pth.tar',
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),  # replaces the IMAGENET_DEFAULT_* statistics set by _cfg
+    ),
+    'tnt_b_patch16_224': _cfg(  # no url is passed, so _cfg leaves 'url' at its '' default
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+    ),
+}
+
+
+class Attention(nn.Module):
+    """ Multi-Head Attention with q / k projected to hidden_dim while v and the output projection stay at dim
+    """
+    def __init__(self, dim, hidden_dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        head_dim = hidden_dim // num_heads
+        self.head_dim = head_dim
+        self.scale = head_dim ** -0.5
+
+        self.qk = nn.Linear(dim, hidden_dim * 2, bias=qkv_bias)
+        self.v = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)  # not in-place: softmax's backward needs its unmodified output
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop, inplace=True)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qk = self.qk(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k = qk.unbind(0)   # make torchscript happy (cannot use tensor as tuple)
+        v = self.v(x).reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)  # (B, heads, N, dim // heads)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, heads, N, N)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)  # heads merged back into the last dim
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    """ TNT Block: an inner transformer over pixel embeddings followed by an outer one over patch embeddings
+    """
+    def __init__(
+            self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4.,
+            qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        # Inner transformer
+        self.norm_in = norm_layer(in_dim)
+        self.attn_in = Attention(
+            in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias,
+            attn_drop=attn_drop, proj_drop=drop)
+        # inner MLP: hidden width is fixed at 4 * in_dim (mlp_ratio is not applied here)
+        self.norm_mlp_in = norm_layer(in_dim)
+        self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4),
+            out_features=in_dim, act_layer=act_layer, drop=drop)
+        # projection from a patch's flattened pixel embeddings (in_dim * num_pixel) to the outer width dim
+        self.norm1_proj = norm_layer(in_dim)
+        self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True)
+        # Outer transformer
+        self.norm_out = norm_layer(dim)
+        self.attn_out = Attention(
+            dim, dim, num_heads=num_heads, qkv_bias=qkv_bias,
+            attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()  # one module, used on all four branches
+        # outer MLP: hidden width is dim * mlp_ratio
+        self.norm_mlp = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
+            out_features=dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, pixel_embed, patch_embed):
+        # inner: pre-norm attention and MLP residual branches on the pixel embeddings
+        pixel_embed = pixel_embed + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))
+        pixel_embed = pixel_embed + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))
+        # outer: token 0 is passed through as is, every other patch token receives its projected pixel embeddings
+        B, N, C = patch_embed.size()
+        patch_embed = torch.cat(
+            [patch_embed[:, 0:1], patch_embed[:, 1:] + self.proj(self.norm1_proj(pixel_embed).reshape(B, N - 1, -1))],
+            dim=1)
+        patch_embed = patch_embed + self.drop_path(self.attn_out(self.norm_out(patch_embed)))
+        patch_embed = patch_embed + self.drop_path(self.mlp(self.norm_mlp(patch_embed)))
+        return pixel_embed, patch_embed
+
+
+class PixelEmbed(nn.Module):
+    """ Image to Pixel Embedding: conv stem + unfold, output is (B * num_patches, pixels per patch, in_dim)
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, in_dim=48, stride=4):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        # grid_size property necessary for resizing positional embedding
+        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+        num_patches = (self.grid_size[0]) * (self.grid_size[1])
+        self.img_size = img_size
+        self.num_patches = num_patches
+        self.in_dim = in_dim
+        new_patch_size = [math.ceil(ps / stride) for ps in patch_size]  # patch side lengths divided by the conv stride
+        self.new_patch_size = new_patch_size
+
+        self.proj = nn.Conv2d(in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride)
+        self.unfold = nn.Unfold(kernel_size=new_patch_size, stride=new_patch_size)  # non-overlapping windows
+
+    def forward(self, x, pixel_pos):
+        B, C, H, W = x.shape
+        _assert(H == self.img_size[0],
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).")
+        _assert(W == self.img_size[1],
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).")
+        x = self.proj(x)
+        x = self.unfold(x)
+        x = x.transpose(1, 2).reshape(B * self.num_patches, self.in_dim, self.new_patch_size[0], self.new_patch_size[1])
+        x = x + pixel_pos  # added while x is still laid out as (B * num_patches, in_dim, ph, pw)
+        x = x.reshape(B * self.num_patches, self.in_dim, -1).transpose(1, 2)
+        return x
+
+
+class TNT(nn.Module):
+    """ Transformer in Transformer - https://arxiv.org/abs/2103.00112
+    """
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token',
+            embed_dim=768, in_dim=48, depth=12, num_heads=12, in_num_head=4, mlp_ratio=4., qkv_bias=False,
+            drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, first_stride=4):
+        super().__init__()
+        assert global_pool in ('', 'token', 'avg')
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.grad_checkpointing = False
+
+        self.pixel_embed = PixelEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, in_dim=in_dim, stride=first_stride)
+        num_patches = self.pixel_embed.num_patches
+        self.num_patches = num_patches
+        new_patch_size = self.pixel_embed.new_patch_size
+        num_pixel = new_patch_size[0] * new_patch_size[1]
+        # maps the flattened pixel embeddings of one patch (num_pixel * in_dim) to a single embed_dim patch token
+        self.norm1_proj = norm_layer(num_pixel * in_dim)
+        self.proj = nn.Linear(num_pixel * in_dim, embed_dim)
+        self.norm2_proj = norm_layer(embed_dim)
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.patch_pos = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))  # +1 for the class token
+        self.pixel_pos = nn.Parameter(torch.zeros(1, in_dim, new_patch_size[0], new_patch_size[1]))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        blocks = []
+        for i in range(depth):
+            blocks.append(Block(
+                dim=embed_dim, in_dim=in_dim, num_pixel=num_pixel, num_heads=num_heads, in_num_head=in_num_head,
+                mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate,
+                drop_path=dpr[i], norm_layer=norm_layer))
+        self.blocks = nn.ModuleList(blocks)
+        self.norm = norm_layer(embed_dim)
+
+        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        trunc_normal_(self.cls_token, std=.02)
+        trunc_normal_(self.patch_pos, std=.02)
+        trunc_normal_(self.pixel_pos, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'patch_pos', 'pixel_pos', 'cls_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^cls_token|patch_pos|pixel_pos|pixel_embed|norm[12]_proj|proj',  # stem and embed / pos
+            blocks=[
+                (r'^blocks\.(\d+)', None),
+                (r'^norm', (99999,)),
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'token', 'avg')
+            self.global_pool = global_pool  # stored so that forward_head pools the way the caller asked for
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        B = x.shape[0]
+        pixel_embed = self.pixel_embed(x, self.pixel_pos)
+        # initial patch tokens: flatten each patch's pixel embeddings and project, then prepend cls and add positions
+        patch_embed = self.norm2_proj(self.proj(self.norm1_proj(pixel_embed.reshape(B, self.num_patches, -1))))
+        patch_embed = torch.cat((self.cls_token.expand(B, -1, -1), patch_embed), dim=1)
+        patch_embed = patch_embed + self.patch_pos
+        patch_embed = self.pos_drop(patch_embed)
+
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            for blk in self.blocks:
+                pixel_embed, patch_embed = checkpoint(blk, pixel_embed, patch_embed)
+        else:
+            for blk in self.blocks:
+                pixel_embed, patch_embed = blk(pixel_embed, patch_embed)
+
+        patch_embed = self.norm(patch_embed)
+        return patch_embed
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:
+            x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]  # 'avg' skips the class token
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """ pass the checkpoint's 'patch_pos' through resize_pos_embed when its shape differs from model.patch_pos"""
+    ckpt_pos, model_pos = state_dict['patch_pos'], model.patch_pos
+    if ckpt_pos.shape != model_pos.shape:  # state_dict is updated in place and returned
+        state_dict['patch_pos'] = resize_pos_embed(
+            ckpt_pos, model_pos, getattr(model, 'num_tokens', 1), model.pixel_embed.grid_size)
+    return state_dict
+
+
+def _create_tnt(variant, pretrained=False, **kwargs):
+    """Build a TNT model for `variant` through build_model_with_cfg; a truthy `features_only` is rejected."""
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(
+        TNT, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        **kwargs
+    )
+
+
+@register_model
+def tnt_s_patch16_224(pretrained=False, **kwargs):
+    """TNT variant 'tnt_s_patch16_224': patch 16, embed_dim 384, in_dim 24, depth 12, 6 outer / 4 inner heads."""
+    arch = dict(
+        patch_size=16, embed_dim=384, in_dim=24, depth=12, num_heads=6, in_num_head=4,
+        qkv_bias=False, **kwargs)
+    return _create_tnt('tnt_s_patch16_224', pretrained=pretrained, **arch)
+
+
+@register_model
+def tnt_b_patch16_224(pretrained=False, **kwargs):
+    """TNT variant 'tnt_b_patch16_224': patch 16, embed_dim 640, in_dim 40, depth 12, 10 outer / 4 inner heads."""
+    arch = dict(
+        patch_size=16, embed_dim=640, in_dim=40, depth=12, num_heads=10, in_num_head=4,
+        qkv_bias=False, **kwargs)
+    return _create_tnt('tnt_b_patch16_224', pretrained=pretrained, **arch)
diff --git a/src/custom_timm/models/tresnet.py b/src/custom_timm/models/tresnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2469acd265aaff10c9d3b04a5b9db090f3939a7b
--- /dev/null
+++ b/src/custom_timm/models/tresnet.py
@@ -0,0 +1,331 @@
+"""
+TResNet: High Performance GPU-Dedicated Architecture
+https://arxiv.org/pdf/2003.13630.pdf
+
+Original model: https://github.com/mrT23/TResNet
+
+"""
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+from .helpers import build_model_with_cfg
+from .layers import SpaceToDepthModule, BlurPool2d, InplaceAbn, ClassifierHead, SEModule
+from .registry import register_model
+
+__all__ = ['tresnet_m', 'tresnet_l', 'tresnet_xl']
+
+
+def _cfg(url='', **kwargs):
+    """Default pretrained-config dict for TResNet weights; entries in `kwargs` override or extend it."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.),
+        first_conv='body.conv1.0', classifier='head.fc')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # pretrained-weight configs, keyed by the variant name passed to _create_tresnet
+    'tresnet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_1k_miil_83_1-d236afcb.pth'),
+    'tresnet_m_miil_in21k': _cfg(  # the only entry overriding num_classes
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_miil_in21k-901b6ed4.pth', num_classes=11221),
+    'tresnet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_81_5-235b486c.pth'),
+    'tresnet_xl': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_82_0-a2d51b00.pth'),
+    'tresnet_m_448': _cfg(  # the *_448 entries override input_size and pool_size
+        input_size=(3, 448, 448), pool_size=(14, 14),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_448-bc359d10.pth'),
+    'tresnet_l_448': _cfg(
+        input_size=(3, 448, 448), pool_size=(14, 14),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_448-940d0cd1.pth'),
+    'tresnet_xl_448': _cfg(
+        input_size=(3, 448, 448), pool_size=(14, 14),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_448-8c1815de.pth'),
+
+    'tresnet_v2_l': _cfg(  # used by the tresnet_v2_l entrypoint, which builds the model with v2=True
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_v2_83_9-f36e4445.pth'),
+}
+
+
+def IABN2Float(module: nn.Module) -> nn.Module:
+    """Cast every InplaceAbn layer under `module` (itself included) with `.float()`; returns `module`."""
+    # modules() walks `module` and all of its descendants, so no explicit recursion is needed
+    for sub in module.modules():
+        if isinstance(sub, InplaceAbn):
+            sub.float()
+    return module
+
+
+def conv2d_iabn(ni, nf, stride, kernel_size=3, groups=1, act_layer="leaky_relu", act_param=1e-2):
+    """Bias-free Conv2d with padding `kernel_size // 2`, followed by an InplaceAbn layer, as an nn.Sequential."""
+    conv = nn.Conv2d(
+        ni, nf, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=groups, bias=False)
+    norm_act = InplaceAbn(nf, act_layer=act_layer, act_param=act_param)
+    return nn.Sequential(conv, norm_act)
+
+
+class BasicBlock(nn.Module):
+    """Residual block of two 3x3 conv+InplaceAbn layers with an optional SEModule on the residual branch."""
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, aa_layer=None):
+        super().__init__()
+        # Any stride other than 1 takes a strided path below, and both strided paths reduce by 2.
+        if stride == 1:
+            self.conv1 = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3)
+        elif aa_layer is None:
+            self.conv1 = conv2d_iabn(inplanes, planes, stride=2, act_param=1e-3)
+        else:
+            # with an aa_layer: stride-1 conv first, then the aa_layer performs the stride-2 reduction
+            first = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3)
+            self.conv1 = nn.Sequential(first, aa_layer(channels=planes, filt_size=3, stride=2))
+
+        self.conv2 = conv2d_iabn(planes, planes, stride=1, act_layer="identity")
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        if use_se:
+            squeeze_chs = max(planes * self.expansion // 4, 64)
+            self.se = SEModule(planes * self.expansion, rd_channels=squeeze_chs)
+        else:
+            self.se = None
+
+    def forward(self, x):
+        # identity shortcut unless a downsample module was supplied
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out = self.conv2(self.conv1(x))
+        if self.se is not None:
+            out = self.se(out)
+
+        # out-of-place residual add, then the in-place ReLU
+        out = self.relu(out + shortcut)
+        return out
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block; the SEModule (when enabled) sits between the 3x3 and the last 1x1."""
+    expansion = 4
+
+    def __init__(
+            self, inplanes, planes, stride=1, downsample=None, use_se=True,
+            act_layer="leaky_relu", aa_layer=None):
+        super().__init__()
+        # activation options shared by the first two conv layers
+        act_kwargs = dict(act_layer=act_layer, act_param=1e-3)
+        self.conv1 = conv2d_iabn(inplanes, planes, kernel_size=1, stride=1, **act_kwargs)
+
+        # Any stride other than 1 takes a strided path below, and both strided paths reduce by 2.
+        if stride == 1:
+            self.conv2 = conv2d_iabn(planes, planes, kernel_size=3, stride=1, **act_kwargs)
+        elif aa_layer is None:
+            self.conv2 = conv2d_iabn(planes, planes, kernel_size=3, stride=2, **act_kwargs)
+        else:
+            # with an aa_layer: stride-1 conv first, then the aa_layer performs the stride-2 reduction
+            self.conv2 = nn.Sequential(
+                conv2d_iabn(planes, planes, kernel_size=3, stride=1, **act_kwargs),
+                aa_layer(channels=planes, filt_size=3, stride=2))
+
+        if use_se:
+            squeeze_chs = max(planes * self.expansion // 8, 64)
+            self.se = SEModule(planes, rd_channels=squeeze_chs)
+        else:
+            self.se = None
+
+        out_chs = planes * self.expansion
+        self.conv3 = conv2d_iabn(planes, out_chs, kernel_size=1, stride=1, act_layer="identity")
+
+        self.act = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        # identity shortcut unless a downsample module was supplied
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out = self.conv2(self.conv1(x))
+        if self.se is not None:
+            out = self.se(out)
+        out = self.conv3(out)
+        # the residual add is deliberately out-of-place; the ReLU that follows runs in place
+        return self.act(out + shortcut)
+
+
+class TResNet(nn.Module):
+    """TResNet classifier: SpaceToDepth stem, four residual stages, then a ClassifierHead.
+
+    `layers` gives the block count of each of the four stages; `width_factor` scales the base width of 64.
+    With `v2=True` the base width is rounded down to a multiple of 8 and stages 1-2 use Bottleneck blocks
+    instead of BasicBlock. `global_pool` and `drop_rate` are passed to the ClassifierHead.
+    """
+    def __init__(
+            self, layers, in_chans=3, num_classes=1000, width_factor=1.0, v2=False,
+            global_pool='fast', drop_rate=0.):
+        super(TResNet, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        aa_layer = BlurPool2d
+
+        # TResnet stages
+        self.inplanes = int(64 * width_factor)
+        self.planes = int(64 * width_factor)
+        if v2:
+            self.inplanes = self.inplanes // 8 * 8
+            self.planes = self.planes // 8 * 8
+
+        # conv1 follows SpaceToDepthModule in the body, hence the in_chans * 16 input channels
+        conv1 = conv2d_iabn(in_chans * 16, self.planes, stride=1, kernel_size=3)
+        layer1 = self._make_layer(
+            Bottleneck if v2 else BasicBlock, self.planes, layers[0], stride=1, use_se=True, aa_layer=aa_layer)
+        layer2 = self._make_layer(
+            Bottleneck if v2 else BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, aa_layer=aa_layer)
+        layer3 = self._make_layer(
+            Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, aa_layer=aa_layer)
+        layer4 = self._make_layer(
+            Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, aa_layer=aa_layer)
+
+        # body
+        self.body = nn.Sequential(OrderedDict([
+            ('SpaceToDepth', SpaceToDepthModule()),
+            ('conv1', conv1),
+            ('layer1', layer1),
+            ('layer2', layer2),
+            ('layer3', layer3),
+            ('layer4', layer4)]))
+
+        self.feature_info = [
+            dict(num_chs=self.planes, reduction=2, module=''),  # Not with S2D?
+            dict(num_chs=self.planes * (Bottleneck.expansion if v2 else 1), reduction=4, module='body.layer1'),
+            dict(num_chs=self.planes * 2 * (Bottleneck.expansion if v2 else 1), reduction=8, module='body.layer2'),
+            dict(num_chs=self.planes * 4 * Bottleneck.expansion, reduction=16, module='body.layer3'),
+            dict(num_chs=self.planes * 8 * Bottleneck.expansion, reduction=32, module='body.layer4'),
+        ]
+
+        # head
+        self.num_features = (self.planes * 8) * Bottleneck.expansion
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        # model initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu')
+            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InplaceAbn):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        # residual connections special initialization: zero the scale of each block's last norm layer
+        for m in self.modules():
+            if isinstance(m, BasicBlock):
+                m.conv2[1].weight = nn.Parameter(torch.zeros_like(m.conv2[1].weight))  # BN to zero
+            if isinstance(m, Bottleneck):
+                m.conv3[1].weight = nn.Parameter(torch.zeros_like(m.conv3[1].weight))  # BN to zero
+            if isinstance(m, nn.Linear):
+                m.weight.data.normal_(0, 0.01)
+
+    def _make_layer(self, block, planes, blocks, stride=1, use_se=True, aa_layer=None):
+        downsample = None  # shortcut projection, needed when the stage changes resolution or width
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            layers = []
+            if stride == 2:
+                # avg pooling before 1x1 conv
+                layers.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True, count_include_pad=False))
+            layers += [conv2d_iabn(
+                self.inplanes, planes * block.expansion, kernel_size=1, stride=1, act_layer="identity")]
+            downsample = nn.Sequential(*layers)
+
+        layers = []
+        layers.append(block(  # only the first block of the stage gets the stride and the downsample
+            self.inplanes, planes, stride, downsample, use_se=use_se, aa_layer=aa_layer))
+        self.inplanes = planes * block.expansion  # later blocks (and the next stage) see the expanded width
+        for _ in range(1, blocks):
+            layers.append(
+                block(self.inplanes, planes, use_se=use_se, aa_layer=aa_layer))
+        return nn.Sequential(*layers)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(stem=r'^body\.conv1', blocks=r'^body\.layer(\d+)' if coarse else r'^body\.layer(\d+)\.(\d+)')
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='fast'):
+        """Replace the head with a new ClassifierHead for `num_classes` and record the new class count."""
+        self.num_classes = num_classes  # keep the attribute in sync with the rebuilt head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        return self.body(x)
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_tresnet(variant, pretrained=False, **kwargs):
+    """Build the TResNet `variant` via build_model_with_cfg with this file's default feature config."""
+    # NOTE(review): out_indices presumably index TResNet.feature_info (entry 0, the stem, is left out) - confirm
+    feature_cfg = {'out_indices': (1, 2, 3, 4), 'flatten_sequential': True}
+    return build_model_with_cfg(TResNet, variant, pretrained, feature_cfg=feature_cfg, **kwargs)
+
+
+@register_model
+def tresnet_m(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_m': stage depths [3, 4, 11, 3]; remaining kwargs are forwarded to the builder."""
+    return _create_tresnet('tresnet_m', pretrained=pretrained, **dict(layers=[3, 4, 11, 3], **kwargs))
+
+
+@register_model
+def tresnet_m_miil_in21k(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_m_miil_in21k': depths [3, 4, 11, 3]; its default cfg declares 11221 classes."""
+    return _create_tresnet('tresnet_m_miil_in21k', pretrained=pretrained, **dict(layers=[3, 4, 11, 3], **kwargs))
+
+
+@register_model
+def tresnet_l(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_l': stage depths [4, 5, 18, 3] at width_factor 1.2; extra kwargs are forwarded."""
+    return _create_tresnet('tresnet_l', pretrained=pretrained, **dict(layers=[4, 5, 18, 3], width_factor=1.2, **kwargs))
+
+
+@register_model
+def tresnet_v2_l(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_v2_l': stage depths [3, 4, 23, 3], width_factor 1.0, built with v2=True."""
+    arch = dict(layers=[3, 4, 23, 3], width_factor=1.0, v2=True, **kwargs); return _create_tresnet('tresnet_v2_l', pretrained=pretrained, **arch)
+
+
+@register_model
+def tresnet_xl(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_xl': stage depths [4, 5, 24, 3] at width_factor 1.3; extra kwargs are forwarded."""
+    return _create_tresnet('tresnet_xl', pretrained=pretrained, **dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs))
+
+
+@register_model
+def tresnet_m_448(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_m_448': same depths as tresnet_m; its default cfg uses a (3, 448, 448) input."""
+    return _create_tresnet('tresnet_m_448', pretrained=pretrained, **dict(layers=[3, 4, 11, 3], **kwargs))
+
+
+@register_model
+def tresnet_l_448(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_l_448': same depths/width as tresnet_l; its default cfg uses a (3, 448, 448) input."""
+    return _create_tresnet('tresnet_l_448', pretrained=pretrained, **dict(layers=[4, 5, 18, 3], width_factor=1.2, **kwargs))
+
+
+@register_model
+def tresnet_xl_448(pretrained=False, **kwargs):
+    """Entrypoint 'tresnet_xl_448': same depths/width as tresnet_xl; its default cfg uses a (3, 448, 448) input."""
+    return _create_tresnet('tresnet_xl_448', pretrained=pretrained, **dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs))
diff --git a/src/custom_timm/models/twins.py b/src/custom_timm/models/twins.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfde68ca6e85558e2b094d138fe7e522395404f8
--- /dev/null
+++ b/src/custom_timm/models/twins.py
@@ -0,0 +1,449 @@
+""" Twins
+A PyTorch impl of : `Twins: Revisiting the Design of Spatial Attention in Vision Transformers`
+    - https://arxiv.org/pdf/2104.13840.pdf
+
+Code/weights from https://github.com/Meituan-AutoML/Twins, original copyright/license info below
+
+"""
+# --------------------------------------------------------
+# Twins
+# Copyright (c) 2021 Meituan
+# Licensed under The Apache 2.0 License [see LICENSE for details]
+# Written by Xinjie Li, Xiangxiang Chu
+# --------------------------------------------------------
+import math
+from copy import deepcopy
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from functools import partial
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .layers import Mlp, DropPath, to_2tuple, trunc_normal_
+from .fx_features import register_notrace_module
+from .registry import register_model
+from .vision_transformer import Attention
+from .helpers import build_model_with_cfg
+
+
+def _cfg(url='', **kwargs):
+    """Default pretrained-config dict for Twins weights; entries in `kwargs` override or extend it."""
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='patch_embeds.0.proj', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # pretrained-weight configs, keyed by the variant name passed to _create_twins
+    'twins_pcpvt_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth',
+        ),
+    'twins_pcpvt_base': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_base-e5ecb09b.pth',
+        ),
+    'twins_pcpvt_large': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_large-d273f802.pth',
+        ),
+    'twins_svt_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_small-42e5f78c.pth',
+        ),
+    'twins_svt_base': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_base-c2265010.pth',
+        ),
+    'twins_svt_large': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_large-90f6aaa9.pth',
+        ),
+}
+
+Size_ = Tuple[int, int]  # (H, W) of the token grid handed to every forward(x, size) in this file
+
+
+@register_notrace_module  # reason: FX can't symbolically trace control flow in forward method
+class LocallyGroupedAttn(nn.Module):
+    """ LSA: self attention within a group, i.e. inside each non-overlapping ws x ws window of the H x W token
+    grid. The grid is zero-padded at the right/bottom to a multiple of ws and cropped back afterwards. """
+    def __init__(self, dim, num_heads=8, attn_drop=0., proj_drop=0., ws=1):
+        assert ws != 1  # ws == 1 is rejected here; Block builds GlobalSubSampleAttn for that case
+        super(LocallyGroupedAttn, self).__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5  # 1 / sqrt(head_dim), applied to q @ k^T
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)  # fused q, k, v projection
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.ws = ws  # window side length
+
+    def forward(self, x, size: Size_):
+        # There are two implementations of this function: zero padding (this one) or masking (kept commented out
+        # below as forward_mask). The original authors observed no obvious difference between them and recommend
+        # the padding version because it is neat, although the masking version is more reasonable and accurate.
+        B, N, C = x.shape
+        H, W = size
+        x = x.view(B, H, W, C)  # tokens -> B, H, W, C grid (requires N == H * W)
+        pad_l = pad_t = 0
+        pad_r = (self.ws - W % self.ws) % self.ws
+        pad_b = (self.ws - H % self.ws) % self.ws
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))  # F.pad pairs start at the last dim: C, then W, then H
+        _, Hp, Wp, _ = x.shape
+        _h, _w = Hp // self.ws, Wp // self.ws  # number of windows along each axis
+        x = x.reshape(B, _h, self.ws, _w, self.ws, C).transpose(2, 3)  # B, _h, _w, ws, ws, C
+        qkv = self.qkv(x).reshape(
+            B, _h * _w, self.ws * self.ws, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # each B, _h*_w, n_head, ws*ws, dim_head
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # B, _h*_w, n_head, ws*ws, ws*ws
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        attn = (attn @ v).transpose(2, 3).reshape(B, _h, _w, self.ws, self.ws, C)  # heads merged back into C
+        x = attn.transpose(2, 3).reshape(B, _h * self.ws, _w * self.ws, C)  # windows -> padded B, Hp, Wp, C grid
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()  # drop the padded rows/columns again
+        x = x.reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    # def forward_mask(self, x, size: Size_):
+    #     B, N, C = x.shape
+    #     H, W = size
+    #     x = x.view(B, H, W, C)
+    #     pad_l = pad_t = 0
+    #     pad_r = (self.ws - W % self.ws) % self.ws
+    #     pad_b = (self.ws - H % self.ws) % self.ws
+    #     x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+    #     _, Hp, Wp, _ = x.shape
+    #     _h, _w = Hp // self.ws, Wp // self.ws
+    #     mask = torch.zeros((1, Hp, Wp), device=x.device)
+    #     mask[:, -pad_b:, :].fill_(1)
+    #     mask[:, :, -pad_r:].fill_(1)
+    #
+    #     x = x.reshape(B, _h, self.ws, _w, self.ws, C).transpose(2, 3)  # B, _h, _w, ws, ws, C
+    #     mask = mask.reshape(1, _h, self.ws, _w, self.ws).transpose(2, 3).reshape(1,  _h * _w, self.ws * self.ws)
+    #     attn_mask = mask.unsqueeze(2) - mask.unsqueeze(3)  # 1, _h*_w, ws*ws, ws*ws
+    #     attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-1000.0)).masked_fill(attn_mask == 0, float(0.0))
+    #     qkv = self.qkv(x).reshape(
+    #         B, _h * _w, self.ws * self.ws, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5)
+    #     # n_h, B, _w*_h, nhead, ws*ws, dim
+    #     q, k, v = qkv[0], qkv[1], qkv[2]  # B, _h*_w, n_head, ws*ws, dim_head
+    #     attn = (q @ k.transpose(-2, -1)) * self.scale  # B, _h*_w, n_head, ws*ws, ws*ws
+    #     attn = attn + attn_mask.unsqueeze(2)
+    #     attn = attn.softmax(dim=-1)
+    #     attn = self.attn_drop(attn)  # attn @v ->  B, _h*_w, n_head, ws*ws, dim_head
+    #     attn = (attn @ v).transpose(2, 3).reshape(B, _h, _w, self.ws, self.ws, C)
+    #     x = attn.transpose(2, 3).reshape(B, _h * self.ws, _w * self.ws, C)
+    #     if pad_r > 0 or pad_b > 0:
+    #         x = x[:, :H, :W, :].contiguous()
+    #     x = x.reshape(B, N, C)
+    #     x = self.proj(x)
+    #     x = self.proj_drop(x)
+    #     return x
+
+
+class GlobalSubSampleAttn(nn.Module):
+    """ GSA: global attention whose keys/values come from a sub-sampled token map, so that one key summarizes
+    a group of tokens. With sr_ratio > 1 the map is reduced by a conv with kernel = stride = sr_ratio. """
+    def __init__(self, dim, num_heads=8, attn_drop=0., proj_drop=0., sr_ratio=1):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5  # 1 / sqrt(head_dim), applied to q @ k^T
+
+        self.q = nn.Linear(dim, dim, bias=True)
+        self.kv = nn.Linear(dim, dim * 2, bias=True)  # fused k, v projection
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.sr_ratio = sr_ratio
+        if sr_ratio > 1:
+            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+            self.norm = nn.LayerNorm(dim)
+        else:
+            self.sr = None  # no sub-sampling: keys/values are computed from every token
+            self.norm = None
+
+    def forward(self, x, size: Size_):
+        B, N, C = x.shape
+        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)  # B, n_head, N, dim_head
+
+        if self.sr is not None:
+            x = x.permute(0, 2, 1).reshape(B, C, *size)  # tokens -> B, C, H, W for the strided conv
+            x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1)  # reduced map -> B, N', C tokens
+            x = self.norm(x)
+        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        k, v = kv[0], kv[1]  # each B, n_head, N', dim_head (N' == N when there is no sub-sampling)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # B, n_head, N, N'
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)  # heads merged back into C
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return x
+
+
+class Block(nn.Module):
+    """Residual attention + MLP block; `ws` picks the attention (None: Attention, 1: GSA, otherwise LSA)."""
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., drop=0., attn_drop=0., drop_path=0.,
+            act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, ws=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        if ws is None:
+            # NOTE(review): assumes Attention takes these six positional args and a forward(x, size) - confirm
+            self.attn = Attention(dim, num_heads, False, None, attn_drop, drop)
+        elif ws == 1:
+            self.attn = GlobalSubSampleAttn(dim, num_heads, attn_drop, drop, sr_ratio)
+        else:
+            self.attn = LocallyGroupedAttn(dim, num_heads, attn_drop, drop, ws)
+        self.drop_path = nn.Identity() if not drop_path > 0. else DropPath(drop_path)
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+    def forward(self, x, size: Size_):
+        attn_out = self.attn(self.norm1(x), size)
+        x = x + self.drop_path(attn_out)
+        return x + self.drop_path(self.mlp(self.norm2(x)))
+
+
+class PosConv(nn.Module):
+    """PEG (https://arxiv.org/abs/2102.10882): 3x3 grouped conv over the token grid, residual when stride is 1."""
+    def __init__(self, in_chans, embed_dim=768, stride=1):
+        super().__init__()
+        conv = nn.Conv2d(in_chans, embed_dim, kernel_size=3, stride=stride, padding=1, bias=True, groups=embed_dim)
+        self.proj = nn.Sequential(conv)
+        self.stride = stride
+
+    def forward(self, x, size: Size_):
+        batch, _, chans = x.shape
+        grid = x.transpose(1, 2).view(batch, chans, *size)  # tokens -> B, C, H, W feature map
+        out = self.proj(grid)
+        if self.stride == 1:
+            out += grid  # in-place residual on the conv output
+        return out.flatten(2).transpose(1, 2)
+
+    def no_weight_decay(self):
+        return ['proj.%d.weight' % idx for idx in range(4)]
+
+
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding: strided conv projection, flattened to (B, N, C) tokens, then LayerNorm.
+    """
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size, patch_size = to_2tuple(img_size), to_2tuple(patch_size)
+        self.img_size, self.patch_size = img_size, patch_size
+
+        # the configured image size must tile exactly into patches along both axes
+        fits = all(img_size[axis] % patch_size[axis] == 0 for axis in (0, 1))
+        assert fits, f"img_size {img_size} should be divided by patch_size {patch_size}."
+        # token grid for the configured image size
+        self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
+        self.num_patches = self.H * self.W
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        self.norm = nn.LayerNorm(embed_dim)
+
+    def forward(self, x) -> Tuple[torch.Tensor, Size_]:
+        _, _, height, width = x.shape
+
+        tokens = self.norm(self.proj(x).flatten(2).transpose(1, 2))
+        # the returned grid size is derived from the actual input, not from the configured img_size
+        grid_size = (height // self.patch_size[0], width // self.patch_size[1])
+
+        return tokens, grid_size
+
+
+class Twins(nn.Module):
+    """ Twins Vision Transformer (Revisiting Spatial Attention)
+
+    Adapted from PVT (PyramidVisionTransformer) class at https://github.com/whai362/PVT.git
+    """
+    def __init__(
+            self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg',
+            embed_dims=(64, 128, 256, 512), num_heads=(1, 2, 4, 8), mlp_ratios=(4, 4, 4, 4), depths=(3, 4, 6, 3),
+            sr_ratios=(8, 4, 2, 1), wss=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6), block_cls=Block):
+        super().__init__()
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.depths = depths
+        self.embed_dims = embed_dims
+        self.num_features = embed_dims[-1]  # width of the last stage; feeds the final norm and the head
+        self.grad_checkpointing = False  # stays False: set_grad_checkpointing() only accepts enable=False
+
+        img_size = to_2tuple(img_size)
+        prev_chs = in_chans
+        self.patch_embeds = nn.ModuleList()
+        self.pos_drops = nn.ModuleList()
+        for i in range(len(depths)):  # one patch embedding + dropout per stage
+            self.patch_embeds.append(PatchEmbed(img_size, patch_size, prev_chs, embed_dims[i]))
+            self.pos_drops.append(nn.Dropout(p=drop_rate))
+            prev_chs = embed_dims[i]
+            img_size = tuple(t // patch_size for t in img_size)  # resolution seen by the next stage
+            patch_size = 2  # only the first stage uses the constructor's patch_size
+
+        self.blocks = nn.ModuleList()
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        cur = 0  # offset of the current stage's first block within dpr
+        for k in range(len(depths)):
+            _block = nn.ModuleList([block_cls(
+                dim=embed_dims[k], num_heads=num_heads[k], mlp_ratio=mlp_ratios[k], drop=drop_rate,
+                attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[k],
+                ws=1 if wss is None or i % 2 == 1 else wss[k]) for i in range(depths[k])])  # odd i (or no wss): ws=1
+            self.blocks.append(_block)
+            cur += depths[k]
+
+        self.pos_block = nn.ModuleList([PosConv(embed_dim, embed_dim) for embed_dim in embed_dims])  # one per stage
+
+        self.norm = norm_layer(self.num_features)
+
+        # classification head
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        # init weights
+        self.apply(self._init_weights)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return set(['pos_block.' + n for n, p in self.pos_block.named_parameters()])  # all PosConv parameter names
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        matcher = dict(
+            stem=r'^patch_embeds.0',  # stem and embed
+            blocks=[
+                (r'^(?:blocks|patch_embeds|pos_block)\.(\d+)', None),
+                ('^norm', (99999,))
+            ] if coarse else [
+                (r'^blocks\.(\d+)\.(\d+)', None),
+                (r'^(?:patch_embeds|pos_block)\.(\d+)', (0,)),
+                (r'^norm', (99999,))
+            ]
+        )
+        return matcher
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:  # None keeps the current pooling mode
+            assert global_pool in ('', 'avg')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))  # normal init with std sqrt(2 / fan_out)
+            if m.bias is not None:
+                m.bias.data.zero_()
+
+    def forward_features(self, x):
+        B = x.shape[0]
+        for i, (embed, drop, blocks, pos_blk) in enumerate(
+                zip(self.patch_embeds, self.pos_drops, self.blocks, self.pos_block)):
+            x, size = embed(x)  # tokens (B, N, C) and their (H, W) grid
+            x = drop(x)
+            for j, blk in enumerate(blocks):
+                x = blk(x, size)
+                if j == 0:
+                    x = pos_blk(x, size)  # PEG here
+            if i < len(self.depths) - 1:  # every stage but the last: back to B, C, H, W for the next embed
+                x = x.reshape(B, *size, -1).permute(0, 3, 1, 2).contiguous()
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool == 'avg':
+            x = x.mean(dim=1)  # average over the token dimension
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_twins(variant, pretrained=False, **kwargs):
+    """Build the Twins `variant` via build_model_with_cfg; raises RuntimeError if `features_only` is truthy."""
+    wants_features = kwargs.get('features_only', None)
+    if wants_features:
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(Twins, variant, pretrained, **kwargs)
+
+
+@register_model
+def twins_pcpvt_small(pretrained=False, **kwargs):
+    """Entrypoint 'twins_pcpvt_small': depths [3, 4, 6, 3], no `wss` by default; extra kwargs are forwarded."""
+    arch = dict(patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
+                depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_pcpvt_small', pretrained=pretrained, **arch)
+
+
+@register_model
+def twins_pcpvt_base(pretrained=False, **kwargs):
+    """Entrypoint 'twins_pcpvt_base': depths [3, 4, 18, 3], no `wss` by default; extra kwargs are forwarded."""
+    arch = dict(patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
+                depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_pcpvt_base', pretrained=pretrained, **arch)
+
+
+@register_model
+def twins_pcpvt_large(pretrained=False, **kwargs):
+    """Entrypoint 'twins_pcpvt_large': depths [3, 8, 27, 3], no `wss` by default; extra kwargs are forwarded."""
+    arch = dict(patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
+                depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_pcpvt_large', pretrained=pretrained, **arch)
+
+
+@register_model
+def twins_svt_small(pretrained=False, **kwargs):
+    """Entrypoint 'twins_svt_small': dims [64..512], depths [2, 2, 10, 4], window size 7 in every stage."""
+    arch = dict(patch_size=4, embed_dims=[64, 128, 256, 512], num_heads=[2, 4, 8, 16], mlp_ratios=[4, 4, 4, 4],
+                depths=[2, 2, 10, 4], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_svt_small', pretrained=pretrained, **arch)
+
+
+@register_model
+def twins_svt_base(pretrained=False, **kwargs):
+    """Entrypoint 'twins_svt_base': dims [96..768], depths [2, 2, 18, 2], window size 7 in every stage."""
+    arch = dict(patch_size=4, embed_dims=[96, 192, 384, 768], num_heads=[3, 6, 12, 24], mlp_ratios=[4, 4, 4, 4],
+                depths=[2, 2, 18, 2], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_svt_base', pretrained=pretrained, **arch)
+
+
+@register_model
+def twins_svt_large(pretrained=False, **kwargs):
+    """Entrypoint 'twins_svt_large': dims [128..1024], depths [2, 2, 18, 2], window size 7 in every stage."""
+    arch = dict(patch_size=4, embed_dims=[128, 256, 512, 1024], num_heads=[4, 8, 16, 32], mlp_ratios=[4, 4, 4, 4],
+                depths=[2, 2, 18, 2], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs)
+    return _create_twins('twins_svt_large', pretrained=pretrained, **arch)
diff --git a/src/custom_timm/models/vgg.py b/src/custom_timm/models/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..11cf08bd5426f58d4a831849b6780d4b05b1b592
--- /dev/null
+++ b/src/custom_timm/models/vgg.py
@@ -0,0 +1,279 @@
+"""VGG
+
+Adapted from https://github.com/pytorch/vision 'vgg.py' (BSD-3-Clause) with a few changes for
+timm functionality.
+
+Copyright 2021 Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Union, List, Dict, Any, cast
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .fx_features import register_notrace_module
+from .layers import ClassifierHead
+from .registry import register_model
+
+__all__ = [
+    'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
+    'vgg19_bn', 'vgg19',
+]
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'features.0', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+default_cfgs = {  # pretrained configs keyed by variant name; each one sets only the checkpoint URL
+    'vgg11': _cfg(url='https://download.pytorch.org/models/vgg11-bbd30ac9.pth'),
+    'vgg13': _cfg(url='https://download.pytorch.org/models/vgg13-c768596a.pth'),
+    'vgg16': _cfg(url='https://download.pytorch.org/models/vgg16-397923af.pth'),
+    'vgg19': _cfg(url='https://download.pytorch.org/models/vgg19-dcbb9e9d.pth'),
+    'vgg11_bn': _cfg(url='https://download.pytorch.org/models/vgg11_bn-6002323d.pth'),
+    'vgg13_bn': _cfg(url='https://download.pytorch.org/models/vgg13_bn-abd245e5.pth'),
+    'vgg16_bn': _cfg(url='https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'),
+    'vgg19_bn': _cfg(url='https://download.pytorch.org/models/vgg19_bn-c79401a0.pth'),
+}
+
+
+cfgs: Dict[str, List[Union[str, int]]] = {  # layer layouts: int = 3x3 conv output channels, 'M' = 2x2 max-pool
+    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+}
+
+
+@register_notrace_module  # reason: FX can't symbolically trace control flow in forward method
+class ConvMlp(nn.Module):
+    # kernel_size conv -> act -> dropout -> 1x1 conv -> act; inputs smaller than the kernel are resized first.
+    def __init__(
+            self, in_features=512, out_features=4096, kernel_size=7, mlp_ratio=1.0,
+            drop_rate: float = 0.2, act_layer: nn.Module = None, conv_layer: nn.Module = None):
+        super(ConvMlp, self).__init__()
+        self.input_kernel_size = kernel_size  # smallest spatial size fc1 can consume (no padding is passed)
+        mid_features = int(out_features * mlp_ratio)  # hidden width, relative to the output width
+        self.fc1 = conv_layer(in_features, mid_features, kernel_size, bias=True)  # NOTE(review): fails if conv_layer is None
+        self.act1 = act_layer(True)  # NOTE(review): fails if act_layer is None; True is presumably `inplace`
+        self.drop = nn.Dropout(drop_rate)
+        self.fc2 = conv_layer(mid_features, out_features, 1, bias=True)
+        self.act2 = act_layer(True)
+
+    def forward(self, x):
+        if x.shape[-2] < self.input_kernel_size or x.shape[-1] < self.input_kernel_size:
+            # resize so that both spatial dims are at least the fc1 kernel size (7 by default)
+            output_size = (max(self.input_kernel_size, x.shape[-2]), max(self.input_kernel_size, x.shape[-1]))
+            x = F.adaptive_avg_pool2d(x, output_size)
+        x = self.fc1(x)
+        x = self.act1(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.act2(x)
+        return x
+
+
+class VGG(nn.Module):
+    # VGG-style network: conv/pool feature stack built from `cfg`, then a ConvMlp (pre_logits) and a ClassifierHead.
+    def __init__(
+            self,
+            cfg: List[Any],
+            num_classes: int = 1000,
+            in_chans: int = 3,
+            output_stride: int = 32,
+            mlp_ratio: float = 1.0,
+            act_layer: nn.Module = nn.ReLU,
+            conv_layer: nn.Module = nn.Conv2d,
+            norm_layer: nn.Module = None,
+            global_pool: str = 'avg',
+            drop_rate: float = 0.,
+    ) -> None:
+        super(VGG, self).__init__()
+        assert output_stride == 32  # no other output stride is implemented
+        self.num_classes = num_classes
+        self.num_features = 4096  # width produced by pre_logits and consumed by the classifier head
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False  # stays False: set_grad_checkpointing() rejects enable=True
+        self.use_norm = norm_layer is not None
+        self.feature_info = []  # one entry recorded before every pool, plus one for the final output
+        prev_chs = in_chans
+        net_stride = 1  # cumulative downsampling factor; doubled by every pool
+        pool_layer = nn.MaxPool2d
+        layers: List[nn.Module] = []
+        for v in cfg:
+            last_idx = len(layers) - 1  # index of the most recently added layer
+            if v == 'M':  # pool marker: note the pre-pool feature map, then halve the resolution
+                self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{last_idx}'))
+                layers += [pool_layer(kernel_size=2, stride=2)]
+                net_stride *= 2
+            else:
+                v = cast(int, v)  # any non-'M' entry is a conv output channel count
+                conv2d = conv_layer(prev_chs, v, kernel_size=3, padding=1)
+                if norm_layer is not None:
+                    layers += [conv2d, norm_layer(v), act_layer(inplace=True)]
+                else:
+                    layers += [conv2d, act_layer(inplace=True)]
+                prev_chs = v
+        self.features = nn.Sequential(*layers)
+        self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{len(layers) - 1}'))
+
+        self.pre_logits = ConvMlp(
+            prev_chs, self.num_features, 7, mlp_ratio=mlp_ratio,
+            drop_rate=drop_rate, act_layer=act_layer, conv_layer=conv_layer)
+        self.head = ClassifierHead(
+            self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        self._initialize_weights()
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        # this treats BN layers as separate groups for bn variants, a lot of effort to fix that
+        return dict(stem=r'^features\.0', blocks=r'^features\.(\d+)')  # `coarse` is not consulted
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, 'gradient checkpointing not supported'  # only enable=False is accepted
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.head = ClassifierHead(  # a fresh head replaces the old one; pre_logits is left untouched
+            self.num_features, self.num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.features(x)
+        return x
+
+    def forward_head(self, x: torch.Tensor, pre_logits: bool = False):
+        x = self.pre_logits(x)
+        return x if pre_logits else self.head(x)  # pre_logits=True returns the ConvMlp output, skipping the head
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+    def _initialize_weights(self) -> None:  # Kaiming-normal convs, unit BN scale, N(0, 0.01) linears, zero biases
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.constant_(m.bias, 0)
+
+
+def _filter_fn(state_dict):
+    """ map 'classifier.{0,3,6}' keys to pre_logits.fc1/fc2 and head.fc; reshape the fc1/fc2 weights to conv kernels"""
+    out_dict = {}
+    for k, v in state_dict.items():
+        k_r = k  # renamed key; keys without a 'classifier.N' part pass through unchanged
+        k_r = k_r.replace('classifier.0', 'pre_logits.fc1')
+        k_r = k_r.replace('classifier.3', 'pre_logits.fc2')
+        k_r = k_r.replace('classifier.6', 'head.fc')
+        if 'classifier.0.weight' in k:
+            v = v.reshape(-1, 512, 7, 7)  # weight -> 7x7 kernel over 512 input channels (fc1 is a conv here)
+        if 'classifier.3.weight' in k:
+            v = v.reshape(-1, 4096, 1, 1)  # weight -> 1x1 kernel over 4096 input channels (fc2 is a conv here)
+        out_dict[k_r] = v
+    return out_dict
+
+
+def _create_vgg(variant: str, pretrained: bool, **kwargs: Any) -> VGG:
+    cfg = variant.split('_')[0]
+    # NOTE: VGG is one of few models with stride==1 features w/ 6 out_indices [0..5]
+    out_indices = kwargs.pop('out_indices', (0, 1, 2, 3, 4, 5))
+    model = build_model_with_cfg(
+        VGG, variant, pretrained,
+        model_cfg=cfgs[cfg],
+        feature_cfg=dict(flatten_sequential=True, out_indices=out_indices),
+        pretrained_filter_fn=_filter_fn,
+        **kwargs)
+    return model
+
+
+@register_model
+def vgg11(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 11-layer model (configuration "A") from
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg11', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg11_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 11-layer model (configuration "A") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg11_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg13(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 13-layer model (configuration "B")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg13', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg13_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 13-layer model (configuration "B") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg13_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg16(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 16-layer model (configuration "D")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg16', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 16-layer model (configuration "D") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg16_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg19(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 19-layer model (configuration "E")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg19', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg19_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 19-layer model (configuration 'E') with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg19_bn', pretrained=pretrained, **model_args)
\ No newline at end of file
diff --git a/src/custom_timm/models/visformer.py b/src/custom_timm/models/visformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a95be8cbc7c92c6242cb3c3e762949f6f6be8f4
--- /dev/null
+++ b/src/custom_timm/models/visformer.py
@@ -0,0 +1,429 @@
+""" Visformer
+
+Paper: Visformer: The Vision-friendly Transformer - https://arxiv.org/abs/2104.12533
+
+From original at https://github.com/danczs/Visformer
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import to_2tuple, trunc_normal_, DropPath, PatchEmbed, LayerNorm2d, create_classifier
+from .registry import register_model
+
+
+__all__ = ['Visformer']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.0', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = dict(  # only visformer_small carries a pretrained-weight URL; visformer_tiny uses the bare defaults
+    visformer_tiny=_cfg(),
+    visformer_small=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/visformer_small-839e1f5b.pth'
+    ),
+)
+
+
+class SpatialMlp(nn.Module):  # 1x1 conv -> act -> dropout -> optional grouped 3x3 conv + act -> 1x1 conv -> dropout
+    def __init__(
+            self, in_features, hidden_features=None, out_features=None,
+            act_layer=nn.GELU, drop=0., group=8, spatial_conv=False):
+        super().__init__()
+        out_features = out_features or in_features  # both widths fall back to the input width
+        hidden_features = hidden_features or in_features
+        drop_probs = to_2tuple(drop)  # indexed below as [0] for drop1 and [1] for drop3
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.spatial_conv = spatial_conv
+        if self.spatial_conv:  # the hidden width is then derived from in_features, overriding hidden_features
+            if group < 2:  # net setting
+                hidden_features = in_features * 5 // 6
+            else:
+                hidden_features = in_features * 2
+        self.hidden_features = hidden_features
+        self.group = group
+        self.conv1 = nn.Conv2d(in_features, hidden_features, 1, stride=1, padding=0, bias=False)
+        self.act1 = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        if self.spatial_conv:
+            self.conv2 = nn.Conv2d(  # grouped 3x3 conv that keeps the hidden width and spatial size
+                hidden_features, hidden_features, 3, stride=1, padding=1, groups=self.group, bias=False)
+            self.act2 = act_layer()
+        else:
+            self.conv2 = None  # forward() skips the spatial step when this is None
+            self.act2 = None
+        self.conv3 = nn.Conv2d(hidden_features, out_features, 1, stride=1, padding=0, bias=False)
+        self.drop3 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.act1(x)
+        x = self.drop1(x)
+        if self.conv2 is not None:
+            x = self.conv2(x)
+            x = self.act2(x)
+        x = self.conv3(x)
+        x = self.drop3(x)
+        return x
+
+
+class Attention(nn.Module):  # multi-head self-attention over the positions of a 4-D map, with 1x1 conv projections
+    def __init__(self, dim, num_heads=8, head_dim_ratio=1., attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = round(dim // num_heads * head_dim_ratio)  # per-head width, scaled by head_dim_ratio
+        self.head_dim = head_dim
+        self.scale = head_dim ** -0.5
+        self.qkv = nn.Conv2d(dim, head_dim * num_heads * 3, 1, stride=1, padding=0, bias=False)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Conv2d(self.head_dim * self.num_heads, dim, 1, stride=1, padding=0, bias=False)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        x = self.qkv(x).reshape(B, 3, self.num_heads, self.head_dim, -1).permute(1, 0, 2, 4, 3)
+        q, k, v = x.unbind(0)  # each (B, num_heads, H*W, head_dim)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, num_heads, H*W, H*W) scaled dot products
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+        x = x.permute(0, 1, 3, 2).reshape(B, -1, H, W)  # back to (B, num_heads * head_dim, H, W)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):  # residual block: optional attention branch, then a SpatialMlp branch, each normalised first
+    def __init__(
+            self, dim, num_heads, head_dim_ratio=1., mlp_ratio=4.,
+            drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm2d,
+            group=8, attn_disabled=False, spatial_conv=False):
+        super().__init__()
+        self.spatial_conv = spatial_conv
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()  # shared by both branches
+        if attn_disabled:  # attention-free block: forward() then runs only the MLP branch
+            self.norm1 = None
+            self.attn = None
+        else:
+            self.norm1 = norm_layer(dim)
+            self.attn = Attention(
+                dim, num_heads=num_heads, head_dim_ratio=head_dim_ratio, attn_drop=attn_drop, proj_drop=drop)
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = SpatialMlp(
+            in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop,
+            group=group, spatial_conv=spatial_conv)  # new setting
+
+    def forward(self, x):
+        if self.attn is not None:
+            x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class Visformer(nn.Module):  # three stages of Blocks, each fed by a patch embedding (only the first with vit_stem)
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, init_channels=32, embed_dim=384,
+            depth=12, num_heads=6, mlp_ratio=4., drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+            norm_layer=LayerNorm2d, attn_stage='111', pos_embed=True, spatial_conv='111',
+            vit_stem=False, group=8, global_pool='avg', conv_init=False, embed_norm=None):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        self.num_classes = num_classes
+        self.embed_dim = embed_dim
+        self.init_channels = init_channels
+        self.img_size = img_size
+        self.vit_stem = vit_stem
+        self.conv_init = conv_init
+        if isinstance(depth, (list, tuple)):
+            self.stage_num1, self.stage_num2, self.stage_num3 = depth  # explicit per-stage block counts
+            depth = sum(depth)
+        else:
+            self.stage_num1 = self.stage_num3 = depth // 3  # int depth: a third each for stages 1 and 3 ...
+            self.stage_num2 = depth - self.stage_num1 - self.stage_num3  # ... and the remainder for stage 2
+        self.pos_embed = pos_embed  # flag only; the learned tensors are pos_embed1 / pos_embed2 / pos_embed3
+        self.grad_checkpointing = False
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # one drop-path rate per block
+        # stage 1
+        if self.vit_stem:
+            self.stem = None  # no conv stem: a single patch embedding at full embed_dim
+            self.patch_embed1 = PatchEmbed(
+                img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+                embed_dim=embed_dim, norm_layer=embed_norm, flatten=False)
+            img_size = [x // patch_size for x in img_size]  # img_size tracks the feature-map size from here on
+        else:
+            if self.init_channels is None:
+                self.stem = None
+                self.patch_embed1 = PatchEmbed(
+                    img_size=img_size, patch_size=patch_size // 2, in_chans=in_chans,
+                    embed_dim=embed_dim // 2, norm_layer=embed_norm, flatten=False)
+                img_size = [x // (patch_size // 2) for x in img_size]
+            else:
+                self.stem = nn.Sequential(  # stride-2 7x7 conv stem, so patch_embed1 uses patch_size // 4
+                    nn.Conv2d(in_chans, self.init_channels, 7, stride=2, padding=3, bias=False),
+                    nn.BatchNorm2d(self.init_channels),
+                    nn.ReLU(inplace=True)
+                )
+                img_size = [x // 2 for x in img_size]
+                self.patch_embed1 = PatchEmbed(
+                    img_size=img_size, patch_size=patch_size // 4, in_chans=self.init_channels,
+                    embed_dim=embed_dim // 2, norm_layer=embed_norm, flatten=False)
+                img_size = [x // (patch_size // 4) for x in img_size]
+
+        if self.pos_embed:
+            if self.vit_stem:
+                self.pos_embed1 = nn.Parameter(torch.zeros(1, embed_dim, *img_size))
+            else:
+                self.pos_embed1 = nn.Parameter(torch.zeros(1, embed_dim//2, *img_size))
+            self.pos_drop = nn.Dropout(p=drop_rate)  # only created when positional embeddings are enabled
+        self.stage1 = nn.Sequential(*[
+            Block(
+                dim=embed_dim//2, num_heads=num_heads, head_dim_ratio=0.5, mlp_ratio=mlp_ratio,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                group=group, attn_disabled=(attn_stage[0] == '0'), spatial_conv=(spatial_conv[0] == '1')
+            )
+            for i in range(self.stage_num1)
+        ])
+
+        # stage2
+        if not self.vit_stem:  # with vit_stem no patch_embed2 / pos_embed2 is created
+            self.patch_embed2 = PatchEmbed(
+                img_size=img_size, patch_size=patch_size // 8, in_chans=embed_dim // 2,
+                embed_dim=embed_dim, norm_layer=embed_norm, flatten=False)
+            img_size = [x // (patch_size // 8) for x in img_size]
+            if self.pos_embed:
+                self.pos_embed2 = nn.Parameter(torch.zeros(1, embed_dim, *img_size))
+        self.stage2 = nn.Sequential(*[
+            Block(
+                dim=embed_dim, num_heads=num_heads, head_dim_ratio=1.0, mlp_ratio=mlp_ratio,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                group=group, attn_disabled=(attn_stage[1] == '0'), spatial_conv=(spatial_conv[1] == '1')
+            )
+            for i in range(self.stage_num1, self.stage_num1+self.stage_num2)
+        ])
+
+        # stage 3
+        if not self.vit_stem:  # with vit_stem no patch_embed3 / pos_embed3 is created
+            self.patch_embed3 = PatchEmbed(
+                img_size=img_size, patch_size=patch_size // 8, in_chans=embed_dim,
+                embed_dim=embed_dim * 2, norm_layer=embed_norm, flatten=False)
+            img_size = [x // (patch_size // 8) for x in img_size]
+            if self.pos_embed:
+                self.pos_embed3 = nn.Parameter(torch.zeros(1, embed_dim*2, *img_size))
+        self.stage3 = nn.Sequential(*[
+            Block(
+                dim=embed_dim*2, num_heads=num_heads, head_dim_ratio=1.0, mlp_ratio=mlp_ratio,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                group=group, attn_disabled=(attn_stage[2] == '0'), spatial_conv=(spatial_conv[2] == '1')
+            )
+            for i in range(self.stage_num1+self.stage_num2, depth)
+        ])
+
+        # head
+        self.num_features = embed_dim if self.vit_stem else embed_dim * 2  # embed_dim * 2 is patch_embed3's width
+        self.norm = norm_layer(self.num_features)
+        self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+        # weights init
+        if self.pos_embed:
+            trunc_normal_(self.pos_embed1, std=0.02)
+            if not self.vit_stem:
+                trunc_normal_(self.pos_embed2, std=0.02)
+                trunc_normal_(self.pos_embed3, std=0.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):  # per-module initialiser passed to self.apply()
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv2d):
+            if self.conv_init:  # conv_init switches convs from truncated-normal to Kaiming-normal init
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            else:
+                trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0.)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^patch_embed1|pos_embed1|stem',  # stem and embed
+            blocks=[  # NOTE(review): both branches of the conditional below are identical, so `coarse` has no effect
+                (r'^stage(\d+)\.(\d+)' if coarse else r'^stage(\d+)\.(\d+)', None),
+                (r'^(?:patch_embed|pos_embed)(\d+)', (0,)),
+                (r'^norm', (99999,))
+            ]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        if self.stem is not None:
+            x = self.stem(x)
+
+        # stage 1
+        x = self.patch_embed1(x)
+        if self.pos_embed:
+            x = self.pos_drop(x + self.pos_embed1)
+        if self.grad_checkpointing and not torch.jit.is_scripting():  # checkpointed path is skipped when scripting
+            x = checkpoint_seq(self.stage1, x)
+        else:
+            x = self.stage1(x)
+
+        # stage 2
+        if not self.vit_stem:
+            x = self.patch_embed2(x)
+            if self.pos_embed:
+                x = self.pos_drop(x + self.pos_embed2)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.stage2, x)
+        else:
+            x = self.stage2(x)
+
+        # stage3
+        if not self.vit_stem:
+            x = self.patch_embed3(x)
+            if self.pos_embed:
+                x = self.pos_drop(x + self.pos_embed3)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.stage3, x)
+        else:
+            x = self.stage3(x)
+
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        return x if pre_logits else self.head(x)  # pre_logits=True returns the pooled features, skipping the head
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_visformer(variant, pretrained=False, default_cfg=None, **kwargs):
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    model = build_model_with_cfg(Visformer, variant, pretrained, **kwargs)
+    return model
+
+
+@register_model
+def visformer_tiny(pretrained=False, **kwargs):
+    model_cfg = dict(
+        init_channels=16, embed_dim=192, depth=(7, 4, 4), num_heads=3, mlp_ratio=4., group=8,
+        attn_stage='011', spatial_conv='100', norm_layer=nn.BatchNorm2d, conv_init=True,
+        embed_norm=nn.BatchNorm2d, **kwargs)
+    model = _create_visformer('visformer_tiny', pretrained=pretrained, **model_cfg)
+    return model
+
+
+@register_model
+def visformer_small(pretrained=False, **kwargs):
+    model_cfg = dict(
+        init_channels=32, embed_dim=384, depth=(7, 4, 4), num_heads=6, mlp_ratio=4., group=8,
+        attn_stage='011', spatial_conv='100', norm_layer=nn.BatchNorm2d, conv_init=True,
+        embed_norm=nn.BatchNorm2d, **kwargs)
+    model = _create_visformer('visformer_small', pretrained=pretrained, **model_cfg)
+    return model
+
+
+# @register_model
+# def visformer_net1(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=None, embed_dim=384, depth=(0, 12, 0), num_heads=6, mlp_ratio=4., attn_stage='111',
+#         spatial_conv='000', vit_stem=True, conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net2(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=(0, 12, 0), num_heads=6, mlp_ratio=4., attn_stage='111',
+#         spatial_conv='000', vit_stem=False, conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net3(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., attn_stage='111',
+#         spatial_conv='000', vit_stem=False, conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net4(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., attn_stage='111',
+#         spatial_conv='000', vit_stem=False, conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net5(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., group=1, attn_stage='111',
+#         spatial_conv='111', vit_stem=False, conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net6(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., group=1, attn_stage='111',
+#         pos_embed=False, spatial_conv='111', conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+#
+#
+# @register_model
+# def visformer_net7(pretrained=False, **kwargs):
+#     model = Visformer(
+#         init_channels=32, embed_dim=384, depth=(6, 7, 7), num_heads=6, group=1, attn_stage='000',
+#         pos_embed=False, spatial_conv='111', conv_init=True, **kwargs)
+#     model.default_cfg = _cfg()
+#     return model
+
+
+
+
diff --git a/src/custom_timm/models/vision_transformer.py b/src/custom_timm/models/vision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..52c406b23b7dc1aace4e955febe59964b666894b
--- /dev/null
+++ b/src/custom_timm/models/vision_transformer.py
@@ -0,0 +1,1256 @@
+""" Vision Transformer (ViT) in PyTorch
+
+A PyTorch implementation of Vision Transformers as described in:
+
+'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
+    - https://arxiv.org/abs/2010.11929
+
+`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
+    - https://arxiv.org/abs/2106.10270
+
+The official jax code is released and available at https://github.com/google-research/vision_transformer
+
+Acknowledgments:
+* The paper authors for releasing code and weights, thanks!
+* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... check it out
+for some einops/einsum fun
+* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
+* Bert reference code checks against Huggingface Transformers and Tensorflow Bert
+
+Hacked together by / Copyright 2020, Ross Wightman
+"""
+import math
+import logging
+from functools import partial
+from collections import OrderedDict
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\
+    OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply, adapt_input_conv, checkpoint_seq
+from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_
+from .registry import register_model
+
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    """ Build a pretrained-config dict: the ViT defaults below, with any ``kwargs`` entries layered on top """
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        first_conv='patch_embed.proj', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # patch models (weights from official Google JAX impl)
+    'vit_tiny_patch16_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'),
+    'vit_tiny_patch16_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_small_patch32_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'),
+    'vit_small_patch32_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_small_patch16_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'),
+    'vit_small_patch16_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_patch32_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'),
+    'vit_base_patch32_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_patch16_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'),
+    'vit_base_patch16_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_patch8_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'),
+    'vit_large_patch32_224': _cfg(
+        url='',  # no official model weights for this combo, only for in21k
+        ),
+    'vit_large_patch32_384': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_large_patch16_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz'),
+    'vit_large_patch16_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+
+    'vit_large_patch14_224': _cfg(url=''),
+    'vit_huge_patch14_224': _cfg(url=''),
+    'vit_giant_patch14_224': _cfg(url=''),
+    'vit_gigantic_patch14_224': _cfg(url=''),
+
+
+    # patch models, imagenet21k (weights from official Google JAX impl)
+    'vit_tiny_patch16_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_small_patch32_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_small_patch16_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_base_patch32_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_base_patch16_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_base_patch8_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
+        num_classes=21843),
+    'vit_large_patch32_224_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
+        num_classes=21843),
+    'vit_large_patch16_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz',
+        num_classes=21843),
+    'vit_huge_patch14_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/imagenet21k/ViT-H_14.npz',
+        hf_hub_id='timm/vit_huge_patch14_224_in21k',
+        num_classes=21843),
+
+    # SAM trained models (https://arxiv.org/abs/2106.01548)
+    'vit_base_patch32_224_sam': _cfg(
+        url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz'),
+    'vit_base_patch16_224_sam': _cfg(
+        url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz'),
+
+    # DINO pretrained - https://arxiv.org/abs/2104.14294 (no classifier head, for fine-tune only)
+    'vit_small_patch16_224_dino': _cfg(
+        url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
+    'vit_small_patch8_224_dino': _cfg(
+        url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
+    'vit_base_patch16_224_dino': _cfg(
+        url='https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
+    'vit_base_patch8_224_dino': _cfg(
+        url='https://dl.fbaipublicfiles.com/dino/dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
+
+
+    # ViT ImageNet-21K-P pretraining by MIIL
+    'vit_base_patch16_224_miil_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
+        mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', num_classes=11221),
+    'vit_base_patch16_224_miil': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_1k_miil_84_4-2deb18e3.pth',
+        mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
+
+    'vit_base_patch16_rpn_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth'),
+
+    # experimental (may be removed)
+    'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
+    'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
+    'vit_small_patch16_36x1_224': _cfg(url=''),
+    'vit_small_patch16_18x2_224': _cfg(url=''),
+    'vit_base_patch16_18x2_224': _cfg(url=''),
+
+    'vit_base_patch32_224_clip_laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_large_patch14_224_clip_laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, num_classes=768),
+    'vit_huge_patch14_224_clip_laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024),
+    'vit_giant_patch14_224_clip_laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024),
+
+}
+
+
+class Attention(nn.Module):
+    """ Multi-head self-attention: fused qkv projection, scaled softmax attention, output projection """
+
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        self.scale = (dim // num_heads) ** -0.5  # 1 / sqrt(per-head width), applied to the q @ k^T logits
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        batch, tokens, channels = x.shape
+        # (B, N, 3 * C) -> (3, B, heads, N, C // heads)
+        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, channels // self.num_heads)
+        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)  # unbind keeps torchscript happy (no tensor-as-tuple)
+
+        logits = (q @ k.transpose(-2, -1)) * self.scale
+        weights = self.attn_drop(logits.softmax(dim=-1))
+
+        # merge the heads back: (B, heads, N, C // heads) -> (B, N, C)
+        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
+        return self.proj_drop(self.proj(merged))
+
+
+class LayerScale(nn.Module):  # multiplies its input by a learnable vector ``gamma`` of length ``dim``
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))  # every entry starts at init_values
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma  # mul_ overwrites the input tensor itself
+
+
+class Block(nn.Module):
+    """ Transformer block: residual attention then residual MLP, each computed as x + drop_path(ls(f(norm(x)))) """
+    def __init__(
+            self,
+            dim,
+            num_heads,
+            mlp_ratio=4.,
+            qkv_bias=False,
+            drop=0.,
+            attn_drop=0.,
+            init_values=None,
+            drop_path=0.,
+            act_layer=nn.GELU,
+            norm_layer=nn.LayerNorm
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class ResPostBlock(nn.Module):
+    """ Residual block with the norm applied after each branch: x + drop_path(norm(f(x))) for attention, then MLP """
+    def __init__(
+            self,
+            dim,
+            num_heads,
+            mlp_ratio=4.,
+            qkv_bias=False,
+            drop=0.,
+            attn_drop=0.,
+            init_values=None,
+            drop_path=0.,
+            act_layer=nn.GELU,
+            norm_layer=nn.LayerNorm
+    ):
+        super().__init__()
+        self.init_values = init_values
+
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.norm1 = norm_layer(dim)
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+        self.norm2 = norm_layer(dim)
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.init_weights()
+
+    def init_weights(self):
+        # NOTE: overrides the base model init for this block type; both norm weights are set to init_values if given
+        if self.init_values is not None:
+            nn.init.constant_(self.norm1.weight, self.init_values)
+            nn.init.constant_(self.norm2.weight, self.init_values)
+
+    def forward(self, x):
+        x = x + self.drop_path1(self.norm1(self.attn(x)))
+        x = x + self.drop_path2(self.norm2(self.mlp(x)))
+        return x
+
+
+class ParallelBlock(nn.Module):
+    """ Block with ``num_parallel`` attention branches, then ``num_parallel`` MLP branches, each set summed into x """
+    def __init__(
+            self,
+            dim,
+            num_heads,
+            num_parallel=2,
+            mlp_ratio=4.,
+            qkv_bias=False,
+            init_values=None,
+            drop=0.,
+            attn_drop=0.,
+            drop_path=0.,
+            act_layer=nn.GELU,
+            norm_layer=nn.LayerNorm
+    ):
+        super().__init__()
+        self.num_parallel = num_parallel
+        self.attns = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        for _ in range(num_parallel):
+            self.attns.append(nn.Sequential(OrderedDict([
+                ('norm', norm_layer(dim)),
+                ('attn', Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)),
+                ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()),
+                ('drop_path', DropPath(drop_path) if drop_path > 0. else nn.Identity())
+            ])))
+            self.ffns.append(nn.Sequential(OrderedDict([
+                ('norm', norm_layer(dim)),
+                ('mlp', Mlp(dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)),
+                ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()),
+                ('drop_path', DropPath(drop_path) if drop_path > 0. else nn.Identity())
+            ])))
+
+    def _forward_jit(self, x):  # stack-then-sum form, taken by forward() when scripting or tracing
+        x = x + torch.stack([attn(x) for attn in self.attns]).sum(dim=0)
+        x = x + torch.stack([ffn(x) for ffn in self.ffns]).sum(dim=0)
+        return x
+
+    @torch.jit.ignore
+    def _forward(self, x):  # eager form: plain Python sum over the branch outputs
+        x = x + sum(attn(x) for attn in self.attns)
+        x = x + sum(ffn(x) for ffn in self.ffns)
+        return x
+
+    def forward(self, x):
+        if torch.jit.is_scripting() or torch.jit.is_tracing():
+            return self._forward_jit(x)
+        else:
+            return self._forward(x)
+
+
+class VisionTransformer(nn.Module):
+    """ Vision Transformer, a PyTorch impl of `An Image is Worth 16x16 Words: Transformers for Image Recognition
+    at Scale` - https://arxiv.org/abs/2010.11929
+    """
+
+    def __init__(
+            self,
+            img_size=224,
+            patch_size=16,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='token',
+            embed_dim=768,
+            depth=12,
+            num_heads=12,
+            mlp_ratio=4.,
+            qkv_bias=True,
+            init_values=None,
+            class_token=True,
+            no_embed_class=False,
+            pre_norm=False,
+            fc_norm=None,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.,
+            weight_init='',
+            embed_layer=PatchEmbed,
+            norm_layer=None,
+            act_layer=None,
+            block_fn=Block,
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head, head is Identity when <= 0
+            global_pool (str): type of global pooling for final sequence: '', 'avg' or 'token' (default: 'token')
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            init_values (Optional[float]): layer-scale init values
+            class_token (bool): use class token
+            no_embed_class (bool): pos_embed has no class-token slot, it is added before the class token is prepended
+            pre_norm (bool): apply norm_layer right after the position embedding, patch embed is built without bias
+            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            weight_init (str): weight init scheme, 'skip' bypasses init_weights()
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer, act_layer (nn.Module): normalization / MLP activation layer (LayerNorm eps=1e-6 / GELU if None)
+            block_fn (nn.Module): transformer block class, instantiated ``depth`` times
+        """
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        assert class_token or global_pool != 'token'
+        use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.no_embed_class = no_embed_class
+        self.grad_checkpointing = False
+
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * .02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(*[
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                init_values=init_values,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer
+            )
+            for i in range(depth)])
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+
+        # Classifier Head
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        if weight_init != 'skip':
+            self.init_weights(weight_init)
+
+    def init_weights(self, mode=''):
+        assert mode in ('jax', 'jax_nlhb', 'moco', '')
+        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
+        trunc_normal_(self.pos_embed, std=.02)
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        named_apply(get_init_weights_vit(mode, head_bias), self)
+
+    def _init_weights(self, m):
+        # this fn left here for compat with downstream users
+        init_weights_vit_timm(m)
+
+    @torch.jit.ignore()
+    def load_pretrained(self, checkpoint_path, prefix=''):
+        _load_weights(self, checkpoint_path, prefix)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token', 'dist_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'avg', 'token')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def _pos_embed(self, x):
+        if self.no_embed_class:
+            # deit-3, updated JAX (big vision)
+            # position embedding does not overlap with class token, add then concat
+            x = x + self.pos_embed
+            if self.cls_token is not None:
+                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        else:
+            # original timm, JAX, and deit vit impl
+            # pos_embed has entry for class token, concat then add
+            if self.cls_token is not None:
+                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+            x = x + self.pos_embed
+        return self.pos_drop(x)
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.norm_pre(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:  # '' keeps the full token sequence (no pooling)
+            x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
+        x = self.fc_norm(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ''):
+    """ ViT weight initialization, original timm impl (for reproducibility); ``name`` is accepted but unused here """
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()  # non-Linear modules that define init_weights() initialise themselves
+
+
+def init_weights_vit_jax(module: nn.Module, name: str = '', head_bias: float = 0.):
+    """ ViT weight initialization, matching JAX (Flax) impl; ``name`` picks out the head and mlp special cases """
+    if isinstance(module, nn.Linear):
+        if name.startswith('head'):
+            nn.init.zeros_(module.weight)
+            nn.init.constant_(module.bias, head_bias)  # NOTE(review): assumes the head Linear has a bias
+        else:
+            nn.init.xavier_uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.normal_(module.bias, std=1e-6) if 'mlp' in name else nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        lecun_normal_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def init_weights_vit_moco(module: nn.Module, name: str = ''):
+    """ ViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed; only nn.Linear is touched directly """
+    if isinstance(module, nn.Linear):
+        if 'qkv' in name:  # Q, K, V are stacked along dim 0 of the fused weight: bound each third on its own
+            stacked_out, fan_in = module.weight.shape
+            limit = math.sqrt(6. / float(stacked_out // 3 + fan_in))
+            nn.init.uniform_(module.weight, -limit, limit)
+        else:
+            nn.init.xavier_uniform_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def get_init_weights_vit(mode='jax', head_bias: float = 0.):
+    """ Pick the per-module init callable for ``mode``: 'jax' is checked first, then 'moco', else the timm default """
+    if 'jax' in mode:
+        return partial(init_weights_vit_jax, head_bias=head_bias)
+    if 'moco' in mode:
+        return init_weights_vit_moco
+    return init_weights_vit_timm
+
+
+@torch.no_grad()
+def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation, copying them in place into
+    ``model``. The patch-embed bias, class token and classifier head are skipped when either side does not have them.
+    """
+    import numpy as np
+
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(
+            model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    # the projection has no bias when the patch embed was built with bias=False (pre_norm models)
+    if model.patch_embed.proj.bias is not None:
+        model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    # cls_token is None on models built with class_token=False
+    if model.cls_token is not None:
+        model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens', 1), model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+    if (isinstance(model.head, nn.Linear) and
+            f'{prefix}head/bias' in w and  # checkpoints without a classifier head must not raise KeyError
+            model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]):
+        model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+        model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+    # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
+        block.attn.qkv.bias.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
+        block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
+    """ Rescale the grid of position embeddings when loading from state_dict; prefix tokens are carried over as-is.
+
+    Adapted from
+    https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+    """
+    _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
+    n_prefix = num_prefix_tokens or 0  # any falsy value means the whole table is patch grid
+    grid_tokens_new = posemb_new.shape[1] - n_prefix
+    prefix_part, grid_part = posemb[:, :n_prefix], posemb[0, n_prefix:]
+    side_old = int(math.sqrt(len(grid_part)))  # the stored grid is treated as square
+    if not len(gs_new):  # backwards compatibility
+        gs_new = [int(math.sqrt(grid_tokens_new))] * 2
+    assert len(gs_new) >= 2
+    _logger.info('Position embedding grid-size from %s to %s', [side_old, side_old], gs_new)
+    # interpolate on a channels-first layout: (1, side, side, C) -> (1, C, side, side), then back afterwards
+    as_image = grid_part.reshape(1, side_old, side_old, -1).permute(0, 3, 1, 2)
+    as_image = F.interpolate(as_image, size=gs_new, mode='bicubic', align_corners=False)
+    grid_part = as_image.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
+    return torch.cat([prefix_part, grid_part], dim=1)
+
+
+def _convert_openai_clip(state_dict, model):
+    """ Remap the `visual.*` entries of a CLIP checkpoint to this ViT's parameter names (other keys are dropped).
+    `proj` becomes `head.weight` plus a zero `head.bias`; `pos_embed` is resized on token-count mismatch.
+    """
+    out_dict = {}
+    swaps = [
+        ('visual.', ''), ('conv1', 'patch_embed.proj'), ('positional_embedding', 'pos_embed'),
+        ('transformer.resblocks.', 'blocks.'), ('ln_pre', 'norm_pre'), ('ln_post', 'norm'), ('ln_', 'norm'),
+        ('in_proj_', 'qkv.'), ('out_proj', 'proj'), ('mlp.c_fc', 'mlp.fc1'), ('mlp.c_proj', 'mlp.fc2'),
+    ]
+    for k, v in state_dict.items():
+        if not k.startswith('visual.'):
+            continue
+        for old, new in swaps:  # applied in order: 'ln_pre'/'ln_post' must be rewritten before 'ln_'
+            k = k.replace(old, new)
+
+        if k == 'proj':
+            k = 'head.weight'
+            v = v.transpose(0, 1)
+            out_dict['head.bias'] = torch.zeros(v.shape[0])
+        elif k == 'class_embedding':
+            k = 'cls_token'
+            v = v.unsqueeze(0).unsqueeze(1)
+        elif k == 'pos_embed':
+            v = v.unsqueeze(0)
+            if v.shape[1] != model.pos_embed.shape[1]:
+                # `no_embed_class` defaults to False so a model without it falls back to `num_prefix_tokens`
+                no_prefix = getattr(model, 'no_embed_class', False)
+                num_prefix = 0 if no_prefix else getattr(model, 'num_prefix_tokens', 1)
+                v = resize_pos_embed(v, model.pos_embed, num_prefix, model.patch_embed.grid_size)
+        out_dict[k] = v
+    return out_dict
+
+
+def checkpoint_filter_fn(state_dict, model, adapt_layer_scale=False):
+    """ Adapt a checkpoint state dict to `model` and return the result as a new dict.
+    Unwraps a 'model' entry, delegates CLIP checkpoints, reshapes pre-conv patch-embed weights, resizes
+    `pos_embed` on token-count mismatch, renames `gamma_N` keys if `adapt_layer_scale`, drops `pre_logits`.
+    """
+    import re
+    out_dict = {}
+    if 'model' in state_dict:
+        # For deit models
+        state_dict = state_dict['model']
+
+    if 'visual.class_embedding' in state_dict:
+        return _convert_openai_clip(state_dict, model)
+
+    for k, v in state_dict.items():
+        if 'patch_embed.proj.weight' in k and len(v.shape) < 4:
+            # For old models trained prior to conv based patchification (manual patchify + linear proj)
+            out_ch, _, kernel_h, kernel_w = model.patch_embed.proj.weight.shape
+            v = v.reshape(out_ch, -1, kernel_h, kernel_w)
+        elif k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]:
+            # `no_embed_class` defaults to False so a model without it falls back to `num_prefix_tokens`
+            no_prefix = getattr(model, 'no_embed_class', False)
+            num_prefix = 0 if no_prefix else getattr(model, 'num_prefix_tokens', 1)
+            v = resize_pos_embed(v, model.pos_embed, num_prefix, model.patch_embed.grid_size)
+        elif adapt_layer_scale and 'gamma_' in k:
+            # remap layer-scale gamma into sub-module (deit3 models)
+            k = re.sub(r'gamma_([0-9])', r'ls\1.gamma', k)
+        elif 'pre_logits' in k:
+            # NOTE representation layer removed as not used in latest 21k/1k pretrained weights
+            continue
+        out_dict[k] = v
+    return out_dict
+
+
+def _create_vision_transformer(variant, pretrained=False, **kwargs):
+    """ Build the VisionTransformer `variant` via `build_model_with_cfg`; `features_only` is rejected. """
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    cfg = resolve_pretrained_cfg(variant, pretrained_cfg=kwargs.pop('pretrained_cfg', None))
+    # the custom-load flag is switched on whenever the configured weight URL contains 'npz'
+    use_custom_load = 'npz' in cfg['url']
+    return build_model_with_cfg(
+        VisionTransformer, variant, pretrained, pretrained_cfg=cfg,
+        pretrained_filter_fn=checkpoint_filter_fn, pretrained_custom_load=use_custom_load,
+        **kwargs)
+
+
+@register_model
+def vit_tiny_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Tiny (ViT-Ti/16).
+    Arch: patch 16, embed dim 192, depth 12, 3 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer('vit_tiny_patch16_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_tiny_patch16_384(pretrained=False, **kwargs):
+    """ ViT-Tiny (ViT-Ti/16) @ 384x384.
+    Arch: patch 16, embed dim 192, depth 12, 3 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer('vit_tiny_patch16_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch32_224(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/32).
+    Arch: patch 32, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch32_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch32_384(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/32) @ 384x384.
+    Arch: patch 32, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch32_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16).
+    NOTE: the earlier 'small' model definition and weights were replaced by the small variant from the DeiT paper.
+    Arch: patch 16, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_384(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16), registered under the `vit_small_patch16_384` variant.
+    NOTE: the earlier 'small' model definition and weights were replaced by the small variant from the DeiT paper.
+    Arch: patch 16, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch32_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k, source https://github.com/google-research/vision_transformer.
+    Arch: patch 32, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch32_384(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+    Arch: patch 32, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+    Arch: patch 16, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_384(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+    Arch: patch 16, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch8_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+    Arch: patch 8, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch32_224(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
+    Arch: patch 32, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch32_384(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+    Arch: patch 32, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+    Arch: patch 16, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch16_384(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+    Arch: patch 16, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch14_224(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/14).
+    Arch: patch 14, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch14_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_huge_patch14_224(pretrained=False, **kwargs):
+    """ ViT-Huge (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
+    Arch: patch 14, embed dim 1280, depth 32, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_huge_patch14_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_giant_patch14_224(pretrained=False, **kwargs):
+    """ ViT-Giant (little-g, ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Arch: patch 14, embed dim 1408, mlp_ratio 48/11, depth 40, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_giant_patch14_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_gigantic_patch14_224(pretrained=False, **kwargs):
+    """ ViT-Gigantic (big-G, ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Arch: patch 14, embed dim 1664, mlp_ratio 64/13, depth 48, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_gigantic_patch14_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_tiny_patch16_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Tiny (ViT-Ti/16).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 16, embed dim 192, depth 12, 3 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer('vit_tiny_patch16_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch32_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/32)
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 32, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch32_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 16, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch32_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 32, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 16, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch8_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 8, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch8_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch32_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has a representation layer but the 21k classifier head is zero'd out in original weights
+    Arch: patch 32, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch16_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    Arch: patch 16, embed dim 1024, depth 24, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_huge_patch14_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Huge (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has a representation layer but the 21k classifier head is zero'd out in original weights
+    Arch: patch 14, embed dim 1280, depth 32, 16 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, **kwargs)
+    return _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224_sam(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548
+    Arch: patch 16, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224_sam', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch32_224_sam(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548
+    Arch: patch 32, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_224_sam', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_224_dino(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294
+    Arch: patch 16, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_224_dino', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch8_224_dino(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/8) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294
+    Arch: patch 8, embed dim 384, depth 12, 6 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer('vit_small_patch8_224_dino', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224_dino(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294
+    Arch: patch 16, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224_dino', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch8_224_dino(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/8) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294
+    Arch: patch 8, embed dim 768, depth 12, 12 heads; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer('vit_base_patch8_224_dino', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    Arch: patch 16, embed dim 768, depth 12, 12 heads, qkv_bias off; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_224_miil(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    Arch: patch 16, embed dim 768, depth 12, 12 heads, qkv_bias off; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **vit_args)
+
+
+# Experimental models below
+
+@register_model
+def vit_base_patch32_plus_256(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32+).
+    Arch: patch 32, embed dim 896, depth 12, 14 heads, init_values 1e-5; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, init_values=1e-5, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_plus_256', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_plus_240(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16+).
+    Arch: patch 16, embed dim 896, depth 12, 14 heads, init_values 1e-5; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, init_values=1e-5, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_plus_240', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_rpn_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ residual post-norm (ResPostBlock, no class token, no qkv bias).
+    `global_pool` is taken from kwargs and defaults to 'avg'; remaining kwargs are merged in and forwarded.
+    """
+    vit_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, init_values=1e-5, class_token=False,
+        block_fn=ResPostBlock, global_pool=kwargs.pop('global_pool', 'avg'), **kwargs)
+    return _create_vision_transformer('vit_base_patch16_rpn_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_36x1_224(pretrained=False, **kwargs):
+    """ ViT-Small w/ LayerScale + 36 x 1 (36 block serial) config. Experimental, may remove.
+    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
+    Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
+    Arch: patch 16, embed dim 384, depth 36, 6 heads, init_values 1e-5; caller kwargs are merged in and forwarded.
+    """
+    vit_args = dict(patch_size=16, embed_dim=384, depth=36, num_heads=6, init_values=1e-5, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_36x1_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_patch16_18x2_224(pretrained=False, **kwargs):
+    """ ViT-Small w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
+    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
+    Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
+    Arch: patch 16, embed dim 384, depth 18 of ParallelBlock, 6 heads, init_values 1e-5; kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=16, embed_dim=384, depth=18, num_heads=6, init_values=1e-5, block_fn=ParallelBlock, **kwargs)
+    return _create_vision_transformer('vit_small_patch16_18x2_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch16_18x2_224(pretrained=False, **kwargs):
+    """ ViT-Base w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
+    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
+    Arch: patch 16, embed dim 768, depth 18 of ParallelBlock, 12 heads, init_values 1e-5; kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=16, embed_dim=768, depth=18, num_heads=12, init_values=1e-5, block_fn=ParallelBlock, **kwargs)
+    return _create_vision_transformer('vit_base_patch16_18x2_224', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_patch32_224_clip_laion2b(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32)
+    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
+    Arch: patch 32, embed dim 768, depth 12, 12 heads, pre_norm on, nn.LayerNorm; caller kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
+    return _create_vision_transformer('vit_base_patch32_224_clip_laion2b', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_patch14_224_clip_laion2b(pretrained=False, **kwargs):
+    """ ViT-Large (ViT-L/14)
+    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
+    Arch: patch 14, embed dim 1024, depth 24, 16 heads, pre_norm on, nn.LayerNorm; caller kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
+    return _create_vision_transformer('vit_large_patch14_224_clip_laion2b', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_huge_patch14_224_clip_laion2b(pretrained=False, **kwargs):
+    """ ViT-Huge (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
+    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
+    Arch: patch 14, embed dim 1280, depth 32, 16 heads, pre_norm on, nn.LayerNorm; caller kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
+    return _create_vision_transformer('vit_huge_patch14_224_clip_laion2b', pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_giant_patch14_224_clip_laion2b(pretrained=False, **kwargs):
+    """ ViT-Giant (little-g, ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
+    Arch: patch 14, embed dim 1408, mlp_ratio 48/11, depth 40, 16 heads, pre_norm on; caller kwargs are forwarded.
+    """
+    vit_args = dict(
+        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16,
+        pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
+    return _create_vision_transformer('vit_giant_patch14_224_clip_laion2b', pretrained=pretrained, **vit_args)
diff --git a/src/custom_timm/models/vision_transformer_hybrid.py b/src/custom_timm/models/vision_transformer_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e8a2b1354094fd5d73e4e3c4a6231ed3f44b64b
--- /dev/null
+++ b/src/custom_timm/models/vision_transformer_hybrid.py
@@ -0,0 +1,371 @@
+""" Hybrid Vision Transformer (ViT) in PyTorch
+
+A PyTorch implementation of the Hybrid Vision Transformers as described in:
+
+'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
+    - https://arxiv.org/abs/2010.11929
+
+`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
+    - https://arxiv.org/abs/2106.10270
+
+NOTE These hybrid model definitions depend on code in vision_transformer.py.
+They were moved here to keep file sizes sane.
+
+Hacked together by / Copyright 2020, Ross Wightman
+"""
+from copy import deepcopy
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .layers import StdConv2dSame, StdConv2d, to_2tuple
+from .resnet import resnet26d, resnet50d
+from .resnetv2 import ResNetV2, create_resnetv2_stem
+from .registry import register_model
+from custom_timm.models.vision_transformer import _create_vision_transformer
+
+
+def _cfg(url='', **kwargs):
+    """ Default pretrained cfg for the hybrid ViT variants; keywords in `kwargs` override / extend the defaults. """
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+        first_conv='patch_embed.backbone.stem.conv', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {
+    # hybrid in-1k models (weights from official JAX impl where they exist); keys match the variant names used below
+    'vit_tiny_r_s16_p8_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
+        first_conv='patch_embed.backbone.conv'),
+    'vit_tiny_r_s16_p8_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        first_conv='patch_embed.backbone.conv', input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_small_r26_s32_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz',
+    ),
+    'vit_small_r26_s32_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_r26_s32_224': _cfg(),  # defaults only, no weight url
+    'vit_base_r50_s16_224': _cfg(),  # defaults only, no weight url
+    'vit_base_r50_s16_384': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth',
+        input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_large_r50_s32_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz'
+    ),
+    'vit_large_r50_s32_384': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
+        input_size=(3, 384, 384), crop_pct=1.0
+    ),
+
+    # hybrid in-21k models (weights from official Google JAX impl where they exist); all use num_classes=21843
+    'vit_tiny_r_s16_p8_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843, crop_pct=0.9, first_conv='patch_embed.backbone.conv'),
+    'vit_small_r26_s32_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz',
+        num_classes=21843, crop_pct=0.9),
+    'vit_base_r50_s16_224_in21k': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_224_in21k-6f7c7740.pth',
+        num_classes=21843, crop_pct=0.9),
+    'vit_large_r50_s32_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npz',
+        num_classes=21843, crop_pct=0.9),
+
+    # hybrid models (using timm resnet backbones); no weight urls, ImageNet default mean / std instead of 0.5
+    'vit_small_resnet26d_224': _cfg(
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'),
+    'vit_small_resnet50d_s16_224': _cfg(
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'),
+    'vit_base_resnet26d_224': _cfg(
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'),
+    'vit_base_resnet50d_224': _cfg(
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'),
+}
+
+
+class HybridEmbed(nn.Module):
+    """ CNN Feature Map Embedding
+    Extract feature map from CNN, project it to the embedding dim with a conv of kernel = stride = patch_size,
+    then flatten the spatial grid into a sequence of `num_patches` tokens.
+    """
+    def __init__(
+            self,
+            backbone,
+            img_size=224,
+            patch_size=1,
+            feature_size=None,
+            in_chans=3,
+            embed_dim=768,
+            bias=True,
+    ):
+        super().__init__()
+        assert isinstance(backbone, nn.Module)
+        self.img_size = img_size = to_2tuple(img_size)
+        self.patch_size = patch_size = to_2tuple(patch_size)
+        self.backbone = backbone
+        if feature_size is not None:
+            # caller supplied the feature map size; the channel count is read from backbone metadata
+            feature_size = to_2tuple(feature_size)
+            if hasattr(self.backbone, 'feature_info'):
+                feature_dim = self.backbone.feature_info.channels()[-1]
+            else:
+                feature_dim = self.backbone.num_features
+        else:
+            # NOTE Most reliable way of determining output dims is to run forward pass
+            with torch.no_grad():
+                was_training = backbone.training
+                if was_training:
+                    backbone.eval()
+                probe = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
+                if isinstance(probe, (list, tuple)):
+                    probe = probe[-1]  # last feature if backbone outputs list/tuple of features
+                feature_dim = probe.shape[1]
+                feature_size = probe.shape[-2:]
+                backbone.train(was_training)  # put the backbone back into the mode it arrived in
+        # the feature map has to tile exactly into patch_size cells
+        assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0
+        self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        if isinstance(feats, (list, tuple)):
+            feats = feats[-1]  # last feature if backbone outputs list/tuple of features
+        return self.proj(feats).flatten(2).transpose(1, 2)
+
+
+def _create_vision_transformer_hybrid(variant, backbone, pretrained=False, **kwargs):
+    """ Create ViT `variant` whose embed layer is a HybridEmbed bound to `backbone`; patch_size defaults to 1. """
+    vit_args = {'patch_size': 1, **kwargs}  # default patch size for hybrid models if not set
+    return _create_vision_transformer(variant, pretrained=pretrained, embed_layer=partial(HybridEmbed, backbone=backbone), **vit_args)
+
+
+def _resnetv2(layers=(3, 4, 9), **kwargs):
+    """ ResNet-V2 backbone helper: a full ResNetV2 when `layers` is non-empty, otherwise only the stem module.
+    Reads `padding_same` (default True) and `in_chans` (default 3) from kwargs, other keys are not used here. """
+    in_chans = kwargs.get('in_chans', 3)
+    if kwargs.get('padding_same', True):
+        stem_type, conv_layer = 'same', partial(StdConv2dSame, eps=1e-8)
+    else:
+        stem_type, conv_layer = '', partial(StdConv2d, eps=1e-8)
+    if not len(layers):
+        return create_resnetv2_stem(in_chans, stem_type=stem_type, preact=False, conv_layer=conv_layer)
+    return ResNetV2(
+        layers=layers, num_classes=0, global_pool='', in_chans=in_chans,
+        preact=False, stem_type=stem_type, conv_layer=conv_layer)
+
+
+@register_model
+def vit_tiny_r_s16_p8_224(pretrained=False, **kwargs):
+    """ R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224 (width 192, depth 12, 3 heads).
+    """
+    # layers=() makes _resnetv2 return only the stem as backbone
+    stem = _resnetv2(layers=(), **kwargs)
+    vit_args = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_tiny_r_s16_p8_224', backbone=stem, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_tiny_r_s16_p8_384(pretrained=False, **kwargs):
+    """ R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384 (same arguments as the 224 def, only the variant key differs).
+    """
+    # layers=() makes _resnetv2 return only the stem as backbone
+    stem = _resnetv2(layers=(), **kwargs)
+    vit_args = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_tiny_r_s16_p8_384', backbone=stem, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_r26_s32_224(pretrained=False, **kwargs):
+    """ R26+ViT-S/S32 hybrid.
+    ResNetV2 backbone with stage depths (2, 2, 2, 2) in front of a width 384, depth 12, 6 head transformer.
+    """
+    cnn = _resnetv2((2, 2, 2, 2), **kwargs)
+    vit_args = dict(embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_small_r26_s32_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_r26_s32_384(pretrained=False, **kwargs):
+    """ R26+ViT-S/S32 hybrid, '_384' variant key.
+    ResNetV2 backbone with stage depths (2, 2, 2, 2) in front of a width 384, depth 12, 6 head transformer.
+    """
+    cnn = _resnetv2((2, 2, 2, 2), **kwargs)
+    vit_args = dict(embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_small_r26_s32_384', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_r26_s32_224(pretrained=False, **kwargs):
+    """ R26+ViT-B/S32 hybrid.
+    ResNetV2 backbone with stage depths (2, 2, 2, 2) in front of a width 768, depth 12, 12 head transformer.
+    """
+    cnn = _resnetv2((2, 2, 2, 2), **kwargs)
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_r26_s32_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_r50_s16_224(pretrained=False, **kwargs):
+    """ R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
+    ResNetV2 backbone with stage depths (3, 4, 9) in front of a width 768, depth 12, 12 head transformer.
+    """
+    cnn = _resnetv2((3, 4, 9), **kwargs)
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_r50_s16_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_r50_s16_384(pretrained=False, **kwargs):
+    """ R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+    ResNetV2 backbone with stage depths (3, 4, 9) in front of a width 768, depth 12, 12 head transformer.
+    """
+    cnn = _resnetv2((3, 4, 9), **kwargs)
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_r50_s16_384', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_resnet50_384(pretrained=False, **kwargs):
+    # DEPRECATED alias: forwards unchanged to vit_base_r50_s16_384 (model def above) for backwards compatibility
+    return vit_base_r50_s16_384(pretrained=pretrained, **kwargs)
+
+
+@register_model
+def vit_large_r50_s32_224(pretrained=False, **kwargs):
+    """ R50+ViT-L/S32 hybrid.
+    ResNetV2 backbone with stage depths (3, 4, 6, 3) in front of a width 1024, depth 24, 16 head transformer.
+    """
+    cnn = _resnetv2((3, 4, 6, 3), **kwargs)
+    vit_args = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_large_r50_s32_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_large_r50_s32_384(pretrained=False, **kwargs):
+    """ R50+ViT-L/S32 hybrid, '_384' variant key.
+    ResNetV2 backbone with stage depths (3, 4, 6, 3) in front of a width 1024, depth 24, 16 head transformer.
+    """
+    cnn = _resnetv2((3, 4, 6, 3), **kwargs)
+    vit_args = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_large_r50_s32_384', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_tiny_r_s16_p8_224_in21k(pretrained=False, **kwargs):
+    """ R+ViT-Ti/S16 w/ 8x8 patch hybrid.  ImageNet-21k.
+    """
+    # layers=() makes _resnetv2 return only the stem as backbone
+    stem = _resnetv2(layers=(), **kwargs)
+    vit_args = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_tiny_r_s16_p8_224_in21k', backbone=stem, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_r26_s32_224_in21k(pretrained=False, **kwargs):
+    """ R26+ViT-S/S32 hybrid. ImageNet-21k.
+    ResNetV2 backbone with stage depths (2, 2, 2, 2) in front of a width 384, depth 12, 6 head transformer.
+    """
+    cnn = _resnetv2((2, 2, 2, 2), **kwargs)
+    vit_args = dict(embed_dim=384, depth=12, num_heads=6, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_small_r26_s32_224_in21k', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_r50_s16_224_in21k(pretrained=False, **kwargs):
+    """ R50+ViT-B/16 hybrid model from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    ResNetV2 backbone with stage depths (3, 4, 9) in front of a width 768, depth 12, 12 head transformer.
+    """
+    cnn = _resnetv2(layers=(3, 4, 9), **kwargs)
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_r50_s16_224_in21k', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_resnet50_224_in21k(pretrained=False, **kwargs):
+    # DEPRECATED alias: forwards unchanged to vit_base_r50_s16_224_in21k (model def above) for backwards compatibility
+    return vit_base_r50_s16_224_in21k(pretrained=pretrained, **kwargs)
+
+
+@register_model
+def vit_large_r50_s32_224_in21k(pretrained=False, **kwargs):
+    """ R50+ViT-L/S32 hybrid. ImageNet-21k.
+    ResNetV2 backbone with stage depths (3, 4, 6, 3) in front of a width 1024, depth 24, 16 head transformer.
+    """
+    cnn = _resnetv2((3, 4, 6, 3), **kwargs)
+    vit_args = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_large_r50_s32_224_in21k', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_resnet26d_224(pretrained=False, **kwargs):
+    """ Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
+    NOTE `pretrained` is also handed to the resnet26d backbone constructor.
+    """
+    cnn = resnet26d(pretrained=pretrained, features_only=True, out_indices=[4], in_chans=kwargs.get('in_chans', 3))
+    vit_args = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_small_resnet26d_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_small_resnet50d_s16_224(pretrained=False, **kwargs):
+    """ Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
+    NOTE `pretrained` is also handed to the resnet50d backbone constructor.
+    """
+    cnn = resnet50d(pretrained=pretrained, features_only=True, out_indices=[3], in_chans=kwargs.get('in_chans', 3))
+    vit_args = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_small_resnet50d_s16_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_resnet26d_224(pretrained=False, **kwargs):
+    """ Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
+    NOTE `pretrained` is also handed to the resnet26d backbone constructor.
+    """
+    cnn = resnet26d(pretrained=pretrained, features_only=True, out_indices=[4], in_chans=kwargs.get('in_chans', 3))
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_resnet26d_224', backbone=cnn, pretrained=pretrained, **vit_args)
+
+
+@register_model
+def vit_base_resnet50d_224(pretrained=False, **kwargs):
+    """ Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
+    NOTE `pretrained` is also handed to the resnet50d backbone constructor.
+    """
+    cnn = resnet50d(pretrained=pretrained, features_only=True, out_indices=[4], in_chans=kwargs.get('in_chans', 3))
+    vit_args = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
+    return _create_vision_transformer_hybrid(
+        'vit_base_resnet50d_224', backbone=cnn, pretrained=pretrained, **vit_args)
diff --git a/src/custom_timm/models/vision_transformer_relpos.py b/src/custom_timm/models/vision_transformer_relpos.py
new file mode 100644
index 0000000000000000000000000000000000000000..288195adf4dde547efc7fc6af2b4350b6ea114e2
--- /dev/null
+++ b/src/custom_timm/models/vision_transformer_relpos.py
@@ -0,0 +1,654 @@
+""" Relative Position Vision Transformer (ViT) in PyTorch
+
+NOTE: these models are experimental / WIP, expect changes
+
+Hacked together by / Copyright 2022, Ross Wightman
+"""
+import math
+import logging
+from functools import partial
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply
+from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_, to_2tuple
+from .registry import register_model
+
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    """ Default pretrained cfg for the rel-pos ViT variants; keywords in `kwargs` override / extend the defaults. """
+    cfg = dict(
+        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=None,
+        crop_pct=.9, interpolation='bicubic', fixed_input_size=True,
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        first_conv='patch_embed.proj', classifier='head')
+    cfg.update(kwargs)
+    return cfg
+
+
+default_cfgs = {  # one pretrained cfg per key; url='' entries carry no weight location
+    'vit_relpos_base_patch32_plus_rpn_256': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_replos_base_patch32_plus_rpn_256-sw-dd486f51.pth',
+        input_size=(3, 256, 256)),
+    'vit_relpos_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240)),
+
+    'vit_relpos_small_patch16_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_small_patch16_224-sw-ec2778b4.pth'),
+    'vit_relpos_medium_patch16_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_224-sw-11c174af.pth'),
+    'vit_relpos_base_patch16_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_224-sw-49049aed.pth'),
+
+    'vit_srelpos_small_patch16_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_small_patch16_224-sw-6cdb8849.pth'),
+    'vit_srelpos_medium_patch16_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_medium_patch16_224-sw-ad702b8c.pth'),
+
+    'vit_relpos_medium_patch16_cls_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_cls_224-sw-cfe8e259.pth'),
+    'vit_relpos_base_patch16_cls_224': _cfg(
+        url=''),
+    'vit_relpos_base_patch16_clsgap_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_gapcls_224-sw-1a341d6c.pth'),
+
+    'vit_relpos_small_patch16_rpn_224': _cfg(url=''),
+    'vit_relpos_medium_patch16_rpn_224': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_rpn_224-sw-5d2befd8.pth'),
+    'vit_relpos_base_patch16_rpn_224': _cfg(url=''),
+}
+
+
+def gen_relative_position_index(
+        q_size: Tuple[int, int],
+        k_size: Optional[Tuple[int, int]] = None,
+        class_token: bool = False) -> torch.Tensor:
+    """ Index the relative (dh, dw) offset of every query / key grid position pair.
+    Adapted with significant modifications from Swin / BeiT codebases.
+    Args:
+        q_size: (height, width) of the query grid.
+        k_size: (height, width) of the key grid, q_size if None (different q vs k sizes is a WIP).
+        class_token: add a leading cls row / column w/ 3 extra indices as per BeiT (not tested with MLP log-coords).
+    Returns: flat tensor of q_area * k_area indices, or a (q_area + 1, k_area + 1) grid if class_token is set.
+    """
+    q_coords = torch.stack(torch.meshgrid([torch.arange(q_size[0]), torch.arange(q_size[1])])).flatten(1)  # 2, Wh*Ww
+    if k_size is None:
+        k_coords, k_size = q_coords, q_size
+    else:
+        k_coords = torch.stack(torch.meshgrid([torch.arange(k_size[0]), torch.arange(k_size[1])])).flatten(1)
+    relative_coords = (q_coords[:, :, None] - k_coords[:, None, :]).permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
+    _, relative_position_index = torch.unique(relative_coords.view(-1, 2), return_inverse=True, dim=0)
+    if class_token:
+        # torch.unique(dim=0) returns a flat inverse; restore the 2D pair grid before padding / 2D indexing
+        max_size = (max(q_size[0], k_size[0]), max(q_size[1], k_size[1]))
+        num_relative_distance = (2 * max_size[0] - 1) * (2 * max_size[1] - 1) + 3
+        relative_position_index = F.pad(relative_position_index.view(relative_coords.shape[:2]), [1, 0, 1, 0])
+        relative_position_index[0, 0:] = num_relative_distance - 3  # cls to token
+        relative_position_index[0:, 0] = num_relative_distance - 2  # token to cls
+        relative_position_index[0, 0] = num_relative_distance - 1  # cls to cls
+    return relative_position_index.contiguous()
+
+
+def gen_relative_log_coords(
+        win_size: Tuple[int, int],
+        pretrained_win_size: Tuple[int, int] = (0, 0),
+        mode='swin',
+):
+    """ Build the table of log-spaced relative (dh, dw) coordinates for a window.
+
+    As per official swin-v2 impl, supporting timm specific 'cr' and 'rw' log coords as well.
+
+    Args:
+        win_size: (height, width) of the window.
+        pretrained_win_size: 'swin' mode only, normalizes by this window instead when its first entry is > 0.
+        mode: 'swin' -> sign * log2(1 + |8 * c / (size - 1)|) / log2(8), 'rw' -> same but divided by log2(9),
+            'cr' -> sign * log(1 + |c|) on the raw offsets c.
+    Returns: float32 tensor of shape (2 * Wh - 1, 2 * Ww - 1, 2).
+    """
+    assert mode in ('swin', 'cr', 'rw')
+    offsets_h = torch.arange(-(win_size[0] - 1), win_size[0], dtype=torch.float32)
+    offsets_w = torch.arange(-(win_size[1] - 1), win_size[1], dtype=torch.float32)
+    table = torch.stack(torch.meshgrid([offsets_h, offsets_w])).permute(1, 2, 0).contiguous()  # 2*Wh-1, 2*Ww-1, 2
+    if mode == 'cr':
+        return torch.sign(table) * torch.log(1.0 + table.abs())
+
+    # 'swin' / 'rw': scale offsets by 8 / (extent - 1) before taking the signed log2
+    if mode == 'swin' and pretrained_win_size[0] > 0:
+        norm_h, norm_w = pretrained_win_size[0] - 1, pretrained_win_size[1] - 1
+    else:
+        norm_h, norm_w = win_size[0] - 1, win_size[1] - 1
+    table[:, :, 0] /= norm_h
+    table[:, :, 1] /= norm_w
+    table *= 8
+    table = torch.sign(table) * torch.log2(1.0 + table.abs())
+    if mode == 'swin':
+        return table / math.log2(8)
+    table /= math.log2(9)   # -> [-1, 1]
+    return table
+
+
+class RelPosMlp(nn.Module):
+    """ Relative position bias produced by a small MLP over log-spaced relative coordinates.
+
+    `mode` picks the coordinate scheme of gen_relative_log_coords and the output squashing:
+    'swin' -> 16 * sigmoid, 'rw' -> 4 * tanh, anything else ('cr') -> raw MLP output.
+    Prefix tokens, if any, receive zero bias.
+    """
+    def __init__(
+            self,
+            window_size,
+            num_heads=8,
+            hidden_dim=128,
+            prefix_tokens=0,
+            mode='cr',
+            pretrained_window_size=(0, 0)
+    ):
+        super().__init__()
+        self.window_size = window_size
+        self.window_area = self.window_size[0] * self.window_size[1]
+        self.prefix_tokens = prefix_tokens
+        self.num_heads = num_heads
+        self.bias_shape = (self.window_area, self.window_area, num_heads)
+        if mode == 'swin':
+            act_cls, gain, mlp_bias = nn.Sigmoid, 16, (True, False)
+        elif mode == 'rw':
+            act_cls, gain, mlp_bias = nn.Tanh, 4, True
+        else:
+            act_cls, gain, mlp_bias = nn.Identity, None, True
+        self.bias_act = act_cls()
+        self.bias_gain = gain
+
+        self.mlp = Mlp(
+            2,  # x, y
+            hidden_features=hidden_dim,
+            out_features=num_heads,
+            act_layer=nn.ReLU,
+            bias=mlp_bias,
+            drop=(0.125, 0.)
+        )
+
+        # both lookup tables are registered as non-persistent buffers (not part of the state dict)
+        self.register_buffer(
+            "relative_position_index", gen_relative_position_index(window_size), persistent=False)
+        self.register_buffer(
+            "rel_coords_log",
+            gen_relative_log_coords(window_size, pretrained_window_size, mode=mode), persistent=False)
+
+    def get_bias(self) -> torch.Tensor:
+        """ Evaluate the MLP on the coordinate table and return the bias with a leading broadcast dim of 1. """
+        bias = self.mlp(self.rel_coords_log)
+        if self.relative_position_index is not None:
+            # gather one MLP output row per (query, key) pair
+            bias = bias.view(-1, self.num_heads)[self.relative_position_index.view(-1)]
+            bias = bias.view(self.bias_shape)  # Wh*Ww, Wh*Ww, nH
+        bias = self.bias_act(bias.permute(2, 0, 1))
+        if self.bias_gain is not None:
+            bias = self.bias_gain * bias
+        if self.prefix_tokens:
+            # zero-pad the front of both token dims for the prefix tokens
+            bias = F.pad(bias, [self.prefix_tokens, 0, self.prefix_tokens, 0])
+        return bias.unsqueeze(0).contiguous()
+
+    def forward(self, attn, shared_rel_pos: Optional[torch.Tensor] = None):
+        # shared_rel_pos is accepted but not used by this module
+        return attn + self.get_bias()
+
+
+class RelPosBias(nn.Module):
+    """ Learned table of per-head relative position biases, looked up through a relative position index. """
+    def __init__(self, window_size, num_heads, prefix_tokens=0):
+        super().__init__()
+        assert prefix_tokens <= 1
+        self.window_size = window_size
+        self.window_area = window_size[0] * window_size[1]
+        num_tokens = self.window_area + prefix_tokens
+        self.bias_shape = (num_tokens, num_tokens, num_heads)
+        # one table row per distinct (dh, dw) offset, plus 3 extra rows when a prefix (class) token is used
+        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 * prefix_tokens
+        self.relative_position_bias_table = nn.Parameter(torch.zeros(num_relative_distance, num_heads))
+        index = gen_relative_position_index(self.window_size, class_token=prefix_tokens > 0)
+        self.register_buffer("relative_position_index", index, persistent=False)
+
+        self.init_weights()
+
+    def init_weights(self):
+        """ (Re)initialize the bias table with a truncated normal, std 0.02. """
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+
+    def get_bias(self) -> torch.Tensor:
+        """ Return the looked-up bias as a contiguous (1, num_heads, tokens, tokens) tensor. """
+        flat_index = self.relative_position_index.view(-1)
+        bias = self.relative_position_bias_table[flat_index].view(self.bias_shape)  # tokens, tokens, num_heads
+        return bias.permute(2, 0, 1).unsqueeze(0).contiguous()
+
+    def forward(self, attn, shared_rel_pos: Optional[torch.Tensor] = None):
+        # shared_rel_pos is accepted but not used by this module
+        return attn + self.get_bias()
+
+
+class RelPosAttention(nn.Module):
+    """ Multi-head self-attention whose logits receive a relative position bias (own module or shared tensor). """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, rel_pos_cls=None, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        self.scale = (dim // num_heads) ** -0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        # rel_pos_cls is called with num_heads only; when falsy, just an externally shared bias can be applied
+        self.rel_pos = None if not rel_pos_cls else rel_pos_cls(num_heads=num_heads)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        B, N, C = x.shape
+        head_dim = C // self.num_heads
+        # (3, B, heads, N, head_dim), unbind keeps torchscript happy (cannot use tensor as tuple)
+        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)
+
+        scores = (q @ k.transpose(-2, -1)) * self.scale
+        # the block's own rel-pos module takes precedence over the shared bias tensor
+        if self.rel_pos is not None:
+            scores = self.rel_pos(scores, shared_rel_pos=shared_rel_pos)
+        elif shared_rel_pos is not None:
+            scores = scores + shared_rel_pos
+        weights = self.attn_drop(scores.softmax(dim=-1))
+
+        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
+        return self.proj_drop(self.proj(out))
+
+
+class LayerScale(nn.Module):
+    """ Learnable gain `gamma` (dim entries, all init_values at start) multiplied onto the input, optionally in-place. """
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.gamma, self.inplace = nn.Parameter(init_values * torch.ones(dim)), inplace
+
+    def forward(self, x):
+        return x * self.gamma if not self.inplace else x.mul_(self.gamma)
+
+
+class RelPosBlock(nn.Module):
+    """ Pre-norm block: x + drop_path(ls(attn(norm1(x)))), then x + drop_path(ls(mlp(norm2(x)))). """
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, rel_pos_cls=None, init_values=None,
+            drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        # layer-scale needs truthy init_values, drop path (stochastic depth) a rate > 0; else both are nn.Identity
+        make_ls = (lambda: LayerScale(dim, init_values=init_values)) if init_values else nn.Identity
+        make_dp = (lambda: DropPath(drop_path)) if drop_path > 0. else nn.Identity
+        self.norm1 = norm_layer(dim)
+        self.attn = RelPosAttention(
+            dim, num_heads, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls, attn_drop=attn_drop, proj_drop=drop)
+        self.ls1, self.drop_path1 = make_ls(), make_dp()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+        self.ls2, self.drop_path2 = make_ls(), make_dp()
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        attn_out = self.attn(self.norm1(x), shared_rel_pos=shared_rel_pos)
+        x = x + self.drop_path1(self.ls1(attn_out))
+        mlp_out = self.mlp(self.norm2(x))
+        return x + self.drop_path2(self.ls2(mlp_out))
+
+
+class ResPostRelPosBlock(nn.Module):
+    """ Residual post-norm block: x + drop_path(norm1(attn(x))), then x + drop_path(norm2(mlp(x))). """
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, rel_pos_cls=None, init_values=None,
+            drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.init_values = init_values
+        make_dp = (lambda: DropPath(drop_path)) if drop_path > 0. else nn.Identity
+        self.attn = RelPosAttention(
+            dim, num_heads, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls, attn_drop=attn_drop, proj_drop=drop)
+        self.norm1 = norm_layer(dim)
+        self.drop_path1 = make_dp()
+
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+        self.norm2 = norm_layer(dim)
+        self.drop_path2 = make_dp()
+
+        self.init_weights()
+
+    def init_weights(self):
+        # NOTE this init overrides that base model init with specific changes for the block type
+        if self.init_values is None:
+            return
+        for norm in (self.norm1, self.norm2):
+            nn.init.constant_(norm.weight, self.init_values)
+
+    def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None):
+        x = x + self.drop_path1(self.norm1(self.attn(x, shared_rel_pos=shared_rel_pos)))
+        return x + self.drop_path2(self.norm2(self.mlp(x)))
+
+
+class VisionTransformerRelPos(nn.Module):
+    """ Vision Transformer w/ Relative Position Bias
+
+    Differing from classic vit, this impl
+      * uses relative position index (swin v1 / beit) or relative log coord + mlp (swin v2) pos embed
+      * defaults to no class token (can be enabled)
+      * defaults to global avg pool for head (can be changed)
+      * layer-scale (residual branch gain) enabled
+    """
+
+    def __init__(
+            self,
+            img_size=224,
+            patch_size=16,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='avg',
+            embed_dim=768,
+            depth=12,
+            num_heads=12,
+            mlp_ratio=4.,
+            qkv_bias=True,
+            init_values=1e-6,
+            class_token=False,
+            fc_norm=False,
+            rel_pos_type='mlp',
+            rel_pos_dim=None,
+            shared_rel_pos=False,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.,
+            weight_init='skip',
+            embed_layer=PatchEmbed,
+            norm_layer=None,
+            act_layer=None,
+            block_fn=RelPosBlock
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            global_pool (str): type of global pooling for final sequence (default: 'avg')
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            init_values: (float): layer-scale init values
+            class_token (bool): use class token (default: False)
+            fc_norm (bool): use pre classifier norm instead of pre-pool
+            rel_pos_type (str): type of relative position
+            shared_rel_pos (bool): share relative pos across all blocks
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            weight_init (str): weight init scheme
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer: (nn.Module): normalization layer
+            act_layer: (nn.Module): MLP activation layer
+        """
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        assert class_token or global_pool != 'token'
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.grad_checkpointing = False
+
+        self.patch_embed = embed_layer(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        feat_size = self.patch_embed.grid_size
+
+        rel_pos_args = dict(window_size=feat_size, prefix_tokens=self.num_prefix_tokens)
+        if rel_pos_type.startswith('mlp'):
+            if rel_pos_dim:
+                rel_pos_args['hidden_dim'] = rel_pos_dim
+            # FIXME experimenting with different relpos log coord configs
+            if 'swin' in rel_pos_type:
+                rel_pos_args['mode'] = 'swin'
+            elif 'rw' in rel_pos_type:
+                rel_pos_args['mode'] = 'rw'
+            rel_pos_cls = partial(RelPosMlp, **rel_pos_args)
+        else:
+            rel_pos_cls = partial(RelPosBias, **rel_pos_args)
+        self.shared_rel_pos = None
+        if shared_rel_pos:
+            self.shared_rel_pos = rel_pos_cls(num_heads=num_heads)
+            # NOTE shared rel pos currently mutually exclusive w/ per-block, but could support both...
+            rel_pos_cls = None
+
+        self.cls_token = nn.Parameter(torch.zeros(1, self.num_prefix_tokens, embed_dim)) if class_token else None
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            block_fn(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls,
+                init_values=init_values, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
+                norm_layer=norm_layer, act_layer=act_layer)
+            for i in range(depth)])
+        self.norm = norm_layer(embed_dim) if not fc_norm else nn.Identity()
+
+        # Classifier Head
+        self.fc_norm = norm_layer(embed_dim) if fc_norm else nn.Identity()
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        if weight_init != 'skip':
+            self.init_weights(weight_init)
+
+    def init_weights(self, mode=''):
+        assert mode in ('jax', 'moco', '')
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        # FIXME weight init scheme using PyTorch defaults currently
+        #named_apply(get_init_weights_vit(mode, head_bias), self)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'cls_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^cls_token|patch_embed',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'avg', 'token')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        if self.cls_token is not None:
+            x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+
+        shared_rel_pos = self.shared_rel_pos.get_bias() if self.shared_rel_pos is not None else None
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x, shared_rel_pos)  # positional: reentrant checkpoint rejects kwargs
+            else:
+                x = blk(x, shared_rel_pos=shared_rel_pos)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
+        x = self.fc_norm(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_vision_transformer_relpos(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only', None):
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+    model = build_model_with_cfg(VisionTransformerRelPos, variant, pretrained, **kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch32_plus_rpn_256(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/32+) w/ relative log-coord position and residual post-norm, no class token
+    """
+    model_kwargs = dict(
+        patch_size=32, embed_dim=896, depth=12, num_heads=14, block_fn=ResPostRelPosBlock, **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_relpos_base_patch32_plus_rpn_256', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch16_plus_240(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16+) w/ relative log-coord position, no class token
+    """
+    model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_base_patch16_plus_240', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_small_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16) w/ relative log-coord position, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=True, **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_small_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_medium_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Medium (ViT-M/16) w/ relative log-coord position, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=True, **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_medium_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ relative log-coord position, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_base_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_srelpos_small_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16) w/ shared relative log-coord position, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=False,
+        rel_pos_dim=384, shared_rel_pos=True, **kwargs)
+    model = _create_vision_transformer_relpos('vit_srelpos_small_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_srelpos_medium_patch16_224(pretrained=False, **kwargs):
+    """ ViT-Medium (ViT-M/16) w/ shared relative log-coord position, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False,
+        rel_pos_dim=512, shared_rel_pos=True, **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_srelpos_medium_patch16_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_medium_patch16_cls_224(pretrained=False, **kwargs):
+    """ ViT-Medium (ViT-M/16) w/ relative log-coord position, class token present
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False,
+        rel_pos_dim=256, class_token=True, global_pool='token', **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_relpos_medium_patch16_cls_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch16_cls_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ relative log-coord position, class token present
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False,
+        class_token=True, global_pool='token', **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_base_patch16_cls_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch16_clsgap_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ relative log-coord position, class token present
+    NOTE this config is a bit of a mistake, class token was enabled but global avg-pool w/ fc-norm was not disabled
+    Leaving here for comparisons w/ a future re-train as it performs quite well.
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, class_token=True, **kwargs)
+    model = _create_vision_transformer_relpos('vit_relpos_base_patch16_clsgap_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_small_patch16_rpn_224(pretrained=False, **kwargs):
+    """ ViT-Small (ViT-S/16) w/ relative log-coord position and residual post-norm, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_relpos_small_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_medium_patch16_rpn_224(pretrained=False, **kwargs):
+    """ ViT-Medium (ViT-M/16) w/ relative log-coord position and residual post-norm, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_relpos_medium_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_relpos_base_patch16_rpn_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
+    model = _create_vision_transformer_relpos(
+        'vit_relpos_base_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
+    return model
diff --git a/src/custom_timm/models/volo.py b/src/custom_timm/models/volo.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c2886af59a29bc8bd7493a85a8158eecce70914
--- /dev/null
+++ b/src/custom_timm/models/volo.py
@@ -0,0 +1,750 @@
+""" Vision OutLOoker (VOLO) implementation
+
+Paper: `VOLO: Vision Outlooker for Visual Recognition` - https://arxiv.org/abs/2106.13112
+
+Code adapted from official impl at https://github.com/sail-sg/volo, original copyright in comment below
+
+Modifications and additions for timm by / Copyright 2022, Ross Wightman
+"""
+# Copyright 2021 Sea Limited.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from custom_timm.models.layers import DropPath, Mlp, to_2tuple, to_ntuple, trunc_normal_
+from custom_timm.models.registry import register_model
+from custom_timm.models.helpers import build_model_with_cfg
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': .96, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.conv.0', 'classifier': ('head', 'aux_head'),
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'volo_d1_224': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d1_224_84.2.pth.tar',
+        crop_pct=0.96),
+    'volo_d1_384': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d1_384_85.2.pth.tar',
+        crop_pct=1.0, input_size=(3, 384, 384)),
+    'volo_d2_224': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d2_224_85.2.pth.tar',
+        crop_pct=0.96),
+    'volo_d2_384': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d2_384_86.0.pth.tar',
+        crop_pct=1.0, input_size=(3, 384, 384)),
+    'volo_d3_224': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d3_224_85.4.pth.tar',
+        crop_pct=0.96),
+    'volo_d3_448': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d3_448_86.3.pth.tar',
+        crop_pct=1.0, input_size=(3, 448, 448)),
+    'volo_d4_224': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d4_224_85.7.pth.tar',
+        crop_pct=0.96),
+    'volo_d4_448': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d4_448_86.79.pth.tar',
+        crop_pct=1.15, input_size=(3, 448, 448)),
+    'volo_d5_224': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_224_86.10.pth.tar',
+        crop_pct=0.96),
+    'volo_d5_448': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_448_87.0.pth.tar',
+        crop_pct=1.15, input_size=(3, 448, 448)),
+    'volo_d5_512': _cfg(
+        url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_512_87.07.pth.tar',
+        crop_pct=1.15, input_size=(3, 512, 512)),
+}
+
+
+class OutlookAttention(nn.Module):
+
+    def __init__(self, dim, num_heads, kernel_size=3, padding=1, stride=1, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        head_dim = dim // num_heads
+        self.num_heads = num_heads
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.stride = stride
+        self.scale = head_dim ** -0.5
+
+        self.v = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn = nn.Linear(dim, kernel_size ** 4 * num_heads)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.unfold = nn.Unfold(kernel_size=kernel_size, padding=padding, stride=stride)
+        self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True)
+
+    def forward(self, x):
+        B, H, W, C = x.shape
+
+        v = self.v(x).permute(0, 3, 1, 2)  # B, C, H, W
+
+        h, w = math.ceil(H / self.stride), math.ceil(W / self.stride)
+        v = self.unfold(v).reshape(
+            B, self.num_heads, C // self.num_heads,
+            self.kernel_size * self.kernel_size, h * w).permute(0, 1, 4, 3, 2)  # B,H,N,kxk,C/H
+
+        attn = self.pool(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+        attn = self.attn(attn).reshape(
+            B, h * w, self.num_heads, self.kernel_size * self.kernel_size,
+            self.kernel_size * self.kernel_size).permute(0, 2, 1, 3, 4)  # B,H,N,kxk,kxk
+        attn = attn * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).permute(0, 1, 4, 3, 2).reshape(B, C * self.kernel_size * self.kernel_size, h * w)
+        x = F.fold(x, output_size=(H, W), kernel_size=self.kernel_size, padding=self.padding, stride=self.stride)
+
+        x = self.proj(x.permute(0, 2, 3, 1))
+        x = self.proj_drop(x)
+
+        return x
+
+
+class Outlooker(nn.Module):
+    def __init__(
+            self, dim, kernel_size, padding, stride=1, num_heads=1, mlp_ratio=3., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, qkv_bias=False
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = OutlookAttention(
+            dim, num_heads, kernel_size=kernel_size,
+            padding=padding, stride=stride,
+            qkv_bias=qkv_bias, attn_drop=attn_drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class Attention(nn.Module):
+
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, H, W, C = x.shape
+
+        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, H, W, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return x
+
+
+class Transformer(nn.Module):
+
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False,
+            attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop)
+
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class ClassAttention(nn.Module):
+
+    def __init__(
+            self, dim, num_heads=8, head_dim=None, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        if head_dim is not None:
+            self.head_dim = head_dim
+        else:
+            head_dim = dim // num_heads
+            self.head_dim = head_dim
+        self.scale = head_dim ** -0.5
+
+        self.kv = nn.Linear(dim, self.head_dim * self.num_heads * 2, bias=qkv_bias)
+        self.q = nn.Linear(dim, self.head_dim * self.num_heads, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(self.head_dim * self.num_heads, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+
+        kv = self.kv(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        k, v = kv.unbind(0)
+        q = self.q(x[:, :1, :]).reshape(B, self.num_heads, 1, self.head_dim)
+        attn = ((q * self.scale) @ k.transpose(-2, -1))
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        cls_embed = (attn @ v).transpose(1, 2).reshape(B, 1, self.head_dim * self.num_heads)
+        cls_embed = self.proj(cls_embed)
+        cls_embed = self.proj_drop(cls_embed)
+        return cls_embed
+
+
+class ClassBlock(nn.Module):
+
+    def __init__(
+            self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False,
+            drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = ClassAttention(
+            dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        # NOTE: drop path for stochastic depth
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x):
+        cls_embed = x[:, :1]
+        cls_embed = cls_embed + self.drop_path(self.attn(self.norm1(x)))
+        cls_embed = cls_embed + self.drop_path(self.mlp(self.norm2(cls_embed)))
+        return torch.cat([cls_embed, x[:, 1:]], dim=1)
+
+
+def get_block(block_type, **kargs):
+    if block_type == 'ca':
+        return ClassBlock(**kargs)
+
+
+def rand_bbox(size, lam, scale=1):
+    """
+    Sample a random box on the (scaled) grid, as in token labeling (https://github.com/zihangJiang/TokenLabeling)
+    return: (bbx1, bby1, bbx2, bby2) box corners, clipped to [0, W] and [0, H]
+    """
+    W = size[1] // scale
+    H = size[2] // scale
+    cut_rat = np.sqrt(1. - lam)
+    cut_w = int(W * cut_rat)  # builtin int: the `np.int` alias was removed in NumPy 1.24
+    cut_h = int(H * cut_rat)
+
+    # uniform
+    cx = np.random.randint(W)
+    cy = np.random.randint(H)
+
+    bbx1 = np.clip(cx - cut_w // 2, 0, W)
+    bby1 = np.clip(cy - cut_h // 2, 0, H)
+    bbx2 = np.clip(cx + cut_w // 2, 0, W)
+    bby2 = np.clip(cy + cut_h // 2, 0, H)
+
+    return bbx1, bby1, bbx2, bby2
+
+
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding.
+    Unlike ViT's single conv layer, this uses up to 4 conv layers (3-conv stem + proj) for patch embedding
+    """
+
+    def __init__(
+            self, img_size=224, stem_conv=False, stem_stride=1,
+            patch_size=8, in_chans=3, hidden_dim=64, embed_dim=384):
+        super().__init__()
+        assert patch_size in [4, 8, 16]
+        if stem_conv:
+            self.conv = nn.Sequential(
+                nn.Conv2d(in_chans, hidden_dim, kernel_size=7, stride=stem_stride, padding=3, bias=False),  # 112x112
+                nn.BatchNorm2d(hidden_dim),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False),  # 112x112
+                nn.BatchNorm2d(hidden_dim),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False),  # 112x112
+                nn.BatchNorm2d(hidden_dim),
+                nn.ReLU(inplace=True),
+            )
+        else:
+            self.conv = None
+
+        self.proj = nn.Conv2d(
+            hidden_dim, embed_dim, kernel_size=patch_size // stem_stride, stride=patch_size // stem_stride)
+        self.num_patches = (img_size // patch_size) * (img_size // patch_size)
+
+    def forward(self, x):
+        if self.conv is not None:
+            x = self.conv(x)
+        x = self.proj(x)  # B, C, H, W
+        return x
+
+
+class Downsample(nn.Module):
+    """ Image to Patch Embedding, downsampling between stage1 and stage2
+    """
+
+    def __init__(self, in_embed_dim, out_embed_dim, patch_size=2):
+        super().__init__()
+        self.proj = nn.Conv2d(in_embed_dim, out_embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x):
+        x = x.permute(0, 3, 1, 2)
+        x = self.proj(x)  # B, C, H, W
+        x = x.permute(0, 2, 3, 1)
+        return x
+
+
+def outlooker_blocks(
+        block_fn, index, dim, layers, num_heads=1, kernel_size=3, padding=1, stride=2,
+        mlp_ratio=3., qkv_bias=False, attn_drop=0, drop_path_rate=0., **kwargs):
+    """
+    Build the `layers[index]` outlooker blocks of one stage; drop path scales linearly with global block index.
+    return: nn.Sequential of the blocks (extra **kwargs are accepted but not forwarded to block_fn)
+    """
+    blocks = []
+    for block_idx in range(layers[index]):
+        block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / max(sum(layers) - 1, 1)  # no 0-div
+        blocks.append(
+            block_fn(
+                dim, kernel_size=kernel_size, padding=padding,
+                stride=stride, num_heads=num_heads, mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias, attn_drop=attn_drop, drop_path=block_dpr))
+    blocks = nn.Sequential(*blocks)
+    return blocks
+
+
+def transformer_blocks(
+        block_fn, index, dim, layers, num_heads, mlp_ratio=3.,
+        qkv_bias=False, attn_drop=0, drop_path_rate=0., **kwargs):
+    """
+    Build the `layers[index]` transformer blocks of one stage; drop path scales linearly with global block index.
+    return: nn.Sequential of the blocks (extra **kwargs are accepted but not forwarded to block_fn)
+    """
+    blocks = []
+    for block_idx in range(layers[index]):
+        block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / max(sum(layers) - 1, 1)  # no 0-div
+        blocks.append(
+            block_fn(
+                dim, num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                attn_drop=attn_drop,
+                drop_path=block_dpr))
+    blocks = nn.Sequential(*blocks)
+    return blocks
+
+
+class VOLO(nn.Module):
+    """
+    Vision Outlooker, the main class of our model
+    """
+
+    def __init__(
+            self,
+            layers,
+            img_size=224,
+            in_chans=3,
+            num_classes=1000,
+            global_pool='token',
+            patch_size=8,
+            stem_hidden_dim=64,
+            embed_dims=None,
+            num_heads=None,
+            downsamples=(True, False, False, False),
+            outlook_attention=(True, False, False, False),
+            mlp_ratio=3.0,
+            qkv_bias=False,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.,
+            norm_layer=nn.LayerNorm,
+            post_layers=('ca', 'ca'),
+            use_aux_head=True,
+            use_mix_token=False,
+            pooling_scale=2,
+    ):
+        super().__init__()
+        num_layers = len(layers)
+        mlp_ratio = to_ntuple(num_layers)(mlp_ratio)
+        img_size = to_2tuple(img_size)
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.mix_token = use_mix_token
+        self.pooling_scale = pooling_scale
+        self.num_features = embed_dims[-1]
+        if use_mix_token:  # enable token mixing, see token labeling for details.
+            self.beta = 1.0
+            assert global_pool == 'token', "return all tokens if mix_token is enabled"
+        self.grad_checkpointing = False
+
+        self.patch_embed = PatchEmbed(
+            stem_conv=True, stem_stride=2, patch_size=patch_size,
+            in_chans=in_chans, hidden_dim=stem_hidden_dim,
+            embed_dim=embed_dims[0])
+
+        # initial positional encoding, we add positional encoding after outlooker blocks
+        patch_grid = (img_size[0] // patch_size // pooling_scale, img_size[1] // patch_size // pooling_scale)
+        self.pos_embed = nn.Parameter(torch.zeros(1, patch_grid[0], patch_grid[1], embed_dims[-1]))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # set the main block in network
+        network = []
+        for i in range(len(layers)):
+            if outlook_attention[i]:
+                # stage 1
+                stage = outlooker_blocks(
+                    Outlooker, i, embed_dims[i], layers, num_heads[i], mlp_ratio=mlp_ratio[i],
+                    qkv_bias=qkv_bias, attn_drop=attn_drop_rate, norm_layer=norm_layer)
+                network.append(stage)
+            else:
+                # stage 2
+                stage = transformer_blocks(
+                    Transformer, i, embed_dims[i], layers, num_heads[i], mlp_ratio=mlp_ratio[i], qkv_bias=qkv_bias,
+                    drop_path_rate=drop_path_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer)
+                network.append(stage)
+
+            if downsamples[i]:
+                # downsampling between two stages
+                network.append(Downsample(embed_dims[i], embed_dims[i + 1], 2))
+
+        self.network = nn.ModuleList(network)
+
+        # set post block, for example, class attention layers
+        self.post_network = None
+        if post_layers is not None:
+            self.post_network = nn.ModuleList(
+                [
+                    get_block(
+                        post_layers[i],
+                        dim=embed_dims[-1],
+                        num_heads=num_heads[-1],
+                        mlp_ratio=mlp_ratio[-1],
+                        qkv_bias=qkv_bias,
+                        attn_drop=attn_drop_rate,
+                        drop_path=0.,
+                        norm_layer=norm_layer)
+                    for i in range(len(post_layers))
+                ])
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims[-1]))
+            trunc_normal_(self.cls_token, std=.02)
+
+        # set output type
+        if use_aux_head:
+            self.aux_head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        else:
+            self.aux_head = None
+        self.norm = norm_layer(self.num_features)
+
+        # Classifier head
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+        trunc_normal_(self.pos_embed, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):  # other module types are left untouched by this hook
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:  # m is already known to be nn.Linear at this point
+                nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}  # names of the parameters this model reports as "no weight decay"
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):  # `coarse` is accepted but not consulted
+        return {
+            'stem': r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            'blocks': [
+                (r'^network\.(\d+)\.(\d+)', None),
+                (r'^network\.(\d+)', (0,)),
+            ],
+            'blocks2': [
+                (r'^cls_token', (0,)),
+                (r'^post_network\.(\d+)', None),
+                (r'^norm', (99999,))
+            ],
+        }
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable  # flag read by forward_tokens / forward_cls to route blocks via checkpoint()
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head  # main head only (nn.Linear, or nn.Identity when num_classes <= 0); aux_head is not returned
+
+    def reset_classifier(self, num_classes, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:  # None keeps the current pooling mode
+            self.global_pool = global_pool
+        new_head = (lambda: nn.Linear(self.num_features, num_classes)) if num_classes > 0 else nn.Identity
+        self.head = new_head()
+        self.aux_head = None if self.aux_head is None else new_head()  # aux head is rebuilt only if one exists
+
+    def forward_tokens(self, x):
+        """ Run every `self.network` module on a B,H,W,C map, then flatten the spatial dims to B,H*W,C. """
+        for stage_idx, stage in enumerate(self.network):
+            if stage_idx == 2:
+                # add positional encoding after outlooker blocks
+                x = self.pos_drop(x + self.pos_embed)
+            # checkpoint() is only used when enabled and not running under TorchScript scripting
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
+
+        batch, _h, _w, channels = x.shape  # the four-way unpack also rejects inputs that are not rank 4
+        return x.reshape(batch, -1, channels)
+
+    def forward_cls(self, x):
+        """ Prepend `self.cls_token` to the B,N,C token sequence and run the post-network blocks on it. """
+        batch, _n, _c = x.shape  # the three-way unpack also rejects inputs that are not rank 3
+        x = torch.cat([self.cls_token.expand(batch, -1, -1), x], dim=1)
+        for post_block in self.post_network:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(post_block, x)
+            else:
+                x = post_block(x)
+        return x
+
+    def forward_train(self, x):
+        """ Separate forward fn for training with mix_token (if a train script supports it); combining modes with
+        different return types in a single forward is torchscript hell. Returns `x_cls` alone when there is no aux head,
+        a combined tensor when not training, else the tuple (x_cls, x_aux, (bbx1, bby1, bbx2, bby2)). """
+        x = self.patch_embed(x)
+        x = x.permute(0, 2, 3, 1)  # B,C,H,W-> B,H,W,C
+
+        # mix token, see token labeling for details.
+        if self.mix_token and self.training:
+            lam = np.random.beta(self.beta, self.beta)  # mixing ratio drawn from Beta(beta, beta)
+            patch_h, patch_w = x.shape[1] // self.pooling_scale, x.shape[2] // self.pooling_scale
+            bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam, scale=self.pooling_scale)
+            temp_x = x.clone()
+            sbbx1, sbby1 = self.pooling_scale * bbx1, self.pooling_scale * bby1  # box scaled by pooling_scale
+            sbbx2, sbby2 = self.pooling_scale * bbx2, self.pooling_scale * bby2
+            temp_x[:, sbbx1:sbbx2, sbby1:sbby2, :] = x.flip(0)[:, sbbx1:sbbx2, sbby1:sbby2, :]  # region from batch-reversed copy
+            x = temp_x
+        else:
+            bbx1, bby1, bbx2, bby2 = 0, 0, 0, 0  # no mixing: report an empty box
+
+        # step2: tokens learning in the two stages
+        x = self.forward_tokens(x)
+
+        # step3: post network, apply class attention or not
+        if self.post_network is not None:
+            x = self.forward_cls(x)
+        x = self.norm(x)
+
+        if self.global_pool == 'avg':
+            x_cls = x.mean(dim=1)
+        elif self.global_pool == 'token':
+            x_cls = x[:, 0]
+        else:
+            x_cls = x
+        # NOTE(review): unlike forward_head, x_cls is never passed through self.head in this method - confirm intended.
+        if self.aux_head is None:
+            return x_cls
+
+        x_aux = self.aux_head(x[:, 1:])  # generate classes in all feature tokens, see token labeling
+        if not self.training:
+            return x_cls + 0.5 * x_aux.max(1)[0]  # max over the token axis of the aux predictions
+
+        if self.mix_token and self.training:  # reverse "mix token", see token labeling for details.
+            x_aux = x_aux.reshape(x_aux.shape[0], patch_h, patch_w, x_aux.shape[-1])
+            temp_x = x_aux.clone()
+            temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :]  # unscaled box on the token grid
+            x_aux = temp_x
+            x_aux = x_aux.reshape(x_aux.shape[0], patch_h * patch_w, x_aux.shape[-1])
+
+        # return these: 1. class token, 2. classes from all feature tokens, 3. bounding box
+        return x_cls, x_aux, (bbx1, bby1, bbx2, bby2)
+
+    def forward_features(self, x):
+        # step1: patch embedding, then move channels last
+        tokens = self.patch_embed(x)
+        tokens = tokens.permute(0, 2, 3, 1)  # B,C,H,W-> B,H,W,C
+        # step2: tokens learning in the two stages
+        tokens = self.forward_tokens(tokens)
+        # step3: post network, apply class attention or not
+        if self.post_network is not None:
+            tokens = self.forward_cls(tokens)
+        tokens = self.norm(tokens)
+        return tokens
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """ Pool the token sequence, then (unless `pre_logits`) classify and blend in the aux-head maximum. """
+        pooled = x
+        if self.global_pool == 'avg':
+            pooled = x.mean(dim=1)
+        elif self.global_pool == 'token':
+            pooled = x[:, 0]
+        if pre_logits:
+            return pooled
+        logits = self.head(pooled)
+        if self.aux_head is None:
+            return logits
+        # generate classes in all feature tokens, see token labeling
+        token_logits = self.aux_head(x[:, 1:])
+        return logits + 0.5 * token_logits.max(1)[0]
+
+    def forward(self, x):
+        """ Plain forward pass: forward_features followed by forward_head (no mix-token handling). """
+        feats = self.forward_features(x)
+        logits = self.forward_head(feats)
+        return logits
+
+
+def _create_volo(variant, pretrained=False, **kwargs):
+    if kwargs.get('features_only'):  # any truthy value requests the unsupported feature-extraction mode
+        raise RuntimeError('features_only not implemented for Vision Transformer models.')
+    return build_model_with_cfg(VOLO, variant, pretrained, **kwargs)
+
+
+@register_model
+def volo_d1_224(pretrained=False, **kwargs):
+    """ VOLO-D1 model, Params: 27M ('volo_d1_224' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs)
+    model = _create_volo('volo_d1_224', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d1_384(pretrained=False, **kwargs):
+    """ VOLO-D1 model, Params: 27M ('volo_d1_384' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs)
+    model = _create_volo('volo_d1_384', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d2_224(pretrained=False, **kwargs):
+    """ VOLO-D2 model, Params: 59M ('volo_d2_224' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d2_224', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d2_384(pretrained=False, **kwargs):
+    """ VOLO-D2 model, Params: 59M ('volo_d2_384' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d2_384', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d3_224(pretrained=False, **kwargs):
+    """ VOLO-D3 model, Params: 86M ('volo_d3_224' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d3_224', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d3_448(pretrained=False, **kwargs):
+    """ VOLO-D3 model, Params: 86M ('volo_d3_448' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d3_448', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d4_224(pretrained=False, **kwargs):
+    """ VOLO-D4 model, Params: 193M ('volo_d4_224' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d4_224', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d4_448(pretrained=False, **kwargs):
+    """ VOLO-D4 model, Params: 193M ('volo_d4_448' variant; layers/embed_dims/num_heads must not be repeated in kwargs). """
+    model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), **kwargs)
+    model = _create_volo('volo_d4_448', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d5_224(pretrained=False, **kwargs):
+    """ VOLO-D5 model, Params: 296M ('volo_d5_224' variant)
+    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5; mlp_ratio is also fixed to 4 here
+    """
+    model_args = dict(
+        layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16),
+        mlp_ratio=4, stem_hidden_dim=128, **kwargs)  # repeating any of these keys in kwargs raises TypeError
+    model = _create_volo('volo_d5_224', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d5_448(pretrained=False, **kwargs):
+    """ VOLO-D5 model, Params: 296M ('volo_d5_448' variant)
+    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5; mlp_ratio is also fixed to 4 here
+    """
+    model_args = dict(
+        layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16),
+        mlp_ratio=4, stem_hidden_dim=128, **kwargs)  # repeating any of these keys in kwargs raises TypeError
+    model = _create_volo('volo_d5_448', pretrained=pretrained, **model_args)
+    return model
+
+
+@register_model
+def volo_d5_512(pretrained=False, **kwargs):
+    """ VOLO-D5 model, Params: 296M ('volo_d5_512' variant)
+    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5; mlp_ratio is also fixed to 4 here
+    """
+    model_args = dict(
+        layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16),
+        mlp_ratio=4, stem_hidden_dim=128, **kwargs)  # repeating any of these keys in kwargs raises TypeError
+    model = _create_volo('volo_d5_512', pretrained=pretrained, **model_args)
+    return model
diff --git a/src/custom_timm/models/vovnet.py b/src/custom_timm/models/vovnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e80ffc66c432f6e174c70f5d33bb0dbcde50409
--- /dev/null
+++ b/src/custom_timm/models/vovnet.py
@@ -0,0 +1,424 @@
+""" VoVNet (V1 & V2)
+
+Papers:
+* `An Energy and GPU-Computation Efficient Backbone Network` - https://arxiv.org/abs/1904.09730
+* `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
+
+Looked at  https://github.com/youngwanLEE/vovnet-detectron2 &
+https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
+for some reference, rewrote most of the code.
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .registry import register_model
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import ConvNormAct, SeparableConvNormAct, BatchNormAct2d, ClassifierHead, DropPath,\
+    create_attn, create_norm_act_layer, get_norm_act_layer
+
+
+# model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 &
+# https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
+model_cfgs = dict(  # one architecture dict per variant; VovNet.__init__ reads every key shown in the first entry
+    vovnet39a=dict(
+        stem_chs=[64, 64, 128],  # output channels of the three stem convs
+        stage_conv_chs=[128, 160, 192, 224],  # per-stage mid_chs handed to OsaStage
+        stage_out_chs=[256, 512, 768, 1024],  # per-stage out_chs handed to OsaStage
+        layer_per_block=5,  # number of mid convs chained inside every OsaBlock
+        block_per_stage=[1, 1, 2, 2],  # OsaBlocks in each of the four stages
+        residual=False,
+        depthwise=False,
+        attn='',  # empty string disables the attention module
+    ),
+    vovnet57a=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=5,
+        block_per_stage=[1, 1, 4, 3],
+        residual=False,
+        depthwise=False,
+        attn='',
+
+    ),
+    ese_vovnet19b_slim_dw=dict(
+        stem_chs=[64, 64, 64],
+        stage_conv_chs=[64, 80, 96, 112],
+        stage_out_chs=[112, 256, 384, 512],
+        layer_per_block=3,
+        block_per_stage=[1, 1, 1, 1],
+        residual=True,
+        depthwise=True,
+        attn='ese',
+
+    ),
+    ese_vovnet19b_dw=dict(
+        stem_chs=[64, 64, 64],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=3,
+        block_per_stage=[1, 1, 1, 1],
+        residual=True,
+        depthwise=True,
+        attn='ese',
+    ),
+    ese_vovnet19b_slim=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[64, 80, 96, 112],
+        stage_out_chs=[112, 256, 384, 512],
+        layer_per_block=3,
+        block_per_stage=[1, 1, 1, 1],
+        residual=True,
+        depthwise=False,
+        attn='ese',
+    ),
+    ese_vovnet19b=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=3,
+        block_per_stage=[1, 1, 1, 1],
+        residual=True,
+        depthwise=False,
+        attn='ese',
+
+    ),
+    ese_vovnet39b=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=5,
+        block_per_stage=[1, 1, 2, 2],
+        residual=True,
+        depthwise=False,
+        attn='ese',
+    ),
+    ese_vovnet57b=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=5,
+        block_per_stage=[1, 1, 4, 3],
+        residual=True,
+        depthwise=False,
+        attn='ese',
+
+    ),
+    ese_vovnet99b=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=5,
+        block_per_stage=[1, 3, 9, 3],
+        residual=True,
+        depthwise=False,
+        attn='ese',
+    ),
+    eca_vovnet39b=dict(
+        stem_chs=[64, 64, 128],
+        stage_conv_chs=[128, 160, 192, 224],
+        stage_out_chs=[256, 512, 768, 1024],
+        layer_per_block=5,
+        block_per_stage=[1, 1, 2, 2],
+        residual=True,
+        depthwise=False,
+        attn='eca',
+    ),
+)
+model_cfgs['ese_vovnet39b_evos'] = model_cfgs['ese_vovnet39b']  # alias: same dict object, not a copy
+model_cfgs['ese_vovnet99b_iabn'] = model_cfgs['ese_vovnet99b']  # alias: same dict object, not a copy
+
+
+def _cfg(url=''):  # builds one default-config dict; only the weights URL varies between variants
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.0.conv', 'classifier': 'head.fc',  # dotted module paths inside VovNet
+    }
+
+
+default_cfgs = dict(  # per-variant default config; url='' means no weights URL is listed for that variant
+    vovnet39a=_cfg(url=''),
+    vovnet57a=_cfg(url=''),
+    ese_vovnet19b_slim_dw=_cfg(url=''),
+    ese_vovnet19b_dw=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet19b_dw-a8741004.pth'),
+    ese_vovnet19b_slim=_cfg(url=''),
+    ese_vovnet39b=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet39b-f912fe73.pth'),
+    ese_vovnet57b=_cfg(url=''),
+    ese_vovnet99b=_cfg(url=''),
+    eca_vovnet39b=_cfg(url=''),
+    ese_vovnet39b_evos=_cfg(url=''),
+    ese_vovnet99b_iabn=_cfg(url=''),
+)
+
+
+class SequentialAppendList(nn.Sequential):
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor:
+        # Each module consumes the previous module's output (the first consumes ``x``);
+        # every output is appended to the caller's ``concat_list`` in place before the channel concat.
+        feat = x
+        for module in self:
+            feat = module(feat)
+            concat_list.append(feat)
+        return torch.cat(concat_list, dim=1)
+
+
+class OsaBlock(nn.Module):
+    # Chain of `layer_per_block` convs whose outputs, together with the block input, are concatenated and fused.
+    def __init__(
+            self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
+            depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path=None):
+        super(OsaBlock, self).__init__()
+
+        self.residual = residual
+        self.depthwise = depthwise
+        conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer)
+
+        next_in_chs = in_chs
+        if self.depthwise and next_in_chs != mid_chs:
+            assert not residual  # the 1x1 reduction path is never combined with a residual block
+            self.conv_reduction = ConvNormAct(next_in_chs, mid_chs, 1, **conv_kwargs)
+        else:
+            self.conv_reduction = None
+
+        mid_convs = []
+        for i in range(layer_per_block):
+            if self.depthwise:
+                conv = SeparableConvNormAct(mid_chs, mid_chs, **conv_kwargs)  # always mid_chs -> mid_chs
+            else:
+                conv = ConvNormAct(next_in_chs, mid_chs, 3, **conv_kwargs)  # first conv maps in_chs -> mid_chs
+            next_in_chs = mid_chs
+            mid_convs.append(conv)
+        self.conv_mid = SequentialAppendList(*mid_convs)
+
+        # feature aggregation
+        next_in_chs = in_chs + layer_per_block * mid_chs  # block input plus one mid_chs map per mid conv
+        self.conv_concat = ConvNormAct(next_in_chs, out_chs, **conv_kwargs)
+
+        self.attn = create_attn(attn, out_chs) if attn else None  # empty attn string means no attention
+
+        self.drop_path = drop_path  # an already-built module (or None) supplied by the caller
+
+    def forward(self, x):
+        output = [x]  # keeps the unreduced block input; conv_mid appends its outputs to this list in place
+        if self.conv_reduction is not None:
+            x = self.conv_reduction(x)
+        x = self.conv_mid(x, output)  # returns the channel-wise concat of everything in `output`
+        x = self.conv_concat(x)
+        if self.attn is not None:
+            x = self.attn(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+        if self.residual:
+            x = x + output[0]  # output[0] is the original block input
+        return x
+
+
+class OsaStage(nn.Module):
+    """ Optional stride-2 max-pool followed by `block_per_stage` chained OsaBlocks. """
+
+    def __init__(
+            self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, downsample=True,
+            residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU,
+            drop_path_rates=None):
+        super().__init__()
+        self.grad_checkpointing = False
+        # pooling is skipped entirely (attribute is None) when `downsample` is false
+        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True) if downsample else None
+
+        stage_blocks = []
+        final_idx = block_per_stage - 1
+        for idx in range(block_per_stage):
+            # per-block drop-path module, only instantiated for a strictly positive rate
+            rate = drop_path_rates[idx] if drop_path_rates is not None else None
+            block_kwargs = dict(
+                residual=residual and idx > 0,  # the first block of a stage never gets the skip connection
+                depthwise=depthwise,
+                attn=attn if idx == final_idx else '',  # attention only on the stage's last block
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                drop_path=DropPath(rate) if rate is not None and rate > 0. else None)
+            stage_blocks.append(OsaBlock(in_chs, mid_chs, out_chs, layer_per_block, **block_kwargs))
+            in_chs = out_chs  # every later block consumes the stage's output width
+        self.blocks = nn.Sequential(*stage_blocks)
+
+    def forward(self, x):
+        if self.pool is not None:
+            x = self.pool(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+
+
+class VovNet(nn.Module):
+    # Three-conv stem, four OsaStages, then a ClassifierHead; `cfg` is one entry of `model_cfgs`.
+    def __init__(
+            self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4,
+            output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path_rate=0.):
+        """ VovNet (v2). `cfg` must provide stem_chs, stage_conv_chs, stage_out_chs, block_per_stage,
+        layer_per_block, residual, depthwise and attn. Only stem_stride in (4, 2) and output_stride == 32 are accepted. """
+        super(VovNet, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert stem_stride in (4, 2)
+        assert output_stride == 32  # FIXME support dilation
+
+        stem_chs = cfg["stem_chs"]
+        stage_conv_chs = cfg["stage_conv_chs"]
+        stage_out_chs = cfg["stage_out_chs"]
+        block_per_stage = cfg["block_per_stage"]
+        layer_per_block = cfg["layer_per_block"]
+        conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer)
+
+        # Stem module
+        last_stem_stride = stem_stride // 2  # 2 when stem_stride is 4, 1 when it is 2
+        conv_type = SeparableConvNormAct if cfg["depthwise"] else ConvNormAct
+        self.stem = nn.Sequential(*[
+            ConvNormAct(in_chans, stem_chs[0], 3, stride=2, **conv_kwargs),
+            conv_type(stem_chs[0], stem_chs[1], 3, stride=1, **conv_kwargs),
+            conv_type(stem_chs[1], stem_chs[2], 3, stride=last_stem_stride, **conv_kwargs),
+        ])
+        # NOTE(review): for stem_stride == 2 this points at stem.2 but still reports stem_chs[1] channels - confirm.
+        self.feature_info = [dict(
+            num_chs=stem_chs[1], reduction=2, module=f'stem.{1 if stem_stride == 4 else 2}')]
+        current_stride = stem_stride
+
+        # OSA stages
+        stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage)
+        in_ch_list = stem_chs[-1:] + stage_out_chs[:-1]  # each stage consumes the previous stage's output width
+        stage_args = dict(residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], **conv_kwargs)
+        stages = []
+        for i in range(4):  # num_stages
+            downsample = stem_stride == 2 or i > 0  # first stage has no stride/downsample if stem_stride is 4
+            stages += [OsaStage(
+                in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block,
+                downsample=downsample, drop_path_rates=stage_dpr[i], **stage_args)
+            ]
+            self.num_features = stage_out_chs[i]  # ends up as the last stage's output width
+            current_stride *= 2 if downsample else 1
+            self.feature_info += [dict(num_chs=self.num_features, reduction=current_stride, module=f'stages.{i}')]
+
+        self.stages = nn.Sequential(*stages)
+
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.Linear) and m.bias is not None:  # bias-free Linear layers are skipped
+                nn.init.zeros_(m.bias)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else r'^stages\.(\d+).blocks\.(\d+)',  # per stage vs per block
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        for s in self.stages:
+            s.grad_checkpointing = enable  # each OsaStage consults its own flag in forward
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes  # keep the recorded class count in step with the rebuilt head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        return self.stages(x)
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _create_vovnet(variant, pretrained=False, **kwargs):  # raises KeyError if `variant` is not in model_cfgs
+    return build_model_with_cfg(
+        VovNet, variant, pretrained,
+        model_cfg=model_cfgs[variant],  # architecture dict for this variant
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs)
+
+
+@register_model
+def vovnet39a(pretrained=False, **kwargs):  # 'vovnet39a' config; default_cfgs lists no weights URL for it
+    return _create_vovnet('vovnet39a', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def vovnet57a(pretrained=False, **kwargs):  # 'vovnet57a' config; default_cfgs lists no weights URL for it
+    return _create_vovnet('vovnet57a', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet19b_slim_dw(pretrained=False, **kwargs):  # 'ese_vovnet19b_slim_dw' config; no weights URL listed
+    return _create_vovnet('ese_vovnet19b_slim_dw', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet19b_dw(pretrained=False, **kwargs):  # 'ese_vovnet19b_dw' config; default_cfgs carries a weights URL
+    return _create_vovnet('ese_vovnet19b_dw', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet19b_slim(pretrained=False, **kwargs):  # 'ese_vovnet19b_slim' config; no weights URL listed
+    return _create_vovnet('ese_vovnet19b_slim', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet39b(pretrained=False, **kwargs):  # 'ese_vovnet39b' config; default_cfgs carries a weights URL
+    return _create_vovnet('ese_vovnet39b', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet57b(pretrained=False, **kwargs):  # 'ese_vovnet57b' config; no weights URL listed
+    return _create_vovnet('ese_vovnet57b', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def ese_vovnet99b(pretrained=False, **kwargs):  # 'ese_vovnet99b' config; no weights URL listed
+    return _create_vovnet('ese_vovnet99b', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def eca_vovnet39b(pretrained=False, **kwargs):  # 'eca_vovnet39b' config (attn='eca'); no weights URL listed
+    return _create_vovnet('eca_vovnet39b', pretrained=pretrained, **kwargs)
+
+
+# Experimental Models
+
+@register_model
+def ese_vovnet39b_evos(pretrained=False, **kwargs):  # shares the 'ese_vovnet39b' architecture dict
+    def norm_act_fn(num_features, **nkwargs):  # factory handed to the model as its norm_layer
+        return create_norm_act_layer('evonorms0', num_features, jit=False, **nkwargs)
+    return _create_vovnet('ese_vovnet39b_evos', pretrained=pretrained, norm_layer=norm_act_fn, **kwargs)
+
+
+@register_model
+def ese_vovnet99b_iabn(pretrained=False, **kwargs):  # shares the 'ese_vovnet99b' architecture dict
+    norm_layer = get_norm_act_layer('iabn', act_layer='leaky_relu')  # norm+act layer looked up by name
+    return _create_vovnet(
+        'ese_vovnet99b_iabn', pretrained=pretrained, norm_layer=norm_layer, act_layer=nn.LeakyReLU, **kwargs)
diff --git a/src/custom_timm/models/xception.py b/src/custom_timm/models/xception.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d02c467b5b40944fb00eed7f40f6bd62c66839
--- /dev/null
+++ b/src/custom_timm/models/xception.py
@@ -0,0 +1,249 @@
+"""
+Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
+
+@author: tstandley
+Adapted by cadene
+
+Creates an Xception Model as defined in:
+
+Francois Chollet
+Xception: Deep Learning with Depthwise Separable Convolutions
+https://arxiv.org/pdf/1610.02357.pdf
+
+These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
+
+Loss:0.9173 Prec@1:78.892 Prec@5:94.292
+
+REMEMBER to set your image size to 3x299x299 for both test and validation
+
+normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                  std=[0.5, 0.5, 0.5])
+
+The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+"""
+import torch.jit
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .helpers import build_model_with_cfg
+from .layers import create_classifier
+from .registry import register_model
+
+__all__ = ['Xception']
+
+default_cfgs = {  # default pretrained config, keyed by variant name
+    'xception': {
+        'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/xception-43020ad28.pth',
+        'input_size': (3, 299, 299),
+        'pool_size': (10, 10),
+        'crop_pct': 0.8975,
+        'interpolation': 'bicubic',
+        'mean': (0.5, 0.5, 0.5),
+        'std': (0.5, 0.5, 0.5),
+        'num_classes': 1000,
+        'first_conv': 'conv1',  # attribute name of the first conv on Xception
+        'classifier': 'fc'  # attribute name of the classifier layer on Xception
+        # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+    }
+}
+
+
+class SeparableConv2d(nn.Module):
+    """ Depthwise conv (groups == in_channels) followed by a 1x1 pointwise conv; both are bias-free. """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1):
+        super().__init__()
+        # attribute names `conv1` / `pointwise` are kept as-is because they name the parameters
+        self.conv1 = nn.Conv2d(
+            in_channels, in_channels, kernel_size, stride=stride, padding=padding, dilation=dilation,
+            groups=in_channels, bias=False)
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        return self.pointwise(self.conv1(x))
+
+
+class Block(nn.Module):
+    def __init__(self, in_channels, out_channels, reps, strides=1, start_with_relu=True, grow_first=True):
+        super(Block, self).__init__()
+        # `reps` (ReLU, SeparableConv2d, BatchNorm2d) triples plus a shortcut that is projected when shapes change.
+        if out_channels != in_channels or strides != 1:
+            self.skip = nn.Conv2d(in_channels, out_channels, 1, stride=strides, bias=False)  # 1x1 projection shortcut
+            self.skipbn = nn.BatchNorm2d(out_channels)
+        else:
+            self.skip = None  # identity shortcut; `skipbn` is not created in this case
+
+        rep = []
+        for i in range(reps):
+            if grow_first:
+                inc = in_channels if i == 0 else out_channels  # widen in the first conv
+                outc = out_channels
+            else:
+                inc = in_channels
+                outc = in_channels if i < (reps - 1) else out_channels  # widen only in the last conv
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(inc, outc, 3, stride=1, padding=1))
+            rep.append(nn.BatchNorm2d(outc))
+
+        if not start_with_relu:
+            rep = rep[1:]  # drop the leading ReLU; this shifts every nn.Sequential index down by one
+        else:
+            rep[0] = nn.ReLU(inplace=False)  # presumably non-inplace so `inp` stays intact for the skip path - TODO confirm
+
+        if strides != 1:
+            rep.append(nn.MaxPool2d(3, strides, 1))  # spatial downsampling happens at the end of the branch
+        self.rep = nn.Sequential(*rep)
+
+    def forward(self, inp):
+        x = self.rep(inp)
+
+        if self.skip is not None:
+            skip = self.skip(inp)
+            skip = self.skipbn(skip)
+        else:
+            skip = inp
+
+        x += skip  # in-place add onto the branch output
+        return x
+
+
+class Xception(nn.Module):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+    """
+
+    def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg'):
+        """ Constructor
+        Args: num_classes: number of classes; in_chans: input image channels;
+            drop_rate: dropout probability applied in forward_head; global_pool: pool_type given to create_classifier
+        """
+        super(Xception, self).__init__()
+        self.drop_rate = drop_rate
+        self.global_pool = global_pool  # replaced below by the pooling module from create_classifier
+        self.num_classes = num_classes
+        self.num_features = 2048
+
+        self.conv1 = nn.Conv2d(in_chans, 32, 3, 2, 0, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.act1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.act2 = nn.ReLU(inplace=True)
+
+        self.block1 = Block(64, 128, 2, 2, start_with_relu=False)
+        self.block2 = Block(128, 256, 2, 2)
+        self.block3 = Block(256, 728, 2, 2)
+
+        self.block4 = Block(728, 728, 3, 1)
+        self.block5 = Block(728, 728, 3, 1)
+        self.block6 = Block(728, 728, 3, 1)
+        self.block7 = Block(728, 728, 3, 1)
+
+        self.block8 = Block(728, 728, 3, 1)
+        self.block9 = Block(728, 728, 3, 1)
+        self.block10 = Block(728, 728, 3, 1)
+        self.block11 = Block(728, 728, 3, 1)
+
+        self.block12 = Block(728, 1024, 2, 2, grow_first=False)
+
+        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = nn.BatchNorm2d(1536)
+        self.act3 = nn.ReLU(inplace=True)
+
+        self.conv4 = SeparableConv2d(1536, self.num_features, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(self.num_features)
+        self.act4 = nn.ReLU(inplace=True)
+        self.feature_info = [
+            dict(num_chs=64, reduction=2, module='act2'),
+            dict(num_chs=128, reduction=4, module='block2.rep.0'),
+            dict(num_chs=256, reduction=8, module='block3.rep.0'),
+            dict(num_chs=728, reduction=16, module='block12.rep.0'),
+            dict(num_chs=2048, reduction=32, module='act4'),
+        ]
+
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+        # #------- init weights --------
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^conv[12]|bn[12]',
+            blocks=[
+                (r'^block(\d+)', None),
+                (r'^conv[34]|bn[34]', (99,)),
+            ],
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        assert not enable, "gradient checkpointing not supported"
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
+
+    def forward_features(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.act2(x)
+
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.act3(x)
+
+        x = self.conv4(x)
+        x = self.bn4(x)
+        x = self.act4(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        if self.drop_rate:
+            x = F.dropout(x, self.drop_rate, training=self.training)  # F.dropout is not in-place: keep its result
+        return x if pre_logits else self.fc(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _xception(variant, pretrained=False, **kwargs):  # shared builder behind the registered entry point
+    return build_model_with_cfg(
+        Xception, variant, pretrained,
+        feature_cfg=dict(feature_cls='hook'),  # feature extraction is configured with the 'hook' feature class
+        **kwargs)
+
+
+@register_model
+def xception(pretrained=False, **kwargs):  # registered entry point for the 'xception' variant
+    return _xception('xception', pretrained=pretrained, **kwargs)
diff --git a/src/custom_timm/models/xception_aligned.py b/src/custom_timm/models/xception_aligned.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ac75ff05e53279b72cfaea2809f78a757f8e540
--- /dev/null
+++ b/src/custom_timm/models/xception_aligned.py
@@ -0,0 +1,358 @@
+"""Pytorch impl of Aligned Xception 41, 65, 71
+
+This is a correct, from scratch impl of Aligned Xception (Deeplab) models compatible with TF weights at
+https://github.com/tensorflow/models/blob/master/research/deeplab/g3doc/model_zoo.md
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from .helpers import build_model_with_cfg, checkpoint_seq
+from .layers import ClassifierHead, ConvNormAct, create_conv2d, get_norm_act_layer
+from .layers.helpers import to_3tuple
+from .registry import register_model
+
+__all__ = ['XceptionAligned']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (10, 10),
+        'crop_pct': 0.903, 'interpolation': 'bicubic',
+        'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD,
+        'first_conv': 'stem.0.conv', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+default_cfgs = dict(
+    xception41=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_41-e6439c97.pth'),
+    xception65=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception65_ra3-1447db8d.pth',
+        crop_pct=0.94,
+    ),
+    xception71=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_71-8eec7df1.pth'),
+
+    xception41p=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception41p_ra3-33195bc8.pth',
+        crop_pct=0.94,
+    ),
+    xception65p=_cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception65p_ra3-3c6114e4.pth',
+        crop_pct=0.94,
+    ),
+)
+
+
+class SeparableConv2d(nn.Module):
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='',
+            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+        super(SeparableConv2d, self).__init__()
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+
+        # depthwise convolution
+        self.conv_dw = create_conv2d(
+            in_chs, in_chs, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, depthwise=True)
+        self.bn_dw = norm_layer(in_chs)
+        self.act_dw = act_layer(inplace=True) if act_layer is not None else nn.Identity()
+
+        # pointwise convolution
+        self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1)
+        self.bn_pw = norm_layer(out_chs)
+        self.act_pw = act_layer(inplace=True) if act_layer is not None else nn.Identity()
+
+    def forward(self, x):
+        x = self.conv_dw(x)
+        x = self.bn_dw(x)
+        x = self.act_dw(x)
+        x = self.conv_pw(x)
+        x = self.bn_pw(x)
+        x = self.act_pw(x)
+        return x
+
+
+class PreSeparableConv2d(nn.Module):
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='',
+            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, first_act=True):
+        super(PreSeparableConv2d, self).__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer=act_layer)
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+
+        self.norm = norm_act_layer(in_chs, inplace=True) if first_act else nn.Identity()
+        # depthwise convolution
+        self.conv_dw = create_conv2d(
+            in_chs, in_chs, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, depthwise=True)
+
+        # pointwise convolution
+        self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1)
+
+    def forward(self, x):
+        x = self.norm(x)
+        x = self.conv_dw(x)
+        x = self.conv_pw(x)
+        return x
+
+
+class XceptionModule(nn.Module):
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, pad_type='',
+            start_with_relu=True, no_skip=False, act_layer=nn.ReLU, norm_layer=None):
+        super(XceptionModule, self).__init__()
+        out_chs = to_3tuple(out_chs)
+        self.in_channels = in_chs
+        self.out_channels = out_chs[-1]
+        self.no_skip = no_skip
+        if not no_skip and (self.out_channels != self.in_channels or stride != 1):
+            self.shortcut = ConvNormAct(
+                in_chs, self.out_channels, 1, stride=stride, norm_layer=norm_layer, apply_act=False)
+        else:
+            self.shortcut = None
+
+        separable_act_layer = None if start_with_relu else act_layer
+        self.stack = nn.Sequential()
+        for i in range(3):
+            if start_with_relu:
+                self.stack.add_module(f'act{i + 1}', act_layer(inplace=i > 0))
+            self.stack.add_module(f'conv{i + 1}', SeparableConv2d(
+                in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type,
+                act_layer=separable_act_layer, norm_layer=norm_layer))
+            in_chs = out_chs[i]
+
+    def forward(self, x):
+        skip = x
+        x = self.stack(x)
+        if self.shortcut is not None:
+            skip = self.shortcut(skip)
+        if not self.no_skip:
+            x = x + skip
+        return x
+
+
+class PreXceptionModule(nn.Module):
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, pad_type='',
+            no_skip=False, act_layer=nn.ReLU, norm_layer=None):
+        super(PreXceptionModule, self).__init__()
+        out_chs = to_3tuple(out_chs)
+        self.in_channels = in_chs
+        self.out_channels = out_chs[-1]
+        self.no_skip = no_skip
+        if not no_skip and (self.out_channels != self.in_channels or stride != 1):
+            self.shortcut = create_conv2d(in_chs, self.out_channels, 1, stride=stride)
+        else:
+            self.shortcut = nn.Identity()
+
+        self.norm = get_norm_act_layer(norm_layer, act_layer=act_layer)(in_chs, inplace=True)
+        self.stack = nn.Sequential()
+        for i in range(3):
+            self.stack.add_module(f'conv{i + 1}', PreSeparableConv2d(
+                in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type,
+                act_layer=act_layer, norm_layer=norm_layer, first_act=i > 0))
+            in_chs = out_chs[i]
+
+    def forward(self, x):
+        x = self.norm(x)
+        skip = x
+        x = self.stack(x)
+        if not self.no_skip:
+            x = x + self.shortcut(skip)
+        return x
+
+
+class XceptionAligned(nn.Module):
+    """Modified Aligned Xception: conv stem, a stack of (Pre)XceptionModule blocks built from `block_cfg`, and a head."""
+
+    def __init__(
+            self, block_cfg, num_classes=1000, in_chans=3, output_stride=32, preact=False,
+            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0., global_pool='avg'):
+        super(XceptionAligned, self).__init__()
+        assert output_stride in (8, 16, 32)
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.grad_checkpointing = False
+
+        layer_args = dict(act_layer=act_layer, norm_layer=norm_layer)
+        self.stem = nn.Sequential(*[
+            ConvNormAct(in_chans, 32, kernel_size=3, stride=2, **layer_args),
+            create_conv2d(32, 64, kernel_size=3, stride=1) if preact else
+            ConvNormAct(32, 64, kernel_size=3, stride=1, **layer_args)
+        ])
+
+        curr_dilation = 1
+        curr_stride = 2  # the stem above has already downsampled by 2
+        self.feature_info = []
+        self.blocks = nn.Sequential()
+        module_fn = PreXceptionModule if preact else XceptionModule
+        for i, b in enumerate(block_cfg):
+            b = dict(b, dilation=curr_dilation)  # work on a copy: never mutate the caller's (possibly aliased) dicts
+            if b['stride'] > 1:
+                name = f'blocks.{i}.stack.conv2' if preact else f'blocks.{i}.stack.act3'
+                self.feature_info += [dict(num_chs=to_3tuple(b['out_chs'])[-2], reduction=curr_stride, module=name)]
+                next_stride = curr_stride * b['stride']
+                if next_stride > output_stride:  # target stride reached: trade further striding for dilation
+                    curr_dilation *= b['stride']
+                    b['stride'] = 1
+                else:
+                    curr_stride = next_stride
+            self.blocks.add_module(str(i), module_fn(**b, **layer_args))
+            self.num_features = self.blocks[-1].out_channels  # ends up as the last block's output width
+
+        self.feature_info += [dict(
+            num_chs=self.num_features, reduction=curr_stride, module='blocks.' + str(len(self.blocks) - 1))]
+        self.act = act_layer(inplace=True) if preact else nn.Identity()
+        self.head = ClassifierHead(
+            in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^stem',
+            blocks=r'^blocks\.(\d+)',
+        )
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes  # keep the attribute in sync with the rebuilt head
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        x = self.act(x)  # Identity unless preact=True
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        return self.head(x, pre_logits=pre_logits)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def _xception(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(
+        XceptionAligned, variant, pretrained,
+        feature_cfg=dict(flatten_sequential=True, feature_cls='hook'),
+        **kwargs)
+
+
+@register_model
+def xception41(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-41
+    """
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=2),
+        # middle flow
+        *([dict(in_chs=728, out_chs=728, stride=1)] * 8),
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False),
+    ]
+    model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs)
+    return _xception('xception41', pretrained=pretrained, **model_args)
+
+
+@register_model
+def xception65(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-65
+    """
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=2),
+        # middle flow
+        *([dict(in_chs=728, out_chs=728, stride=1)] * 16),
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False),
+    ]
+    model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs)
+    return _xception('xception65', pretrained=pretrained, **model_args)
+
+
+@register_model
+def xception71(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-71
+    """
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=1),
+        dict(in_chs=256, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=1),
+        dict(in_chs=728, out_chs=728, stride=2),
+        # middle flow
+        *([dict(in_chs=728, out_chs=728, stride=1)] * 16),
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False),
+    ]
+    model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs)
+    return _xception('xception71', pretrained=pretrained, **model_args)
+
+
+@register_model
+def xception41p(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-41 w/ Pre-Act
+    """
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=2),
+        # middle flow
+        *([dict(in_chs=728, out_chs=728, stride=1)] * 8),
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), no_skip=True, stride=1),
+    ]
+    model_args = dict(block_cfg=block_cfg, preact=True, norm_layer=nn.BatchNorm2d, **kwargs)
+    return _xception('xception41p', pretrained=pretrained, **model_args)
+
+
+@register_model
+def xception65p(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-65 w/ Pre-Act
+    """
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=2),
+        # middle flow
+        *([dict(in_chs=728, out_chs=728, stride=1)] * 16),
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True),
+    ]
+    model_args = dict(
+        block_cfg=block_cfg, preact=True, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs)
+    return _xception('xception65p', pretrained=pretrained, **model_args)
diff --git a/src/custom_timm/models/xcit.py b/src/custom_timm/models/xcit.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c706df76cc54703c6a74623247298449e508a17
--- /dev/null
+++ b/src/custom_timm/models/xcit.py
@@ -0,0 +1,842 @@
+""" Cross-Covariance Image Transformer (XCiT) in PyTorch
+
+Paper:
+    - https://arxiv.org/abs/2106.09681
+
+Same as the official implementation, with some minor adaptations, original copyright below
+    - https://github.com/facebookresearch/xcit/blob/master/xcit.py
+
+Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+"""
+# Copyright (c) 2015-present, Facebook, Inc.
+# All rights reserved.
+
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .vision_transformer import _cfg, Mlp
+from .registry import register_model
+from .layers import DropPath, trunc_normal_, to_2tuple
+from .cait import ClassAttn
+from .fx_features import register_notrace_module
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+        'crop_pct': 1.0, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj.0.0', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    # Patch size 16
+    'xcit_nano_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224.pth'),  
+    'xcit_nano_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224_dist.pth'),
+    'xcit_nano_12_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_tiny_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224.pth'),
+    'xcit_tiny_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224_dist.pth'),
+    'xcit_tiny_12_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_tiny_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224.pth'),
+    'xcit_tiny_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224_dist.pth'),
+    'xcit_tiny_24_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_small_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224.pth'),
+    'xcit_small_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224_dist.pth'),
+    'xcit_small_12_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_small_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224.pth'),
+    'xcit_small_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224_dist.pth'),
+    'xcit_small_24_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_medium_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224.pth'),
+    'xcit_medium_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224_dist.pth'),
+    'xcit_medium_24_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_large_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224.pth'),
+    'xcit_large_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224_dist.pth'),
+    'xcit_large_24_p16_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_384_dist.pth', input_size=(3, 384, 384)),
+
+    # Patch size 8
+    'xcit_nano_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224.pth'),  
+    'xcit_nano_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224_dist.pth'),
+    'xcit_nano_12_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_tiny_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224.pth'),
+    'xcit_tiny_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224_dist.pth'),
+    'xcit_tiny_12_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_tiny_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224.pth'),
+    'xcit_tiny_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224_dist.pth'),
+    'xcit_tiny_24_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_small_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224.pth'),
+    'xcit_small_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224_dist.pth'),
+    'xcit_small_12_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_small_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224.pth'),
+    'xcit_small_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224_dist.pth'),
+    'xcit_small_24_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_medium_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224.pth'),
+    'xcit_medium_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224_dist.pth'),
+    'xcit_medium_24_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_384_dist.pth', input_size=(3, 384, 384)),
+    'xcit_large_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224.pth'),
+    'xcit_large_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224_dist.pth'),
+    'xcit_large_24_p8_384_dist': _cfg(
+        url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_384_dist.pth', input_size=(3, 384, 384)),
+}
+
+
+@register_notrace_module  # reason: FX can't symbolically trace torch.arange in forward method
+class PositionalEncodingFourier(nn.Module):
+    """
+    Positional encoding relying on a fourier kernel matching the one used in the "Attention Is All You Need" paper.
+    Based on the official XCiT code
+        - https://github.com/facebookresearch/xcit/blob/master/xcit.py
+    """
+
+    def __init__(self, hidden_dim=32, dim=768, temperature=10000):
+        super().__init__()
+        self.token_projection = nn.Conv2d(hidden_dim * 2, dim, kernel_size=1)
+        self.scale = 2 * math.pi
+        self.temperature = temperature
+        self.hidden_dim = hidden_dim
+        self.dim = dim
+        self.eps = 1e-6
+
+    def forward(self, B: int, H: int, W: int):
+        device = self.token_projection.weight.device
+        y_embed = torch.arange(1, H+1, dtype=torch.float32, device=device).unsqueeze(1).repeat(1, 1, W)  # (1, H, W)
+        x_embed = torch.arange(1, W+1, dtype=torch.float32, device=device).repeat(1, H, 1)  # (1, H, W)
+        y_embed = y_embed / (y_embed[:, -1:, :] + self.eps) * self.scale  # divide by last row, scale by 2*pi
+        x_embed = x_embed / (x_embed[:, :, -1:] + self.eps) * self.scale  # divide by last column, scale by 2*pi
+        dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=device)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.hidden_dim)
+        pos_x = x_embed[:, :, :, None] / dim_t  # (1, H, W, hidden_dim)
+        pos_y = y_embed[:, :, :, None] / dim_t  # (1, H, W, hidden_dim)
+        pos_x = torch.stack([pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()], dim=4).flatten(3)
+        pos_y = torch.stack([pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()], dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # (1, 2 * hidden_dim, H, W)
+        pos = self.token_projection(pos)
+        return pos.repeat(B, 1, 1, 1)  # (B, C, H, W)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """3x3 convolution + batch norm"""
+    return torch.nn.Sequential(
+        nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False),
+        nn.BatchNorm2d(out_planes)
+    )
+
+
+class ConvPatchEmbed(nn.Module):
+    """Image to Patch Embedding using a stack of stride-2 3x3 conv + BatchNorm layers (patch_size must be 8 or 16)."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, act_layer=nn.GELU):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        num_patches = (img_size[1] // patch_size) * (img_size[0] // patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        if patch_size == 16:  # four stride-2 convs -> 16x spatial reduction
+            self.proj = torch.nn.Sequential(
+                conv3x3(in_chans, embed_dim // 8, 2),
+                act_layer(),
+                conv3x3(embed_dim // 8, embed_dim // 4, 2),
+                act_layer(),
+                conv3x3(embed_dim // 4, embed_dim // 2, 2),
+                act_layer(),
+                conv3x3(embed_dim // 2, embed_dim, 2),
+            )
+        elif patch_size == 8:  # three stride-2 convs -> 8x spatial reduction
+            self.proj = torch.nn.Sequential(
+                conv3x3(in_chans, embed_dim // 4, 2),
+                act_layer(),
+                conv3x3(embed_dim // 4, embed_dim // 2, 2),
+                act_layer(),
+                conv3x3(embed_dim // 2, embed_dim, 2),
+            )
+        else:
+            raise ValueError('For convolutional projection, patch size has to be in [8, 16]')
+
+    def forward(self, x):
+        x = self.proj(x)
+        Hp, Wp = x.shape[2], x.shape[3]  # patch-grid size actually produced for this input
+        x = x.flatten(2).transpose(1, 2)  # (B, N, C)
+        return x, (Hp, Wp)
+
+
+class LPI(nn.Module):
+    """
+    Local Patch Interaction module that allows explicit communication between tokens in 3x3 windows to augment the
+    implicit communication performed by the block diagonal scatter attention. Implemented using 2 layers of separable
+    3x3 convolutions with GeLU and BatchNorm2d
+    """
+
+    def __init__(self, in_features, out_features=None, act_layer=nn.GELU, kernel_size=3):
+        super().__init__()
+        out_features = out_features or in_features  # default: keep the channel count unchanged
+
+        padding = kernel_size // 2  # keeps H and W unchanged for odd kernel sizes at stride 1
+
+        self.conv1 = torch.nn.Conv2d(
+            in_features, in_features, kernel_size=kernel_size, padding=padding, groups=in_features)
+        self.act = act_layer()
+        self.bn = nn.BatchNorm2d(in_features)
+        self.conv2 = torch.nn.Conv2d(
+            in_features, out_features, kernel_size=kernel_size, padding=padding, groups=out_features)
+
+    def forward(self, x, H: int, W: int):
+        B, N, C = x.shape
+        x = x.permute(0, 2, 1).reshape(B, C, H, W)  # tokens (B, N, C) -> feature map; requires N == H * W
+        x = self.conv1(x)
+        x = self.act(x)
+        x = self.bn(x)
+        x = self.conv2(x)
+        x = x.reshape(B, C, N).permute(0, 2, 1)  # back to (B, N, C); NOTE(review): reuses input C, so this presumably only works when out_features == in_features
+        return x
+
+
+class ClassAttentionBlock(nn.Module):
+    """Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239"""
+
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0.,
+            act_layer=nn.GELU, norm_layer=nn.LayerNorm, eta=1., tokens_norm=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+
+        self.attn = ClassAttn(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        if eta is not None:  # LayerScale Initialization (no layerscale when None)
+            self.gamma1 = nn.Parameter(eta * torch.ones(dim))
+            self.gamma2 = nn.Parameter(eta * torch.ones(dim))
+        else:
+            self.gamma1, self.gamma2 = 1.0, 1.0
+
+        # See https://github.com/rwightman/pytorch-image-models/pull/747#issuecomment-877795721
+        self.tokens_norm = tokens_norm
+
+    def forward(self, x):
+        x_norm1 = self.norm1(x)
+        x_attn = torch.cat([self.attn(x_norm1), x_norm1[:, 1:]], dim=1)
+        x = x + self.drop_path(self.gamma1 * x_attn)
+        if self.tokens_norm:
+            x = self.norm2(x)
+        else:
+            x = torch.cat([self.norm2(x[:, 0:1]), x[:, 1:]], dim=1)
+        x_res = x
+        cls_token = x[:, 0:1]
+        cls_token = self.gamma2 * self.mlp(cls_token)
+        x = torch.cat([cls_token, x[:, 1:]], dim=1)
+        x = x_res + self.drop_path(x)
+        return x
+
+
+class XCA(nn.Module):
+    """ Cross-Covariance Attention (XCA)
+    Operation where the channels are updated using a weighted sum. The weights are obtained from the (softmax
+    normalized) Cross-covariance matrix (Q^T \\cdot K \\in d_h \\times d_h)
+    """
+
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        # Result of next line is (qkv, B, num (H)eads,  (C')hannels per head, N)
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 4, 1)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+        
+        # Paper section 3.2 l2-Normalization and temperature scaling
+        q = torch.nn.functional.normalize(q, dim=-1)
+        k = torch.nn.functional.normalize(k, dim=-1)
+        attn = (q @ k.transpose(-2, -1)) * self.temperature
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        # (B, H, C', N), permute -> (B, N, H, C')
+        x = (attn @ v).permute(0, 3, 1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'temperature'}
+
+
+class XCABlock(nn.Module):
+    def __init__(
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
+            drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, eta=1.):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = XCA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm3 = norm_layer(dim)
+        self.local_mp = LPI(in_features=dim, act_layer=act_layer)
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        self.gamma1 = nn.Parameter(eta * torch.ones(dim))
+        self.gamma3 = nn.Parameter(eta * torch.ones(dim))
+        self.gamma2 = nn.Parameter(eta * torch.ones(dim))
+
+    def forward(self, x, H: int, W: int):
+        x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
+        # NOTE official code has 3 then 2, so keeping it the same to be consistent with loaded weights
+        # See https://github.com/rwightman/pytorch-image-models/pull/747#issuecomment-877795721
+        x = x + self.drop_path(self.gamma3 * self.local_mp(self.norm3(x), H, W))
+        x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
+        return x
+
+
+class XCiT(nn.Module):
+    """XCiT classifier: conv patch embedding -> XCA blocks -> class-attention blocks -> norm -> linear head.
+    Based on timm and DeiT code bases
+    https://github.com/rwightman/pytorch-image-models/tree/master/timm
+    https://github.com/facebookresearch/deit/
+    """
+
+    def __init__(
+            self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', embed_dim=768,
+            depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+            act_layer=None, norm_layer=None, cls_attn_layers=2, use_pos_embed=True, eta=1., tokens_norm=False):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            global_pool (str): head pooling: '' (none), 'avg' (mean of patch tokens) or 'token' (class token)
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer (number of XCA blocks)
+            num_heads (int): number of attention heads
+            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            drop_rate (float): dropout rate after positional embedding, and in XCA/CA projection + MLP
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate (constant across all layers)
+            act_layer: (nn.Module): activation layer, `nn.GELU` when None
+            norm_layer: (nn.Module): normalization layer, `nn.LayerNorm` with eps=1e-6 when None
+            cls_attn_layers: (int) Depth of Class attention layers
+            use_pos_embed: (bool) whether to use positional encoding
+            eta: (float) layerscale initialization value
+            tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA
+
+        Notes:
+            - Although `norm_layer` is user specifiable, there are hard-coded `BatchNorm2d`s in the local patch
+              interaction (class LPI) and the patch embedding (class ConvPatchEmbed)
+        """
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        img_size = to_2tuple(img_size)
+        assert (img_size[0] % patch_size == 0) and (img_size[1] % patch_size == 0), \
+            '`patch_size` should divide image dimensions evenly'
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim
+        self.global_pool = global_pool
+        self.grad_checkpointing = False
+
+        self.patch_embed = ConvPatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, act_layer=act_layer)
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.use_pos_embed = use_pos_embed
+        if use_pos_embed:
+            self.pos_embed = PositionalEncodingFourier(dim=embed_dim)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        self.blocks = nn.ModuleList([
+            XCABlock(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate,
+                attn_drop=attn_drop_rate, drop_path=drop_path_rate, act_layer=act_layer, norm_layer=norm_layer, eta=eta)
+            for _ in range(depth)])
+        self.cls_attn_blocks = nn.ModuleList([
+            ClassAttentionBlock(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate,
+                attn_drop=attn_drop_rate, act_layer=act_layer, norm_layer=norm_layer, eta=eta, tokens_norm=tokens_norm)
+            for _ in range(cls_attn_layers)])
+
+        # Classifier head
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        # Init weights
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Per-module initializer: truncated-normal (std 0.02) weights and zero biases for `nn.Linear` only."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the names of parameters that should be exempt from weight decay."""
+        return {'pos_embed', 'cls_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regexes grouping parameters into stem, XCA blocks and class-attention blocks (`coarse` unused)."""
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            blocks=r'^blocks\.(\d+)',
+            cls_attn_blocks=[(r'^cls_attn_blocks\.(\d+)', None), (r'^norm', (99999,))])
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Enable or disable gradient checkpointing of the blocks in `forward_features`."""
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        """Return the classification head module."""
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=''):
+        """Rebuild the head for `num_classes`; unless None, `global_pool` is also stored ('' turns pooling off)."""
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'avg', 'token')
+            self.global_pool = global_pool
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        """Embed the image and run all blocks; returns normalized tokens with the class token at index 0."""
+        B = x.shape[0]
+        # x is (B, N, C). (Hp, Wp) is (height in units of patches, width in units of patches)
+        x, (Hp, Wp) = self.patch_embed(x)
+        if self.use_pos_embed:
+            # `pos_embed` (B, C, Hp, Wp), reshape -> (B, C, N), permute -> (B, N, C)
+            pos_encoding = self.pos_embed(B, Hp, Wp).reshape(B, -1, x.shape[1]).permute(0, 2, 1)
+            x = x + pos_encoding
+        x = self.pos_drop(x)
+
+        for blk in self.blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x, Hp, Wp)
+            else:
+                x = blk(x, Hp, Wp)
+
+        x = torch.cat((self.cls_token.expand(B, -1, -1), x), dim=1)
+        for blk in self.cls_attn_blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
+        return self.norm(x)
+
+    def forward_head(self, x, pre_logits: bool = False):
+        """Pool tokens per `self.global_pool`, then apply the head unless `pre_logits` is True."""
+        if self.global_pool:
+            x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        """Full forward pass: `forward_features` followed by `forward_head`."""
+        return self.forward_head(self.forward_features(x))
+
+
+def checkpoint_filter_fn(state_dict, model):
+    """Adapt an official XCiT checkpoint to this implementation's parameter names.
+
+    `state_dict` (optionally nested under a 'model' key) is modified in place and returned.
+    """
+    state_dict = state_dict.get('model', state_dict)
+    # Official weights call the positional encoder `pos_embeder`: rename to `pos_embed`, or drop if the model has none.
+    keep_pos_embed = getattr(model, 'pos_embed', None) is not None
+    for old_key in [key for key in state_dict if key.startswith('pos_embed')]:
+        tensor = state_dict.pop(old_key)
+        if keep_pos_embed:
+            state_dict[old_key.replace('pos_embeder.', 'pos_embed.')] = tensor
+    # Class attention here (as in timm's CaiT) has separate q, k, v projections so the query is computed for the
+    # class token only; the official weights store one fused qkv tensor, which is split into thirds along dim 0.
+    if 'cls_attn_blocks.0.attn.qkv.weight' not in state_dict or \
+            'cls_attn_blocks.0.attn.q.weight' not in model.state_dict():
+        return state_dict
+    for blk_idx in range(len(model.cls_attn_blocks)):
+        prefix = f'cls_attn_blocks.{blk_idx}.attn.'
+        fused_w = state_dict.pop(prefix + 'qkv.weight')
+        w_parts = fused_w.reshape(3, -1, fused_w.shape[-1])
+        state_dict.update({f'{prefix}{name}.weight': w_parts[pos] for pos, name in enumerate('qkv')})
+        fused_b = state_dict.pop(prefix + 'qkv.bias', None)
+        if fused_b is not None:
+            b_parts = fused_b.reshape(3, -1)
+            state_dict.update({f'{prefix}{name}.bias': b_parts[pos] for pos, name in enumerate('qkv')})
+    return state_dict
+
+
+def _create_xcit(variant, pretrained=False, default_cfg=None, **kwargs):
+    """Build the XCiT `variant` via `build_model_with_cfg`; `default_cfg` is accepted but not used here."""
+    return build_model_with_cfg(
+        XCiT, variant, pretrained, pretrained_filter_fn=checkpoint_filter_fn, **kwargs)
+
+
+@register_model
+def xcit_nano_12_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p16_224` (patch 16, dim 128, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs)
+    return _create_xcit('xcit_nano_12_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_nano_12_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p16_224_dist` (patch 16, dim 128, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs)
+    return _create_xcit('xcit_nano_12_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_nano_12_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p16_384_dist` (patch 16, dim 128, depth 12, 4 heads, img_size 384)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, img_size=384, **kwargs)
+    return _create_xcit('xcit_nano_12_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p16_224` (patch 16, dim 192, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p16_224_dist` (patch 16, dim 192, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p16_384_dist` (patch 16, dim 192, depth 12, 4 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p16_224` (patch 16, dim 384, depth 12, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p16_224_dist` (patch 16, dim 384, depth 12, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p16_384_dist` (patch 16, dim 384, depth 12, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p16_224` (patch 16, dim 192, depth 24, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p16_224_dist` (patch 16, dim 192, depth 24, 4 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p16_384_dist` (patch 16, dim 192, depth 24, 4 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p16_224` (patch 16, dim 384, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p16_224_dist` (patch 16, dim 384, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p16_384_dist` (patch 16, dim 384, depth 24, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p16_224` (patch 16, dim 512, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p16_224_dist` (patch 16, dim 512, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p16_384_dist` (patch 16, dim 512, depth 24, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p16_224(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p16_224` (patch 16, dim 768, depth 24, 16 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p16_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p16_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p16_224_dist` (patch 16, dim 768, depth 24, 16 heads)."""
+    variant_args = dict(
+        patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p16_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p16_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p16_384_dist` (patch 16, dim 768, depth 24, 16 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p16_384_dist', pretrained=pretrained, **variant_args)
+
+
+# Patch size 8x8 models
+@register_model
+def xcit_nano_12_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p8_224` (patch 8, dim 128, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs)
+    return _create_xcit('xcit_nano_12_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_nano_12_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p8_224_dist` (patch 8, dim 128, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs)
+    return _create_xcit('xcit_nano_12_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_nano_12_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_nano_12_p8_384_dist` (patch 8, dim 128, depth 12, 4 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs)
+    return _create_xcit('xcit_nano_12_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p8_224` (patch 8, dim 192, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p8_224_dist` (patch 8, dim 192, depth 12, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_12_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_12_p8_384_dist` (patch 8, dim 192, depth 12, 4 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_12_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p8_224` (patch 8, dim 384, depth 12, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p8_224_dist` (patch 8, dim 384, depth 12, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_12_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_12_p8_384_dist` (patch 8, dim 384, depth 12, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_12_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p8_224` (patch 8, dim 192, depth 24, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p8_224_dist` (patch 8, dim 192, depth 24, 4 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_tiny_24_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_tiny_24_p8_384_dist` (patch 8, dim 192, depth 24, 4 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_tiny_24_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p8_224` (patch 8, dim 384, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p8_224_dist` (patch 8, dim 384, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_small_24_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_small_24_p8_384_dist` (patch 8, dim 384, depth 24, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_small_24_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p8_224` (patch 8, dim 512, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p8_224_dist` (patch 8, dim 512, depth 24, 8 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_medium_24_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_medium_24_p8_384_dist` (patch 8, dim 512, depth 24, 8 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_medium_24_p8_384_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p8_224(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p8_224` (patch 8, dim 768, depth 24, 16 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p8_224', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p8_224_dist(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p8_224_dist` (patch 8, dim 768, depth 24, 16 heads)."""
+    variant_args = dict(
+        patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p8_224_dist', pretrained=pretrained, **variant_args)
+
+
+@register_model
+def xcit_large_24_p8_384_dist(pretrained=False, **kwargs):
+    """Build `xcit_large_24_p8_384_dist` (patch 8, dim 768, depth 24, 16 heads); `img_size` is not set here."""
+    variant_args = dict(
+        patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs)
+    return _create_xcit('xcit_large_24_p8_384_dist', pretrained=pretrained, **variant_args)