Spaces:

cellverse
/

draco

Sleeping

App Files Files Community

Felix-Xu committed on Nov 18, 2024

Commit

3bf7d18

1 Parent(s): fe94f9f

denoise model update

Browse files

Files changed (19) hide show

app.py +215 -3
draco.yaml +19 -0
draco/__init__.py +2 -0
draco/configuration/__init__.py +4 -0
draco/configuration/base.yaml +57 -0
draco/configuration/config.py +52 -0
draco/configuration/configurable.py +138 -0
draco/configuration/draco2d-b_triplet_pretrain.yaml +20 -0
draco/configuration/draco2d-h_triplet_pretrain.yaml +5 -0
draco/configuration/draco2d-l_triplet_pretrain.yaml +4 -0
draco/model/__init__.py +6 -0
draco/model/build.py +22 -0
draco/model/checkpoint.py +61 -0
draco/model/draco2d.py +663 -0
draco/model/draco_base.py +35 -0
draco/model/layer/__init__.py +3 -0
draco/model/layer/normalization.py +22 -0
draco/model/utils/constant.py +24 -0
requirements.txt +11 -0

app.py CHANGED Viewed

@@ -1,7 +1,219 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()

 import gradio as gr
+import h5py
+import mrcfile
+import numpy as np
+from PIL import Image
+from omegaconf import DictConfig
+import torch
+from pathlib import Path
+from torchvision.transforms import functional as F
+import torchvision.transforms.v2 as v2
+from draco.configuration import CfgNode
+from draco.model import (
+    build_model,
+    load_pretrained
+)
class DRACODenoiser(object):
    """Run a pre-trained DRACO denoising model on single images.

    The model is built from ``cfg``, moved to CUDA when available (CPU
    otherwise), switched to eval mode and initialised from ``ckpt_path``.
    """

    def __init__(self,
        cfg: DictConfig,
        ckpt_path: Path,
    ) -> None:
        """
        Args:
            cfg: configuration passed to ``build_model``; ``cfg.MODEL.PATCH_SIZE`` is also read here.
            ckpt_path: checkpoint file passed to ``load_pretrained``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.transform = self.build_transform()
        self.model = build_model(cfg).to(self.device).eval()
        self.model = load_pretrained(self.model, ckpt_path, self.device)
        self.patch_size = cfg.MODEL.PATCH_SIZE

    def patchify(self, image: torch.Tensor) -> torch.Tensor:
        """Split a ``(B, C, H, W)`` image into flattened non-overlapping patches.

        ``H`` and ``W`` are zero-padded on the bottom/right up to a multiple of
        the patch size ``P``.

        Returns:
            Tensor of shape ``(B, num_patches, P * P * C)``.
        """
        B, C, H, W = image.shape
        P = self.patch_size
        if H % P != 0 or W % P != 0:
            # F.pad order for the last two dims is (left, right, top, bottom).
            image = torch.nn.functional.pad(image, (0, (P - W % P) % P, 0, (P - H % P) % P), mode='constant', value=0)
        patches = image.unfold(2, P, P).unfold(3, P, P)  # (B, C, nH, nW, P, P)
        patches = patches.permute(0, 2, 3, 4, 5, 1)      # (B, nH, nW, P, P, C)
        patches = patches.reshape(B, -1, P * P * C)
        return patches

    def unpatchify(self, patches: torch.Tensor, H: int, W: int) -> torch.Tensor:
        """Inverse of :meth:`patchify`.

        Args:
            patches: tensor of shape ``(B, num_patches, P * P * C)``.
            H: height of the original (unpadded) image.
            W: width of the original (unpadded) image.

        Returns:
            Tensor of shape ``(B, C, H, W)`` with the padding cropped away.
        """
        B = patches.shape[0]
        P = self.patch_size
        rows = (H + P - 1) // P  # ceil(H / P): patch rows, padding included
        cols = (W + P - 1) // P
        images = patches.reshape(B, rows, cols, P, P, -1)
        images = images.permute(0, 5, 1, 3, 2, 4)        # (B, C, nH, P, nW, P)
        images = images.reshape(B, -1, rows * P, cols * P)
        images = images[..., :H, :W]
        return images

    @classmethod
    def build_transform(cls) -> v2.Compose:
        """Return the preprocessing pipeline: convert to an image tensor, then to float32."""
        return v2.Compose([
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True)
        ])

    @torch.inference_mode()
    def inference(self, image: Image.Image) -> np.ndarray:
        """Denoise a single PIL image.

        Returns:
            The model output reassembled into an ``(H, W, C)`` NumPy array.
        """
        W, H = image.size
        x = self.transform(image).unsqueeze(0).to(self.device)
        # NOTE(review): the model output is fed straight to `unpatchify`, so it is
        # assumed to be shaped (B, num_patches, P * P * C) — confirm against the model.
        y = self.model(x)
        denoised = self.unpatchify(y, H, W).squeeze(0).permute(1, 2, 0).detach().cpu().numpy()
        return denoised
# Model Initialization
# Both paths are relative, so they resolve against the current working directory.
ckpt_path = Path("denoise.ckpt")
cfg = CfgNode.load_yaml_with_base(Path("draco.yaml"))
# An empty dotlist applies no command-line style overrides.
CfgNode.merge_with_dotlist(cfg, [])
denoiser = DRACODenoiser(cfg=cfg, ckpt_path=ckpt_path)
def Auto_contrast(image, t_mean=150.0/255.0, t_sd=40.0/255.0) -> np.ndarray:
    """Stretch the contrast of ``image`` towards a target mean and spread.

    The image is first min-max scaled to ``[0, 1]``. A black point and a white
    point are then chosen so that, after mapping them to 0 and 1, the image
    mean lands on ``t_mean`` and its standard deviation on ``t_sd``. Values
    outside the two points are clipped.

    Args:
        image: NumPy array of pixel values.
        t_mean: target mean as a fraction of full scale.
        t_sd: target standard deviation as a fraction of full scale.

    Returns:
        Array of the same shape with values in ``[0, 1]``. A flat image (all
        pixels equal) has no contrast to stretch and yields all zeros.
    """
    lowest, highest = image.min(), image.max()
    if highest == lowest:
        # Min-max scaling would be 0/0 here and fill the result with NaNs.
        return np.zeros_like(image, dtype=float)
    image = (image - lowest) / (highest - lowest)
    mean = image.mean()
    std = image.std()
    f = std / t_sd  # width of the [black, white] window
    black = mean - t_mean * f
    white = mean + (1 - t_mean) * f
    new_image = np.clip(image, black, white)
    new_image = (new_image - black) / (white - black)
    return new_image
def load_data(file_path) -> np.ndarray:
    """Load a micrograph from an ``.h5`` or ``.mrc`` file and standardise it.

    For HDF5 files the dataset ``"micrograph"`` is used when present, otherwise
    ``"data"``. Its ``mean`` / ``std`` attributes are used when available;
    otherwise the statistics are computed from the pixel data.

    Args:
        file_path: path of the file to read; the extension is matched case-insensitively.

    Returns:
        ``(data - mean) / std`` computed from the float32 pixel data.

    Raises:
        ValueError: if the file extension is neither ``.h5`` nor ``.mrc``.
    """
    lowered = str(file_path).lower()
    if lowered.endswith('.h5'):
        with h5py.File(file_path, "r") as f:
            full_micrograph = f["micrograph"] if "micrograph" in f else f["data"]
            # Read the dataset once and reuse it for the statistics.
            data = full_micrograph[:].astype(np.float32)
            attrs = full_micrograph.attrs
            full_mean = attrs["mean"] if "mean" in attrs else data.mean()
            full_std = attrs["std"] if "std" in attrs else data.std()
    elif lowered.endswith('.mrc'):
        with mrcfile.open(file_path, "r") as f:
            data = f.data[:].astype(np.float32)
        full_mean = data.mean()
        full_std = data.std()
    else:
        raise ValueError("Unsupported file format. Please upload a .mrc or .h5 file.")
    if full_std == 0:
        # A constant micrograph has zero spread; only centre it instead of dividing by zero.
        full_std = 1.0
    data = (data - full_mean) / full_std
    return data
def display_crop(data, x_offset, y_offset, auto_contrast) -> "Image.Image | None":
    """Render a 1024x1024 crop of the micrograph as an 8-bit image.

    Args:
        data: 2-D micrograph array, or ``None`` when no file is loaded.
        x_offset: left edge of the crop, in pixels.
        y_offset: top edge of the crop, in pixels.
        auto_contrast: use ``Auto_contrast`` instead of plain min-max scaling.

    Returns:
        The crop as a PIL image, or ``None`` when there is nothing to show
        (no data loaded, or the offsets lie outside the micrograph).
    """
    if data is None:
        # The sliders can fire before any file has been uploaded.
        return None
    # Slider values may arrive as floats; slicing needs integers.
    x_offset, y_offset = int(x_offset), int(y_offset)
    crop = data[y_offset:y_offset + 1024, x_offset:x_offset + 1024]
    if crop.size == 0:
        return None
    if auto_contrast:
        original_image_normalized = Auto_contrast(crop)
    else:
        lowest, highest = crop.min(), crop.max()
        if highest > lowest:
            original_image_normalized = (crop - lowest) / (highest - lowest)
        else:
            # Flat crop: avoid 0/0 and show it as black.
            original_image_normalized = np.zeros_like(crop)
    input_image = Image.fromarray((original_image_normalized * 255).astype(np.uint8))
    return input_image
def process_and_denoise(data, x_offset, y_offset, auto_contrast) -> "Image.Image | None":
    """Denoise a 1024x1024 crop of the micrograph and render it as an 8-bit image.

    Args:
        data: 2-D micrograph array, or ``None`` when no file is loaded.
        x_offset: left edge of the crop, in pixels.
        y_offset: top edge of the crop, in pixels.
        auto_contrast: use ``Auto_contrast`` instead of plain min-max scaling.

    Returns:
        The denoised crop as a PIL image, or ``None`` when there is nothing to
        denoise (no data loaded, or the offsets lie outside the micrograph).
    """
    if data is None:
        # The button can be clicked before any file has been uploaded.
        return None
    # Slider values may arrive as floats; slicing needs integers.
    x_offset, y_offset = int(x_offset), int(y_offset)
    crop = data[y_offset:y_offset + 1024, x_offset:x_offset + 1024]
    if crop.size == 0:
        return None
    denoised_data = denoiser.inference(Image.fromarray(crop))
    denoised_data = denoised_data.squeeze()
    if auto_contrast:
        denoised_image_normalized = Auto_contrast(denoised_data)
    else:
        lowest, highest = denoised_data.min(), denoised_data.max()
        if highest > lowest:
            denoised_image_normalized = (denoised_data - lowest) / (highest - lowest)
        else:
            # Flat output: avoid 0/0 and show it as black.
            denoised_image_normalized = np.zeros_like(denoised_data)
    denoised_image = Image.fromarray((denoised_image_normalized * 255).astype(np.uint8))
    return denoised_image
def clear_images() -> tuple:
    """Return reset values: three ``None`` entries, then two slider updates restoring a maximum of 512."""
    cleared = [None] * 3
    slider_resets = [gr.update(maximum=512) for _ in range(2)]
    return tuple(cleared + slider_resets)
with gr.Blocks(css="""
    .gradio-container {
        background-color: #f7f9fc;
        font-family: Arial, sans-serif;
    }
    .title-text {
        text-align: center;
        font-size: 30px;
        font-weight: bold;
        margin-bottom: 10px;
    }
    .description-text {
        text-align: center;
        font-size: 18px;
        margin-bottom: 20px;
    }
""") as demo:
    # Centered Title and Description
    with gr.Column():
        gr.Markdown(
            """
            <div style="text-align: center; font-size: 30px; font-weight: bold; margin-bottom: 10px;">
                Denoising Demo
            </div>
            <div style="text-align: center; font-size: 18px;">
                Upload a Raw file or select an example to view the original and denoised images
            </div>
            """
        )
        file_input = gr.File(label="Or upload a Micrograph File in .h5 or .mrc format")
        auto_contrast = gr.Checkbox(label="Enable Auto Contrast", value=False)
        x_slider = gr.Slider(0, 512, step=10, label="X Offset")
        y_slider = gr.Slider(0, 512, step=10, label="Y Offset")
        with gr.Row():
            denoise_button = gr.Button("Denoise")
            clear_button = gr.Button("Clear")
    with gr.Row():
        with gr.Column():
            original_image = gr.Image(type="pil", label="Original Image")
        with gr.Column():
            denoised_image = gr.Image(type="pil", label="Denoised Image")
    # Holds the loaded micrograph between events.
    active_data = gr.State()

    def load_image_and_update_sliders(file_path) -> tuple:
        """Load a micrograph and size the offset sliders so a 1024-pixel crop stays inside it.

        Returns:
            ``(data, x_slider update, y_slider update)``.
        """
        data = load_data(file_path)
        h, w = data.shape[:2]
        # Clamp at 0: a micrograph smaller than the crop would otherwise give a negative maximum.
        return data, gr.update(maximum=max(w - 1024, 0)), gr.update(maximum=max(h - 1024, 0))

    def on_file_change(file) -> tuple:
        """Handle a new or removed upload; returns one value per output of `file_input.change`."""
        if file:
            return load_image_and_update_sliders(file.name)
        # Exactly three values, matching [active_data, x_slider, y_slider].
        return None, gr.update(maximum=512), gr.update(maximum=512)

    file_input.clear(
        clear_images,
        inputs=None,
        outputs=[original_image, denoised_image, active_data, x_slider, y_slider]
    )
    file_input.change(
        on_file_change,
        inputs=file_input,
        outputs=[active_data, x_slider, y_slider]
    )
    x_slider.change(
        display_crop,
        inputs=[active_data, x_slider, y_slider, auto_contrast],
        outputs=original_image
    )
    y_slider.change(
        display_crop,
        inputs=[active_data, x_slider, y_slider, auto_contrast],
        outputs=original_image
    )
    denoise_button.click(
        process_and_denoise,
        inputs=[active_data, x_slider, y_slider, auto_contrast],
        outputs=denoised_image
    )
    clear_button.click(clear_images, inputs=None, outputs=[original_image, denoised_image, active_data, x_slider, y_slider])
 demo.launch()

draco.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+MODEL:
+  NAME: DracoDenoiseAutoencoder
+  DEVICE: cuda
+  IMG_SIZE: 1024
+  PATCH_SIZE: 32
+  IN_CHANS: 1
+  VIT_SCALE: base
+  DYNAMIC_IMG_SIZE: True
+  DYNAMIC_IMG_PAD: True
+  DECODER_EMBED_DIM: 512
+  DECODER_DEPTH: 8
+  DECODER_NUM_HEADS: 16
+  DECODER_USE_NECK: True
+  DECODER_NECK_DIM: 256
+  USE_ABS_POS: true
+  USE_DECODER_NECK: True
+  WINDOW_SIZE: 28
+  DECODER_GLOBAL_ATTN_INDEXES: [3, 7]

draco/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+ import draco.model

draco/configuration/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .config import CfgNode
+from .configurable import configurable
# Export every public (non-underscore) name bound in this module so far.
__all__ = [name for name in globals() if not name.startswith("_")]

draco/configuration/base.yaml ADDED Viewed

	@@ -0,0 +1,57 @@

+DATALOADER:
+  BATCH_SIZE: 0
+  NUM_WORKERS: 0
+  PIN_MEMORY: False
+  DROP_LAST: False
+  PERSISTENT_WORKERS: False
+DATASET:
+  NAME: null
+  TRANSFORM:
+    NAME: null
+MODEL:
+  NAME: null
+  DEVICE: cuda
+  METRIC:
+    NAME: null
+    TYPE: null
+MODULE:
+  NAME: null
+  COMPILE: False
+  OPTIMIZER:
+    NAME: null
+  SCHEDULER:
+    NAME: null
+TRAINER:
+  STRATEGY: auto        # Set to `auto`, `ddp`, `deepspeed_stage_2`, `deepspeed_stage_3` ...
+  MIXED_PRECISION: False
+  CHECKPOINT:
+    EVERY_N_EPOCHS: 10
+    SAVE_BEST: False    # If True, monitor will be required
+    MONITOR: null
+    MONITOR_MODE: min   # Set to `min` or `max`
+  MAX_EPOCHS: -1        # If profiler is enabled, this will be *automatically* set to 1
+  LOG_EVERY_N_STEPS: 1
+  ACCUMULATE_GRAD_BATCHES: 1
+  CLIP_GRAD:
+    ALGORITHM: null
+    VALUE: null
+  DETERMINISTIC: False  # Set to True to enable cudnn.deterministic
+  BENCHMARK: False      # Set to True to enable cudnn.benchmark
+  PROFILER: null        # Set to `advanced` or `pytorch` to enable profiling
+  DETECT_ANOMALY: False # Set to True to enable anomaly detection
+  SYNC_BATCHNORM: False # Set to True to enable sync batchnorm
+SEED: null
+OUTPUT_DIR: null

draco/configuration/config.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import os.path
+from typing import Any
+from omegaconf import DictConfig, OmegaConf
# Key naming the parent config file(s) a YAML file inherits from.
BASE_KEY = "_BASE_"
# Optional top-level key under which the actual config may be nested.
ROOT_KEY = "cfg"


class CfgNode(OmegaConf):
    """
    A wrapper around OmegaConf that provides some additional functionality.
    """

    @staticmethod
    def load_yaml_with_base(filename: str | os.PathLike) -> DictConfig:
        """Load a YAML config, recursively merging the files listed under ``_BASE_``.

        ``_BASE_`` may be a single path or a list of paths. Bases are merged in
        order and the current file is merged last, so it overrides them. Paths
        that do not start with ``/``, ``http://`` or ``https://`` are resolved
        relative to the directory of ``filename`` (after ``~`` expansion).

        When the file has no ``_BASE_`` key but has a top-level ``cfg`` key,
        the content under ``cfg`` is returned.

        Args:
            filename: path of the YAML file to load.

        Returns:
            The merged configuration.
        """
        cfg = OmegaConf.load(filename)

        def _load_with_base(base_cfg_file: str) -> DictConfig:
            if base_cfg_file.startswith("~"):
                base_cfg_file = os.path.expanduser(base_cfg_file)
            if not base_cfg_file.startswith(("/", "https://", "http://")):
                # the path to base cfg is relative to the config file itself.
                base_cfg_file = os.path.join(os.path.dirname(filename), base_cfg_file)
            return CfgNode.load_yaml_with_base(base_cfg_file)

        if BASE_KEY in cfg:
            base_entry = cfg[BASE_KEY]
            # OmegaConf wraps YAML sequences in ListConfig, which is not a `list`
            # subclass, so a plain isinstance(list) check never matches.
            if OmegaConf.is_list(base_entry) or isinstance(base_entry, (list, tuple)):
                base_cfg: Any = {}
                for base_cfg_file in base_entry:
                    base_cfg = CfgNode.merge(base_cfg, _load_with_base(base_cfg_file))
            else:
                base_cfg = _load_with_base(base_entry)
            del cfg[BASE_KEY]
            base_cfg = CfgNode.merge(base_cfg, cfg)
            return base_cfg
        if ROOT_KEY in cfg:
            return cfg[ROOT_KEY]
        return cfg

    @staticmethod
    def merge_with_dotlist(cfg: DictConfig, dotlist: list[str]) -> None:
        """Merge ``[key1, value1, key2, value2, ...]`` overrides into ``cfg`` in place.

        Args:
            cfg: configuration to update.
            dotlist: flat list alternating dotted keys and their values.

        Raises:
            ValueError: if ``dotlist`` has an odd number of entries, i.e. a key without a value.
        """
        if not dotlist:
            return
        if len(dotlist) % 2 != 0:
            # zip() would silently drop the dangling key.
            raise ValueError(f"Override list must contain key/value pairs, got {len(dotlist)} entries.")
        new_dotlist = [f"{key}={value}" for key, value in zip(dotlist[::2], dotlist[1::2])]
        cfg.merge_with_dotlist(new_dotlist)

draco/configuration/configurable.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import functools
+import inspect
+from typing import Any, Callable
+from omegaconf import DictConfig
+__all__ = ["configurable"]
def _called_with_cfg(*args, **kwargs) -> bool:
    """
    Tell whether a call passes a `DictConfig`, either as its first
    positional argument or as the `cfg` keyword argument.
    Returns:
        (bool): True when such a `DictConfig` is present, False otherwise.
    """
    first_positional_is_cfg = bool(args) and isinstance(args[0], DictConfig)
    return first_positional_is_cfg or isinstance(kwargs.get("cfg"), DictConfig)
+def _get_args_from_cfg(from_config_func: Callable[[Any], dict[str, Any]], *args, **kwargs) -> dict[str, Any]:
+    """
+    Get the input arguments of the decorated function from a `DictConfig` object.
+    Returns:
+        (dict): The input arguments of the class `__init__` method.
+    """
+    signature = inspect.signature(from_config_func)
+    if list(signature.parameters.keys())[0] != "cfg":
+        raise ValueError("The first argument of `{}` must be named as `cfg`.".format(from_config_func.__name__))
+    # Forwarding all arguments to `from_config`, if the arguments of `from_config` are only `*args` or `*kwargs`.
+    if any(param.kind in [param.VAR_POSITIONAL or param.VAR_KEYWORD] for param in signature.parameters.values()):
+        result = from_config_func(*args, **kwargs)
+    # If there is any positional arguments.
+    else:
+        positional_args_name = set(signature.parameters.keys())
+        extra_kwargs = {}
+        for name in kwargs.keys():
+            if name not in positional_args_name:
+                extra_kwargs[name] = kwargs.pop(name)
+        result = from_config_func(*args, **kwargs)
+        # These args are forwarded directly to `__init__` method.
+        result.update(extra_kwargs)
+    return result
+def configurable(init_func: Callable = None, *, from_config: Callable[[Any], dict[str, Any]] | None = None) -> Callable:
+    """
+    A decorator of a function or a class `__init__` method,
+    to make it configurable by a `DictConfig` object.
+    Example:
+    ```python
+    # 1. Decorate a function.
+    @configurable(from_config=lambda cfg: { "x": cfg.x })
+    def func(x, y=2, z=3):
+        pass
+    a1 = func(x=1, y=2) # Call with regular args.
+    a2 = func(cfg) # Call with a `DictConfig` object.
+    a3 = func(cfg, y=2, z=3) # Call with a `DictConfig` object and regular arguments.
+    # 2. Decorate a class `__init__` method.
+    class A:
+        @configurable
+        def __init__(self, *args, **kwargs) -> None:
+            pass
+        @classmethod
+        def from_config(cls, cfg) -> dict:
+            pass
+    a1 = A(x, y) # Call with regular constructor.
+    a2 = A(cfg) # Call with a `DictConfig` object.
+    a3 = A(cfg, x, y) # Call with a `DictConfig` object and regular arguments.
+    ```
+    Args:
+        `init_func` (callable): a function or a class method.
+        `from_config` (callable): a function that converts a `DictConfig` to the
+            input arguments of the decorated function.
+            It is always required.
+    """
+    # Decorating a function
+    if init_func is None:
+        # Prevent common misuse: `@configurable()`.
+        if from_config is None:
+            return configurable
+        assert inspect.isfunction(from_config), "`from_config` must be a function."
+        def wrapper(func):
+            @functools.wraps(func)
+            def wrapped(*args, **kwargs):
+                if _called_with_cfg(*args, **kwargs):
+                    explicit_args = _get_args_from_cfg(from_config, *args, **kwargs)
+                    return func(**explicit_args)
+                else:
+                    return func(*args, **kwargs)
+            wrapped.from_config = from_config
+            return wrapped
+        return wrapper
+    # Decorating a class `__init__` method
+    else:
+        assert(
+            inspect.isfunction(init_func) and from_config is None and init_func.__name__ == "__init__"
+        ), "Invalid usage of @configurable."
+        @functools.wraps(init_func)
+        def wrapped(self, *args, **kwargs):
+            try:
+                from_config_func = getattr(self, "from_config")
+            except AttributeError as e:
+                raise AttributeError("Class with `@configurable` should have a `from_config` classmethod.") from e
+            if not inspect.ismethod(from_config_func):
+                raise AttributeError("Class with `@configurable` should have a `from_config` classmethod.")
+            if _called_with_cfg(*args, **kwargs):
+                explicit_args = _get_args_from_cfg(from_config_func, *args, **kwargs)
+                init_func(self, **explicit_args)
+            else:
+                init_func(self, *args, **kwargs)
+        return wrapped

draco/configuration/draco2d-b_triplet_pretrain.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+_BASE_: base.yaml
+MODEL:
+  NAME: DenoisingReconstructionAutoencoderVisionTransformer2d
+  IMG_SIZE: 256
+  PATCH_SIZE: 16
+  IN_CHANS: 1
+  VIT_SCALE: base
+  DYNAMIC_IMG_SIZE: False
+  DYNAMIC_IMG_PAD: False
+  USE_ABS_POS: True
+  DECODER_EMBED_DIM: 512
+  DECODER_DEPTH: 8
+  DECODER_NUM_HEADS: 16
+  DECODER_USE_NECK: True
+  DECODER_NECK_DIM: 256
+SEED: 0
+OUTPUT_DIR: null

draco/configuration/draco2d-h_triplet_pretrain.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

# NOTE(review): previously pointed at `draco-b_imagenet_pretrain.yaml`, which is not
# among the configs in this directory; the base-scale config is assumed to be the intended parent — confirm.
_BASE_: draco2d-b_triplet_pretrain.yaml
MODEL:
  PATCH_SIZE: 14
  VIT_SCALE: huge

draco/configuration/draco2d-l_triplet_pretrain.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

# NOTE(review): previously pointed at `draco-b_imagenet_pretrain.yaml`, which is not
# among the configs in this directory; the base-scale config is assumed to be the intended parent — confirm.
_BASE_: draco2d-b_triplet_pretrain.yaml
MODEL:
  VIT_SCALE: large

draco/model/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .build import MODEL_REGISTRY, build_model
+from .checkpoint import load_pretrained
+from .draco2d import DenoisingReconstructionAutoencoderVisionTransformer2d, DracoDenoiseAutoencoder
# Export every public (non-underscore) name bound in this module so far.
__all__ = [name for name in globals() if not name.startswith("_")]

draco/model/build.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from fvcore.common.registry import Registry
+from omegaconf import DictConfig
+import torch
+__all__ = ["MODEL_REGISTRY", "build_model"]
+MODEL_REGISTRY = Registry("MODEL")
+MODEL_REGISTRY.__doc__ = "Registry for the model."
def build_model(cfg: DictConfig) -> torch.nn.Module:
    """
    Build the model registered under `cfg.MODEL.NAME`.
    The model is constructed from `cfg` and returned as-is: it is not moved
    to any device here, and no checkpoint is loaded from `cfg`.

    Args:
        cfg: configuration; `cfg.MODEL.NAME` selects the registered model class.
    Returns:
        The constructed model.
    Raises:
        KeyError: if no model is registered under `cfg.MODEL.NAME`.
    """
    model_name = cfg.MODEL.NAME
    # Keep only the registry lookup inside the `try`, so a KeyError raised while
    # constructing the model is not misreported as a missing registration.
    try:
        model_cls = MODEL_REGISTRY.get(model_name)
    except KeyError as e:
        raise KeyError(f"Model `{model_name}` is not registered in {MODEL_REGISTRY}") from e
    return model_cls(cfg)

draco/model/checkpoint.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from pathlib import Path
+from typing import Any
+import torch
+def _strip_prefix_if_present(state_dict: dict[str, Any], prefix: str) -> None:
+    """
+    Strip the prefix in metadata, if any.
+    Args:
+        state_dict (OrderedDict): a state-dict to be loaded to the model.
+        prefix (str): prefix.
+    """
+    keys = sorted(state_dict.keys())
+    if not all(len(key) == 0 or key.startswith(prefix) for key in keys):
+        return
+    for key in keys:
+        newkey = key[len(prefix) :]
+        state_dict[newkey] = state_dict.pop(key)
+    # also strip the prefix in metadata, if any..
+    try:
+        metadata = state_dict._metadata  # pyre-ignore
+    except AttributeError:
+        pass
+    else:
+        for key in list(metadata.keys()):
+            # for the metadata dict, the key can be:
+            # '': for the DDP module, which we want to remove.
+            # 'module': for the actual model.
+            # 'module.xx.xx': for the rest.
+            if len(key) == 0:
+                continue
+            newkey = key[len(prefix) :]
+            metadata[newkey] = metadata.pop(key)
def load_pretrained(model: torch.nn.Module, ckpt_path: Path, device: torch.device = "cuda") -> torch.nn.Module:
    """
    Initialise `model` from the checkpoint stored at `ckpt_path`.

    The weights are taken from the `"state_dict"` entry of the checkpoint, else
    from its `"model"` entry, else the checkpoint itself is used. Known wrapper
    prefixes are stripped from the keys before a non-strict load, and the
    result of the load is printed.

    Args:
        model: module that receives the weights.
        ckpt_path: checkpoint file to read.
        device: passed to `torch.load` as `map_location`.
    Returns:
        The same `model` instance.
    """
    # NOTE(review): `weights_only=False` unpickles arbitrary objects, which can run
    # code from the file — only load checkpoints from a trusted source.
    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
    checkpoint_state_dict = ckpt
    for container_key in ("state_dict", "model"):
        if container_key in ckpt:
            checkpoint_state_dict = ckpt[container_key]
            break
    wrapper_prefixes = (
        "module.",     # for DistributedDataParallel
        "model.",      # for PyTorch Lightning Module
        "_orig_mod.",  # for torch.compile
    )
    for wrapper_prefix in wrapper_prefixes:
        _strip_prefix_if_present(checkpoint_state_dict, wrapper_prefix)
    msg = model.load_state_dict(checkpoint_state_dict, strict=False)
    print(f"Loaded pre-trained model from {ckpt_path} with message: {msg}")
    return model

draco/model/draco2d.py ADDED Viewed

	@@ -0,0 +1,663 @@

+from functools import partial
+from typing import Any, Callable
+from omegaconf import DictConfig
+from timm.layers import build_sincos2d_pos_embed, resample_abs_pos_embed_nhwc, PatchEmbed, Mlp, LayerType
+from timm.models.vision_transformer import Block
+from timm.models.vision_transformer_sam import Block as SAMBlock
+import torch
+import torch.nn as nn
+from draco.configuration import configurable
+from .build import MODEL_REGISTRY
+from .layer import LayerNorm2d
+from .draco_base import DenoisingReconstructionAutoencoderVisionTransformerBase
+from .utils.constant import get_vit_scale, get_global_attn_indexes
+__all__ = ["DenoisingReconstructionAutoencoderVisionTransformer2d", "DracoDenoiseAutoencoder"]
@MODEL_REGISTRY.register()
class DenoisingReconstructionAutoencoderVisionTransformer2d(DenoisingReconstructionAutoencoderVisionTransformerBase):
    """
    2D autoencoder made of a transformer encoder and a transformer decoder.

    The encoder embeds the image into patch tokens and processes only the
    tokens left visible by the mask; the decoder fills the masked positions
    with a learned mask token and predicts `patch_size ** 2 * in_chans` values
    per patch. The mask itself comes from `random_masking` of the base class.
    """

    @configurable
    def __init__(self, *,
        img_size: int = 224,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_layer: Callable = PatchEmbed,
        dynamic_img_size: bool = False,
        dynamic_img_pad: bool = False,
        use_abs_pos: bool = True,
        block_fn: nn.Module = Block,
        norm_layer: LayerType = partial(nn.LayerNorm, eps=1e-6),
        act_layer: LayerType = nn.GELU,
        mlp_layer: nn.Module = Mlp,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        decoder_block_fn: nn.Module = Block,
        decoder_norm_layer: LayerType = partial(nn.LayerNorm, eps=1e-6),
        decoder_act_layer: LayerType = nn.GELU,
        decoder_mlp_layer: nn.Module = Mlp,
        decoder_embed_dim: int = 512,
        decoder_depth: int = 8,
        decoder_num_heads: int = 16,
        decoder_use_neck: bool = True,
        decoder_neck_dim: int = 256,
    ) -> None:
        """
        Build the encoder, then the decoder, then initialise the weights.

        All arguments are keyword-only. Because of `@configurable`, the
        constructor can also be called with a `DictConfig`, in which case the
        arguments are produced by `from_config`.
        """
        super().__init__()
        self.dynamic_img_size = dynamic_img_size
        self.decoder_use_neck = decoder_use_neck
        self.init_encoder(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_layer=embed_layer,
            dynamic_img_size=dynamic_img_size,
            dynamic_img_pad=dynamic_img_pad,
            use_abs_pos=use_abs_pos,
            block_fn=block_fn,
            norm_layer=norm_layer,
            act_layer=act_layer,
            mlp_layer=mlp_layer,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
        )
        # The decoder reads `self.patch_embed.grid_size`, so it must be built after the encoder.
        self.init_decoder(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            use_abs_pos=use_abs_pos,
            decoder_block_fn=decoder_block_fn,
            decoder_norm_layer=decoder_norm_layer,
            decoder_act_layer=decoder_act_layer,
            decoder_mlp_layer=decoder_mlp_layer,
            decoder_embed_dim=decoder_embed_dim,
            decoder_depth=decoder_depth,
            decoder_num_heads=decoder_num_heads,
            decoder_use_neck=decoder_use_neck,
            decoder_neck_dim=decoder_neck_dim,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
        )
        self.init_weights(
            grid_size=self.patch_embed.grid_size,
            embed_dim=embed_dim,
            decoder_embed_dim=decoder_embed_dim,
        )

    @classmethod
    def from_config(cls, cfg: DictConfig) -> dict[str, Any]:
        """
        Translate `cfg.MODEL.*` entries into constructor keyword arguments.

        The encoder width, depth and head count are looked up from
        `cfg.MODEL.VIT_SCALE` through `get_vit_scale`.
        """
        embed_dim, depth, num_heads = get_vit_scale(cfg.MODEL.VIT_SCALE)
        return {
            "img_size": cfg.MODEL.IMG_SIZE,
            "patch_size": cfg.MODEL.PATCH_SIZE,
            "in_chans": cfg.MODEL.IN_CHANS,
            "dynamic_img_size": cfg.MODEL.DYNAMIC_IMG_SIZE,
            "dynamic_img_pad": cfg.MODEL.DYNAMIC_IMG_PAD,
            "use_abs_pos": cfg.MODEL.USE_ABS_POS,
            "embed_dim": embed_dim,
            "depth": depth,
            "num_heads": num_heads,
            "decoder_embed_dim": cfg.MODEL.DECODER_EMBED_DIM,
            "decoder_depth": cfg.MODEL.DECODER_DEPTH,
            "decoder_num_heads": cfg.MODEL.DECODER_NUM_HEADS,
            "decoder_use_neck": cfg.MODEL.DECODER_USE_NECK,
            "decoder_neck_dim": cfg.MODEL.DECODER_NECK_DIM,
        }

    def init_encoder(self, *,
        img_size: int,
        patch_size: int,
        in_chans: int,
        embed_layer: Callable,
        dynamic_img_size: bool,
        dynamic_img_pad: bool,
        use_abs_pos: bool,
        block_fn: nn.Module,
        norm_layer: LayerType | None,
        act_layer: LayerType | None,
        mlp_layer: nn.Module,
        embed_dim: int,
        depth: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool,
        qk_norm: bool,
    ) -> None:
        """
        Create the encoder: `patch_embed`, the optional `pos_embed`,
        `depth` transformer `blocks` and the final `norm`.
        """
        embed_args = {}
        if dynamic_img_size:
            # Pass `strict_img_size=False` to the embedding layer when dynamic sizes are enabled.
            embed_args.update(dict(strict_img_size=False))
        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            dynamic_img_pad=dynamic_img_pad,
            output_fmt="NHWC",
            **embed_args
        )
        # One embedding vector per cell of the patch grid; `None` disables absolute positions.
        self.pos_embed = nn.Parameter(torch.zeros(1, *self.patch_embed.grid_size, embed_dim)) if use_abs_pos else None
        self.blocks = nn.ModuleList([
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_norm=qk_norm,
                norm_layer=norm_layer,
                act_layer=act_layer,
                mlp_layer=mlp_layer,
            ) for _ in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

    def init_decoder(self, *,
        patch_size: int,
        in_chans: int,
        embed_dim: int,
        use_abs_pos: bool,
        decoder_block_fn: nn.Module,
        decoder_norm_layer: LayerType | None,
        decoder_act_layer: LayerType | None,
        decoder_mlp_layer: nn.Module,
        decoder_embed_dim: int,
        decoder_depth: int,
        decoder_num_heads: int,
        decoder_use_neck: bool,
        decoder_neck_dim: int,
        mlp_ratio: float,
        qkv_bias: bool,
        qk_norm: bool,
    ) -> None:
        """
        Create the decoder: `mask_token`, the `decoder_embed` projection, the
        optional `decoder_pos_embed`, `decoder_depth` transformer blocks,
        `decoder_norm`, the optional convolutional `decoder_neck` and the
        per-patch prediction head `decoder_pred`.

        Must run after `init_encoder`, because it reads `self.patch_embed.grid_size`.
        """
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim)
        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, *self.patch_embed.grid_size, decoder_embed_dim)) if use_abs_pos else None
        self.decoder_blocks = nn.ModuleList([
            decoder_block_fn(
                dim=decoder_embed_dim,
                num_heads=decoder_num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_norm=qk_norm,
                norm_layer=decoder_norm_layer,
                act_layer=decoder_act_layer,
                mlp_layer=decoder_mlp_layer,
            ) for _ in range(decoder_depth)
        ])
        self.decoder_norm = decoder_norm_layer(decoder_embed_dim)
        if decoder_use_neck:
            # Bottleneck: 1x1 conv down to `decoder_neck_dim`, 3x3 conv, 1x1 conv back up.
            self.decoder_neck = nn.Sequential(
                nn.Conv2d(
                    in_channels=decoder_embed_dim,
                    out_channels=decoder_neck_dim,
                    kernel_size=1,
                    bias=False,
                ),
                LayerNorm2d(decoder_neck_dim),
                decoder_act_layer(),
                nn.Conv2d(
                    in_channels=decoder_neck_dim,
                    out_channels=decoder_neck_dim,
                    kernel_size=3,
                    padding=1,
                    bias=False,
                ),
                LayerNorm2d(decoder_neck_dim),
                decoder_act_layer(),
                nn.Conv2d(
                    in_channels=decoder_neck_dim,
                    out_channels=decoder_embed_dim,
                    kernel_size=1,
                    bias=False,
                ),
                LayerNorm2d(decoder_embed_dim),
            )
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_chans)

    def init_weights(self, *,
        grid_size: tuple[int, int],
        embed_dim: int,
        decoder_embed_dim: int
    ) -> None:
        """
        Initialise the parameters created by `init_encoder` and `init_decoder`.

        Args:
            grid_size: patch grid used to build the sin-cos position tables.
            embed_dim: channel width of the encoder position table.
            decoder_embed_dim: channel width of the decoder position table.
        """
        # Xavier-initialise the patch projection, viewing its weight as a 2D matrix.
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view(w.size(0), -1))
        torch.nn.init.normal_(self.mask_token, std=0.02)
        if self.pos_embed is not None:
            # NOTE(review): `.transpose(1, 2)` swaps the two grid axes of the table;
            # the copy only matches the parameter shape for square grids — confirm intent.
            self.pos_embed.data.copy_(build_sincos2d_pos_embed(
                feat_shape=grid_size,
                dim=embed_dim,
                interleave_sin_cos=True
            ).reshape(1, *grid_size, -1).transpose(1, 2))
        if self.decoder_pos_embed is not None:
            self.decoder_pos_embed.data.copy_(build_sincos2d_pos_embed(
                feat_shape=grid_size,
                dim=decoder_embed_dim,
                interleave_sin_cos=True
            ).reshape(1, *grid_size, -1).transpose(1, 2))
        if self.decoder_use_neck:
            for m in self.decoder_neck.modules():
                if isinstance(m, nn.Conv2d):
                    nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                    if m.bias is not None:
                        nn.init.zeros_(m.bias)
            # Zero the last neck layer so the residual neck branch initially outputs zeros.
            nn.init.zeros_(self.decoder_neck[-1].weight)
            nn.init.zeros_(self.decoder_neck[-1].bias)
        # Finally (re-)initialise every nn.Linear in the model.
        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module) -> None:
        """Xavier-initialise the weight of an `nn.Linear` and zero its bias; other modules are left alone."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.0)

    def forward_encoder(self, x: torch.Tensor, mask_ratio: float) -> tuple[torch.Tensor, torch.BoolTensor, int, int]:
        """
        Embed the image, drop the masked tokens and encode the remaining ones.

        Returns:
            `(tokens, mask, H, W)`: the encoded visible tokens, the mask returned
            by `random_masking`, and the height and width of the patch grid.
        """
        x = self.patch_embed(x)
        B, H, W, E = x.shape
        if self.pos_embed is not None:
            # Resample the position table to the actual patch grid before adding it.
            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, (H, W))
        x = x.view(B, -1, E)
        mask = super().random_masking(x, mask_ratio)
        # Keep only the tokens where the mask is False.
        # NOTE(review): the reshape assumes every sample keeps the same number of tokens — confirm in `random_masking`.
        x = x[~mask].reshape(B, -1, E)
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)
        return x, mask, H, W

    def forward_decoder(self, x: torch.Tensor, mask: torch.BoolTensor, H: int, W: int) -> torch.Tensor:
        """
        Decode the visible tokens into one prediction per patch.

        Args:
            x: encoder output for the visible tokens.
            mask: boolean mask with one entry per patch; False marks the visible tokens.
            H: height of the patch grid.
            W: width of the patch grid.

        Returns:
            The output of `decoder_pred` for every position of the patch grid.
        """
        x = self.decoder_embed(x)
        B, L = mask.shape
        E = x.shape[-1]
        # Start from mask tokens everywhere, then write the visible tokens back into their slots.
        mask_tokens = self.mask_token.repeat(B, L, 1).to(x.dtype)
        mask_tokens[~mask] = x.reshape(-1, E)
        x = mask_tokens
        if self.decoder_pos_embed is not None:
            x = x.view(B, H, W, E)
            x = x + resample_abs_pos_embed_nhwc(self.decoder_pos_embed, (H, W))
            x = x.view(B, -1, E)
        for block in self.decoder_blocks:
            x = block(x)
        x = self.decoder_norm(x)
        if self.decoder_use_neck:
            # Residual neck: tokens -> (B, E, H, W) feature map -> convolutions -> back to tokens.
            x = x + self.decoder_neck(
                x.permute(0, 2, 1).reshape(B, E, H, W).contiguous()
            ).permute(0, 2, 3, 1).reshape(B, L, -1).contiguous()
        x = self.decoder_pred(x)
        return x

    def forward(self, x: torch.Tensor, mask_ratio: float) -> tuple[torch.Tensor, torch.BoolTensor]:
        """Run the encoder and the decoder; returns `(prediction, mask)`."""
        x, mask, H, W = self.forward_encoder(x, mask_ratio)
        x = self.forward_decoder(x, mask, H, W)
        return x, mask
+@MODEL_REGISTRY.register()
+class DracoDenoiseAutoencoder(DenoisingReconstructionAutoencoderVisionTransformerBase):
+    """
+    Masked Autoencoder (MAE) with Vision Transformer backbone.
+    Note that `cls_token` is discarded.
+    """
+    @configurable
+    def __init__(self, *,
+        img_size: int = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_layer: Callable = PatchEmbed,
+        dynamic_img_size: bool = False,
+        dynamic_img_pad: bool = False,
+        use_abs_pos: bool = True,
+        block_fn: nn.Module = SAMBlock,
+        norm_layer: LayerType = partial(nn.LayerNorm, eps=1e-6),
+        act_layer: LayerType = nn.GELU,
+        mlp_layer: nn.Module = Mlp,
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        window_size: int = 16,
+        global_attn_indexes: list[int] = [2, 5, 8, 11],
+        decoder_block_fn: nn.Module = SAMBlock,
+        decoder_norm_layer: LayerType = partial(nn.LayerNorm, eps=1e-6),
+        decoder_act_layer: LayerType = nn.GELU,
+        decoder_mlp_layer: nn.Module = Mlp,
+        decoder_embed_dim: int = 512,
+        decoder_depth: int = 8,
+        decoder_num_heads: int = 16,
+        decoder_use_neck: bool = True,
+        decoder_neck_dim: int = 256,
+        decoder_global_attn_indexes: list[int] = [3, 7],
+    ) -> None:
+        super().__init__()
+        self.dynamic_img_size = dynamic_img_size
+        self.decoder_use_neck = decoder_use_neck
+        self.init_encoder(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_layer=embed_layer,
+            dynamic_img_size=dynamic_img_size,
+            dynamic_img_pad=dynamic_img_pad,
+            use_abs_pos=use_abs_pos,
+            block_fn=block_fn,
+            norm_layer=norm_layer,
+            act_layer=act_layer,
+            mlp_layer=mlp_layer,
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            window_size=window_size,
+            global_attn_indexes=global_attn_indexes
+        )
+        self.init_decoder(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            use_abs_pos=use_abs_pos,
+            decoder_block_fn=decoder_block_fn,
+            decoder_norm_layer=decoder_norm_layer,
+            decoder_act_layer=decoder_act_layer,
+            decoder_mlp_layer=decoder_mlp_layer,
+            decoder_embed_dim=decoder_embed_dim,
+            decoder_depth=decoder_depth,
+            decoder_num_heads=decoder_num_heads,
+            decoder_use_neck=decoder_use_neck,
+            decoder_neck_dim=decoder_neck_dim,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            window_size=window_size,
+            decoder_global_attn_indexes=decoder_global_attn_indexes
+        )
+        self.init_weights(
+            grid_size=self.patch_embed.grid_size,
+            embed_dim=embed_dim,
+            decoder_embed_dim=decoder_embed_dim,
+        )
+    @classmethod
+    def from_config(cls, cfg: DictConfig) -> dict[str, Any]:
+        embed_dim, depth, num_heads = get_vit_scale(cfg.MODEL.VIT_SCALE)
+        global_attn_indexes = get_global_attn_indexes(depth)
+        return {
+            "img_size": cfg.MODEL.IMG_SIZE,
+            "patch_size": cfg.MODEL.PATCH_SIZE,
+            "in_chans": cfg.MODEL.IN_CHANS,
+            "dynamic_img_size": cfg.MODEL.DYNAMIC_IMG_SIZE,
+            "dynamic_img_pad": cfg.MODEL.DYNAMIC_IMG_PAD,
+            "use_abs_pos": cfg.MODEL.USE_ABS_POS,
+            "embed_dim": embed_dim,
+            "depth": depth,
+            "num_heads": num_heads,
+            "window_size": cfg.MODEL.WINDOW_SIZE,
+            "global_attn_indexes": global_attn_indexes,
+            "decoder_embed_dim": cfg.MODEL.DECODER_EMBED_DIM,
+            "decoder_depth": cfg.MODEL.DECODER_DEPTH,
+            "decoder_num_heads": cfg.MODEL.DECODER_NUM_HEADS,
+            "decoder_use_neck": cfg.MODEL.DECODER_USE_NECK,
+            "decoder_neck_dim": cfg.MODEL.DECODER_NECK_DIM,
+            "decoder_global_attn_indexes": cfg.MODEL.DECODER_GLOBAL_ATTN_INDEXES,
+        }
+    def init_encoder(self, *,
+        img_size: int,
+        patch_size: int,
+        in_chans: int,
+        embed_layer: Callable,
+        dynamic_img_size: bool,
+        dynamic_img_pad: bool,
+        use_abs_pos: bool,
+        block_fn: nn.Module,
+        norm_layer: LayerType | None,
+        act_layer: LayerType | None,
+        mlp_layer: nn.Module,
+        embed_dim: int,
+        depth: int,
+        num_heads: int,
+        mlp_ratio: float,
+        qkv_bias: bool,
+        qk_norm: bool,
+        window_size: int,
+        global_attn_indexes: list,
+    ) -> None:
+        embed_args = {}
+        if dynamic_img_size:
+            # flatten deferred until after pos embed
+            embed_args.update(dict(strict_img_size=False))
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            dynamic_img_pad=dynamic_img_pad,
+            output_fmt="NHWC",
+            **embed_args
+        )
+        self.pos_embed = nn.Parameter(torch.zeros(1, *self.patch_embed.grid_size, embed_dim)) if use_abs_pos else None
+        self.blocks = nn.ModuleList(
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_norm=qk_norm,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                mlp_layer=mlp_layer,
+                use_rel_pos=True,
+                window_size=window_size if i not in global_attn_indexes else 0,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            ) for i in range(depth)
+        )
+        self.norm = norm_layer(embed_dim)
+    def init_decoder(self, *,
+        img_size: int,
+        patch_size: int,
+        in_chans: int,
+        embed_dim: int,
+        use_abs_pos: bool,
+        decoder_block_fn: nn.Module,
+        decoder_norm_layer: LayerType | None,
+        decoder_act_layer: LayerType | None,
+        decoder_mlp_layer: nn.Module,
+        decoder_embed_dim: int,
+        decoder_depth: int,
+        decoder_num_heads: int,
+        decoder_use_neck: bool,
+        decoder_neck_dim: int,
+        mlp_ratio: float,
+        qkv_bias: bool,
+        qk_norm: bool,
+        window_size: int,
+        decoder_global_attn_indexes: list[int]
+    ) -> None:
+        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim)
+        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, *self.patch_embed.grid_size, decoder_embed_dim)) if use_abs_pos else None
+        self.decoder_blocks = nn.ModuleList(
+            decoder_block_fn(
+                dim=decoder_embed_dim,
+                num_heads=decoder_num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_norm=qk_norm,
+                norm_layer=decoder_norm_layer,
+                act_layer=decoder_act_layer,
+                mlp_layer=decoder_mlp_layer,
+                use_rel_pos=True,
+                window_size=window_size if i not in decoder_global_attn_indexes else 0,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            ) for i in range(decoder_depth)
+        )
+        self.decoder_norm = decoder_norm_layer(decoder_embed_dim)
+        if decoder_use_neck:
+            self.decoder_neck = nn.Sequential(
+                nn.Conv2d(
+                    in_channels=decoder_embed_dim,
+                    out_channels=decoder_neck_dim,
+                    kernel_size=1,
+                    bias=False,
+                ),
+                LayerNorm2d(decoder_neck_dim),
+                decoder_act_layer(),
+                nn.Conv2d(
+                    in_channels=decoder_neck_dim,
+                    out_channels=decoder_neck_dim,
+                    kernel_size=3,
+                    padding=1,
+                    bias=False,
+                ),
+                LayerNorm2d(decoder_neck_dim),
+                decoder_act_layer(),
+                nn.Conv2d(
+                    in_channels=decoder_neck_dim,
+                    out_channels=decoder_embed_dim,
+                    kernel_size=1,
+                    bias=False,
+                ),
+                LayerNorm2d(decoder_embed_dim),
+            )
+        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_chans)
+    def init_weights(self, *,
+        grid_size: tuple[int, int],
+        embed_dim: int,
+        decoder_embed_dim: int
+    ) -> None:
+        w = self.patch_embed.proj.weight.data
+        torch.nn.init.xavier_uniform_(w.view(w.size(0), -1))
+        if self.pos_embed is not None:
+            self.pos_embed.data.copy_(build_sincos2d_pos_embed(
+                feat_shape=grid_size,
+                dim=embed_dim,
+                interleave_sin_cos=True
+            ).reshape(1, *grid_size, -1).transpose(1, 2))
+        if self.decoder_pos_embed is not None:
+            self.decoder_pos_embed.data.copy_(build_sincos2d_pos_embed(
+                feat_shape=grid_size,
+                dim=decoder_embed_dim,
+                interleave_sin_cos=True
+            ).reshape(1, *grid_size, -1).transpose(1, 2))
+        # Zero-initialize the neck
+        if self.decoder_use_neck:
+            for m in self.decoder_neck.modules():
+                if isinstance(m, nn.Conv2d):
+                    nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+                    if m.bias is not None:
+                        nn.init.zeros_(m.bias)
+            nn.init.zeros_(self.decoder_neck[-1].weight)
+            nn.init.zeros_(self.decoder_neck[-1].bias)
+        self.apply(self._init_weights)
+    def _init_weights(self, module: nn.Module) -> None:
+        if isinstance(module, nn.Linear):
+            nn.init.xavier_uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0.0)
+    def forward_encoder(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]:
+        """
+        Forward pass of the encoder.
+        Args:
+            `x` (torch.Tensor): Image of shape [B, C, H, W].
+        Returns:
+            (torch.Tensor): Encoded image of shape [B, num_kept, E].
+            (int): Height of the encoded tokens.
+            (int): Width of the encoded tokens.
+        """
+        x = self.patch_embed(x)
+        B, H, W, E = x.shape
+        if self.pos_embed is not None:
+            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, (H, W))
+        for block in self.blocks:
+            x = block(x)
+        x = x.view(B, -1, E)
+        x = self.norm(x)
+        return x, H, W
+    def forward_decoder(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
+        """
+        Forward pass of the decoder.
+        Args:
+            `x` (torch.Tensor): Encoded image of shape [B, num_kept, E].
+            `H` (int): Height of the encoded tokens.
+            `W` (int): Width of the encoded tokens.
+        Returns:
+            (torch.Tensor): Decoded image of shape [B, L, E].
+        """
+        x = self.decoder_embed(x) # [B, num_kept, E]
+        B, L, E = x.shape
+        if self.decoder_pos_embed is not None:
+            x = x.view(B, H, W, E)
+            x = x + resample_abs_pos_embed_nhwc(self.decoder_pos_embed, (H, W))
+        for block in self.decoder_blocks:
+            x = block(x)
+        x = x.view(B, -1, E)
+        x = self.decoder_norm(x)
+        if self.decoder_use_neck:
+            x = x + self.decoder_neck(
+                x.permute(0, 2, 1).reshape(B, E, H, W).contiguous()
+            ).permute(0, 2, 3, 1).reshape(B, L, -1).contiguous()
+        x = self.decoder_pred(x)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            `x` (torch.Tensor): Image of shape [B, C, H, W].
+        Returns:
+            (torch.Tensor): The prediction of shape [B, L, E].
+        """
+        x, H, W = self.forward_encoder(x)
+        x = self.forward_decoder(x, H, W)
+        return x

draco/model/draco_base.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from abc import ABCMeta, abstractmethod
+import torch
+import torch.nn as nn
+class DenoisingReconstructionAutoencoderVisionTransformerBase(nn.Module, metaclass=ABCMeta):
+    def __init__(self) -> None:
+        super().__init__()
+    @torch.jit.ignore
+    def no_weight_decay(self) -> set:
+        return {"cls_token"}
+    @torch.jit.ignore
+    def group_matcher(self, coarse: bool = False) -> dict:
+        return dict(
+            stem=r'^(?:_orig_mod\.)?cls_token|^(?:_orig_mod\.)?pos_embed|^(?:_orig_mod\.)?patch_embed',
+            blocks=[(r'^(?:_orig_mod\.)?blocks\.(\d+)', None), (r'^(?:_orig_mod\.)?norm', (99999,))]
+        )
+    @classmethod
+    def random_masking(cls, x: torch.Tensor, mask_ratio: float) -> torch.BoolTensor:
+        B, L = x.shape[:2]
+        num_masked = int(L * mask_ratio)
+        noise = torch.rand(B, L, device=x.device)
+        rank = noise.argsort(dim=1)
+        mask = rank < num_masked
+        return mask
+    @abstractmethod
+    def forward(self) -> None:
+        raise NotImplementedError

draco/model/layer/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .normalization import LayerNorm2d
2	+
3	+ __all__ = [k for k in globals().keys() if not k.startswith("_")]

draco/model/layer/normalization.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import torch.nn as nn
+__all__ = [
+    "LayerNorm2d",
+]
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_features: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_features))
+        self.bias = nn.Parameter(torch.zeros(num_features))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).square().mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x

draco/model/utils/constant.py ADDED Viewed

	@@ -0,0 +1,24 @@

+def get_vit_scale(scale: str) -> tuple[int, int, int]:
+    if scale == "tiny":
+        return 192, 12, 3
+    elif scale == "small":
+        return 384, 12, 6
+    elif scale == "base":
+        return 768, 12, 12
+    elif scale == "large":
+        return 1024, 24, 16
+    elif scale == "huge":
+        return 1280, 32, 16
+    else:
+        raise KeyError(f"Unknown Vision Transformer scale: {scale}")
+def get_global_attn_indexes(num_layers: int) -> list[int]:
+    """
+    Args:
+        num_layers (int): The number of layers.
+    Returns:
+        List[int]: The global attention indexes.
+    """
+    return list(range(num_layers // 4 - 1, num_layers, num_layers // 4))

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch==2.5.1
+torchvision==0.20.1
+h5py==3.12.1
+numpy==1.26.4
+pandas==2.2.2
+mrcfile==1.5.3
+scipy==1.13.1
+pycocotools==2.0.8
+omegaconf==2.3.0
+pillow
+fvcore