import time
import torch
import torch.nn as nn

from vggt.heads.camera_head import CameraHead
from vggt.heads.dpt_head import DPTHead

from .aggregator import Aggregator
from .decoder import Decoder


def freeze_all_params(modules):
    """Disable gradients for every parameter in each module (or bare Parameter)."""
    for module in modules:
        try:
            for param in module.parameters():
                param.requires_grad = False
        except AttributeError:
            # `module` is a bare nn.Parameter rather than an nn.Module
            module.requires_grad = False


class VDPM(nn.Module):
    def __init__(self, cfg, img_size=518, patch_size=14, embed_dim=1024):
        super().__init__()
        self.cfg = cfg

        self.aggregator = Aggregator(
            img_size=img_size,
            patch_size=patch_size,
            embed_dim=embed_dim,
        )
        self.decoder = Decoder(
            cfg,
            dim_in=2 * embed_dim,
            embed_dim=embed_dim,
            depth=cfg.model.decoder_depth,
        )
        self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1")

        self.camera_head = CameraHead(dim_in=2 * embed_dim)
        self.profile = False
        self.set_freeze()

    def set_freeze(self):
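        # Keep the pretrained patch embedding frozen; the rest of the model
        # remains trainable.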
        to_be_frozen = [self.aggregator.patch_embed]
        freeze_all_params(to_be_frozen)

    def forward(self, views, autocast_dpt=None):
        images = torch.stack([view["img"] for view in views], dim=1)
        aggregated_tokens_list, patch_start_idx = self.aggregator(images)

        res_dynamic = dict()

        decoded_tokens = None
        if self.decoder is not None:
            # The second column of view_idxs carries each view's conditioning timestep.
            cond_view_idxs = torch.stack([view["view_idxs"][:, 1] for view in views], dim=1)
            decoded_tokens = self.decoder(images, aggregated_tokens_list, patch_start_idx, cond_view_idxs)

        if autocast_dpt is None:
            autocast_dpt = torch.amp.autocast("cuda", enabled=False)

        with autocast_dpt:
            # Static pointmaps come straight from the shared aggregator tokens.
            pts3d, pts3d_conf = self.point_head(
                aggregated_tokens_list, images, patch_start_idx
            )

            if decoded_tokens is not None:
                # Remap decoder outputs onto the layer indices the DPT head reads.
                padded_decoded_tokens = {
                    layer_idx: decoded_tokens[idx]
                    for idx, layer_idx in enumerate(self.point_head.intermediate_layer_idx)
                }
                pts3d_dyn, pts3d_dyn_conf = self.point_head(
                    padded_decoded_tokens, images, patch_start_idx
                )
                res_dynamic |= {
                    "pts3d": pts3d_dyn,
                    "conf": pts3d_dyn_conf,
                }

            pose_enc_list = self.camera_head(aggregated_tokens_list)
            res_dynamic |= {"pose_enc_list": pose_enc_list}

        res_static = dict(
            pts3d=pts3d,
            conf=pts3d_conf
        )
        return res_static, res_dynamic

    def inference(
        self,
        views,
        images=None,
        num_timesteps=None
    ):
        profile = self.profile and torch.cuda.is_available()
        if profile:
            ev = lambda: torch.cuda.Event(enable_timing=True)
            e_start, e_agg, e_cam_end = ev(), ev(), ev()
            e_dec_starts, e_dec_ends = [], []
            e_head_starts, e_head_ends = [], []
            e_cam_start = ev()
            mem_before = torch.cuda.memory_allocated() / 1024**3
            e_start.record()

        autocast_amp = torch.amp.autocast("cuda", enabled=True, dtype=torch.float16)

        if images is None:
            images = torch.stack([view["img"] for view in views], dim=1)

        # If not profiling per-stage, measure a single total inference time (minimal overhead).
        if not profile:
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            _t_start = time.time()

        with autocast_amp:
            aggregated_tokens_list, patch_start_idx = self.aggregator(images)

        if profile:
            e_agg.record()
            mem_after_agg = torch.cuda.memory_allocated() / 1024**3

        S = images.shape[1]

        # Determine the number of timesteps to query.
        if num_timesteps is None:
            if views is not None and "view_idxs" in views[0]:
                try:
                    all_idxs = torch.cat([v["view_idxs"][:, 1] for v in views])
                    num_timesteps = int(all_idxs.max().item()) + 1
                except Exception:
                    num_timesteps = S
            else:
                num_timesteps = S

        predictions = dict()
        pointmaps = []
        ones = torch.ones(1, S, dtype=torch.int64, device=images.device)

        # Re-decode the shared aggregator tokens once per conditioning timestep,
        # producing one pointmap per timestep.
        for time_ in range(num_timesteps):
            cond_view_idxs = ones * time_

            if profile:
                e_ds = ev(); e_ds.record(); e_dec_starts.append(e_ds)

            with autocast_amp:
                decoded_tokens = self.decoder(images, aggregated_tokens_list, patch_start_idx, cond_view_idxs)

            if profile:
                e_de = ev(); e_de.record(); e_dec_ends.append(e_de)

            padded_decoded_tokens = {
                layer_idx: decoded_tokens[idx]
                for idx, layer_idx in enumerate(self.point_head.intermediate_layer_idx)
            }

            if profile:
                e_hs = ev(); e_hs.record(); e_head_starts.append(e_hs)
            pts3d, pts3d_conf = self.point_head(
                padded_decoded_tokens, images, patch_start_idx
            )

            if profile:
                e_he = ev(); e_he.record(); e_head_ends.append(e_he)

            pointmaps.append(dict(
                pts3d=pts3d,
                conf=pts3d_conf
            ))

        if profile:
            e_cam_start.record()

        pose_enc_list = self.camera_head(aggregated_tokens_list)

        if profile:
            e_cam_end.record()
            torch.cuda.synchronize()  # single sync at the very end
            mem_peak = torch.cuda.max_memory_allocated() / 1024**3

            t_agg = e_start.elapsed_time(e_agg) / 1000
            t_dec = sum(s.elapsed_time(e) / 1000 for s, e in zip(e_dec_starts, e_dec_ends))
            t_head = sum(s.elapsed_time(e) / 1000 for s, e in zip(e_head_starts, e_head_ends))
            t_cam = e_cam_start.elapsed_time(e_cam_end) / 1000
            t_total = e_start.elapsed_time(e_cam_end) / 1000

            print(f"  [PROFILE] Aggregator: {t_agg:.3f}s | VRAM: {mem_before:.2f} -> {mem_after_agg:.2f} GB (+{mem_after_agg - mem_before:.2f})")
            print(f"  [PROFILE] Stored layers: {sorted(k for k in aggregated_tokens_list if k >= 0)}")
            print(f"  [PROFILE] Decoder:    {t_dec:.3f}s  ({num_timesteps} timesteps, {t_dec/max(num_timesteps,1)*1000:.0f}ms each)")
            print(f"  [PROFILE] Point Head: {t_head:.3f}s  ({num_timesteps} timesteps, {t_head/max(num_timesteps,1)*1000:.0f}ms each)")
            print(f"  [PROFILE] Camera Head:{t_cam:.3f}s")
            print(f"  [PROFILE] Total:      {t_total:.3f}s | Peak VRAM: {mem_peak:.2f} GB")
            print(f"  [PROFILE] Breakdown:  Agg {t_agg/t_total*100:.0f}% | Dec {t_dec/t_total*100:.0f}% | PtHead {t_head/t_total*100:.0f}% | CamHead {t_cam/t_total*100:.0f}%")

        predictions["pose_enc"] = pose_enc_list[-1]
        predictions["pose_enc_list"] = pose_enc_list
        predictions["pointmaps"] = pointmaps

        if not profile:
            # Single final sync (when on GPU) and lightweight wall-clock timing.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            t_total = time.time() - _t_start
            print(f"  [PROFILE] Total inference time: {t_total:.3f}s")

        return predictions

    def load_state_dict(self, ckpt, is_VGGT_static=False, **kw):
        # Drop the VGGT heads this model does not use.
        exclude = ["depth_head", "track_head"]
        ckpt = {k: v for k, v in ckpt.items() if k.split('.')[0] not in exclude}

        res = super().load_state_dict(ckpt, **kw)

        # Compile decoder blocks after weights are loaded so state_dict keys match the checkpoint.
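        # (torch.compile wraps submodules so their parameters appear under an
        # `_orig_mod.` prefix in state_dict, which would no longer match.)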
        if hasattr(self, "decoder") and hasattr(self.decoder, "compile_blocks"):
            self.decoder.compile_blocks()

        return res
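
    # Typical checkpoint loading (illustrative; the path and strictness flag
    # below are assumptions, not part of this module):
    #     state = torch.load("checkpoint.pt", map_location="cpu")
    #     model.load_state_dict(state, strict=False)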

    def to_fp16(self, keep_norm_fp32: bool = False):
        """Convert model parameters and buffers to FP16 for inference.

        Args:
            keep_norm_fp32 (bool): If True, keep normalization layers (LayerNorm/BatchNorm)
                in FP32 for numerical stability. If False, convert everything to FP16.
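
        Example (illustrative; assumes the model has been constructed):
            >>> model = VDPM(cfg)
            >>> model.to_fp16(keep_norm_fp32=True)  # FP16 weights, FP32 norm layers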
        """
        # Convert whole model to half first
        self.half()

        if keep_norm_fp32:
            for m in self.modules():
                if isinstance(m, (torch.nn.LayerNorm, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.SyncBatchNorm)):
                    m.float()

        # Ensure any remaining dtype-sensitive buffers are converted as well.
        # Parameters (e.g. camera/register/time tokens) are already handled by
        # self.half(); integer buffers such as index tensors are left untouched,
        # and assigning into self._buffers preserves each buffer's persistence flag.
        for name, buf in self._buffers.items():
            if isinstance(buf, torch.Tensor) and buf.is_floating_point():
                self._buffers[name] = buf.half()

        return self
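

# --- Usage sketch (illustrative only) ---
# A minimal sketch of how VDPM might be driven at inference time. It assumes a
# config `cfg` exposing `cfg.model.decoder_depth`, and a list of view dicts with
# the keys read above: "img" of shape (B, 3, H, W) and "view_idxs" of shape
# (B, 2), whose second column is the conditioning timestep.
#
#     model = VDPM(cfg).cuda().eval()
#     model.to_fp16(keep_norm_fp32=True)
#     with torch.no_grad():
#         preds = model.inference(views)
#     pose_enc = preds["pose_enc"]               # final camera pose encoding
#     pts3d_t0 = preds["pointmaps"][0]["pts3d"]  # pointmap for timestep 0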