Delete file modeling_ego3d.py with huggingface_hub
Browse files- modeling_ego3d.py +0 -124
modeling_ego3d.py
DELETED
|
@@ -1,124 +0,0 @@
|
|
| 1 |
-
# MIT License
|
| 2 |
-
# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
|
| 3 |
-
# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
|
| 4 |
-
# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
|
| 5 |
-
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
| 6 |
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
|
| 7 |
-
# coding=utf-8
|
| 8 |
-
|
| 9 |
-
import torch.utils.checkpoint
|
| 10 |
-
from torch import nn
|
| 11 |
-
from transformers.utils import logging
|
| 12 |
-
import torchvision.transforms.functional as F
|
| 13 |
-
import numpy as np
|
| 14 |
-
import math
|
| 15 |
-
|
| 16 |
-
# Module-level logger, following the `transformers` logging convention.
logger = logging.get_logger(__name__)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class Ego3DPositionEmbeddingMLP(nn.Module):
    """Learned absolute 3D position embedding via NeRF-style frequency encoding.

    Points are frequency-encoded (identity + sin/cos at `n_freqs` bands) and
    projected by a small MLP to `num_pos_feats` features.
    Reference: https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
    """

    def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
        """
        Args:
            in_channels: coordinate dimensionality per point (multiple of 3).
            num_pos_feats: output embedding width.
            n_freqs: number of frequency bands.
            logscale: if True, bands are powers of two; otherwise linear.
        """
        super().__init__()
        self.n_freqs = n_freqs
        # identity + (sin, cos) per band, per input channel
        self.freq_out_channels = in_channels * (2 * n_freqs + 1)
        bands = (
            2 ** torch.linspace(0, n_freqs - 1, n_freqs)
            if logscale
            else torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
        )

        # Recentering offset, repeated to cover every 3-channel group.
        origin = torch.tensor([0., 0., 2.]).repeat(in_channels // 3)
        self.register_buffer("freq_bands", bands, persistent=False)
        self.register_buffer("center", origin, persistent=False)

        self.position_embedding_head = nn.Sequential(
            nn.Linear(self.freq_out_channels, num_pos_feats),
            nn.LayerNorm(num_pos_feats),
            nn.ReLU(),
            nn.Linear(num_pos_feats, num_pos_feats),
        )
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize weight matrices with small values for stable training."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param, gain=0.01)

    @torch.no_grad()
    def frequency_encoding(self, xyz):
        """Embed xyz to (x, sin(2^k x), cos(2^k x), ...).

        Unlike the NeRF paper, the (recentered) input itself is part of the
        output; see https://github.com/bmild/nerf/issues/12.
        Expected coordinate ranges (before recentering): x, y in [-2, 2],
        z in [0, 4] — presumably metric, TODO confirm against callers.

        Inputs:
            xyz: (b, n, m)
        Outputs:
            (b, n, m * (2 * n_freqs + 1))
        """
        normed = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
        scaled = normed.unsqueeze(-1) * self.freq_bands  # (b, n, m, n_freqs)
        parts = [normed.unsqueeze(-1), torch.sin(scaled), torch.cos(scaled)]
        # (b, n, m, 2*n_freqs+1) flattened over the last two dims
        return torch.cat(parts, -1).flatten(2)

    def forward(self, xyz):
        """Map points (B, N, 3 or 6) to position embeddings (B, N, num_pos_feats)."""
        enc = self.frequency_encoding(xyz)
        return self.position_embedding_head(enc)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def get_resize_output_image_size(
    input_height: int,
    input_width: int,
    output_size: tuple = (384, 512),
    keep_aspect_ratio: bool = True,
    multiple: int = 32,
):
    """Compute a resize target near `output_size`, snapped to `multiple`.

    Args:
        input_height: source image height.
        input_width: source image width.
        output_size: desired (height, width) before snapping.
        keep_aspect_ratio: if True, apply a single scale (the one closer to 1)
            to both axes so the aspect ratio is preserved.
        multiple: each output dimension is rounded to a multiple of this.

    Returns:
        (new_height, new_width) as plain ints.
    """

    def _snap(value, step, floor=0):
        # Round to the nearest multiple of `step`; if that falls below
        # `floor`, round up instead.
        snapped = (np.round(value / step) * step).astype(int)
        return snapped if snapped >= floor else math.ceil(value / step) * step

    target_h, target_w = output_size
    scale_h = target_h / input_height
    scale_w = target_w / input_width

    if keep_aspect_ratio:
        # Use whichever scale deviates least from 1 on both axes.
        scale_h = scale_w = scale_w if abs(1 - scale_w) < abs(1 - scale_h) else scale_h

    return (
        int(_snap(scale_h * input_height, step=multiple)),
        int(_snap(scale_w * input_width, step=multiple)),
    )
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
    """Pad, resize, and re-normalize images for a ZoeDepth-style depth branch.

    Reference: https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py

    Args:
        pixel_values: (B, C, H, W) float tensor; expected to be already
            normalized by the SigLIP processor (see NOTE below).
        pad_mode: padding mode forwarded to ``torch.nn.functional.pad``.
            "reflect" requires padding (31) to be smaller than H and W.
        output_size: kept for interface compatibility; the resize target is
            currently hard-coded to (384, 384) and this argument is unused.

    Returns:
        Tuple ``(images, ph, pw)``: the processed (B, C, 384, 384) tensor and
        the vertical/horizontal padding amounts that were applied.
    """
    # Pad images. ZoeDepth derives the pad from the input size as
    # int((h / 2) ** 0.5 * 3); here it is fixed at 31 for the expected inputs.
    ph, pw = 31, 31
    images = torch.nn.functional.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)

    # Resize. ZoeDepth would call get_resize_output_image_size(h, w,
    # output_size=output_size, keep_aspect_ratio=True, multiple=32); the
    # result is hard-coded to (384, 384) here.
    size = (384, 384)
    images = torch.nn.functional.interpolate(images, size=size, mode="bicubic", align_corners=True)

    # NOTE: zoe's order is padding -> resize -> normalize.
    # BUT: the siglip processor hands us already-normalized images, so we
    # simply follow `normalize -> padding -> resize` in reflect pad mode.
    # Per-channel (x - mean) / std in pure torch — equivalent to
    # torchvision.transforms.functional.normalize, without the torchvision
    # dependency.
    ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
    mean = torch.tensor(ZOE_MEAN, dtype=images.dtype, device=images.device).view(-1, 1, 1)
    std = torch.tensor(ZOE_STD, dtype=images.dtype, device=images.device).view(-1, 1, 1)
    images = (images - mean) / std
    return images, ph, pw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|