delinqu committed on
Commit
ca972dc
·
verified ·
1 Parent(s): 365b6d8

Delete files modeling_ego3d.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_ego3d.py +0 -124
modeling_ego3d.py DELETED
@@ -1,124 +0,0 @@
1
- # MIT License
2
- # Copyright (c) 2025 IPEC at Shanghai AI Laboratory
3
- # Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
4
- # distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
5
- # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
7
- # coding=utf-8
8
-
9
- import torch.utils.checkpoint
10
- from torch import nn
11
- from transformers.utils import logging
12
- import torchvision.transforms.functional as F
13
- import numpy as np
14
- import math
15
-
16
# Module-level logger, following the `transformers` convention.
logger = logging.get_logger(__name__)
17
-
18
-
19
class Ego3DPositionEmbeddingMLP(nn.Module):
    """Learned absolute 3D position embedding.

    NeRF-style frequency encoding of (x, y, z) coordinates followed by a
    small MLP head, cf.
    https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
    """

    def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
        super().__init__()
        self.n_freqs = n_freqs
        # Each input channel expands to itself plus sin/cos at every frequency.
        self.freq_out_channels = in_channels * (2 * n_freqs + 1)
        if logscale:
            bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
        else:
            bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)

        # Coordinate offset applied before normalization (repeated if the
        # input carries more than one xyz triple per point).
        offset = torch.tensor([0.0, 0.0, 2.0]).repeat(in_channels // 3)
        self.register_buffer("freq_bands", bands, persistent=False)
        self.register_buffer("center", offset, persistent=False)

        self.position_embedding_head = nn.Sequential(
            nn.Linear(self.freq_out_channels, num_pos_feats),
            nn.LayerNorm(num_pos_feats),
            nn.ReLU(),
            nn.Linear(num_pos_feats, num_pos_feats),
        )
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize weight matrices with small values to keep training stable."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param, gain=0.01)

    @torch.no_grad()
    def frequency_encoding(self, xyz):
        """Embed xyz to (x, sin(2^k x), cos(2^k x), ...).

        Unlike the NeRF paper, the (normalized) coordinate itself is kept in
        the output; see https://github.com/bmild/nerf/issues/12
        Expected coordinate ranges: x, y in [-2, 2]; z in [0, 4].

        Inputs:
            xyz: (b, n, m)
        Outputs:
            encoding: (b, n, o) with o = m * (2 * n_freqs + 1)
        """
        normalized = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
        scaled = normalized.unsqueeze(-1) * self.freq_bands  # (b, n, m, nf)
        pieces = [normalized.unsqueeze(-1), torch.sin(scaled), torch.cos(scaled)]
        return torch.cat(pieces, -1).reshape(*xyz.shape[:2], -1)

    def forward(self, xyz):
        """Map coordinates (B, N, 3 or 6) to position embeddings (B, N, F)."""
        # TODO: encoding with 3D position
        return self.position_embedding_head(self.frequency_encoding(xyz))
77
-
78
-
79
def get_resize_output_image_size(
    input_height: int,
    input_width: int,
    output_size: tuple = (384, 512),
    keep_aspect_ratio: bool = True,
    multiple: int = 32,
):
    """Compute the (height, width) to resize to, snapped to a multiple.

    When `keep_aspect_ratio` is set, the axis whose scale factor is closest
    to 1 wins and the other axis reuses it, distorting the image as little
    as possible.
    """

    def snap(value, step, floor=0):
        # Round to the nearest multiple of `step`; round up instead when
        # plain rounding would fall below `floor`.
        snapped = (np.round(value / step) * step).astype(int)
        if snapped < floor:
            snapped = math.ceil(value / step) * step
        return snapped

    target_h, target_w = output_size
    scale_h = target_h / input_height
    scale_w = target_w / input_width

    if keep_aspect_ratio:
        # Scale as little as possible: reuse the factor closest to 1.
        if abs(1 - scale_w) < abs(1 - scale_h):
            scale_h = scale_w
        else:
            scale_w = scale_h

    new_h = snap(scale_h * input_height, step=multiple)
    new_w = snap(scale_w * input_width, step=multiple)
    return (int(new_h), int(new_w))
107
-
108
-
109
- def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
110
- """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
111
- # h, w = images.shape[-2:]
112
- # pad images
113
- ph, pw = 31, 31 # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31
114
- images = torch.nn.functional.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
115
-
116
- # resize images
117
- size = (384, 384) # get_resize_output_image_size(h, w, output_size=output_size, keep_aspect_ratio=True, multiple=32) # 384, 384
118
- images = torch.nn.functional.interpolate(images, size=size, mode="bicubic", align_corners=True)
119
-
120
- # NOTE: zoe: padding -> resize -> nomalize.
121
- # BUT: siglip processor get nomalized image, we simplely follow `nomalize -> padding -> resize` in reflect pad mode
122
- ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
123
- images = F.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
124
- return images, ph, pw