Spaces:
Running on Zero
Running on Zero
update
Browse files- .gitignore +3 -0
- InfiniDepth/model/block/pe.py +5 -2
- InfiniDepth/model/block/perceive_io.py +32 -46
- InfiniDepth/model/block/prompt_models/__pycache__/__init__.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/__pycache__/rope.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/__pycache__/sam.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/__pycache__/selfattn.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/rope.py +6 -2
- InfiniDepth/model/block/prompt_models/selfattn.py +6 -2
- InfiniDepth/model/block/prompt_models/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/utils/__pycache__/pe_utils.cpython-310.pyc +0 -0
- InfiniDepth/model/block/prompt_models/utils/__pycache__/transformer.cpython-310.pyc +0 -0
- InfiniDepth/model/block/utils.py +1 -1
- InfiniDepth/model/model.py +5 -1
- InfiniDepth/utils/__pycache__/hf_demo_utils.cpython-310.pyc +0 -0
- InfiniDepth/utils/hf_demo_utils.py +29 -33
- InfiniDepth/utils/io_utils.py +33 -3
- InfiniDepth/utils/sampling_utils.py +1 -1
- __pycache__/app.cpython-310.pyc +0 -0
- app.py +22 -4
- requirements.txt +1 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
checkpoints
|
| 2 |
+
__pycache__
|
| 3 |
+
.pyc
|
InfiniDepth/model/block/pe.py
CHANGED
|
@@ -7,7 +7,11 @@ import torch.nn as nn
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
from typing import Any, Optional, Tuple, Dict
|
| 9 |
|
| 10 |
-
acc_dtype =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
POS_EMB_REGISTRY = {}
|
| 13 |
|
|
@@ -217,4 +221,3 @@ def build_pos_emb(pos_emb_type="nerf", **kwargs):
|
|
| 217 |
|
| 218 |
|
| 219 |
|
| 220 |
-
|
|
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
from typing import Any, Optional, Tuple, Dict
|
| 9 |
|
| 10 |
+
acc_dtype = (
|
| 11 |
+
torch.bfloat16
|
| 12 |
+
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
|
| 13 |
+
else torch.float16
|
| 14 |
+
)
|
| 15 |
|
| 16 |
POS_EMB_REGISTRY = {}
|
| 17 |
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
|
|
|
InfiniDepth/model/block/perceive_io.py
CHANGED
|
@@ -3,6 +3,7 @@ import torch
|
|
| 3 |
from einops import rearrange
|
| 4 |
from ...utils.logger import Log
|
| 5 |
from torch import Tensor, nn
|
|
|
|
| 6 |
|
| 7 |
try:
|
| 8 |
from xformers.ops import memory_efficient_attention, unbind
|
|
@@ -97,42 +98,6 @@ class MemEffCrossAttention(CrossAttention):
|
|
| 97 |
return x
|
| 98 |
|
| 99 |
|
| 100 |
-
# class Attention(nn.Module):
|
| 101 |
-
# def __init__(
|
| 102 |
-
# self,
|
| 103 |
-
# dim: int,
|
| 104 |
-
# num_heads: int = 8,
|
| 105 |
-
# qkv_bias: bool = False,
|
| 106 |
-
# proj_bias: bool = True,
|
| 107 |
-
# attn_drop: float = 0.0,
|
| 108 |
-
# proj_drop: float = 0.0,
|
| 109 |
-
# ) -> None:
|
| 110 |
-
# super().__init__()
|
| 111 |
-
# self.num_heads = num_heads
|
| 112 |
-
# head_dim = dim // num_heads
|
| 113 |
-
# self.scale = head_dim**-0.5
|
| 114 |
-
|
| 115 |
-
# self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 116 |
-
# self.attn_drop = nn.Dropout(attn_drop)
|
| 117 |
-
# self.proj = nn.Linear(dim, dim, bias=proj_bias)
|
| 118 |
-
# self.proj_drop = nn.Dropout(proj_drop)
|
| 119 |
-
|
| 120 |
-
# def forward(self, x: Tensor) -> Tensor:
|
| 121 |
-
# B, N, C = x.shape
|
| 122 |
-
# qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 123 |
-
|
| 124 |
-
# q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
| 125 |
-
# attn = q @ k.transpose(-2, -1)
|
| 126 |
-
|
| 127 |
-
# attn = attn.softmax(dim=-1)
|
| 128 |
-
# attn = self.attn_drop(attn)
|
| 129 |
-
|
| 130 |
-
# x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 131 |
-
# x = self.proj(x)
|
| 132 |
-
# x = self.proj_drop(x)
|
| 133 |
-
# return x
|
| 134 |
-
|
| 135 |
-
|
| 136 |
class Attention(nn.Module):
|
| 137 |
def __init__(
|
| 138 |
self,
|
|
@@ -142,7 +107,7 @@ class Attention(nn.Module):
|
|
| 142 |
proj_bias: bool = True,
|
| 143 |
attn_drop: float = 0.0,
|
| 144 |
proj_drop: float = 0.0,
|
| 145 |
-
pe: str = "
|
| 146 |
) -> None:
|
| 147 |
super().__init__()
|
| 148 |
self.num_heads = num_heads
|
|
@@ -160,20 +125,41 @@ class Attention(nn.Module):
|
|
| 160 |
|
| 161 |
def forward(self, x: Tensor, x_pe: Tensor = None) -> Tensor:
|
| 162 |
B, N, C = x.shape
|
| 163 |
-
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 164 |
-
|
| 165 |
-
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
| 166 |
if self.pe == "qk":
|
|
|
|
|
|
|
|
|
|
| 167 |
q = self.norm1(q + x_pe)
|
| 168 |
k = self.norm2(k + x_pe)
|
| 169 |
elif self.pe == "apply":
|
| 170 |
pass
|
| 171 |
-
attn = q @ k.transpose(-2, -1)
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
x = self.proj(x)
|
| 178 |
x = self.proj_drop(x)
|
| 179 |
return x
|
|
@@ -188,7 +174,7 @@ class MemEffAttention(Attention):
|
|
| 188 |
) -> Tensor:
|
| 189 |
if not XFORMERS_AVAILABLE:
|
| 190 |
assert attn_bias is None, "xFormers is required for nested tensors usage"
|
| 191 |
-
return super().forward(x)
|
| 192 |
|
| 193 |
B, N, C = x.shape
|
| 194 |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
|
|
|
| 3 |
from einops import rearrange
|
| 4 |
from ...utils.logger import Log
|
| 5 |
from torch import Tensor, nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
|
| 8 |
try:
|
| 9 |
from xformers.ops import memory_efficient_attention, unbind
|
|
|
|
| 98 |
return x
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
class Attention(nn.Module):
|
| 102 |
def __init__(
|
| 103 |
self,
|
|
|
|
| 107 |
proj_bias: bool = True,
|
| 108 |
attn_drop: float = 0.0,
|
| 109 |
proj_drop: float = 0.0,
|
| 110 |
+
pe: str = "qk",
|
| 111 |
) -> None:
|
| 112 |
super().__init__()
|
| 113 |
self.num_heads = num_heads
|
|
|
|
| 125 |
|
| 126 |
def forward(self, x: Tensor, x_pe: Tensor = None) -> Tensor:
|
| 127 |
B, N, C = x.shape
|
| 128 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 129 |
+
q, k, v = qkv.unbind(dim=2)
|
|
|
|
| 130 |
if self.pe == "qk":
|
| 131 |
+
if x_pe is None:
|
| 132 |
+
raise ValueError("x_pe must be provided when pe='qk'")
|
| 133 |
+
x_pe = rearrange(x_pe, "b n (m c) -> b n m c", m=self.num_heads)
|
| 134 |
q = self.norm1(q + x_pe)
|
| 135 |
k = self.norm2(k + x_pe)
|
| 136 |
elif self.pe == "apply":
|
| 137 |
pass
|
|
|
|
| 138 |
|
| 139 |
+
# Keep behavior aligned with MemEffAttention when norm changes q/k dtype.
|
| 140 |
+
q = q.to(dtype=v.dtype)
|
| 141 |
+
k = k.to(dtype=v.dtype)
|
| 142 |
+
|
| 143 |
+
q = q.transpose(1, 2)
|
| 144 |
+
k = k.transpose(1, 2)
|
| 145 |
+
v = v.transpose(1, 2)
|
| 146 |
+
|
| 147 |
+
# Use SDPA and run attention kernel in float32 under mixed precision for better numerical stability.
|
| 148 |
+
if q.dtype in (torch.float16, torch.bfloat16):
|
| 149 |
+
q_attn = q.float()
|
| 150 |
+
k_attn = k.float()
|
| 151 |
+
v_attn = v.float()
|
| 152 |
+
else:
|
| 153 |
+
q_attn = q
|
| 154 |
+
k_attn = k
|
| 155 |
+
v_attn = v
|
| 156 |
+
x = F.scaled_dot_product_attention(
|
| 157 |
+
q_attn,
|
| 158 |
+
k_attn,
|
| 159 |
+
v_attn,
|
| 160 |
+
dropout_p=self.attn_drop.p if self.training else 0.0,
|
| 161 |
+
)
|
| 162 |
+
x = x.to(dtype=v.dtype).transpose(1, 2).reshape(B, N, C)
|
| 163 |
x = self.proj(x)
|
| 164 |
x = self.proj_drop(x)
|
| 165 |
return x
|
|
|
|
| 174 |
) -> Tensor:
|
| 175 |
if not XFORMERS_AVAILABLE:
|
| 176 |
assert attn_bias is None, "xFormers is required for nested tensors usage"
|
| 177 |
+
return super().forward(x, x_pe)
|
| 178 |
|
| 179 |
B, N, C = x.shape
|
| 180 |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
InfiniDepth/model/block/prompt_models/__pycache__/__init__.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/__pycache__/__init__.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/__pycache__/__init__.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/__pycache__/rope.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/__pycache__/rope.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/__pycache__/rope.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/__pycache__/sam.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/__pycache__/sam.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/__pycache__/sam.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/__pycache__/selfattn.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/__pycache__/selfattn.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/__pycache__/selfattn.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/rope.py
CHANGED
|
@@ -3,7 +3,11 @@ import torch
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
|
| 6 |
-
acc_dtype =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class PositionGetter:
|
|
@@ -209,4 +213,4 @@ class RotaryPositionEmbedding2D(nn.Module):
|
|
| 209 |
)
|
| 210 |
|
| 211 |
def forward(self, size: Tuple[int, int], device: torch.device) -> torch.Tensor:
|
| 212 |
-
return self.forward_encoding(size, device)
|
|
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
|
| 6 |
+
acc_dtype = (
|
| 7 |
+
torch.bfloat16
|
| 8 |
+
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
|
| 9 |
+
else torch.float16
|
| 10 |
+
)
|
| 11 |
|
| 12 |
|
| 13 |
class PositionGetter:
|
|
|
|
| 213 |
)
|
| 214 |
|
| 215 |
def forward(self, size: Tuple[int, int], device: torch.device) -> torch.Tensor:
|
| 216 |
+
return self.forward_encoding(size, device)
|
InfiniDepth/model/block/prompt_models/selfattn.py
CHANGED
|
@@ -6,7 +6,11 @@ from .rope import RotaryPositionEmbedding2D
|
|
| 6 |
from .utils.pe_utils import PositionEmbeddingRandom
|
| 7 |
from torch import Tensor
|
| 8 |
|
| 9 |
-
acc_dtype =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class SelfAttnPromptModel(nn.Module):
|
|
@@ -283,4 +287,4 @@ class SelfAttenPromptBlock(nn.Module):
|
|
| 283 |
else:
|
| 284 |
context = x[:, x_len : x_len + context_len, :]
|
| 285 |
|
| 286 |
-
return query, context
|
|
|
|
| 6 |
from .utils.pe_utils import PositionEmbeddingRandom
|
| 7 |
from torch import Tensor
|
| 8 |
|
| 9 |
+
acc_dtype = (
|
| 10 |
+
torch.bfloat16
|
| 11 |
+
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
|
| 12 |
+
else torch.float16
|
| 13 |
+
)
|
| 14 |
|
| 15 |
|
| 16 |
class SelfAttnPromptModel(nn.Module):
|
|
|
|
| 287 |
else:
|
| 288 |
context = x[:, x_len : x_len + context_len, :]
|
| 289 |
|
| 290 |
+
return query, context
|
InfiniDepth/model/block/prompt_models/utils/__pycache__/__init__.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/utils/__pycache__/__init__.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/utils/__pycache__/__init__.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/utils/__pycache__/pe_utils.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/utils/__pycache__/pe_utils.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/utils/__pycache__/pe_utils.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/prompt_models/utils/__pycache__/transformer.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/model/block/prompt_models/utils/__pycache__/transformer.cpython-310.pyc and b/InfiniDepth/model/block/prompt_models/utils/__pycache__/transformer.cpython-310.pyc differ
|
|
|
InfiniDepth/model/block/utils.py
CHANGED
|
@@ -111,7 +111,7 @@ def make_coord(shape, ranges=None, flatten=True):
|
|
| 111 |
r = (v1 - v0) / (2 * n)
|
| 112 |
seq = v0 + r + (2 * r) * torch.arange(n).float()
|
| 113 |
coord_seqs.append(seq)
|
| 114 |
-
ret = torch.stack(torch.meshgrid(*coord_seqs), dim=-1)
|
| 115 |
if flatten:
|
| 116 |
ret = ret.view(-1, ret.shape[-1])
|
| 117 |
return ret
|
|
|
|
| 111 |
r = (v1 - v0) / (2 * n)
|
| 112 |
seq = v0 + r + (2 * r) * torch.arange(n).float()
|
| 113 |
coord_seqs.append(seq)
|
| 114 |
+
ret = torch.stack(torch.meshgrid(*coord_seqs, indexing="ij"), dim=-1)
|
| 115 |
if flatten:
|
| 116 |
ret = ret.view(-1, ret.shape[-1])
|
| 117 |
return ret
|
InfiniDepth/model/model.py
CHANGED
|
@@ -17,7 +17,11 @@ from .block.prompt_models import GeneralPromptModel, SelfAttnPromptModel
|
|
| 17 |
from .block.implicit_decoder import ImplicitHead
|
| 18 |
from .block.convolution import BasicEncoder
|
| 19 |
|
| 20 |
-
acc_dtype =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
def _resolve_local_dinov3_repo() -> str:
|
|
|
|
| 17 |
from .block.implicit_decoder import ImplicitHead
|
| 18 |
from .block.convolution import BasicEncoder
|
| 19 |
|
| 20 |
+
acc_dtype = (
|
| 21 |
+
torch.bfloat16
|
| 22 |
+
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
|
| 23 |
+
else torch.float16
|
| 24 |
+
)
|
| 25 |
|
| 26 |
|
| 27 |
def _resolve_local_dinov3_repo() -> str:
|
InfiniDepth/utils/__pycache__/hf_demo_utils.cpython-310.pyc
CHANGED
|
Binary files a/InfiniDepth/utils/__pycache__/hf_demo_utils.cpython-310.pyc and b/InfiniDepth/utils/__pycache__/hf_demo_utils.cpython-310.pyc differ
|
|
|
InfiniDepth/utils/hf_demo_utils.py
CHANGED
|
@@ -16,16 +16,17 @@ from .inference_utils import (
|
|
| 16 |
resolve_camera_intrinsics,
|
| 17 |
resolve_output_size_from_mode,
|
| 18 |
)
|
| 19 |
-
from .io_utils import depth2pcd
|
| 20 |
from .model_utils import build_model
|
| 21 |
-
from .sampling_utils import make_2d_uniform_coord
|
| 22 |
from .vis_utils import clip_outliers_by_percentile, colorize_depth_maps
|
| 23 |
|
| 24 |
|
| 25 |
DEFAULT_MODEL_PATHS = {
|
| 26 |
-
"InfiniDepth": "checkpoints/depth/
|
| 27 |
-
"InfiniDepth_DC": "checkpoints/depth/
|
| 28 |
}
|
|
|
|
| 29 |
DEFAULT_MOGE2_PRETRAINED = os.getenv("INFINIDEPTH_MOGE2_PRETRAINED", "Ruicheng/moge-2-vitl-normal")
|
| 30 |
|
| 31 |
|
|
@@ -144,6 +145,7 @@ def _build_and_save_point_cloud(
|
|
| 144 |
cx: float,
|
| 145 |
cy: float,
|
| 146 |
output_path: str,
|
|
|
|
| 147 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 148 |
ixt = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32)
|
| 149 |
pcd = depth2pcd(
|
|
@@ -164,7 +166,14 @@ def _build_and_save_point_cloud(
|
|
| 164 |
if not o3d.io.write_point_cloud(output_path, pcd):
|
| 165 |
raise RuntimeError(f"Failed to save point cloud to {output_path}")
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
|
| 170 |
def resolve_checkpoint_path(model_type: str) -> str:
|
|
@@ -233,12 +242,18 @@ def run_single_image_demo(
|
|
| 233 |
upsample_ratio=upsample_ratio,
|
| 234 |
)
|
| 235 |
|
|
|
|
|
|
|
|
|
|
| 236 |
gt_depth, prompt_depth, gt_depth_mask, prompt_mask, depth_source_label = _resolve_depth_inputs(
|
| 237 |
depth_path=depth_path,
|
| 238 |
input_size=input_size,
|
| 239 |
image=image,
|
| 240 |
device=device,
|
| 241 |
)
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
ckpt_path = resolve_checkpoint_path(model_type)
|
| 244 |
model_cache = model_cache or ModelCache()
|
|
@@ -248,10 +263,10 @@ def run_single_image_demo(
|
|
| 248 |
pred_depth, _ = model.inference(
|
| 249 |
image=image,
|
| 250 |
query_coord=query_2d_uniform_coord,
|
| 251 |
-
gt_depth=
|
| 252 |
gt_depth_mask=gt_depth_mask,
|
| 253 |
-
prompt_depth=
|
| 254 |
-
prompt_mask=prompt_mask
|
| 255 |
)
|
| 256 |
|
| 257 |
pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
|
|
@@ -265,45 +280,26 @@ def run_single_image_demo(
|
|
| 265 |
cy_org=cy,
|
| 266 |
org_h=org_h,
|
| 267 |
org_w=org_w,
|
| 268 |
-
h=
|
| 269 |
-
w=
|
| 270 |
device=device,
|
| 271 |
)
|
| 272 |
|
| 273 |
-
rgb_out_np = _resize_rgb_image(image_np, (h_out, w_out))
|
| 274 |
-
rgb_out_tensor = _image_tensor_from_numpy(rgb_out_np, device)
|
| 275 |
-
query_3d_uniform_coord = make_3d_uniform_coord_triangle(
|
| 276 |
-
depth_hw=pred_depthmap[0, 0],
|
| 277 |
-
fx=fx_out,
|
| 278 |
-
fy=fy_out,
|
| 279 |
-
cx=cx_out,
|
| 280 |
-
cy=cy_out,
|
| 281 |
-
N=int(max_points_preview),
|
| 282 |
-
deterministic=True,
|
| 283 |
-
).unsqueeze(0)
|
| 284 |
-
pred_depth_3d, _ = model.inference(
|
| 285 |
-
image=image,
|
| 286 |
-
query_coord=query_3d_uniform_coord,
|
| 287 |
-
gt_depth=gt_depth,
|
| 288 |
-
gt_depth_mask=gt_depth_mask,
|
| 289 |
-
prompt_depth=prompt_depth if model_type == "InfiniDepth_DC" else None,
|
| 290 |
-
prompt_mask=prompt_mask if model_type == "InfiniDepth_DC" else None,
|
| 291 |
-
)
|
| 292 |
-
|
| 293 |
output_dir = tempfile.mkdtemp(prefix="infinidepth_demo_")
|
| 294 |
ply_path = os.path.join(output_dir, f"{model_type}_point_cloud.ply")
|
| 295 |
depth_npy_path = os.path.join(output_dir, f"{model_type}_depth.npy")
|
| 296 |
np.save(depth_npy_path, pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32))
|
| 297 |
|
| 298 |
xyz, rgb = _build_and_save_point_cloud(
|
| 299 |
-
query_coord=
|
| 300 |
-
pred_depth=
|
| 301 |
-
rgb_image=
|
| 302 |
fx=float(fx_out),
|
| 303 |
fy=float(fy_out),
|
| 304 |
cx=float(cx_out),
|
| 305 |
cy=float(cy_out),
|
| 306 |
output_path=ply_path,
|
|
|
|
| 307 |
)
|
| 308 |
|
| 309 |
return DemoResult(
|
|
|
|
| 16 |
resolve_camera_intrinsics,
|
| 17 |
resolve_output_size_from_mode,
|
| 18 |
)
|
| 19 |
+
from .io_utils import depth2pcd, depth_to_disparity
|
| 20 |
from .model_utils import build_model
|
| 21 |
+
from .sampling_utils import make_2d_uniform_coord
|
| 22 |
from .vis_utils import clip_outliers_by_percentile, colorize_depth_maps
|
| 23 |
|
| 24 |
|
| 25 |
DEFAULT_MODEL_PATHS = {
|
| 26 |
+
"InfiniDepth": "checkpoints/depth/infinidepth.ckpt",
|
| 27 |
+
"InfiniDepth_DC": "checkpoints/depth/infinidepth_dc.ckpt",
|
| 28 |
}
|
| 29 |
+
os.environ.setdefault("INFINIDEPTH_MOGE2_PRETRAINED", "checkpoints/depth/moge2.pt")
|
| 30 |
DEFAULT_MOGE2_PRETRAINED = os.getenv("INFINIDEPTH_MOGE2_PRETRAINED", "Ruicheng/moge-2-vitl-normal")
|
| 31 |
|
| 32 |
|
|
|
|
| 145 |
cx: float,
|
| 146 |
cy: float,
|
| 147 |
output_path: str,
|
| 148 |
+
max_points_preview: int,
|
| 149 |
) -> tuple[np.ndarray, np.ndarray]:
|
| 150 |
ixt = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32)
|
| 151 |
pcd = depth2pcd(
|
|
|
|
| 166 |
if not o3d.io.write_point_cloud(output_path, pcd):
|
| 167 |
raise RuntimeError(f"Failed to save point cloud to {output_path}")
|
| 168 |
|
| 169 |
+
xyz = np.asarray(pcd.points)
|
| 170 |
+
rgb = np.asarray(pcd.colors)
|
| 171 |
+
if xyz.shape[0] > max_points_preview:
|
| 172 |
+
index = np.random.choice(xyz.shape[0], int(max_points_preview), replace=False)
|
| 173 |
+
xyz = xyz[index]
|
| 174 |
+
rgb = rgb[index]
|
| 175 |
+
|
| 176 |
+
return xyz, rgb
|
| 177 |
|
| 178 |
|
| 179 |
def resolve_checkpoint_path(model_type: str) -> str:
|
|
|
|
| 242 |
upsample_ratio=upsample_ratio,
|
| 243 |
)
|
| 244 |
|
| 245 |
+
if model_type == "InfiniDepth_DC":
|
| 246 |
+
assert depth_path is not None and os.path.exists(depth_path), "InfiniDepth_DC requires a valid input depth map for depth completion. Please provide --input_depth_path."
|
| 247 |
+
|
| 248 |
gt_depth, prompt_depth, gt_depth_mask, prompt_mask, depth_source_label = _resolve_depth_inputs(
|
| 249 |
depth_path=depth_path,
|
| 250 |
input_size=input_size,
|
| 251 |
image=image,
|
| 252 |
device=device,
|
| 253 |
)
|
| 254 |
+
gt = depth_to_disparity(gt_depth)
|
| 255 |
+
prompt = depth_to_disparity(prompt_depth)
|
| 256 |
+
prompt_mask = prompt > 0
|
| 257 |
|
| 258 |
ckpt_path = resolve_checkpoint_path(model_type)
|
| 259 |
model_cache = model_cache or ModelCache()
|
|
|
|
| 263 |
pred_depth, _ = model.inference(
|
| 264 |
image=image,
|
| 265 |
query_coord=query_2d_uniform_coord,
|
| 266 |
+
gt_depth=gt,
|
| 267 |
gt_depth_mask=gt_depth_mask,
|
| 268 |
+
prompt_depth=prompt,
|
| 269 |
+
prompt_mask=prompt_mask,
|
| 270 |
)
|
| 271 |
|
| 272 |
pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
|
|
|
|
| 280 |
cy_org=cy,
|
| 281 |
org_h=org_h,
|
| 282 |
org_w=org_w,
|
| 283 |
+
h=h_in,
|
| 284 |
+
w=w_in,
|
| 285 |
device=device,
|
| 286 |
)
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
output_dir = tempfile.mkdtemp(prefix="infinidepth_demo_")
|
| 289 |
ply_path = os.path.join(output_dir, f"{model_type}_point_cloud.ply")
|
| 290 |
depth_npy_path = os.path.join(output_dir, f"{model_type}_depth.npy")
|
| 291 |
np.save(depth_npy_path, pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32))
|
| 292 |
|
| 293 |
xyz, rgb = _build_and_save_point_cloud(
|
| 294 |
+
query_coord=query_2d_uniform_coord,
|
| 295 |
+
pred_depth=pred_depth,
|
| 296 |
+
rgb_image=image,
|
| 297 |
fx=float(fx_out),
|
| 298 |
fy=float(fy_out),
|
| 299 |
cx=float(cx_out),
|
| 300 |
cy=float(cy_out),
|
| 301 |
output_path=ply_path,
|
| 302 |
+
max_points_preview=int(max_points_preview),
|
| 303 |
)
|
| 304 |
|
| 305 |
return DemoResult(
|
InfiniDepth/utils/io_utils.py
CHANGED
|
@@ -377,6 +377,24 @@ def depth2pcd(
|
|
| 377 |
):
|
| 378 |
device = sampled_coord.device if torch.is_tensor(sampled_coord) else torch.device("cpu")
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
if rgb_image.ndim == 3 and rgb_image.shape[0] == 3:
|
| 381 |
rgb_image = rgb_image.unsqueeze(0)
|
| 382 |
elif rgb_image.ndim == 3 and rgb_image.shape[2] == 3:
|
|
@@ -398,7 +416,7 @@ def depth2pcd(
|
|
| 398 |
p_y = ((sampled_coord[:, 0] + 1) * (rgb_image.shape[2] / 2.0)) - 0.5 # h --> y
|
| 399 |
ones = torch.ones_like(p_x)
|
| 400 |
|
| 401 |
-
cam_coords = torch.stack([p_x, p_y, ones], dim=1) * sampled_depth
|
| 402 |
cam_coords = cam_coords @ torch.inverse(torch.from_numpy(ixt).float().to(device)).T
|
| 403 |
|
| 404 |
if ext is not None:
|
|
@@ -420,9 +438,21 @@ def depth2pcd(
|
|
| 420 |
if clip_box[4] is not None: mask &= cam_coords[:, 1] >= clip_box[4]
|
| 421 |
if clip_box[5] is not None: mask &= cam_coords[:, 2] >= clip_box[5]
|
| 422 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
pcd = o3d.geometry.PointCloud()
|
| 424 |
-
pcd.points = o3d.utility.Vector3dVector(
|
| 425 |
-
pcd.colors = o3d.utility.Vector3dVector(
|
| 426 |
|
| 427 |
if ret_mask:
|
| 428 |
return pcd, mask
|
|
|
|
| 377 |
):
|
| 378 |
device = sampled_coord.device if torch.is_tensor(sampled_coord) else torch.device("cpu")
|
| 379 |
|
| 380 |
+
if not torch.is_tensor(sampled_coord):
|
| 381 |
+
sampled_coord = torch.as_tensor(sampled_coord, dtype=torch.float32, device=device)
|
| 382 |
+
else:
|
| 383 |
+
sampled_coord = sampled_coord.to(device=device, dtype=torch.float32)
|
| 384 |
+
|
| 385 |
+
if not torch.is_tensor(sampled_depth):
|
| 386 |
+
sampled_depth = torch.as_tensor(sampled_depth, dtype=torch.float32, device=device)
|
| 387 |
+
else:
|
| 388 |
+
sampled_depth = sampled_depth.to(device=device, dtype=torch.float32)
|
| 389 |
+
|
| 390 |
+
sampled_coord = sampled_coord.reshape(-1, 2)
|
| 391 |
+
sampled_depth = sampled_depth.reshape(-1)
|
| 392 |
+
if sampled_coord.shape[0] != sampled_depth.shape[0]:
|
| 393 |
+
raise ValueError(
|
| 394 |
+
f"sampled_coord and sampled_depth must contain the same number of points, "
|
| 395 |
+
f"got {sampled_coord.shape[0]} and {sampled_depth.shape[0]}"
|
| 396 |
+
)
|
| 397 |
+
|
| 398 |
if rgb_image.ndim == 3 and rgb_image.shape[0] == 3:
|
| 399 |
rgb_image = rgb_image.unsqueeze(0)
|
| 400 |
elif rgb_image.ndim == 3 and rgb_image.shape[2] == 3:
|
|
|
|
| 416 |
p_y = ((sampled_coord[:, 0] + 1) * (rgb_image.shape[2] / 2.0)) - 0.5 # h --> y
|
| 417 |
ones = torch.ones_like(p_x)
|
| 418 |
|
| 419 |
+
cam_coords = torch.stack([p_x, p_y, ones], dim=1) * sampled_depth[:, None]
|
| 420 |
cam_coords = cam_coords @ torch.inverse(torch.from_numpy(ixt).float().to(device)).T
|
| 421 |
|
| 422 |
if ext is not None:
|
|
|
|
| 438 |
if clip_box[4] is not None: mask &= cam_coords[:, 1] >= clip_box[4]
|
| 439 |
if clip_box[5] is not None: mask &= cam_coords[:, 2] >= clip_box[5]
|
| 440 |
|
| 441 |
+
points_np = np.ascontiguousarray(
|
| 442 |
+
cam_coords[mask, :3].detach().cpu().to(torch.float64).numpy()
|
| 443 |
+
)
|
| 444 |
+
colors_np = np.ascontiguousarray(
|
| 445 |
+
sampled_rgb[mask].detach().cpu().to(torch.float64).numpy()
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
if points_np.ndim != 2 or points_np.shape[1] != 3:
|
| 449 |
+
raise ValueError(f"Point cloud points must have shape [N, 3], got {points_np.shape}")
|
| 450 |
+
if colors_np.ndim != 2 or colors_np.shape[1] != 3:
|
| 451 |
+
raise ValueError(f"Point cloud colors must have shape [N, 3], got {colors_np.shape}")
|
| 452 |
+
|
| 453 |
pcd = o3d.geometry.PointCloud()
|
| 454 |
+
pcd.points = o3d.utility.Vector3dVector(points_np)
|
| 455 |
+
pcd.colors = o3d.utility.Vector3dVector(colors_np)
|
| 456 |
|
| 457 |
if ret_mask:
|
| 458 |
return pcd, mask
|
InfiniDepth/utils/sampling_utils.py
CHANGED
|
@@ -49,7 +49,7 @@ def make_2d_uniform_coord(shape, ranges=None, flatten=True):
|
|
| 49 |
r = (v1 - v0) / (2 * n)
|
| 50 |
seq = v0 + r + (2 * r) * torch.arange(n).float()
|
| 51 |
coord_seqs.append(seq)
|
| 52 |
-
query_coords = torch.stack(torch.meshgrid(*coord_seqs), dim=-1)
|
| 53 |
if flatten:
|
| 54 |
query_coords = query_coords.view(-1, query_coords.shape[-1])
|
| 55 |
return query_coords
|
|
|
|
| 49 |
r = (v1 - v0) / (2 * n)
|
| 50 |
seq = v0 + r + (2 * r) * torch.arange(n).float()
|
| 51 |
coord_seqs.append(seq)
|
| 52 |
+
query_coords = torch.stack(torch.meshgrid(*coord_seqs, indexing="ij"), dim=-1)
|
| 53 |
if flatten:
|
| 54 |
query_coords = query_coords.view(-1, query_coords.shape[-1])
|
| 55 |
return query_coords
|
__pycache__/app.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -66,6 +66,23 @@ CUSTOM_CSS = """
|
|
| 66 |
"""
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def _none_if_invalid(value: Optional[float]) -> Optional[float]:
|
| 70 |
if value is None:
|
| 71 |
return None
|
|
@@ -195,7 +212,7 @@ with gr.Blocks(title="InfiniDepth Demo", theme=gr.themes.Soft(), css=CUSTOM_CSS)
|
|
| 195 |
label="Model Type",
|
| 196 |
)
|
| 197 |
input_size = gr.Dropdown(
|
| 198 |
-
choices=["
|
| 199 |
value="768x1024",
|
| 200 |
label="Inference Resolution (HxW)",
|
| 201 |
)
|
|
@@ -212,8 +229,8 @@ with gr.Blocks(title="InfiniDepth Demo", theme=gr.themes.Soft(), css=CUSTOM_CSS)
|
|
| 212 |
label="Super-resolution Ratio",
|
| 213 |
)
|
| 214 |
max_points_preview = gr.Slider(
|
| 215 |
-
minimum=
|
| 216 |
-
maximum=
|
| 217 |
value=60000,
|
| 218 |
step=5000,
|
| 219 |
label="Max Preview Points",
|
|
@@ -275,4 +292,5 @@ demo = demo.queue()
|
|
| 275 |
|
| 276 |
|
| 277 |
if __name__ == "__main__":
|
| 278 |
-
|
|
|
|
|
|
| 66 |
"""
|
| 67 |
|
| 68 |
|
| 69 |
+
def _ensure_localhost_bypasses_proxy() -> None:
|
| 70 |
+
localhost_hosts = ("localhost", "127.0.0.1", "::1")
|
| 71 |
+
for env_key in ("NO_PROXY", "no_proxy"):
|
| 72 |
+
current = os.environ.get(env_key, "")
|
| 73 |
+
items = [item.strip() for item in current.split(",") if item.strip()]
|
| 74 |
+
changed = False
|
| 75 |
+
for host in localhost_hosts:
|
| 76 |
+
if host not in items:
|
| 77 |
+
items.append(host)
|
| 78 |
+
changed = True
|
| 79 |
+
if changed or current:
|
| 80 |
+
os.environ[env_key] = ",".join(items)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
_ensure_localhost_bypasses_proxy()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
def _none_if_invalid(value: Optional[float]) -> Optional[float]:
|
| 87 |
if value is None:
|
| 88 |
return None
|
|
|
|
| 212 |
label="Model Type",
|
| 213 |
)
|
| 214 |
input_size = gr.Dropdown(
|
| 215 |
+
choices=["512x672", "768x1024"],
|
| 216 |
value="768x1024",
|
| 217 |
label="Inference Resolution (HxW)",
|
| 218 |
)
|
|
|
|
| 229 |
label="Super-resolution Ratio",
|
| 230 |
)
|
| 231 |
max_points_preview = gr.Slider(
|
| 232 |
+
minimum=10000,
|
| 233 |
+
maximum=1000000,
|
| 234 |
value=60000,
|
| 235 |
step=5000,
|
| 236 |
label="Max Preview Points",
|
|
|
|
| 292 |
|
| 293 |
|
| 294 |
if __name__ == "__main__":
|
| 295 |
+
server_name = "0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1"
|
| 296 |
+
demo.launch(server_name=server_name, server_port=int(os.getenv("PORT", "7861")))
|
requirements.txt
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
torch==2.9.1
|
| 5 |
torchvision==0.24.1
|
| 6 |
torchaudio==2.9.1
|
|
|
|
| 7 |
hydra-colorlog
|
| 8 |
hydra-core
|
| 9 |
h5py
|
|
|
|
| 4 |
torch==2.9.1
|
| 5 |
torchvision==0.24.1
|
| 6 |
torchaudio==2.9.1
|
| 7 |
+
xformers==0.0.33.post1
|
| 8 |
hydra-colorlog
|
| 9 |
hydra-core
|
| 10 |
h5py
|