Update apdepth/marigold_pipeline.py
Browse files- apdepth/marigold_pipeline.py +43 -9
apdepth/marigold_pipeline.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
| 2 |
# Last modified: 2024-05-24
|
| 3 |
#
|
|
@@ -23,7 +24,6 @@ import logging
|
|
| 23 |
from typing import Dict, Optional, Union
|
| 24 |
|
| 25 |
import numpy as np
|
| 26 |
-
import cv2
|
| 27 |
import torch
|
| 28 |
import torch.nn as nn
|
| 29 |
import torch.nn.functional as F
|
|
@@ -52,7 +52,6 @@ from .util.image_util import (
|
|
| 52 |
)
|
| 53 |
from DA2.depth_anything_v2.dpt import DepthAnythingV2
|
| 54 |
|
| 55 |
-
|
| 56 |
class MarigoldDepthOutput(BaseOutput):
|
| 57 |
"""
|
| 58 |
Output class for Marigold monocular depth prediction pipeline.
|
|
@@ -98,6 +97,12 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 98 |
A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
|
| 99 |
the model config. When used together with the `scale_invariant=True` flag, the model is also called
|
| 100 |
"affine-invariant". NB: overriding this value is not supported.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
default_processing_resolution (`int`, *optional*):
|
| 102 |
The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
|
| 103 |
the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
|
|
@@ -112,35 +117,35 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 112 |
self,
|
| 113 |
unet: UNet2DConditionModel,
|
| 114 |
vae: AutoencoderKL,
|
| 115 |
-
scheduler: Union[DDIMScheduler, LCMScheduler],
|
| 116 |
text_encoder: CLIPTextModel,
|
| 117 |
tokenizer: CLIPTokenizer,
|
| 118 |
scale_invariant: Optional[bool] = True,
|
| 119 |
shift_invariant: Optional[bool] = True,
|
|
|
|
| 120 |
default_processing_resolution: Optional[int] = None,
|
| 121 |
):
|
| 122 |
super().__init__()
|
| 123 |
self.register_modules(
|
| 124 |
unet=unet,
|
| 125 |
vae=vae,
|
| 126 |
-
scheduler=scheduler,
|
| 127 |
text_encoder=text_encoder,
|
| 128 |
tokenizer=tokenizer,
|
| 129 |
)
|
| 130 |
self.register_to_config(
|
| 131 |
scale_invariant=scale_invariant,
|
| 132 |
shift_invariant=shift_invariant,
|
|
|
|
| 133 |
default_processing_resolution=default_processing_resolution,
|
| 134 |
)
|
| 135 |
|
| 136 |
self.scale_invariant = scale_invariant
|
| 137 |
self.shift_invariant = shift_invariant
|
|
|
|
| 138 |
self.default_processing_resolution = default_processing_resolution
|
| 139 |
|
| 140 |
self.empty_text_embed = None
|
| 141 |
|
| 142 |
self._fft_masks = {}
|
| 143 |
-
|
| 144 |
da2_config = {
|
| 145 |
'encoder': 'vits', # 'vits', 'vitb', 'vitl', 'vitg'
|
| 146 |
'features': 64,
|
|
@@ -155,11 +160,13 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 155 |
else:
|
| 156 |
self.da2 = None
|
| 157 |
|
|
|
|
| 158 |
@torch.no_grad()
|
| 159 |
def __call__(
|
| 160 |
self,
|
| 161 |
input_image: Union[Image.Image, torch.Tensor],
|
| 162 |
-
|
|
|
|
| 163 |
processing_res: Optional[int] = None,
|
| 164 |
match_input_res: bool = True,
|
| 165 |
resample_method: str = "bilinear",
|
|
@@ -174,6 +181,10 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 174 |
Args:
|
| 175 |
input_image (`Image`):
|
| 176 |
Input RGB (or gray-scale) image.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
ensemble_size (`int`, *optional*, defaults to `10`):
|
| 178 |
Number of predictions to be ensembled.
|
| 179 |
processing_res (`int`, *optional*, defaults to `None`):
|
|
@@ -213,6 +224,9 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 213 |
|
| 214 |
assert processing_res >= 0
|
| 215 |
|
|
|
|
|
|
|
|
|
|
| 216 |
resample_method: InterpolationMode = get_tv_resample_method(resample_method)
|
| 217 |
|
| 218 |
# ----------------- Image Preprocess -----------------
|
|
@@ -246,7 +260,7 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 246 |
|
| 247 |
# ----------------- Predicting depth -----------------
|
| 248 |
# Batch repeated input image
|
| 249 |
-
duplicated_rgb = rgb_norm.expand(
|
| 250 |
single_rgb_dataset = TensorDataset(duplicated_rgb)
|
| 251 |
if batch_size > 0:
|
| 252 |
_bs = batch_size
|
|
@@ -322,6 +336,27 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 322 |
uncertainty=pred_uncert,
|
| 323 |
)
|
| 324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
def encode_empty_text(self):
|
| 326 |
"""
|
| 327 |
Encode text embedding for empty prompt
|
|
@@ -358,9 +393,8 @@ class MarigoldPipeline(DiffusionPipeline):
|
|
| 358 |
`torch.Tensor`: Predicted depth map.
|
| 359 |
"""
|
| 360 |
device = self.device
|
| 361 |
-
# prepare data
|
| 362 |
rgb_in = rgb_in.to(device)
|
| 363 |
-
|
| 364 |
|
| 365 |
with torch.no_grad():
|
| 366 |
# Encode image
|
|
|
|
| 1 |
+
# HuggingFace backup
|
| 2 |
# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
|
| 3 |
# Last modified: 2024-05-24
|
| 4 |
#
|
|
|
|
| 24 |
from typing import Dict, Optional, Union
|
| 25 |
|
| 26 |
import numpy as np
|
|
|
|
| 27 |
import torch
|
| 28 |
import torch.nn as nn
|
| 29 |
import torch.nn.functional as F
|
|
|
|
| 52 |
)
|
| 53 |
from DA2.depth_anything_v2.dpt import DepthAnythingV2
|
| 54 |
|
|
|
|
| 55 |
class MarigoldDepthOutput(BaseOutput):
|
| 56 |
"""
|
| 57 |
Output class for Marigold monocular depth prediction pipeline.
|
|
|
|
| 97 |
A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
|
| 98 |
the model config. When used together with the `scale_invariant=True` flag, the model is also called
|
| 99 |
"affine-invariant". NB: overriding this value is not supported.
|
| 100 |
+
default_denoising_steps (`int`, *optional*):
|
| 101 |
+
The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
|
| 102 |
+
quality with the given model. This value must be set in the model config. When the pipeline is called
|
| 103 |
+
without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
|
| 104 |
+
reasonable results with various model flavors compatible with the pipeline, such as those relying on very
|
| 105 |
+
short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
|
| 106 |
default_processing_resolution (`int`, *optional*):
|
| 107 |
The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
|
| 108 |
the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
|
|
|
|
| 117 |
self,
|
| 118 |
unet: UNet2DConditionModel,
|
| 119 |
vae: AutoencoderKL,
|
|
|
|
| 120 |
text_encoder: CLIPTextModel,
|
| 121 |
tokenizer: CLIPTokenizer,
|
| 122 |
scale_invariant: Optional[bool] = True,
|
| 123 |
shift_invariant: Optional[bool] = True,
|
| 124 |
+
default_denoising_steps: Optional[int] = None,
|
| 125 |
default_processing_resolution: Optional[int] = None,
|
| 126 |
):
|
| 127 |
super().__init__()
|
| 128 |
self.register_modules(
|
| 129 |
unet=unet,
|
| 130 |
vae=vae,
|
|
|
|
| 131 |
text_encoder=text_encoder,
|
| 132 |
tokenizer=tokenizer,
|
| 133 |
)
|
| 134 |
self.register_to_config(
|
| 135 |
scale_invariant=scale_invariant,
|
| 136 |
shift_invariant=shift_invariant,
|
| 137 |
+
default_denoising_steps=default_denoising_steps,
|
| 138 |
default_processing_resolution=default_processing_resolution,
|
| 139 |
)
|
| 140 |
|
| 141 |
self.scale_invariant = scale_invariant
|
| 142 |
self.shift_invariant = shift_invariant
|
| 143 |
+
self.default_denoising_steps = default_denoising_steps
|
| 144 |
self.default_processing_resolution = default_processing_resolution
|
| 145 |
|
| 146 |
self.empty_text_embed = None
|
| 147 |
|
| 148 |
self._fft_masks = {}
|
|
|
|
| 149 |
da2_config = {
|
| 150 |
'encoder': 'vits', # 'vits', 'vitb', 'vitl', 'vitg'
|
| 151 |
'features': 64,
|
|
|
|
| 160 |
else:
|
| 161 |
self.da2 = None
|
| 162 |
|
| 163 |
+
|
| 164 |
@torch.no_grad()
|
| 165 |
def __call__(
|
| 166 |
self,
|
| 167 |
input_image: Union[Image.Image, torch.Tensor],
|
| 168 |
+
denoising_steps: Optional[int] = None,
|
| 169 |
+
ensemble_size: int = 5,
|
| 170 |
processing_res: Optional[int] = None,
|
| 171 |
match_input_res: bool = True,
|
| 172 |
resample_method: str = "bilinear",
|
|
|
|
| 181 |
Args:
|
| 182 |
input_image (`Image`):
|
| 183 |
Input RGB (or gray-scale) image.
|
| 184 |
+
denoising_steps (`int`, *optional*, defaults to `None`):
|
| 185 |
+
Number of denoising diffusion steps during inference. The default value `None` results in automatic
|
| 186 |
+
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
|
| 187 |
+
for Marigold-LCM models.
|
| 188 |
ensemble_size (`int`, *optional*, defaults to `5`):
|
| 189 |
Number of predictions to be ensembled.
|
| 190 |
processing_res (`int`, *optional*, defaults to `None`):
|
|
|
|
| 224 |
|
| 225 |
assert processing_res >= 0
|
| 226 |
|
| 227 |
+
# Check if denoising step is reasonable
|
| 228 |
+
# self._check_inference_step(denoising_steps)
|
| 229 |
+
|
| 230 |
resample_method: InterpolationMode = get_tv_resample_method(resample_method)
|
| 231 |
|
| 232 |
# ----------------- Image Preprocess -----------------
|
|
|
|
| 260 |
|
| 261 |
# ----------------- Predicting depth -----------------
|
| 262 |
# Batch repeated input image
|
| 263 |
+
duplicated_rgb = rgb_norm.expand(1, -1, -1, -1)
|
| 264 |
single_rgb_dataset = TensorDataset(duplicated_rgb)
|
| 265 |
if batch_size > 0:
|
| 266 |
_bs = batch_size
|
|
|
|
| 336 |
uncertainty=pred_uncert,
|
| 337 |
)
|
| 338 |
|
| 339 |
+
def _check_inference_step(self, n_step: int) -> None:
|
| 340 |
+
"""
|
| 341 |
+
Check if denoising step is reasonable
|
| 342 |
+
Args:
|
| 343 |
+
n_step (`int`): denoising steps
|
| 344 |
+
"""
|
| 345 |
+
assert n_step >= 1
|
| 346 |
+
|
| 347 |
+
if isinstance(self.scheduler, DDIMScheduler):
|
| 348 |
+
if n_step < 10:
|
| 349 |
+
logging.warning(
|
| 350 |
+
f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
|
| 351 |
+
)
|
| 352 |
+
elif isinstance(self.scheduler, LCMScheduler):
|
| 353 |
+
if not 1 <= n_step <= 4:
|
| 354 |
+
logging.warning(
|
| 355 |
+
f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps."
|
| 356 |
+
)
|
| 357 |
+
else:
|
| 358 |
+
raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
|
| 359 |
+
|
| 360 |
def encode_empty_text(self):
|
| 361 |
"""
|
| 362 |
Encode text embedding for empty prompt
|
|
|
|
| 393 |
`torch.Tensor`: Predicted depth map.
|
| 394 |
"""
|
| 395 |
device = self.device
|
|
|
|
| 396 |
rgb_in = rgb_in.to(device)
|
| 397 |
+
depth_da2 = self.da2.infer_batch(rgb_in).to(device)
|
| 398 |
|
| 399 |
with torch.no_grad():
|
| 400 |
# Encode image
|