developy committed on
Commit
9f63d59
·
verified ·
1 Parent(s): 576b7f9

Update apdepth/marigold_pipeline.py

Browse files
Files changed (1) hide show
  1. apdepth/marigold_pipeline.py +43 -9
apdepth/marigold_pipeline.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
  # Last modified: 2024-05-24
3
  #
@@ -23,7 +24,6 @@ import logging
23
  from typing import Dict, Optional, Union
24
 
25
  import numpy as np
26
- import cv2
27
  import torch
28
  import torch.nn as nn
29
  import torch.nn.functional as F
@@ -52,7 +52,6 @@ from .util.image_util import (
52
  )
53
  from DA2.depth_anything_v2.dpt import DepthAnythingV2
54
 
55
-
56
  class MarigoldDepthOutput(BaseOutput):
57
  """
58
  Output class for Marigold monocular depth prediction pipeline.
@@ -98,6 +97,12 @@ class MarigoldPipeline(DiffusionPipeline):
98
  A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
99
  the model config. When used together with the `scale_invariant=True` flag, the model is also called
100
  "affine-invariant". NB: overriding this value is not supported.
 
 
 
 
 
 
101
  default_processing_resolution (`int`, *optional*):
102
  The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
103
  the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
@@ -112,35 +117,35 @@ class MarigoldPipeline(DiffusionPipeline):
112
  self,
113
  unet: UNet2DConditionModel,
114
  vae: AutoencoderKL,
115
- scheduler: Union[DDIMScheduler, LCMScheduler],
116
  text_encoder: CLIPTextModel,
117
  tokenizer: CLIPTokenizer,
118
  scale_invariant: Optional[bool] = True,
119
  shift_invariant: Optional[bool] = True,
 
120
  default_processing_resolution: Optional[int] = None,
121
  ):
122
  super().__init__()
123
  self.register_modules(
124
  unet=unet,
125
  vae=vae,
126
- scheduler=scheduler,
127
  text_encoder=text_encoder,
128
  tokenizer=tokenizer,
129
  )
130
  self.register_to_config(
131
  scale_invariant=scale_invariant,
132
  shift_invariant=shift_invariant,
 
133
  default_processing_resolution=default_processing_resolution,
134
  )
135
 
136
  self.scale_invariant = scale_invariant
137
  self.shift_invariant = shift_invariant
 
138
  self.default_processing_resolution = default_processing_resolution
139
 
140
  self.empty_text_embed = None
141
 
142
  self._fft_masks = {}
143
-
144
  da2_config = {
145
  'encoder': 'vits', # 'vits', 'vitb', 'vitl', 'vitg'
146
  'features': 64,
@@ -155,11 +160,13 @@ class MarigoldPipeline(DiffusionPipeline):
155
  else:
156
  self.da2 = None
157
 
 
158
  @torch.no_grad()
159
  def __call__(
160
  self,
161
  input_image: Union[Image.Image, torch.Tensor],
162
- ensemble_size: int = 1,
 
163
  processing_res: Optional[int] = None,
164
  match_input_res: bool = True,
165
  resample_method: str = "bilinear",
@@ -174,6 +181,10 @@ class MarigoldPipeline(DiffusionPipeline):
174
  Args:
175
  input_image (`Image`):
176
  Input RGB (or gray-scale) image.
 
 
 
 
177
  ensemble_size (`int`, *optional*, defaults to `10`):
178
  Number of predictions to be ensembled.
179
  processing_res (`int`, *optional*, defaults to `None`):
@@ -213,6 +224,9 @@ class MarigoldPipeline(DiffusionPipeline):
213
 
214
  assert processing_res >= 0
215
 
 
 
 
216
  resample_method: InterpolationMode = get_tv_resample_method(resample_method)
217
 
218
  # ----------------- Image Preprocess -----------------
@@ -246,7 +260,7 @@ class MarigoldPipeline(DiffusionPipeline):
246
 
247
  # ----------------- Predicting depth -----------------
248
  # Batch repeated input image
249
- duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
250
  single_rgb_dataset = TensorDataset(duplicated_rgb)
251
  if batch_size > 0:
252
  _bs = batch_size
@@ -322,6 +336,27 @@ class MarigoldPipeline(DiffusionPipeline):
322
  uncertainty=pred_uncert,
323
  )
324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  def encode_empty_text(self):
326
  """
327
  Encode text embedding for empty prompt
@@ -358,9 +393,8 @@ class MarigoldPipeline(DiffusionPipeline):
358
  `torch.Tensor`: Predicted depth map.
359
  """
360
  device = self.device
361
- # preprare data
362
  rgb_in = rgb_in.to(device)
363
- depth_da2 = self.da2.infer_batch(rgb_in).to(device)
364
 
365
  with torch.no_grad():
366
  # Encode image
 
1
+ huggingface备份
2
  # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
3
  # Last modified: 2024-05-24
4
  #
 
24
  from typing import Dict, Optional, Union
25
 
26
  import numpy as np
 
27
  import torch
28
  import torch.nn as nn
29
  import torch.nn.functional as F
 
52
  )
53
  from DA2.depth_anything_v2.dpt import DepthAnythingV2
54
 
 
55
  class MarigoldDepthOutput(BaseOutput):
56
  """
57
  Output class for Marigold monocular depth prediction pipeline.
 
97
  A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
98
  the model config. When used together with the `scale_invariant=True` flag, the model is also called
99
  "affine-invariant". NB: overriding this value is not supported.
100
+ default_denoising_steps (`int`, *optional*):
101
+ The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
102
+ quality with the given model. This value must be set in the model config. When the pipeline is called
103
+ without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
104
+ reasonable results with various model flavors compatible with the pipeline, such as those relying on very
105
+ short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
106
  default_processing_resolution (`int`, *optional*):
107
  The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
108
  the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
 
117
  self,
118
  unet: UNet2DConditionModel,
119
  vae: AutoencoderKL,
 
120
  text_encoder: CLIPTextModel,
121
  tokenizer: CLIPTokenizer,
122
  scale_invariant: Optional[bool] = True,
123
  shift_invariant: Optional[bool] = True,
124
+ default_denoising_steps: Optional[int] = None,
125
  default_processing_resolution: Optional[int] = None,
126
  ):
127
  super().__init__()
128
  self.register_modules(
129
  unet=unet,
130
  vae=vae,
 
131
  text_encoder=text_encoder,
132
  tokenizer=tokenizer,
133
  )
134
  self.register_to_config(
135
  scale_invariant=scale_invariant,
136
  shift_invariant=shift_invariant,
137
+ default_denoising_steps=default_denoising_steps,
138
  default_processing_resolution=default_processing_resolution,
139
  )
140
 
141
  self.scale_invariant = scale_invariant
142
  self.shift_invariant = shift_invariant
143
+ self.default_denoising_steps = default_denoising_steps
144
  self.default_processing_resolution = default_processing_resolution
145
 
146
  self.empty_text_embed = None
147
 
148
  self._fft_masks = {}
 
149
  da2_config = {
150
  'encoder': 'vits', # 'vits', 'vitb', 'vitl', 'vitg'
151
  'features': 64,
 
160
  else:
161
  self.da2 = None
162
 
163
+
164
  @torch.no_grad()
165
  def __call__(
166
  self,
167
  input_image: Union[Image.Image, torch.Tensor],
168
+ denoising_steps: Optional[int] = None,
169
+ ensemble_size: int = 5,
170
  processing_res: Optional[int] = None,
171
  match_input_res: bool = True,
172
  resample_method: str = "bilinear",
 
181
  Args:
182
  input_image (`Image`):
183
  Input RGB (or gray-scale) image.
184
+ denoising_steps (`int`, *optional*, defaults to `None`):
185
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
186
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
187
+ for Marigold-LCM models.
188
  ensemble_size (`int`, *optional*, defaults to `10`):
189
  Number of predictions to be ensembled.
190
  processing_res (`int`, *optional*, defaults to `None`):
 
224
 
225
  assert processing_res >= 0
226
 
227
+ # Check if denoising step is reasonable
228
+ # self._check_inference_step(denoising_steps)
229
+
230
  resample_method: InterpolationMode = get_tv_resample_method(resample_method)
231
 
232
  # ----------------- Image Preprocess -----------------
 
260
 
261
  # ----------------- Predicting depth -----------------
262
  # Batch repeated input image
263
+ duplicated_rgb = rgb_norm.expand(1, -1, -1, -1)
264
  single_rgb_dataset = TensorDataset(duplicated_rgb)
265
  if batch_size > 0:
266
  _bs = batch_size
 
336
  uncertainty=pred_uncert,
337
  )
338
 
339
+ def _check_inference_step(self, n_step: int) -> None:
340
+ """
341
+ Check if denoising step is reasonable
342
+ Args:
343
+ n_step (`int`): denoising steps
344
+ """
345
+ assert n_step >= 1
346
+
347
+ if isinstance(self.scheduler, DDIMScheduler):
348
+ if n_step < 10:
349
+ logging.warning(
350
+ f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
351
+ )
352
+ elif isinstance(self.scheduler, LCMScheduler):
353
+ if not 1 <= n_step <= 4:
354
+ logging.warning(
355
+ f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps."
356
+ )
357
+ else:
358
+ raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
359
+
360
  def encode_empty_text(self):
361
  """
362
  Encode text embedding for empty prompt
 
393
  `torch.Tensor`: Predicted depth map.
394
  """
395
  device = self.device
 
396
  rgb_in = rgb_in.to(device)
397
+ depth_da2 = self.da2.infer_batch(rgb_in).to(device)
398
 
399
  with torch.no_grad():
400
  # Encode image