Spaces:

developy
/

ApDepth

Sleeping

App Files Files Community

developy committed on Oct 15, 2025

Commit

9f63d59

verified ·

1 Parent(s): 576b7f9

Update apdepth/marigold_pipeline.py

Browse files

Files changed (1) hide show

apdepth/marigold_pipeline.py +43 -9

apdepth/marigold_pipeline.py CHANGED Viewed

@@ -1,3 +1,4 @@
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
 # Last modified: 2024-05-24
 #
@@ -23,7 +24,6 @@ import logging
 from typing import Dict, Optional, Union
 import numpy as np
-import cv2
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -52,7 +52,6 @@ from .util.image_util import (
 )
 from DA2.depth_anything_v2.dpt import DepthAnythingV2
 class MarigoldDepthOutput(BaseOutput):
     """
     Output class for Marigold monocular depth prediction pipeline.
@@ -98,6 +97,12 @@ class MarigoldPipeline(DiffusionPipeline):
             A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
             the model config. When used together with the `scale_invariant=True` flag, the model is also called
             "affine-invariant". NB: overriding this value is not supported.
         default_processing_resolution (`int`, *optional*):
             The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
             the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
@@ -112,35 +117,35 @@ class MarigoldPipeline(DiffusionPipeline):
         self,
         unet: UNet2DConditionModel,
         vae: AutoencoderKL,
-        scheduler: Union[DDIMScheduler, LCMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         scale_invariant: Optional[bool] = True,
         shift_invariant: Optional[bool] = True,
         default_processing_resolution: Optional[int] = None,
     ):
         super().__init__()
         self.register_modules(
             unet=unet,
             vae=vae,
-            scheduler=scheduler,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
         )
         self.register_to_config(
             scale_invariant=scale_invariant,
             shift_invariant=shift_invariant,
             default_processing_resolution=default_processing_resolution,
         )
         self.scale_invariant = scale_invariant
         self.shift_invariant = shift_invariant
         self.default_processing_resolution = default_processing_resolution
         self.empty_text_embed = None
         self._fft_masks = {}
         da2_config = {
             'encoder': 'vits',  # 'vits', 'vitb', 'vitl', 'vitg'
             'features': 64,
@@ -155,11 +160,13 @@ class MarigoldPipeline(DiffusionPipeline):
         else:
             self.da2 = None
     @torch.no_grad()
     def __call__(
         self,
         input_image: Union[Image.Image, torch.Tensor],
-        ensemble_size: int = 1,
         processing_res: Optional[int] = None,
         match_input_res: bool = True,
         resample_method: str = "bilinear",
@@ -174,6 +181,10 @@ class MarigoldPipeline(DiffusionPipeline):
         Args:
             input_image (`Image`):
                 Input RGB (or gray-scale) image.
             ensemble_size (`int`, *optional*, defaults to `10`):
                 Number of predictions to be ensembled.
             processing_res (`int`, *optional*, defaults to `None`):
@@ -213,6 +224,9 @@ class MarigoldPipeline(DiffusionPipeline):
         assert processing_res >= 0
         resample_method: InterpolationMode = get_tv_resample_method(resample_method)
         # ----------------- Image Preprocess -----------------
@@ -246,7 +260,7 @@ class MarigoldPipeline(DiffusionPipeline):
         # ----------------- Predicting depth -----------------
         # Batch repeated input image
-        duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
         single_rgb_dataset = TensorDataset(duplicated_rgb)
         if batch_size > 0:
             _bs = batch_size
@@ -322,6 +336,27 @@ class MarigoldPipeline(DiffusionPipeline):
             uncertainty=pred_uncert,
         )
     def encode_empty_text(self):
         """
         Encode text embedding for empty prompt
@@ -358,9 +393,8 @@ class MarigoldPipeline(DiffusionPipeline):
             `torch.Tensor`: Predicted depth map.
         """
         device = self.device
-        # preprare data
         rgb_in = rgb_in.to(device)
-        depth_da2 = self.da2.infer_batch(rgb_in).to(device)
         with torch.no_grad():
             # Encode image

+# Hugging Face backup
 # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
 # Last modified: 2024-05-24
 #
 from typing import Dict, Optional, Union
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 )
 from DA2.depth_anything_v2.dpt import DepthAnythingV2
 class MarigoldDepthOutput(BaseOutput):
     """
     Output class for Marigold monocular depth prediction pipeline.
             A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
             the model config. When used together with the `scale_invariant=True` flag, the model is also called
             "affine-invariant". NB: overriding this value is not supported.
+        default_denoising_steps (`int`, *optional*):
+            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
+            quality with the given model. This value must be set in the model config. When the pipeline is called
+            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
+            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
+            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
         default_processing_resolution (`int`, *optional*):
             The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
             the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
         self,
         unet: UNet2DConditionModel,
         vae: AutoencoderKL,
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         scale_invariant: Optional[bool] = True,
         shift_invariant: Optional[bool] = True,
+        default_denoising_steps: Optional[int] = None,
         default_processing_resolution: Optional[int] = None,
     ):
         super().__init__()
         self.register_modules(
             unet=unet,
             vae=vae,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
         )
         self.register_to_config(
             scale_invariant=scale_invariant,
             shift_invariant=shift_invariant,
+            default_denoising_steps=default_denoising_steps,
             default_processing_resolution=default_processing_resolution,
         )
         self.scale_invariant = scale_invariant
         self.shift_invariant = shift_invariant
+        self.default_denoising_steps = default_denoising_steps
         self.default_processing_resolution = default_processing_resolution
         self.empty_text_embed = None
         self._fft_masks = {}
         da2_config = {
             'encoder': 'vits',  # 'vits', 'vitb', 'vitl', 'vitg'
             'features': 64,
         else:
             self.da2 = None
     @torch.no_grad()
     def __call__(
         self,
         input_image: Union[Image.Image, torch.Tensor],
+        denoising_steps: Optional[int] = None,
+        ensemble_size: int = 5,
         processing_res: Optional[int] = None,
         match_input_res: bool = True,
         resample_method: str = "bilinear",
         Args:
             input_image (`Image`):
                 Input RGB (or gray-scale) image.
+            denoising_steps (`int`, *optional*, defaults to `None`):
+                Number of denoising diffusion steps during inference. The default value `None` results in automatic
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
             ensemble_size (`int`, *optional*, defaults to `10`):
                 Number of predictions to be ensembled.
             processing_res (`int`, *optional*, defaults to `None`):
         assert processing_res >= 0
+        # Check if denoising step is reasonable
+        # self._check_inference_step(denoising_steps)
         resample_method: InterpolationMode = get_tv_resample_method(resample_method)
         # ----------------- Image Preprocess -----------------
         # ----------------- Predicting depth -----------------
         # Batch repeated input image
+        duplicated_rgb = rgb_norm.expand(1, -1, -1, -1)
         single_rgb_dataset = TensorDataset(duplicated_rgb)
         if batch_size > 0:
             _bs = batch_size
             uncertainty=pred_uncert,
         )
+    def _check_inference_step(self, n_step: int) -> None:
+        """
+        Check if denoising step is reasonable
+        Args:
+            n_step (`int`): denoising steps
+        """
+        assert n_step >= 1
+        if isinstance(self.scheduler, DDIMScheduler):
+            if n_step < 10:
+                logging.warning(
+                    f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
+                )
+        elif isinstance(self.scheduler, LCMScheduler):
+            if not 1 <= n_step <= 4:
+                logging.warning(
+                    f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps."
+                )
+        else:
+            raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
     def encode_empty_text(self):
         """
         Encode text embedding for empty prompt
             `torch.Tensor`: Predicted depth map.
         """
         device = self.device
         rgb_in = rgb_in.to(device)
+         depth_da2 = self.da2.infer_batch(rgb_in).to(device)
         with torch.no_grad():
             # Encode image