GonzaloMG
/

stable-diffusion-e2e-ft-depth

@@ -108,8 +108,6 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
             Text-encoder, for empty text embedding.
         tokenizer (`CLIPTokenizer`):
             CLIP tokenizer.
-        prediction_type (`str`, *optional*):
-            Type of predictions made by the model.
         default_processing_resolution (`int`, *optional*):
             The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
             the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
@@ -118,7 +116,6 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
     """
     model_cpu_offload_seq = "text_encoder->unet->vae"
-    supported_prediction_types = ("depth", "disparity")
     def __init__(
         self,
@@ -127,17 +124,10 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
         scheduler: Union[DDIMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
-        prediction_type: Optional[str] = None,
-        default_processing_resolution: Optional[int] = None,
     ):
         super().__init__()
-        if prediction_type not in self.supported_prediction_types:
-            logger.warning(
-                f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
-                f"{self.supported_prediction_types}."
-            )
         self.register_modules(
             unet=unet,
             vae=vae,
@@ -146,7 +136,6 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
             tokenizer=tokenizer,
         )
         self.register_to_config(
-            prediction_type=prediction_type,
             default_processing_resolution=default_processing_resolution,
         )
@@ -473,6 +462,8 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
         prediction = prediction.mean(dim=1, keepdim=True)  # [B,1,H,W]
         prediction = torch.clip(prediction, -1.0, 1.0)  # [B,1,H,W]
-        prediction = (prediction + 1.0) / 2.0
-        return prediction  # [B,1,H,W]

             Text-encoder, for empty text embedding.
         tokenizer (`CLIPTokenizer`):
             CLIP tokenizer.
         default_processing_resolution (`int`, *optional*):
             The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
             the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
     """
     model_cpu_offload_seq = "text_encoder->unet->vae"
     def __init__(
         self,
         scheduler: Union[DDIMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
+        default_processing_resolution: Optional[int] = 768,
     ):
         super().__init__()
         self.register_modules(
             unet=unet,
             vae=vae,
             tokenizer=tokenizer,
         )
         self.register_to_config(
             default_processing_resolution=default_processing_resolution,
         )
         prediction = prediction.mean(dim=1, keepdim=True)  # [B,1,H,W]
         prediction = torch.clip(prediction, -1.0, 1.0)  # [B,1,H,W]
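+        # Min-max rescale the prediction to [0, 1] using its own min and max
+        # (replaces the fixed (prediction + 1.0) / 2.0 mapping).
+        # NOTE(review): the denominator is zero when the prediction is constant.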
+        prediction = (prediction - prediction.min()) / (prediction.max() - prediction.min())
+        return prediction  # [B,1,H,W]