degbo committed on
Commit
f2dd2b8
·
1 Parent(s): 5e75ca4

update with new code

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +5 -8
  2. config/dataset_depth/data_diode_all.yaml +0 -4
  3. config/dataset_depth/data_eth3d.yaml +0 -4
  4. config/dataset_depth/data_hypersim_train.yaml +0 -4
  5. config/dataset_depth/data_hypersim_val.yaml +0 -4
  6. config/dataset_depth/data_kitti_eigen_test.yaml +0 -6
  7. config/dataset_depth/data_kitti_val.yaml +0 -6
  8. config/dataset_depth/data_nyu_test.yaml +0 -5
  9. config/dataset_depth/data_nyu_train.yaml +0 -5
  10. config/dataset_depth/data_scannet_val.yaml +0 -4
  11. config/dataset_depth/data_vkitti_train.yaml +0 -6
  12. config/dataset_depth/data_vkitti_val.yaml +0 -6
  13. config/dataset_depth/dataset_train.yaml +0 -18
  14. config/dataset_depth/dataset_val.yaml +0 -45
  15. config/dataset_depth/dataset_vis.yaml +0 -9
  16. config/dataset_iid/data_appearance_interiorverse_test.yaml +0 -4
  17. config/dataset_iid/data_appearance_synthetic_test.yaml +0 -4
  18. config/dataset_iid/data_art_test.yaml +0 -4
  19. config/dataset_iid/data_lighting_hypersim_test.yaml +0 -4
  20. config/dataset_iid/dataset_appearance_train.yaml +0 -9
  21. config/dataset_iid/dataset_appearance_val.yaml +0 -6
  22. config/dataset_iid/dataset_appearance_vis.yaml +0 -6
  23. config/dataset_iid/dataset_lighting_train.yaml +0 -12
  24. config/dataset_iid/dataset_lighting_val.yaml +0 -6
  25. config/dataset_iid/dataset_lighting_vis.yaml +0 -6
  26. config/dataset_iid/osu_data_appearance_interiorverse_test.yaml +0 -4
  27. config/dataset_normals/data_diode_test.yaml +0 -4
  28. config/dataset_normals/data_ibims_test.yaml +0 -4
  29. config/dataset_normals/data_nyu_test.yaml +0 -4
  30. config/dataset_normals/data_oasis_test.yaml +0 -4
  31. config/dataset_normals/data_scannet_test.yaml +0 -4
  32. config/dataset_normals/dataset_train.yaml +0 -25
  33. config/dataset_normals/dataset_val.yaml +0 -7
  34. config/dataset_normals/dataset_vis.yaml +0 -7
  35. config/logging.yaml +0 -5
  36. config/model_sdv2.yaml +0 -4
  37. config/train_debug_depth.yaml +0 -10
  38. config/train_debug_iid.yaml +0 -11
  39. config/train_debug_normals.yaml +0 -10
  40. config/train_marigold_depth.yaml +0 -94
  41. config/train_marigold_iid_appearance.yaml +0 -81
  42. config/train_marigold_iid_appearance_finetuned.yaml +0 -81
  43. config/train_marigold_iid_lighting.yaml +0 -82
  44. config/train_marigold_normals.yaml +0 -86
  45. config/wandb.yaml +0 -3
  46. marigold/__init__.py +0 -41
  47. marigold/marigold_depth_pipeline.py +0 -516
  48. marigold/marigold_normals_pipeline.py +0 -479
  49. {src → olbedo}/__init__.py +2 -0
  50. olbedo/__pycache__/__init__.cpython-310.pyc +0 -0
app.py CHANGED
@@ -7,15 +7,13 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..",
7
  import gradio as gr
8
  import numpy as np
9
  import torch
10
- from marigold import MarigoldIIDOutput, MarigoldIIDPipeline
11
  from src.util.image_util import read_img_from_file, img_hwc2chw, img_linear2srgb, is_hdr
12
- from marigold.util.image_util import float2int
13
  from src.util.seeding import seed_all
14
  import logging
15
  from huggingface_hub import snapshot_download
16
 
17
- HF_TOKEN = os.getenv("HF_TOKEN")
18
-
19
  seed = 1234
20
  seed_all(seed)
21
  if torch.cuda.is_available():
@@ -46,12 +44,11 @@ def get_demo():
46
  local_dir = snapshot_download(
47
  repo_id="GDAOSU/olbedo",
48
  allow_patterns=f"{selected_model}/*",
49
- token=HF_TOKEN,
50
  )
51
 
52
  model_path = os.path.join(local_dir, selected_model)
53
 
54
- pipe = MarigoldIIDPipeline.from_pretrained(
55
  model_path,
56
  torch_dtype=torch.float32,
57
  ).to(device)
@@ -102,7 +99,7 @@ def get_demo():
102
  if "rgbx" in selected_model:
103
  pipe.prompt = prompt
104
 
105
- pipe_out: MarigoldIIDOutput = pipe(
106
  input_image,
107
  denoising_steps=inference_step,
108
  ensemble_size=1,
@@ -136,7 +133,7 @@ def get_demo():
136
  block = gr.Blocks()
137
  with block:
138
  with gr.Row():
139
- gr.Markdown("## OSU albedo demo")
140
  with gr.Row():
141
  # Input side
142
  with gr.Column():
 
7
  import gradio as gr
8
  import numpy as np
9
  import torch
10
+ from olbedo import OlbedoIIDOutput, OlbedoIIDPipeline
11
  from src.util.image_util import read_img_from_file, img_hwc2chw, img_linear2srgb, is_hdr
12
+ from olbedo.util.image_util import float2int
13
  from src.util.seeding import seed_all
14
  import logging
15
  from huggingface_hub import snapshot_download
16
 
 
 
17
  seed = 1234
18
  seed_all(seed)
19
  if torch.cuda.is_available():
 
44
  local_dir = snapshot_download(
45
  repo_id="GDAOSU/olbedo",
46
  allow_patterns=f"{selected_model}/*",
 
47
  )
48
 
49
  model_path = os.path.join(local_dir, selected_model)
50
 
51
+ pipe = OlbedoIIDPipeline.from_pretrained(
52
  model_path,
53
  torch_dtype=torch.float32,
54
  ).to(device)
 
99
  if "rgbx" in selected_model:
100
  pipe.prompt = prompt
101
 
102
+ pipe_out: OlbedoIIDOutput = pipe(
103
  input_image,
104
  denoising_steps=inference_step,
105
  ensemble_size=1,
 
133
  block = gr.Blocks()
134
  with block:
135
  with gr.Row():
136
+ gr.Markdown("## Olbedo: An Albedo and Shading Aerial Dataset for Large-Scale Outdoor Environments")
137
  with gr.Row():
138
  # Input side
139
  with gr.Column():
config/dataset_depth/data_diode_all.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: diode_depth
2
- disp_name: diode_depth_val_all
3
- dir: diode/diode_val.tar
4
- filenames: data_split/diode_depth/diode_val_all_filename_list.txt
 
 
 
 
 
config/dataset_depth/data_eth3d.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: eth3d_depth
2
- disp_name: eth3d_depth_full
3
- dir: eth3d/eth3d.tar
4
- filenames: data_split/eth3d_depth/eth3d_filename_list.txt
 
 
 
 
 
config/dataset_depth/data_hypersim_train.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: hypersim_depth
2
- disp_name: hypersim_depth_train
3
- dir: hypersim/hypersim_processed_train.tar
4
- filenames: data_split/hypersim_depth/filename_list_train_filtered.txt
 
 
 
 
 
config/dataset_depth/data_hypersim_val.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: hypersim_depth
2
- disp_name: hypersim_depth_val
3
- dir: hypersim/hypersim_processed_val.tar
4
- filenames: data_split/hypersim_depth/filename_list_val_filtered.txt
 
 
 
 
 
config/dataset_depth/data_kitti_eigen_test.yaml DELETED
@@ -1,6 +0,0 @@
1
- name: kitti_depth
2
- disp_name: kitti_depth_eigen_test_full
3
- dir: kitti/kitti_eigen_split_test.tar
4
- filenames: data_split/kitti_depth/eigen_test_files_with_gt.txt
5
- kitti_bm_crop: true
6
- valid_mask_crop: eigen
 
 
 
 
 
 
 
config/dataset_depth/data_kitti_val.yaml DELETED
@@ -1,6 +0,0 @@
1
- name: kitti_depth
2
- disp_name: kitti_depth_val800_from_eigen_train
3
- dir: kitti/kitti_sampled_val_800.tar
4
- filenames: data_split/kitti_depth/eigen_val_from_train_800.txt
5
- kitti_bm_crop: true
6
- valid_mask_crop: eigen
 
 
 
 
 
 
 
config/dataset_depth/data_nyu_test.yaml DELETED
@@ -1,5 +0,0 @@
1
- name: nyu_depth
2
- disp_name: nyu_depth_test_full
3
- dir: nyuv2/nyu_labeled_extracted.tar
4
- filenames: data_split/nyu_depth/labeled/filename_list_test.txt
5
- eigen_valid_mask: true
 
 
 
 
 
 
config/dataset_depth/data_nyu_train.yaml DELETED
@@ -1,5 +0,0 @@
1
- name: nyu_depth
2
- disp_name: nyu_depth_train_full
3
- dir: nyuv2/nyu_labeled_extracted.tar
4
- filenames: data_split/nyu_depth/labeled/filename_list_train.txt
5
- eigen_valid_mask: true
 
 
 
 
 
 
config/dataset_depth/data_scannet_val.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: scannet_depth
2
- disp_name: scannet_depth_val_800_1
3
- dir: scannet/scannet_val_sampled_800_1.tar
4
- filenames: data_split/scannet_depth/scannet_val_sampled_list_800_1.txt
 
 
 
 
 
config/dataset_depth/data_vkitti_train.yaml DELETED
@@ -1,6 +0,0 @@
1
- name: vkitti_depth
2
- disp_name: vkitti_depth_train
3
- dir: vkitti/vkitti.tar
4
- filenames: data_split/vkitti_depth/vkitti_train.txt
5
- kitti_bm_crop: true
6
- valid_mask_crop: null # no valid_mask_crop for training
 
 
 
 
 
 
 
config/dataset_depth/data_vkitti_val.yaml DELETED
@@ -1,6 +0,0 @@
1
- name: vkitti_depth
2
- disp_name: vkitti_depth_val
3
- dir: vkitti/vkitti.tar
4
- filenames: data_split/vkitti_depth/vkitti_val.txt
5
- kitti_bm_crop: true
6
- valid_mask_crop: eigen
 
 
 
 
 
 
 
config/dataset_depth/dataset_train.yaml DELETED
@@ -1,18 +0,0 @@
1
- dataset:
2
- train:
3
- name: mixed
4
- prob_ls: [0.9, 0.1]
5
- dataset_list:
6
- - name: hypersim_depth
7
- disp_name: hypersim_depth_train
8
- dir: hypersim/hypersim_processed_train.tar
9
- filenames: data_split/hypersim_depth/filename_list_train_filtered.txt
10
- resize_to_hw:
11
- - 480
12
- - 640
13
- - name: vkitti_depth
14
- disp_name: vkitti_depth_train
15
- dir: vkitti/vkitti.tar
16
- filenames: data_split/vkitti_depth/vkitti_train.txt
17
- kitti_bm_crop: true
18
- valid_mask_crop: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/dataset_depth/dataset_val.yaml DELETED
@@ -1,45 +0,0 @@
1
- dataset:
2
- val:
3
- # - name: hypersim_depth
4
- # disp_name: hypersim_depth_val
5
- # dir: hypersim/hypersim_processed_val.tar
6
- # filenames: data_split/hypersim_depth/filename_list_val_filtered.txt
7
- # resize_to_hw:
8
- # - 480
9
- # - 640
10
-
11
- # - name: nyu_depth
12
- # disp_name: nyu_depth_train_full
13
- # dir: nyuv2/nyu_labeled_extracted.tar
14
- # filenames: data_split/nyu_depth/labeled/filename_list_train.txt
15
- # eigen_valid_mask: true
16
-
17
- # - name: kitti_depth
18
- # disp_name: kitti_depth_val800_from_eigen_train
19
- # dir: kitti/kitti_depth_sampled_val_800.tar
20
- # filenames: data_split/kitti_depth/eigen_val_from_train_800.txt
21
- # kitti_bm_crop: true
22
- # valid_mask_crop: eigen
23
-
24
- # Smaller subsets for faster validation during training
25
- # The first dataset is used to calculate main eval metric.
26
- - name: hypersim_depth
27
- disp_name: hypersim_depth_val_small_80
28
- dir: hypersim/hypersim_processed_val.tar
29
- filenames: data_split/hypersim_depth/filename_list_val_filtered_small_80.txt
30
- resize_to_hw:
31
- - 480
32
- - 640
33
-
34
- - name: nyu_depth
35
- disp_name: nyu_depth_train_small_100
36
- dir: nyuv2/nyu_labeled_extracted.tar
37
- filenames: data_split/nyu_depth/labeled/filename_list_train_small_100.txt
38
- eigen_valid_mask: true
39
-
40
- - name: kitti_depth
41
- disp_name: kitti_depth_val_from_train_sub_100
42
- dir: kitti/kitti_sampled_val_800.tar
43
- filenames: data_split/kitti_depth/eigen_val_from_train_sub_100.txt
44
- kitti_bm_crop: true
45
- valid_mask_crop: eigen
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/dataset_depth/dataset_vis.yaml DELETED
@@ -1,9 +0,0 @@
1
- dataset:
2
- vis:
3
- - name: hypersim_depth
4
- disp_name: hypersim_depth_vis
5
- dir: hypersim/hypersim_processed_val.tar
6
- filenames: data_split/hypersim_depth/selected_vis_sample.txt
7
- resize_to_hw:
8
- - 480
9
- - 640
 
 
 
 
 
 
 
 
 
 
config/dataset_iid/data_appearance_interiorverse_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: interiorverse_iid
2
- disp_name: interiorverse_iid_appearance_test
3
- dir: interiorverse/InteriorVerse.tar
4
- filenames: data_split/interiorverse_iid/interiorverse_test_scenes_85.txt
 
 
 
 
 
config/dataset_iid/data_appearance_synthetic_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: interiorverse_iid
2
- disp_name: interiorverse_iid_appearance_test
3
- dir: synthetic
4
- filenames: data_split/osu/osu_test_scenes_85.txt
 
 
 
 
 
config/dataset_iid/data_art_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: interiorverse_iid
2
- disp_name: interiorverse_iid_appearance_test
3
- dir: art
4
- filenames: data_split/osu/art_test_scenes.txt
 
 
 
 
 
config/dataset_iid/data_lighting_hypersim_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: hypersim_iid
2
- disp_name: hypersim_iid_lighting_test
3
- dir: hypersim
4
- filenames: data_split/hypersim_iid/hypersim_test.txt
 
 
 
 
 
config/dataset_iid/dataset_appearance_train.yaml DELETED
@@ -1,9 +0,0 @@
1
- dataset:
2
- train:
3
- name: mixed
4
- prob_ls: [1.0]
5
- dataset_list:
6
- - name: interiorverse_iid
7
- disp_name: interiorverse_iid_appearance_train
8
- dir: osu_albedo_new
9
- filenames: data_split/osu/osu_train_scenes_85.txt
 
 
 
 
 
 
 
 
 
 
config/dataset_iid/dataset_appearance_val.yaml DELETED
@@ -1,6 +0,0 @@
1
- dataset:
2
- val:
3
- - name: interiorverse_iid
4
- disp_name: interiorverse_iid_appearance_val
5
- dir: synthetic
6
- filenames: data_split/MatrixCity/matrixcity_val_scenes_small.txt
 
 
 
 
 
 
 
config/dataset_iid/dataset_appearance_vis.yaml DELETED
@@ -1,6 +0,0 @@
1
- dataset:
2
- vis:
3
- - name: interiorverse_iid
4
- disp_name: interiorverse_iid_appearance_vis
5
- dir: synthetic
6
- filenames: data_split/MatrixCity/matrixcity_vis_scenes.txt
 
 
 
 
 
 
 
config/dataset_iid/dataset_lighting_train.yaml DELETED
@@ -1,12 +0,0 @@
1
- dataset:
2
- train:
3
- name: mixed
4
- prob_ls: [1.0]
5
- dataset_list:
6
- - name: hypersim_iid
7
- disp_name: hypersim_iid_lighting_train
8
- dir: hypersim
9
- filenames: data_split/hypersim_iid/hypersim_train_filtered.txt
10
- resize_to_hw:
11
- - 480
12
- - 640
 
 
 
 
 
 
 
 
 
 
 
 
 
config/dataset_iid/dataset_lighting_val.yaml DELETED
@@ -1,6 +0,0 @@
1
- dataset:
2
- val:
3
- - name: hypersim_iid
4
- disp_name: hypersim_iid_lighting_val
5
- dir: hypersim
6
- filenames: data_split/hypersim_iid/hypersim_val.txt
 
 
 
 
 
 
 
config/dataset_iid/dataset_lighting_vis.yaml DELETED
@@ -1,6 +0,0 @@
1
- dataset:
2
- vis:
3
- - name: hypersim_iid
4
- disp_name: hypersim_iid_lighting_vis
5
- dir: hypersim
6
- filenames: data_split/hypersim_iid/hypersim_vis.txt
 
 
 
 
 
 
 
config/dataset_iid/osu_data_appearance_interiorverse_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: interiorverse_iid
2
- disp_name: interiorverse_iid_appearance_test
3
- dir: synthetic
4
- filenames: data_split/osu/osu_test_scenes_85.txt
 
 
 
 
 
config/dataset_normals/data_diode_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: diode_normals
2
- disp_name: diode_normals_test
3
- dir: diode/val
4
- filenames: data_split/diode_normals/diode_test.txt
 
 
 
 
 
config/dataset_normals/data_ibims_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: ibims_normals
2
- disp_name: ibims_normals_test
3
- dir: ibims/ibims
4
- filenames: data_split/ibims_normals/ibims_test.txt
 
 
 
 
 
config/dataset_normals/data_nyu_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: nyu_normals
2
- disp_name: nyu_normals_test
3
- dir: nyuv2/test
4
- filenames: data_split/nyu_normals/nyuv2_test.txt
 
 
 
 
 
config/dataset_normals/data_oasis_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: oasis_normals
2
- disp_name: oasis_normals_test
3
- dir: oasis/val
4
- filenames: data_split/oasis_normals/oasis_test.txt
 
 
 
 
 
config/dataset_normals/data_scannet_test.yaml DELETED
@@ -1,4 +0,0 @@
1
- name: scannet_normals
2
- disp_name: scannet_normals_test
3
- dir: scannet
4
- filenames: data_split/scannet_normals/scannet_test.txt
 
 
 
 
 
config/dataset_normals/dataset_train.yaml DELETED
@@ -1,25 +0,0 @@
1
- dataset:
2
- train:
3
- name: mixed
4
- prob_ls: [0.5, 0.49, 0.01]
5
- dataset_list:
6
- - name: hypersim_normals
7
- disp_name: hypersim_normals_train
8
- dir: hypersim
9
- filenames: data_split/hypersim_normals/hypersim_filtered_all.txt
10
- resize_to_hw:
11
- - 480
12
- - 640
13
- - name: interiorverse_normals
14
- disp_name: interiorverse_normals_train
15
- dir: interiorverse/scenes_85
16
- filenames: data_split/interiorverse_normals/interiorverse_filtered_all.txt
17
- resize_to_hw: null
18
- - name: sintel_normals
19
- disp_name: sintel_normals_train
20
- dir: sintel
21
- filenames: data_split/sintel_normals/sintel_filtered.txt
22
- resize_to_hw:
23
- - 480
24
- - 640
25
- center_crop: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/dataset_normals/dataset_val.yaml DELETED
@@ -1,7 +0,0 @@
1
- dataset:
2
- val:
3
- - name: hypersim_normals
4
- disp_name: hypersim_normals_val_small_100
5
- dir: hypersim
6
- filenames: data_split/hypersim_normals/hypersim_filtered_val_100.txt
7
- resize_to_hw: null
 
 
 
 
 
 
 
 
config/dataset_normals/dataset_vis.yaml DELETED
@@ -1,7 +0,0 @@
1
- dataset:
2
- vis:
3
- - name: hypersim_normals
4
- disp_name: hypersim_normals_vis
5
- dir: hypersim
6
- filenames: data_split/hypersim_normals/hypersim_filtered_vis_20.txt
7
- resize_to_hw: null
 
 
 
 
 
 
 
 
config/logging.yaml DELETED
@@ -1,5 +0,0 @@
1
- logging:
2
- filename: logging.log
3
- format: ' %(asctime)s - %(levelname)s -%(filename)s - %(funcName)s >> %(message)s'
4
- console_level: 20
5
- file_level: 10
 
 
 
 
 
 
config/model_sdv2.yaml DELETED
@@ -1,4 +0,0 @@
1
- model:
2
- name: marigold_pipeline
3
- pretrained_path: stable-diffusion-2
4
- latent_scale_factor: 0.18215
 
 
 
 
 
config/train_debug_depth.yaml DELETED
@@ -1,10 +0,0 @@
1
- base_config:
2
- - config/train_marigold_depth.yaml
3
-
4
- trainer:
5
- save_period: 5
6
- backup_period: 10
7
- validation_period: 5
8
- visualization_period: 5
9
-
10
- max_iter: 50
 
 
 
 
 
 
 
 
 
 
 
config/train_debug_iid.yaml DELETED
@@ -1,11 +0,0 @@
1
- base_config:
2
- # - config/train_marigold_iid_lighting.yaml
3
- - config/train_marigold_iid_appearance.yaml
4
-
5
- trainer:
6
- save_period: 10
7
- backup_period: 10
8
- validation_period: 5
9
- visualization_period: 5
10
-
11
- max_iter: 50
 
 
 
 
 
 
 
 
 
 
 
 
config/train_debug_normals.yaml DELETED
@@ -1,10 +0,0 @@
1
- base_config:
2
- - config/train_marigold_normals.yaml
3
-
4
- trainer:
5
- save_period: 5
6
- backup_period: 10
7
- validation_period: 5
8
- visualization_period: 5
9
-
10
- max_iter: 50
 
 
 
 
 
 
 
 
 
 
 
config/train_marigold_depth.yaml DELETED
@@ -1,94 +0,0 @@
1
- base_config:
2
- - config/logging.yaml
3
- - config/wandb.yaml
4
- - config/dataset_depth/dataset_train.yaml
5
- - config/dataset_depth/dataset_val.yaml
6
- - config/dataset_depth/dataset_vis.yaml
7
- - config/model_sdv2.yaml
8
-
9
- pipeline:
10
- name: MarigoldDepthPipeline
11
- kwargs:
12
- scale_invariant: true
13
- shift_invariant: true
14
- default_denoising_steps: 4
15
- default_processing_resolution: 768
16
-
17
- depth_normalization:
18
- type: scale_shift_depth
19
- clip: true
20
- norm_min: -1.0
21
- norm_max: 1.0
22
- min_max_quantile: 0.02
23
-
24
- augmentation:
25
- lr_flip_p: 0.5
26
-
27
- dataloader:
28
- num_workers: 2
29
- effective_batch_size: 32
30
- max_train_batch_size: 2
31
- seed: 2024 # to ensure continuity when resuming from checkpoint
32
-
33
- trainer:
34
- name: MarigoldDepthTrainer
35
- training_noise_scheduler:
36
- pretrained_path: stable-diffusion-2
37
- init_seed: 2024 # use null to train w/o seeding
38
- save_period: 50
39
- backup_period: 2000
40
- validation_period: 500
41
- visualization_period: 1000
42
-
43
- multi_res_noise:
44
- strength: 0.9
45
- annealed: true
46
- downscale_strategy: original
47
-
48
- gt_depth_type: depth_raw_norm
49
- gt_mask_type: valid_mask_raw
50
-
51
- max_epoch: 10000 # a large enough number
52
- max_iter: 30000 # usually converges at around 20k
53
-
54
- optimizer:
55
- name: Adam
56
-
57
- loss:
58
- name: mse_loss
59
- kwargs:
60
- reduction: mean
61
-
62
- lr: 3.0e-05
63
- lr_scheduler:
64
- name: IterExponential
65
- kwargs:
66
- total_iter: 25000
67
- final_ratio: 0.01
68
- warmup_steps: 100
69
-
70
- # Light setting for the in-training validation and visualization
71
- validation:
72
- denoising_steps: 1
73
- ensemble_size: 1
74
- processing_res: 0
75
- match_input_res: false
76
- resample_method: bilinear
77
- main_val_metric: abs_relative_difference
78
- main_val_metric_goal: minimize
79
- init_seed: 2024
80
-
81
- eval:
82
- alignment: least_square
83
- align_max_res: null
84
- eval_metrics:
85
- - abs_relative_difference
86
- - squared_relative_difference
87
- - rmse_linear
88
- - rmse_log
89
- - log10
90
- - delta1_acc
91
- - delta2_acc
92
- - delta3_acc
93
- - i_rmse
94
- - silog_rmse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/train_marigold_iid_appearance.yaml DELETED
@@ -1,81 +0,0 @@
1
- base_config:
2
- - config/logging.yaml
3
- - config/wandb.yaml
4
- - config/dataset_iid/dataset_appearance_train.yaml
5
- - config/dataset_iid/dataset_appearance_val.yaml
6
- - config/dataset_iid/dataset_appearance_vis.yaml
7
- - config/model_sdv2.yaml
8
-
9
- pipeline:
10
- name: MarigoldIIDPipeline
11
- kwargs:
12
- default_denoising_steps: 4
13
- default_processing_resolution: 768
14
- target_properties:
15
- target_names:
16
- - albedo
17
- albedo:
18
- prediction_space: srgb
19
-
20
- augmentation:
21
- lr_flip_p: 0.5
22
-
23
- dataloader:
24
- num_workers: 2
25
- effective_batch_size: 32
26
- max_train_batch_size: 8
27
- seed: 2024 # to ensure continuity when resuming from checkpoint
28
-
29
- trainer:
30
- name: MarigoldIIDTrainer
31
- training_noise_scheduler:
32
- pretrained_path: stable-diffusion-2
33
- init_seed: 2024 # use null to train w/o seeding
34
- save_period: 50
35
- backup_period: 2000
36
- validation_period: 500
37
- visualization_period: 1000
38
-
39
- multi_res_noise:
40
- strength: 0.9
41
- annealed: true
42
- downscale_strategy: original
43
-
44
- gt_mask_type: mask
45
-
46
- max_epoch: 10000 # a large enough number
47
- max_iter: 10000 # usually converges at around 40k
48
-
49
- optimizer:
50
- name: Adam
51
-
52
- loss:
53
- name: mse_loss
54
- kwargs:
55
- reduction: mean
56
-
57
- lr: 2.0e-05
58
- lr_scheduler:
59
- name: IterExponential
60
- kwargs:
61
- total_iter: 5000
62
- final_ratio: 0.01
63
- warmup_steps: 100
64
-
65
- # Light setting for the in-training validation and visualization
66
- validation:
67
- denoising_steps: 4
68
- ensemble_size: 1
69
- processing_res: 0
70
- match_input_res: true
71
- resample_method: bilinear
72
- main_val_metric: psnr
73
- main_val_metric_goal: maximize
74
- init_seed: 2024
75
- use_mask: false
76
-
77
- eval:
78
- eval_metrics:
79
- - psnr
80
- targets_to_eval_in_linear_space:
81
- - material
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/train_marigold_iid_appearance_finetuned.yaml DELETED
@@ -1,81 +0,0 @@
1
- base_config:
2
- - config/logging.yaml
3
- - config/wandb.yaml
4
- - config/dataset_iid/dataset_appearance_train.yaml
5
- - config/dataset_iid/dataset_appearance_val.yaml
6
- - config/dataset_iid/dataset_appearance_vis.yaml
7
- - config/model_sdv2.yaml
8
-
9
- pipeline:
10
- name: MarigoldIIDPipeline
11
- kwargs:
12
- default_denoising_steps: 4
13
- default_processing_resolution: 768
14
- target_properties:
15
- target_names:
16
- - albedo
17
- albedo:
18
- prediction_space: srgb
19
-
20
- augmentation:
21
- lr_flip_p: 0.5
22
-
23
- dataloader:
24
- num_workers: 2
25
- effective_batch_size: 32
26
- max_train_batch_size: 8
27
- seed: 2024 # to ensure continuity when resuming from checkpoint
28
-
29
- trainer:
30
- name: MarigoldIIDTrainer
31
- training_noise_scheduler:
32
- pretrained_path: stable-diffusion-2
33
- init_seed: 2024 # use null to train w/o seeding
34
- save_period: 50
35
- backup_period: 2000
36
- validation_period: 177
37
- visualization_period: 177
38
-
39
- multi_res_noise:
40
- strength: 0.9
41
- annealed: true
42
- downscale_strategy: original
43
-
44
- gt_mask_type: null
45
-
46
- max_epoch: 10000 # a large enough number
47
- max_iter: 5000 # usually converges at around 40k
48
-
49
- optimizer:
50
- name: Adam
51
-
52
- loss:
53
- name: mse_loss
54
- kwargs:
55
- reduction: mean
56
-
57
- lr: 5.0e-07
58
- lr_scheduler:
59
- name: IterExponential
60
- kwargs:
61
- total_iter: 2500
62
- final_ratio: 0.01
63
- warmup_steps: 100
64
-
65
- # Light setting for the in-training validation and visualization
66
- validation:
67
- denoising_steps: 4
68
- ensemble_size: 1
69
- processing_res: 1000
70
- match_input_res: true
71
- resample_method: bilinear
72
- main_val_metric: psnr
73
- main_val_metric_goal: maximize
74
- init_seed: 2024
75
- use_mask: false
76
-
77
- eval:
78
- eval_metrics:
79
- - psnr
80
- targets_to_eval_in_linear_space:
81
- - material
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/train_marigold_iid_lighting.yaml DELETED
@@ -1,82 +0,0 @@
1
- base_config:
2
- - config/logging.yaml
3
- - config/wandb.yaml
4
- - config/dataset_iid/dataset_lighting_train.yaml
5
- - config/dataset_iid/dataset_lighting_val.yaml
6
- - config/dataset_iid/dataset_lighting_vis.yaml
7
- - config/model_sdv2.yaml
8
-
9
- pipeline:
10
- name: MarigoldIIDPipeline
11
- kwargs:
12
- default_denoising_steps: 4
13
- default_processing_resolution: 768
14
- target_properties:
15
- target_names:
16
- - albedo
17
- albedo:
18
- prediction_space: linear
19
- up_to_scale: false
20
-
21
- augmentation:
22
- lr_flip_p: 0.5
23
-
24
- dataloader:
25
- num_workers: 2
26
- effective_batch_size: 32
27
- max_train_batch_size: 8
28
- seed: 2024 # to ensure continuity when resuming from checkpoint
29
-
30
- trainer:
31
- name: MarigoldIIDTrainer
32
- training_noise_scheduler:
33
- pretrained_path: stable-diffusion-2
34
- init_seed: 2024 # use null to train w/o seeding
35
- save_period: 50
36
- backup_period: 2000
37
- validation_period: 500
38
- visualization_period: 1000
39
-
40
- multi_res_noise:
41
- strength: 0.9
42
- annealed: true
43
- downscale_strategy: original
44
-
45
- gt_mask_type: mask
46
-
47
- max_epoch: 10000 # a large enough number
48
- max_iter: 50000 # usually converges at around 34k
49
-
50
- optimizer:
51
- name: Adam
52
-
53
- loss:
54
- name: mse_loss
55
- kwargs:
56
- reduction: mean
57
-
58
- lr: 8e-05
59
- lr_scheduler:
60
- name: IterExponential
61
- kwargs:
62
- total_iter: 45000
63
- final_ratio: 0.01
64
- warmup_steps: 100
65
-
66
- # Light setting for the in-training validation and visualization
67
- validation:
68
- denoising_steps: 4
69
- ensemble_size: 1
70
- processing_res: 0
71
- match_input_res: true
72
- resample_method: bilinear
73
- main_val_metric: psnr
74
- main_val_metric_goal: maximize
75
- init_seed: 2024
76
- use_mask: false
77
-
78
- eval:
79
- eval_metrics:
80
- - psnr
81
- targets_to_eval_in_linear_space:
82
- - None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/train_marigold_normals.yaml DELETED
@@ -1,86 +0,0 @@
1
- base_config:
2
- - config/logging.yaml
3
- - config/wandb.yaml
4
- - config/dataset_normals/dataset_train.yaml
5
- - config/dataset_normals/dataset_val.yaml
6
- - config/dataset_normals/dataset_vis.yaml
7
- - config/model_sdv2.yaml
8
-
9
- pipeline:
10
- name: MarigoldNormalsPipeline
11
- kwargs:
12
- default_denoising_steps: 4
13
- default_processing_resolution: 768
14
-
15
- augmentation:
16
- lr_flip_p: 0.5
17
- color_jitter_p: 0.3
18
- gaussian_blur_p: 0.3
19
- motion_blur_p: 0.3
20
- gaussian_blur_sigma: 4
21
- motion_blur_kernel_size: 11
22
- motion_blur_angle_range: 360
23
- jitter_brightness_factor: 0.5
24
- jitter_contrast_factor: 0.5
25
- jitter_saturation_factor: 0.5
26
- jitter_hue_factor: 0.2
27
-
28
- dataloader:
29
- num_workers: 2
30
- effective_batch_size: 32
31
- max_train_batch_size: 2
32
- seed: 2024 # to ensure continuity when resuming from checkpoint
33
-
34
- trainer:
35
- name: MarigoldNormalsTrainer
36
- training_noise_scheduler:
37
- pretrained_path: stable-diffusion-2
38
- init_seed: 2024 # use null to train w/o seeding
39
- save_period: 50
40
- backup_period: 2000
41
- validation_period: 500
42
- visualization_period: 1000
43
-
44
- multi_res_noise:
45
- strength: 0.9
46
- annealed: true
47
- downscale_strategy: original
48
-
49
- gt_normals_type: normals
50
- gt_mask_type: null
51
-
52
- max_epoch: 10000 # a large enough number
53
- max_iter: 30000 # usually converges at around 26k
54
-
55
- optimizer:
56
- name: Adam
57
-
58
- loss:
59
- name: mse_loss
60
- kwargs:
61
- reduction: mean
62
-
63
- lr: 6.0e-05
64
- lr_scheduler:
65
- name: IterExponential
66
- kwargs:
67
- total_iter: 25000
68
- final_ratio: 0.01
69
- warmup_steps: 100
70
-
71
- # Light setting for the in-training validation and visualization
72
- validation:
73
- denoising_steps: 4
74
- ensemble_size: 1
75
- processing_res: 768
76
- match_input_res: true
77
- resample_method: bilinear
78
- main_val_metric: mean_angular_error
79
- main_val_metric_goal: minimize
80
- init_seed: 0
81
-
82
- eval:
83
- align_max_res: null
84
- eval_metrics:
85
- - mean_angular_error
86
- - sub11_25_error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/wandb.yaml DELETED
@@ -1,3 +0,0 @@
1
- wandb:
2
- # entity: your_entity
3
- project: marigold
 
 
 
 
marigold/__init__.py DELETED
@@ -1,41 +0,0 @@
1
- # Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # --------------------------------------------------------------------------
15
- # More information about Marigold:
16
- # https://marigoldmonodepth.github.io
17
- # https://marigoldcomputervision.github.io
18
- # Efficient inference pipelines are now part of diffusers:
19
- # https://huggingface.co/docs/diffusers/using-diffusers/marigold_usage
20
- # https://huggingface.co/docs/diffusers/api/pipelines/marigold
21
- # Examples of trained models and live demos:
22
- # https://huggingface.co/prs-eth
23
- # Related projects:
24
- # https://rollingdepth.github.io/
25
- # https://marigolddepthcompletion.github.io/
26
- # Citation (BibTeX):
27
- # https://github.com/prs-eth/Marigold#-citation
28
- # If you find Marigold useful, we kindly ask you to cite our papers.
29
- # --------------------------------------------------------------------------
30
-
31
- from .marigold_depth_pipeline import (
32
- MarigoldDepthPipeline,
33
- MarigoldDepthOutput, # noqa: F401
34
- )
35
- from .marigold_iid_pipeline import MarigoldIIDPipeline, MarigoldIIDOutput # noqa: F401
36
- from .marigold_normals_pipeline import (
37
- MarigoldNormalsPipeline, # noqa: F401
38
- MarigoldNormalsOutput, # noqa: F401
39
- )
40
-
41
- MarigoldPipeline = MarigoldDepthPipeline # for backward compatibility
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marigold/marigold_depth_pipeline.py DELETED
@@ -1,516 +0,0 @@
1
- # Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # --------------------------------------------------------------------------
15
- # More information about Marigold:
16
- # https://marigoldmonodepth.github.io
17
- # https://marigoldcomputervision.github.io
18
- # Efficient inference pipelines are now part of diffusers:
19
- # https://huggingface.co/docs/diffusers/using-diffusers/marigold_usage
20
- # https://huggingface.co/docs/diffusers/api/pipelines/marigold
21
- # Examples of trained models and live demos:
22
- # https://huggingface.co/prs-eth
23
- # Related projects:
24
- # https://rollingdepth.github.io/
25
- # https://marigolddepthcompletion.github.io/
26
- # Citation (BibTeX):
27
- # https://github.com/prs-eth/Marigold#-citation
28
- # If you find Marigold useful, we kindly ask you to cite our papers.
29
- # --------------------------------------------------------------------------
30
-
31
- import logging
32
- import numpy as np
33
- import torch
34
- from PIL import Image
35
- from diffusers import (
36
- AutoencoderKL,
37
- DDIMScheduler,
38
- DiffusionPipeline,
39
- LCMScheduler,
40
- UNet2DConditionModel,
41
- )
42
- from diffusers.utils import BaseOutput
43
- from torch.utils.data import DataLoader, TensorDataset
44
- from torchvision.transforms import InterpolationMode
45
- from torchvision.transforms.functional import pil_to_tensor, resize
46
- from tqdm.auto import tqdm
47
- from transformers import CLIPTextModel, CLIPTokenizer
48
- from typing import Dict, Optional, Union
49
-
50
- from .util.batchsize import find_batch_size
51
- from .util.ensemble import ensemble_depth
52
- from .util.image_util import (
53
- chw2hwc,
54
- colorize_depth_maps,
55
- get_tv_resample_method,
56
- resize_max_res,
57
- )
58
-
59
-
60
- class MarigoldDepthOutput(BaseOutput):
61
- """
62
- Output class for Marigold Monocular Depth Estimation pipeline.
63
-
64
- Args:
65
- depth_np (`np.ndarray`):
66
- Predicted depth map, with depth values in the range of [0, 1].
67
- depth_colored (`PIL.Image.Image`):
68
- Colorized depth map, with the shape of [H, W, 3] and values in [0, 255].
69
- uncertainty (`None` or `np.ndarray`):
70
- Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling.
71
- """
72
-
73
- depth_np: np.ndarray
74
- depth_colored: Union[None, Image.Image]
75
- uncertainty: Union[None, np.ndarray]
76
-
77
-
78
- class MarigoldDepthPipeline(DiffusionPipeline):
79
- """
80
- Pipeline for Marigold Monocular Depth Estimation: https://marigoldcomputervision.github.io.
81
-
82
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
83
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
84
-
85
- Args:
86
- unet (`UNet2DConditionModel`):
87
- Conditional U-Net to denoise the prediction latent, conditioned on image latent.
88
- vae (`AutoencoderKL`):
89
- Variational Auto-Encoder (VAE) Model to encode and decode images and predictions
90
- to and from latent representations.
91
- scheduler (`DDIMScheduler`):
92
- A scheduler to be used in combination with `unet` to denoise the encoded image latents.
93
- text_encoder (`CLIPTextModel`):
94
- Text-encoder, for empty text embedding.
95
- tokenizer (`CLIPTokenizer`):
96
- CLIP tokenizer.
97
- scale_invariant (`bool`, *optional*):
98
- A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
99
- the model config. When used together with the `shift_invariant=True` flag, the model is also called
100
- "affine-invariant". NB: overriding this value is not supported.
101
- shift_invariant (`bool`, *optional*):
102
- A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
103
- the model config. When used together with the `scale_invariant=True` flag, the model is also called
104
- "affine-invariant". NB: overriding this value is not supported.
105
- default_denoising_steps (`int`, *optional*):
106
- The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
107
- quality with the given model. This value must be set in the model config. When the pipeline is called
108
- without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
109
- reasonable results with various model flavors compatible with the pipeline, such as those relying on very
110
- short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
111
- default_processing_resolution (`int`, *optional*):
112
- The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
113
- the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
114
- default value is used. This is required to ensure reasonable results with various model flavors trained
115
- with varying optimal processing resolution values.
116
- """
117
-
118
- latent_scale_factor = 0.18215
119
-
120
- def __init__(
121
- self,
122
- unet: UNet2DConditionModel,
123
- vae: AutoencoderKL,
124
- scheduler: Union[DDIMScheduler, LCMScheduler],
125
- text_encoder: CLIPTextModel,
126
- tokenizer: CLIPTokenizer,
127
- scale_invariant: Optional[bool] = True,
128
- shift_invariant: Optional[bool] = True,
129
- default_denoising_steps: Optional[int] = None,
130
- default_processing_resolution: Optional[int] = None,
131
- ):
132
- super().__init__()
133
- self.register_modules(
134
- unet=unet,
135
- vae=vae,
136
- scheduler=scheduler,
137
- text_encoder=text_encoder,
138
- tokenizer=tokenizer,
139
- )
140
- self.register_to_config(
141
- scale_invariant=scale_invariant,
142
- shift_invariant=shift_invariant,
143
- default_denoising_steps=default_denoising_steps,
144
- default_processing_resolution=default_processing_resolution,
145
- )
146
-
147
- self.scale_invariant = scale_invariant
148
- self.shift_invariant = shift_invariant
149
- self.default_denoising_steps = default_denoising_steps
150
- self.default_processing_resolution = default_processing_resolution
151
-
152
- self.empty_text_embed = None
153
-
154
- @torch.no_grad()
155
- def __call__(
156
- self,
157
- input_image: Union[Image.Image, torch.Tensor],
158
- denoising_steps: Optional[int] = None,
159
- ensemble_size: int = 1,
160
- processing_res: Optional[int] = None,
161
- match_input_res: bool = True,
162
- resample_method: str = "bilinear",
163
- batch_size: int = 0,
164
- generator: Union[torch.Generator, None] = None,
165
- color_map: str = "Spectral",
166
- show_progress_bar: bool = True,
167
- ensemble_kwargs: Dict = None,
168
- ) -> MarigoldDepthOutput:
169
- """
170
- Function invoked when calling the pipeline.
171
-
172
- Args:
173
- input_image (`Image`):
174
- Input RGB (or gray-scale) image.
175
- denoising_steps (`int`, *optional*, defaults to `None`):
176
- Number of denoising diffusion steps during inference. The default value `None` results in automatic
177
- selection.
178
- ensemble_size (`int`, *optional*, defaults to `1`):
179
- Number of predictions to be ensembled.
180
- processing_res (`int`, *optional*, defaults to `None`):
181
- Effective processing resolution. When set to `0`, processes at the original image resolution. This
182
- produces crisper predictions, but may also lead to the overall loss of global context. The default
183
- value `None` resolves to the optimal value from the model config.
184
- match_input_res (`bool`, *optional*, defaults to `True`):
185
- Resize the prediction to match the input resolution.
186
- Only valid if `processing_res` > 0.
187
- resample_method: (`str`, *optional*, defaults to `bilinear`):
188
- Resampling method used to resize images and predictions. This can be one of `bilinear`, `bicubic` or
189
- `nearest`, defaults to: `bilinear`.
190
- batch_size (`int`, *optional*, defaults to `0`):
191
- Inference batch size, no bigger than `num_ensemble`.
192
- If set to 0, the script will automatically decide the proper batch size.
193
- generator (`torch.Generator`, *optional*, defaults to `None`)
194
- Random generator for initial noise generation.
195
- show_progress_bar (`bool`, *optional*, defaults to `True`):
196
- Display a progress bar of diffusion denoising.
197
- color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
198
- Colormap used to colorize the depth map.
199
- scale_invariant (`str`, *optional*, defaults to `True`):
200
- Flag of scale-invariant prediction, if True, scale will be adjusted from the raw prediction.
201
- shift_invariant (`str`, *optional*, defaults to `True`):
202
- Flag of shift-invariant prediction, if True, shift will be adjusted from the raw prediction, if False,
203
- near plane will be fixed at 0m.
204
- ensemble_kwargs (`dict`, *optional*, defaults to `None`):
205
- Arguments for detailed ensembling settings.
206
- Returns:
207
- `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
208
- - **depth_np** (`np.ndarray`) Predicted depth map with depth values in the range of [0, 1]
209
- - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [H, W, 3] and values in [0, 255], None if `color_map` is `None`
210
- - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
211
- coming from ensembling. None if `ensemble_size = 1`
212
- """
213
- # Model-specific optimal default values leading to fast and reasonable results.
214
- if denoising_steps is None:
215
- denoising_steps = self.default_denoising_steps
216
- if processing_res is None:
217
- processing_res = self.default_processing_resolution
218
-
219
- assert processing_res >= 0
220
- assert ensemble_size >= 1
221
-
222
- # Check if denoising step is reasonable
223
- self._check_inference_step(denoising_steps)
224
-
225
- resample_method: InterpolationMode = get_tv_resample_method(resample_method)
226
-
227
- # ----------------- Image Preprocess -----------------
228
- # Convert to torch tensor
229
- if isinstance(input_image, Image.Image):
230
- input_image = input_image.convert("RGB")
231
- # convert to torch tensor [H, W, rgb] -> [rgb, H, W]
232
- rgb = pil_to_tensor(input_image)
233
- rgb = rgb.unsqueeze(0) # [1, rgb, H, W]
234
- elif isinstance(input_image, torch.Tensor):
235
- rgb = input_image
236
- else:
237
- raise TypeError(f"Unknown input type: {type(input_image) = }")
238
- input_size = rgb.shape
239
- assert (
240
- 4 == rgb.dim() and 3 == input_size[-3]
241
- ), f"Wrong input shape {input_size}, expected [1, rgb, H, W]"
242
-
243
- # Resize image
244
- if processing_res > 0:
245
- rgb = resize_max_res(
246
- rgb,
247
- max_edge_resolution=processing_res,
248
- resample_method=resample_method,
249
- )
250
-
251
- # Normalize rgb values
252
- rgb_norm: torch.Tensor = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
253
- rgb_norm = rgb_norm.to(self.dtype)
254
- assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
255
-
256
- # ----------------- Predicting depth -----------------
257
- # Batch repeated input image
258
- duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
259
- single_rgb_dataset = TensorDataset(duplicated_rgb)
260
- if batch_size > 0:
261
- _bs = batch_size
262
- else:
263
- _bs = find_batch_size(
264
- ensemble_size=ensemble_size,
265
- input_res=max(rgb_norm.shape[1:]),
266
- dtype=self.dtype,
267
- )
268
-
269
- single_rgb_loader = DataLoader(
270
- single_rgb_dataset, batch_size=_bs, shuffle=False
271
- )
272
-
273
- # Predict depth maps (batched)
274
- target_pred_ls = []
275
- if show_progress_bar:
276
- iterable = tqdm(
277
- single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
278
- )
279
- else:
280
- iterable = single_rgb_loader
281
- for batch in iterable:
282
- (batched_img,) = batch
283
- target_pred_raw = self.single_infer(
284
- rgb_in=batched_img,
285
- num_inference_steps=denoising_steps,
286
- show_pbar=show_progress_bar,
287
- generator=generator,
288
- )
289
- target_pred_ls.append(target_pred_raw.detach())
290
- target_preds = torch.concat(target_pred_ls, dim=0)
291
- torch.cuda.empty_cache() # clear vram cache for ensembling
292
-
293
- # ----------------- Test-time ensembling -----------------
294
- if ensemble_size > 1:
295
- final_pred, pred_uncert = ensemble_depth(
296
- target_preds,
297
- scale_invariant=self.scale_invariant,
298
- shift_invariant=self.shift_invariant,
299
- **(ensemble_kwargs or {}),
300
- )
301
- else:
302
- final_pred = target_preds
303
- pred_uncert = None
304
-
305
- # Resize back to original resolution
306
- if match_input_res:
307
- final_pred = resize(
308
- final_pred,
309
- input_size[-2:],
310
- interpolation=resample_method,
311
- antialias=True,
312
- )
313
-
314
- # Convert to numpy
315
- final_pred = final_pred.squeeze()
316
- final_pred = final_pred.cpu().numpy()
317
- if pred_uncert is not None:
318
- pred_uncert = pred_uncert.squeeze().cpu().numpy()
319
-
320
- # Clip output range
321
- final_pred = final_pred.clip(0, 1)
322
-
323
- # Colorize
324
- if color_map is not None:
325
- depth_colored = colorize_depth_maps(
326
- final_pred, 0, 1, cmap=color_map
327
- ).squeeze() # [3, H, W], value in (0, 1)
328
- depth_colored = (depth_colored * 255).astype(np.uint8)
329
- depth_colored_hwc = chw2hwc(depth_colored)
330
- depth_colored_img = Image.fromarray(depth_colored_hwc)
331
- else:
332
- depth_colored_img = None
333
-
334
- return MarigoldDepthOutput(
335
- depth_np=final_pred,
336
- depth_colored=depth_colored_img,
337
- uncertainty=pred_uncert,
338
- )
339
-
340
- def _check_inference_step(self, n_step: int) -> None:
341
- """
342
- Check if denoising step is reasonable
343
- Args:
344
- n_step (`int`): denoising steps
345
- """
346
- assert n_step >= 1
347
-
348
- if isinstance(self.scheduler, DDIMScheduler):
349
- if "trailing" != self.scheduler.config.timestep_spacing:
350
- logging.warning(
351
- f"The loaded `DDIMScheduler` is configured with `timestep_spacing="
352
- f'"{self.scheduler.config.timestep_spacing}"`; the recommended setting is `"trailing"`. '
353
- f"This change is backward-compatible and yields better results. "
354
- f"Consider using `prs-eth/marigold-depth-v1-1` for the best experience."
355
- )
356
- else:
357
- if n_step > 10:
358
- logging.warning(
359
- f"Setting too many denoising steps ({n_step}) may degrade the prediction; consider relying on "
360
- f"the default values."
361
- )
362
- if not self.scheduler.config.rescale_betas_zero_snr:
363
- logging.warning(
364
- f"The loaded `DDIMScheduler` is configured with `rescale_betas_zero_snr="
365
- f"{self.scheduler.config.rescale_betas_zero_snr}`; the recommended setting is True. "
366
- f"Consider using `prs-eth/marigold-depth-v1-1` for the best experience."
367
- )
368
- elif isinstance(self.scheduler, LCMScheduler):
369
- logging.warning(
370
- "DeprecationWarning: LCMScheduler will not be supported in the future. "
371
- "Consider using `prs-eth/marigold-depth-v1-1` for the best experience."
372
- )
373
- if n_step > 10:
374
- logging.warning(
375
- f"Setting too many denoising steps ({n_step}) may degrade the prediction; consider relying on "
376
- f"the default values."
377
- )
378
- else:
379
- raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
380
-
381
- def encode_empty_text(self):
382
- """
383
- Encode text embedding for empty prompt
384
- """
385
- prompt = ""
386
- text_inputs = self.tokenizer(
387
- prompt,
388
- padding="do_not_pad",
389
- max_length=self.tokenizer.model_max_length,
390
- truncation=True,
391
- return_tensors="pt",
392
- )
393
- text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
394
- self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
395
-
396
- @torch.no_grad()
397
- def single_infer(
398
- self,
399
- rgb_in: torch.Tensor,
400
- num_inference_steps: int,
401
- generator: Union[torch.Generator, None],
402
- show_pbar: bool,
403
- ) -> torch.Tensor:
404
- """
405
- Perform a single prediction without ensembling.
406
-
407
- Args:
408
- rgb_in (`torch.Tensor`):
409
- Input RGB image.
410
- num_inference_steps (`int`):
411
- Number of diffusion denoisign steps (DDIM) during inference.
412
- show_pbar (`bool`):
413
- Display a progress bar of diffusion denoising.
414
- generator (`torch.Generator`)
415
- Random generator for initial noise generation.
416
- Returns:
417
- `torch.Tensor`: Predicted targets.
418
- """
419
- device = self.device
420
- rgb_in = rgb_in.to(device)
421
-
422
- # Set timesteps
423
- self.scheduler.set_timesteps(num_inference_steps, device=device)
424
- timesteps = self.scheduler.timesteps # [T]
425
-
426
- # Encode image
427
- rgb_latent = self.encode_rgb(rgb_in) # [B, 4, h, w]
428
-
429
- # Noisy latent for outputs
430
- target_latent = torch.randn(
431
- rgb_latent.shape,
432
- device=device,
433
- dtype=self.dtype,
434
- generator=generator,
435
- ) # [B, 4, h, w]
436
-
437
- # Batched empty text embedding
438
- if self.empty_text_embed is None:
439
- self.encode_empty_text()
440
- batch_empty_text_embed = self.empty_text_embed.repeat(
441
- (rgb_latent.shape[0], 1, 1)
442
- ).to(device) # [B, 2, 1024]
443
-
444
- # Denoising loop
445
- if show_pbar:
446
- iterable = tqdm(
447
- enumerate(timesteps),
448
- total=len(timesteps),
449
- leave=False,
450
- desc=" " * 4 + "Diffusion denoising",
451
- )
452
- else:
453
- iterable = enumerate(timesteps)
454
-
455
- for i, t in iterable:
456
- unet_input = torch.cat(
457
- [rgb_latent, target_latent], dim=1
458
- ) # this order is important
459
-
460
- # predict the noise residual
461
- noise_pred = self.unet(
462
- unet_input, t, encoder_hidden_states=batch_empty_text_embed
463
- ).sample # [B, 4, h, w]
464
-
465
- # compute the previous noisy sample x_t -> x_t-1
466
- target_latent = self.scheduler.step(
467
- noise_pred, t, target_latent, generator=generator
468
- ).prev_sample
469
-
470
- depth = self.decode_depth(target_latent) # [B,3,H,W]
471
-
472
- # clip prediction
473
- depth = torch.clip(depth, -1.0, 1.0)
474
- # shift to [0, 1]
475
- depth = (depth + 1.0) / 2.0
476
-
477
- return depth
478
-
479
- def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
480
- """
481
- Encode RGB image into latent.
482
-
483
- Args:
484
- rgb_in (`torch.Tensor`):
485
- Input RGB image to be encoded.
486
-
487
- Returns:
488
- `torch.Tensor`: Image latent.
489
- """
490
- # encode
491
- h = self.vae.encoder(rgb_in)
492
- moments = self.vae.quant_conv(h)
493
- mean, logvar = torch.chunk(moments, 2, dim=1)
494
- # scale latent
495
- rgb_latent = mean * self.latent_scale_factor
496
- return rgb_latent
497
-
498
- def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
499
- """
500
- Decode depth latent into depth map.
501
-
502
- Args:
503
- depth_latent (`torch.Tensor`):
504
- Depth latent to be decoded.
505
-
506
- Returns:
507
- `torch.Tensor`: Decoded depth map.
508
- """
509
- # scale latent
510
- depth_latent = depth_latent / self.latent_scale_factor
511
- # decode
512
- z = self.vae.post_quant_conv(depth_latent)
513
- stacked = self.vae.decoder(z)
514
- # mean of output channels
515
- depth_mean = stacked.mean(dim=1, keepdim=True)
516
- return depth_mean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marigold/marigold_normals_pipeline.py DELETED
@@ -1,479 +0,0 @@
1
- # Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # --------------------------------------------------------------------------
15
- # More information about Marigold:
16
- # https://marigoldmonodepth.github.io
17
- # https://marigoldcomputervision.github.io
18
- # Efficient inference pipelines are now part of diffusers:
19
- # https://huggingface.co/docs/diffusers/using-diffusers/marigold_usage
20
- # https://huggingface.co/docs/diffusers/api/pipelines/marigold
21
- # Examples of trained models and live demos:
22
- # https://huggingface.co/prs-eth
23
- # Related projects:
24
- # https://rollingdepth.github.io/
25
- # https://marigolddepthcompletion.github.io/
26
- # Citation (BibTeX):
27
- # https://github.com/prs-eth/Marigold#-citation
28
- # If you find Marigold useful, we kindly ask you to cite our papers.
29
- # --------------------------------------------------------------------------
30
-
31
- import logging
32
- import numpy as np
33
- import torch
34
- from PIL import Image
35
- from diffusers import (
36
- AutoencoderKL,
37
- DDIMScheduler,
38
- DiffusionPipeline,
39
- LCMScheduler,
40
- UNet2DConditionModel,
41
- )
42
- from diffusers.utils import BaseOutput
43
- from torch.utils.data import DataLoader, TensorDataset
44
- from torchvision.transforms import InterpolationMode
45
- from torchvision.transforms.functional import pil_to_tensor, resize
46
- from tqdm.auto import tqdm
47
- from transformers import CLIPTextModel, CLIPTokenizer
48
- from typing import Dict, Optional, Union
49
-
50
- from .util.batchsize import find_batch_size
51
- from .util.ensemble import ensemble_normals
52
- from .util.image_util import (
53
- chw2hwc,
54
- get_tv_resample_method,
55
- resize_max_res,
56
- )
57
-
58
-
59
class MarigoldNormalsOutput(BaseOutput):
    """
    Output class for the Marigold Surface Normals Estimation pipeline.

    Args:
        normals_np (`np.ndarray`):
            Predicted normals map of shape [3, H, W] with values in the range of [-1, 1] (unit length vectors).
        normals_img (`PIL.Image.Image`):
            Normals image, with the shape of [H, W, 3] and values in [0, 255].
        uncertainty (`None` or `np.ndarray`):
            Uncalibrated uncertainty (MAD, median absolute deviation) coming from ensembling;
            `None` when `ensemble_size == 1`.
    """

    # Field annotations only; BaseOutput stores the values passed at construction.
    normals_np: np.ndarray
    normals_img: Image.Image
    uncertainty: Union[None, np.ndarray]
75
-
76
-
77
class MarigoldNormalsPipeline(DiffusionPipeline):
    """
    Pipeline for Marigold Surface Normals Estimation: https://marigoldcomputervision.github.io.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        unet (`UNet2DConditionModel`):
            Conditional U-Net to denoise the prediction latent, conditioned on image latent.
        vae (`AutoencoderKL`):
            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions
            to and from latent representations.
        scheduler (`DDIMScheduler`):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        text_encoder (`CLIPTextModel`):
            Text-encoder, for empty text embedding.
        tokenizer (`CLIPTokenizer`):
            CLIP tokenizer.
        default_denoising_steps (`int`, *optional*):
            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
            quality with the given model. This value must be set in the model config. When the pipeline is called
            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
        default_processing_resolution (`int`, *optional*):
            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
            default value is used. This is required to ensure reasonable results with various model flavors trained
            with varying optimal processing resolution values.
    """

    # Latent scaling constant of the Stable Diffusion VAE; applied in encode_rgb and
    # undone in decode_normals.
    latent_scale_factor = 0.18215

    def __init__(
        self,
        unet: UNet2DConditionModel,
        vae: AutoencoderKL,
        scheduler: Union[DDIMScheduler, LCMScheduler],
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        default_denoising_steps: Optional[int] = None,
        default_processing_resolution: Optional[int] = None,
    ):
        super().__init__()
        self.register_modules(
            unet=unet,
            vae=vae,
            scheduler=scheduler,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
        )
        # Persist the defaults in the model config so they survive save/load round-trips.
        self.register_to_config(
            default_denoising_steps=default_denoising_steps,
            default_processing_resolution=default_processing_resolution,
        )

        self.default_denoising_steps = default_denoising_steps
        self.default_processing_resolution = default_processing_resolution

        # Lazily computed in encode_empty_text() on first use.
        self.empty_text_embed = None

    @torch.no_grad()
    def __call__(
        self,
        input_image: Union[Image.Image, torch.Tensor],
        denoising_steps: Optional[int] = None,
        ensemble_size: int = 1,
        processing_res: Optional[int] = None,
        match_input_res: bool = True,
        resample_method: str = "bilinear",
        batch_size: int = 0,
        generator: Union[torch.Generator, None] = None,
        show_progress_bar: bool = True,
        ensemble_kwargs: Optional[Dict] = None,
    ) -> MarigoldNormalsOutput:
        """
        Function invoked when calling the pipeline.

        Args:
            input_image (`Image`):
                Input RGB (or gray-scale) image.
            denoising_steps (`int`, *optional*, defaults to `None`):
                Number of denoising diffusion steps during inference. The default value `None` results in automatic
                selection.
            ensemble_size (`int`, *optional*, defaults to `1`):
                Number of predictions to be ensembled.
            processing_res (`int`, *optional*, defaults to `None`):
                Effective processing resolution. When set to `0`, processes at the original image resolution. This
                produces crisper predictions, but may also lead to the overall loss of global context. The default
                value `None` resolves to the optimal value from the model config.
            match_input_res (`bool`, *optional*, defaults to `True`):
                Resize the prediction to match the input resolution.
                Only valid if `processing_res` > 0.
            resample_method: (`str`, *optional*, defaults to `bilinear`):
                Resampling method used to resize images and predictions. This can be one of `bilinear`, `bicubic` or
                `nearest`, defaults to: `bilinear`.
            batch_size (`int`, *optional*, defaults to `0`):
                Inference batch size, no bigger than `ensemble_size`.
                If set to 0, the script will automatically decide the proper batch size.
            generator (`torch.Generator`, *optional*, defaults to `None`)
                Random generator for initial noise generation.
            show_progress_bar (`bool`, *optional*, defaults to `True`):
                Display a progress bar of diffusion denoising.
            ensemble_kwargs (`dict`, *optional*, defaults to `None`):
                Arguments for detailed ensembling settings.
        Returns:
            `MarigoldNormalsOutput`: Output class for Marigold monocular surface normals estimation pipeline, including:
            - **normals_np** (`np.ndarray`) Predicted normals map of shape [3, H, W] with values in the range of [-1, 1]
              (unit length vectors)
            - **normals_img** (`PIL.Image.Image`) Normals image, with the shape of [H, W, 3] and values in [0, 255]
            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty (MAD, median absolute deviation)
              coming from ensembling. None if `ensemble_size = 1`
        """
        # Model-specific optimal default values leading to fast and reasonable results.
        if denoising_steps is None:
            denoising_steps = self.default_denoising_steps
        if processing_res is None:
            processing_res = self.default_processing_resolution

        assert processing_res >= 0
        assert ensemble_size >= 1

        # Check if denoising step is reasonable
        self._check_inference_step(denoising_steps)

        # Translate the string into a torchvision InterpolationMode.
        resample_method: InterpolationMode = get_tv_resample_method(resample_method)

        # ----------------- Image Preprocess -----------------
        # Convert to torch tensor
        if isinstance(input_image, Image.Image):
            input_image = input_image.convert("RGB")
            # convert to torch tensor [H, W, rgb] -> [rgb, H, W]
            rgb = pil_to_tensor(input_image)
            rgb = rgb.unsqueeze(0)  # [1, rgb, H, W]
        elif isinstance(input_image, torch.Tensor):
            rgb = input_image
        else:
            raise TypeError(f"Unknown input type: {type(input_image) = }")
        input_size = rgb.shape
        assert (
            4 == rgb.dim() and 3 == input_size[-3]
        ), f"Wrong input shape {input_size}, expected [1, rgb, H, W]"

        # Resize image so that its longer edge does not exceed processing_res.
        if processing_res > 0:
            rgb = resize_max_res(
                rgb,
                max_edge_resolution=processing_res,
                resample_method=resample_method,
            )

        # Normalize rgb values
        rgb_norm: torch.Tensor = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]
        rgb_norm = rgb_norm.to(self.dtype)
        assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0

        # ----------------- Predicting normals -----------------
        # Batch repeated input image (expand creates views, not copies).
        duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
        single_rgb_dataset = TensorDataset(duplicated_rgb)
        if batch_size > 0:
            _bs = batch_size
        else:
            # Heuristic batch size based on resolution and dtype.
            _bs = find_batch_size(
                ensemble_size=ensemble_size,
                input_res=max(rgb_norm.shape[1:]),
                dtype=self.dtype,
            )

        single_rgb_loader = DataLoader(
            single_rgb_dataset, batch_size=_bs, shuffle=False
        )

        # Predict normals maps (batched)
        target_pred_ls = []
        if show_progress_bar:
            iterable = tqdm(
                single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
            )
        else:
            iterable = single_rgb_loader
        for batch in iterable:
            (batched_img,) = batch
            target_pred_raw = self.single_infer(
                rgb_in=batched_img,
                num_inference_steps=denoising_steps,
                show_pbar=show_progress_bar,
                generator=generator,
            )
            target_pred_ls.append(target_pred_raw.detach())
        target_preds = torch.concat(target_pred_ls, dim=0)
        torch.cuda.empty_cache()  # clear vram cache for ensembling

        # ----------------- Test-time ensembling -----------------
        if ensemble_size > 1:
            final_pred, pred_uncert = ensemble_normals(
                target_preds,
                **(ensemble_kwargs or {}),
            )
        else:
            final_pred = target_preds
            pred_uncert = None

        # Resize back to original resolution
        if match_input_res:
            final_pred = resize(
                final_pred,
                input_size[-2:],
                interpolation=resample_method,
                antialias=True,
            )

        # Convert to numpy
        final_pred = final_pred.squeeze()
        final_pred = final_pred.cpu().numpy()
        if pred_uncert is not None:
            pred_uncert = pred_uncert.squeeze().cpu().numpy()

        # Clip output range (resizing can push values slightly outside [-1, 1]).
        final_pred = final_pred.clip(-1, 1)

        # Colorize: map [-1, 1] to [0, 255] for the visualization image.
        normals_img = ((final_pred + 1) * 127.5).astype(np.uint8)
        normals_img = chw2hwc(normals_img)
        normals_img = Image.fromarray(normals_img)

        return MarigoldNormalsOutput(
            normals_np=final_pred,
            normals_img=normals_img,
            uncertainty=pred_uncert,
        )

    def _check_inference_step(self, n_step: int) -> None:
        """
        Check if denoising step is reasonable and warn about suboptimal
        scheduler configurations.

        Args:
            n_step (`int`): denoising steps

        Raises:
            RuntimeError: if the scheduler is an `LCMScheduler` or of an
                unsupported type.
        """
        assert n_step >= 1

        if isinstance(self.scheduler, DDIMScheduler):
            if "trailing" != self.scheduler.config.timestep_spacing:
                logging.warning(
                    f"The loaded `DDIMScheduler` is configured with `timestep_spacing="
                    f'"{self.scheduler.config.timestep_spacing}"`; the recommended setting is `"trailing"`. '
                    f"This change is backward-compatible and yields better results. "
                    f"Consider using `prs-eth/marigold-normals-v1-1` for the best experience."
                )
            else:
                if n_step > 10:
                    logging.warning(
                        f"Setting too many denoising steps ({n_step}) may degrade the prediction; consider relying on "
                        f"the default values."
                    )
                if not self.scheduler.config.rescale_betas_zero_snr:
                    logging.warning(
                        f"The loaded `DDIMScheduler` is configured with `rescale_betas_zero_snr="
                        f"{self.scheduler.config.rescale_betas_zero_snr}`; the recommended setting is True. "
                        f"Consider using `prs-eth/marigold-normals-v1-1` for the best experience."
                    )
        elif isinstance(self.scheduler, LCMScheduler):
            raise RuntimeError(
                "This pipeline implementation does not support the LCMScheduler. Please refer to the project "
                "README.md for instructions about using LCM."
            )
        else:
            raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")

    def encode_empty_text(self):
        """
        Encode text embedding for empty prompt and cache it in
        `self.empty_text_embed`.
        """
        prompt = ""
        text_inputs = self.tokenizer(
            prompt,
            padding="do_not_pad",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)

    @torch.no_grad()
    def single_infer(
        self,
        rgb_in: torch.Tensor,
        num_inference_steps: int,
        generator: Union[torch.Generator, None],
        show_pbar: bool,
    ) -> torch.Tensor:
        """
        Perform a single prediction without ensembling.

        Args:
            rgb_in (`torch.Tensor`):
                Input RGB image, shape [B, 3, H, W], values in [-1, 1].
            num_inference_steps (`int`):
                Number of diffusion denoising steps (DDIM) during inference.
            show_pbar (`bool`):
                Display a progress bar of diffusion denoising.
            generator (`torch.Generator`)
                Random generator for initial noise generation.
        Returns:
            `torch.Tensor`: Predicted targets (unit-length normals, [B, 3, H, W]).
        """
        device = self.device
        rgb_in = rgb_in.to(device)

        # Set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps  # [T]

        # Encode image
        rgb_latent = self.encode_rgb(rgb_in)  # [B, 4, h, w]

        # Noisy latent for outputs
        target_latent = torch.randn(
            rgb_latent.shape,
            device=device,
            dtype=self.dtype,
            generator=generator,
        )  # [B, 4, h, w]

        # Batched empty text embedding
        if self.empty_text_embed is None:
            self.encode_empty_text()
        batch_empty_text_embed = self.empty_text_embed.repeat(
            (rgb_latent.shape[0], 1, 1)
        ).to(device)  # [B, 2, 1024]

        # Denoising loop
        if show_pbar:
            iterable = tqdm(
                enumerate(timesteps),
                total=len(timesteps),
                leave=False,
                desc=" " * 4 + "Diffusion denoising",
            )
        else:
            iterable = enumerate(timesteps)

        for i, t in iterable:
            unet_input = torch.cat(
                [rgb_latent, target_latent], dim=1
            )  # this order is important

            # predict the noise residual
            noise_pred = self.unet(
                unet_input, t, encoder_hidden_states=batch_empty_text_embed
            ).sample  # [B, 4, h, w]

            # compute the previous noisy sample x_t -> x_t-1
            target_latent = self.scheduler.step(
                noise_pred, t, target_latent, generator=generator
            ).prev_sample

        normals = self.decode_normals(target_latent)  # [B,3,H,W]

        # clip prediction, then renormalize to unit-length vectors
        normals = torch.clip(normals, -1.0, 1.0)
        norm = torch.norm(normals, dim=1, keepdim=True)
        normals /= norm.clamp(min=1e-6)  # [B,3,H,W]

        return normals

    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
        """
        Encode RGB image into latent.

        Args:
            rgb_in (`torch.Tensor`):
                Input RGB image to be encoded.

        Returns:
            `torch.Tensor`: Image latent.
        """
        # encode: use the mean of the VAE posterior (logvar is discarded)
        h = self.vae.encoder(rgb_in)
        moments = self.vae.quant_conv(h)
        mean, logvar = torch.chunk(moments, 2, dim=1)
        # scale latent
        rgb_latent = mean * self.latent_scale_factor
        return rgb_latent

    def decode_normals(self, normals_latent: torch.Tensor) -> torch.Tensor:
        """
        Decode normals latent into normals map.

        Args:
            normals_latent (`torch.Tensor`):
                Normals latent to be decoded.

        Returns:
            `torch.Tensor`: Decoded normals map (not yet clipped or normalized).
        """
        # scale latent
        normals_latent = normals_latent / self.latent_scale_factor
        # decode
        z = self.vae.post_quant_conv(normals_latent)
        stacked = self.vae.decoder(z)
        return stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{src → olbedo}/__init__.py RENAMED
@@ -27,3 +27,5 @@
27
  # https://github.com/prs-eth/Marigold#-citation
28
  # If you find Marigold useful, we kindly ask you to cite our papers.
29
  # --------------------------------------------------------------------------
 
 
 
27
  # https://github.com/prs-eth/Marigold#-citation
28
  # If you find Marigold useful, we kindly ask you to cite our papers.
29
  # --------------------------------------------------------------------------
30
+
31
+ from .olbedo_iid_pipeline import OlbedoIIDPipeline, OlbedoIIDOutput # noqa: F401
olbedo/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (231 Bytes). View file