vulus98 committed on
Commit
05d33a4
·
1 Parent(s): 4817e08

Save work before migration

Browse files
.gitattributes CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
36
  *.jpg filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpg~ filter=lfs diff=lfs merge=lfs -text
37
  *.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ .vscode/
README.md CHANGED
@@ -8,7 +8,24 @@ sdk_version: 6.0.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Panorama Geometry Estimation using onestep diffusion models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ python_version: 3.10
12
+ models:
13
+ - prs-eth/PaGeR-depth
14
+ - prs-eth/PaGeR-normals
15
+ tags:
16
+ - computer-vision
17
+ - image-processing
18
+ - diffusion-models
19
+ - panorama
20
+ - geometry-estimation
21
+ - depth-estimation
22
+ - normal-estimation
23
+ - single-step-diffusion
24
+ preload_from_hub:
25
+ - prs-eth/PaGeR-depth
26
+ - prs-eth/PaGeR-normals
27
+ suggested_hardware: a10g-large
28
+ short_description: Panorama Geometry Estimation
29
  ---
30
 
31
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import gc
3
+ import torch
4
+ import numpy as np
5
+ import argparse
6
+ import logging
7
+ import gradio as gr
8
+ from PIL import Image
9
+ from pathlib import Path
10
+ from omegaconf import OmegaConf
11
+ from tempfile import NamedTemporaryFile
12
+ from huggingface_hub import hf_hub_download
13
+ from matplotlib import pyplot as plt
14
+ from src.pager import Pager
15
+ from src.utils.geometry_utils import compute_edge_mask, erp_to_point_cloud_glb, erp_to_cubemap
16
+ from src.utils.utils import prepare_image_for_logging
17
+
18
# Log-depth decode parameters: network outputs in [0, 1] are mapped to
# log-depths MIN_DEPTH + [0, DEPTH_RANGE] before exponentiation
# (see Pager.process_depth_output).
MIN_DEPTH = np.log(1e-2)
DEPTH_RANGE = np.log(75.0)
# Subsample the ERP grid before exporting the .glb so viewer payloads stay small.
POINTCLOUD_DOWNSAMPLE_FACTOR = 2
MAX_POINTCLOUD_POINTS = 200000  # NOTE(review): not referenced elsewhere in this file — confirm intended use
EXAMPLES_DIR = Path(__file__).parent / "examples"
# Bundled example panoramas, filtered to common image suffixes.
EXAMPLE_IMAGES = [
    str(p)
    for p in sorted(EXAMPLES_DIR.glob("*"))
    if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp"}
]
28
+
29
def parse_args():
    """Parse command-line options for the demo: RNG seed, the two UNet
    checkpoint locations, and the optional xformers switch."""
    parser = argparse.ArgumentParser(
        description="Inference script for panorama depth estimation using diffusion models."
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="A seed for reproducibility."
    )
    parser.add_argument(
        "--depth_checkpoint_path",
        default="prs-eth/PaGeR-depth",
        type=str,
        help="UNet checkpoint to load.",
    )
    parser.add_argument(
        "--normals_checkpoint_path",
        default="prs-eth/PaGeR-normals",
        type=str,
        help="UNet checkpoint to load.",
    )
    parser.add_argument(
        "--enable_xformers",
        action="store_true",
        help="Whether or not to use xformers."
    )
    return parser.parse_args()
61
+
62
+ def _release_cuda_memory():
63
+ if torch.cuda.is_available():
64
+ torch.cuda.empty_cache()
65
+ torch.cuda.ipc_collect()
66
+ gc.collect()
67
+
68
def generate_ERP(input_rgb, modality):
    """Run single-step inference on an ERP RGB image.

    Args:
        input_rgb: (H, W, 3) uint8 numpy array (expected 1024x2048 — resized by the caller).
        modality: "depth" or "normal".

    Returns:
        (pred_image, pred): a displayable visualization array and the raw
        panorama prediction (depth map or normal map) as numpy arrays.
    """
    batch = {}
    # uint8 HWC -> float CHW in [-1, 1], the range the model pipeline expects.
    input_rgb = torch.from_numpy(input_rgb).permute(2,0,1).to(torch.float32) / 255.0
    input_rgb = input_rgb * 2.0 - 1.0
    batch['rgb_cubemap'] = erp_to_cubemap(input_rgb).unsqueeze(0).to(device)
    with torch.inference_mode():
        # Fix: the raw torch.cuda calls raise on CPU-only hosts even though
        # `device` falls back to CPU — guard them behind an availability check.
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        pred_cubemap = pager(batch, modality)
        if modality == "depth":
            pred, pred_image = pager.process_depth_output(pred_cubemap, orig_size=(1024, 2048),
                                                          min_depth=MIN_DEPTH,
                                                          depth_range=DEPTH_RANGE,
                                                          log_scale=pager.model_configs["depth"]["config"].log_scale)
            pred, pred_image = pred[0].cpu().numpy(), pred_image.cpu().numpy()
            # Clip the far 1% tail so outliers do not wash out the colormap.
            pred_image = np.clip(pred_image, pred_image.min(), np.quantile(pred_image, 0.99))
            pred_image = prepare_image_for_logging(pred_image)
            pred_image = cmap(pred_image[0,...]/255.0)
            pred_image = (pred_image[..., :3] * 255).astype(np.uint8)
        elif modality == "normal":
            pred = pager.process_normal_output(pred_cubemap, orig_size=(1024, 2048))
            pred = pred.cpu().numpy()
            pred_image = pred.copy()
            pred_image = prepare_image_for_logging(pred_image).transpose(1,2,0)

    return pred_image, pred
95
+
96
def process_panorama(image_path, output_type, include_pointcloud):
    """Gradio callback: run depth or surface-normal inference, optionally
    exporting a .glb point cloud.

    Args:
        image_path: filesystem path of the uploaded ERP image.
        output_type: radio value, "Depth" or "Surface Normals".
        include_pointcloud: whether to also build the 3D point cloud.

    Returns:
        A pair of gr.update objects for the output image and Model3D components.
    """
    # The pipeline operates at a fixed 2048x1024 ERP resolution.
    loaded_image = Image.open(image_path).convert("RGB").resize((2048, 1024))
    input_rgb = np.array(loaded_image)

    modality = "depth" if output_type.lower() == "depth" else "normal"
    is_depth = modality == "depth"
    main_label = "Depth Output" if is_depth else "Surface Normal Output"
    pc_label = (
        "RGB-colored Point Cloud" if is_depth else "Surface Normals-Colored Point Cloud"
    )
    output_image, raw_pred = generate_ERP(input_rgb, modality)

    point_cloud = None
    if include_pointcloud:
        if is_depth:
            depth = np.squeeze(np.array(raw_pred))
            # RGB colors rescaled to [-1, 1] for the glb exporter.
            color = (input_rgb.astype(np.float32) / 127.5) - 1.0
        else:
            # Color the cloud by the predicted normals; a depth pass is still
            # needed for 3D positions, so run the depth model as well.
            color = np.array(raw_pred)
            color = np.transpose(color, (1, 2, 0))
            _release_cuda_memory()
            depth = np.squeeze(generate_ERP(input_rgb, "depth", )[1])

        # Drop points across strong depth discontinuities so the cloud does
        # not contain stretched "curtains" between fore- and background.
        edge_filtered_mask = compute_edge_mask(
            depth,
            abs_thresh=0.002,
            rel_thresh=0.002,
        )

        if POINTCLOUD_DOWNSAMPLE_FACTOR > 1:
            depth = depth[::POINTCLOUD_DOWNSAMPLE_FACTOR, ::POINTCLOUD_DOWNSAMPLE_FACTOR]
            color = color[::POINTCLOUD_DOWNSAMPLE_FACTOR, ::POINTCLOUD_DOWNSAMPLE_FACTOR]
            edge_filtered_mask = edge_filtered_mask[::POINTCLOUD_DOWNSAMPLE_FACTOR, ::POINTCLOUD_DOWNSAMPLE_FACTOR]

        # delete=False is required: gradio serves the file after this returns.
        tmp = NamedTemporaryFile(suffix=".glb", delete=False)
        erp_to_point_cloud_glb(
            color, depth, edge_filtered_mask, export_path=tmp.name)

        tmp.close()
        point_cloud = tmp.name

    _release_cuda_memory()

    return (
        gr.update(value=output_image, label=main_label),
        gr.update(value=point_cloud, label=pc_label),
    )
143
+
144
+
145
def clear_pointcloud():
    """Blank the Model3D component before a new inference run starts."""
    empty_update = gr.update(value=None)
    return empty_update
147
+
148
+
149
# ---- One-time startup: CLI args, device, logging, model loading, UI graph ----
args = parse_args()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Plain stdout logger; propagation disabled to avoid duplicate log lines.
logger = logging.getLogger("simple")
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.propagate = False
# Colormap used for depth visualizations.
cmap = plt.get_cmap("Spectral")


# Resolve each checkpoint's config.yaml: try the Hub first, otherwise treat
# the checkpoint path as a local directory.
checkpoint_config = {}
try:
    depth_checkpoint_config_path = hf_hub_download(
        repo_id=args.depth_checkpoint_path,
        filename="config.yaml"
    )
except Exception as e:
    depth_checkpoint_config_path = Path(args.depth_checkpoint_path) / "config.yaml"
depth_config = OmegaConf.load(depth_checkpoint_config_path)
checkpoint_config["depth"] = {"path": args.depth_checkpoint_path, "mode": "trained", "config": depth_config.model}

try:
    normal_checkpoint_config_path = hf_hub_download(
        repo_id=args.normals_checkpoint_path,
        filename="config.yaml"
    )
except Exception as e:
    normal_checkpoint_config_path = Path(args.normals_checkpoint_path) / "config.yaml"
normal_config = OmegaConf.load(normal_checkpoint_config_path)
checkpoint_config["normal"] = {"path": args.normals_checkpoint_path, "mode": "trained", "config": normal_config.model}

# Both UNets share the base pipeline referenced by the depth config.
pager = Pager(model_configs=checkpoint_config, pretrained_path = depth_config.model.pretrained_path, device=device)
pager.unet["depth"].to(device, dtype=pager.weight_dtype)
pager.unet["depth"].eval()
pager.unet["normal"].to(device, dtype=pager.weight_dtype)
pager.unet["normal"].eval()


# ---- Gradio UI: inputs on the left, rendered output on the right, 3D below ----
with gr.Blocks() as demo:
    gr.Markdown("## 📟 PaGeR: Panoramic Geometry Reconstruction")

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="RGB ERP Image",
                type="filepath",
                height=320,
            )
            output_choice = gr.Radio(
                ["Depth", "Surface Normals"],
                value="Depth",
                label="Output Type",
            )
            pointcloud_checkbox = gr.Checkbox(
                label="Generate Point Cloud",
                value=True,
            )
            gr.Examples(
                examples=EXAMPLE_IMAGES,
                inputs=image_input,
                label="Pick an example (or upload your own above)",
                examples_per_page=8,
                cache_examples=False,
            )
            run_button = gr.Button("Run Inference")

        with gr.Column(scale=1):
            rendered_output = gr.Image(
                label="Output",
                type="numpy",
                height=320,
            )

    with gr.Row():
        pointcloud_output = gr.Model3D(
            label="Point Cloud",
            height=360,
            clear_color=[0.0, 0.0, 0.0, 0.0],
        )

    # Clear the stale point cloud immediately, then run inference.
    (
        run_button.click(
            fn=clear_pointcloud,
            outputs=pointcloud_output,
            queue=False,
        )
        .then(
            fn=process_panorama,
            inputs=[image_input, output_choice, pointcloud_checkbox],
            outputs=[rendered_output, pointcloud_output],
        )
    )

if __name__ == "__main__":
    _release_cuda_memory()
    demo.launch()
examples/alice.jpg ADDED

Git LFS Details

  • SHA256: 08bc06d4f11394aba2ed22a211186cda79979449fde0a6216549e547c430c3e5
  • Pointer size: 131 Bytes
  • Size of remote file: 287 kB
examples/example_1.jpg ADDED

Git LFS Details

  • SHA256: 875fc6107ae7b2d666767e2847e5652efab9d069805487c33c26fdc3897f6210
  • Pointer size: 131 Bytes
  • Size of remote file: 246 kB
examples/example_2.jpg ADDED

Git LFS Details

  • SHA256: 540fbc57b080fada086be2e3bed0bc5d7b58f9eb55d4637853f6772c355a42b6
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
examples/greenhouse.jpg ADDED

Git LFS Details

  • SHA256: 2f17cebe56b3df5c02e30ff1ae2ad1771bfd10441ed6842543dae1e2f09a540a
  • Pointer size: 131 Bytes
  • Size of remote file: 517 kB
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.2.0
2
+ xformers==0.0.24
3
+ accelerate==0.27.2
4
+ gradio==6.0.2
5
+ huggingface-hub==0.36.0
6
+ transformers
7
+ diffusers==0.30.2
8
+ numpy==1.26.4
9
+ scipy==1.15.1
10
+ matplotlib==3.10.0
11
+ tqdm==4.67.1
12
+ einops==0.8.1
13
+ datasets==3.3.0
14
+ python-dotenv==1.1.1
15
+ wandb==0.19.6
16
+ opencv-python==4.11.0.86
17
+ pytorch360convert==0.2.3
18
+ trimesh==4.9.0
src/__init__.py ADDED
File without changes
src/pager.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import Conv2d
4
+ from transformers import CLIPTextModel, CLIPTokenizer
5
+ from diffusers import DDPMScheduler
6
+ from diffusers.utils.import_utils import is_xformers_available
7
+ from Marigold.unet.unet_2d_condition import UNet2DConditionModel
8
+ from Marigold.vae.autoencoder_kl import AutoencoderKL
9
+ from src.utils.conv_padding import PaddedConv2d, valid_pad_conv_fn
10
+ from src.utils.loss import L1Loss, GradL1Loss, CosineNormalLoss
11
+ from src.utils.geometry_utils import (
12
+ get_positional_encoding,
13
+ compute_scale_and_shift,
14
+ compute_shift,
15
+ depth_to_normals_erp,
16
+ cubemap_to_erp
17
+ )
18
+
19
+
20
class Pager(nn.Module):
    """One-step diffusion model for panoramic geometry estimation.

    Holds one fine-tuned UNet per modality ("depth" / "normal") plus a shared,
    frozen VAE and cached empty-prompt CLIP embedding. Inference runs a single
    deterministic denoising step on six cubemap faces.
    """

    def __init__(self,
                 model_configs,
                 pretrained_path,
                 train_modality=None,
                 device=torch.device("cpu"),
                 weight_dtype=torch.float32):
        """
        Args:
            model_configs: dict modality -> {"path", "mode", "config"} describing each UNet checkpoint.
            pretrained_path: base SD-style checkpoint providing scheduler/tokenizer/text encoder/VAE.
            train_modality: modality being trained, or None for pure inference.
            device: target device for frozen components and cached tensors.
            weight_dtype: dtype for frozen weights and cached tensors.
        """
        super().__init__()
        self.model_configs = model_configs
        self.weight_dtype = weight_dtype
        # Stable-Diffusion latent scaling constant, applied to both RGB input
        # latents and decoded geometry latents.
        self.rgb_latent_scale_factor = 0.18215
        self.depth_latent_scale_factor = 0.18215
        self.train_modality = train_modality
        self.device = device
        self.prepare_model_components(pretrained_path, model_configs)
        self.prepare_empty_encoding()

        # Cache cumulative alphas, then drop the scheduler: single-step
        # inference only needs alpha/beta at the final timestep.
        self.alpha_prod = self.noise_scheduler.alphas_cumprod.to(device, dtype=weight_dtype)
        self.beta_prod = 1 - self.alpha_prod
        self.num_timesteps = self.noise_scheduler.config.num_train_timesteps - 1
        del self.noise_scheduler


    def prepare_model_components(self, pretrained_path, model_configs):
        """Load scheduler/tokenizer/text encoder/VAE and one UNet per modality."""
        # A single VAE is shared across modalities, so all checkpoints must
        # agree on the VAE positional-encoding variant.
        vae_use_RoPE = None
        for checkpoint_cfg in model_configs.values():
            if vae_use_RoPE is None:
                vae_use_RoPE = checkpoint_cfg['config'].vae_use_RoPE == "RoPE"
            elif vae_use_RoPE != (checkpoint_cfg['config'].vae_use_RoPE == "RoPE"):
                raise ValueError("All UNet checkpoints must use the same VAE positional encoding configuration.")

        self.noise_scheduler = DDPMScheduler.from_pretrained(pretrained_path, subfolder="scheduler", rescale_betas_zero_snr=True)
        self.tokenizer = CLIPTokenizer.from_pretrained(pretrained_path, subfolder="tokenizer", revision=None)
        self.text_encoder = CLIPTextModel.from_pretrained(pretrained_path, subfolder="text_encoder", revision=None, variant=None)
        self.vae = AutoencoderKL.from_pretrained(pretrained_path, subfolder="vae", revision=None, variant=None,
                                                 use_RoPE = vae_use_RoPE)
        # Swap every zero-padded conv for cubemap-aware "valid" padding.
        self.set_valid_pad_conv(self.vae)

        self.vae.requires_grad_(False)
        self.vae.to(self.device, dtype=self.weight_dtype)
        self.vae.eval()

        self.text_encoder.requires_grad_(False)
        self.text_encoder.to(self.device, dtype=self.weight_dtype)
        self.text_encoder.eval()

        # 8 = 4 RGB-latent + 4 noisy-latent channels; +2 optional UV-PE channels.
        base_in_channels = 8
        pe_channels_size = 0

        # Plain dict (not nn.ModuleDict): UNets are moved/trained explicitly by callers.
        self.unet = {}
        for modality, checkpoint_cfg in model_configs.items():
            if checkpoint_cfg['config'].unet_positional_encoding == "uv":
                pe_channels_size = 2
            target_in_channels = base_in_channels + pe_channels_size

            self.unet[modality] = UNet2DConditionModel.from_pretrained(
                checkpoint_cfg["path"],
                subfolder="unet",
                revision=None,
                in_channels=target_in_channels if checkpoint_cfg["mode"] == "trained" else base_in_channels,
                use_RoPE=checkpoint_cfg['config'].unet_positional_encoding == "RoPE"
            )

            # Untrained base checkpoints need conv_in widened for the extra
            # positional-encoding channels.
            if target_in_channels > base_in_channels and checkpoint_cfg["mode"] != "trained":
                self.extend_unet_conv_in(self.unet[modality], new_in_channels=target_in_channels)
            self.set_valid_pad_conv(self.unet[modality])

            if checkpoint_cfg['config'].enable_xformers:
                if is_xformers_available():
                    import xformers
                    if self.unet.get("depth"):
                        self.unet["depth"].enable_xformers_memory_efficient_attention()
                    if self.unet.get("normal"):
                        self.unet["normal"].enable_xformers_memory_efficient_attention()
                    self.vae.enable_xformers_memory_efficient_attention()


    def prepare_training(self, accelerator, gradient_checkpointing):
        """Wrap the trained-modality UNet with accelerate; optionally enable
        gradient checkpointing on it and the VAE."""
        self.unwrapped_unet = self.unet[self.train_modality]
        self.unet[self.train_modality] = accelerator.prepare(self.unet[self.train_modality])
        self.trained_unet = self.unet[self.train_modality]

        if gradient_checkpointing:
            self.trained_unet._set_gradient_checkpointing()
            self.vae._set_gradient_checkpointing()


    def prepare_cubemap_PE(self, image_height, image_width):
        """Precompute and cache the UV positional-encoding cubemap if any
        configured modality uses "uv" positional encoding."""
        use_uv_PE = False
        for checkpoint_cfg in self.model_configs.values():
            if checkpoint_cfg['config'].unet_positional_encoding == "uv":
                use_uv_PE = True
        if use_uv_PE:
            PE_cubemap = get_positional_encoding(image_height, image_width)
            self.PE_cubemap = PE_cubemap.to(device=self.device, dtype=self.weight_dtype)

    def prepare_empty_encoding(self):
        """Cache the CLIP embedding of the empty prompt, then free the text
        stack — prompts are always empty at inference time."""
        with torch.inference_mode():
            empty_token = self.tokenizer([""], padding="max_length", truncation=True, return_tensors="pt").input_ids
            empty_token = empty_token.to(self.device)
            empty_encoding = self.text_encoder(empty_token, return_dict=False)[0]
            self.empty_encoding = empty_encoding.to(self.device, dtype=self.weight_dtype)

        del empty_token
        del self.text_encoder
        del self.tokenizer


    def forward(self, batch, modality):
        """Single deterministic denoising step conditioned on the RGB cubemap.

        Args:
            batch: dict with "rgb_cubemap" — assumed (B, 6, 3, H, W) in [-1, 1];
                TODO confirm against the caller in app.py.
            modality: "depth" or "normal".

        Returns:
            Decoded prediction cubemap; for depth the channels are averaged
            down to one.
        """
        with torch.inference_mode():
            c, h, w = batch["rgb_cubemap"].shape[2:]
            # Flatten batch and face dims so the VAE sees (B*6, C, H, W).
            rgb_vae_input = batch["rgb_cubemap"].reshape(-1, c, h, w).to(dtype=self.weight_dtype)
            rgb_latents = self.vae.encode(rgb_vae_input, deterministic=True)
            rgb_latents = rgb_latents * self.rgb_latent_scale_factor
            del rgb_vae_input

            # Always evaluate at the final (noisiest) timestep.
            timesteps = torch.ones((rgb_latents.shape[0],), device=self.device) * self.num_timesteps
            timesteps = timesteps.long()
            alpha_prod_t = self.alpha_prod[timesteps].view(-1, 1, 1, 1)
            beta_prod_t = self.beta_prod[timesteps].view(-1, 1, 1, 1)

            # Zero "noise" makes the single step deterministic.
            noisy_latents = torch.zeros_like(rgb_latents).to(self.device)
            encoder_hidden_states = self.empty_encoding.repeat(batch["rgb_cubemap"].shape[0] * 6, 1, 1)
            if self.model_configs[modality]['config'].unet_positional_encoding == "uv":
                batch_PE_cubemap = self.PE_cubemap.repeat(batch["rgb_cubemap"].shape[0], 1, 1, 1)
                unet_input = torch.cat((rgb_latents, noisy_latents, batch_PE_cubemap), dim=1).to(
                    self.device
                )
            else:
                unet_input = torch.cat((rgb_latents, noisy_latents), dim=1).to(self.device)

            del rgb_latents
            model_pred = self.unet[modality](
                unet_input,
                timesteps,
                encoder_hidden_states,
                return_dict=False,
            )[0]

            # Latent estimate from the model output at x_t = 0; the formula
            # matches a v-prediction-style update — confirm against the
            # scheduler's prediction_type before relying on this reading.
            current_latent_estimate = (alpha_prod_t**0.5) * noisy_latents - (beta_prod_t**0.5) * model_pred
            current_scaled_latent_estimate = current_latent_estimate / self.depth_latent_scale_factor
            pred_cubemap = self.vae.decode(current_scaled_latent_estimate, deterministic=True)

            if modality == "depth":
                pred_cubemap = pred_cubemap.mean(dim=1, keepdim=True)
            return pred_cubemap


    def prepare_losses_dict(self, loss_cfg):
        """Build the per-modality loss table: L1 (+ optional gradient and
        normal-consistency terms) for depth; cosine loss for normals."""
        self.losses_dict = {}
        if self.train_modality == "depth":
            self.losses_dict["l1_loss"] = {"loss_fn": L1Loss(invalid_mask_weight=loss_cfg.invalid_mask_weight),
                                           "weight": loss_cfg.l1_loss_weight}
            if loss_cfg.grad_loss_weight > 0.0:
                self.losses_dict["grad_loss"] = {"loss_fn": GradL1Loss(), "weight": loss_cfg.grad_loss_weight}
            if loss_cfg.normals_consistency_loss_weight > 0.0:
                self.losses_dict["normals_consistency_loss"] = {"loss_fn": CosineNormalLoss(),
                                                                "weight": loss_cfg.normals_consistency_loss_weight}
        else:
            self.losses_dict["cosine_normal_loss"] = {"loss_fn": CosineNormalLoss(), "weight": 1.0}


    def calculate_depth_loss(self, batch, pred_cubemap, min_depth, depth_range, log_scale, metric_depth):
        """Compute the weighted depth losses after aligning the prediction's
        scale/shift to GT (skipped for metric-depth training)."""
        loss = {"total_loss": 0.0}

        gt_depth_cubemap = batch['depth_cubemap'].squeeze(0).mean(dim=1, keepdim=True)
        mask_cubemap = batch["mask_cubemap"].squeeze(0)

        if not metric_depth:
            # In log space only a shift (global scale in linear space) is fitted;
            # otherwise a full affine scale+shift alignment.
            if log_scale:
                scale = compute_shift(pred_cubemap, gt_depth_cubemap, mask_cubemap)
            else:
                scale, shift = compute_scale_and_shift(pred_cubemap, gt_depth_cubemap, mask_cubemap)

            # NOTE(review): alignment application placed inside the
            # non-metric branch — `scale`/`shift` are only defined here.
            if log_scale:
                pred_cubemap += scale
            else:
                pred_cubemap = pred_cubemap * scale + shift

        for loss_name, loss_params in self.losses_dict.items():
            if loss_name == "normals_consistency_loss":
                # Derive normals from the predicted depth panorama and compare
                # against GT normals.
                gt = batch['normal']
                pred_depth = pred_cubemap
                mask = batch["mask"]
                pred_depth = self.process_depth_output(pred_depth, orig_size=gt.shape[2:], min_depth=min_depth,
                                                       depth_range=depth_range, log_scale=log_scale)[0]
                pred = depth_to_normals_erp(pred_depth).unsqueeze(0)
            else:
                pred = pred_cubemap
                gt = gt_depth_cubemap
                mask = mask_cubemap
            loss[loss_name] = loss_params["loss_fn"](pred, gt, mask)
            loss["total_loss"] += loss[loss_name] * loss_params["weight"]

        return loss


    def calculate_normal_loss(self, batch, pred_cubemap):
        """Compute the weighted surface-normal losses on the cubemap faces."""
        loss = {"total_loss": 0.0}

        gt_normal_cubemap = batch['normal_cubemap'].squeeze(0)
        mask_cubemap = batch["mask_cubemap"].squeeze(0)

        for loss_name, loss_params in self.losses_dict.items():
            pred = pred_cubemap
            gt = gt_normal_cubemap
            loss[loss_name] = loss_params["loss_fn"](pred, gt, mask_cubemap)
            loss["total_loss"] += loss[loss_name] * loss_params["weight"]

        return loss

    def process_depth_output(self,pred_cubemap, orig_size, min_depth, depth_range, log_scale, mask=None):
        """Convert a predicted depth cubemap into (metric panorama, log-space
        visualization panorama).

        The raw prediction in [-1, 1] is remapped to [0, 1] and then to
        log-depths min_depth + [0, depth_range].
        """
        pred_panorama = cubemap_to_erp(pred_cubemap, *orig_size)
        pred_panorama = torch.clamp(pred_panorama, -1, 1)
        pred_panorama = (pred_panorama + 1) / 2
        if mask is not None:
            pred_panorama *= mask
        pred_panorama = pred_panorama * depth_range + min_depth
        if log_scale:
            pred_panorama_viz = pred_panorama.clone()
            pred_panorama = torch.exp(pred_panorama)
        else:
            # NOTE(review): zero depths (e.g. masked pixels) give -inf here.
            pred_panorama_viz = torch.log(pred_panorama)

        return pred_panorama, pred_panorama_viz


    def process_normal_output(self,pred_cubemap, orig_size):
        """Convert a predicted normal cubemap to an ERP panorama clamped to [-1, 1]."""
        pred_panorama = cubemap_to_erp(pred_cubemap, *orig_size)
        pred_panorama = torch.clamp(pred_panorama, -1, 1)
        return pred_panorama


    def extend_unet_conv_in(self, unet, new_in_channels: int):
        """Widen conv_in to accept extra input channels; new channels start at
        zero weight so the pretrained behavior is preserved initially."""
        if new_in_channels < unet.conv_in.in_channels:
            raise ValueError(
                f"new_in_channels ({new_in_channels}) must be >= current "
                f"{unet.conv_in.in_channels}"
            )
        if new_in_channels == unet.conv_in.in_channels:
            return

        old_conv = unet.conv_in
        old_in = old_conv.in_channels
        device, dtype = old_conv.weight.device, old_conv.weight.dtype
        bias_flag = old_conv.bias is not None

        new_conv = Conv2d(
            new_in_channels,
            old_conv.out_channels,
            kernel_size=old_conv.kernel_size,
            stride=old_conv.stride,
            padding=old_conv.padding,
            bias=bias_flag,
            padding_mode=old_conv.padding_mode,
        ).to(device=device, dtype=dtype)

        # Copy pretrained weights into the first old_in channels; the rest stay zero.
        new_conv.weight.zero_()
        new_conv.weight[:, :old_in].copy_(old_conv.weight)
        if bias_flag:
            new_conv.bias.copy_(old_conv.bias)

        unet.conv_in = new_conv
        unet.config["in_channels"] = new_in_channels


    def set_valid_pad_conv(self, module: nn.Module):
        """Recursively replace padded Conv2d layers with cubemap-aware
        PaddedConv2d (one-sided padding for Downsample2D's conv)."""
        for name, child in list(module.named_children()):
            if isinstance(child, nn.Conv2d):
                if child.padding != (0, 0):
                    setattr(module, name, PaddedConv2d.from_existing(child, valid_pad_conv_fn))
                elif module.__class__.__name__ == "Downsample2D" and module.use_conv:
                    setattr(module, name, PaddedConv2d.from_existing(child, valid_pad_conv_fn, one_side_pad=True))
            else:
                self.set_valid_pad_conv(child)


    def save_model(self, ema_unet, model_save_dir):
        """Save the unwrapped UNet; when EMA weights exist, also save an EMA
        snapshot and restore the live weights afterwards."""
        self.unwrapped_unet.save_pretrained(model_save_dir / "original")
        if ema_unet is not None:
            ema_unet.store(self.unwrapped_unet.parameters())
            ema_unet.copy_to(self.unwrapped_unet.parameters())
            self.unwrapped_unet.save_pretrained(model_save_dir / f"EMA")
            ema_unet.restore(self.unwrapped_unet.parameters())
306
+
307
+
308
+
src/utils/__init__.py ADDED
File without changes
src/utils/conv_padding.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
# Cube-face adjacency tables for a 6-face stack (face order assumed to match
# pytorch360convert's "stack" layout — TODO confirm).
# orderings[i] = [i, right-neighbour, left-neighbour, top-neighbour, bottom-neighbour]
orderings = [
    [0, 1, 3, 4, 5],
    [1, 2, 0, 4, 5],
    [2, 3, 1, 4, 5],
    [3, 0, 2, 4, 5],
    [4, 1, 3, 2, 0],
    [5, 1, 3, 0, 2],
]
# rotations[i][k]: relative orientation (in quarter turns; -1 = -90°) of
# neighbour k's edge before it can be stitched onto face i's border.
rotations = [
    [0, 0, 0, 0, 0],
    [0, 0, 0,-1, 1],
    [0, 0, 0, 2, 2],
    [0, 0, 0, 1,-1],
    [0, 1,-1, 2, 0],
    [0,-1, 1, 0, 2]
]
20
+
21
+ def _take_right(face, rot):
22
+ if rot == 0:
23
+ return face[:, :, 0]
24
+ elif rot == 1:
25
+ return face[:, 0, :].flip(1)
26
+ elif rot == 2:
27
+ return face[:, :, -1].flip(1)
28
+ elif rot == -1:
29
+ return face[:, -1, :]
30
+
31
+ def _take_left(face, rot):
32
+ if rot == 0:
33
+ return face[:, :, -1]
34
+ elif rot == 1:
35
+ return face[:, -1, :].flip(1)
36
+ elif rot == 2:
37
+ return face[:, :, 0].flip(1)
38
+ elif rot == -1:
39
+ return face[:, 0, :]
40
+
41
+ def _take_top(face, rot):
42
+ if rot == 0:
43
+ return face[:, -1, :]
44
+ elif rot == 1:
45
+ return face[:, :, 0]
46
+ elif rot == 2:
47
+ return face[:, 0, :].flip(1)
48
+ elif rot == -1:
49
+ return face[:, :, -1].flip(1)
50
+
51
+ def _take_bottom(face, rot):
52
+ if rot == 0:
53
+ return face[:, 0, :]
54
+ elif rot == 1:
55
+ return face[:, :, -1]
56
+ elif rot == 2:
57
+ return face[:, -1, :].flip(1)
58
+ elif rot == -1:
59
+ return face[:, :, 0].flip(1)
60
+
61
def valid_pad_conv_fn(x, one_side_pad=False):
    """Pad a stack of six cube faces with a 1-pixel halo taken from the
    geometrically adjacent faces (instead of zeros), so convolutions are
    seam-free across the cubemap.

    Args:
        x: (6, C, H, W) tensor of cube-face feature maps.
        one_side_pad: emulate one-sided padding for stride-2 downsampling
            convs — the last row/column is cropped before padding and the
            top/left halo row/column is dropped afterwards, so the output
            keeps the original (H, W).

    Returns:
        (6, C, H+2, W+2) padded tensor, or (6, C, H, W) when one_side_pad.
    """
    if one_side_pad:
        x = x[:, :, :-1, :-1]
    assert x.ndim == 4 and x.shape[0] == 6
    _, C, H, W = x.shape
    y = x.new_empty(6, C, H+2, W+2)
    y[..., 1:-1, 1:-1] = x

    for i in range(6):
        # Neighbouring face indices and their relative rotations — see the
        # orderings/rotations tables above.
        r_idx, l_idx, t_idx, b_idx = orderings[i][1:5]
        r_rot, l_rot, t_rot, b_rot = rotations[i][1:5]

        r_edge = _take_right (x[r_idx], r_rot)
        l_edge = _take_left (x[l_idx], l_rot)
        t_edge = _take_top (x[t_idx], t_rot)
        b_edge = _take_bottom(x[b_idx], b_rot)

        y[i, :, 1:-1, 0 ] = l_edge
        y[i, :, 1:-1, -1 ] = r_edge
        y[i, :, 0, 1:-1] = t_edge
        y[i, :, -1, 1:-1] = b_edge

        # Corner pixels have no single source face; average the two adjacent
        # halo pixels instead.
        y[i, :, 0, 0 ] = 0.5*(y[i, :, 0, 1] + y[i, :, 1, 0])
        y[i, :, 0, -1 ] = 0.5*(y[i, :, 0, -2] + y[i, :, 1, -1])
        y[i, :, -1, 0 ] = 0.5*(y[i, :, -2, 0] + y[i, :, -1, 1])
        y[i, :, -1,-1 ] = 0.5*(y[i, :, -2, -1] + y[i, :, -1, -2])

    if one_side_pad:
        return y[:, :, 1:, 1:]

    return y
92
+
93
+
94
class PaddedConv2d(nn.Conv2d):
    """Conv2d whose padding comes from an external `pad_fn` (e.g. cubemap-aware
    halo padding) rather than the built-in zero padding."""

    def __init__(self, *args, pad_fn=None, one_side_pad=False, **kwargs):
        # Built-in padding is forced to 0; pad_fn supplies the halo instead.
        super().__init__(*args, **{**kwargs, "padding": 0})
        self.pad_fn = pad_fn
        self.one_side_pad = one_side_pad

    def forward(self, x):
        """Pad with pad_fn, then run an unpadded convolution."""
        padded = self.pad_fn(x, one_side_pad=self.one_side_pad)
        return F.conv2d(
            padded,
            self.weight,
            self.bias,
            stride=self.stride,
            padding=0,
            dilation=self.dilation,
            groups=self.groups,
        )

    @classmethod
    def from_existing(cls, conv: nn.Conv2d, pad_fn, one_side_pad=False):
        """Clone an existing Conv2d's geometry and share its parameters,
        swapping zero padding for pad_fn-based padding."""
        clone = cls(
            conv.in_channels,
            conv.out_channels,
            conv.kernel_size,
            stride=conv.stride,
            padding=0,
            dilation=conv.dilation,
            groups=conv.groups,
            bias=(conv.bias is not None),
            padding_mode="zeros",
            pad_fn=pad_fn,
            one_side_pad=one_side_pad,
        )
        # Share (not copy) the original parameters.
        clone.weight = conv.weight
        if conv.bias is not None:
            clone.bias = conv.bias
        return clone
122
+
123
+
src/utils/geometry_utils.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import trimesh
4
+ from pytorch360convert import e2c, c2e
5
+
6
+
7
def erp_to_cubemap(erp_tensor, face_w = 768, cube_format = "stack", mode = "bilinear", **kwargs):
    """Project an equirectangular (ERP) tensor onto six cube faces.

    Thin wrapper over pytorch360convert.e2c; extra kwargs pass through.
    """
    return e2c(erp_tensor, face_w=face_w, cube_format=cube_format, mode=mode, **kwargs)
9
+
10
+
11
def cubemap_to_erp(cube_tensor, erp_h = 1024, erp_w = 2048, cube_format = "stack", mode = "bilinear", **kwargs):
    """Reproject six cube faces back to an equirectangular (ERP) panorama.

    Thin wrapper over pytorch360convert.c2e; extra kwargs pass through.
    """
    return c2e(cube_tensor, h=erp_h, w=erp_w, cube_format=cube_format, mode=mode, **kwargs)
13
+
14
def roll_augment(data, shift_x):
    """Horizontally roll ERP data by `shift_x` pixels (wrap-around augmentation).

    Accepts a 2D (H, W) map, a channel-first (3, H, W) image, or a
    channel-last (H, W, C) image; the output keeps the input layout.

    Args:
        data: numpy array, 2D or 3D as above.
        shift_x: pixel shift along the width axis.

    Returns:
        The rolled array, same shape and layout as `data`.
    """
    originally_2d = data.ndim == 2
    if originally_2d:
        data = data[:, :, np.newaxis]
    # Fix: a 2D input with exactly 3 rows previously skipped the axis move and
    # rolled the size-1 channel axis (a no-op) instead of the width. Every
    # originally-2D input is now treated as channel-last so width is always axis 2.
    moved_axis = originally_2d or (data.ndim == 3 and data.shape[0] != 3)
    if moved_axis:
        data = np.moveaxis(data, -1, 0)

    data_rolled = np.roll(data, int(shift_x), axis=2)

    if moved_axis:
        data_rolled = np.moveaxis(data_rolled, 0, -1)
    if originally_2d:
        data_rolled = data_rolled[:, :, 0]
    return data_rolled
33
+
34
+
35
def roll_normal(normal, shift_x):
    """Rotate normal vectors about the vertical (y) axis by the yaw that
    corresponds to a horizontal ERP roll of `shift_x` pixels.

    Accepts channel-first (3, H, W) or channel-last (H, W, 3) input and
    returns the same layout. Note: only the vector directions are rotated;
    the pixel grid itself is not shifted (pair with roll_augment for that).
    """
    originally_2d = normal.ndim == 2
    if originally_2d:
        normal = normal[:, :, np.newaxis]
    moved_axis = normal.ndim == 3 and normal.shape[0] != 3
    if moved_axis:
        normal = np.moveaxis(normal, -1, 0)

    _, H, W = normal.shape

    # One full image width corresponds to a 2*pi yaw.
    angle = - 2.0 * np.pi * (shift_x / float(W))
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    R = np.array([
        [ cos_a, 0.0, -sin_a],
        [ 0.0, 1.0, 0.0 ],
        [ sin_a, 0.0, cos_a]
    ], dtype=normal.dtype)

    normal = (R @ normal.reshape(3, -1)).reshape(3, H, W)

    if moved_axis:
        normal = np.moveaxis(normal, 0, -1)
    if originally_2d:
        normal = normal[:, :, 0]
    return normal
66
+
67
+
68
def compute_scale_and_shift(pred_g, targ_g, mask_g = None, eps = 0.0, fit_shift = True):
    """Least-squares affine alignment of pred_g to targ_g under mask_g.

    Solves min_s,t || m * (s * pred + t - targ) ||^2 per batch element via the
    2x2 normal equations. With fit_shift=False only the scale is fitted and
    the shift is zero.

    Returns:
        (scale, shift) tensors of shape (batch,).
    """
    if mask_g is None:
        mask_g = torch.ones_like(pred_g, dtype=torch.bool)

    # Fold a stack of 6 cube faces (or a single CHW map) into a batch of one.
    if pred_g.shape[0] == 6:
        pred_g = pred_g.view(1, 6, pred_g.shape[2], pred_g.shape[3])
        targ_g = targ_g.view(1, 6, targ_g.shape[2], targ_g.shape[3])
        mask_g = mask_g.view(1, 6, mask_g.shape[2], mask_g.shape[3])
    elif pred_g.shape[0] == 1 and pred_g.dim() == 3:
        pred_g, targ_g, mask_g = (t.unsqueeze(0) for t in (pred_g, targ_g, mask_g))

    weights = mask_g.to(dtype=pred_g.dtype)
    reduce_dims = (1, 2, 3)

    a_00 = torch.sum(weights * pred_g * pred_g, dim=reduce_dims)
    a_01 = torch.sum(weights * pred_g, dim=reduce_dims)
    a_11 = torch.sum(weights, dim=reduce_dims)
    b_0 = torch.sum(weights * pred_g * targ_g, dim=reduce_dims)
    b_1 = torch.sum(weights * targ_g, dim=reduce_dims)

    if not fit_shift:
        # Scale-only closed form; shift pinned to zero.
        scale = b_0 / (a_00 + eps)
        return scale, torch.zeros_like(scale)

    # 2x2 normal equations; entries with non-positive determinant get (0, 0).
    det = a_00 * a_11 - a_01 * a_01
    det = det + eps
    scale = torch.zeros_like(b_0)
    shift = torch.zeros_like(b_1)
    solvable = det > 0
    scale[solvable] = (a_11[solvable] * b_0[solvable] - a_01[solvable] * b_1[solvable]) / det[solvable]
    shift[solvable] = (-a_01[solvable] * b_0[solvable] + a_00[solvable] * b_1[solvable]) / det[solvable]
    return scale, shift
102
+
103
+
104
def compute_shift(pred, targ, mask, eps = 1e-6):
    """Weighted mean residual (targ - pred) over the mask, per batch item.

    A leading dim of 6 is treated as a stacked cubemap and folded into a
    single six-channel batch entry. `eps` guards against empty masks.
    """
    if pred.shape[0] == 6:
        pred = pred.view(1, 6, *pred.shape[2:])
        targ = targ.view(1, 6, *targ.shape[2:])
        mask = mask.view(1, 6, *mask.shape[2:])

    weights = mask.float()
    residual_sum = (weights * (targ - pred)).sum(dim=(1, 2, 3))
    weight_sum = weights.sum(dim=(1, 2, 3)).clamp_min(eps)
    return residual_sum / weight_sum
115
+
116
+
117
def get_positional_encoding(H, W, pixel_center = True, hw = 96):
    """Build a per-pixel (u, v) positional map in [-1, 1] for an H x W ERP
    image and project it onto cubemap faces of side `hw` via erp_to_cubemap.
    """
    cols = np.arange(W, dtype=np.float64)
    rows = np.arange(H, dtype=np.float64)
    if pixel_center:
        cols = cols + 0.5
        rows = rows + 0.5

    # Normalise pixel coordinates to [-1, 1].
    u_axis = (cols / W) * 2.0 - 1.0
    v_axis = (rows / H) * 2.0 - 1.0
    u_grid, v_grid = np.meshgrid(u_axis, v_axis, indexing='xy')

    uv = np.stack([u_grid, v_grid], axis=-1)                  # (H, W, 2)
    uv_chw = torch.from_numpy(uv).permute(2, 0, 1).float()    # (2, H, W)
    return erp_to_cubemap(uv_chw, face_w=hw)
133
+
134
+
135
def unit_normals(n, eps = 1e-6):
    """L2-normalise normal vectors along the channel axis (dim -3),
    clamping the norm at `eps` to avoid division by zero."""
    assert n.dim() >= 3 and n.size(-3) == 3, "normals must have channel=3 at dim -3"
    length = torch.linalg.norm(n, dim=-3, keepdim=True).clamp(min=eps)
    return n / length
139
+
140
+
141
+ def _erp_dirs(H, W, device=None, dtype=None):
142
+ u = (torch.arange(W, device=device, dtype=dtype) + 0.5) / W
143
+ v = (torch.arange(H, device=device, dtype=dtype) + 0.5) / H
144
+ theta = u * (2.0 * torch.pi) - torch.pi
145
+ phi = (0.5 - v) * torch.pi
146
+
147
+ theta = theta.view(1, W).expand(H, W)
148
+ phi = phi.view(H, 1).expand(H, W)
149
+
150
+ cosphi = torch.cos(phi)
151
+ sinphi = torch.sin(phi)
152
+ costhe = torch.cos(theta)
153
+ sinthe = torch.sin(theta)
154
+
155
+ x = cosphi * costhe
156
+ y = sinphi
157
+ z = -cosphi * sinthe
158
+
159
+ dirs = torch.stack([x, y, z], dim=0)
160
+ return dirs
161
+
162
+
163
def depth_to_normals_erp(depth, eps = 1e-6):
    """Estimate surface normals from an ERP depth map by finite differences.

    Args:
        depth: (1, H, W) tensor of radial depths along the ERP view directions.
        eps: clamp floor used when normalising the cross products.

    Returns:
        (3, H, W) tensor of unit normals, computed as the (normalised) cross
        product of the longitude/latitude tangents of the back-projected surface.
    """
    # Fix: the previous message claimed "(B,1,H,W)" but the check (and the
    # arithmetic below) actually require a single (1, H, W) map.
    assert depth.dim() == 3 and depth.size(0) == 1, "depth must be (1, H, W)"
    _, H, W = depth.shape
    device, dtype = depth.device, depth.dtype

    # Back-project each pixel to a 3D point P = depth * direction.
    dirs = _erp_dirs(H, W, device=device, dtype=dtype)
    P = depth * dirs

    # Angular step sizes of the ERP grid.
    dtheta = 2.0 * torch.pi / W
    dphi = torch.pi / H

    # Longitude derivative: central differences with horizontal wrap-around
    # (an ERP image is periodic in theta).
    P_l = torch.roll(P, shifts=+1, dims=-1)
    P_r = torch.roll(P, shifts=-1, dims=-1)
    dP_dtheta = (P_r - P_l) / (2.0 * dtheta)

    # Latitude derivative: central differences, clamped (not wrapped) at the
    # poles by repeating the first/last rows.
    P_u = torch.cat([P[:, :1, :], P[:, :-1, :]], dim=-2)
    P_d = torch.cat([P[:, 1:, :], P[:, -1:, :]], dim=-2)
    dP_dphi = (P_d - P_u) / (2.0 * dphi)

    n = torch.cross(dP_dtheta, dP_dphi, dim=0)
    return unit_normals(n, eps=eps)
187
+
188
+
189
def compute_edge_mask(depth, abs_thresh = 0.1, rel_thresh = 0.1):
    """Mask out depth discontinuities.

    A pixel is dropped when it borders a neighbour whose depth differs by more
    than `abs_thresh` absolutely AND by more than `rel_thresh` relative to the
    nearer of the two depths. Depth <= 0 is treated as invalid.

    Returns a bool (H, W) array: True where depth is valid and not on an edge.
    """
    assert depth.ndim == 2
    depth = depth.astype(np.float32, copy=False)

    valid = depth > 0
    eps = 1e-6
    edge = np.zeros_like(valid, dtype=bool)

    def flag_pairs(a_sl, b_sl):
        # Compare two shifted views of the image and mark both sides of a jump.
        da, db = depth[a_sl], depth[b_sl]
        both_valid = valid[a_sl] & valid[b_sl]
        diff = np.abs(da - db)
        rel = diff / (np.minimum(da, db) + eps)
        jump = both_valid & (diff > abs_thresh) & (rel > rel_thresh)
        edge[a_sl] |= jump
        edge[b_sl] |= jump

    flag_pairs(np.s_[:, :-1], np.s_[:, 1:])   # horizontal neighbours
    flag_pairs(np.s_[:-1, :], np.s_[1:, :])   # vertical neighbours

    return valid & ~edge
224
+
225
+
226
def erp_to_pointcloud(rgb, depth, mask = None):
    """Back-project an ERP RGB-D image to a coloured point cloud.

    Args:
        rgb: (H, W, 3) image with values in [-1, 1].
        depth: (H, W) radial depth map; zeros are treated as invalid.
        mask: optional (H, W) bool-like keep mask, combined with depth > 0.

    Returns:
        (points, colors): float32 (N, 3) coordinates and uint8 (N, 3) colours.
    """
    assert rgb.ndim == 3 and rgb.shape[-1] == 3, "rgb must be (H, W, 3)"
    assert depth.ndim == 2 and depth.shape[:2] == rgb.shape[:2], "depth must be (H, W) and match rgb H,W"

    H, W, _ = rgb.shape
    depth = depth.astype(np.float32, copy=False)

    # Pixel-center spherical angles.
    u = (np.arange(W, dtype=np.float32) + 0.5) / W
    v = (np.arange(H, dtype=np.float32) + 0.5) / H
    theta = u * (2.0 * np.pi) - np.pi
    phi = (1 - v) * np.pi - (np.pi / 2.0)
    theta, phi = np.meshgrid(theta, phi, indexing="xy")

    # Unit view directions, scaled by depth to 3D points.
    cos_phi = np.cos(phi)
    dirs = np.stack(
        [cos_phi * np.cos(theta), np.sin(phi), cos_phi * np.sin(theta)],
        axis=-1,
    )
    xyz = depth[..., None] * dirs

    keep = depth > 0
    if mask is not None:
        keep &= mask.astype(bool)

    points = xyz[keep]

    # Map rgb from [-1, 1] to uint8 [0, 255].
    colors = ((np.clip(rgb, -1.0, 1.0) * 0.5 + 0.5) * 255.0).astype(np.uint8)[keep]

    return points.astype(np.float32, copy=False), colors
263
+
264
+
265
def erp_to_point_cloud_glb(rgb, depth, mask=None, export_path=None):
    """Back-project an ERP RGB-D image and export it as a trimesh scene.

    NOTE(review): export_path defaults to None but is forwarded straight to
    Scene.export — callers appear expected to supply a real path; confirm.
    """
    vertices, vertex_colors = erp_to_pointcloud(rgb, depth, mask)
    cloud = trimesh.PointCloud(vertices=vertices, colors=vertex_colors)
    scene = trimesh.Scene()
    scene.add_geometry(cloud)
    scene.export(export_path)
    return scene
src/utils/loss.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from src.utils.geometry_utils import unit_normals
4
+
5
+
6
class L1Loss(nn.Module):
    """Masked L1 loss, optionally adding a down-weighted L1 term over the
    invalid (masked-out) pixels."""

    def __init__(self, invalid_mask_weight=0.0):
        super(L1Loss, self).__init__()
        self.name = 'L1'
        # Weight applied to the loss computed outside the valid mask.
        self.invalid_mask_weight = invalid_mask_weight

    def forward(self, pred, target, mask):
        """`mask` is a bool tensor selecting the valid pixels."""
        total = nn.functional.l1_loss(pred[mask], target[mask])
        if self.invalid_mask_weight > 0.0:
            outside = ~mask
            if outside.sum() > 0:
                invalid_term = nn.functional.l1_loss(pred[outside], target[outside])
                total = total + self.invalid_mask_weight * invalid_term
        return total
20
+
21
+
22
+
23
class GradL1Loss(nn.Module):
    """L1 loss on first-order spatial gradients (forward differences)."""

    def __init__(self):
        super().__init__()
        self.name = 'GradL1'

    def grad(self, x):
        # Forward differences, both evaluated on the top-left (H-1, W-1) crop.
        dx = x[..., :-1, 1:] - x[..., :-1, :-1]
        dy = x[..., 1:, :-1] - x[..., :-1, :-1]
        return dx, dy

    def grad_mask(self, mask):
        # A gradient sample is valid only when its whole 2x2 neighbourhood is.
        return (mask[..., :-1, :-1] & mask[..., :-1, 1:] &
                mask[..., 1:, :-1] & mask[..., 1:, 1:])

    def forward(self, pred, target, mask):
        valid = self.grad_mask(mask)
        per_axis = [
            nn.functional.l1_loss(gp[valid], gt[valid], reduction='mean')
            for gp, gt in zip(self.grad(pred), self.grad(target))
        ]
        return 0.5 * (per_axis[0] + per_axis[1])
46
+
47
+
48
class CosineNormalLoss(nn.Module):
    """1 - cos(angle) between predicted and target normals, averaged over
    the valid mask (or everything when mask is None)."""

    def __init__(self):
        super().__init__()
        self.name = "CosineNormalLoss"

    def forward(self, pred: torch.Tensor,
                target: torch.Tensor,
                mask: torch.Tensor) -> torch.Tensor:
        assert pred.shape == target.shape, "pred and target must have same shape"

        # Normalise both fields so the channel-wise dot product is a cosine.
        pred_n = unit_normals(pred)
        target_n = unit_normals(target)

        cosine = (pred_n * target_n).sum(dim=1, keepdim=True).clamp(-1.0, 1.0)
        penalty = 1.0 - cosine

        if mask is not None:
            return penalty[mask].mean()
        return penalty.mean()
src/utils/lr_scheduler.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @Vukasin Bozic 2026
2
+ # This file contains the modified version of Marigold's exponential LR scheduler.
3
+ # https://github.com/prs-eth/Marigold/blob/main/src/util/lr_scheduler.py
4
+
5
+ # Author: Bingxin Ke
6
+
7
+ import numpy as np
8
+
9
class IterExponential:
    """Per-iteration LR multiplier: linear warmup from 0 toward 1, then
    exponential decay to `final_ratio` over the remaining iterations.

    `warmup_steps` is given as a FRACTION of `total_iter_length`.
    """

    def __init__(self, total_iter_length, final_ratio, warmup_steps=0) -> None:
        self.total_length = total_iter_length
        # Number of post-warmup iterations the decay is spread over.
        self.effective_length = int(total_iter_length * (1 - warmup_steps))
        self.final_ratio = final_ratio
        self.warmup_steps = int(total_iter_length * warmup_steps)

    def __call__(self, n_iter) -> float:
        if n_iter < self.warmup_steps:
            # Linear ramp during warmup.
            return 1.0 * n_iter / self.warmup_steps
        if n_iter >= self.total_length:
            return self.final_ratio
        decay_iter = n_iter - self.warmup_steps
        # exp(t * ln r) interpolates 1 -> final_ratio geometrically.
        return np.exp(decay_iter / self.effective_length * np.log(self.final_ratio))
28
+
29
+
30
class IterConstant:
    """Constant LR multiplier (1.0) with an optional linear warmup.

    `warmup_steps` is given as a FRACTION of `total_iter_length`.
    """

    def __init__(self, total_iter_length: int, warmup_steps: float = 0.0) -> None:
        self.total_length = int(total_iter_length)
        self.warmup_steps = int(total_iter_length * warmup_steps)

    def __call__(self, n_iter: int) -> float:
        if self.warmup_steps > 0 and n_iter < self.warmup_steps:
            # Ramp (n+1)/warmup so the very first step already gets a non-zero LR.
            return float(n_iter + 1) / float(self.warmup_steps)
        return 1.0
src/utils/utils.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import cv2
4
+ import random
5
+ import wandb
6
+ from tqdm.auto import tqdm
7
+ from omegaconf import OmegaConf, DictConfig
8
+ from pathlib import Path
9
+
10
def args_to_omegaconf(args, base_cfg=None):
    """Overlay non-None argparse values onto an OmegaConf config.

    Only keys already present in `base_cfg` are considered; nesting is
    honoured one level deep (section -> key).
    """
    cfg = OmegaConf.create(base_cfg)

    def apply_override(node, key):
        # Copy args.<key> into node[key] only when the flag was actually set.
        value = getattr(args, key, None)
        if value is not None:
            node[key] = value

    for key in cfg.keys():
        child = cfg[key]
        if isinstance(child, DictConfig):
            for subkey in child.keys():
                apply_override(child, subkey)
        else:
            apply_override(cfg, key)

    return cfg
28
+
29
+ def _tb_sanitize(v):
30
+ if v is None:
31
+ return "null"
32
+ if isinstance(v, (bool, int, float, str, torch.Tensor)):
33
+ return v
34
+ if isinstance(v, Path):
35
+ return str(v)
36
+ return str(v)
37
+
38
def _flatten_dict(d, prefix=""):
    """Flatten nested dicts into {"a.b.c": value} with sanitised leaf values.

    A non-dict input is returned as a single entry keyed by `prefix`
    (or "cfg" when no prefix is given).
    """
    if not isinstance(d, dict):
        return {prefix or "cfg": _tb_sanitize(d)}

    flat = {}
    for k, v in d.items():
        full_key = f"{prefix}.{k}" if prefix else str(k)
        if isinstance(v, dict):
            flat.update(_flatten_dict(v, full_key))
        else:
            flat[full_key] = _tb_sanitize(v)
    return flat
50
+
51
def convert_paths_to_pathlib(cfg):
    """Recursively wrap every config value whose key contains 'path' in a
    pathlib.Path; None values stay None. Mutates and returns `cfg`."""
    for key, value in cfg.items():
        if isinstance(value, DictConfig):
            cfg[key] = convert_paths_to_pathlib(value)
        elif 'path' in key.lower():
            cfg[key] = None if value is None else Path(value)
    return cfg
58
+
59
+
60
def convert_pathlib_to_strings(cfg):
    """Recursively turn pathlib.Path config values back into plain strings
    (e.g. before serialising the config). Mutates and returns `cfg`."""
    for key in list(cfg.keys()):
        value = cfg[key]
        if isinstance(value, DictConfig):
            cfg[key] = convert_pathlib_to_strings(value)
        elif isinstance(value, Path):
            cfg[key] = str(value)
    return cfg
67
+
68
+
69
def prepare_trained_parameters(unet, cfg):
    """Select which UNet parameters to train and set requires_grad accordingly.

    When cfg.training.only_train_attention_layers is set, only transformer
    blocks (plus conv_in when cfg.model.unet_positional_encoding == "uv")
    are trained and everything else is frozen; otherwise every parameter
    is unfrozen.

    Returns the list of trainable parameters.
    """
    if not cfg.training.only_train_attention_layers:
        trainable = list(unet.parameters())
        for p in trainable:
            p.requires_grad_(True)
        return trainable

    trainable = []
    train_conv_in = cfg.model.unet_positional_encoding == "uv"
    for name, param in unet.named_parameters():
        selected = ("transformer_blocks" in name) or (train_conv_in and "conv_in" in name)
        param.requires_grad_(selected)
        if selected:
            trainable.append(param)
    return trainable
86
+
87
+
88
@torch.no_grad()
def validation_loop(accelerator, dataloader, pager, ema_unet, cfg, epoch, global_step, val_type="val"):
    """Run one pass over a validation dataloader, logging loss and sample images.

    Args:
        accelerator: HF Accelerate object used for cross-process reduction and logging.
        dataloader: validation DataLoader; its dataset exposes MIN_DEPTH / DEPTH_RANGE
            (and LOG_* variants) used to de-normalise depth predictions.
        pager: model wrapper — callable for prediction, plus loss and
            output-post-processing helpers.
        ema_unet: EMA weight container, used only when cfg.training.use_EMA.
        cfg: experiment config (model / training / logging sections).
        epoch, global_step: current training progress for the log x-axis.
        val_type: "val" (x-axis = epoch) or "tiny_val" (x-axis = global_step).

    Returns:
        Mean (cross-process-averaged) validation loss over the dataloader.
    """
    if val_type == "val":
        desc = "Validation"
        x_axis_name = "epoch"
        x_axis = epoch
    elif val_type == "tiny_val":
        desc = "Tiny Validation"
        x_axis_name = "global_step"
        x_axis = global_step
    else:
        raise ValueError(f"Unknown val type {val_type}")
    if cfg.training.use_EMA:
        # Validate with EMA weights; the originals are restored at the end.
        ema_unet.store(pager.unwrapped_unet.parameters())
        ema_unet.copy_to(pager.unwrapped_unet.parameters())
    val_epoch_loss = 0.0
    log_val_images = {"rgb": [], cfg.model.modality: []}
    # Pick 4 random batch indices whose predictions get logged as images.
    log_img_ids = random.sample(range(len(dataloader)), 4)
    progress_bar = tqdm(dataloader, desc=desc, total=len(dataloader), disable=not accelerator.is_main_process)
    for i, batch in enumerate(progress_bar):
        pred_cubemap = pager(batch, cfg.model.modality)
        if cfg.model.modality == "depth":
            # Depth normalisation constants depend on whether log-scale depth is used.
            min_depth = dataloader.dataset.LOG_MIN_DEPTH if cfg.model.log_scale else dataloader.dataset.MIN_DEPTH
            depth_range = dataloader.dataset.LOG_DEPTH_RANGE if cfg.model.log_scale else dataloader.dataset.DEPTH_RANGE
            loss = pager.calculate_depth_loss(batch, pred_cubemap, min_depth, depth_range, cfg.model.log_scale, cfg.model.metric_depth)
        elif cfg.model.modality == "normal":
            loss = pager.calculate_normal_loss(batch, pred_cubemap)

        # Average the batch loss across processes before accumulating.
        avg_loss = accelerator.reduce(loss["total_loss"].detach(), reduction="mean")
        if accelerator.is_main_process:
            progress_bar.set_postfix({"loss": avg_loss.item()})
        val_epoch_loss += avg_loss
        if i in log_img_ids:
            log_val_images["rgb"].append(prepare_image_for_logging(batch["rgb"][0].cpu().numpy()))
            if cfg.model.modality == "depth":
                result_image = pager.process_depth_output(pred_cubemap, orig_size=batch['depth'].shape[2:4], min_depth=min_depth,
                                                          depth_range=depth_range, log_scale=cfg.model.log_scale)[1].cpu().numpy()
            elif cfg.model.modality == "normal":
                result_image = pager.process_normal_output(pred_cubemap, orig_size=batch['normal'].shape[2:4]).cpu().numpy()
            log_val_images[cfg.model.modality].append(prepare_image_for_logging(result_image))

    val_epoch_loss = val_epoch_loss / len(dataloader)

    # NOTE(review): indentation reconstructed — image logging is assumed to be
    # main-process-only, matching the scalar log above; confirm against history.
    if accelerator.is_main_process:
        accelerator.log({x_axis_name: x_axis, f"{val_type}/loss": float(val_epoch_loss)}, step=global_step)

        # Tile the sampled predictions into one mosaic per modality.
        img_mix_rgb = log_images_mosaic(log_val_images["rgb"])
        img_mix_depth = log_images_mosaic(log_val_images[cfg.model.modality])

        if cfg.logging.report_to == "wandb":
            accelerator.log(
                {x_axis_name: x_axis, f"{val_type}/pred_panorama_rgb": wandb.Image(img_mix_rgb)},
                step=global_step,
            )
            accelerator.log(
                {x_axis_name: x_axis, f"{val_type}/pred_panorama_{cfg.model.modality}": wandb.Image(img_mix_depth)},
                step=global_step,
            )
        elif cfg.logging.report_to == "tensorboard":
            tb_writer = accelerator.get_tracker("tensorboard").writer
            tb_writer.add_image(
                f"{val_type}/pred_panorama_rgb",
                img_mix_rgb,
                global_step,
                dataformats="HWC",
            )
            tb_writer.add_image(
                f"{val_type}/pred_panorama_{cfg.model.modality}",
                img_mix_depth,
                global_step,
                dataformats="HWC",
            )

    if cfg.training.use_EMA:
        # Put the original (non-EMA) weights back for continued training.
        ema_unet.restore(pager.unwrapped_unet.parameters())
    return val_epoch_loss
164
+
165
+
166
def prepare_image_for_logging(image):
    """Min-max normalise an array into [0, 255] uint8 for image logging."""
    lo, hi = image.min(), image.max()
    scaled = (image - lo) / (hi - lo + 1e-8)
    return (scaled * 255).astype("uint8")
170
+
171
+
172
def log_images_mosaic(images):
    """Tile 1-4 CHW uint8 images into a single HWC Full-HD mosaic.

    Each image is converted to 3-channel HWC and resized to 1920x1080.
    Layouts: 1 -> single image; 2 -> side by side; 3 -> one centred on top,
    two below; 4 -> 2x2 grid.
    """
    n = len(images)
    assert 1 <= n <= 4, "Provide between 1 and 4 images (CHW uint8)."

    H, W, C = 1080, 1920, 3

    resized = []
    for img in images:
        assert img.dtype == np.uint8 and img.ndim == 3 and img.shape[0] in (1, 3), \
            "Each image must be uint8 with shape (C,H,W), C in {1,3}."
        if img.shape[0] == 1:
            img = np.repeat(img, 3, axis=0)  # grayscale -> 3 channels
        hwc = np.transpose(img, (1, 2, 0))
        resized.append(cv2.resize(hwc, (W, H), interpolation=cv2.INTER_LINEAR))

    if n == 1:
        return resized[0]

    if n == 2:
        mosaic = np.zeros((H, 2 * W, C), dtype=np.uint8)
        mosaic[:, :W] = resized[0]
        mosaic[:, W:] = resized[1]
        return mosaic

    mosaic = np.zeros((2 * H, 2 * W, C), dtype=np.uint8)
    if n == 3:
        # Centre the lone image on the top row.
        x_off = W // 2
        mosaic[:H, x_off:x_off + W] = resized[0]
        mosaic[H:, :W] = resized[1]
        mosaic[H:, W:] = resized[2]
    else:
        mosaic[:H, :W] = resized[0]
        mosaic[:H, W:] = resized[1]
        mosaic[H:, :W] = resized[2]
        mosaic[H:, W:] = resized[3]
    return mosaic
213
+
214
+