- .gitignore +2 -0
- LICENSE.txt +99 -0
- app.py +141 -0
- chord/__init__.py +14 -0
- chord/io.py +80 -0
- chord/module/__init__.py +19 -0
- chord/module/base.py +13 -0
- chord/module/chord.py +281 -0
- chord/module/light.py +96 -0
- chord/module/stable_diffusion.py +105 -0
- chord/util.py +67 -0
- config/chord.yaml +29 -0
- requirements.txt +8 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__
output
LICENSE.txt
ADDED
@@ -0,0 +1,99 @@
Ubisoft Machine Learning License (Research-Only - Copyleft)

This license governs the use, reproduction, and distribution of the Licensed Materials, including AI Models and associated source code, for the sole purpose of scientific research. By accessing, downloading or using the Licensed Materials, you hereby accept to be bound by this [Ubisoft Machine Learning License (Research-Only - Copyleft)] agreement (hereinafter the "License").

1. Licensed Materials

- AI Models
- Source Code

2. Definitions

"Licensed Materials": Refers to the AI Models and/or Source Code licensed under this agreement.
"Source Code" means the preferred form of the work for making modifications to it, corresponding to text written using a human-readable programming language.
"Object Code" means any non-source form of a work.
"AI Model" means any machine learning based assembly or assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Source Code.
"Output" means the results of operating an AI Model as embodied in informational content resulting therefrom.
"Derivative": Any work derived from or based upon the Licensed Materials, including modifications.
"Permitted Purpose": Use for academic or research purposes only. Commercial use is strictly prohibited.
"Distribution": Any sharing of the Licensed Materials or Derivatives with third parties, including hosting as a service.
"Licensor": The rights holder or authorized entity granting this License.
"You": The individual or entity receiving and exercising rights under this License.

3. Grant of Rights

Subject to compliance with the terms of this License, You are granted a worldwide, royalty-free, non-exclusive License to use, study, reproduce, modify, and distribute the Licensed Materials and Derivatives solely for the Permitted Purpose. As between You and Licensor, Licensor claims no rights in the Outputs You generate using the AI Models used in accordance with the Permitted Purpose.

4. Distribution of Licensed Materials and Derivatives

Any Distribution of the Derivatives of the Licensed Materials, or of the Licensed Materials themselves, shall be licensed under the exact same terms as this License. Redistribution shall include this License and retain all notices of author attribution, and all modifications shall be clearly marked.

5. Use Restrictions

You shall not use the Licensed Materials or their Derivatives for:
- any purposes other than the Permitted Purpose, including commercial purposes such as using the Licensed Materials in any activity intended for commercial advantage or monetary compensation, directly or indirectly;
- weaponry, warfare, military applications, surveillance, or any activity that may cause harm or violate human rights;
- engaging in or enabling fully automated decision-making that may adversely impact a natural person's legal rights;
- providing medical advice or making clinical decisions;
- generating content that promotes or incites hatred, violence, discrimination, or harm based on race, ethnicity, religion, gender, sexual orientation, or any other protected characteristic;
- generating content that includes depictions of sexual abuse, sexual violence, explicit pornography, or any form of non-consensual acts, and/or generating content that includes depictions of child nudity, child pornography, or any form of child exploitation.

6. Disclaimer of Warranty

THE LICENSED MATERIALS ARE PROVIDED "AS IS" AND "AS AVAILABLE" WITHOUT WARRANTIES OF ANY KIND, WHETHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, CORRECTNESS, ACCURACY, OR RELIABILITY. THE LICENSOR DISCLAIMS ALL LIABILITY FOR DAMAGES RESULTING FROM THE USE OR INABILITY TO USE THE LICENSED MATERIALS. THE USE OF THE LICENSED MATERIALS AND ANY OUTPUTS YOU MAY GENERATE SHALL BE AT YOUR OWN RISK.

7. Termination

This License terminates automatically if You violate any of its terms. Upon termination, You shall cease all use and distribution of the Licensed Materials and their Derivatives.

8. Governing Law

The validity of this Agreement and any of its terms and provisions, as well as the rights and duties of the parties hereunder, shall be governed, interpreted and enforced in accordance with the laws of France.

9. Miscellaneous

If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

Copyright (C) 2025 UBISOFT ENTERTAINMENT. All Rights Reserved.
app.py
ADDED
@@ -0,0 +1,141 @@
import gradio as gr
import os
import numpy as np
from PIL import Image
import torch
import copy
from omegaconf import OmegaConf
from torchvision.transforms import v2
from torchvision.transforms.functional import to_pil_image

from chord import ChordModel
from chord.module import make
from chord.util import get_positions, rgb_to_srgb

EXAMPLES_USECASE_1 = [
    [f"examples/generated/{f}"]
    for f in sorted(os.listdir("examples/generated"))
]
EXAMPLES_USECASE_2 = [
    [f"examples/in_the_wild/{f}"]
    for f in sorted(os.listdir("examples/in_the_wild"))
]
EXAMPLES_USECASE_3 = [
    [f"examples/specular/{f}"]
    for f in sorted(os.listdir("examples/specular"))
]

MODEL_OBJ = None

def load_model(ckpt_path):
    print("Loading model from:", ckpt_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = OmegaConf.load("config/chord.yaml")
    model = ChordModel(config)
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    model.load_state_dict(ckpt["state_dict"])
    model.eval()
    model.to(device)
    return model

def run_model(model, img: Image.Image):
    to_tensor = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])
    image = to_tensor(img).to(next(model.parameters()).device)
    x = v2.Resize(size=(1024, 1024), antialias=True)(image).unsqueeze(0)
    with torch.no_grad(), torch.autocast(device_type="cuda"):
        output = model(x)
    output.update({"input": image})
    return output

def relit(model, maps):
    maps['metallic'] = maps.get('metalness', torch.zeros_like(maps['basecolor']))
    device = next(model.parameters()).device
    h, w = maps["basecolor"].shape[-2:]
    light = make("point-light", {"position": [0, 0, 10]}).to(device)
    pos = get_positions(h, w, 10).to(device)
    camera = torch.tensor([0, 0, 10.0]).to(device)
    for key in maps:
        if maps[key].dim() == 3:
            maps[key] = maps[key].unsqueeze(0)
        maps[key] = maps[key].permute(0, 2, 3, 1)  # BxCxHxW -> BxHxWxC
    rgb = model.model.compute_render(maps, camera, pos, light).squeeze(0).permute(0, 3, 1, 2)  # GxBxHxWxC -> BxCxHxW
    return torch.clamp(rgb_to_srgb(rgb), 0, 1)

def inference(img, ckpt_path):
    global MODEL_OBJ

    if MODEL_OBJ is None or getattr(MODEL_OBJ, "_ckpt", None) != ckpt_path:
        MODEL_OBJ = load_model(ckpt_path)
        MODEL_OBJ._ckpt = ckpt_path  # store path inside object

    if img is None:
        return None, None, None, None, None

    ori_h, ori_w = img.size[1], img.size[0]
    out = run_model(MODEL_OBJ, img)
    maps = copy.deepcopy(out)
    rendered = relit(MODEL_OBJ, maps)
    resize_back = v2.Resize(size=(ori_h, ori_w), antialias=True)
    return (
        to_pil_image(resize_back(out["basecolor"]).squeeze(0)),
        to_pil_image(resize_back(out["normal"]).squeeze(0)),
        to_pil_image(resize_back(out["roughness"]).squeeze(0)),
        to_pil_image(resize_back(out["metalness"]).squeeze(0)),
        to_pil_image(resize_back(rendered).squeeze(0)),
    )

with gr.Blocks(title="Chord") as demo:

    gr.Markdown("# **Chord: Chain of Rendering Decomposition for PBR Material Estimation from Generated Texture images**")
    ckpt_path = gr.Textbox(
        label="Model Checkpoint Path",
        value="chord_v1.ckpt",
        placeholder="Path to your model checkpoint",
    )
    gr.Markdown("Upload an image or select an example to estimate PBR channels and render the result under custom lighting.")

    with gr.Row():
        with gr.Column():
            input_img = gr.Image(type="pil", label="Input Image", height=512)

            gr.Markdown("### Example Inputs — Generated Textures")
            gr.Examples(
                examples=EXAMPLES_USECASE_1,
                inputs=[input_img],
                label="Examples (Generated Textures)"
            )

            gr.Markdown("### Example Inputs — In The Wild Photographs")
            gr.Examples(
                examples=EXAMPLES_USECASE_2,
                inputs=[input_img],
                label="Examples (In The Wild Photographs)"
            )

            gr.Markdown("### Example Inputs — Specular Textures")
            gr.Examples(
                examples=EXAMPLES_USECASE_3,
                inputs=[input_img],
                label="Examples (Specular Textures)"
            )

            run_button = gr.Button("Run Estimation")

        with gr.Column():
            gr.Markdown("### Predicted Channels")
            basecolor_out = gr.Image(label="Basecolor", height=512)
            normal_out = gr.Image(label="Normal", height=512)
            roughness_out = gr.Image(label="Roughness", height=512)
            metallic_out = gr.Image(label="Metalness", height=512)

            gr.Markdown("### Relit Output")
            render_out = gr.Image(label="Relit Image (Centered Point Light)", height=512)

    run_button.click(
        inference,
        inputs=[input_img, ckpt_path],
        outputs=[basecolor_out, normal_out, roughness_out, metallic_out, render_out]
    )


if __name__ == "__main__":
    demo.launch()
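For reference, the pipeline can also be driven without the Gradio UI. A minimal sketch, assuming the checkpoint chord_v1.ckpt and the examples/ folders exist (the image filename below is hypothetical, and importing app builds the Blocks UI as a side effect):

from PIL import Image
from app import inference

img = Image.open("examples/generated/brick.png").convert("RGB")  # hypothetical filename
basecolor, normal, roughness, metalness, relit_img = inference(img, "chord_v1.ckpt")
relit_img.save("relit.png")  # each output is a PIL image at the input resolution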
chord/__init__.py
ADDED
@@ -0,0 +1,14 @@
import torch
import torch.nn as nn
from chord.module import make
from chord.module.chord import post_decoder

class ChordModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.model = make(config.model.name, config.model)

    def forward(self, x: torch.Tensor):
        x = {"render": x}
        pred = self.model(x)
        return post_decoder(pred)
chord/io.py
ADDED
@@ -0,0 +1,80 @@
import torch
import imageio.v3 as imageio
import numpy as np
import warnings
import os

import torchvision.transforms.functional as F

def read_image(filename: str, out: torch.Tensor = None) -> torch.Tensor:
    '''
    Read a local image file into a float tensor (pixel values normalized to [0, 1], CxHxW).

    Args:
        filename: Image file path.
        out: If provided, fill this tensor instead of returning a new one.

    Returns:
        Loaded image tensor.
    '''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # ignore PIL's warning that fp16 images are read as fp32
        img: np.ndarray = imageio.imread(filename)

    # Convert the image array to a float tensor according to its data type
    res = None
    if img.dtype == np.uint8:
        img = img.astype(np.float32) / 255.0
    elif img.dtype == np.uint16 or img.dtype == np.int32:
        img = img.astype(np.float32) / 65535.0
    else:
        raise ValueError(f'Unrecognized image pixel value type: {img.dtype}')
    if img.ndim == 2:
        res = torch.from_numpy(img).unsqueeze(0)  # 1xHxW for grayscale images
    elif img.ndim == 3:
        res = torch.from_numpy(img).movedim(2, 0)[:3]  # HxWxC to CxHxW
    else:
        raise ValueError(f'Unrecognized image dimension: {img.shape}')

    if out is None:
        return res
    out.copy_(res)

def create_img(img: torch.Tensor):
    '''
    Convert a tensor to a PIL image.

    Args:
        img: Image tensor CxHxW. Squeezed if BxCxHxW and B == 1.

    Returns:
        PIL image.
    '''
    if img.dim() == 4:
        assert img.shape[0] == 1
        img = img.squeeze(0)

    if img.shape[0] == 4:
        out_img = F.to_pil_image(img, mode="CMYK")
        out_img = out_img.convert('RGB')
    elif img.shape[0] == 3:
        out_img = F.to_pil_image(img, mode="RGB")
    elif img.shape[0] == 1:
        out_img = F.to_pil_image(img, mode="L")
    else:
        raise ValueError("Unsupported image dimension.")
    return out_img

def save_maps(path: str, maps: dict):
    '''
    Save SVBRDF maps to a given path.

    Args:
        path: Output path.
        maps: Named maps of tensor images.
    '''
    if not os.path.exists(path):
        os.makedirs(path)
    for name, image in maps.items():
        out_img = create_img(image)
        out_img.save(os.path.join(path, name + ".png"))
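A hypothetical round trip with the helpers above (the paths are illustrative, not files shipped with the repo):

from chord.io import read_image, save_maps

img = read_image("examples/generated/sample.png")  # hypothetical path; CxHxW float in [0, 1]
save_maps("output/demo", {"basecolor": img})       # writes output/demo/basecolor.png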
chord/module/__init__.py
ADDED
@@ -0,0 +1,19 @@
modules = {}

def register(name):
    def decorator(cls):
        modules[name] = cls
        return cls
    return decorator


def make(name, config):
    model = modules[name](config)
    return model


from . import (
    light,
    stable_diffusion,
    chord,
)
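This registry is how every component in the repo is constructed by name. A small sketch of the pattern (the "toy-light" name and ToyLight class are made up for illustration):

from chord.module import register, make

@register("toy-light")  # hypothetical name, for illustration only
class ToyLight:
    def __init__(self, config):
        self.config = config

light = make("toy-light", {"power": 1.0})  # looks up the class in `modules` and instantiates it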
chord/module/base.py
ADDED
@@ -0,0 +1,13 @@
import torch
import torch.nn as nn

class Base(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.setup()

    def setup(self):
        raise NotImplementedError
chord/module/chord.py
ADDED
@@ -0,0 +1,281 @@
import copy
import torch
from torch import nn
import torch.nn.functional as Fn
from torchvision.transforms import v2

from . import register, make
from .base import Base

from chord.util import fresnelSchlick, GeometrySchlickGGX, DistributionGGX
from chord.util import srgb_to_rgb, tone_gamma, get_positions, safe_01_div

class dummy_module(nn.Module):
    def forward(self, x): return x

def post_decoder(out_dict):
    out = {}
    for key in out_dict.keys():
        if key.startswith("approx"): continue
        elif key == "normal":
            out[key] = Fn.normalize(2. * out_dict[key] - 1., dim=1) / 2. + 0.5
        elif key == "rou_met":
            out['roughness'], out['metalness'] = out_dict['rou_met'][:,0], out_dict['rou_met'][:,1]
        else: out[key] = out_dict[key]
    return out

def process_irradiance(radiance, kernel_size=25, res=64):
    """
    Process the irradiance using PyTorch, equivalent to the original OpenCV-based function.

    Args:
        radiance (torch.Tensor): Input radiance tensor (B, 1, H, W).
        kernel_size (int): Size of the kernel for the median blur.
        res (int): Target resolution for resizing the image.

    Returns:
        torch.Tensor: Processed radiance tensor (res, res).
    """
    # Ensure the input radiance is a 4D tensor (B, 1, H, W)
    assert radiance.shape[1] == 1 and radiance.dim() == 4, f"Invalid radiance shape, got {radiance.shape}"
    # Resize to low resolution
    resizer = v2.Resize(size=res, antialias=True)
    radiance = resizer(radiance)

    # Define an 11x11 averaging kernel
    kernel = torch.ones((1, 1, 11, 11), dtype=torch.float32).to(radiance) / 121.0
    # Apply convolution (averaging filter); reflect-pad first so dimensions are preserved
    radiance = Fn.pad(radiance, (5,)*4, mode="reflect")
    radiance = Fn.conv2d(radiance, kernel, padding=0)

    # Clamp values and scale to [0, 255] for median filtering
    radiance = torch.clamp(radiance * 255, 0, 255)

    # Apply median filtering
    paded_radiance = Fn.pad(radiance, (kernel_size // 2,) * 4, mode="reflect")  # Pad for edge handling
    unfolded = Fn.unfold(paded_radiance, kernel_size)  # Extract patches
    radiance = torch.median(unfolded, dim=1).values.view(radiance.shape)  # Median of patches

    # Normalize to [0, 1]
    rad_min, rad_max = radiance.amin([2,3], keepdim=True), radiance.amax([2,3], keepdim=True)
    radiance = (radiance - rad_min) / (rad_max - rad_min)
    return radiance

def opt_light_dir(_radiance, _num_samples=6):
    '''
    _radiance: (bs, 1, h, w)
    '''
    assert _radiance.shape[1] == 1 and _radiance.dim() == 4
    bs, _, h, w = _radiance.shape

    def evenly_sample(_num_samples, min=0, max=2*torch.pi):
        # returns a tensor of _num_samples+1 evenly spaced angles in [min, max]
        return torch.tensor(range(_num_samples+1)) * (max - min) / _num_samples + min

    def compute_radiance_diff(angles):
        num = angles.shape[-1]
        dirs = torch.cat([torch.cos(angles), torch.sin(angles)]).T
        pos_dir = grid_pos.repeat(num, 1, 1, 1)
        pos_mask = torch.einsum("abcd,ad->abc", pos_dir, dirs) > 0
        neg_mask = torch.einsum("abcd,ad->abc", pos_dir, dirs) < 0
        samples_radiance = _radiance.repeat(1,num,1,1)
        radiance_diff = (samples_radiance*pos_mask[None] - samples_radiance*neg_mask[None]).sum([2,3])
        return radiance_diff

    angle_min, angle_max = 0, 2*torch.pi
    grid_pos = Fn.normalize(get_positions(h,w,10)[...,:2], dim=-1, eps=1e-6).to(_radiance)
    while (angle_max - angle_min) > (torch.pi/90):
        angles = evenly_sample(_num_samples, angle_min, angle_max)[None].to(_radiance)
        diffs = compute_radiance_diff(angles).mean(0)
        angle_min = angles[:,diffs.argmax()].item() - (angle_max - angle_min)/_num_samples
        angle_max = angles[:,diffs.argmax()].item() + (angle_max - angle_min)/_num_samples

    light_angle = angles[:, diffs.argmax()]
    return torch.tensor([torch.cos(light_angle), torch.sin(light_angle)]).to(_radiance)


def find_light_dir(raw_irradiance, light):
    raw_irradiance = v2.functional.rgb_to_grayscale(raw_irradiance)
    irradiance = process_irradiance(raw_irradiance)
    dir = opt_light_dir(irradiance)
    dir = torch.cat([dir, torch.tensor([0.5**0.5]).to(dir)])
    _light = copy.deepcopy(light)
    _light.direction = dir
    return _light

@register("chord")
class Chord(Base):
    def setup(self):
        # Define forward chain
        self.chain_type = self.config.get("chain_type", "chord")
        self.chain = self.config.get("chain_library", {})[self.chain_type]
        self.prompts = self.config.get("rgbx_prompts", {})
        self.roughness_step = self.config.get("roughness_step", 10)
        self.metallic_step = self.config.get("metallic_step", 0.2)

        self.sd = make(self.config.stable_diffusion.name, self.config.stable_diffusion)
        self.dtype = self.sd.dtype
        self.device = self.sd.device

        # LEGO-conditioning
        self.sd.unet.ConvIns = nn.ModuleDict()
        self.sd.unet.ConvOuts = nn.ModuleDict()
        self.sd.unet.FirstDownBlocks = nn.ModuleDict()
        self.sd.unet.LastUpBlocks = nn.ModuleDict()
        for key in list(set("_".join(self.chain.values()).split("_"))) + ["noise"]:
            if "0" in key or "1" in key: continue  # skip the gt/pred id tokens like "0" / "01"
            self.sd.unet.ConvIns[key] = nn.Conv2d(4, 320, 3, 1, 1, device=self.device, dtype=self.dtype)
            self.sd.unet.ConvIns[key].load_state_dict(self.sd.unet.conv_in.state_dict())
        for kout in list(set(self.chain.keys())):
            self.sd.unet.ConvOuts[kout] = nn.Conv2d(320, 4, 3, 1, 1, device=self.device, dtype=self.dtype)
            self.sd.unet.ConvOuts[kout].load_state_dict(self.sd.unet.conv_out.state_dict())
            self.sd.unet.LastUpBlocks[kout] = copy.deepcopy(self.sd.unet.up_blocks[-1]).to(self.device)
            self.sd.unet.FirstDownBlocks[kout] = copy.deepcopy(self.sd.unet.down_blocks[0]).to(self.device)
        self.sd.unet.ConvIns.train()
        self.sd.unet.ConvOuts.train()
        self.sd.unet.FirstDownBlocks.train()
        self.sd.unet.LastUpBlocks.train()
        self.sd.unet.conv_in = dummy_module()
        self.sd.unet.conv_out = dummy_module()

        # Load Lights
        if self.config.get("prior_light", None) is None:
            self.prior_light = make("point-light", {"position": [0, 0, 10]})
        else:
            self.prior_light = make(self.config.prior_light.name, self.config.prior_light)

        # Init Embeddings
        self.text_emb = {}

    # Eq.3
    def compute_approxIrr(self, render, basecolor):
        approxIrr = safe_01_div.apply(srgb_to_rgb(render), srgb_to_rgb(basecolor))
        return tone_gamma(approxIrr)

    # Eq.6
    @torch.no_grad()
    def compute_approxRouMet(self, render, maps, seperate=False, light=None):
        render = srgb_to_rgb(render)
        bs, _, h, w = render.shape
        light = find_light_dir(maps['approxIrr'], self.prior_light) if light is None else light
        # light.direction = estimate_light_dir(render, maps)
        pos = get_positions(h, w, 10).to(self.device)
        cameras = torch.tensor([0, 0, 10.0]).to(self.device)

        # sample grid
        r_samples = torch.arange(25, 225+self.roughness_step, self.roughness_step) / 255
        m_samples = torch.arange(0., 1.+self.metallic_step, self.metallic_step)

        grid_maps = {}  # change map size into: gs, bs, h, w, c
        grid_maps['basecolor'] = maps['basecolor'][None].permute(0,1,3,4,2)
        grid_maps['normal'] = maps['normal'][None].permute(0,1,3,4,2)
        r_values = r_samples[:,None].repeat(1,len(m_samples)).reshape(-1,1,1,1,1).to(maps['basecolor'])
        m_values = m_samples[None].repeat(len(r_samples),1).reshape(-1,1,1,1,1).to(maps['basecolor'])
        # split into chunks to avoid OOM
        chunk_size = 25
        rgb_list, r_list, m_list = [], [], []
        for _r, _m in zip(torch.split(r_values, chunk_size), torch.split(m_values, chunk_size)):
            grid_maps['roughness'], grid_maps['metallic'] = _r, _m
            _rgb = self.compute_render(grid_maps, cameras, pos, light)
            loss = (render[None].permute(0,1,3,4,2) - _rgb).abs().sum(-1,keepdim=True)
            min_idx = loss.argmin(dim=0,keepdim=True)
            r_list.append(torch.gather(grid_maps['roughness'].flatten(), 0, min_idx.flatten()).reshape(min_idx.shape))
            m_list.append(torch.gather(grid_maps['metallic'].flatten(), 0, min_idx.flatten()).reshape(min_idx.shape))
            rgb_list.append(torch.gather(_rgb, 0, min_idx.repeat(1,1,1,1,3)))
        rgb = torch.cat(rgb_list).permute(0,1,4,2,3)
        roughness = torch.cat(r_list).permute(0,1,4,2,3)
        metallic = torch.cat(m_list).permute(0,1,4,2,3)
        loss = (render[None] - rgb).abs().sum(2,keepdim=True)
        roughness = torch.gather(roughness, 0, loss.argmin(dim=0,keepdim=True))[0]
        metallic = torch.gather(metallic, 0, loss.argmin(dim=0,keepdim=True))[0]
        torch.cuda.empty_cache()
        if seperate:
            return roughness, metallic
        else:
            out = torch.cat([roughness, metallic, torch.zeros_like(roughness)], dim=1)
            return out


    @torch.no_grad()
    def compute_render(self, maps, camera_position, pos, light):
        '''
        maps: gs, bs, h, w, c (gs: the number of grids)
        '''
        def cos(x, y):
            return torch.clamp((x*y).sum(-1, keepdim=True), min=0, max=1)

        # pre-process
        albedo = srgb_to_rgb(maps['basecolor'])
        normal = maps['normal'].clone()
        normal[..., :2] = normal[..., [1,0]]
        N = Fn.normalize((normal - 0.5) * 2.0, dim=-1, eps=1e-6)
        roughness = maps['roughness']
        metallic = maps['metallic']
        V = Fn.normalize(camera_position - pos, dim=-1, eps=1e-6).repeat(1,1,1,1,1).to(self.device)
        irradiance, L = light(pos)
        irradiance, L = irradiance.repeat(1,1,1,1,1).to(self.device), L.repeat(1,1,1,1,1).to(self.device)
        # rendering
        H = Fn.normalize(L+V, dim=-1, eps=1e-6)
        f0 = torch.ones_like(albedo).to(self.device) * 0.04
        F0 = torch.lerp(f0, albedo, metallic)
        F = fresnelSchlick(cos(H,V), F0)
        ks = F

        diffuse = (1-ks) * albedo / torch.pi
        diffuse *= 1-metallic

        NDF = DistributionGGX(cos(N,H), roughness)
        G = GeometrySchlickGGX(cos(N,L), roughness) * GeometrySchlickGGX(cos(N,V), roughness)

        numerator = NDF * G * F
        denominator = 4.0 * cos(N,V) * cos(N,L) + 1e-3
        specular = numerator / denominator
        ambient = 0.3 * albedo

        rgb = (diffuse + specular) * irradiance * cos(N,L) + ambient

        return rgb

    def forward(self, maps: dict):
        # prepare
        bs = maps['render'].shape[0]
        self.sd.scheduler.set_timesteps(1)
        t = self.sd.scheduler.timesteps[0]
        # chain processing
        pred, pred_latent, arxiv_latent = {}, {}, {}
        for kout, info in self.chain.items():
            info = info.split("_")
            keys, ids = info[:-1], info[-1]
            # Swap active LEGO blocks
            self.sd.unet.down_blocks[0] = self.sd.unet.FirstDownBlocks[kout]
            self.sd.unet.up_blocks[-1] = self.sd.unet.LastUpBlocks[kout]
            # Eq.2: combine input latents (averaged over the inputs below)
            in_latent = 0
            for k, i in zip(keys, ids):
                if i == "0":
                    if not k in arxiv_latent.keys(): arxiv_latent[k] = self.sd.encode_imgs_deterministic(maps[k])
                    zx = arxiv_latent[k]
                else:
                    zx = pred_latent[k]
                in_latent += self.sd.unet.ConvIns[k](zx)
            in_latent = in_latent / len(keys)
            # single-step denoising
            embs = self.produce_embeddings(kout, bs)
            out_latent = self.sd.unet(in_latent, t, **embs)[0]
            out_latent = self.sd.unet.ConvOuts[kout](out_latent)
            pred_latent[kout] = self.sd.scheduler.step(out_latent, t, torch.zeros_like(zx)).pred_original_sample
            pred[kout] = self.sd.decode_latents(pred_latent[kout]).float()
            # compute intermediate representations
            if self.chain_type in ["chord"] and kout == "basecolor":
                pred['approxIrr'] = self.compute_approxIrr(maps['render'], pred['basecolor'])
                pred_latent['approxIrr'] = self.sd.encode_imgs_deterministic(pred['approxIrr'])
            if self.chain_type in ["chord"] and kout == "normal":
                pred['approxRM'] = self.compute_approxRouMet(maps['render'], pred, seperate=False)
                pred_latent['approxRM'] = self.sd.encode_imgs_deterministic(pred['approxRM'])

        return pred

    @torch.no_grad()
    def produce_embeddings(self, key, batch_size):
        if key not in self.text_emb.keys():
            self.text_emb[key] = self.sd.encode_text(self.prompts[key], "max_length")
        prompt_emb = self.text_emb[key].expand(batch_size, -1, -1)
        return { "encoder_hidden_states": prompt_emb }
chord/module/light.py
ADDED
@@ -0,0 +1,96 @@
import torch
from typing import Optional
import torch.nn.functional as Fn
import math
import copy

from . import register
from .base import Base

class BaseLight(Base):
    """
    Base class for light models.
    """

    def setup(self):
        pass

    def forward(self, x: Optional[torch.Tensor] = None):
        """
        Get the light intensity.

        Args:
            x: positions of shape (..., 3).

        Returns:
            color: radiance intensity of shape (..., 3).
            d: directions of shape (..., 3).
        """
        raise NotImplementedError


@register("point-light")
class PointLight(BaseLight):
    """Point light definitions."""

    def setup(self):
        """Initialize the point light.

        Config args:
            position (float, float, float): World coordinate of the light.
            color (float, float, float): Light color in (R, G, B).
            power (float): Light power; it is multiplied directly into each color channel.
        """
        position = self.config.get("position", [0., 0., 10.])
        color = self.config.get("color", [23.47, 21.31, 20.79])
        power = self.config.get("power", 10.)

        self.register_buffer("position", torch.tensor(position))
        self.register_buffer("color", torch.tensor(color) * power)

    def forward(self, x: Optional[torch.Tensor] = None):
        """Compute light radiance and direction.

        Args:
            x: World coordinate of the interacting surface. [B, H, W, 3]

        Returns:
            radiance: radiance intensity of shape [B, H, W, 3].
            direction: directions of shape [B, H, W, 3], V = (light_pos - world_pos).
        """
        distance = torch.norm(self.position - x, dim=-1, keepdim=True)
        attenuation = 1.0 / (distance ** 2)
        radiance = self.color * attenuation
        direction = Fn.normalize(self.position - x, dim=-1)
        return radiance, direction

@register("distant-light")
class DistantLight(BaseLight):
    """Distant light definitions."""

    def setup(self):
        """Initialize the distant light.

        Config args:
            direction (float, float, float): The direction of the light vector.
            color (float, float, float): Light color in (R, G, B).
            power (float): Light power; it is multiplied directly into each color channel.
        """
        direction = self.config.get("direction", [0., 0., 1.])
        color = self.config.get("color", [23.47, 21.31, 20.79])
        power = self.config.get("power", 0.1)

        self.register_buffer("color", torch.tensor(color) * power)
        self.register_buffer("direction", Fn.normalize(torch.tensor(direction), dim=0))

    def forward(self, x: Optional[torch.Tensor] = None):
        """Compute light radiance and direction.

        Args:
            x: World coordinate of the interacting surface. [B, H, W, 3]

        Returns:
            radiance: radiance intensity of shape [B, H, W, 3].
            direction: directions of shape [B, H, W, 3].
        """
        radiance = self.color.repeat(*x.shape[:-1], 1)
        direction = self.direction.repeat(*x.shape[:-1], 1)
        return radiance, direction
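As a quick sanity check on the inverse-square attenuation in PointLight, a minimal sketch (the point coordinates are illustrative):

import torch
from chord.module import make

light = make("point-light", {"position": [0., 0., 10.], "power": 10.})
x = torch.tensor([[0., 0., 0.], [0., 0., 5.]])  # surface points at distance 10 and 5
radiance, direction = light(x)
# radiance[1] is 4x radiance[0]: halving the distance quadruples the intensity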
chord/module/stable_diffusion.py
ADDED
@@ -0,0 +1,105 @@
import torch
from torchvision.transforms import v2

from diffusers import UNet2DConditionModel, AutoencoderKL, DDIMScheduler
from transformers import CLIPTextModel, CLIPTextConfig, CLIPTokenizer

from . import register
from .base import Base


def apply_padding(model, mode):
    for layer in [layer for _, layer in model.named_modules() if isinstance(layer, torch.nn.Conv2d)]:
        if mode == 'circular':
            layer.padding_mode = 'circular'
        else:
            layer.padding_mode = 'zeros'
    return model

def freeze(model):
    model = model.eval()
    for param in model.parameters():
        param.requires_grad = False
    return model

@register("stable_diffusion")
class StableDiffusion(Base):
    def setup(self):
        hf_key = self.config.get("hf_key", None)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        fp16 = self.config.get("fp16", True)
        self.dtype = torch.bfloat16 if fp16 else torch.float32  # note: the fp16 flag actually selects bfloat16
        vae_padding = self.config.get("vae_padding", "zeros")

        self.sd_version = self.config.get("version", 2.1)
        local_files_only = False
        if hf_key is not None:
            print(f"[INFO] using hugging face custom model key: {hf_key}")
            model_key = hf_key
            local_files_only = True
        elif str(self.sd_version) == "2.1":
            # model_key = "stabilityai/stable-diffusion-2-1"
            # StabilityAI deleted the original 2.1 model from HF, use a community version
            model_key = "RedbeardNZ/stable-diffusion-2-1-base"
        else:
            raise ValueError(
                f"Stable-diffusion version {self.sd_version} not supported."
            )

        # Load components separately to avoid downloading unnecessary weights
        # 1. UNet (diffusion backbone)
        unet_config = UNet2DConditionModel.load_config(model_key, subfolder="unet")
        self.unet = UNet2DConditionModel.from_config(unet_config, local_files_only=local_files_only)
        self.unet.to(self.device, dtype=self.dtype).eval()
        # 2. VAE (image autoencoder)
        vae_config = AutoencoderKL.load_config(model_key, subfolder="vae")
        self.vae = AutoencoderKL.from_config(vae_config, local_files_only=local_files_only)
        self.vae.to(self.device, dtype=self.dtype).eval()
        self.vae = apply_padding(freeze(self.vae), vae_padding)
        # 3. Text encoder (CLIP)
        text_encoder_config = CLIPTextConfig.from_pretrained(model_key, subfolder="text_encoder", local_files_only=local_files_only)
        self.text_encoder = CLIPTextModel(text_encoder_config)
        self.text_encoder.to(self.device, dtype=self.dtype).eval()
        # 4. Tokenizer (CLIP tokenizer; this one has a vocab, so from_pretrained is needed)
        self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer", local_files_only=local_files_only)
        # 5. Scheduler
        scheduler_config = DDIMScheduler.load_config(model_key, subfolder="scheduler")
        scheduler_config["prediction_type"] = "v_prediction"
        scheduler_config["timestep_spacing"] = "trailing"
        scheduler_config["rescale_betas_zero_snr"] = True
        self.scheduler = DDIMScheduler.from_config(scheduler_config)

    def encode_text(self, prompt, padding_mode="do_not_pad"):
        # prompt: [str]
        inputs = self.tokenizer(
            prompt,
            padding=padding_mode,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        )
        embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0]
        return embeddings

    def decode_latents(self, latents):
        latents = 1 / self.vae.config.scaling_factor * latents
        imgs = self.vae.decode(latents).sample
        imgs = (imgs / 2 + 0.5).clamp(0, 1)
        return imgs

    def encode_imgs(self, imgs):
        if imgs.shape[1] == 1:  # for grayscale maps
            imgs = v2.functional.grayscale_to_rgb(imgs)
        imgs = 2 * imgs - 1
        posterior = self.vae.encode(imgs).latent_dist
        latents = posterior.sample() * self.vae.config.scaling_factor
        return latents

    def encode_imgs_deterministic(self, imgs):
        if imgs.shape[1] == 1:  # for grayscale maps
            imgs = v2.functional.grayscale_to_rgb(imgs)
        imgs = 2 * imgs - 1
        h = self.vae.encoder(imgs)
        moments = self.vae.quant_conv(h)
        mean, logvar = torch.chunk(moments, 2, dim=1)
        latents = mean * self.vae.config.scaling_factor
        return latents
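The two image encoders above differ only in how the VAE posterior is handled. A sketch of the contrast (this constructs the model, which downloads the SD 2.1 components on first run; the config values mirror config/chord.yaml):

import torch
from omegaconf import OmegaConf
from chord.module import make

cfg = OmegaConf.create({"name": "stable_diffusion", "fp16": True, "vae_padding": "zeros", "version": 2.1})
sd = make("stable_diffusion", cfg)

img = torch.rand(1, 3, 512, 512, device=sd.device, dtype=sd.dtype)
z_sampled = sd.encode_imgs(img)               # posterior.sample(): stochastic, varies per run
z_mean = sd.encode_imgs_deterministic(img)    # posterior mean: reproducible; this is what Chord.forward uses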
chord/util.py
ADDED
@@ -0,0 +1,67 @@
import torch

def vector_dot(A: torch.Tensor, B: torch.Tensor, min=0.0) -> torch.Tensor:
    return torch.clamp((A * B).sum(1, keepdim=True), min=min, max=1.0)

def srgb_to_rgb(f: torch.Tensor) -> torch.Tensor:
    return torch.where(f <= 0.04045, f / 12.92, torch.pow((torch.clamp(f, 0.04045) + 0.055) / 1.055, 2.4)).to(f.dtype)

def rgb_to_srgb(f: torch.Tensor) -> torch.Tensor:
    return torch.where(f <= 0.0031308, f * 12.92, torch.pow(torch.clamp(f, 0.0031308), 1.0/2.4)*1.055 - 0.055).to(f.dtype)

def tone_gamma(x: torch.Tensor) -> torch.Tensor:
    x = 1 - torch.exp(-x)
    return torch.pow(x, 1.0/2.2)

# safe division for value range 0-1
class safe_01_div(torch.autograd.Function):
    @staticmethod
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)
        return torch.div(a, torch.clamp(b, min=1e-4, max=1.0))

    @staticmethod
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        grad_input = grad_output.clone()

        return torch.div(1, torch.clamp(b, min=1e-4, max=1.0)) * grad_input, -1 * torch.div(a, torch.clamp(b, min=1e-2, max=1.0)**2) * grad_input


def get_positions(h, w, real_size, use_pixel_centers=True) -> torch.Tensor:
    pixel_center = 0.5 if use_pixel_centers else 0
    i, j = torch.meshgrid(
        torch.arange(h) + pixel_center,
        torch.arange(w) + pixel_center,
        indexing='ij'
    )
    if not isinstance(real_size, list):
        real_size = [real_size] * 2
    pos = torch.stack([(i / h - 0.5) * real_size[0], (j / w - 0.5) * real_size[1], torch.zeros_like(i)], dim=-1)
    return pos

# N, H: (Bx3xHxW), roughness: (Bx1xHxW)
# The "D": microfacet distribution function in the Cook-Torrance model
def DistributionGGX(cosNH, roughness):
    a = roughness * roughness
    a2 = a * a
    cosNH2 = cosNH * cosNH
    num = a2
    denom = cosNH2 * (a2 - 1.0) + 1.0
    denom = torch.pi * denom * denom
    return num / denom

# NdotV, roughness: (Bx1xHxW)
# The "G": Schlick-GGX geometry term
def GeometrySchlickGGX(NdotV: torch.Tensor, roughness: torch.Tensor) -> torch.Tensor:
    r = (roughness + 1.0)
    k = (r*r) / 8.0

    num = NdotV
    denom = NdotV * (1.0 - k) + k

    return num / denom

# cosTheta, F0 (Bx1xHxW)
# The "F": Fresnel term (Schlick approximation)
def fresnelSchlick(cosTheta: torch.Tensor, F0: torch.Tensor) -> torch.Tensor:
    return F0 + (1.0 - F0) * torch.pow(1.0 - cosTheta, 5.0)
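For intuition about the Cook-Torrance terms above, a small numeric sanity check at normal incidence (all cosines equal to 1; the roughness and F0 values are illustrative):

import torch
from chord.util import DistributionGGX, GeometrySchlickGGX, fresnelSchlick

cos_one = torch.ones(1, 1, 1, 1)
roughness = torch.full((1, 1, 1, 1), 0.5)
F0 = torch.full((1, 1, 1, 1), 0.04)              # base reflectance of a dielectric

D = DistributionGGX(cos_one, roughness)           # GGX normal distribution term
G = GeometrySchlickGGX(cos_one, roughness) ** 2   # Smith: product of view and light terms
F = fresnelSchlick(cos_one, F0)                   # equals F0 exactly when cosTheta = 1
specular = D * G * F / (4.0 * 1.0 * 1.0 + 1e-3)   # same combination used in Chord.compute_render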
config/chord.yaml
ADDED
@@ -0,0 +1,29 @@
model:
  name: chord
  roughness_step: 5.
  metallic_step: 1.
  # format: "OutputMapName": ConvInInput1_ConvInInput2_{0/1}
  # 0/1 stands for using gt/pred image;
  chain_type: chord
  chain_library:
    chord:
      basecolor: render_0
      normal: render_approxIrr_01
      rou_met: render_approxRM_01
  rgbx_prompts:
    basecolor: Basecolor
    normal: Normal
    roughness: Roughness
    metallic: Metallic
    irradiance: Irradiance
    rou_met: Roughness and Metallic
  prior_light:
    name: distant-light
    direction: [-1.0, -1.0, 1.0] # Top-left corner towards bottom right
    color: [23.47, 21.31, 20.79]
    power: 0.1
  stable_diffusion:
    name: stable_diffusion
    fp16: true
    vae_padding: circular
    version: 2.1
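The chain entries above are parsed by Chord.forward: each value lists the ConvIn inputs joined by underscores, and the trailing digits say, per input, whether it comes from the ground-truth/input map (0) or a previous prediction (1). A sketch of the parse for the "normal" entry:

info = "render_approxIrr_01".split("_")  # value of chain_library.chord.normal
keys, ids = info[:-1], info[-1]          # keys = ["render", "approxIrr"], ids = "01"
for k, i in zip(keys, ids):
    source = "input map" if i == "0" else "previous prediction"
    print(f"{k}: encoded from the {source}")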
requirements.txt
ADDED
@@ -0,0 +1,8 @@
huggingface_hub
diffusers
transformers
typer
omegaconf
imageio
tqdm
gradio