3ZadeSSG committed
Commit ff00a24 · 1 Parent(s): 0032477

initial commit
README.md CHANGED
@@ -1,14 +1,30 @@
- ---
- title: RT MPINet
- emoji: 👀
- colorFrom: red
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.41.1
- app_file: app.py
- pinned: false
- license: gpl-2.0
- short_description: Multiplane Image Network for Real-Time View Synthesis
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+ <a href="#"><img src='https://img.shields.io/badge/-Paper-00629B?style=flat&logo=ieee&logoColor=white' alt='arXiv'></a>
+ <a href='https://realistic3d-miun.github.io/Research/RT_MPINet/index.html'><img src='https://img.shields.io/badge/Project_Page-Website-green?logo=googlechrome&logoColor=white' alt='Project Page'></a>
+ <a href='https://huggingface.co/spaces/3ZadeSSG/RT-MPINet'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo_(RT_MPINet)-blue'></a>
+ </div>
+
+ # RT-MPINet
+ #### Real-Time View Synthesis with Multiplane Image Network using Multimodal Supervision (RT-MPINet)
+
+ We present a real-time multiplane image (MPI) network. Unlike existing MPI-based approaches, which often rely on a separate depth estimation network to guide the estimation of MPI parameters, our method predicts these parameters directly from a single RGB image. To guide the network, we use a multimodal training strategy with joint supervision from view synthesis and depth estimation losses. More details can be found in the paper.
+
+ **Please head to the [Project Page](https://realistic3d-miun.github.io/Research/RT_MPINet/index.html) for supplementary materials and the full code.**
+
+ ## Acknowledgements
+ - We thank the authors of [AdaMPI](https://github.com/yxuhan/AdaMPI) for their implementation of the homography renderer, which is used in this codebase under the `./utils` directory.
+ - We thank the author of the [Deepview renderer](https://github.com/Findeton/deepview) template, which was used in our project page.
+
+ ## Citation
+ If you use our work, please use the following citation:
+ ```
+ @inproceedings{gond2025rtmpi,
+   title={Real-Time View Synthesis with Multiplane Image Network using Multimodal Supervision},
+   author={Gond, Manu and Shamshirgarha, Mohammadreza and Zerman, Emin and Knorr, Sebastian and Sj{\"o}str{\"o}m, M{\aa}rten},
+   booktitle={2025 IEEE 27th International Workshop on Multimedia Signal Processing (MMSP)},
+   pages={},
+   year={2025},
+   organization={IEEE}
+ }
+ ```
+
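The snippet below is a minimal offline sketch of how the pieces in this commit fit together: it loads the Medium model and renders a single novel view, mirroring the calls made in `app.py` further down. It assumes you run it from the repo root with the checkpoints in place; `input.png` is a hypothetical example image.

```python
import torch
import torchvision.transforms as transforms
from PIL import Image

import helperFunctions as helper
import parameters as params
from model_Medium import MMPI
from utils.mpi.homography_sampler import HomographySample
from utils.utils import render_novel_view

H = W = 384
model = MMPI(total_image_input=params.params_number_input, height=H, width=W)
model = helper.load_Checkpoint("./checkpoint/checkpoint_RT_MPI_Medium.pth", model, load_cpu=True)
model.eval()

img = Image.open("input.png").convert("RGB")  # hypothetical input image
x = transforms.Compose([transforms.Resize((H, W)), transforms.ToTensor()])(img).unsqueeze(0)

grid = params.get_disparity_all_src().unsqueeze(0)   # per-plane disparities
k = torch.tensor([[0.58, 0.0, 0.5], [0.0, 0.58, 0.5], [0.0, 0.0, 1.0]])
k[0, :] *= H
k[1, :] *= W            # scaling as in app.py (all demo resolutions are square)
k = k.unsqueeze(0)
pose = torch.eye(4).unsqueeze(0)
pose[0, 0, 3] = 0.03    # small lateral camera offset

with torch.no_grad():
    rgb_layers, sigma_layers = model.get_layers(x, height=H, width=W)
    view = render_novel_view(rgb_layers, sigma_layers, grid, pose,
                             torch.inverse(k), k, HomographySample(H, W, "cpu"))
```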
app.py ADDED
@@ -0,0 +1,188 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import cv2
+ import tempfile
+ from PIL import Image
+ import torchvision.transforms as transforms
+ import matplotlib.pyplot as plt
+ from model_Small import MMPI as MMPI_S
+ from model_Medium import MMPI as MMPI_M
+ from model_Large import MMPI as MMPI_L
+ import helperFunctions as helper
+ import parameters as params
+ from utils.mpi.homography_sampler import HomographySample
+ from utils.utils import (
+     render_novel_view,
+ )
+
+ # Checkpoint locations for all models
+ MODEL_S_LOCATION = "./checkpoint/checkpoint_RT_MPI_Small.pth"
+ MODEL_M_LOCATION = "./checkpoint/checkpoint_RT_MPI_Medium.pth"
+ MODEL_L_LOCATION = "./checkpoint/checkpoint_RT_MPI_Large.pth"
+
+ DEVICE = "cpu"
+
+ def getPositionVector(x, y, z, pose):
+     # Write the translation column of the 4x4 camera pose in place
+     pose[0, 0, 3] = x
+     pose[0, 1, 3] = y
+     pose[0, 2, 3] = z
+     return pose
+
+ def generateCircularTrajectory(radius, num_frames):
+     angles = np.linspace(0, 2 * np.pi, num_frames, endpoint=False)
+     return [[radius * np.cos(angle), radius * np.sin(angle), 0] for angle in angles]
+
+ def generateWiggleTrajectory(radius, num_frames):
+     angles = np.linspace(0, 2 * np.pi, num_frames, endpoint=False)
+     return [[radius * np.cos(angle), 0, radius * np.sin(angle)] for angle in angles]
+
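+ # Both trajectories trace one full period over num_frames poses: "Circle"
+ # orbits in the camera x-y plane, while the wiggle ("Swing") trajectory moves
+ # in x-z, giving a back-and-forth parallax sweep.
+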
+ def create_video_from_memory(frames, fps=60):
+     if not frames:
+         return None
+     height, width, _ = frames[0].shape
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+     out = cv2.VideoWriter(temp_video.name, fourcc, fps, (width, height))
+     for frame in frames:
+         out.write(frame)
+     out.release()
+     return temp_video.name
+
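+ # Note: cv2.VideoWriter expects BGR uint8 frames, which is why process_image
+ # converts rendered RGB frames with cv2.cvtColor before collecting them.
+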
+ def process_image(img, video_type, radius, num_frames, num_loops, model_type, resolution):
+     # Parse resolution string
+     height, width = map(int, resolution.lower().split("x"))
+
+     # Select model class and checkpoint
+     if model_type == "Small":
+         model_class = MMPI_S
+         checkpoint = MODEL_S_LOCATION
+     elif model_type == "Medium":
+         model_class = MMPI_M
+         checkpoint = MODEL_M_LOCATION
+     else:
+         model_class = MMPI_L
+         checkpoint = MODEL_L_LOCATION
+
+     # Load model
+     model = model_class(total_image_input=params.params_number_input, height=height, width=width)
+     model = helper.load_Checkpoint(checkpoint, model, load_cpu=True)
+     model.to(DEVICE)
+     model.eval()
+
+     min_side = min(img.width, img.height)
+     left = (img.width - min_side) // 2
+     top = (img.height - min_side) // 2
+     right = left + min_side
+     bottom = top + min_side
+     img = img.crop((left, top, right, bottom))
+
+     if video_type == "Circle":
+         trajectory = generateCircularTrajectory(radius, num_frames)
+     elif video_type == "Swing":
+         trajectory = generateWiggleTrajectory(radius, num_frames)
+     else:
+         trajectory = generateCircularTrajectory(radius, num_frames)
+
+     transform = transforms.Compose([
+         transforms.Resize((height, width)),
+         transforms.ToTensor()
+     ])
+     img_input = transform(img).to(DEVICE).unsqueeze(0)
+
+     grid = params.get_disparity_all_src().unsqueeze(0).to(DEVICE)
+     k_tgt = torch.tensor([
+         [0.58, 0, 0.5],
+         [0, 0.58, 0.5],
+         [0, 0, 1]]).to(DEVICE)
+     k_tgt[0, :] *= height
+     k_tgt[1, :] *= width
+     k_tgt = k_tgt.unsqueeze(0)
+     k_src_inv = torch.inverse(k_tgt)
+     pose = torch.eye(4).to(DEVICE).unsqueeze(0)
+
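+     # The intrinsics above are normalized (fx = fy = 0.58, principal point at
+     # 0.5) and then scaled to pixels. Row 0 is scaled by height and row 1 by
+     # width; this only matches the usual fx-width / fy-height convention
+     # because every resolution offered by this demo is square.
+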
+     homography_sampler = HomographySample(height, width, DEVICE)
+
+     with torch.no_grad():
+         rgb_layers, sigma_layers = model.get_layers(img_input, height=height, width=width)
+
+         predicted_depth = model.get_depth(img_input)
+         predicted_depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
+         img_predicted_depth = predicted_depth.squeeze().cpu().detach().numpy()
+         img_predicted_depth_colored = plt.get_cmap('inferno')(img_predicted_depth / np.max(img_predicted_depth))[:, :, :3]
+         img_predicted_depth_colored = (img_predicted_depth_colored * 255).astype(np.uint8)
+         img_predicted_depth_colored = Image.fromarray(img_predicted_depth_colored)
+
+         layer_depth = model.get_layer_depth(img_input, grid)
+         img_layer_depth = layer_depth.squeeze().cpu().detach().numpy()
+         img_layer_depth_colored = plt.get_cmap('inferno')(img_layer_depth / np.max(img_layer_depth))[:, :, :3]
+         img_layer_depth_colored = (img_layer_depth_colored * 255).astype(np.uint8)
+         img_layer_depth_colored = Image.fromarray(img_layer_depth_colored)
+
+     single_loop_frames = []
+     for pose_coords in trajectory:
+         with torch.no_grad():
+             target_pose = getPositionVector(pose_coords[0], pose_coords[1], pose_coords[2], pose)
+             output_img = render_novel_view(rgb_layers,
+                                            sigma_layers,
+                                            grid,
+                                            target_pose,
+                                            k_src_inv,
+                                            k_tgt,
+                                            homography_sampler)
+
+         img_np = output_img.detach().cpu().squeeze(0).permute(1, 2, 0).numpy()
+         img_np = (img_np * 255).astype(np.uint8)
+         img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+         single_loop_frames.append(img_bgr)
+
+     final_frames = single_loop_frames * int(num_loops)
+
+     video_path = create_video_from_memory(final_frames)
+
+     return video_path, img_predicted_depth_colored, img_layer_depth_colored
+
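+ # get_layers returns 32 RGB planes (B, 32, 3, H, W) and 32 density planes
+ # (B, 32, 1, H, W) placed at the disparities in `grid`; render_novel_view
+ # warps each plane into the target view via per-plane homographies and
+ # alpha-composites the warped stack into the novel image.
+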
+ with gr.Blocks(title="RT-MPINet", theme="default") as demo:
+     gr.Markdown(
+         """
+         ## Parallax Video Generator via Real-Time Multiplane Image Network (RT-MPINet)
+         We use smaller models and resolutions for faster inference on CPU instances.
+
+         #### Notes:
+         1. Use a higher number of frames (>80) and loops (>4) to get a smoother video.
+         2. The default uses 60 frames and 4 camera loops for fast video generation.
+         3. Three models are available (the larger the model, the slower the inference):
+             * **Small:** 6.6 million parameters
+             * **Medium:** 69 million parameters
+             * **Large:** 288 million parameters (not available in this demo due to storage limits; download this model and run it locally)
+         """)
+     with gr.Row():
+         img_input = gr.Image(type="pil", label="Upload Image")
+         video_type = gr.Dropdown(["Circle", "Swing"], label="Video Type", value="Swing")
+         with gr.Column():
+             with gr.Accordion("Advanced Settings", open=False):
+                 radius = gr.Slider(0.001, 0.1, value=0.05, label="Radius (for Circle/Swing)")
+                 num_frames = gr.Slider(10, 180, value=60, step=1, label="Frames per Loop")
+                 num_loops = gr.Slider(1, 10, value=4, step=1, label="Number of Loops")
+         with gr.Column():
+             model_type_dropdown = gr.Dropdown(["Small", "Medium"], label="Model Type", value="Medium")
+             resolution_dropdown = gr.Dropdown(["256x256", "384x384", "512x512"], label="Input Resolution", value="384x384")
+             generate_btn = gr.Button("Generate Video", variant="primary")
+
+     with gr.Row():
+         video_output = gr.Video(label="Generated Video")
+         depth_output = gr.Image(label="Depth Map - From Depth Decoder")
+         layer_depth_output = gr.Image(label="Layer Depth Map - From MPI Layers")
+
+     generate_btn.click(fn=process_image,
+                        inputs=[img_input, video_type, radius, num_frames, num_loops, model_type_dropdown, resolution_dropdown],
+                        outputs=[video_output, depth_output, layer_depth_output])
+
+ demo.launch()
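+
+ # Running `python app.py` starts the Gradio app; by default Gradio serves it
+ # at http://127.0.0.1:7860.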
helperFunctions.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+
+ def save_checkpoint(model, filelocation, save_parallel=True):
+     if save_parallel:
+         # DataParallel-style wrappers keep the underlying model in `.module`
+         torch.save(model.module.state_dict(), filelocation)
+     else:
+         torch.save(model.state_dict(), filelocation)
+
+ def load_Checkpoint(fileLocation, model, load_cpu=False):
+     if load_cpu:
+         model.load_state_dict(torch.load(fileLocation, map_location=lambda storage, loc: storage))
+     else:
+         model.load_state_dict(torch.load(fileLocation))
+     return model
+
+ def writeLog(logList, filename):
+     with open(filename, 'w') as outfile:
+         outfile.write("\n".join(logList))
+
+ def kl_loss(mu, logvar):
+     return -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).mean()
+
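+ # kl_loss is the closed-form KL divergence between a diagonal Gaussian
+ # N(mu, exp(logvar)) and the standard normal:
+ #   KL = -0.5 * mean(1 + logvar - mu^2 - exp(logvar))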
helper_image_functions.py ADDED
@@ -0,0 +1,290 @@
+ '''
+ Author: Manu Gond (manu.gond@miun.se)
+ Date: Nov-15-2022
+ Objective: Collection of general functions I use day to day for
+            image-related tasks. The function names and parameters
+            are self-explanatory.
+ Requirements: The imported Python libraries must be installed.
+ '''
+
+ import torch
+ from torchvision.utils import save_image
+ from torchvision.transforms import transforms
+ import torchmetrics
+ import numpy as np
+ from PIL import Image
+ import utils
+
+
+ #======================= Read and Write =====================#
+ def readImage(location):
+     image = Image.open(location).convert("RGB")
+     return image
+
+
+ def writeImage(image, location):
+     image.save(location)
+
+
+ def writeTensorImage(image, filename):
+     save_image(image, filename)
+
+
+ def removeChannel(sourceLocation, targetLocation):
+     img = readImage(sourceLocation)
+     writeImage(img, targetLocation)
+
+
+ def getImageTransform(width, height):
+     transform = transforms.Compose([transforms.Resize((height, width)),
+                                     transforms.ToTensor()])
+     return transform
+
+
+ def convertTensor(image):
+     transform = getImageTransform(image.size[0], image.size[1])
+     image = transform(image)
+     return image
+
+
+ #=================== 360 Images =======================#
+
+ def rotateERP180(image):
+     '''
+     :param image: PIL Image
+     :return: CxHxW Torch Tensor image
+     '''
+     W = image.size[0]
+     H = image.size[1]
+     transform = getImageTransform(W, H)
+     image = transform(image)
+     image1 = image[:, :, 0:(W//2)]
+     image2 = image[:, :, (W//2):W]
+     image3 = torch.zeros(image.size())
+     image3[:, :, 0:(W//2)] = image2
+     image3[:, :, (W//2):W] = image1
+     return image3
+
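+ # rotateERP180 applies a 180-degree yaw rotation to an equirectangular (ERP)
+ # panorama by swapping its left and right halves along the width axis.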
+
+ def convertERP2Cube(e_img, face_w=256, mode='bilinear', cube_format='dice'):
+     '''
+     e_img: ndarray in shape of [H, W, *]
+     face_w: int, the length of each face of the cubemap
+     '''
+     assert len(e_img.shape) == 3
+     h, w = e_img.shape[:2]
+     if mode == 'bilinear':
+         order = 1
+     elif mode == 'nearest':
+         order = 0
+     else:
+         raise NotImplementedError('unknown mode')
+
+     xyz = utils.xyzcube(face_w)
+     uv = utils.xyz2uv(xyz)
+     coor_xy = utils.uv2coor(uv, h, w)
+
+     cubemap = np.stack([
+         utils.sample_equirec(e_img[..., i], coor_xy, order=order)
+         for i in range(e_img.shape[2])
+     ], axis=-1)
+
+     if cube_format == 'horizon':
+         pass
+     elif cube_format == 'list':
+         cubemap = utils.cube_h2list(cubemap)
+     elif cube_format == 'dict':
+         cubemap = utils.cube_h2dict(cubemap)
+     elif cube_format == 'dice':
+         cubemap = utils.cube_h2dice(cubemap)
+     else:
+         raise NotImplementedError()
+     return cubemap
+
+
+ def convertCube2ERP(cubemap, h, w, mode='bilinear', cube_format='dice'):
+     if mode == 'bilinear':
+         order = 1
+     elif mode == 'nearest':
+         order = 0
+     else:
+         raise NotImplementedError('unknown mode')
+
+     if cube_format == 'horizon':
+         pass
+     elif cube_format == 'list':
+         cubemap = utils.cube_list2h(cubemap)
+     elif cube_format == 'dict':
+         cubemap = utils.cube_dict2h(cubemap)
+     elif cube_format == 'dice':
+         cubemap = utils.cube_dice2h(cubemap)
+     else:
+         raise NotImplementedError('unknown cube_format')
+     assert len(cubemap.shape) == 3
+     assert cubemap.shape[0] * 6 == cubemap.shape[1]
+     assert w % 8 == 0
+     face_w = cubemap.shape[0]
+
+     uv = utils.equirect_uvgrid(h, w)
+     u, v = np.split(uv, 2, axis=-1)
+     u = u[..., 0]
+     v = v[..., 0]
+     cube_faces = np.stack(np.split(cubemap, 6, 1), 0)
+
+     # Get face id to each pixel: 0F 1R 2B 3L 4U 5D
+     tp = utils.equirect_facetype(h, w)
+     coor_x = np.zeros((h, w))
+     coor_y = np.zeros((h, w))
+
+     for i in range(4):
+         mask = (tp == i)
+         coor_x[mask] = 0.5 * np.tan(u[mask] - np.pi * i / 2)
+         coor_y[mask] = -0.5 * np.tan(v[mask]) / np.cos(u[mask] - np.pi * i / 2)
+
+     mask = (tp == 4)
+     c = 0.5 * np.tan(np.pi / 2 - v[mask])
+     coor_x[mask] = c * np.sin(u[mask])
+     coor_y[mask] = c * np.cos(u[mask])
+
+     mask = (tp == 5)
+     c = 0.5 * np.tan(np.pi / 2 - np.abs(v[mask]))
+     coor_x[mask] = c * np.sin(u[mask])
+     coor_y[mask] = -c * np.cos(u[mask])
+
+     # Final renormalize
+     coor_x = (np.clip(coor_x, -0.5, 0.5) + 0.5) * face_w
+     coor_y = (np.clip(coor_y, -0.5, 0.5) + 0.5) * face_w
+
+     equirec = np.stack([
+         utils.sample_cubefaces(cube_faces[..., i], tp, coor_y, coor_x, order=order)
+         for i in range(cube_faces.shape[3])
+     ], axis=-1)
+     return equirec
+
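+
+ # In convertCube2ERP each face is treated as a plane at distance 0.5 from the
+ # viewer, so the in-face coordinates of the four side faces follow tan() of
+ # the yaw-shifted angles (u - i*pi/2), while the up/down faces use a radius
+ # c = 0.5 * tan(pi/2 - |v|) split into sin/cos components along u.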
+
+ def convertCube2Slices(image):
+     '''
+     :param image: PIL Image (dice-layout cubemap)
+     :return: List of Torch Tensors, CxHxW
+     '''
+     image = convertTensor(image)
+     C, H, W = image.size()
+
+     top = image[:, 0:H//3, (W//4):(W//4)*2]
+     left = image[:, (H//3):(H//3)*2, 0:W//4]
+     front = image[:, (H//3):(H//3)*2, (W//4):(W//4)*2]
+     right = image[:, (H//3):(H//3)*2, (W//4)*2:(W//4)*3]
+     back = image[:, (H//3):(H//3)*2, (W//4)*3:]
+     bottom = image[:, (H//3)*2:, (W//4):(W//4)*2]
+
+     return [top, left, front, right, back, bottom]
+
+ def convertSlicesToCube(imageList):
+     top, left, front, right, back, bottom = imageList
+
+     C, H, W = 3, top.size()[1]*3, top.size()[2]*4
+     cube = torch.zeros((C, H, W))
+
+     cube[:, 0:H//3, (W//4):(W//4)*2] = top
+     cube[:, (H//3):(H//3)*2, 0:W//4] = left
+     cube[:, (H//3):(H//3)*2, (W//4):(W//4)*2] = front
+     cube[:, (H//3):(H//3)*2, (W//4)*2:(W//4)*3] = right
+     cube[:, (H//3):(H//3)*2, (W//4)*3:] = back
+     cube[:, (H//3)*2:, (W//4):(W//4)*2] = bottom
+
+     return cube
+
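+ # Both slice helpers assume the 'dice' cubemap layout, a 3x4 grid of faces:
+ #
+ #          [top]
+ #  [left] [front] [right] [back]
+ #          [bottom]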
+
+
+ #=================== Quality Measures =======================#
+ '''
+ Predicted Shape : BxCxHxW
+ Original Shape  : BxCxHxW
+ Data Type: Torch Tensor
+ '''
+ def getSSIM(predicted, original):
+     SSIM = torchmetrics.StructuralSimilarityIndexMeasure()
+     return SSIM(predicted, original).item()
+
+
+ def getPSNR(predicted, original):
+     PSNR = torchmetrics.PeakSignalNoiseRatio()
+     return PSNR(predicted, original).item()
+
+
+ def getMSE(predicted, original):
+     MSE = torchmetrics.MeanSquaredError()
+     return MSE(predicted, original).item()
+
+
+ def getMAE(predicted, original):
+     MAE = torchmetrics.MeanAbsoluteError()
+     return MAE(predicted, original).item()
+
+
+ if __name__ == "__main__":
+     # Example usages kept from development (disabled):
+     '''
+     img = readImage("31_image_0_0.png")
+     img = convertERP2Cube(e_img=np.asarray(img), face_w=256)
+     img = Image.fromarray(img.astype('uint8'), 'RGB')
+     convertCube2Slices(img)
+
+     image = convertSlicesToCube(["top.png", "left.png", "front.png", "right.png", "back.png", "bottom.png"])
+     writeTensorImage(image, 'this.png')
+
+     writeImage(img, 'cube.png')
+     img = readImage('cube.png')
+     img = convertCube2ERP(np.asarray(img), 512, 1024)
+     img = Image.fromarray(img.astype('uint8'), 'RGB')
+     writeImage(img, 'cubeERP.png')
+
+     img1 = readImage("31_image_0_0.png")
+     img2 = readImage("cubeERP.png")
+     img1 = convertTensor(img1)
+     img2 = convertTensor(img2)
+     print(getSSIM(img1.unsqueeze(0), img2.unsqueeze(0)))
+     '''
+     pass
+
model_Large.py ADDED
@@ -0,0 +1,535 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import warnings
+ warnings.filterwarnings("ignore")
+ import torchvision
+ import parameters as params
+ import timm
+
+ class DinoV2FeatureExtractor(nn.Module):
+     def __init__(self, out_channels=256, out_size=(64, 64)):
+         super().__init__()
+         self.dino = timm.create_model('vit_base_patch14_dinov2.lvd142m', pretrained=False)
+         self.dino.eval()
+         for p in self.dino.parameters():
+             p.requires_grad = False
+
+         self.out_size = out_size
+         self.feat_proj = nn.Sequential(
+             nn.Conv2d(self.dino.embed_dim, out_channels, kernel_size=1),
+             nn.ReLU(),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.interpolate(x, size=(518, 518), mode='bilinear', align_corners=False)
+         patch_tokens = self.dino.forward_features(x)
+         patch_tokens = patch_tokens[:, 1:]  # drop the class token
+         B, N, C = patch_tokens.shape
+         h = w = int(N ** 0.5)
+         feat_map = patch_tokens.transpose(1, 2).reshape(B, C, h, w)  # [B, C, H', W']
+         feat_map = F.interpolate(feat_map, size=self.out_size, mode='bilinear', align_corners=False)
+         return self.feat_proj(feat_map)
+
+ def getLinearLayer(in_feat, out_feat, activation=nn.ReLU(True)):
+     return nn.Sequential(
+         nn.Linear(in_features=in_feat, out_features=out_feat, bias=True),
+         activation
+     )
+
+ def getConvLayer(in_channel, out_channel, stride=1, padding=1, activation=nn.ReLU()):
+     return nn.Sequential(nn.Conv2d(in_channel,
+                                    out_channel,
+                                    kernel_size=3,
+                                    stride=stride,
+                                    padding=padding,
+                                    padding_mode='reflect'),
+                          activation)
+
+ def getConvTransposeLayer(in_channel, out_channel, kernel=3, stride=1, padding=1, activation=nn.ReLU()):
+     return nn.Sequential(nn.ConvTranspose2d(in_channel,
+                                             out_channel,
+                                             kernel_size=kernel,
+                                             stride=stride,
+                                             padding=padding),
+                          activation)
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, stride=1):
+         super(ResidualBlock, self).__init__()
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
+         self.relu = nn.ReLU()
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
+         self.stride = stride
+
+         self.shortcut = nn.Sequential()
+         if stride != 1 or in_channels != out_channels:
+             self.shortcut = nn.Sequential(
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(out_channels)
+             )
+
+     def forward(self, x):
+         residual = x
+
+         out = self.conv1(x)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+
+         out = out + self.shortcut(residual)
+         out = self.relu(out)
+         return out
+
+
+ # Alternative bottleneck ResidualBlock with a depthwise conv (disabled):
+ # class ResidualBlock(nn.Module):
+ #     def __init__(self, in_channels, out_channels, stride=1, expansion=4):
+ #         super().__init__()
+ #         mid_channels = out_channels // expansion
+ #         self.pw_reduce = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False)
+ #         self.bn1 = nn.BatchNorm2d(mid_channels)
+ #         self.dw = nn.Conv2d(mid_channels, mid_channels, kernel_size=3,
+ #                             stride=stride, padding=1, groups=mid_channels, bias=False)
+ #         self.bn2 = nn.BatchNorm2d(mid_channels)
+ #         self.pw_expand = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False)
+ #         self.bn3 = nn.BatchNorm2d(out_channels)
+ #         self.relu = nn.ReLU(inplace=True)
+ #         self.stride = stride
+ #         if stride != 1 or in_channels != out_channels:
+ #             self.shortcut = nn.Sequential(
+ #                 nn.Conv2d(in_channels, out_channels, kernel_size=1,
+ #                           stride=stride, bias=False),
+ #                 nn.BatchNorm2d(out_channels),
+ #             )
+ #         else:
+ #             self.shortcut = nn.Identity()
+
+ #     def forward(self, x):
+ #         identity = x
+ #         out = self.pw_reduce(x)
+ #         out = self.bn1(out)
+ #         out = self.relu(out)
+ #         out = self.dw(out)
+ #         out = self.bn2(out)
+ #         out = self.relu(out)
+ #         out = self.pw_expand(out)
+ #         out = self.bn3(out)
+ #         out += self.shortcut(identity)
+ #         out = self.relu(out)
+ #         return out
+
+ class FeatureNet(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         model = torchvision.models.resnet152(pretrained=False)
+         layers = list(model.children())
+         self.FeatureEncoder = torch.nn.Sequential(*layers[:5].copy())
+         self.expand_layer = ResidualBlock(256, 500)
+
+     def forward(self, x):
+         x = self.FeatureEncoder(x)
+         x = self.expand_layer(x)
+         return x
+
+     def apply_feature_encoder(self, x):
+         x = self.FeatureEncoder(x)
+         x = self.expand_layer(x)
+         return x
+
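+ # FeatureNet keeps the first five children of ResNet-152 (conv1, bn1, relu,
+ # maxpool, layer1), i.e. 256-channel features at 1/4 input resolution, and
+ # expands them to 500 channels so they can be summed with the decoder
+ # activations at that scale (see `x + lower_skip_list[1] + imagenet_features`
+ # below).
+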
+ class Encoder(nn.Module):
+     def __init__(self, height, width, total_image_input=1):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.encoder_pre = ResidualBlock((total_image_input * 3), 20)
+         self.encoder_layer1 = ResidualBlock(20, 30)
+         self.encoder_layer2 = ResidualBlock(30, 50)
+
+         self.encoder_layer3 = nn.Sequential(
+             ResidualBlock(50, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer4 = ResidualBlock(100, 500)
+         self.encoder_layer5 = nn.Sequential(
+             ResidualBlock(500, 500),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer6 = ResidualBlock(500, 500)
+         self.encoder_layer7 = nn.Sequential(
+             ResidualBlock(500, 500),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer8 = ResidualBlock(500, 1000)
+         self.encoder_layer9 = nn.Sequential(
+             ResidualBlock(1000, 1000),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer10 = ResidualBlock(1000, 1000)
+         self.encoder_layer11 = ResidualBlock(1000, 1000)
+
+     def forward(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.encoder_pre(x)
+         x = self.encoder_layer1(x)
+         x = self.encoder_layer2(x)
+         skip1 = self.encoder_layer3(x)
+
+         x = self.encoder_layer4(skip1)
+         skip2 = self.encoder_layer5(x)
+
+         x = self.encoder_layer6(skip2)
+         skip3 = self.encoder_layer7(x)
+
+         x = self.encoder_layer8(skip3)
+         skip4 = self.encoder_layer9(x)
+
+         x = self.encoder_layer10(skip4)
+         x = self.encoder_layer11(x)
+
+         return x, [skip1, skip2, skip3, skip4]
+
+ class DecoderRGB(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(1000, 1000)
+         self.decoder_layer2 = ResidualBlock(1000, 1000)
+         self.decoder_layer3 = ResidualBlock(1000, 1000)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(1000, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(500, 500)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(500, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(500, 500)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(500, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = ResidualBlock(100, 100)
+         self.decoder_layer12 = ResidualBlock(100, 96)
+         self.decoder_layer13 = ResidualBlock(96, 96)
+         self.decoder_layer14 = ResidualBlock(96, 96)
+         self.decoder_layer15 = nn.Sequential(
+             nn.Conv2d(96, 96, 3, stride=1, padding=1),
+             nn.Sigmoid()
+         )
+         self.decoder_layer16 = nn.Sequential(
+             nn.Conv2d(96, 96, 3, stride=1, padding=1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1] + imagenet_features
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = self.decoder_layer13(x)
+         x = self.decoder_layer14(x)
+         x = self.decoder_layer15(x)
+         x = self.decoder_layer16(x)
+         x = x.view(x.size()[0], 32, 3, height, width)
+         return x
+
+ class DecoderSigma(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(1000, 1000)
+         self.decoder_layer2 = ResidualBlock(1000, 1000)
+         self.decoder_layer3 = ResidualBlock(1000, 1000)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(1000, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(500, 500)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(500, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(500, 500)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(500, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = ResidualBlock(100, 100)
+         self.decoder_layer12 = ResidualBlock(100, 50)
+         self.decoder_layer13 = ResidualBlock(50, 40)
+         self.decoder_layer14 = ResidualBlock(40, 32)
+         self.decoder_layer15 = nn.Sequential(
+             nn.Conv2d(32, 32, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+         self.decoder_layer16 = nn.Sequential(
+             nn.Conv2d(32, 32, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+
+     def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1] + imagenet_features
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = self.decoder_layer13(x)
+         x = self.decoder_layer14(x)
+         x = self.decoder_layer15(x)
+         x = self.decoder_layer16(x)
+         x = x.view(x.size()[0], 32, 1, height, width)
+         return x
+
+
+ class DecoderDepth(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(1000, 1000)
+         self.decoder_layer2 = ResidualBlock(1000, 1000)
+         self.decoder_layer3 = ResidualBlock(1000, 1000)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(1000, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(500, 500)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(500, 500, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(500, 500)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(500, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = ResidualBlock(100, 100)
+         self.decoder_layer12 = ResidualBlock(100, 50)
+         self.decoder_layer13 = ResidualBlock(50, 40)
+         self.decoder_layer14 = ResidualBlock(40, 16)
+         self.decoder_layer15 = nn.Sequential(
+             nn.Conv2d(16, 8, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+         self.decoder_layer16 = nn.Sequential(
+             nn.Conv2d(8, 1, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+
+     def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1] + imagenet_features
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = self.decoder_layer13(x)
+         x = self.decoder_layer14(x)
+         x = self.decoder_layer15(x)
+         x = self.decoder_layer16(x)
+         return x
+
+ class MMPI(nn.Module):
+     def __init__(self, total_image_input=1, height=384, width=384):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.feature_encoder = FeatureNet(height, width)
+         self.lower_encoder = Encoder(height, width, total_image_input)
+         self.merge_decoder_rgb = DecoderRGB(height, width)
+         self.merge_decoder_sigma = DecoderSigma(height, width)
+         self.depth_decoder = DecoderDepth(height, width)
+
+     def forward(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, imagenet_features)
+
+         return merged_feature_rgb, merged_feature_sigma, merged_feature_depth
+
+     def get_rgb_sigma(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+         return merged_feature_rgb, merged_feature_sigma
+
+     def get_depth(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, imagenet_features)
+         return merged_feature_depth
+
+     def get_layer_depth(self, x, grid, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         rgb_layers = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         sigma_layers = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+
+         pred_mpi_planes = torch.randn((1, 4, height, width)).to(params.DEVICE)
+         for i in range(params.params_num_planes):
+             RGBA = torch.cat((rgb_layers[0, i, :, :, :], sigma_layers[0, i, :, :, :]), dim=0).unsqueeze(0)
+             pred_mpi_planes = torch.cat((pred_mpi_planes, RGBA), dim=0)
+
+         pred_mpi_planes = pred_mpi_planes[1:, :, :, :].unsqueeze(0)
+
+         sigma = pred_mpi_planes[:, :, 3, :, :]
+         B, D, H, W = sigma.shape
+
+         pred_mpi_disp = grid
+         disp_sorted, _ = pred_mpi_disp.sort(dim=1)
+         delta = disp_sorted[:, 1:] - disp_sorted[:, :-1]
+         delta_last = delta[:, -1:]
+         delta = torch.cat([delta, delta_last], dim=1)
+
+         delta = delta.unsqueeze(-1).unsqueeze(-1).expand_as(sigma)
+
+         alpha = 1.0 - torch.exp(-delta * sigma)
+
+         transmittance = torch.cumprod(1 - alpha + 1e-7, dim=1)
+         shifted_transmittance = torch.ones_like(transmittance)
+         shifted_transmittance[:, 1:, :, :] = transmittance[:, :-1, :, :]
+
+         disparity = pred_mpi_disp.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, H, W)
+
+         disparity_map = (disparity * alpha * shifted_transmittance).sum(dim=1, keepdim=True)
+
+         return disparity_map
+
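+     # get_layer_depth collapses the predicted MPI into an expected disparity
+     # map with standard volume-rendering weights: per plane i,
+     #   alpha_i = 1 - exp(-delta_i * sigma_i)
+     #   T_i     = prod_{j<i} (1 - alpha_j)      (accumulated transmittance)
+     #   disparity = sum_i d_i * alpha_i * T_i
+     # where d_i are the plane disparities in `grid` and delta_i their spacings
+     # (a small 1e-7 term keeps the cumulative product numerically stable).
+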
+     def get_layers(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+         return merged_feature_rgb, merged_feature_sigma
+
model_Medium.py ADDED
@@ -0,0 +1,535 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import warnings
+ warnings.filterwarnings("ignore")
+ import torchvision
+ import parameters as params
+ import timm
+
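+ # model_Medium.py mirrors model_Large.py with narrower channel widths: the
+ # encoder/decoder trunk uses 200/500 channels where the Large model uses
+ # 500/1000, and FeatureNet expands the ResNet features to 200 channels
+ # instead of 500.
+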
10
+ class DinoV2FeatureExtractor(nn.Module):
11
+ def __init__(self, out_channels=256, out_size=(64, 64)):
12
+ super().__init__()
13
+ self.dino = timm.create_model('vit_base_patch14_dinov2.lvd142m', pretrained=False)
14
+ self.dino.eval()
15
+ for p in self.dino.parameters():
16
+ p.requires_grad = False
17
+
18
+ self.out_size = out_size
19
+ self.feat_proj = nn.Sequential(
20
+ nn.Conv2d(self.dino.embed_dim, out_channels, kernel_size=1),
21
+ nn.ReLU(),
22
+ )
23
+
24
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
25
+ x = F.interpolate(x, size=(518, 518), mode='bilinear', align_corners=False)
26
+ patch_tokens = self.dino.forward_features(x)
27
+ patch_tokens = patch_tokens[:, 1:]
28
+ B, N, C = patch_tokens.shape
29
+ h = w = int(N ** 0.5)
30
+ feat_map = patch_tokens.transpose(1, 2).reshape(B, C, h, w) # [B, C, H', W']
31
+ feat_map = F.interpolate(feat_map, size=self.out_size, mode='bilinear', align_corners=False)
32
+ return self.feat_proj(feat_map)
33
+
34
+ def getLinearLayer(in_feat, out_feat, activation=nn.ReLU(True)):
35
+ return nn.Sequential(
36
+ nn.Linear(in_features=in_feat, out_features=out_feat, bias=True),
37
+ activation
38
+ )
39
+
40
+ def getConvLayer(in_channel,out_channel,stride=1,padding=1,activation=nn.ReLU()):
41
+ return nn.Sequential(nn.Conv2d(in_channel,
42
+ out_channel,
43
+ kernel_size=3,
44
+ stride=stride,
45
+ padding=padding,
46
+ padding_mode='reflect'),
47
+ activation)
48
+
49
+ def getConvTransposeLayer(in_channel, out_channel,kernel=3,stride=1,padding=1,activation=nn.ReLU()):
50
+ return nn.Sequential(nn.ConvTranspose2d(in_channel,
51
+ out_channel,
52
+ kernel_size = kernel,
53
+ stride=stride,
54
+ padding=padding),
55
+ activation)
56
+
57
+
58
+ class ResidualBlock(nn.Module):
59
+ def __init__(self, in_channels, out_channels, stride=1):
60
+ super(ResidualBlock, self).__init__()
61
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
62
+ self.relu = nn.ReLU()
63
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
64
+ self.stride = stride
65
+
66
+ self.shortcut = nn.Sequential()
67
+ if stride != 1 or in_channels != out_channels:
68
+ self.shortcut = nn.Sequential(
69
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
70
+ nn.BatchNorm2d(out_channels)
71
+ )
72
+
73
+ def forward(self, x):
74
+ residual = x
75
+
76
+ out = self.conv1(x)
77
+ out = self.relu(out)
78
+
79
+ out = self.conv2(out)
80
+
81
+ out = out + self.shortcut(residual)
82
+ out = self.relu(out)
83
+ return out
84
+
85
+
86
+ # class ResidualBlock(nn.Module):
87
+ # def __init__(self, in_channels, out_channels, stride=1, expansion=4):
88
+ # super().__init__()
89
+ # mid_channels = out_channels // expansion
90
+ # self.pw_reduce = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False)
91
+ # self.bn1 = nn.BatchNorm2d(mid_channels)
92
+ # self.dw = nn.Conv2d(mid_channels, mid_channels, kernel_size=3,
93
+ # stride=stride, padding=1, groups=mid_channels, bias=False)
94
+ # self.bn2 = nn.BatchNorm2d(mid_channels)
95
+ # self.pw_expand = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False)
96
+ # self.bn3 = nn.BatchNorm2d(out_channels)
97
+ # self.relu = nn.ReLU(inplace=True)
98
+ # self.stride = stride
99
+ # if stride != 1 or in_channels != out_channels:
100
+ # self.shortcut = nn.Sequential(
101
+ # nn.Conv2d(in_channels, out_channels, kernel_size=1,
102
+ # stride=stride, bias=False),
103
+ # nn.BatchNorm2d(out_channels),
104
+ # )
105
+ # else:
106
+ # self.shortcut = nn.Identity()
107
+
108
+ # def forward(self, x):
109
+ # identity = x
110
+
111
+ # out = self.pw_reduce(x)
112
+ # out = self.bn1(out)
113
+ # out = self.relu(out)
114
+
115
+ # out = self.dw(out)
116
+ # out = self.bn2(out)
117
+ # out = self.relu(out)
118
+
119
+ # out = self.pw_expand(out)
120
+ # out = self.bn3(out)
121
+
122
+ # out += self.shortcut(identity)
123
+ # out = self.relu(out)
124
+ # return out
125
+
126
+ class FeatureNet(nn.Module):
127
+ def __init__(self,height,width):
128
+ super().__init__()
129
+ model = torchvision.models.resnet152(pretrained=False)
130
+ layers = list(model.children())
131
+ self.FeatureEncoder = torch.nn.Sequential(*layers[:5].copy())
132
+ self.expand_layer = ResidualBlock(256, 200)
133
+
134
+ def forward(self, x):
135
+ x = self.FeatureEncoder(x)
136
+ x = self.expand_layer(x)
137
+ return x
138
+
139
+ def apply_feature_encoder(self, x):
140
+ x = self.FeatureEncoder(x)
141
+ x = self.expand_layer(x)
142
+ return x
143
+
144
+ class Encoder(nn.Module):
145
+ def __init__(self, height, width, total_image_input=1):
146
+ super().__init__()
147
+ self.height = height
148
+ self.width = width
149
+ self.encoder_pre = ResidualBlock((total_image_input*3), 20)
150
+ self.encoder_layer1 = ResidualBlock(20, 30)
151
+ self.encoder_layer2 = ResidualBlock(30, 50)
152
+
153
+ self.encoder_layer3 = nn.Sequential(
154
+ ResidualBlock(50, 100),
155
+ nn.MaxPool2d(kernel_size=2, stride=2)
156
+ )
157
+
158
+ self.encoder_layer4 = ResidualBlock(100, 200)
159
+ self.encoder_layer5 = nn.Sequential(
160
+ ResidualBlock(200, 200),
161
+ nn.MaxPool2d(kernel_size=2, stride=2)
162
+ )
163
+
164
+ self.encoder_layer6 = ResidualBlock(200, 200)
165
+ self.encoder_layer7 = nn.Sequential(
166
+ ResidualBlock(200, 200),
167
+ nn.MaxPool2d(kernel_size=2, stride=2)
168
+ )
169
+
170
+ self.encoder_layer8 = ResidualBlock(200, 500)
171
+ self.encoder_layer9 = nn.Sequential(
172
+ ResidualBlock(500, 500),
173
+ nn.MaxPool2d(kernel_size=2, stride=2)
174
+ )
175
+
176
+ self.encoder_layer10 = ResidualBlock(500, 500)
177
+ self.encoder_layer11 = ResidualBlock(500, 500)
178
+
179
+ def forward(self, x, height=None, width=None):
180
+ if height == None and width == None:
181
+ height = self.height
182
+ width = self.width
183
+
184
+ x = self.encoder_pre(x)
185
+ x = self.encoder_layer1(x)
186
+ x = self.encoder_layer2(x)
187
+ skip1 = self.encoder_layer3(x)
188
+
189
+ x = self.encoder_layer4(skip1)
190
+ skip2 = self.encoder_layer5(x)
191
+
192
+ x = self.encoder_layer6(skip2)
193
+ skip3 = self.encoder_layer7(x)
194
+
195
+ x = self.encoder_layer8(skip3)
196
+ skip4 = self.encoder_layer9(x)
197
+
198
+ x = self.encoder_layer10(skip4)
199
+ x = self.encoder_layer11(x)
200
+
201
+ return x, [skip1, skip2, skip3, skip4]
202
+
203
+ class DecoderRGB(nn.Module):
204
+ def __init__(self,height,width):
205
+ super().__init__()
206
+ self.height = height
207
+ self.width = width
208
+ self.decoder_layer1 = ResidualBlock(500, 500)
209
+ self.decoder_layer2 = ResidualBlock(500, 500)
210
+ self.decoder_layer3 = ResidualBlock(500, 500)
211
+
212
+ self.decoder_layer4 = nn.Sequential(
213
+ nn.ConvTranspose2d(500, 200, 2, stride=2, padding=0),
214
+ nn.ReLU(True)
215
+ )
216
+ self.decoder_layer5 = ResidualBlock(200, 200)
217
+
218
+ self.decoder_layer6 = nn.Sequential(
219
+ nn.ConvTranspose2d(200, 200, 2, stride=2, padding=0),
220
+ nn.ReLU(True)
221
+ )
222
+ self.decoder_layer7 = ResidualBlock(200, 200)
223
+
224
+ self.decoder_layer8 = nn.Sequential(
225
+ nn.ConvTranspose2d(200, 100, 2, stride=2, padding=0),
226
+ nn.ReLU(True)
227
+ )
228
+ self.decoder_layer9 = ResidualBlock(100, 100)
229
+
230
+ self.decoder_layer10 = nn.Sequential(
231
+ nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
232
+ nn.ReLU(True)
233
+ )
234
+ self.decoder_layer11 = ResidualBlock(100, 100)
235
+ self.decoder_layer12 = ResidualBlock(100, 96)
236
+ self.decoder_layer13 = ResidualBlock(96, 96)
237
+ self.decoder_layer14 = ResidualBlock(96, 96)
238
+ self.decoder_layer15 = nn.Sequential(
239
+ nn.Conv2d(96, 96, 3, stride=1, padding=1),
240
+ nn.Sigmoid()
241
+ )
242
+ self.decoder_layer16 = nn.Sequential(
243
+ nn.Conv2d(96, 96, 3, stride=1, padding=1),
244
+ nn.Sigmoid()
245
+ )
246
+
247
+ def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
248
+ if height == None and width == None:
249
+ height = self.height
250
+ width = self.width
251
+
252
+ x = self.decoder_layer1(x)
253
+ x = self.decoder_layer2(x)
254
+ x = x + lower_skip_list[3]
255
+
256
+ x = self.decoder_layer3(x)
257
+ x = self.decoder_layer4(x)
258
+ x = x + lower_skip_list[2]
259
+
260
+ x = self.decoder_layer5(x)
261
+ x = self.decoder_layer6(x)
262
+ x = x + lower_skip_list[1] + imagenet_features
263
+
264
+ x = self.decoder_layer7(x)
265
+ x = self.decoder_layer8(x)
266
+ x = x + lower_skip_list[0]
267
+
268
+ x = self.decoder_layer9(x)
269
+ x = self.decoder_layer10(x)
270
+ x = self.decoder_layer11(x)
271
+ x = self.decoder_layer12(x)
272
+ x = self.decoder_layer13(x)
273
+ x = self.decoder_layer14(x)
274
+ x = self.decoder_layer15(x)
275
+ x = self.decoder_layer16(x)
276
+ x = x.view(x.size()[0], 32, 3, height, width)
277
+ return x
278
+
279
+ class DecoderSigma(nn.Module):
280
+ def __init__(self,height,width):
281
+ super().__init__()
282
+ self.height = height
283
+ self.width = width
284
+ self.decoder_layer1 = ResidualBlock(500, 500)
285
+ self.decoder_layer2 = ResidualBlock(500, 500)
286
+ self.decoder_layer3 = ResidualBlock(500, 500)
287
+
288
+ self.decoder_layer4 = nn.Sequential(
289
+ nn.ConvTranspose2d(500, 200, 2, stride=2, padding=0),
290
+ nn.ReLU(True)
291
+ )
292
+ self.decoder_layer5 = ResidualBlock(200, 200)
293
+
294
+ self.decoder_layer6 = nn.Sequential(
295
+ nn.ConvTranspose2d(200, 200, 2, stride=2, padding=0),
296
+ nn.ReLU(True)
297
+ )
298
+ self.decoder_layer7 = ResidualBlock(200, 200)
299
+
300
+ self.decoder_layer8 = nn.Sequential(
301
+ nn.ConvTranspose2d(200, 100, 2, stride=2, padding=0),
302
+ nn.ReLU(True)
303
+ )
304
+ self.decoder_layer9 = ResidualBlock(100, 100)
305
+
306
+ self.decoder_layer10 = nn.Sequential(
307
+ nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
308
+ nn.ReLU(True)
309
+ )
310
+ self.decoder_layer11 = ResidualBlock(100, 100)
311
+ self.decoder_layer12 = ResidualBlock(100, 50)
312
+ self.decoder_layer13 = ResidualBlock(50, 40)
313
+ self.decoder_layer14 = ResidualBlock(40, 32)
314
+ self.decoder_layer15 = nn.Sequential(
315
+ nn.Conv2d(32, 32, 3, stride=1, padding=1),
316
+ nn.ReLU(True)
317
+ )
318
+ self.decoder_layer16 = nn.Sequential(
319
+ nn.Conv2d(32, 32, 3, stride=1, padding=1),
320
+ nn.ReLU(True)
321
+ )
322
+
323
+ def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
324
+ if height == None and width == None:
325
+ height = self.height
326
+ width = self.width
327
+
328
+ x = self.decoder_layer1(x)
329
+ x = self.decoder_layer2(x)
330
+ x = x + lower_skip_list[3]
331
+
332
+ x = self.decoder_layer3(x)
333
+ x = self.decoder_layer4(x)
334
+ x = x + lower_skip_list[2]
335
+
336
+ x = self.decoder_layer5(x)
337
+ x = self.decoder_layer6(x)
338
+ x = x + lower_skip_list[1] + imagenet_features
339
+
340
+ x = self.decoder_layer7(x)
341
+ x = self.decoder_layer8(x)
342
+ x = x + lower_skip_list[0]
343
+
344
+ x = self.decoder_layer9(x)
345
+ x = self.decoder_layer10(x)
346
+ x = self.decoder_layer11(x)
347
+ x = self.decoder_layer12(x)
348
+ x = self.decoder_layer13(x)
349
+ x = self.decoder_layer14(x)
350
+ x = self.decoder_layer15(x)
351
+ x = self.decoder_layer16(x)
352
+ x = x.view(x.size()[0], 32, 1, height, width)
353
+ return x
354
+
355
+
356
+ class DecoderDepth(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(500, 500)
+         self.decoder_layer2 = ResidualBlock(500, 500)
+         self.decoder_layer3 = ResidualBlock(500, 500)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(500, 200, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(200, 200)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(200, 200, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(200, 200)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(200, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = ResidualBlock(100, 100)
+         self.decoder_layer12 = ResidualBlock(100, 50)
+         self.decoder_layer13 = ResidualBlock(50, 40)
+         self.decoder_layer14 = ResidualBlock(40, 16)
+         self.decoder_layer15 = nn.Sequential(
+             nn.Conv2d(16, 8, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+         self.decoder_layer16 = nn.Sequential(
+             nn.Conv2d(8, 1, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+
+     def forward(self, x, lower_skip_list, imagenet_features, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1] + imagenet_features
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = self.decoder_layer13(x)
+         x = self.decoder_layer14(x)
+         x = self.decoder_layer15(x)
+         x = self.decoder_layer16(x)
+         return x
+
+ class MMPI(nn.Module):
+     def __init__(self, total_image_input=1, height=384, width=384):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.feature_encoder = FeatureNet(height, width)
+         self.lower_encoder = Encoder(height, width, total_image_input)
+         self.merge_decoder_rgb = DecoderRGB(height, width)
+         self.merge_decoder_sigma = DecoderSigma(height, width)
+         self.depth_decoder = DecoderDepth(height, width)
+
+     def forward(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, imagenet_features)
+
+         return merged_feature_rgb, merged_feature_sigma, merged_feature_depth
+
+     def get_rgb_sigma(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+         return merged_feature_rgb, merged_feature_sigma
+
+     def get_depth(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, imagenet_features)
+         return merged_feature_depth
+
+     def get_layer_depth(self, x, grid, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         rgb_layers = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         sigma_layers = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+
+         # Stack the predicted planes into 1xSx4xHxW RGBA form; the random
+         # first entry is only a placeholder and is sliced off after the loop.
+         pred_mpi_planes = torch.randn((1, 4, height, width)).to(params.DEVICE)
+         for i in range(params.params_num_planes):
+             RGBA = torch.cat((rgb_layers[0, i, :, :, :], sigma_layers[0, i, :, :, :]), dim=0).unsqueeze(0)
+             pred_mpi_planes = torch.cat((pred_mpi_planes, RGBA), dim=0)
+
+         pred_mpi_planes = pred_mpi_planes[1:, :, :, :].unsqueeze(0)
+
+         sigma = pred_mpi_planes[:, :, 3, :, :]
+         B, D, H, W = sigma.shape
+
+         # Per-plane spacing along the disparity axis (last interval repeated).
+         pred_mpi_disp = grid
+         disp_sorted, _ = pred_mpi_disp.sort(dim=1)
+         delta = disp_sorted[:, 1:] - disp_sorted[:, :-1]
+         delta_last = delta[:, -1:]
+         delta = torch.cat([delta, delta_last], dim=1)
+
+         delta = delta.unsqueeze(-1).unsqueeze(-1).expand_as(sigma)
+
+         alpha = 1.0 - torch.exp(-delta * sigma)
+
+         # Accumulated transmittance, shifted so plane i is attenuated by the planes in front of it.
+         transmittance = torch.cumprod(1 - alpha + 1e-7, dim=1)
+         shifted_transmittance = torch.ones_like(transmittance)
+         shifted_transmittance[:, 1:, :, :] = transmittance[:, :-1, :, :]
+
+         disparity = pred_mpi_disp.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, H, W)
+
+         disparity_map = (disparity * alpha * shifted_transmittance).sum(dim=1, keepdim=True)
+
+         return disparity_map
+
+     def get_layers(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         imagenet_features = self.feature_encoder.apply_feature_encoder(x)
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, imagenet_features, height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, imagenet_features, height, width)
+         return merged_feature_rgb, merged_feature_sigma
+
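For reference, `get_layer_depth` above composites the per-plane densities into a single disparity map with standard over-compositing; a sketch of the math as inferred from the code (not stated explicitly in this commit): with planes sorted by disparity $d_i$, plane spacing $\Delta_i$, and predicted density $\sigma_i$,

$$\alpha_i = 1 - e^{-\sigma_i \Delta_i}, \qquad T_i = \prod_{j<i} \left(1 - \alpha_j\right), \qquad \hat{D}(p) = \sum_i d_i\, \alpha_i(p)\, T_i(p),$$

where the small $10^{-7}$ added inside the cumulative product guards against zero transmittance.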
model_Small.py ADDED
@@ -0,0 +1,544 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import warnings
+ warnings.filterwarnings("ignore")
+ import torchvision
+ import parameters as params
+ import timm
+
+ class DinoV2FeatureExtractor(nn.Module):
+     def __init__(self, out_channels=256, out_size=(64, 64)):
+         super().__init__()
+         self.dino = timm.create_model('vit_base_patch14_dinov2.lvd142m', pretrained=False)
+         self.dino.eval()
+         for p in self.dino.parameters():
+             p.requires_grad = False
+
+         self.out_size = out_size
+         self.feat_proj = nn.Sequential(
+             nn.Conv2d(self.dino.embed_dim, out_channels, kernel_size=1),
+             nn.ReLU(),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.interpolate(x, size=(518, 518), mode='bilinear', align_corners=False)
+         patch_tokens = self.dino.forward_features(x)
+         patch_tokens = patch_tokens[:, 1:]  # drop the CLS token
+         B, N, C = patch_tokens.shape
+         h = w = int(N ** 0.5)
+         feat_map = patch_tokens.transpose(1, 2).reshape(B, C, h, w)  # [B, C, H', W']
+         feat_map = F.interpolate(feat_map, size=self.out_size, mode='bilinear', align_corners=False)
+         return self.feat_proj(feat_map)
+
+ def getLinearLayer(in_feat, out_feat, activation=nn.ReLU(True)):
+     return nn.Sequential(
+         nn.Linear(in_features=in_feat, out_features=out_feat, bias=True),
+         activation
+     )
+
+ def getConvLayer(in_channel, out_channel, stride=1, padding=1, activation=nn.ReLU()):
+     return nn.Sequential(nn.Conv2d(in_channel,
+                                    out_channel,
+                                    kernel_size=3,
+                                    stride=stride,
+                                    padding=padding,
+                                    padding_mode='reflect'),
+                          activation)
+
+ def getConvTransposeLayer(in_channel, out_channel, kernel=3, stride=1, padding=1, activation=nn.ReLU()):
+     return nn.Sequential(nn.ConvTranspose2d(in_channel,
+                                             out_channel,
+                                             kernel_size=kernel,
+                                             stride=stride,
+                                             padding=padding),
+                          activation)
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, stride=1):
+         super(ResidualBlock, self).__init__()
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
+         self.relu = nn.ReLU()
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
+         self.stride = stride
+
+         self.shortcut = nn.Sequential()
+         if stride != 1 or in_channels != out_channels:
+             self.shortcut = nn.Sequential(
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(out_channels)
+             )
+
+     def forward(self, x):
+         residual = x
+
+         out = self.conv1(x)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+
+         out = out + self.shortcut(residual)
+         out = self.relu(out)
+         return out
+
+
+ # Alternative bottleneck-style ResidualBlock, kept commented out in the commit:
+ # class ResidualBlock(nn.Module):
+ #     def __init__(self, in_channels, out_channels, stride=1, expansion=4):
+ #         super().__init__()
+ #         mid_channels = out_channels // expansion
+ #         self.pw_reduce = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False)
+ #         self.bn1 = nn.BatchNorm2d(mid_channels)
+ #         self.dw = nn.Conv2d(mid_channels, mid_channels, kernel_size=3,
+ #                             stride=stride, padding=1, groups=mid_channels, bias=False)
+ #         self.bn2 = nn.BatchNorm2d(mid_channels)
+ #         self.pw_expand = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False)
+ #         self.bn3 = nn.BatchNorm2d(out_channels)
+ #         self.relu = nn.ReLU(inplace=True)
+ #         self.stride = stride
+ #         if stride != 1 or in_channels != out_channels:
+ #             self.shortcut = nn.Sequential(
+ #                 nn.Conv2d(in_channels, out_channels, kernel_size=1,
+ #                           stride=stride, bias=False),
+ #                 nn.BatchNorm2d(out_channels),
+ #             )
+ #         else:
+ #             self.shortcut = nn.Identity()
+ #
+ #     def forward(self, x):
+ #         identity = x
+ #
+ #         out = self.pw_reduce(x)
+ #         out = self.bn1(out)
+ #         out = self.relu(out)
+ #
+ #         out = self.dw(out)
+ #         out = self.bn2(out)
+ #         out = self.relu(out)
+ #
+ #         out = self.pw_expand(out)
+ #         out = self.bn3(out)
+ #
+ #         out += self.shortcut(identity)
+ #         out = self.relu(out)
+ #         return out
+
+ class FeatureNet(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         # conv1..layer1 of ResNet-152 (stem + first stage): 256 channels at 1/4 resolution
+         model = torchvision.models.resnet152(pretrained=False)
+         layers = list(model.children())
+         self.FeatureEncoder = torch.nn.Sequential(*layers[:5].copy())
+         del model
+
+     def forward(self, x):
+         x = self.FeatureEncoder(x)
+         return x
+
+     def apply_feature_encoder(self, x):
+         x = self.FeatureEncoder(x)
+         return x
+
+ class Encoder(nn.Module):
+     def __init__(self, height, width, total_image_input=1):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.encoder_pre = ResidualBlock((total_image_input*3), 20)
+         self.encoder_layer1 = ResidualBlock(20, 30)
+         self.encoder_layer2 = ResidualBlock(30, 50)
+
+         self.encoder_layer3 = nn.Sequential(
+             ResidualBlock(50, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer4 = ResidualBlock(100, 100)
+         self.encoder_layer5 = nn.Sequential(
+             ResidualBlock(100, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer6 = ResidualBlock(100, 100)
+         self.encoder_layer7 = nn.Sequential(
+             ResidualBlock(100, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer8 = ResidualBlock(100, 100)
+         self.encoder_layer9 = nn.Sequential(
+             ResidualBlock(100, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+         self.encoder_layer10 = ResidualBlock(100, 100)
+         self.encoder_layer11 = ResidualBlock(100, 100)
+
+     def forward(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.encoder_pre(x)
+         x = self.encoder_layer1(x)
+         x = self.encoder_layer2(x)
+         skip1 = self.encoder_layer3(x)
+
+         x = self.encoder_layer4(skip1)
+         skip2 = self.encoder_layer5(x)
+
+         x = self.encoder_layer6(skip2)
+         skip3 = self.encoder_layer7(x)
+
+         x = self.encoder_layer8(skip3)
+         skip4 = self.encoder_layer9(x)
+
+         x = self.encoder_layer10(skip4)
+         x = self.encoder_layer11(x)
+
+         return x, [skip1, skip2, skip3, skip4]
+
+ class DecoderRGB(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(100, 100)
+         self.decoder_layer2 = ResidualBlock(100, 100)
+         self.decoder_layer3 = ResidualBlock(100, 100)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(100, 100)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(100, 100)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = ResidualBlock(100, 100)
+         self.decoder_layer12 = ResidualBlock(100, 96)
+         self.decoder_layer13 = ResidualBlock(96, 96)
+         self.decoder_layer14 = ResidualBlock(96, 96)
+         self.decoder_layer15 = nn.Sequential(
+             nn.Conv2d(96, 96, 3, stride=1, padding=1),
+             nn.Sigmoid()
+         )
+         self.decoder_layer16 = nn.Sequential(
+             nn.Conv2d(96, 96, 3, stride=1, padding=1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x, lower_skip_list, upper_skip_list, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3] + upper_skip_list[1]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2] + upper_skip_list[0]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1]
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = self.decoder_layer13(x)
+         x = self.decoder_layer14(x)
+         x = self.decoder_layer15(x)
+         x = self.decoder_layer16(x)
+         x = x.view(x.size()[0], 32, 3, height, width)  # 96 channels -> 32 RGB planes
+         return x
+
+ class DecoderSigma(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(100, 100)
+         self.decoder_layer2 = ResidualBlock(100, 100)
+         self.decoder_layer3 = ResidualBlock(100, 100)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(100, 100)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(100, 100)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 100)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(100, 50, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = nn.Sequential(
+             nn.Conv2d(50, 32, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+         self.decoder_layer12 = nn.Sequential(
+             nn.Conv2d(32, 32, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+
+     def forward(self, x, lower_skip_list, upper_skip_list, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3] + upper_skip_list[1]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2] + upper_skip_list[0]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1]
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         x = x.view(x.size()[0], 32, 1, height, width)  # 32 channels -> 32 density planes
+         return x
+
+
+ class DecoderDepth(nn.Module):
+     def __init__(self, height, width):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.decoder_layer1 = ResidualBlock(100, 100)
+         self.decoder_layer2 = ResidualBlock(100, 100)
+         self.decoder_layer3 = ResidualBlock(100, 100)
+
+         self.decoder_layer4 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer5 = ResidualBlock(100, 100)
+
+         self.decoder_layer6 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer7 = ResidualBlock(100, 100)
+
+         self.decoder_layer8 = nn.Sequential(
+             nn.ConvTranspose2d(100, 100, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer9 = ResidualBlock(100, 50)
+
+         self.decoder_layer10 = nn.Sequential(
+             nn.ConvTranspose2d(50, 20, 2, stride=2, padding=0),
+             nn.ReLU(True)
+         )
+         self.decoder_layer11 = nn.Sequential(
+             nn.Conv2d(20, 5, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+         self.decoder_layer12 = nn.Sequential(
+             nn.Conv2d(5, 1, 3, stride=1, padding=1),
+             nn.ReLU(True)
+         )
+
+     def forward(self, x, lower_skip_list, upper_skip_list, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         x = self.decoder_layer1(x)
+         x = self.decoder_layer2(x)
+         x = x + lower_skip_list[3] + upper_skip_list[1]
+
+         x = self.decoder_layer3(x)
+         x = self.decoder_layer4(x)
+         x = x + lower_skip_list[2] + upper_skip_list[0]
+
+         x = self.decoder_layer5(x)
+         x = self.decoder_layer6(x)
+         x = x + lower_skip_list[1]
+
+         x = self.decoder_layer7(x)
+         x = self.decoder_layer8(x)
+         x = x + lower_skip_list[0]
+
+         x = self.decoder_layer9(x)
+         x = self.decoder_layer10(x)
+         x = self.decoder_layer11(x)
+         x = self.decoder_layer12(x)
+         return x
+
+ class MMPI(nn.Module):
+     def __init__(self, total_image_input=1, height=384, width=384):
+         super().__init__()
+         self.height = height
+         self.width = width
+         self.feature_encoder = FeatureNet(height, width)
+         self.lower_encoder = Encoder(height, width, total_image_input)
+         self.merge_decoder_rgb = DecoderRGB(height, width)
+         self.merge_decoder_sigma = DecoderSigma(height, width)
+         self.depth_decoder = DecoderDepth(height, width)
+         self.upper_encoder_extra_1 = nn.Sequential(
+             ResidualBlock(256, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+         self.upper_encoder_extra_2 = nn.Sequential(
+             ResidualBlock(100, 100),
+             nn.MaxPool2d(kernel_size=2, stride=2)
+         )
+
+     def forward(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         upper_features_1 = self.feature_encoder.apply_feature_encoder(x)
+         upper_features_1 = self.upper_encoder_extra_1(upper_features_1)
+         upper_features_2 = self.upper_encoder_extra_2(upper_features_1)
+
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, [upper_features_1, upper_features_2])
+
+         return merged_feature_rgb, merged_feature_sigma, merged_feature_depth
+
+     def get_rgb_sigma(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         upper_features_1 = self.feature_encoder.apply_feature_encoder(x)
+         upper_features_1 = self.upper_encoder_extra_1(upper_features_1)
+         upper_features_2 = self.upper_encoder_extra_2(upper_features_1)
+
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+
+         return merged_feature_rgb, merged_feature_sigma
+
+     def get_depth(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         upper_features_1 = self.feature_encoder.apply_feature_encoder(x)
+         upper_features_1 = self.upper_encoder_extra_1(upper_features_1)
+         upper_features_2 = self.upper_encoder_extra_2(upper_features_1)
+
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_depth = self.depth_decoder(lower_feature, skip_list, [upper_features_1, upper_features_2])
+         return merged_feature_depth
+
+     def get_layer_depth(self, x, grid, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         upper_features_1 = self.feature_encoder.apply_feature_encoder(x)
+         upper_features_1 = self.upper_encoder_extra_1(upper_features_1)
+         upper_features_2 = self.upper_encoder_extra_2(upper_features_1)
+
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         rgb_layers = self.merge_decoder_rgb(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+         sigma_layers = self.merge_decoder_sigma(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+
+         # Stack the predicted planes into 1xSx4xHxW RGBA form; the random
+         # first entry is only a placeholder and is sliced off after the loop.
+         pred_mpi_planes = torch.randn((1, 4, height, width)).to(params.DEVICE)
+         for i in range(params.params_num_planes):
+             RGBA = torch.cat((rgb_layers[0, i, :, :, :], sigma_layers[0, i, :, :, :]), dim=0).unsqueeze(0)
+             pred_mpi_planes = torch.cat((pred_mpi_planes, RGBA), dim=0)
+
+         pred_mpi_planes = pred_mpi_planes[1:, :, :, :].unsqueeze(0)
+
+         sigma = pred_mpi_planes[:, :, 3, :, :]
+         B, D, H, W = sigma.shape
+
+         # Per-plane spacing along the disparity axis (last interval repeated).
+         pred_mpi_disp = grid
+         disp_sorted, _ = pred_mpi_disp.sort(dim=1)
+         delta = disp_sorted[:, 1:] - disp_sorted[:, :-1]
+         delta_last = delta[:, -1:]
+         delta = torch.cat([delta, delta_last], dim=1)
+
+         delta = delta.unsqueeze(-1).unsqueeze(-1).expand_as(sigma)
+
+         alpha = 1.0 - torch.exp(-delta * sigma)
+
+         # Accumulated transmittance, shifted so plane i is attenuated by the planes in front of it.
+         transmittance = torch.cumprod(1 - alpha + 1e-7, dim=1)
+         shifted_transmittance = torch.ones_like(transmittance)
+         shifted_transmittance[:, 1:, :, :] = transmittance[:, :-1, :, :]
+
+         disparity = pred_mpi_disp.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, H, W)
+
+         disparity_map = (disparity * alpha * shifted_transmittance).sum(dim=1, keepdim=True)
+
+         return disparity_map
+
+     def get_layers(self, x, height=None, width=None):
+         if height is None and width is None:
+             height = self.height
+             width = self.width
+
+         upper_features_1 = self.feature_encoder.apply_feature_encoder(x)
+         upper_features_1 = self.upper_encoder_extra_1(upper_features_1)
+         upper_features_2 = self.upper_encoder_extra_2(upper_features_1)
+
+         lower_feature, skip_list = self.lower_encoder(x, height, width)
+
+         merged_feature_rgb = self.merge_decoder_rgb(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+         merged_feature_sigma = self.merge_decoder_sigma(lower_feature, skip_list, [upper_features_1, upper_features_2], height, width)
+
+         return merged_feature_rgb, merged_feature_sigma
+
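As a quick orientation to the shapes involved, here is a hypothetical smoke test for the small model above (a sketch, not part of the commit; it assumes the file's imports, including `timm`, which is not pinned in `requirements.txt` below, are installed). The expected output shapes follow from the `view` calls in the decoders:

```python
# Hypothetical smoke test for model_Small.MMPI; not in the commit.
import torch
from model_Small import MMPI

model = MMPI(total_image_input=1, height=384, width=384).eval()
x = torch.rand(1, 3, 384, 384)  # a single RGB image in [0, 1]
with torch.no_grad():
    rgb, sigma, depth = model(x)
print(rgb.shape)    # expected torch.Size([1, 32, 3, 384, 384]) -- 32 RGB planes
print(sigma.shape)  # expected torch.Size([1, 32, 1, 384, 384]) -- 32 density planes
print(depth.shape)  # expected torch.Size([1, 1, 384, 384])     -- auxiliary depth head
```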
parameters.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ import torch
+
+ params_height = 256
+ params_width = 256
+ params_m = 32
+ params_number_input = 1
+ params_step_size = 2
+ params_gamma = 0.2
+ params_num_planes = 32
+
+ TRAIN_LOCATION = "./lf_train.txt"
+ VALIDATION_LOCATION = "./lf_validate.txt"
+ TEST_LOCATION = "./lf_test.txt"
+ LOG_FILE_LOCATION = "./logs/training_log_0.txt"
+ CHECKPOINT_LOCATION = "./checkpoint/"
+ RESUME_CHECKPOINT_LOCATION = "./checkpoint/checkpoint_best.pth"
+ START_CHECKPOINT_LOCATION = "./checkpoint/checkpoint_init.pth"
+ DEVICE = "cpu"
+
+ BATCH_SIZE = 32
+ LEARNING_RATE = 0.0001
+ NUM_EPOCHS = 150
+ START_EPOCH = 0
+ PRINT_INTERVAL = 20
+ T_max = 150
+
+ os.makedirs("./logs", exist_ok=True)
+ os.makedirs("./checkpoint", exist_ok=True)
+ os.makedirs("./output", exist_ok=True)
+
+ def uniform_planes(a: float, b: float, n: int) -> torch.Tensor:
+     """
+     Return n values uniformly spaced *within* (a, b),
+     i.e. excluding the exact endpoints a and b.
+     """
+     step = (b - a) / (n + 1)
+     # torch.arange(1, n+1) gives [1, 2, ..., n]
+     return a + step * torch.arange(1, n + 1, dtype=torch.float32)
+
+ def get_disparity_all_src():
+     d1 = uniform_planes(0.0, 0.4, 20)
+     d2 = uniform_planes(0.4, 1.0, 12)
+     disparities = torch.cat([d1, d2], dim=0)
+     return disparities
+
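A hypothetical sanity check of the plane layout defined above (not part of the commit): `get_disparity_all_src` returns 20 + 12 = 32 disparities, matching `params_num_planes`, with the bins denser near disparity zero (far planes):

```python
# Hypothetical check of the disparity bins; not in the commit.
import parameters as params

d = params.get_disparity_all_src()
print(d.shape)                         # torch.Size([32]) == params.params_num_planes
print(float(d.min()), float(d.max())) # strictly inside (0, 1); endpoints excluded
```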
post-install.sh ADDED
@@ -0,0 +1,2 @@
+ #!/bin/bash
+ pip install "pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git@89653419d0973396f3eff1a381ba09a07fffc2ed"
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ numpy==1.26.4
+ torch==2.1.0
+ torchvision==0.16.0
+ pytorch-lightning==2.1.3
+ pytorch-msssim==1.0.0
+ pytorchvideo==0.1.5
+ grpcio==1.57.0
+ opencv-contrib-python==4.10.0.84
+ opencv-python==4.6.0.66
+ pillow==10.4.0
+ pillow_heif==0.15.0
+ matplotlib==3.7.2
+ matplotlib-inline==0.1.6
+ transformers==4.43.3
+ tqdm==4.65.0
+ moviepy==1.0.3
+ scikit-image==0.21.0
+ scikit-learn==1.3.0
+ scipy==1.11.2
utils.py ADDED
@@ -0,0 +1,243 @@
+ import numpy as np
+ from scipy.ndimage import map_coordinates
+
+
+ def xyzcube(face_w):
+     '''
+     Return the xyz coordinates of the unit cube in [F R B L U D] format.
+     '''
+     out = np.zeros((face_w, face_w * 6, 3), np.float32)
+     rng = np.linspace(-0.5, 0.5, num=face_w, dtype=np.float32)
+     grid = np.stack(np.meshgrid(rng, -rng), -1)
+
+     # Front face (z = 0.5)
+     out[:, 0*face_w:1*face_w, [0, 1]] = grid
+     out[:, 0*face_w:1*face_w, 2] = 0.5
+
+     # Right face (x = 0.5)
+     out[:, 1*face_w:2*face_w, [2, 1]] = grid
+     out[:, 1*face_w:2*face_w, 0] = 0.5
+
+     # Back face (z = -0.5)
+     out[:, 2*face_w:3*face_w, [0, 1]] = grid
+     out[:, 2*face_w:3*face_w, 2] = -0.5
+
+     # Left face (x = -0.5)
+     out[:, 3*face_w:4*face_w, [2, 1]] = grid
+     out[:, 3*face_w:4*face_w, 0] = -0.5
+
+     # Up face (y = 0.5)
+     out[:, 4*face_w:5*face_w, [0, 2]] = grid
+     out[:, 4*face_w:5*face_w, 1] = 0.5
+
+     # Down face (y = -0.5)
+     out[:, 5*face_w:6*face_w, [0, 2]] = grid
+     out[:, 5*face_w:6*face_w, 1] = -0.5
+
+     return out
+
+
+ def equirect_uvgrid(h, w):
+     u = np.linspace(-np.pi, np.pi, num=w, dtype=np.float32)
+     v = np.linspace(np.pi, -np.pi, num=h, dtype=np.float32) / 2
+
+     return np.stack(np.meshgrid(u, v), axis=-1)
+
+
+ def equirect_facetype(h, w):
+     '''
+     0F 1R 2B 3L 4U 5D
+     '''
+     tp = np.roll(np.arange(4).repeat(w // 4)[None, :].repeat(h, 0), 3 * w // 8, 1)
+
+     # Prepare ceil mask (np.bool is removed in recent numpy; use the builtin bool)
+     mask = np.zeros((h, w // 4), bool)
+     idx = np.linspace(-np.pi, np.pi, w // 4) / 4
+     idx = h // 2 - np.round(np.arctan(np.cos(idx)) * h / np.pi).astype(int)
+     for i, j in enumerate(idx):
+         mask[:j, i] = 1
+     mask = np.roll(np.concatenate([mask] * 4, 1), 3 * w // 8, 1)
+
+     tp[mask] = 4
+     tp[np.flip(mask, 0)] = 5
+
+     return tp.astype(np.int32)
+
+
+ def xyzpers(h_fov, v_fov, u, v, out_hw, in_rot):
+     out = np.ones((*out_hw, 3), np.float32)
+
+     x_max = np.tan(h_fov / 2)
+     y_max = np.tan(v_fov / 2)
+     x_rng = np.linspace(-x_max, x_max, num=out_hw[1], dtype=np.float32)
+     y_rng = np.linspace(-y_max, y_max, num=out_hw[0], dtype=np.float32)
+     out[..., :2] = np.stack(np.meshgrid(x_rng, -y_rng), -1)
+     Rx = rotation_matrix(v, [1, 0, 0])
+     Ry = rotation_matrix(u, [0, 1, 0])
+     Ri = rotation_matrix(in_rot, np.array([0, 0, 1.0]).dot(Rx).dot(Ry))
+
+     return out.dot(Rx).dot(Ry).dot(Ri)
+
+
+ def xyz2uv(xyz):
+     '''
+     xyz: ndarray in shape of [..., 3]
+     '''
+     x, y, z = np.split(xyz, 3, axis=-1)
+     u = np.arctan2(x, z)
+     c = np.sqrt(x**2 + z**2)
+     v = np.arctan2(y, c)
+
+     return np.concatenate([u, v], axis=-1)
+
+
+ def uv2unitxyz(uv):
+     u, v = np.split(uv, 2, axis=-1)
+     y = np.sin(v)
+     c = np.cos(v)
+     x = c * np.sin(u)
+     z = c * np.cos(u)
+
+     return np.concatenate([x, y, z], axis=-1)
+
+
+ def uv2coor(uv, h, w):
+     '''
+     uv: ndarray in shape of [..., 2]
+     h: int, height of the equirectangular image
+     w: int, width of the equirectangular image
+     '''
+     u, v = np.split(uv, 2, axis=-1)
+     coor_x = (u / (2 * np.pi) + 0.5) * w - 0.5
+     coor_y = (-v / np.pi + 0.5) * h - 0.5
+
+     return np.concatenate([coor_x, coor_y], axis=-1)
+
+
+ def coor2uv(coorxy, h, w):
+     coor_x, coor_y = np.split(coorxy, 2, axis=-1)
+     u = ((coor_x + 0.5) / w - 0.5) * 2 * np.pi
+     v = -((coor_y + 0.5) / h - 0.5) * np.pi
+
+     return np.concatenate([u, v], axis=-1)
+
+
+ def sample_equirec(e_img, coor_xy, order):
+     w = e_img.shape[1]
+     coor_x, coor_y = np.split(coor_xy, 2, axis=-1)
+     pad_u = np.roll(e_img[[0]], w // 2, 1)
+     pad_d = np.roll(e_img[[-1]], w // 2, 1)
+     e_img = np.concatenate([e_img, pad_d, pad_u], 0)
+     return map_coordinates(e_img, [coor_y, coor_x],
+                            order=order, mode='wrap')[..., 0]
+
+
+ def sample_cubefaces(cube_faces, tp, coor_y, coor_x, order):
+     cube_faces = cube_faces.copy()
+     cube_faces[1] = np.flip(cube_faces[1], 1)
+     cube_faces[2] = np.flip(cube_faces[2], 1)
+     cube_faces[4] = np.flip(cube_faces[4], 0)
+
+     # Pad up down
+     pad_ud = np.zeros((6, 2, cube_faces.shape[2]))
+     pad_ud[0, 0] = cube_faces[5, 0, :]
+     pad_ud[0, 1] = cube_faces[4, -1, :]
+     pad_ud[1, 0] = cube_faces[5, :, -1]
+     pad_ud[1, 1] = cube_faces[4, ::-1, -1]
+     pad_ud[2, 0] = cube_faces[5, -1, ::-1]
+     pad_ud[2, 1] = cube_faces[4, 0, ::-1]
+     pad_ud[3, 0] = cube_faces[5, ::-1, 0]
+     pad_ud[3, 1] = cube_faces[4, :, 0]
+     pad_ud[4, 0] = cube_faces[0, 0, :]
+     pad_ud[4, 1] = cube_faces[2, 0, ::-1]
+     pad_ud[5, 0] = cube_faces[2, -1, ::-1]
+     pad_ud[5, 1] = cube_faces[0, -1, :]
+     cube_faces = np.concatenate([cube_faces, pad_ud], 1)
+
+     # Pad left right
+     pad_lr = np.zeros((6, cube_faces.shape[1], 2))
+     pad_lr[0, :, 0] = cube_faces[1, :, 0]
+     pad_lr[0, :, 1] = cube_faces[3, :, -1]
+     pad_lr[1, :, 0] = cube_faces[2, :, 0]
+     pad_lr[1, :, 1] = cube_faces[0, :, -1]
+     pad_lr[2, :, 0] = cube_faces[3, :, 0]
+     pad_lr[2, :, 1] = cube_faces[1, :, -1]
+     pad_lr[3, :, 0] = cube_faces[0, :, 0]
+     pad_lr[3, :, 1] = cube_faces[2, :, -1]
+     pad_lr[4, 1:-1, 0] = cube_faces[1, 0, ::-1]
+     pad_lr[4, 1:-1, 1] = cube_faces[3, 0, :]
+     pad_lr[5, 1:-1, 0] = cube_faces[1, -2, :]
+     pad_lr[5, 1:-1, 1] = cube_faces[3, -2, ::-1]
+     cube_faces = np.concatenate([cube_faces, pad_lr], 2)
+
+     return map_coordinates(cube_faces, [tp, coor_y, coor_x], order=order, mode='wrap')
+
+
+ def cube_h2list(cube_h):
+     assert cube_h.shape[0] * 6 == cube_h.shape[1]
+     return np.split(cube_h, 6, axis=1)
+
+
+ def cube_list2h(cube_list):
+     assert len(cube_list) == 6
+     assert sum(face.shape == cube_list[0].shape for face in cube_list) == 6
+     return np.concatenate(cube_list, axis=1)
+
+
+ def cube_h2dict(cube_h):
+     cube_list = cube_h2list(cube_h)
+     return dict([(k, cube_list[i])
+                  for i, k in enumerate(['F', 'R', 'B', 'L', 'U', 'D'])])
+
+
+ def cube_dict2h(cube_dict, face_k=['F', 'R', 'B', 'L', 'U', 'D']):
+     assert len(face_k) == 6
+     return cube_list2h([cube_dict[k] for k in face_k])
+
+
+ def cube_h2dice(cube_h):
+     assert cube_h.shape[0] * 6 == cube_h.shape[1]
+     w = cube_h.shape[0]
+     cube_dice = np.zeros((w * 3, w * 4, cube_h.shape[2]), dtype=cube_h.dtype)
+     cube_list = cube_h2list(cube_h)
+     # Order: F R B L U D
+     sxy = [(1, 1), (2, 1), (3, 1), (0, 1), (1, 0), (1, 2)]
+     for i, (sx, sy) in enumerate(sxy):
+         face = cube_list[i]
+         if i in [1, 2]:
+             face = np.flip(face, axis=1)
+         if i == 4:
+             face = np.flip(face, axis=0)
+         cube_dice[sy*w:(sy+1)*w, sx*w:(sx+1)*w] = face
+     return cube_dice
+
+
+ def cube_dice2h(cube_dice):
+     w = cube_dice.shape[0] // 3
+     assert cube_dice.shape[0] == w * 3 and cube_dice.shape[1] == w * 4
+     cube_h = np.zeros((w, w * 6, cube_dice.shape[2]), dtype=cube_dice.dtype)
+     # Order: F R B L U D
+     sxy = [(1, 1), (2, 1), (3, 1), (0, 1), (1, 0), (1, 2)]
+     for i, (sx, sy) in enumerate(sxy):
+         face = cube_dice[sy*w:(sy+1)*w, sx*w:(sx+1)*w]
+         if i in [1, 2]:
+             face = np.flip(face, axis=1)
+         if i == 4:
+             face = np.flip(face, axis=0)
+         cube_h[:, i*w:(i+1)*w] = face
+     return cube_h
+
+
+ def rotation_matrix(rad, ax):
+     ax = np.array(ax)
+     assert len(ax.shape) == 1 and ax.shape[0] == 3
+     ax = ax / np.sqrt((ax**2).sum())
+     R = np.diag([np.cos(rad)] * 3)
+     R = R + np.outer(ax, ax) * (1.0 - np.cos(rad))
+
+     ax = ax * np.sin(rad)
+     R = R + np.array([[0, -ax[2], ax[1]],
+                       [ax[2], 0, -ax[0]],
+                       [-ax[1], ax[0], 0]])
+
+     return R
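For reference, `rotation_matrix` above is Rodrigues' rotation formula (inferred from the code, not stated in the commit): for a unit axis $\hat{a}$ and angle $\theta$,

$$R = \cos\theta\, I + (1 - \cos\theta)\,\hat{a}\hat{a}^{\top} + \sin\theta\,[\hat{a}]_{\times},$$

where $[\hat{a}]_{\times}$ is the skew-symmetric cross-product matrix built in the final step of the function.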
utils/.DS_Store ADDED (binary, 6.15 kB)
utils/__init__.py ADDED (file without changes)
utils/__pycache__/__init__.cpython-38.pyc ADDED (binary, 164 Bytes)
utils/__pycache__/__init__.cpython-39.pyc ADDED (binary, 156 Bytes)
utils/__pycache__/rendererBackbone.cpython-39.pyc ADDED (binary, 4.02 kB)
utils/__pycache__/utils.cpython-38.pyc ADDED (binary, 4.03 kB)
utils/__pycache__/utils.cpython-39.pyc ADDED (binary, 4.04 kB)
utils/mpi/__init__.py ADDED (file without changes)
utils/mpi/__pycache__/__init__.cpython-38.pyc ADDED (binary, 168 Bytes)
utils/mpi/__pycache__/__init__.cpython-39.pyc ADDED (binary, 160 Bytes)
utils/mpi/__pycache__/homography_sampler.cpython-38.pyc ADDED (binary, 4.62 kB)
utils/mpi/__pycache__/homography_sampler.cpython-39.pyc ADDED (binary, 4.64 kB)
utils/mpi/__pycache__/mpi_rendering.cpython-38.pyc ADDED (binary, 7.43 kB)
utils/mpi/__pycache__/mpi_rendering.cpython-39.pyc ADDED (binary, 7.45 kB)
utils/mpi/__pycache__/rendering_utils.cpython-38.pyc ADDED (binary, 4.09 kB)
utils/mpi/__pycache__/rendering_utils.cpython-39.pyc ADDED (binary, 4.07 kB)
utils/mpi/homography_sampler.py ADDED
@@ -0,0 +1,159 @@
+ import torch
+ import numpy as np
+ from scipy.spatial.transform import Rotation
+
+
+ def inverse(matrices):
+     """
+     torch.inverse() sometimes produces outputs containing NaN when the batch size is 2.
+     Ref: https://github.com/pytorch/pytorch/issues/47272
+     This function keeps re-inverting the matrix until it succeeds or the maximum number of tries is reached.
+     :param matrices: Bx3x3
+     """
+     inv = None
+     max_tries = 5
+     while (inv is None) or (torch.isnan(inv)).any():
+         # torch.cuda.synchronize()
+         inv = torch.inverse(matrices)
+
+         # Break out of the loop when the inverse is successful or there are no more tries
+         max_tries -= 1
+         if max_tries == 0:
+             break
+
+     # Raise an Exception if the inverse still contains nan
+     if (torch.isnan(inv)).any():
+         raise Exception("Matrix inverse contains nan!")
+     return inv
+
+
+ class HomographySample:
+     def __init__(self, H_tgt, W_tgt, device=None):
+         if device is None:
+             self.device = torch.device("cpu")
+         else:
+             self.device = device
+
+         self.Height_tgt = H_tgt
+         self.Width_tgt = W_tgt
+         self.meshgrid = self.grid_generation(self.Height_tgt, self.Width_tgt, self.device)
+         self.meshgrid = self.meshgrid.permute(2, 0, 1).contiguous()  # 3xHxW
+
+         self.n = self.plane_normal_generation(self.device)
+
+     @staticmethod
+     def grid_generation(H, W, device):
+         x = np.linspace(0, W-1, W)
+         y = np.linspace(0, H-1, H)
+         xv, yv = np.meshgrid(x, y)  # HxW
+         xv = torch.from_numpy(xv.astype(np.float32)).to(dtype=torch.float32, device=device)
+         yv = torch.from_numpy(yv.astype(np.float32)).to(dtype=torch.float32, device=device)
+         ones = torch.ones_like(xv)
+         meshgrid = torch.stack((xv, yv, ones), dim=2)  # HxWx3
+         return meshgrid
+
+     @staticmethod
+     def plane_normal_generation(device):
+         n = torch.tensor([0, 0, 1], dtype=torch.float32, device=device)
+         return n
+
+     @staticmethod
+     def euler_to_rotation_matrix(x_angle, y_angle, z_angle, seq='xyz', degrees=False):
+         """
+         Return a rotation matrix rot_mtx that transforms tgt points into the src frame,
+         i.e., rot_mtx * p_tgt = p_src. Therefore the negatives of x/y/z_angle are used.
+         :param x_angle: rotation about x (roll)
+         :param y_angle: rotation about y (pitch)
+         :param z_angle: rotation about z (yaw)
+         :return: 3x3 rotation matrix
+         """
+         r = Rotation.from_euler(seq,
+                                 [-x_angle, -y_angle, -z_angle],
+                                 degrees=degrees)
+         rot_mtx = r.as_matrix().astype(np.float32)
+         return rot_mtx
+
+     def sample(self, src_BCHW, d_src_B,
+                G_tgt_src,
+                K_src_inv, K_tgt):
+         """
+         Coordinate system: x, y are the image directions, z points along the depth direction.
+         :param src_BCHW: torch float tensor in [0, 1], rgb/rgba, BxCxHxW.
+                          Assumed to be at position P = [I|0]
+         :param d_src_B: distance of the image plane to the src camera origin
+         :param G_tgt_src: Bx4x4
+         :param K_src_inv: Bx3x3
+         :param K_tgt: Bx3x3
+         :return: tgt_BCHW
+         """
+         # parameter processing ------ begin ------
+         B, channels, Height_src, Width_src = src_BCHW.size(0), src_BCHW.size(1), src_BCHW.size(2), src_BCHW.size(3)
+         R_tgt_src = G_tgt_src[:, 0:3, 0:3]
+         t_tgt_src = G_tgt_src[:, 0:3, 3]
+
+         Height_tgt = self.Height_tgt
+         Width_tgt = self.Width_tgt
+         # if R_src_tgt is None:
+         #     R_src_tgt = torch.eye(3, dtype=torch.float32, device=src_BCHW.device)
+         #     R_src_tgt = R_src_tgt.unsqueeze(0).expand(B, 3, 3)
+         # if t_src_tgt is None:
+         #     t_src_tgt = torch.tensor([0, 0, 0],
+         #                              dtype=torch.float32,
+         #                              device=src_BCHW.device)
+         #     t_src_tgt = t_src_tgt.unsqueeze(0).expand(B, 3)
+
+         # relationship between FoV and focal length:
+         # assume W > H
+         # W / 2 = f*tan(\theta / 2)
+         # here we default the horizontal FoV to 53.13 degrees;
+         # the vertical FoV can be computed from H/2 = W*tan(\theta/2)
+
+         R_tgt_src = R_tgt_src.to(device=src_BCHW.device)
+         t_tgt_src = t_tgt_src.to(device=src_BCHW.device)
+         K_src_inv = K_src_inv.to(device=src_BCHW.device)
+         K_tgt = K_tgt.to(device=src_BCHW.device)
+         # parameter processing ------ end ------
+
+         # the goal is to compute H_src_tgt, which maps a tgt pixel to a src pixel,
+         # so we compute H_tgt_src first and then invert it
+         n = self.n.to(device=src_BCHW.device)
+         n = n.unsqueeze(0).repeat(B, 1)  # Bx3
+         # Bx3x3 - (Bx3x1 * Bx1x3)
+         # note here we use -d_src, because the plane function is n^T * X - d_src = 0
+         d_src_B33 = d_src_B.reshape(B, 1, 1).repeat(1, 3, 3)  # B -> Bx3x3
+         R_tnd = R_tgt_src - torch.matmul(t_tgt_src.unsqueeze(2), n.unsqueeze(1)) / -d_src_B33
+         H_tgt_src = torch.matmul(K_tgt,
+                                  torch.matmul(R_tnd, K_src_inv))
+
+         # TODO: fix cuda inverse
+         with torch.no_grad():
+             H_src_tgt = inverse(H_tgt_src)
+
+         # create the tgt image grid, and map it to src
+         meshgrid_tgt_homo = self.meshgrid.to(src_BCHW.device)
+         # 3xHxW -> Bx3xHxW
+         meshgrid_tgt_homo = meshgrid_tgt_homo.unsqueeze(0).expand(B, 3, Height_tgt, Width_tgt)
+
+         # warp meshgrid_tgt_homo to meshgrid_src
+         meshgrid_tgt_homo_B3N = meshgrid_tgt_homo.view(B, 3, -1)  # Bx3xHW
+         meshgrid_src_homo_B3N = torch.matmul(H_src_tgt, meshgrid_tgt_homo_B3N)  # Bx3x3 * Bx3xHW -> Bx3xHW
+         # Bx3xHW -> Bx3xHxW -> BxHxWx3
+         meshgrid_src_homo = meshgrid_src_homo_B3N.view(B, 3, Height_tgt, Width_tgt).permute(0, 2, 3, 1)
+         meshgrid_src = meshgrid_src_homo[:, :, :, 0:2] / meshgrid_src_homo[:, :, :, 2:]  # BxHxWx2
+
+         valid_mask_x = torch.logical_and(meshgrid_src[:, :, :, 0] < Width_src,
+                                          meshgrid_src[:, :, :, 0] > -1)
+         valid_mask_y = torch.logical_and(meshgrid_src[:, :, :, 1] < Height_src,
+                                          meshgrid_src[:, :, :, 1] > -1)
+         valid_mask = torch.logical_and(valid_mask_x, valid_mask_y)  # BxHxW
+
+         # sample from src_BCHW
+         # normalize meshgrid_src to [-1, 1]
+         meshgrid_src[:, :, :, 0] = (meshgrid_src[:, :, :, 0] + 0.5) / (Width_src * 0.5) - 1
+         meshgrid_src[:, :, :, 1] = (meshgrid_src[:, :, :, 1] + 0.5) / (Height_src * 0.5) - 1
+         tgt_BCHW = torch.nn.functional.grid_sample(src_BCHW, grid=meshgrid_src, padding_mode='border',
+                                                    align_corners=False)
+         # BxCxHxW, BxHxW
+         return tgt_BCHW, valid_mask
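For reference, the warp built in `sample` is the standard plane-induced homography (inferred from the code, which places the source camera at $P = [I\,|\,0]$ and the plane at $n^{\top}X = d$ in the source frame):

$$H_{tgt \leftarrow src} = K_{tgt}\left(R_{tgt \leftarrow src} + \frac{t_{tgt \leftarrow src}\, n^{\top}}{d}\right)K_{src}^{-1},$$

which the code then inverts so that each target pixel can be pulled from the source image with `grid_sample`.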
utils/mpi/mpi_rendering.py ADDED
@@ -0,0 +1,272 @@
+ import torch
+
+ from utils.mpi.homography_sampler import HomographySample
+ from utils.mpi.rendering_utils import transform_G_xyz, sample_pdf, gather_pixel_by_pxpy
+
+
+ def render(rgb_BS3HW, sigma_BS1HW, xyz_BS3HW, use_alpha=False, is_bg_depth_inf=False):
+     if not use_alpha:
+         imgs_syn, depth_syn, blend_weights, weights = plane_volume_rendering(
+             rgb_BS3HW,
+             sigma_BS1HW,
+             xyz_BS3HW,
+             is_bg_depth_inf
+         )
+     else:
+         imgs_syn, weights = alpha_composition(sigma_BS1HW, rgb_BS3HW)
+         depth_syn, _ = alpha_composition(sigma_BS1HW, xyz_BS3HW[:, :, 2:])
+         # No rgb blending with alpha composition
+         blend_weights = torch.cumprod(1 - sigma_BS1HW + 1e-6, dim=1)
+         # blend_weights = torch.zeros_like(rgb_BS3HW).cuda()
+     return imgs_syn, depth_syn, blend_weights, weights
+
+
+ def alpha_composition(alpha_BK1HW, value_BKCHW):
+     """
+     Composition equation from 'Single-View View Synthesis with Multiplane Images'.
+     K is the number of planes, k=0 means the nearest plane, k=K-1 means the farthest plane.
+     :param alpha_BK1HW: alpha at each of the K planes
+     :param value_BKCHW: rgb/disparity at each of the K planes
+     :return:
+     """
+     B, K, _, H, W = alpha_BK1HW.size()
+     alpha_comp_cumprod = torch.cumprod(1 - alpha_BK1HW, dim=1)  # BxKx1xHxW
+
+     preserve_ratio = torch.cat((torch.ones((B, 1, 1, H, W), dtype=alpha_BK1HW.dtype, device=alpha_BK1HW.device),
+                                 alpha_comp_cumprod[:, 0:K-1, :, :, :]), dim=1)  # BxKx1xHxW
+     weights = alpha_BK1HW * preserve_ratio  # BxKx1xHxW
+     value_composed = torch.sum(value_BKCHW * weights, dim=1, keepdim=False)  # Bx3xHxW
+
+     return value_composed, weights
+
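In the notation of the docstring above, `alpha_composition` implements the over-compositing equation from the cited paper: with per-plane alphas $\alpha_k$ ordered front to back,

$$w_k = \alpha_k \prod_{j=0}^{k-1}\left(1 - \alpha_j\right), \qquad \hat{C} = \sum_{k=0}^{K-1} w_k\, c_k,$$

where $c_k$ is the per-plane RGB (or disparity) value and `preserve_ratio` is the shifted cumulative product of $(1-\alpha_j)$.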
+
+
+ def plane_volume_rendering(rgb_BS3HW, sigma_BS1HW, xyz_BS3HW, is_bg_depth_inf):
+     B, S, _, H, W = sigma_BS1HW.size()
+
+     xyz_diff_BS3HW = xyz_BS3HW[:, 1:, :, :, :] - xyz_BS3HW[:, 0:-1, :, :, :]  # Bx(S-1)x3xHxW
+     xyz_dist_BS1HW = torch.norm(xyz_diff_BS3HW, dim=2, keepdim=True)  # Bx(S-1)x1xHxW
+
+     xyz_dist_BS1HW = torch.cat((xyz_dist_BS1HW,
+                                 torch.full((B, 1, 1, H, W),
+                                            fill_value=1e3,
+                                            dtype=xyz_BS3HW.dtype,
+                                            device=xyz_BS3HW.device)),
+                                dim=1)  # BxSx1xHxW
+     transparency = torch.exp(-sigma_BS1HW * xyz_dist_BS1HW)  # BxSx1xHxW
+     alpha = 1 - transparency  # BxSx1xHxW
+
+     # add small eps to avoid zero transparency_acc
+     # torch.cumprod maps [a, b, c] -> [a, a*b, a*b*c]; we need [1, a, a*b]
+     transparency_acc = torch.cumprod(transparency + 1e-6, dim=1)  # BxSx1xHxW
+     transparency_acc = torch.cat((torch.ones((B, 1, 1, H, W), dtype=transparency.dtype, device=transparency.device),
+                                   transparency_acc[:, 0:-1, :, :, :]),
+                                  dim=1)  # BxSx1xHxW
+
+     weights = transparency_acc * alpha  # BxSx1xHxW
+     rgb_out, depth_out = weighted_sum_mpi(rgb_BS3HW, xyz_BS3HW, weights, is_bg_depth_inf)
+
+     return rgb_out, depth_out, transparency_acc, weights
+
+
+ def weighted_sum_mpi(rgb_BS3HW, xyz_BS3HW, weights, is_bg_depth_inf):
+     weights_sum = torch.sum(weights, dim=1, keepdim=False)  # Bx1xHxW
+     rgb_out = torch.sum(weights * rgb_BS3HW, dim=1, keepdim=False)  # Bx3xHxW
+
+     if is_bg_depth_inf:
+         # for the DTU dataset, set a large depth if weights_sum is small
+         depth_out = torch.sum(weights * xyz_BS3HW[:, :, 2:, :, :], dim=1, keepdim=False) \
+                     + (1 - weights_sum) * 1000
+     else:
+         depth_out = torch.sum(weights * xyz_BS3HW[:, :, 2:, :, :], dim=1, keepdim=False) \
+                     / (weights_sum + 1e-5)  # Bx1xHxW
+
+     return rgb_out, depth_out
+
+
+ def get_xyz_from_depth(meshgrid_homo,
+                        depth,
+                        K_inv):
+     """
+     :param meshgrid_homo: 3xHxW
+     :param depth: Bx1xHxW
+     :param K_inv: Bx3x3
+     :return:
+     """
+     H, W = meshgrid_homo.size(1), meshgrid_homo.size(2)
+     B, _, H_d, W_d = depth.size()
+     assert H == H_d and W == W_d
+
+     # 3xHxW -> Bx3xHxW
+     meshgrid_src_homo = meshgrid_homo.unsqueeze(0).repeat(B, 1, 1, 1)
+     meshgrid_src_homo_B3N = meshgrid_src_homo.reshape(B, 3, -1)
+     xyz_src = torch.matmul(K_inv, meshgrid_src_homo_B3N)  # Bx3xHW
+     xyz_src = xyz_src.reshape(B, 3, H, W) * depth  # Bx3xHxW
+
+     return xyz_src
+
+
+ def disparity_consistency_src_to_tgt(meshgrid_homo, K_src_inv, disparity_src,
+                                      G_tgt_src, K_tgt, disparity_tgt):
+     """
+     :param meshgrid_homo: 3xHxW
+     :param K_src_inv: Bx3x3
+     :param disparity_src: Bx1xHxW
+     :param G_tgt_src: Bx4x4
+     :param K_tgt: Bx3x3
+     :param disparity_tgt: Bx1xHxW
+     :return:
+     """
+     B, _, H, W = disparity_src.size()
+     depth_src = torch.reciprocal(disparity_src)
+     xyz_src_B3N = get_xyz_from_depth(meshgrid_homo, depth_src, K_src_inv).view(B, 3, H*W)
+
+     xyz_tgt_B3N = transform_G_xyz(G_tgt_src, xyz_src_B3N, is_return_homo=False)
+     K_xyz_tgt_B3N = torch.matmul(K_tgt, xyz_tgt_B3N)
+     pxpy_tgt_B2N = K_xyz_tgt_B3N[:, 0:2, :] / K_xyz_tgt_B3N[:, 2:, :]  # Bx2xN
+
+     pxpy_tgt_mask = torch.logical_and(
+         torch.logical_and(pxpy_tgt_B2N[:, 0:1, :] >= 0,
+                           pxpy_tgt_B2N[:, 0:1, :] <= W - 1),
+         torch.logical_and(pxpy_tgt_B2N[:, 1:2, :] >= 0,
+                           pxpy_tgt_B2N[:, 1:2, :] <= H - 1)
+     )  # Bx1xN
+
+     disparity_src = torch.reciprocal(xyz_tgt_B3N[:, 2:, :])  # Bx1xN
+     disparity_tgt = gather_pixel_by_pxpy(disparity_tgt, pxpy_tgt_B2N)  # Bx1xN
+
+     depth_diff = torch.abs(disparity_src - disparity_tgt)
+     return torch.mean(depth_diff[pxpy_tgt_mask])
+
+
+ def get_src_xyz_from_plane_disparity(meshgrid_src_homo,
+                                      mpi_disparity_src,
+                                      K_src_inv):
+     """
+     :param meshgrid_src_homo: 3xHxW
+     :param mpi_disparity_src: BxS
+     :param K_src_inv: Bx3x3
+     :return:
+     """
+     B, S = mpi_disparity_src.size()
+     H, W = meshgrid_src_homo.size(1), meshgrid_src_homo.size(2)
+     mpi_depth_src = torch.reciprocal(mpi_disparity_src)  # BxS
+
+     K_src_inv_Bs33 = K_src_inv.unsqueeze(1).repeat(1, S, 1, 1).reshape(B * S, 3, 3)
+
+     # 3xHxW -> BxSx3xHxW
+     meshgrid_src_homo = meshgrid_src_homo.unsqueeze(0).unsqueeze(1).repeat(B, S, 1, 1, 1)
+     meshgrid_src_homo_Bs3N = meshgrid_src_homo.reshape(B * S, 3, -1)
+     xyz_src = torch.matmul(K_src_inv_Bs33, meshgrid_src_homo_Bs3N)  # BSx3xHW
+     xyz_src = xyz_src.reshape(B, S, 3, H * W) * mpi_depth_src.unsqueeze(2).unsqueeze(3)  # BxSx3xHW
+     xyz_src_BS3HW = xyz_src.reshape(B, S, 3, H, W)
+
+     return xyz_src_BS3HW
+
+
+ def get_tgt_xyz_from_plane_disparity(xyz_src_BS3HW,
+                                      G_tgt_src):
+     """
+     :param xyz_src_BS3HW: BxSx3xHxW
+     :param G_tgt_src: Bx4x4
+     :return:
+     """
+     B, S, _, H, W = xyz_src_BS3HW.size()
+     G_tgt_src_Bs33 = G_tgt_src.unsqueeze(1).repeat(1, S, 1, 1).reshape(B*S, 4, 4)
+     xyz_tgt = transform_G_xyz(G_tgt_src_Bs33, xyz_src_BS3HW.reshape(B*S, 3, H*W))  # BSx3xHW
+     xyz_tgt_BS3HW = xyz_tgt.reshape(B, S, 3, H, W)  # BxSx3xHxW
+     return xyz_tgt_BS3HW
+
+
+ def render_tgt_rgb_depth(H_sampler: HomographySample,
+                          mpi_rgb_src,
+                          mpi_sigma_src,
+                          mpi_disparity_src,
+                          xyz_tgt_BS3HW,
+                          G_tgt_src,
+                          K_src_inv, K_tgt,
+                          use_alpha=False,
+                          is_bg_depth_inf=False):
+     """
+     :param H_sampler:
+     :param mpi_rgb_src: BxSx3xHxW
+     :param mpi_sigma_src: BxSx1xHxW
+     :param mpi_disparity_src: BxS
+     :param xyz_tgt_BS3HW: BxSx3xHxW
+     :param G_tgt_src: Bx4x4
+     :param K_src_inv: Bx3x3
+     :param K_tgt: Bx3x3
+     :return:
+     """
+     B, S, _, H, W = mpi_rgb_src.size()
+     mpi_depth_src = torch.reciprocal(mpi_disparity_src)  # BxS
+
+     # note that here we concat the mpi_src with xyz_tgt, because H_sampler will sample them for the tgt frame;
+     # mpi_src is the same in whatever frame, but xyz has to be in the tgt frame
+     mpi_xyz_src = torch.cat((mpi_rgb_src, mpi_sigma_src, xyz_tgt_BS3HW), dim=2)  # BxSx(3+1+3)xHxW
+
+     # homography warping of mpi_src into the tgt frame
+     G_tgt_src_Bs44 = G_tgt_src.unsqueeze(1).repeat(1, S, 1, 1).contiguous().reshape(B*S, 4, 4)  # BSx4x4
+     K_src_inv_Bs33 = K_src_inv.unsqueeze(1).repeat(1, S, 1, 1).contiguous().reshape(B*S, 3, 3)  # BSx3x3
+     K_tgt_Bs33 = K_tgt.unsqueeze(1).repeat(1, S, 1, 1).contiguous().reshape(B*S, 3, 3)  # BSx3x3
+
+     # BSxCxHxW, BSxHxW
+     tgt_mpi_xyz_BsCHW, tgt_mask_BsHW = H_sampler.sample(mpi_xyz_src.view(B*S, 7, H, W),
+                                                         mpi_depth_src.view(B*S),
+                                                         G_tgt_src_Bs44,
+                                                         K_src_inv_Bs33,
+                                                         K_tgt_Bs33)
+
+     # mpi composition
+     tgt_mpi_xyz = tgt_mpi_xyz_BsCHW.view(B, S, 7, H, W)
+     tgt_rgb_BS3HW = tgt_mpi_xyz[:, :, 0:3, :, :]
+     tgt_sigma_BS1HW = tgt_mpi_xyz[:, :, 3:4, :, :]
+     tgt_xyz_BS3HW = tgt_mpi_xyz[:, :, 4:, :, :]
+
+     tgt_mask_BSHW = tgt_mask_BsHW.view(B, S, H, W)
+     tgt_mask_BSHW = torch.where(tgt_mask_BSHW,
+                                 torch.ones((B, S, H, W), dtype=torch.float32, device=mpi_rgb_src.device),
+                                 torch.zeros((B, S, H, W), dtype=torch.float32, device=mpi_rgb_src.device))
+
+     # Bx3xHxW, Bx1xHxW, Bx1xHxW
+     tgt_z_BS1HW = tgt_xyz_BS3HW[:, :, -1:]
+     tgt_sigma_BS1HW = torch.where(tgt_z_BS1HW >= 0,
+                                   tgt_sigma_BS1HW,
+                                   torch.zeros_like(tgt_sigma_BS1HW, device=tgt_sigma_BS1HW.device))
+     tgt_rgb_syn, tgt_depth_syn, _, _ = render(tgt_rgb_BS3HW, tgt_sigma_BS1HW, tgt_xyz_BS3HW,
+                                               use_alpha=use_alpha,
+                                               is_bg_depth_inf=is_bg_depth_inf)
+     tgt_mask = torch.sum(tgt_mask_BSHW, dim=1, keepdim=True)  # Bx1xHxW
+
+     return tgt_rgb_syn, tgt_depth_syn, tgt_mask
+
+
+ def predict_mpi_coarse_to_fine(mpi_predictor, src_imgs, xyz_src_BS3HW_coarse,
+                                disparity_coarse_src, S_fine, is_bg_depth_inf):
+     if S_fine > 0:
+         with torch.no_grad():
+             # predict coarse mpi
+             mpi_coarse_src_list = mpi_predictor(src_imgs, disparity_coarse_src)  # BxS_coarsex4xHxW
+             mpi_coarse_rgb_src = mpi_coarse_src_list[0][:, :, 0:3, :, :]  # BxSx3xHxW
+             mpi_coarse_sigma_src = mpi_coarse_src_list[0][:, :, 3:, :, :]  # BxSx1xHxW
+             _, _, _, weights = plane_volume_rendering(
+                 mpi_coarse_rgb_src,
+                 mpi_coarse_sigma_src,
+                 xyz_src_BS3HW_coarse,
+                 is_bg_depth_inf
+             )
+             weights = weights.mean((2, 3, 4)).unsqueeze(1).unsqueeze(2)
+
+             # sample fine disparity
+             disparity_fine_src = sample_pdf(disparity_coarse_src.unsqueeze(1).unsqueeze(2), weights, S_fine)
+             disparity_fine_src = disparity_fine_src.squeeze(2).squeeze(1)
+
+         # assemble coarse and fine disparity
+         disparity_all_src = torch.cat((disparity_coarse_src, disparity_fine_src), dim=1)  # Bx(S_coarse + S_fine)
+         disparity_all_src, _ = torch.sort(disparity_all_src, dim=1, descending=True)
+         mpi_all_src_list = mpi_predictor(src_imgs, disparity_all_src)  # Bx(S_coarse+S_fine)x4xHxW
+         return mpi_all_src_list, disparity_all_src
+     else:
+         mpi_coarse_src_list = mpi_predictor(src_imgs, disparity_coarse_src)  # BxS_coarsex4xHxW
+         return mpi_coarse_src_list, disparity_coarse_src
utils/mpi/rendering_utils.py ADDED
@@ -0,0 +1,139 @@
+ import torch
+
+
+ def transform_G_xyz(G, xyz, is_return_homo=False):
+     """
+     Apply a rigid transform G to a batch of 3D points.
+     :param G: Bx4x4
+     :param xyz: Bx3xN
+     :return: Bx3xN, or Bx4xN in homogeneous coordinates if is_return_homo
+     """
+     assert len(G.size()) == len(xyz.size())
+     if len(G.size()) == 2:
+         G_B44 = G.unsqueeze(0)
+         xyz_B3N = xyz.unsqueeze(0)
+     else:
+         G_B44 = G
+         xyz_B3N = xyz
+     xyz_B4N = torch.cat((xyz_B3N, torch.ones_like(xyz_B3N[:, 0:1, :])), dim=1)
+     G_xyz_B4N = torch.matmul(G_B44, xyz_B4N)
+     if is_return_homo:
+         return G_xyz_B4N
+     else:
+         return G_xyz_B4N[:, 0:3, :]
+
+
+ def gather_pixel_by_pxpy(img, pxpy):
+     """
+     Gather per-pixel values from img at the given (px, py) coordinates.
+     :param img: Bx3xHxW
+     :param pxpy: Bx2xN
+     :return: BxCxN
+     """
+     with torch.no_grad():
+         B, C, H, W = img.size()
+         if pxpy.dtype == torch.float32:
+             pxpy_int = torch.round(pxpy).to(torch.int64)
+         else:
+             pxpy_int = pxpy.to(torch.int64)
+         pxpy_int[:, 0, :] = torch.clamp(pxpy_int[:, 0, :], min=0, max=W - 1)
+         pxpy_int[:, 1, :] = torch.clamp(pxpy_int[:, 1, :], min=0, max=H - 1)
+         pxpy_idx = pxpy_int[:, 0:1, :] + W * pxpy_int[:, 1:2, :]  # Bx1xN_pt
+         rgb = torch.gather(img.view(B, C, H * W), dim=2,
+                            index=pxpy_idx.repeat(1, C, 1))  # BxCxN_pt
+     return rgb
+
+
+ def uniformly_sample_disparity_from_bins(batch_size, disparity_np, device):
+     """
+     Draw one stratified disparity sample per bin. The bin edges must run from
+     large to small disparity, i.e. depth from small (near) to large (far).
+     :param batch_size: B
+     :param disparity_np: numpy array of S+1 disparity bin edges
+     :param device: device to place the samples on
+     :return: BxS disparity samples
+     """
+     assert disparity_np[0] > disparity_np[-1]
+     S = disparity_np.shape[0] - 1
+
+     B = batch_size
+     bin_edges = torch.from_numpy(disparity_np).to(dtype=torch.float32, device=device)  # S+1
+     interval = bin_edges[1:] - bin_edges[0:-1]  # S
+     bin_edges_start = bin_edges[0:-1].unsqueeze(0).repeat(B, 1)  # S -> BxS
+     interval = interval.unsqueeze(0).repeat(B, 1)  # S -> BxS
+
+     random_float = torch.rand((B, S), dtype=torch.float32, device=device)  # BxS
+     disparity_array = bin_edges_start + interval * random_float
+     return disparity_array  # BxS
+
+
+ def uniformly_sample_disparity_from_linspace_bins(batch_size, num_bins, start, end, device):
+     """
+     Draw one stratified disparity sample per bin of a uniform grid. Disparity must
+     run from large to small, i.e. depth from small (near) to large (far).
+     :param batch_size: B
+     :param num_bins: S
+     :param start: first bin edge (largest disparity, nearest depth)
+     :param end: last bin edge (smallest disparity, farthest depth)
+     :param device: device to place the samples on
+     :return: BxS disparity samples
+     """
+     assert start > end
+
+     B, S = batch_size, num_bins
+     bin_edges = torch.linspace(start, end, num_bins + 1, dtype=torch.float32, device=device)  # S+1
+     interval = bin_edges[1] - bin_edges[0]  # scalar
+     bin_edges_start = bin_edges[0:-1].unsqueeze(0).repeat(B, 1)  # S -> BxS
+
+     random_float = torch.rand((B, S), dtype=torch.float32, device=device)  # BxS
+     disparity_array = bin_edges_start + interval * random_float
+     return disparity_array  # BxS
+
+
+ def sample_pdf(values, weights, N_samples):
+     """
+     Draw samples from the distribution approximated by values and weights,
+     i.e. weights = p(values), via inverse-CDF sampling.
+     :param values: Bx1xNxS
+     :param weights: Bx1xNxS
+     :param N_samples: number of samples to draw
+     :return: Bx1xNxN_samples
+     """
+     B, N, S = weights.size(0), weights.size(2), weights.size(3)
+     assert values.size() == (B, 1, N, S)
+
+     # convert values to bin edges
+     bin_edges = (values[:, :, :, 1:] + values[:, :, :, :-1]) * 0.5  # Bx1xNx(S-1)
+     bin_edges = torch.cat((values[:, :, :, 0:1],
+                            bin_edges,
+                            values[:, :, :, -1:]), dim=3)  # Bx1xNx(S+1)
+
+     pdf = weights / (torch.sum(weights, dim=3, keepdim=True) + 1e-5)  # Bx1xNxS
+     cdf = torch.cumsum(pdf, dim=3)  # Bx1xNxS
+     cdf = torch.cat((torch.zeros((B, 1, N, 1), dtype=cdf.dtype, device=cdf.device),
+                      cdf), dim=3)  # Bx1xNx(S+1)
+
+     # uniform samples over the cdf values
+     u = torch.rand((B, 1, N, N_samples), dtype=weights.dtype, device=weights.device)  # Bx1xNxN_samples
+
+     # locate each sample on the cdf array
+     cdf_idx = torch.searchsorted(cdf, u, right=True)  # Bx1xNxN_samples
+     cdf_idx_lower = torch.clamp(cdf_idx - 1, min=0)  # Bx1xNxN_samples
+     cdf_idx_upper = torch.clamp(cdf_idx, max=S)  # Bx1xNxN_samples
+
+     # linear approximation within each bin
+     cdf_idx_lower_upper = torch.cat((cdf_idx_lower, cdf_idx_upper), dim=3)  # Bx1xNx(N_samples*2)
+     cdf_bounds_N2 = torch.gather(cdf, index=cdf_idx_lower_upper, dim=3)  # Bx1xNx(N_samples*2)
+     cdf_bounds = torch.stack((cdf_bounds_N2[..., 0:N_samples], cdf_bounds_N2[..., N_samples:]), dim=4)
+     bin_bounds_N2 = torch.gather(bin_edges, index=cdf_idx_lower_upper, dim=3)  # Bx1xNx(N_samples*2)
+     bin_bounds = torch.stack((bin_bounds_N2[..., 0:N_samples], bin_bounds_N2[..., N_samples:]), dim=4)
+
+     # avoid zero-width cdf intervals
+     cdf_intervals = cdf_bounds[:, :, :, :, 1] - cdf_bounds[:, :, :, :, 0]  # Bx1xNxN_samples
+     bin_intervals = bin_bounds[:, :, :, :, 1] - bin_bounds[:, :, :, :, 0]  # Bx1xNxN_samples
+     u_cdf_lower = u - cdf_bounds[:, :, :, :, 0]  # Bx1xNxN_samples
+     # cdf_intervals can be 0 because of the cdf_idx_lower/upper clamp above; fall back to the bin midpoint
+     t = u_cdf_lower / torch.clamp(cdf_intervals, min=1e-5)
+     t = torch.where(cdf_intervals <= 1e-4,
+                     torch.full_like(u_cdf_lower, 0.5),
+                     t)
+
+     samples = bin_bounds[:, :, :, :, 0] + t * bin_intervals
+     return samples
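These samplers implement stratified sampling over disparity bins (`uniformly_sample_disparity_from_linspace_bins` jitters one sample inside each bin of a uniform grid), while `sample_pdf` re-draws samples via the inverse CDF of a weight distribution. A minimal usage sketch; the bin count and disparity range are illustrative values, not ones prescribed by the repo:

```python
import torch
from utils.mpi.rendering_utils import uniformly_sample_disparity_from_linspace_bins

# one jittered disparity sample per bin; disparity runs from near (large) to far (small)
disp = uniformly_sample_disparity_from_linspace_bins(
    batch_size=1, num_bins=32, start=1.0, end=0.001, device="cpu"
)
print(disp.shape)  # torch.Size([1, 32])
```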
utils/rendererBackbone.py ADDED
@@ -0,0 +1,147 @@
+ # System Imports
+ import os
+ import math
+ import argparse
+ import time
+
+ # Common Libs
+ import numpy as np
+ from pathlib import Path
+ import cv2
+ import tkinter as tk
+ import threading
+ import queue
+
+ # Torch Imports
+ import torch
+ import torch.nn.functional as F
+ from torchvision import transforms
+ from torchvision.utils import save_image
+
+ # 3rd party imports
+ from transformers import DPTForDepthEstimation, DPTImageProcessor
+ from tqdm import tqdm
+ import mediapipe as mp
+ from PIL import Image, ImageTk
+ from moviepy.editor import ImageSequenceClip
+
+ # From Codebase
+ from utils.mpi import mpi_rendering
+ from utils.mpi.homography_sampler import HomographySample
+ from utils.utils import (
+     image_to_tensor,
+     disparity_to_tensor,
+     render_3dphoto,
+     render_novel_view,
+ )
+ from model.AdaMPI import MPIPredictor
+ from parameters import *
+
+
+ #=================================================
+ # Define the MPI Layers Processing Module Here
+ #=================================================
+ def processMPIs(src_imgs, mpi_all_src, disparity_all_src, k_src, k_tgt, save_path=None):
+     h, w = mpi_all_src.shape[-2:]
+     device = mpi_all_src.device
+     homography_sampler = HomographySample(h, w, device)
+     k_src_inv = torch.inverse(k_src)
+
+     # preprocess the predicted MPI
+     xyz_src_BS3HW = mpi_rendering.get_src_xyz_from_plane_disparity(
+         homography_sampler.meshgrid,
+         disparity_all_src,
+         k_src_inv,
+     )
+     mpi_all_rgb_src = mpi_all_src[:, :, 0:3, :, :]  # BxSx3xHxW
+     mpi_all_sigma_src = mpi_all_src[:, :, 3:, :, :]  # BxSx1xHxW
+     _, _, blend_weights, _ = mpi_rendering.render(
+         mpi_all_rgb_src,
+         mpi_all_sigma_src,
+         xyz_src_BS3HW,
+         use_alpha=False,
+         is_bg_depth_inf=False,
+     )
+     mpi_all_rgb_src = blend_weights * src_imgs.unsqueeze(1) + (1 - blend_weights) * mpi_all_rgb_src
+
+     return mpi_all_rgb_src, mpi_all_sigma_src, disparity_all_src, k_src_inv, k_tgt, homography_sampler
+
+
+ def cropFOV(image, original_fov, new_fov):
+     """Center-crop an image so its field of view shrinks from original_fov to new_fov."""
+     image = np.array(image)
+     if new_fov >= original_fov:
+         raise ValueError("New FoV must be smaller than the original FoV")
+
+     crop_ratio = new_fov / original_fov
+     height, width = image.shape[:2]
+
+     new_width = int(width * crop_ratio)
+     new_height = int(height * crop_ratio)
+
+     start_x = (width - new_width) // 2
+     start_y = (height - new_height) // 2
+
+     cropped_image = image[start_y:start_y + new_height, start_x:start_x + new_width]
+     cropped_image = Image.fromarray(cropped_image)
+     return cropped_image
+
+
+ def renderSingleFrame(mpi_all_rgb_src, mpi_all_sigma_src, disparity_all_src, cam_ext, k_src_inv, k_tgt, homography_sampler):
+     # `device` is provided by the star import from parameters above
+     frame = render_novel_view(
+         mpi_all_rgb_src,
+         mpi_all_sigma_src,
+         disparity_all_src,
+         cam_ext.to(device),
+         k_src_inv,
+         k_tgt,
+         homography_sampler,
+     )
+     frame_np = frame[0].permute(1, 2, 0).contiguous().cpu().numpy()  # [h,w,3]
+     frame_np = np.clip(np.round(frame_np * 255), a_min=0, a_max=255).astype(np.uint8)
+     im = Image.fromarray(frame_np)
+     return im
+
+
+ class VideoCapture:
+     """Threaded cv2.VideoCapture wrapper whose read() always returns the latest frame."""
+
+     def __init__(self, name):
+         self.cap = cv2.VideoCapture(name)
+         self.q = queue.Queue()
+         t = threading.Thread(target=self._reader)
+         t.daemon = True
+         t.start()
+
+     def _reader(self):
+         # keep only the most recent frame in the queue, discarding stale ones
+         while True:
+             ret, frame = self.cap.read()
+             if not ret:
+                 break
+             if not self.q.empty():
+                 try:
+                     self.q.get_nowait()
+                 except queue.Empty:
+                     pass
+             self.q.put(frame)
+
+     def read(self):
+         return self.q.get()
+
+
+ def captureBackground(capture_device):
+     frame_background = capture_device.read()
+     img = cv2.cvtColor(frame_background, cv2.COLOR_BGR2RGB)
+     im_pil = Image.fromarray(img)
+     return im_pil
+
+
+ def getImageTensor(pil_image, height, width, unsqueeze=True):
+     t = transforms.Compose([transforms.CenterCrop((height, width)), transforms.ToTensor()])
+     rgb = t(pil_image)
+
+     if unsqueeze:
+         rgb = rgb.unsqueeze(0)
+     return rgb
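A sketch of how these capture helpers chain together for a live webcam source; the device index and crop size below are illustrative, not values fixed by the repo:

```python
from utils.rendererBackbone import VideoCapture, captureBackground, getImageTensor

cap = VideoCapture(0)                       # threaded capture; read() returns the latest frame
background = captureBackground(cap)         # one frame as an RGB PIL image
rgb = getImageTensor(background, 256, 384)  # 1x3x256x384 tensor after center crop
```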
utils/utils.py ADDED
@@ -0,0 +1,150 @@
+ import os
+ import math
+ from PIL import Image
+ import cv2
+ from tqdm import tqdm
+ import torch
+ import torch.nn.functional as F
+ from torchvision import transforms
+ from torchvision.utils import save_image
+ import numpy as np
+ from moviepy.editor import ImageSequenceClip
+
+ from utils.mpi import mpi_rendering
+ from utils.mpi.homography_sampler import HomographySample
+
+
+ def image_to_tensor(img_path, unsqueeze=True):
+     rgb = transforms.ToTensor()(Image.open(img_path))
+     if unsqueeze:
+         rgb = rgb.unsqueeze(0)
+     return rgb
+
+
+ def disparity_to_tensor(disp_path, unsqueeze=True):
+     # read a 16-bit disparity map and normalize it to [0, 1]
+     disp = cv2.imread(disp_path, -1) / (2 ** 16 - 1)
+     disp = torch.from_numpy(disp)[None, ...]
+     if unsqueeze:
+         disp = disp.unsqueeze(0)
+     return disp.float()
+
+
+ def gen_swing_path(num_frames=90, r_x=0.14, r_y=0., r_z=0.10):
+     """Return a list of [4, 4] camera pose matrices along a swing trajectory."""
+     t = torch.arange(num_frames) / (num_frames - 1)
+     poses = torch.eye(4).repeat(num_frames, 1, 1)
+     poses[:, 0, 3] = r_x * torch.sin(2. * math.pi * t)
+     poses[:, 1, 3] = r_y * torch.cos(2. * math.pi * t)
+     poses[:, 2, 3] = r_z * (torch.cos(2. * math.pi * t) - 1.)
+     return poses.unbind()
+
+
+ def render_3dphoto(
+     src_imgs,  # [b,3,h,w]
+     mpi_all_src,  # [b,s,4,h,w]
+     disparity_all_src,  # [b,s]
+     k_src,  # [b,3,3]
+     k_tgt,  # [b,3,3]
+     save_path,
+ ):
+     h, w = mpi_all_src.shape[-2:]
+     device = mpi_all_src.device
+     homography_sampler = HomographySample(h, w, device)
+     k_src_inv = torch.inverse(k_src)
+
+     # preprocess the predicted MPI
+     xyz_src_BS3HW = mpi_rendering.get_src_xyz_from_plane_disparity(
+         homography_sampler.meshgrid,
+         disparity_all_src,
+         k_src_inv,
+     )
+     mpi_all_rgb_src = mpi_all_src[:, :, 0:3, :, :]  # BxSx3xHxW
+     mpi_all_sigma_src = mpi_all_src[:, :, 3:, :, :]  # BxSx1xHxW
+     _, _, blend_weights, _ = mpi_rendering.render(
+         mpi_all_rgb_src,
+         mpi_all_sigma_src,
+         xyz_src_BS3HW,
+         use_alpha=False,
+         is_bg_depth_inf=False,
+     )
+     mpi_all_rgb_src = blend_weights * src_imgs.unsqueeze(1) + (1 - blend_weights) * mpi_all_rgb_src
+
+     # render novel views
+     swing_path_list = gen_swing_path()
+     frames = []
+     for cam_ext in tqdm(swing_path_list):
+         frame = render_novel_view(
+             mpi_all_rgb_src,
+             mpi_all_sigma_src,
+             disparity_all_src,
+             cam_ext,
+             k_src_inv,
+             k_tgt,
+             homography_sampler,
+         )
+         frame_np = frame[0].permute(1, 2, 0).contiguous().cpu().numpy()  # [h,w,3]
+         frame_np = np.clip(np.round(frame_np * 255), a_min=0, a_max=255).astype(np.uint8)
+         frames.append(frame_np)
+     rgb_clip = ImageSequenceClip(frames, fps=30)
+     rgb_clip.write_videofile(save_path, verbose=False, codec='mpeg4', logger=None, bitrate='2000k')
+
+
+ def render_novel_view(
+     mpi_all_rgb_src,
+     mpi_all_sigma_src,
+     disparity_all_src,
+     G_tgt_src,
+     K_src_inv,
+     K_tgt,
+     homography_sampler,
+ ):
+     xyz_src_BS3HW = mpi_rendering.get_src_xyz_from_plane_disparity(
+         homography_sampler.meshgrid,
+         disparity_all_src,
+         K_src_inv
+     )
+
+     xyz_tgt_BS3HW = mpi_rendering.get_tgt_xyz_from_plane_disparity(
+         xyz_src_BS3HW,
+         G_tgt_src
+     )
+
+     tgt_imgs_syn, _, _ = mpi_rendering.render_tgt_rgb_depth(
+         homography_sampler,
+         mpi_all_rgb_src,
+         mpi_all_sigma_src,
+         disparity_all_src,
+         xyz_tgt_BS3HW,
+         G_tgt_src,
+         K_src_inv,
+         K_tgt,
+         use_alpha=False,
+         is_bg_depth_inf=False,
+     )
+
+     return tgt_imgs_syn
+
+
+ class AverageMeter(object):
+     """Computes and stores the average and current value"""
+     def __init__(self, name, fmt=":f"):
+         self.name = name
+         self.fmt = fmt
+         self.reset()
+
+     def reset(self):
+         self.val = 0
+         self.avg = 0
+         self.sum = 0
+         self.count = 0
+
+     def update(self, val, n=1):
+         self.val = val
+         self.sum += val * n
+         self.count += n
+         self.avg = self.sum / self.count
+
+     def __str__(self):
+         return f"{self.name:s}: {self.avg:.6f}"
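Tying `utils/utils.py` together: `gen_swing_path` yields the camera extrinsics that `render_3dphoto` feeds one by one to `render_novel_view`. A condensed sketch of that loop, assuming `mpi_rgb`, `mpi_sigma`, `disparity`, `k_inv`, `k`, and `sampler` are placeholder names for tensors obtained exactly as inside `render_3dphoto`:

```python
from utils.utils import gen_swing_path, render_novel_view

for cam_ext in gen_swing_path(num_frames=90):
    frame = render_novel_view(mpi_rgb, mpi_sigma, disparity, cam_ext, k_inv, k, sampler)
    # frame: [b,3,h,w] synthesized target view with values in [0, 1]
```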