3DTopia-XL_space

Runtime error

App Files Files Community

FrozenBurning committed on Aug 9, 2024

Commit

81ecb2b

1 Parent(s): 06ea84f

single view to 3D init release

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +4 -0
README.md +2 -1
app.py +209 -0
assets/examples/blue_cat.png +0 -0
assets/examples/bubble_mart_blue.png +0 -0
assets/examples/bulldog.png +0 -0
assets/examples/ceramic.png +0 -0
assets/examples/chair_watermelon.png +0 -0
assets/examples/cup_rgba.png +0 -0
assets/examples/cute_horse.jpg +0 -0
assets/examples/earphone.jpg +0 -0
assets/examples/firedragon.png +0 -0
assets/examples/fox.jpg +0 -0
assets/examples/fruit_elephant.jpg +0 -0
assets/examples/hatsune_miku.png +0 -0
assets/examples/ikun_rgba.png +0 -0
assets/examples/mailbox.png +0 -0
assets/examples/mario.png +0 -0
assets/examples/mei_ling_panda.png +0 -0
assets/examples/mushroom_teapot.jpg +0 -0
assets/examples/pikachu.png +0 -0
assets/examples/potplant_rgba.png +0 -0
assets/examples/seed_frog.png +0 -0
assets/examples/shuai_panda_notail.png +0 -0
assets/examples/yellow_duck.png +0 -0
configs/inference_dit.yml +97 -0
dva/__init__.py +5 -0
dva/attr_dict.py +66 -0
dva/geom.py +653 -0
dva/io.py +56 -0
dva/layers.py +157 -0
dva/losses.py +239 -0
dva/mvp/extensions/mvpraymarch/bvh.cu +292 -0
dva/mvp/extensions/mvpraymarch/cudadispatch.h +104 -0
dva/mvp/extensions/mvpraymarch/helper_math.h +1453 -0
dva/mvp/extensions/mvpraymarch/makefile +2 -0
dva/mvp/extensions/mvpraymarch/mvpraymarch.cpp +405 -0
dva/mvp/extensions/mvpraymarch/mvpraymarch.py +559 -0
dva/mvp/extensions/mvpraymarch/mvpraymarch_kernel.cu +208 -0
dva/mvp/extensions/mvpraymarch/mvpraymarch_subset_kernel.h +218 -0
dva/mvp/extensions/mvpraymarch/primaccum.h +101 -0
dva/mvp/extensions/mvpraymarch/primsampler.h +94 -0
dva/mvp/extensions/mvpraymarch/primtransf.h +182 -0
dva/mvp/extensions/mvpraymarch/setup.py +30 -0
dva/mvp/extensions/mvpraymarch/utils.h +847 -0
dva/mvp/extensions/utils/helper_math.h +1453 -0
dva/mvp/extensions/utils/makefile +2 -0
dva/mvp/extensions/utils/setup.py +29 -0
dva/mvp/extensions/utils/utils.cpp +137 -0
dva/mvp/extensions/utils/utils.py +211 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__
+build
+*.so
+runs

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
-title: 3DTopia XL
 emoji: 🌖
 colorFrom: green
 colorTo: pink
 sdk: gradio
 sdk_version: 4.41.0
 app_file: app.py
 pinned: false
 ---

 ---
+title: 3DTopia-XL
 emoji: 🌖
 colorFrom: green
 colorTo: pink
 sdk: gradio
 sdk_version: 4.41.0
+python_version: 3.9
 app_file: app.py
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import os
+import imageio
+import numpy as np
+os.system("bash install.sh")
+from omegaconf import OmegaConf
+import tqdm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
+import rembg
+import gradio as gr
+from dva.io import load_from_config
+from dva.ray_marcher import RayMarcher
+from dva.visualize import visualize_primvolume, visualize_video_primvolume
+from inference import remove_background, resize_foreground, extract_texmesh
+from models.diffusion import create_diffusion
+from huggingface_hub import hf_hub_download
+# Fetch the single-view DiT checkpoint and the VAE checkpoint from the Hugging Face Hub.
+ckpt_path = hf_hub_download(repo_id="frozenburning/3DTopia-XL", filename="model_sview_dit_fp16.pt")
+vae_ckpt_path = hf_hub_download(repo_id="frozenburning/3DTopia-XL", filename="model_vae_fp16.pt")
+# File names of the artifacts handed back to the UI; process() joins them with config.output_dir.
+GRADIO_PRIM_VIDEO_PATH = 'prim.mp4'
+GRADIO_RGB_VIDEO_PATH = 'rgb.mp4'
+GRADIO_MAT_VIDEO_PATH = 'mat.mp4'
+GRADIO_GLB_PATH = 'pbr_mesh.glb'
+CONFIG_PATH = "./configs/inference_dit.yml"
+config = OmegaConf.load(CONFIG_PATH)
+# The checkpoint entries are empty in the YAML file; fill them with the downloaded paths.
+config.checkpoint_path = ckpt_path
+config.model.vae_checkpoint_path = vae_ckpt_path
+# model
+# Generator: instantiated from config, weights taken from the 'ema' entry of the checkpoint.
+model = load_from_config(config.model.generator)
+state_dict = torch.load(config.checkpoint_path, map_location='cpu')
+model.load_state_dict(state_dict['ema'])
+# VAE: weights taken from the 'model_state_dict' entry of its checkpoint.
+vae = load_from_config(config.model.vae)
+vae_state_dict = torch.load(config.model.vae_checkpoint_path, map_location='cpu')
+vae.load_state_dict(vae_state_dict['model_state_dict'])
+# The conditioner is built from config only; no checkpoint is loaded for it here.
+conditioner = load_from_config(config.model.conditioner)
+# Use CUDA when available, otherwise fall back to CPU.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+vae = vae.to(device)
+conditioner = conditioner.to(device)
+model = model.to(device)
+model.eval()
+# Both values are forwarded to the generator through model_kwargs in process().
+amp = True
+precision_dtype = torch.float16
+# Ray marcher, later passed to visualize_video_primvolume in process().
+rm = RayMarcher(
+    config.image_height,
+    config.image_width,
+    **config.rm,
+).to(device)
+# Per-channel latent statistics are optional; when present they are broadcast as [1, 1, C].
+perchannel_norm = False
+if "latent_mean" in config.model:
+    latent_mean = torch.Tensor(config.model.latent_mean)[None, None, :].to(device)
+    latent_std = torch.Tensor(config.model.latent_std)[None, None, :].to(device)
+    assert latent_mean.shape[-1] == config.model.generator.in_channels
+    perchannel_norm = True
+# process() passes timestep_respacing explicitly next to **config.diffusion, so drop the config entry.
+config.diffusion.pop("timestep_respacing")
+# Remove the sub-configs and latent statistics so that only the remaining keys of
+# config.model are handed to load_from_config for model_primx below.
+# NOTE(review): process() still reads config.model.latent_nf after this pop - confirm it resolves at runtime.
+config.model.pop("vae")
+config.model.pop("vae_checkpoint_path")
+config.model.pop("conditioner")
+config.model.pop("generator")
+config.model.pop("latent_nf")
+config.model.pop("latent_mean")
+config.model.pop("latent_std")
+model_primx = load_from_config(config.model)
+# load rembg
+rembg_session = rembg.new_session()
+# process function
+def process(input_image, input_num_steps=25, input_seed=42, input_cfg=6.0):
+    # seed
+    torch.manual_seed(input_seed)
+    os.makedirs(config.output_dir, exist_ok=True)
+    output_rgb_video_path = os.path.join(config.output_dir, GRADIO_RGB_VIDEO_PATH)
+    output_prim_video_path = os.path.join(config.output_dir, GRADIO_PRIM_VIDEO_PATH)
+    output_mat_video_path = os.path.join(config.output_dir, GRADIO_MAT_VIDEO_PATH)
+    output_glb_path = os.path.join(config.output_dir, GRADIO_GLB_PATH)
+    diffusion = create_diffusion(timestep_respacing=respacing, **config.diffusion)
+    sample_fn = diffusion.ddim_sample_loop_progressive
+    fwd_fn = model.forward_with_cfg
+    # text-conditioned
+    if input_image is None:
+        raise NotImplementedError
+    # image-conditioned (may also input text, but no text usually works too)
+    else:
+        input_image = remove_background(input_image, rembg_session)
+        input_image = resize_foreground(input_image, 0.85)
+        raw_image = np.array(input_image)
+        mask = (raw_image[..., -1][..., None] > 0) * 1
+        raw_image = raw_image[..., :3] * mask
+        input_cond = torch.from_numpy(np.array(raw_image)[None, ...]).to(device)
+    with torch.no_grad():
+        latent = torch.randn(1, config.model.num_prims, 1, 4, 4, 4)
+        batch = {}
+        inf_bs = 1
+        inf_x = torch.randn(inf_bs, config.model.num_prims, 68).to(device)
+        y = conditioner.encoder(input_cond)
+        model_kwargs = dict(y=y[:inf_bs, ...], precision_dtype=precision_dtype, enable_amp=amp)
+        if input_cfg >= 0:
+            model_kwargs['cfg_scale'] = input_cfg
+        for samples in sample_fn(fwd_fn, inf_x.shape, inf_x, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device):
+            final_samples = samples
+        recon_param = final_samples["sample"].reshape(inf_bs, config.model.num_prims, -1)
+        if perchannel_norm:
+            recon_param = recon_param / config.model.latent_nf * latent_std + latent_mean
+        recon_srt_param = recon_param[:, :, 0:4]
+        recon_feat_param = recon_param[:, :, 4:] # [8, 2048, 64]
+        recon_feat_param_list = []
+        # one-by-one to avoid oom
+        for inf_bidx in range(inf_bs):
+            if not perchannel_norm:
+                decoded = vae.decode(recon_feat_param[inf_bidx, ...].reshape(1*config.model.num_prims, *latent.shape[-4:]) / config.model.latent_nf)
+            else:
+                decoded = vae.decode(recon_feat_param[inf_bidx, ...].reshape(1*config.model.num_prims, *latent.shape[-4:]))
+            recon_feat_param_list.append(decoded.detach())
+        recon_feat_param = torch.concat(recon_feat_param_list, dim=0)
+        # invert normalization
+        if not perchannel_norm:
+            recon_srt_param[:, :, 0:1] = (recon_srt_param[:, :, 0:1] / 10) + 0.05
+        recon_feat_param[:, 0:1, ...] /= 5.
+        recon_feat_param[:, 1:, ...] = (recon_feat_param[:, 1:, ...] + 1) / 2.
+        recon_feat_param = recon_feat_param.reshape(inf_bs, config.model.num_prims, -1)
+        recon_param = torch.concat([recon_srt_param, recon_feat_param], dim=-1)
+        visualize_video_primvolume(config.output_dir, batch, recon_param, 60, rm, device)
+        prim_params = {'srt_param': recon_srt_param[0].detach().cpu(), 'feat_param': recon_feat_param[0].detach().cpu()}
+        torch.save({'model_state_dict': prim_params}, "{}/denoised.pt".format(config.output_dir))
+    # exporting GLB mesh
+    denoise_param_path = os.path.join(config.output_dir, 'denoised.pt')
+    primx_ckpt_weight = torch.load(denoise_param_path, map_location='cpu')['model_state_dict']
+    model_primx.load_state_dict(ckpt_weight)
+    model_primx.to(device)
+    model_primx.eval()
+    with torch.no_grad():
+        model_primx.srt_param[:, 1:4] *= 0.85
+        extract_texmesh(config.inference, model_primx, output_glb_path, device)
+    return output_rgb_video_path, output_prim_video_path, output_mat_video_path, output_glb_path
+# gradio UI
+# Page title and the Markdown/HTML header shown above the controls.
+_TITLE = '''3DTopia-XL'''
+_DESCRIPTION = '''
+<div>
+<a style="display:inline-block" href="https://frozenburning.github.io/projects/3DTopia-XL/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
+<a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/3DTopia-XL"><img src='https://img.shields.io/github/stars/3DTopia/3DTopia-XL?style=social'/></a>
+</div>
+* Now we offer 1) single image conditioned model, we will release 2) multiview images conditioned model and 3) pure text conditioned model in the future!
+* If you find the output unsatisfying, try using different seeds!
+'''
+block = gr.Blocks(title=_TITLE).queue()
+with block:
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown('# ' + _TITLE)
+    gr.Markdown(_DESCRIPTION)
+    with gr.Row(variant='panel'):
+        # Left column: inputs and the generate button.
+        with gr.Column(scale=1):
+            # input image
+            input_image = gr.Image(label="image", type='pil')
+            # inference steps
+            input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=25)
+            # CFG scale (passed to process() as input_cfg)
+            input_cfg = gr.Slider(label="CFG scale", minimum=0, maximum=15, step=1, value=6)
+            # random seed
+            input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=42)
+            # gen button
+            button_gen = gr.Button("Generate")
+        # Right column: the three result videos and the downloadable GLB file.
+        with gr.Column(scale=1):
+            with gr.Tab("Video"):
+                # final video results
+                output_rgb_video = gr.Video(label="video")
+                output_prim_video = gr.Video(label="video")
+                output_mat_video = gr.Video(label="video")
+            with gr.Tab("GLB"):
+                # glb file
+                output_glb = gr.File(label="glb")
+        # The input order here must match process(input_image, input_num_steps, input_seed, input_cfg).
+        button_gen.click(process, inputs=[input_image, input_num_steps, input_seed, input_cfg], outputs=[output_rgb_video, output_prim_video, output_mat_video, output_glb])
+    # Example images run process() with its default steps, seed and CFG scale.
+    gr.Examples(
+        examples=[
+            "assets/examples/fruit_elephant.jpg",
+            "assets/examples/mei_ling_panda.png",
+            "assets/examples/shuai_panda_notail.png",
+        ],
+        inputs=[input_image],
+        outputs=[output_rgb_video, output_prim_video, output_mat_video, output_glb],
+        fn=lambda x: process(input_image=x),
+        cache_examples=False,
+        label='Single Image to 3D PBR Asset'
+    )
+# Runs at import time: serve on all interfaces and request a public share link.
+block.launch(server_name="0.0.0.0", share=True)

assets/examples/blue_cat.png ADDED Viewed

assets/examples/bubble_mart_blue.png ADDED Viewed

assets/examples/bulldog.png ADDED Viewed

assets/examples/ceramic.png ADDED Viewed

assets/examples/chair_watermelon.png ADDED Viewed

assets/examples/cup_rgba.png ADDED Viewed

assets/examples/cute_horse.jpg ADDED Viewed

assets/examples/earphone.jpg ADDED Viewed

assets/examples/firedragon.png ADDED Viewed

assets/examples/fox.jpg ADDED Viewed

assets/examples/fruit_elephant.jpg ADDED Viewed

assets/examples/hatsune_miku.png ADDED Viewed

assets/examples/ikun_rgba.png ADDED Viewed

assets/examples/mailbox.png ADDED Viewed

assets/examples/mario.png ADDED Viewed

assets/examples/mei_ling_panda.png ADDED Viewed

assets/examples/mushroom_teapot.jpg ADDED Viewed

assets/examples/pikachu.png ADDED Viewed

assets/examples/potplant_rgba.png ADDED Viewed

assets/examples/seed_frog.png ADDED Viewed

assets/examples/shuai_panda_notail.png ADDED Viewed

assets/examples/yellow_duck.png ADDED Viewed

configs/inference_dit.yml ADDED Viewed

	@@ -0,0 +1,97 @@

+debug: False
+root_data_dir: ./runs
+checkpoint_path:
+global_seed: 42
+inference:
+  input_dir:
+  ddim: 25
+  cfg: 6
+  seed: ${global_seed}
+  precision: fp16
+  export_glb: True
+  decimate: 100000
+  mc_resolution: 256
+  batch_size: 4096
+  remesh: False
+image_height: 518
+image_width: 518
+model:
+  class_name: models.primsdf.PrimSDF
+  num_prims: 2048
+  dim_feat: 6
+  prim_shape: 8
+  init_scale: 0.05 # useless if auto_scale_init == True
+  sdf2alpha_var: 0.005
+  auto_scale_init: True
+  init_sampling: uniform
+  vae:
+    class_name: models.vae3d_dib.VAE
+    in_channels: ${model.dim_feat}
+    latent_channels: 1
+    out_channels: ${model.vae.in_channels}
+    down_channels: [32, 256]
+    mid_attention: True
+    up_channels: [256, 32]
+    layers_per_block: 2
+    gradient_checkpointing: False
+  vae_checkpoint_path:
+  conditioner:
+    class_name: models.conditioner.image.ImageConditioner
+    num_prims: ${model.num_prims}
+    dim_feat: ${model.dim_feat}
+    prim_shape: ${model.prim_shape}
+    sample_view: False
+    encoder_config:
+      class_name: models.conditioner.image_dinov2.Dinov2Wrapper
+      model_name: dinov2_vitb14_reg
+      freeze: True
+  generator:
+    class_name: models.dit_crossattn.DiT
+    seq_length: ${model.num_prims}
+    in_channels: 68 # 4 srt parameters per primitive + model.vae.latent_channels * latent_dim^3 (1 * 4^3 = 64)
+    condition_channels: 768
+    hidden_size: 1152
+    depth: 28
+    num_heads: 16
+    attn_proj_bias: True
+    cond_drop_prob: 0.1
+    gradient_checkpointing: False
+  latent_nf: 1.0
+  latent_mean: [ 0.0442, -0.0029, -0.0425, -0.0043, -0.4086, -0.2906, -0.7002, -0.0852, -0.4446, -0.6896, -0.7344, -0.3524, -0.5488, -0.4313, -1.1715, -0.0875, -0.6131, -0.3924, -0.7335, -0.3749,  0.4658, -0.0236,  0.8362,  0.3388,  0.0188,  0.5988, -0.1853,  1.1579,  0.6240,  0.0758,  0.9641,  0.6586,  0.6260,  0.2384,  0.7798,  0.8297, -0.6543, -0.4441, -1.3887, -0.0393, -0.9008, -0.8616, -1.7434, -0.1328, -0.8119, -0.8225, -1.8533, -0.0444, -1.0510, -0.5158, -1.1907, -0.5265,  0.2832,  0.6037,  0.5981,  0.5461,  0.4366,  0.4144,  0.7219,  0.5722,  0.5937,  0.5598,  0.9414,  0.7419,  0.2102,  0.3388,  0.4501,  0.5166]
+  latent_std: [0.0219, 0.3707, 0.3911, 0.3610, 0.7549, 0.7909, 0.9691, 0.9193, 0.8218, 0.9389, 1.1785, 1.0254, 0.6376, 0.6568, 0.7892, 0.8468, 0.8775, 0.7920, 0.9037, 0.9329, 0.9196, 1.1123, 1.3041, 1.0955, 1.2727, 1.6565, 1.8502, 1.7006, 0.8973, 1.0408, 1.2034, 1.2703, 1.0373, 1.0486, 1.0716, 0.9746, 0.7088, 0.8685, 1.0030, 0.9504, 1.0410, 1.3033, 1.5368, 1.4386, 0.6142, 0.6887, 0.9085, 0.9903, 1.0190, 0.9302, 1.0121, 0.9964, 1.1474, 1.2729, 1.4627, 1.1404, 1.3713, 1.6692, 1.8424, 1.5047, 1.1356, 1.2369, 1.3554, 1.1848, 1.1319, 1.0822, 1.1972, 0.9916]
+diffusion:
+  timestep_respacing:
+  noise_schedule: squaredcos_cap_v2
+  diffusion_steps: 1000
+  parameterization: v
+rm:
+  volradius: 10000.0
+  dt: 1.0
+optimizer:
+  class_name: torch.optim.AdamW
+  lr: 0.0001
+  weight_decay: 0
+scheduler:
+  class_name: dva.scheduler.CosineWarmupScheduler
+  warmup_iters: 3000
+  max_iters: 200000
+train:
+  batch_size: 8
+  n_workers: 4
+  n_epochs: 1000
+  log_every_n_steps: 50
+  summary_every_n_steps: 10000
+  ckpt_every_n_steps: 10000
+  amp: False
+  precision: tf32
+tag: 3dtopia-xl-sview
+output_dir: ${root_data_dir}/inference/${tag}

dva/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

dva/attr_dict.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+class AttrDict:
+    def __init__(self, entries):
+        self.add_entries_(entries)
+    def keys(self):
+        return self.__dict__.keys()
+    def values(self):
+        return self.__dict__.values()
+    def __getitem__(self, key):
+        return self.__dict__[key]
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+    def __delitem__(self, key):
+        return self.__dict__.__delitem__(key)
+    def __contains__(self, key):
+        return key in self.__dict__
+    def __repr__(self):
+        return self.__dict__.__repr__()
+    def __getattr__(self, attr):
+        if attr.startswith("__"):
+            return self.__getattribute__(attr)
+        return self.__dict__[attr]
+    def items(self):
+        return self.__dict__.items()
+    def __iter__(self):
+        return iter(self.items())
+    def add_entries_(self, entries, overwrite=True):
+        for key, value in entries.items():
+            if key not in self.__dict__:
+                if isinstance(value, dict):
+                    self.__dict__[key] = AttrDict(value)
+                else:
+                    self.__dict__[key] = value
+            else:
+                if isinstance(value, dict):
+                    self.__dict__[key].add_entries_(entries=value, overwrite=overwrite)
+                elif overwrite or self.__dict__[key] is None:
+                    self.__dict__[key] = value
+    def serialize(self):
+        return json.dumps(self, default=self.obj_to_dict, indent=4)
+    def obj_to_dict(self, obj):
+        return obj.__dict__
+    def get(self, key, default=None):
+        return self.__dict__.get(key, default)

dva/geom.py ADDED Viewed

	@@ -0,0 +1,653 @@

+from typing import Optional
+import numpy as np
+import torch as th
+import torch.nn.functional as F
+import torch.nn as nn
+from sklearn.neighbors import KDTree
+import logging
+logger = logging.getLogger(__name__)
+# NOTE: we need pytorch3d primarily for UV rasterization things
+from pytorch3d.renderer.mesh.rasterize_meshes import rasterize_meshes
+from pytorch3d.structures import Meshes
+from typing import Union, Optional, Tuple
+import trimesh
+from trimesh import Trimesh
+from trimesh.triangles import points_to_barycentric
+try:
+    # pyre-fixme[21]: Could not find module `igl`.
+    from igl import point_mesh_squared_distance  # @manual
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def closest_point(mesh, points):
+        """Helper function that mimics trimesh.proximity.closest_point but uses
+        IGL for faster queries."""
+        v = mesh.vertices
+        vi = mesh.faces
+        dist, face_idxs, p = point_mesh_squared_distance(points, v, vi)
+        return p, dist, face_idxs
+except ImportError:
+    from trimesh.proximity import closest_point
+def closest_point_barycentrics(v, vi, points):
+    """Given a 3D mesh and a set of query points, return closest point barycentrics
+    Args:
+        v: np.array (float)
+        [N, 3] mesh vertices
+        vi: np.array (int)
+        [N, 3] mesh triangle indices
+        points: np.array (float)
+        [M, 3] query points
+    Returns:
+        Tuple[approx, barys, interp_idxs, face_idxs]
+            approx:       [M, 3] approximated (closest) points on the mesh
+            barys:        [M, 3] barycentric weights that produce "approx"
+            interp_idxs:  [M, 3] vertex indices for barycentric interpolation
+            face_idxs:    [M] face indices for barycentric interpolation. interp_idxs = vi[face_idxs]
+    """
+    mesh = Trimesh(vertices=v, faces=vi, process=False)
+    p, _, face_idxs = closest_point(mesh, points)
+    p = p.reshape((points.shape[0], 3))
+    face_idxs = face_idxs.reshape((points.shape[0],))
+    barys = points_to_barycentric(mesh.triangles[face_idxs], p)
+    b0, b1, b2 = np.split(barys, 3, axis=1)
+    interp_idxs = vi[face_idxs]
+    v0 = v[interp_idxs[:, 0]]
+    v1 = v[interp_idxs[:, 1]]
+    v2 = v[interp_idxs[:, 2]]
+    approx = b0 * v0 + b1 * v1 + b2 * v2
+    return approx, barys, interp_idxs, face_idxs
+def make_uv_face_index(
+    vt: th.Tensor,
+    vti: th.Tensor,
+    uv_shape: Union[Tuple[int, int], int],
+    flip_uv: bool = True,
+    device: Optional[Union[str, th.device]] = None,
+):
+    """Compute a UV-space face index map identifying which mesh face contains each
+    texel. For texels with no assigned triangle, the index will be -1."""
+    if isinstance(uv_shape, int):
+        uv_shape = (uv_shape, uv_shape)
+    uv_max_shape_ind = uv_shape.index(max(uv_shape))
+    uv_min_shape_ind = uv_shape.index(min(uv_shape))
+    uv_ratio = uv_shape[uv_max_shape_ind] / uv_shape[uv_min_shape_ind]
+    if device is not None:
+        if isinstance(device, str):
+            dev = th.device(device)
+        else:
+            dev = device
+        assert dev.type == "cuda"
+    else:
+        dev = th.device("cuda")
+    vt = 1.0 - vt.clone()
+    if flip_uv:
+        vt = vt.clone()
+        vt[:, 1] = 1 - vt[:, 1]
+    vt_pix = 2.0 * vt.to(dev) - 1.0
+    vt_pix = th.cat([vt_pix, th.ones_like(vt_pix[:, 0:1])], dim=1)
+    vt_pix[:, uv_min_shape_ind] *= uv_ratio
+    meshes = Meshes(vt_pix[np.newaxis], vti[np.newaxis].to(dev))
+    with th.no_grad():
+        face_index, _, _, _ = rasterize_meshes(
+            meshes, uv_shape, faces_per_pixel=1, z_clip_value=0.0, bin_size=0
+        )
+        face_index = face_index[0, ..., 0]
+    return face_index
+def make_uv_vert_index(
+    vt: th.Tensor,
+    vi: th.Tensor,
+    vti: th.Tensor,
+    uv_shape: Union[Tuple[int, int], int],
+    flip_uv: bool = True,
+):
+    """Compute a UV-space vertex index map identifying which mesh vertices
+    comprise the triangle containing each texel. For texels with no assigned
+    triangle, all indices will be -1.
+    """
+    face_index_map = make_uv_face_index(vt, vti, uv_shape, flip_uv)
+    vert_index_map = vi[face_index_map.clamp(min=0)]
+    vert_index_map[face_index_map < 0] = -1
+    return vert_index_map.long()
+def bary_coords(points: th.Tensor, triangles: th.Tensor, eps: float = 1.0e-6):
+    """Computes barycentric coordinates for a set of 2D query points given
+    coordintes for the 3 vertices of the enclosing triangle for each point."""
+    x = points[:, 0] - triangles[2, :, 0]
+    x1 = triangles[0, :, 0] - triangles[2, :, 0]
+    x2 = triangles[1, :, 0] - triangles[2, :, 0]
+    y = points[:, 1] - triangles[2, :, 1]
+    y1 = triangles[0, :, 1] - triangles[2, :, 1]
+    y2 = triangles[1, :, 1] - triangles[2, :, 1]
+    denom = y2 * x1 - y1 * x2
+    n0 = y2 * x - x2 * y
+    n1 = x1 * y - y1 * x
+    # Small epsilon to prevent divide-by-zero error.
+    denom = th.where(denom >= 0, denom.clamp(min=eps), denom.clamp(max=-eps))
+    bary_0 = n0 / denom
+    bary_1 = n1 / denom
+    bary_2 = 1.0 - bary_0 - bary_1
+    return th.stack((bary_0, bary_1, bary_2))
+def make_uv_barys(
+    vt: th.Tensor,
+    vti: th.Tensor,
+    uv_shape: Union[Tuple[int, int], int],
+    flip_uv: bool = True,
+):
+    """Compute a UV-space barycentric map where each texel contains barycentric
+    coordinates for that texel within its enclosing UV triangle. For texels
+    with no assigned triangle, all 3 barycentric coordinates will be 0.
+    Returns:
+        Tuple (face_index_map, bary_map): the face index map produced by
+        make_uv_face_index and a [uv_shape[0], uv_shape[1], 3] barycentric map.
+    """
+    if isinstance(uv_shape, int):
+        uv_shape = (uv_shape, uv_shape)
+    if flip_uv:
+        # Flip here because texture coordinates in some of our topo files are
+        # stored in OpenGL convention with Y=0 on the bottom of the texture
+        # unlike numpy/torch arrays/tensors.
+        vt = vt.clone()
+        vt[:, 1] = 1 - vt[:, 1]
+    # The flip was already applied above, so it is disabled for the rasterization call.
+    face_index_map = make_uv_face_index(vt, vti, uv_shape, flip_uv=False)
+    # Clamp so texels without a face (-1) still index a valid row; they are zeroed at the end.
+    vti_map = vti.long()[face_index_map.clamp(min=0)]
+    uv_max_shape_ind = uv_shape.index(max(uv_shape))
+    uv_min_shape_ind = uv_shape.index(min(uv_shape))
+    uv_ratio = uv_shape[uv_max_shape_ind] / uv_shape[uv_min_shape_ind]
+    # Rescale UVs from [0, 1] to [-1, 1] and stretch the shorter axis by the aspect ratio.
+    vt = vt.clone()
+    vt = vt * 2 - 1
+    vt[:, uv_min_shape_ind] *= uv_ratio
+    # Gather the UVs of each texel's triangle and move the triangle-vertex axis to the front.
+    uv_tri_uvs = vt[vti_map].permute(2, 0, 1, 3)
+    # Texel-centre coordinates in [0, 1] along both axes.
+    uv_grid = th.meshgrid(
+        th.linspace(0.5, uv_shape[0] - 0.5, uv_shape[0]) / uv_shape[0],
+        th.linspace(0.5, uv_shape[1] - 0.5, uv_shape[1]) / uv_shape[1],
+    )
+    # Reverse the meshgrid outputs so the last axis holds (second-axis, first-axis) coordinates,
+    # then apply the same rescaling as for vt.
+    uv_grid = th.stack(uv_grid[::-1], dim=2).to(uv_tri_uvs)
+    uv_grid = uv_grid * 2 - 1
+    uv_grid[..., uv_min_shape_ind] *= uv_ratio
+    # Flatten the texels, solve for the barycentrics, and restore the image layout.
+    bary_map = bary_coords(uv_grid.view(-1, 2), uv_tri_uvs.view(3, -1, 2))
+    bary_map = bary_map.permute(1, 0).view(uv_shape[0], uv_shape[1], 3)
+    bary_map[face_index_map < 0] = 0
+    return face_index_map, bary_map
+def index_image_impaint(
+    index_image: th.Tensor,
+    bary_image: Optional[th.Tensor] = None,
+    distance_threshold=100.0,
+):
+    # getting the mask around the indexes?
+    if len(index_image.shape) == 3:
+        valid_index = (index_image != -1).any(dim=-1)
+    elif len(index_image.shape) == 2:
+        valid_index = index_image != -1
+    else:
+        raise ValueError("`index_image` should be a [H,W] or [H,W,C] image")
+    invalid_index = ~valid_index
+    device = index_image.device
+    valid_ij = th.stack(th.where(valid_index), dim=-1)
+    invalid_ij = th.stack(th.where(invalid_index), dim=-1)
+    lookup_valid = KDTree(valid_ij.cpu().numpy())
+    dists, idxs = lookup_valid.query(invalid_ij.cpu())
+    # TODO: try average?
+    idxs = th.as_tensor(idxs, device=device)[..., 0]
+    dists = th.as_tensor(dists, device=device)[..., 0]
+    dist_mask = dists < distance_threshold
+    invalid_border = th.zeros_like(invalid_index)
+    invalid_border[invalid_index] = dist_mask
+    invalid_src_ij = valid_ij[idxs][dist_mask]
+    invalid_dst_ij = invalid_ij[dist_mask]
+    index_image_imp = index_image.clone()
+    index_image_imp[invalid_dst_ij[:, 0], invalid_dst_ij[:, 1]] = index_image[
+        invalid_src_ij[:, 0], invalid_src_ij[:, 1]
+    ]
+    if bary_image is not None:
+        bary_image_imp = bary_image.clone()
+        bary_image_imp[invalid_dst_ij[:, 0], invalid_dst_ij[:, 1]] = bary_image[
+            invalid_src_ij[:, 0], invalid_src_ij[:, 1]
+        ]
+        return index_image_imp, bary_image_imp
+    return index_image_imp
+class GeometryModule(nn.Module):
+    """Holds a mesh topology (v, vi, vt, vti) as buffers together with
+    precomputed UV-space lookup images (vertex index, barycentric, face index).
+    NOTE(review): the lookup images are built through make_uv_face_index, which
+    defaults to a CUDA device - construction presumably requires CUDA; confirm.
+    """
+    def __init__(
+        self,
+        v,
+        vi,
+        vt,
+        vti,
+        uv_size,
+        v2uv: Optional[th.Tensor] = None,
+        flip_uv=False,
+        impaint=False,
+        impaint_threshold=100.0,
+    ):
+        super().__init__()
+        self.register_buffer("v", th.as_tensor(v))
+        self.register_buffer("vi", th.as_tensor(vi))
+        self.register_buffer("vt", th.as_tensor(vt))
+        self.register_buffer("vti", th.as_tensor(vti))
+        # v2uv is optional; the buffer only exists when it was provided.
+        if v2uv is not None:
+            self.register_buffer("v2uv", th.as_tensor(v2uv, dtype=th.int64))
+        # TODO: should we just pass topology here?
+        # self.n_verts = v2uv.shape[0]
+        self.n_verts = vi.max() + 1
+        self.uv_size = uv_size
+        # TODO: can't we just index face_index?
+        index_image = make_uv_vert_index(
+            self.vt, self.vi, self.vti, uv_shape=uv_size, flip_uv=flip_uv
+        ).cpu()
+        face_index, bary_image = make_uv_barys(
+            self.vt, self.vti, uv_shape=uv_size, flip_uv=flip_uv
+        )
+        # Optionally fill texels not covered by any triangle from their nearest covered texel.
+        if impaint:
+            if min(uv_size) >= 1024:
+                logger.info(
+                    "impainting index image might take a while for sizes >= 1024"
+                )
+            index_image, bary_image = index_image_impaint(
+                index_image, bary_image, impaint_threshold
+            )
+            # TODO: we can avoid doing this 2x
+            face_index = index_image_impaint(
+                face_index, distance_threshold=impaint_threshold
+            )
+        # The lookup images are stored as CPU buffers.
+        self.register_buffer("index_image", index_image.cpu())
+        self.register_buffer("bary_image", bary_image.cpu())
+        self.register_buffer("face_index_image", face_index.cpu())
+    def render_index_images(self, uv_size, flip_uv=False, impaint=False):
+        """Recompute the lookup images at another resolution without storing them.
+        Returns (index_image, face_image, bary_image); the face image is never impainted here.
+        """
+        index_image = make_uv_vert_index(
+            self.vt, self.vi, self.vti, uv_shape=uv_size, flip_uv=flip_uv
+        )
+        face_image, bary_image = make_uv_barys(
+            self.vt, self.vti, uv_shape=uv_size, flip_uv=flip_uv
+        )
+        if impaint:
+            index_image, bary_image = index_image_impaint(
+                index_image,
+                bary_image,
+            )
+        return index_image, face_image, bary_image
+    def vn(self, verts):
+        """Vertex normals of `verts` computed with this module's triangle indices."""
+        return vert_normals(verts, self.vi[np.newaxis].to(th.long))
+    def to_uv(self, values):
+        """Interpolate per-vertex values into UV space using the stored lookup images."""
+        return values_to_uv(values, self.index_image, self.bary_image)
+    def from_uv(self, values_uv):
+        # NOTE(review): self.v2uv is only registered when v2uv was passed to __init__;
+        # calling this without it presumably fails with an attribute error - confirm.
+        # TODO: we need to sample this
+        return sample_uv(values_uv, self.vt, self.v2uv.to(th.long))
+    def rand_sample_3d_uv(self, count, uv_img):
+        """
+        Sample a set of 3D points on the surface of mesh, return corresponding interpolated values in UV space.
+        Args:
+            count - num of 3D points to be sampled
+            uv_img - the image in uv space to be sampled, e.g., texture
+        """
+        _mesh = Trimesh(vertices=self.v.detach().cpu().numpy(), faces=self.vi.detach().cpu().numpy(), process=False)
+        points, _ = trimesh.sample.sample_surface(_mesh, count)
+        return self.sample_uv_from_3dpts(points, uv_img)
+    def sample_uv_from_3dpts(self, points, uv_img):
+        """Sample `uv_img` at the UV locations of the mesh points closest to `points`.
+        Returns (sampled values as a numpy array, the input points).
+        """
+        num_pts = points.shape[0]
+        approx, barys, interp_idxs, face_idxs = closest_point_barycentrics(self.v.detach().cpu().numpy(), self.vi.detach().cpu().numpy(), points)
+        # NOTE(review): interp_idxs are vertex indices (vi[face_idxs]) but are used to index
+        # self.vt; this presumably assumes vt is stored per vertex - confirm.
+        interp_uv_coords = self.vt[interp_idxs, :] # [N, 3, 2]
+        # do bary interp first to get interp_uv_coord in high-reso uv space
+        target_uv_coords = th.sum(interp_uv_coords * th.from_numpy(barys)[..., None], dim=1).float()
+        # then directly sample from uv space
+        sampled_values = sample_uv(values_uv=uv_img.permute(2, 0, 1)[None, ...], uv_coords=target_uv_coords) # [1, count, c]
+        approx_values = sampled_values[0].reshape(num_pts, uv_img.shape[2])
+        # NOTE(review): .numpy() needs a CPU tensor - presumably uv_img and self.vt live on CPU here; confirm.
+        return approx_values.numpy(), points
+    def vert_sample_uv(self, uv_img):
+        """Sample `uv_img` at the mesh's own vertices and return the sampled values."""
+        # NOTE(review): `count` is assigned but not used below.
+        count = self.v.shape[0]
+        points = self.v.detach().cpu().numpy()
+        approx_values, _ = self.sample_uv_from_3dpts(points, uv_img)
+        return approx_values
+def sample_uv(
+    values_uv,
+    uv_coords,
+    v2uv: Optional[th.Tensor] = None,
+    mode: str = "bilinear",
+    align_corners: bool = True,
+    flip_uvs: bool = False,
+):
+    batch_size = values_uv.shape[0]
+    if flip_uvs:
+        uv_coords = uv_coords.clone()
+        uv_coords[:, 1] = 1.0 - uv_coords[:, 1]
+    # uv_coords_norm is [1, N, 1, 2] afterwards
+    uv_coords_norm = (uv_coords * 2.0 - 1.0)[np.newaxis, :, np.newaxis].expand(
+        batch_size, -1, -1, -1
+    )
+    # uv_shape = values_uv.shape[-2:]
+    # uv_max_shape_ind = uv_shape.index(max(uv_shape))
+    # uv_min_shape_ind = uv_shape.index(min(uv_shape))
+    # uv_ratio = uv_shape[uv_max_shape_ind] / uv_shape[uv_min_shape_ind]
+    # uv_coords_norm[..., uv_min_shape_ind] *= uv_ratio
+    values = (
+        F.grid_sample(values_uv, uv_coords_norm, align_corners=align_corners, mode=mode)
+        .squeeze(-1)
+        .permute((0, 2, 1))
+    )
+    if v2uv is not None:
+        values_duplicate = values[:, v2uv]
+        values = values_duplicate.mean(2)
+    return values
+def values_to_uv(values, index_img, bary_img):
+    uv_size = index_img.shape
+    index_mask = th.all(index_img != -1, dim=-1)
+    idxs_flat = index_img[index_mask].to(th.int64)
+    bary_flat = bary_img[index_mask].to(th.float32)
+    # NOTE: here we assume
+    values_flat = th.sum(values[:, idxs_flat].permute(0, 3, 1, 2) * bary_flat, dim=-1)
+    values_uv = th.zeros(
+        values.shape[0],
+        values.shape[-1],
+        uv_size[0],
+        uv_size[1],
+        dtype=values.dtype,
+        device=values.device,
+    )
+    values_uv[:, :, index_mask] = values_flat
+    return values_uv
+def face_normals(v, vi, eps: float = 1e-5):
+    pts = v[:, vi]
+    v0 = pts[:, :, 1] - pts[:, :, 0]
+    v1 = pts[:, :, 2] - pts[:, :, 0]
+    n = th.cross(v0, v1, dim=-1)
+    norm = th.norm(n, dim=-1, keepdim=True)
+    norm[norm < eps] = 1
+    n /= norm
+    return n
+def vert_normals(v, vi, eps: float = 1.0e-5):
+    fnorms = face_normals(v, vi)
+    fnorms = fnorms[:, :, None].expand(-1, -1, 3, -1).reshape(fnorms.shape[0], -1, 3)
+    vi_flat = vi.view(1, -1).expand(v.shape[0], -1)
+    vnorms = th.zeros_like(v)
+    for j in range(3):
+        vnorms[..., j].scatter_add_(1, vi_flat, fnorms[..., j])
+    norm = th.norm(vnorms, dim=-1, keepdim=True)
+    norm[norm < eps] = 1
+    vnorms /= norm
+    return vnorms
+def compute_view_cos(verts, faces, camera_pos):
+    vn = F.normalize(vert_normals(verts, faces), dim=-1)
+    v2c = F.normalize(verts - camera_pos[:, np.newaxis], dim=-1)
+    return th.einsum("bnd,bnd->bn", vn, v2c)
+def compute_tbn(geom, vt, vi, vti):
+    """Computes tangent, bitangent, and normal vectors given a mesh.
+    Args:
+        geom: [N, n_verts, 3] th.Tensor
+        Vertex positions.
+        vt: [n_uv_coords, 2] th.Tensor
+        UV coordinates.
+        vi: [..., 3] th.Tensor
+        Face vertex indices.
+        vti: [..., 3] th.Tensor
+        Face UV indices.
+    Returns:
+        [..., 3] th.Tensors for T, B, N.
+    """
+    v0 = geom[:, vi[..., 0]]
+    v1 = geom[:, vi[..., 1]]
+    v2 = geom[:, vi[..., 2]]
+    vt0 = vt[vti[..., 0]]
+    vt1 = vt[vti[..., 1]]
+    vt2 = vt[vti[..., 2]]
+    v01 = v1 - v0
+    v02 = v2 - v0
+    vt01 = vt1 - vt0
+    vt02 = vt2 - vt0
+    f = 1.0 / (
+        vt01[None, ..., 0] * vt02[None, ..., 1]
+        - vt01[None, ..., 1] * vt02[None, ..., 0]
+    )
+    tangent = f[..., None] * th.stack(
+        [
+            v01[..., 0] * vt02[None, ..., 1] - v02[..., 0] * vt01[None, ..., 1],
+            v01[..., 1] * vt02[None, ..., 1] - v02[..., 1] * vt01[None, ..., 1],
+            v01[..., 2] * vt02[None, ..., 1] - v02[..., 2] * vt01[None, ..., 1],
+        ],
+        dim=-1,
+    )
+    tangent = F.normalize(tangent, dim=-1)
+    normal = F.normalize(th.cross(v01, v02, dim=3), dim=-1)
+    bitangent = F.normalize(th.cross(tangent, normal, dim=3), dim=-1)
+    return tangent, bitangent, normal
+def compute_v2uv(n_verts, vi, vti, n_max=4):
+    """Computes mapping from vertex indices to texture indices.
+    Args:
+        vi: [F, 3], triangles
+        vti: [F, 3], texture triangles
+        n_max: int, max number of texture locations
+    Returns:
+        [n_verts, n_max], texture indices
+    """
+    v2uv_dict = {}
+    for i_v, i_uv in zip(vi.reshape(-1), vti.reshape(-1)):
+        v2uv_dict.setdefault(i_v, set()).add(i_uv)
+    assert len(v2uv_dict) == n_verts
+    v2uv = np.zeros((n_verts, n_max), dtype=np.int32)
+    for i in range(n_verts):
+        vals = sorted(list(v2uv_dict[i]))
+        v2uv[i, :] = vals[0]
+        v2uv[i, : len(vals)] = np.array(vals)
+    return v2uv
+def compute_neighbours(n_verts, vi, n_max_values=10):
+    """Computes first-ring neighbours given vertices and faces."""
+    n_vi = vi.shape[0]
+    adj = {i: set() for i in range(n_verts)}
+    for i in range(n_vi):
+        for idx in vi[i]:
+            adj[idx] |= set(vi[i]) - set([idx])
+    nbs_idxs = np.tile(np.arange(n_verts)[:, np.newaxis], (1, n_max_values))
+    nbs_weights = np.zeros((n_verts, n_max_values), dtype=np.float32)
+    for idx in range(n_verts):
+        n_values = min(len(adj[idx]), n_max_values)
+        nbs_idxs[idx, :n_values] = np.array(list(adj[idx]))[:n_values]
+        nbs_weights[idx, :n_values] = -1.0 / n_values
+    return nbs_idxs, nbs_weights
+def make_postex(v, idxim, barim):
+    return (
+        barim[None, :, :, 0, None] * v[:, idxim[:, :, 0]]
+        + barim[None, :, :, 1, None] * v[:, idxim[:, :, 1]]
+        + barim[None, :, :, 2, None] * v[:, idxim[:, :, 2]]
+    ).permute(0, 3, 1, 2)
+def matrix_to_axisangle(r):
+    th = th.arccos(0.5 * (r[..., 0, 0] + r[..., 1, 1] + r[..., 2, 2] - 1.0))[..., None]
+    vec = (
+        0.5
+        * th.stack(
+            [
+                r[..., 2, 1] - r[..., 1, 2],
+                r[..., 0, 2] - r[..., 2, 0],
+                r[..., 1, 0] - r[..., 0, 1],
+            ],
+            dim=-1,
+        )
+        / th.sin(th)
+    )
+    return th, vec
+def axisangle_to_matrix(rvec):
+    theta = th.sqrt(1e-5 + th.sum(rvec**2, dim=-1))
+    rvec = rvec / theta[..., None]
+    costh = th.cos(theta)
+    sinth = th.sin(theta)
+    return th.stack(
+        (
+            th.stack(
+                (
+                    rvec[..., 0] ** 2 + (1.0 - rvec[..., 0] ** 2) * costh,
+                    rvec[..., 0] * rvec[..., 1] * (1.0 - costh) - rvec[..., 2] * sinth,
+                    rvec[..., 0] * rvec[..., 2] * (1.0 - costh) + rvec[..., 1] * sinth,
+                ),
+                dim=-1,
+            ),
+            th.stack(
+                (
+                    rvec[..., 0] * rvec[..., 1] * (1.0 - costh) + rvec[..., 2] * sinth,
+                    rvec[..., 1] ** 2 + (1.0 - rvec[..., 1] ** 2) * costh,
+                    rvec[..., 1] * rvec[..., 2] * (1.0 - costh) - rvec[..., 0] * sinth,
+                ),
+                dim=-1,
+            ),
+            th.stack(
+                (
+                    rvec[..., 0] * rvec[..., 2] * (1.0 - costh) - rvec[..., 1] * sinth,
+                    rvec[..., 1] * rvec[..., 2] * (1.0 - costh) + rvec[..., 0] * sinth,
+                    rvec[..., 2] ** 2 + (1.0 - rvec[..., 2] ** 2) * costh,
+                ),
+                dim=-1,
+            ),
+        ),
+        dim=-2,
+    )
+def rotation_interp(r0, r1, alpha):
+    r0a = r0.view(-1, 3, 3)
+    r1a = r1.view(-1, 3, 3)
+    r = th.bmm(r0a.permute(0, 2, 1), r1a).view_as(r0)
+    th, rvec = matrix_to_axisangle(r)
+    rvec = rvec * (alpha * th)
+    r = axisangle_to_matrix(rvec)
+    return th.bmm(r0a, r.view(-1, 3, 3)).view_as(r0)
+def convert_camera_parameters(Rt, K):
+    R = Rt[:, :3, :3]
+    t = -R.permute(0, 2, 1).bmm(Rt[:, :3, 3].unsqueeze(2)).squeeze(2)
+    return dict(
+        campos=t,
+        camrot=R,
+        focal=K[:, :2, :2],
+        princpt=K[:, :2, 2],
+    )
+def project_points_multi(p, Rt, K, normalize=False, size=None):
+    """Project a set of 3D points into multiple cameras with a pinhole model.
+    Args:
+        p: [B, N, 3], input 3D points in world coordinates
+        Rt: [B, NC, 3, 4], extrinsics (where NC is the number of cameras to project to)
+        K: [B, NC, 3, 3], intrinsics
+        normalize: bool, whether to normalize coordinates to [-1.0, 1.0]
+    Returns:
+        tuple:
+        - [B, NC, N, 2] - projected points
+        - [B, NC, N] - their
+    """
+    B, N = p.shape[:2]
+    NC = Rt.shape[1]
+    Rt = Rt.reshape(B * NC, 3, 4)
+    K = K.reshape(B * NC, 3, 3)
+    # [B, N, 3] -> [B * NC, N, 3]
+    p = p[:, np.newaxis].expand(-1, NC, -1, -1).reshape(B * NC, -1, 3)
+    p_cam = p @ Rt[:, :3, :3].transpose(-2, -1) + Rt[:, :3, 3][:, np.newaxis]
+    p_pix = p_cam @ K.transpose(-2, -1)
+    p_depth = p_pix[:, :, 2:]
+    p_pix = (p_pix[..., :2] / p_depth).reshape(B, NC, N, 2)
+    p_depth = p_depth.reshape(B, NC, N)
+    if normalize:
+        assert size is not None
+        h, w = size
+        p_pix = (
+            2.0 * p_pix / th.as_tensor([w, h], dtype=th.float32, device=p.device) - 1.0
+        )
+    return p_pix, p_depth

dva/io.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import cv2
+import numpy as np
+import copy
+import importlib
+from typing import Any, Dict
+def load_module(module_name, class_name=None, silent: bool = False):
+    module = importlib.import_module(module_name)
+    return getattr(module, class_name) if class_name else module
+def load_class(class_name):
+    return load_module(*class_name.rsplit(".", 1))
+def load_from_config(config, **kwargs):
+    """Instantiate an object given a config and arguments."""
+    assert "class_name" in config and "module_name" not in config
+    config = copy.deepcopy(config)
+    class_name = config.pop("class_name")
+    object_class = load_class(class_name)
+    return object_class(**config, **kwargs)
+def load_opencv_calib(extrin_path, intrin_path):
+    cameras = {}
+    fse = cv2.FileStorage()
+    fse.open(extrin_path, cv2.FileStorage_READ)
+    fsi = cv2.FileStorage()
+    fsi.open(intrin_path, cv2.FileStorage_READ)
+    names = [
+        fse.getNode("names").at(c).string() for c in range(fse.getNode("names").size())
+    ]
+    for camera in names:
+        rot = fse.getNode(f"R_{camera}").mat()
+        R = fse.getNode(f"Rot_{camera}").mat()
+        T = fse.getNode(f"T_{camera}").mat()
+        R_pred = cv2.Rodrigues(rot)[0]
+        assert np.all(np.isclose(R_pred, R))
+        K = fsi.getNode(f"K_{camera}").mat()
+        cameras[camera] = {
+            "Rt": np.concatenate([R, T], axis=1).astype(np.float32),
+            "K": K.astype(np.float32),
+        }
+    return cameras

dva/layers.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch as th
+import torch.nn as nn
+import numpy as np
+from dva.mvp.models.utils import Conv2dWN, Conv2dWNUB, ConvTranspose2dWNUB, initmod
+class ConvBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        size,
+        lrelu_slope=0.2,
+        kernel_size=3,
+        padding=1,
+        wnorm_dim=0,
+    ):
+        super().__init__()
+        self.conv_resize = Conv2dWN(in_channels, out_channels, kernel_size=1)
+        self.conv1 = Conv2dWNUB(
+            in_channels,
+            in_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            height=size,
+            width=size,
+        )
+        self.lrelu1 = nn.LeakyReLU(lrelu_slope)
+        self.conv2 = Conv2dWNUB(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            height=size,
+            width=size,
+        )
+        self.lrelu2 = nn.LeakyReLU(lrelu_slope)
+    def forward(self, x):
+        x_skip = self.conv_resize(x)
+        x = self.conv1(x)
+        x = self.lrelu1(x)
+        x = self.conv2(x)
+        x = self.lrelu2(x)
+        return x + x_skip
+def tile2d(x, size: int):
+    """Tile a given set of features into a convolutional map.
+    Args:
+        x: float tensor of shape [N, F]
+        size: int or a tuple
+    Returns:
+        a feature map [N, F, size[0], size[1]]
+    """
+    # size = size if isinstance(size, tuple) else (size, size)
+    # NOTE: expecting only int here (!!!)
+    return x[:, :, np.newaxis, np.newaxis].expand(-1, -1, size, size)
+def weights_initializer(m, alpha: float = 1.0):
+    return initmod(m, nn.init.calculate_gain("leaky_relu", alpha))
+class UNetWB(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        size,
+        n_init_ftrs=8,
+        out_scale=0.1,
+    ):
+        # super().__init__(*args, **kwargs)
+        super().__init__()
+        self.out_scale = 0.1
+        F = n_init_ftrs
+        # TODO: allow changing the size?
+        self.size = size
+        self.down1 = nn.Sequential(
+            Conv2dWNUB(in_channels, F, self.size // 2, self.size // 2, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.down2 = nn.Sequential(
+            Conv2dWNUB(F, 2 * F, self.size // 4, self.size // 4, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.down3 = nn.Sequential(
+            Conv2dWNUB(2 * F, 4 * F, self.size // 8, self.size // 8, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.down4 = nn.Sequential(
+            Conv2dWNUB(4 * F, 8 * F, self.size // 16, self.size // 16, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.down5 = nn.Sequential(
+            Conv2dWNUB(8 * F, 16 * F, self.size // 32, self.size // 32, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.up1 = nn.Sequential(
+            ConvTranspose2dWNUB(
+                16 * F, 8 * F, self.size // 16, self.size // 16, 4, 2, 1
+            ),
+            nn.LeakyReLU(0.2),
+        )
+        self.up2 = nn.Sequential(
+            ConvTranspose2dWNUB(8 * F, 4 * F, self.size // 8, self.size // 8, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.up3 = nn.Sequential(
+            ConvTranspose2dWNUB(4 * F, 2 * F, self.size // 4, self.size // 4, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.up4 = nn.Sequential(
+            ConvTranspose2dWNUB(2 * F, F, self.size // 2, self.size // 2, 4, 2, 1),
+            nn.LeakyReLU(0.2),
+        )
+        self.up5 = nn.Sequential(
+            ConvTranspose2dWNUB(F, F, self.size, self.size, 4, 2, 1), nn.LeakyReLU(0.2)
+        )
+        self.out = Conv2dWNUB(
+            F + in_channels, out_channels, self.size, self.size, kernel_size=1
+        )
+        self.apply(lambda x: initmod(x, 0.2))
+        initmod(self.out, 1.0)
+    def forward(self, x):
+        x1 = x
+        x2 = self.down1(x1)
+        x3 = self.down2(x2)
+        x4 = self.down3(x3)
+        x5 = self.down4(x4)
+        x6 = self.down5(x5)
+        # TODO: switch to concat?
+        x = self.up1(x6) + x5
+        x = self.up2(x) + x4
+        x = self.up3(x) + x3
+        x = self.up4(x) + x2
+        x = self.up5(x)
+        x = th.cat([x, x1], dim=1)
+        return self.out(x) * self.out_scale

dva/losses.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch.nn as nn
+import torch as th
+import numpy as np
+import logging
+from .vgg import VGGLossMasked
+logger = logging.getLogger("dva.{__name__}")
+class DCTLoss(nn.Module):
+    def __init__(self, weights):
+        super().__init__()
+        self.weights = weights
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        target = inputs['gt']
+        recon = preds['recon']
+        posterior = preds['posterior']
+        fft_gt = th.view_as_real(th.fft.fft(target.reshape(target.shape[0], -1)))
+        fft_recon = th.view_as_real(th.fft.fft(recon.reshape(recon.shape[0], -1)))
+        loss_recon_dct_l1 = th.mean(th.abs(fft_gt - fft_recon))
+        loss_recon_l1 = th.mean(th.abs(target - recon))
+        loss_kl = posterior.kl().mean()
+        loss_dict.update(loss_recon_l1=loss_recon_l1, loss_recon_dct_l1=loss_recon_dct_l1, loss_kl=loss_kl)
+        loss_total = self.weights.recon * loss_recon_dct_l1 + self.weights.kl * loss_kl
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+class VAESepL2Loss(nn.Module):
+    def __init__(self, weights):
+        super().__init__()
+        self.weights = weights
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        target = inputs['gt']
+        recon = preds['recon']
+        posterior = preds['posterior']
+        recon_diff = (target - recon) ** 2
+        loss_recon_sdf_l1 = th.mean(recon_diff[:, 0:1, ...])
+        loss_recon_rgb_l1 = th.mean(recon_diff[:, 1:4, ...])
+        loss_recon_mat_l1 = th.mean(recon_diff[:, 4:6, ...])
+        loss_kl = posterior.kl().mean()
+        loss_dict.update(loss_sdf_l1=loss_recon_sdf_l1, loss_rgb_l1=loss_recon_rgb_l1, loss_mat_l1=loss_recon_mat_l1, loss_kl=loss_kl)
+        loss_total = self.weights.sdf * loss_recon_sdf_l1 + self.weights.rgb * loss_recon_rgb_l1 + self.weights.mat * loss_recon_mat_l1
+        if "kl" in self.weights:
+            loss_total += self.weights.kl * loss_kl
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+class VAESepLoss(nn.Module):
+    def __init__(self, weights):
+        super().__init__()
+        self.weights = weights
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        target = inputs['gt']
+        recon = preds['recon']
+        posterior = preds['posterior']
+        recon_diff = th.abs(target - recon)
+        loss_recon_sdf_l1 = th.mean(recon_diff[:, 0:1, ...])
+        loss_recon_rgb_l1 = th.mean(recon_diff[:, 1:4, ...])
+        loss_recon_mat_l1 = th.mean(recon_diff[:, 4:6, ...])
+        loss_kl = posterior.kl().mean()
+        loss_dict.update(loss_sdf_l1=loss_recon_sdf_l1, loss_rgb_l1=loss_recon_rgb_l1, loss_mat_l1=loss_recon_mat_l1, loss_kl=loss_kl)
+        loss_total = self.weights.sdf * loss_recon_sdf_l1 + self.weights.rgb * loss_recon_rgb_l1 + self.weights.mat * loss_recon_mat_l1
+        if "kl" in self.weights:
+            loss_total += self.weights.kl * loss_kl
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+class VAELoss(nn.Module):
+    def __init__(self, weights):
+        super().__init__()
+        self.weights = weights
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        target = inputs['gt']
+        recon = preds['recon']
+        posterior = preds['posterior']
+        loss_recon_l1 = th.mean(th.abs(target - recon))
+        loss_kl = posterior.kl().mean()
+        loss_dict.update(loss_recon_l1=loss_recon_l1, loss_kl=loss_kl)
+        loss_total = self.weights.recon * loss_recon_l1 + self.weights.kl * loss_kl
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+class PrimSDFLoss(nn.Module):
+    def __init__(self, weights, shape_opt_steps=2000, tex_opt_steps=6000):
+        super().__init__()
+        self.weights = weights
+        self.shape_opt_steps = shape_opt_steps
+        self.tex_opt_steps = tex_opt_steps
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        if iteration < self.shape_opt_steps:
+            target_sdf = inputs['sdf']
+            sdf = preds['sdf']
+            loss_sdf_l1 = th.mean(th.abs(sdf - target_sdf))
+            loss_dict.update(loss_sdf_l1=loss_sdf_l1)
+            loss_total = self.weights.sdf_l1 * loss_sdf_l1
+            prim_scale = preds["prim_scale"]
+            # we use 1/scale instead of the original 100/scale as our scale is normalized to [-1, 1] cube
+            if "vol_sum" in self.weights:
+                loss_prim_vol_sum = th.mean(th.sum(th.prod(1 / prim_scale, dim=-1), dim=-1))
+                loss_dict.update(loss_prim_vol_sum=loss_prim_vol_sum)
+                loss_total += self.weights.vol_sum * loss_prim_vol_sum
+        if iteration >= self.shape_opt_steps and iteration < self.tex_opt_steps:
+            target_tex = inputs['tex']
+            tex = preds['tex']
+            loss_tex_l1 = th.mean(th.abs(tex - target_tex))
+            loss_dict.update(loss_tex_l1=loss_tex_l1)
+            loss_total = (
+                self.weights.rgb_l1 * loss_tex_l1
+            )
+            if "mat_l1" in self.weights:
+                target_mat = inputs['mat']
+                mat = preds['mat']
+                loss_mat_l1 = th.mean(th.abs(mat - target_mat))
+                loss_dict.update(loss_mat_l1=loss_mat_l1)
+                loss_total += self.weights.mat_l1 * loss_mat_l1
+        if "grad_l2" in self.weights:
+            loss_grad_l2 = th.mean((preds["grad"] - inputs["grad"]) ** 2)
+            loss_total += self.weights.grad_l2 * loss_grad_l2
+            loss_dict.update(loss_grad_l2=loss_grad_l2)
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+class TotalMVPLoss(nn.Module):
+    def __init__(self, weights, assets=None):
+        super().__init__()
+        self.weights = weights
+        if "vgg" in self.weights:
+            self.vgg_loss = VGGLossMasked()
+    def forward(self, inputs, preds, iteration=None):
+        loss_dict = {"loss_total": 0.0}
+        B = inputs["image"].shape
+        # rgb
+        target_rgb = inputs["image"].permute(0, 2, 3, 1)
+        # removing the mask
+        target_rgb = target_rgb * inputs["image_mask"][:, 0, :, :, np.newaxis]
+        rgb = preds["rgb"]
+        loss_rgb_mse = th.mean(((rgb - target_rgb) / 16.0) ** 2.0)
+        loss_dict.update(loss_rgb_mse=loss_rgb_mse)
+        alpha = preds["alpha"]
+        # mask loss
+        target_mask = inputs["image_mask"][:, 0].to(th.float32)
+        loss_mask_mae = th.mean((target_mask - alpha).abs())
+        loss_dict.update(loss_mask_mae=loss_mask_mae)
+        B = alpha.shape[0]
+        # beta prior on opacity
+        loss_alpha_prior = th.mean(
+            th.log(0.1 + alpha.reshape(B, -1))
+            + th.log(0.1 + 1.0 - alpha.reshape(B, -1))
+            - -2.20727
+        )
+        loss_dict.update(loss_alpha_prior=loss_alpha_prior)
+        prim_scale = preds["prim_scale"]
+        loss_prim_vol_sum = th.mean(th.sum(th.prod(100.0 / prim_scale, dim=-1), dim=-1))
+        loss_dict.update(loss_prim_vol_sum=loss_prim_vol_sum)
+        loss_total = (
+            self.weights.rgb_mse * loss_rgb_mse
+            + self.weights.mask_mae * loss_mask_mae
+            + self.weights.alpha_prior * loss_alpha_prior
+            + self.weights.prim_vol_sum * loss_prim_vol_sum
+        )
+        if "embs_l2" in self.weights:
+            loss_embs_l2 = th.sum(th.norm(preds["embs"], dim=1))
+            loss_total += self.weights.embs_l2 * loss_embs_l2
+            loss_dict.update(loss_embs_l2=loss_embs_l2)
+        if "vgg" in self.weights:
+            loss_vgg = self.vgg_loss(
+                rgb.permute(0, 3, 1, 2),
+                target_rgb.permute(0, 3, 1, 2),
+                inputs["image_mask"],
+            )
+            loss_total += self.weights.vgg * loss_vgg
+            loss_dict.update(loss_vgg=loss_vgg)
+        if "prim_scale_var" in self.weights:
+            log_prim_scale = th.log(prim_scale)
+            # NOTE: should we detach this?
+            log_prim_scale_mean = th.mean(log_prim_scale, dim=1, keepdim=True)
+            loss_prim_scale_var = th.mean((log_prim_scale - log_prim_scale_mean) ** 2.0)
+            loss_total += self.weights.prim_scale_var * loss_prim_scale_var
+            loss_dict.update(loss_prim_scale_var=loss_prim_scale_var)
+        loss_dict["loss_total"] = loss_total
+        return loss_total, loss_dict
+def process_losses(loss_dict, reduce=True, detach=True):
+    """Preprocess the dict of losses outputs."""
+    result = {
+        k.replace("loss_", ""): v for k, v in loss_dict.items() if k.startswith("loss_")
+    }
+    if detach:
+        result = {k: v.detach() for k, v in result.items()}
+    if reduce:
+        result = {k: float(v.mean().item()) for k, v in result.items()}
+    return result

dva/mvp/extensions/mvpraymarch/bvh.cu ADDED Viewed

	@@ -0,0 +1,292 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include "helper_math.h"
+#include "cudadispatch.h"
+#include "primtransf.h"
+// Expands a 10-bit integer into 30 bits
+// by inserting 2 zeros after each bit.
+// Each step multiplies to create shifted copies of the value and masks to
+// keep only the bits that belong at the widened positions.
+__device__ unsigned int expand_bits(unsigned int v) {
+    v = (v * 0x00010001u) & 0xFF0000FFu;
+    v = (v * 0x00000101u) & 0x0F00F00Fu;
+    v = (v * 0x00000011u) & 0xC30C30C3u;
+    v = (v * 0x00000005u) & 0x49249249u;
+    return v;
+}
+// Calculates a 30-bit Morton code for the
+// given 3D point located within the unit cube [0,1].
+// Coordinates are scaled by 1024 and clamped to [0, 1023] before truncation,
+// so inputs outside the unit cube map to the nearest boundary cell.
+__device__ unsigned int morton3D(float x, float y, float z) {
+    x = fminf(fmaxf(x * 1024.0f, 0.0f), 1023.0f);
+    y = fminf(fmaxf(y * 1024.0f, 0.0f), 1023.0f);
+    z = fminf(fmaxf(z * 1024.0f, 0.0f), 1023.0f);
+    unsigned int xx = expand_bits((unsigned int)x);
+    unsigned int yy = expand_bits((unsigned int)y);
+    unsigned int zz = expand_bits((unsigned int)z);
+    // interleave: x takes the highest bit of every 3-bit group, then y, then z
+    return xx * 4 + yy * 2 + zz;
+}
+// Writes one Morton code per (n, k) pair into code[n * K + k], computed from
+// the center returned by data.get_center(n, k).
+// NOTE(review): morton3D clamps to the unit cube; presumably centers are
+// expected in [0, 1] - confirm against the caller.
+template<typename PrimTransfT>
+__global__ void compute_morton_kernel(
+        int N, int K,
+        typename PrimTransfT::Data data,
+        int * code
+        ) {
+    const int count = N * K;
+    // grid-stride loop over all N * K entries
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
+        const int k = index % K;
+        const int n = index / K;
+        //float4 c = center[n * K + k];
+        float3 c = data.get_center(n, k);
+        code[n * K + k] = morton3D(c.x, c.y, c.z);
+    }
+}
+// Length of the common bit prefix of sortedcodes[x] and sortedcodes[y].
+// When the two codes are equal, the indices themselves are compared instead
+// and 32 is added, so ties are broken by index. Returns -1 when either index
+// lies outside [0, K - 1].
+__forceinline__ __device__ int delta(int* sortedcodes, int x, int y, int K) {
+	if (x >= 0 && x <= K - 1 && y >= 0 && y <= K - 1) {
+        return sortedcodes[x] == sortedcodes[y] ?
+            32 + __clz(x ^ y) :
+            __clz(sortedcodes[x] ^ sortedcodes[y]);
+    }
+	return -1;
+}
+// Returns 1 for positive x, -1 for negative x and 0 for zero.
+__forceinline__ __device__ int sign(int x) {
+	return (int)(x > 0) - (int)(x < 0);
+}
+// Finds the split position inside [first, last]: the largest index whose code
+// shares more prefix bits with sortedcodes[first] than sortedcodes[last] does,
+// located with a binary search over halving step sizes.
+__device__ int find_split(
+       int* sortedcodes,
+       int first,
+       int last,
+       int K) {
+    // delta() returns an int and is compared against ints below, so keep the
+    // reference prefix length as an int as well
+    int commonPrefix = delta(sortedcodes, first, last, K);
+    int split = first;
+    int step = last - first;
+    do {
+        step = (step + 1) >> 1; // exponential decrease
+        int newSplit = split + step; // proposed new position
+        if (newSplit < last) {
+            int splitPrefix = delta(sortedcodes, first, newSplit, K);
+            if (splitPrefix > commonPrefix) {
+                split = newSplit; // accept proposal
+            }
+        }
+    } while (step > 1);
+    return split;
+}
+// Determines the index range [range.x, range.y] of sorted codes associated
+// with node idx, using only delta() comparisons.
+__device__ int2 determine_range(int* sortedcodes, int K, int idx) {
+    // direction of the range: towards the neighbour sharing the longer prefix
+    int d = sign(delta(sortedcodes, idx, idx + 1, K) - delta(sortedcodes, idx, idx - 1, K));
+    // prefix length shared with the neighbour on the opposite side; every
+    // member of the range must share more than this
+    int dmin = delta(sortedcodes, idx, idx - d, K);
+    // double lmax until idx + lmax * d no longer satisfies that bound
+    // (delta() returns -1 outside the array, which ends the loop)
+    int lmax = 2;
+    while (delta(sortedcodes, idx, idx + lmax * d, K) > dmin) {
+        lmax = lmax * 2;
+    }
+    // binary search for the exact range length l below lmax
+    int l = 0;
+    for (int t = lmax / 2; t >= 1; t /= 2) {
+        if (delta(sortedcodes, idx, idx + (l + t)*d, K) > dmin) {
+            l += t;
+        }
+    }
+    int j = idx + l*d;
+    int2 range;
+    range.x = min(idx, j);
+    range.y = max(idx, j);
+    return range;
+}
+// Fills the child/parent arrays of a binary tree over K sorted codes for each
+// of N batches. Each batch has K + K - 1 node slots: indices [0, K - 1) are
+// internal nodes and indices [K - 1, 2K - 1) are leaves.
+// NOTE(review): nodeparent is only written for children, so the entry of the
+// top node is presumably initialized by the caller - confirm.
+__global__ void build_tree_kernel(
+        int N, int K,
+        int * sortedcodes,
+        int2 * nodechildren,
+        int * nodeparent) {
+    const int count = N * (K + K - 1);
+    // grid-stride loop over every node slot of every batch
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
+        const int k = index % (K + K - 1);
+        const int n = index / (K + K - 1);
+        if (k >= K - 1) {
+            // leaf: store negative values derived from the leaf's position
+            // (-(i) - 1 and -(i) - 2 for leaf number i) instead of child indices
+            nodechildren[n * (K + K - 1) + k] = make_int2(-(k - (K - 1)) - 1, -(k - (K - 1)) - 2);
+        } else {
+            // internal node
+            // find out which range of objects the node corresponds to
+            int2 range = determine_range(sortedcodes + n * K, K, k);
+            int first = range.x;
+            int last = range.y;
+            // determine where to split the range
+            int split = find_split(sortedcodes + n * K, first, last, K);
+            // select childA (a leaf slot when the left part is a single element)
+            int childa = split == first ? (K - 1) + split : split;
+            // select childB (a leaf slot when the right part is a single element)
+            int childb = split + 1 == last ? (K - 1) + split + 1 : split + 1;
+            // record parent-child relationships
+            nodechildren[n * (K + K - 1) + k] = make_int2(childa, childb);
+            nodeparent[n * (K + K - 1) + childa] = k;
+            nodeparent[n * (K + K - 1) + childb] = k;
+        }
+    }
+}
+// Computes an axis-aligned bounding box for every tree node. Each thread
+// writes the box of one leaf and then walks towards the top of the tree; at
+// every internal node the first thread to arrive stops and the second one
+// merges the two child boxes, so a node is processed only after both children.
+// nodeaabb stores (min, max) pairs: two float3 per node, K + K - 1 nodes per batch.
+// atom must hold N * (K - 1) ints, all zero on entry.
+template<typename PrimTransfT>
+__global__ void compute_aabb_kernel(
+        int N, int K,
+        typename PrimTransfT::Data data,
+        int * sortedobjid,
+        int2 * nodechildren,
+        int * nodeparent,
+        float3 * nodeaabb,
+        int * atom) {
+    const int count = N * K;
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
+        const int k = index % K;
+        const int n = index / K;
+        // compute BBOX for leaf
+        int kk = sortedobjid[n * K + k];
+        float3 pmin;
+        float3 pmax;
+        data.compute_aabb(n, kk, pmin, pmax);
+        nodeaabb[n * (K + K - 1) * 2 + ((K - 1) + k) * 2 + 0] = pmin;
+        nodeaabb[n * (K + K - 1) * 2 + ((K - 1) + k) * 2 + 1] = pmax;
+        // make the leaf box visible to other threads before signalling arrival
+        // at the parent; the thread that arrives second reads it from memory
+        __threadfence();
+        int node = nodeparent[n * (K + K - 1) + ((K - 1) + k)];
+        // atomicCAS returns the previous flag: 0 means this thread arrived first
+        // and stops, 1 means the sibling subtree is finished and we can merge
+        while (node != -1 && atomicCAS(&atom[n * (K - 1) + node], 0, 1) == 1) {
+            int2 children = nodechildren[n * (K + K - 1) + node];
+            float3 laabbmin = nodeaabb[n * (K + K - 1) * 2 + children.x * 2 + 0];
+            float3 laabbmax = nodeaabb[n * (K + K - 1) * 2 + children.x * 2 + 1];
+            float3 raabbmin = nodeaabb[n * (K + K - 1) * 2 + children.y * 2 + 0];
+            float3 raabbmax = nodeaabb[n * (K + K - 1) * 2 + children.y * 2 + 1];
+            float3 aabbmin = fminf(laabbmin, raabbmin);
+            float3 aabbmax = fmaxf(laabbmax, raabbmax);
+            nodeaabb[n * (K + K - 1) * 2 + node * 2 + 0] = aabbmin;
+            nodeaabb[n * (K + K - 1) * 2 + node * 2 + 1] = aabbmax;
+            node = nodeparent[n * (K + K - 1) + node];
+            // publish the merged box before the next arrival signal
+            __threadfence();
+        }
+    }
+}
+// Host launcher for compute_morton_kernel<PrimTransfSRT>: one thread per
+// (n, k) entry, 512 threads per block, on the given stream.
+// Only primpos is supplied to the transform data; the remaining pointer
+// fields are passed as nullptr.
+void compute_morton_cuda(
+        int N, int K,
+        float * primpos,
+        int * code,
+        int algorithm,
+        cudaStream_t stream) {
+    int count = N * K;
+    int blocksize = 512;
+    int gridsize = (count + blocksize - 1) / blocksize;
+    std::shared_ptr<PrimTransfDataBase> primtransf_data;
+    primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
+            PrimTransfDataBase{},
+            K, (float3*)primpos, nullptr,
+            K * 3, nullptr, nullptr,
+            K, nullptr, nullptr});
+    std::map<int, std::function<void(dim3, dim3, cudaStream_t, int, int, std::shared_ptr<PrimTransfDataBase>, int*)>> dispatcher = {
+      { 0, make_cudacall(compute_morton_kernel<PrimTransfSRT>) }
+    };
+    // NOTE(review): min(0, algorithm) maps every non-negative algorithm to
+    // entry 0; a negative value finds no entry and nothing is launched -
+    // confirm this is intended.
+    auto iter = dispatcher.find(min(0, algorithm));
+    if (iter != dispatcher.end()) {
+        (iter->second)(
+            dim3(gridsize), dim3(blocksize), stream,
+            N, K,
+            primtransf_data,
+            code);
+    }
+}
+// Host launcher for build_tree_kernel: one thread per node slot
+// (N * (2K - 1) in total), 512 threads per block, on the given stream.
+// nodechildren is reinterpreted as an int2 array (two ints per node).
+void build_tree_cuda(
+        int N, int K,
+        int * sortedcode,
+        int * nodechildren,
+        int * nodeparent,
+        cudaStream_t stream) {
+    int count = N * (K + K - 1);
+    int nthreads = 512;
+    int nblocks = (count + nthreads - 1) / nthreads;
+    build_tree_kernel<<<nblocks, nthreads, 0, stream>>>(
+            N, K,
+            sortedcode,
+            reinterpret_cast<int2 *>(nodechildren),
+            nodeparent);
+}
+// Host launcher for compute_aabb_kernel<PrimTransfSRT>: one thread per
+// (n, k) leaf, 512 threads per block, on the given stream.
+// A temporary flag buffer with one 4-byte entry per internal node
+// (N * (K - 1) entries) is allocated, zeroed, and freed after the launch.
+// NOTE(review): the return codes of cudaMalloc/cudaMemset are not checked.
+void compute_aabb_cuda(
+        int N, int K,
+        float * primpos,
+        float * primrot,
+        float * primscale,
+        int * sortedobjid,
+        int * nodechildren,
+        int * nodeparent,
+        float * nodeaabb,
+        int algorithm,
+        cudaStream_t stream) {
+    int * atom;
+    cudaMalloc(&atom, N * (K - 1) * 4);
+    cudaMemset(atom, 0, N * (K - 1) * 4);
+    int count = N * K;
+    int blocksize = 512;
+    int gridsize = (count + blocksize - 1) / blocksize;
+    std::shared_ptr<PrimTransfDataBase> primtransf_data;
+    primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
+            PrimTransfDataBase{},
+            K, (float3*)primpos, nullptr,
+            K * 3, (float3*)primrot, nullptr,
+            K, (float3*)primscale, nullptr});
+    std::map<int, std::function<void(dim3, dim3, cudaStream_t, int, int, std::shared_ptr<PrimTransfDataBase>, int*, int2*, int*, float3*, int*)>> dispatcher = {
+      { 0, make_cudacall(compute_aabb_kernel<PrimTransfSRT>) }
+    };
+    // NOTE(review): min(0, algorithm) maps every non-negative algorithm to
+    // entry 0; a negative value finds no entry and nothing is launched -
+    // confirm this is intended.
+    auto iter = dispatcher.find(min(0, algorithm));
+    if (iter != dispatcher.end()) {
+        (iter->second)(
+            dim3(gridsize), dim3(blocksize), stream,
+            N, K,
+            primtransf_data,
+            sortedobjid,
+            reinterpret_cast<int2 *>(nodechildren),
+            nodeparent,
+            reinterpret_cast<float3 *>(nodeaabb),
+            atom);
+    }
+    cudaFree(atom);
+}

dva/mvp/extensions/mvpraymarch/cudadispatch.h ADDED Viewed

	@@ -0,0 +1,104 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef cudadispatch_h_
+#define cudadispatch_h_
+#include <functional>
+#include <memory>
+#include <type_traits>
+template<typename T, typename = void>  // primary template: used for every T the specialization below rejects
+struct get_base {
+    typedef T type;  // such a T is passed through unchanged
+};
+template<typename T>  // selected when T has a nested `base` type that std::is_base_of accepts for T
+struct get_base<T, typename std::enable_if<std::is_base_of<typename T::base, T>::value>::type> {
+    typedef std::shared_ptr<typename T::base> type;  // such a T is replaced by shared_ptr<T::base>
+};
+template<typename T> struct is_shared_ptr : std::false_type {};  // trait: false for any T ...
+template<typename T> struct is_shared_ptr<std::shared_ptr<T>> : std::true_type {};  // ... true for std::shared_ptr<T>
+template<typename OutT, typename T>
+auto convert_shptr_impl2(std::shared_ptr<T> t) {  // returns a copy of the pointee viewed as OutT (plain `auto` returns by value)
+    return *static_cast<OutT*>(t.get());  // unchecked static_cast: the pointee must really be an OutT; t must be non-null
+}
+template<typename OutT, typename T>
+auto convert_shptr_impl(T&& t, std::false_type) {  // tag false: argument type differs from OutT
+    return convert_shptr_impl2<OutT>(t);  // so treat it as a shared_ptr and extract an OutT from it
+}
+template<typename OutT, typename T>
+auto convert_shptr_impl(T&& t, std::true_type) {  // tag true: argument already has type OutT
+    return std::forward<T>(t);  // hand it back as-is (returned by value)
+}
+template<typename OutT, typename T>
+auto convert_shptr(T&& t) {  // converts one launcher argument to the type OutT expected by the kernel
+    return convert_shptr_impl<OutT>(std::forward<T>(t), std::is_same<OutT, T>{});  // tag-dispatch on whether the deduced T is exactly OutT
+}
+template<typename... ArgsIn>  // ArgsIn: argument types accepted from the caller
+struct cudacall {  // type-erased kernel launcher: callers pass ArgsIn..., the stored kernel takes ArgsOut...
+    struct functbase {  // abstract interface that hides the kernel's own parameter types
+        virtual ~functbase() {}
+        virtual void call(dim3, dim3, cudaStream_t, ArgsIn...) const = 0;  // (grid size, block size, stream, arguments)
+    };
+    template<typename... ArgsOut>  // ArgsOut: the kernel's real parameter types
+    struct funct : public functbase {
+        std::function<void(ArgsOut...)> fn;  // the kernel function pointer, wrapped in std::function
+        funct(void(*fn_)(ArgsOut...)) : fn(fn_) { }
+        void call(dim3 gridsize, dim3 blocksize, cudaStream_t stream, ArgsIn... args) const {  // implements functbase::call
+            void (*const*kfunc)(ArgsOut...) = fn.template target<void (*)(ArgsOut...)>();  // recover the raw pointer; NOTE(review): target() yields nullptr if fn is empty, which is not checked
+            (*kfunc)<<<gridsize, blocksize, 0, stream>>>(  // launch with 0 bytes of dynamic shared memory on `stream`
+                    std::forward<ArgsOut>(convert_shptr<ArgsOut>(std::forward<ArgsIn>(args)))...);  // each argument is converted pairwise from ArgsIn to ArgsOut
+        }
+    };
+    std::shared_ptr<functbase> fn;  // shared, so copies of a cudacall refer to the same funct
+    template<typename... ArgsOut>
+    cudacall(void(*fn_)(ArgsOut...)) : fn(std::make_shared<funct<ArgsOut...>>(fn_)) { }  // implicit conversion from a kernel function pointer
+    template<typename... ArgsTmp>
+    void call(dim3 gridsize, dim3 blocksize, cudaStream_t stream, ArgsTmp&&... args) const {  // ArgsTmp must have as many entries as ArgsIn
+        fn->call(gridsize, blocksize, stream, std::forward<ArgsIn>(args)...);  // forwards through the virtual interface
+    }
+};
+template <typename F, typename T>
+struct binder {  // callable that pairs f with a fixed first argument t
+    F f; T t;
+    template <typename... Args>
+    auto operator()(Args&&... args) const  // invokes f(t, args...) and returns whatever f returns
+        -> decltype(f(t, std::forward<Args>(args)...)) {
+        return f(t, std::forward<Args>(args)...);
+    }
+};
+template <typename F, typename T>
+binder<typename std::decay<F>::type  // F and T are decayed, so the binder holds its own copies
+     , typename std::decay<T>::type> BindFirst(F&& f, T&& t) {
+    return { std::forward<F>(f), std::forward<T>(t) };  // aggregate-initializes binder{f, t}
+}
+template<typename... ArgsOut>
+auto make_cudacall_(void(*fn)(ArgsOut...)) {  // wraps kernel `fn` in a cudacall and binds that object to cudacall::call
+    return BindFirst(
+            std::mem_fn(&cudacall<typename get_base<ArgsOut>::type...>::template call<typename get_base<ArgsOut>::type...>),  // the member function to invoke
+            cudacall<typename get_base<ArgsOut>::type...>(fn));  // the bound launcher; its caller-facing types are get_base<ArgsOut>::type...
+}
+template<typename... ArgsOut>
+std::function<void(dim3, dim3, cudaStream_t, typename get_base<ArgsOut>::type...)> make_cudacall(void(*fn)(ArgsOut...)) {  // public entry point: kernel pointer in, std::function launcher out
+    return std::function<void(dim3, dim3, cudaStream_t, typename get_base<ArgsOut>::type...)>(make_cudacall_(fn));  // call as (grid size, block size, stream, arguments...)
+}
+#endif

dva/mvp/extensions/mvpraymarch/helper_math.h ADDED Viewed

	@@ -0,0 +1,1453 @@

+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+#include "cuda_runtime.h"
+typedef unsigned int uint;
+typedef unsigned short ushort;
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+#ifndef __CUDACC__
+#include <math.h>
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+// Smaller of two floats; yields b whenever (a < b) is false.
+inline float fminf(float a, float b)
+{
+    if (a < b) {
+        return a;
+    }
+    return b;
+}
+// Larger of two floats; yields b whenever (a > b) is false.
+inline float fmaxf(float a, float b)
+{
+    if (a > b) {
+        return a;
+    }
+    return b;
+}
+// Larger of two ints.
+inline int max(int a, int b)
+{
+    if (a > b) {
+        return a;
+    }
+    return b;
+}
+// Smaller of two ints.
+inline int min(int a, int b)
+{
+    if (a < b) {
+        return a;
+    }
+    return b;
+}
+// Reciprocal square root, computed as 1 / sqrtf(x).
+inline float rsqrtf(float x)
+{
+    const float root = sqrtf(x);
+    return 1.0f / root;
+}
+#endif
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+// Extra make_<type>() overloads on top of the CUDA built-ins: broadcast one
+// scalar to all components, drop or append trailing components, and convert
+// component-wise between float / int / uint vectors of the same width.
+// Components that are appended without an explicit value are set to zero.
+
+// float2
+inline __host__ __device__ float2 make_float2(float s) { return make_float2(s, s); }
+inline __host__ __device__ float2 make_float2(float3 v) { return make_float2(v.x, v.y); }
+inline __host__ __device__ float2 make_float2(int2 v) { return make_float2(float(v.x), float(v.y)); }
+inline __host__ __device__ float2 make_float2(uint2 v) { return make_float2(float(v.x), float(v.y)); }
+// int2
+inline __host__ __device__ int2 make_int2(int s) { return make_int2(s, s); }
+inline __host__ __device__ int2 make_int2(int3 v) { return make_int2(v.x, v.y); }
+inline __host__ __device__ int2 make_int2(uint2 v) { return make_int2(int(v.x), int(v.y)); }
+inline __host__ __device__ int2 make_int2(float2 v) { return make_int2(int(v.x), int(v.y)); }
+// uint2
+inline __host__ __device__ uint2 make_uint2(uint s) { return make_uint2(s, s); }
+inline __host__ __device__ uint2 make_uint2(uint3 v) { return make_uint2(v.x, v.y); }
+inline __host__ __device__ uint2 make_uint2(int2 v) { return make_uint2(uint(v.x), uint(v.y)); }
+// float3
+inline __host__ __device__ float3 make_float3(float s) { return make_float3(s, s, s); }
+inline __host__ __device__ float3 make_float3(float2 v) { return make_float3(v.x, v.y, 0.0f); }
+inline __host__ __device__ float3 make_float3(float2 v, float s) { return make_float3(v.x, v.y, s); }
+inline __host__ __device__ float3 make_float3(float4 v) { return make_float3(v.x, v.y, v.z); }
+inline __host__ __device__ float3 make_float3(int3 v) { return make_float3(float(v.x), float(v.y), float(v.z)); }
+inline __host__ __device__ float3 make_float3(uint3 v) { return make_float3(float(v.x), float(v.y), float(v.z)); }
+// int3
+inline __host__ __device__ int3 make_int3(int s) { return make_int3(s, s, s); }
+inline __host__ __device__ int3 make_int3(int2 v) { return make_int3(v.x, v.y, 0); }
+inline __host__ __device__ int3 make_int3(int2 v, int s) { return make_int3(v.x, v.y, s); }
+inline __host__ __device__ int3 make_int3(uint3 v) { return make_int3(int(v.x), int(v.y), int(v.z)); }
+inline __host__ __device__ int3 make_int3(float3 v) { return make_int3(int(v.x), int(v.y), int(v.z)); }
+// uint3
+inline __host__ __device__ uint3 make_uint3(uint s) { return make_uint3(s, s, s); }
+inline __host__ __device__ uint3 make_uint3(uint2 v) { return make_uint3(v.x, v.y, 0); }
+inline __host__ __device__ uint3 make_uint3(uint2 v, uint s) { return make_uint3(v.x, v.y, s); }
+inline __host__ __device__ uint3 make_uint3(uint4 v) { return make_uint3(v.x, v.y, v.z); }
+inline __host__ __device__ uint3 make_uint3(int3 v) { return make_uint3(uint(v.x), uint(v.y), uint(v.z)); }
+// float4
+inline __host__ __device__ float4 make_float4(float s) { return make_float4(s, s, s, s); }
+inline __host__ __device__ float4 make_float4(float3 v) { return make_float4(v.x, v.y, v.z, 0.0f); }
+inline __host__ __device__ float4 make_float4(float3 v, float w) { return make_float4(v.x, v.y, v.z, w); }
+inline __host__ __device__ float4 make_float4(int4 v) { return make_float4(float(v.x), float(v.y), float(v.z), float(v.w)); }
+inline __host__ __device__ float4 make_float4(uint4 v) { return make_float4(float(v.x), float(v.y), float(v.z), float(v.w)); }
+// int4
+inline __host__ __device__ int4 make_int4(int s) { return make_int4(s, s, s, s); }
+inline __host__ __device__ int4 make_int4(int3 v) { return make_int4(v.x, v.y, v.z, 0); }
+inline __host__ __device__ int4 make_int4(int3 v, int w) { return make_int4(v.x, v.y, v.z, w); }
+inline __host__ __device__ int4 make_int4(uint4 v) { return make_int4(int(v.x), int(v.y), int(v.z), int(v.w)); }
+inline __host__ __device__ int4 make_int4(float4 v) { return make_int4(int(v.x), int(v.y), int(v.z), int(v.w)); }
+// uint4
+inline __host__ __device__ uint4 make_uint4(uint s) { return make_uint4(s, s, s, s); }
+inline __host__ __device__ uint4 make_uint4(uint3 v) { return make_uint4(v.x, v.y, v.z, 0); }
+inline __host__ __device__ uint4 make_uint4(uint3 v, uint w) { return make_uint4(v.x, v.y, v.z, w); }
+inline __host__ __device__ uint4 make_uint4(int4 v) { return make_uint4(uint(v.x), uint(v.y), uint(v.z), uint(v.w)); }
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+// Unary negation: returns a vector with every component negated.
+// The operand is taken by value (like every other operator in this header) so
+// that temporaries and const objects can be negated too, e.g. -(a + b); a
+// non-const lvalue reference parameter cannot bind to those.
+inline __host__ __device__ float2 operator-(float2 a)
+{
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(int2 a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(float3 a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(int3 a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(float4 a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(int4 a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise addition. For each vector type: vector + vector,
+// vector + scalar, scalar + vector (the scalar is added to every component),
+// and the in-place += forms, which modify their first operand.
+
+// float2
+inline __host__ __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); }
+inline __host__ __device__ void operator+=(float2 &a, float2 b) { a.x += b.x; a.y += b.y; }
+inline __host__ __device__ float2 operator+(float2 a, float b) { return make_float2(a.x + b, a.y + b); }
+inline __host__ __device__ float2 operator+(float b, float2 a) { return make_float2(a.x + b, a.y + b); }
+inline __host__ __device__ void operator+=(float2 &a, float b) { a.x += b; a.y += b; }
+// int2
+inline __host__ __device__ int2 operator+(int2 a, int2 b) { return make_int2(a.x + b.x, a.y + b.y); }
+inline __host__ __device__ void operator+=(int2 &a, int2 b) { a.x += b.x; a.y += b.y; }
+inline __host__ __device__ int2 operator+(int2 a, int b) { return make_int2(a.x + b, a.y + b); }
+inline __host__ __device__ int2 operator+(int b, int2 a) { return make_int2(a.x + b, a.y + b); }
+inline __host__ __device__ void operator+=(int2 &a, int b) { a.x += b; a.y += b; }
+// uint2
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) { return make_uint2(a.x + b.x, a.y + b.y); }
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b) { a.x += b.x; a.y += b.y; }
+inline __host__ __device__ uint2 operator+(uint2 a, uint b) { return make_uint2(a.x + b, a.y + b); }
+inline __host__ __device__ uint2 operator+(uint b, uint2 a) { return make_uint2(a.x + b, a.y + b); }
+inline __host__ __device__ void operator+=(uint2 &a, uint b) { a.x += b; a.y += b; }
+// float3
+inline __host__ __device__ float3 operator+(float3 a, float3 b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
+inline __host__ __device__ void operator+=(float3 &a, float3 b) { a.x += b.x; a.y += b.y; a.z += b.z; }
+inline __host__ __device__ float3 operator+(float3 a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
+inline __host__ __device__ void operator+=(float3 &a, float b) { a.x += b; a.y += b; a.z += b; }
+// int3
+inline __host__ __device__ int3 operator+(int3 a, int3 b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
+inline __host__ __device__ void operator+=(int3 &a, int3 b) { a.x += b.x; a.y += b.y; a.z += b.z; }
+inline __host__ __device__ int3 operator+(int3 a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
+inline __host__ __device__ void operator+=(int3 &a, int b) { a.x += b; a.y += b; a.z += b; }
+// uint3
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b) { a.x += b.x; a.y += b.y; a.z += b.z; }
+inline __host__ __device__ uint3 operator+(uint3 a, uint b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
+inline __host__ __device__ void operator+=(uint3 &a, uint b) { a.x += b; a.y += b; a.z += b; }
+// scalar + 3-component vector
+inline __host__ __device__ int3 operator+(int b, int3 a) { return make_int3(a.x + b, a.y + b, a.z + b); }
+inline __host__ __device__ uint3 operator+(uint b, uint3 a) { return make_uint3(a.x + b, a.y + b, a.z + b); }
+inline __host__ __device__ float3 operator+(float b, float3 a) { return make_float3(a.x + b, a.y + b, a.z + b); }
+// float4
+inline __host__ __device__ float4 operator+(float4 a, float4 b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+inline __host__ __device__ void operator+=(float4 &a, float4 b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
+inline __host__ __device__ float4 operator+(float4 a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ float4 operator+(float b, float4 a) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ void operator+=(float4 &a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; }
+// int4
+inline __host__ __device__ int4 operator+(int4 a, int4 b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+inline __host__ __device__ void operator+=(int4 &a, int4 b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
+inline __host__ __device__ int4 operator+(int4 a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ int4 operator+(int b, int4 a) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ void operator+=(int4 &a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; }
+// uint4
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
+inline __host__ __device__ uint4 operator+(uint4 a, uint b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ uint4 operator+(uint b, uint4 a) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
+inline __host__ __device__ void operator+=(uint4 &a, uint b) { a.x += b; a.y += b; a.z += b; a.w += b; }
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise subtraction. For each vector type: vector - vector,
+// vector - scalar, scalar - vector (the scalar is the minuend of every
+// component), and the in-place -= forms, which modify their first operand.
+// float4 has no scalar - vector overload.
+
+// float2
+inline __host__ __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+inline __host__ __device__ void operator-=(float2 &a, float2 b) { a.x -= b.x; a.y -= b.y; }
+inline __host__ __device__ float2 operator-(float2 a, float b) { return make_float2(a.x - b, a.y - b); }
+inline __host__ __device__ float2 operator-(float b, float2 a) { return make_float2(b - a.x, b - a.y); }
+inline __host__ __device__ void operator-=(float2 &a, float b) { a.x -= b; a.y -= b; }
+// int2
+inline __host__ __device__ int2 operator-(int2 a, int2 b) { return make_int2(a.x - b.x, a.y - b.y); }
+inline __host__ __device__ void operator-=(int2 &a, int2 b) { a.x -= b.x; a.y -= b.y; }
+inline __host__ __device__ int2 operator-(int2 a, int b) { return make_int2(a.x - b, a.y - b); }
+inline __host__ __device__ int2 operator-(int b, int2 a) { return make_int2(b - a.x, b - a.y); }
+inline __host__ __device__ void operator-=(int2 &a, int b) { a.x -= b; a.y -= b; }
+// uint2
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) { return make_uint2(a.x - b.x, a.y - b.y); }
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b) { a.x -= b.x; a.y -= b.y; }
+inline __host__ __device__ uint2 operator-(uint2 a, uint b) { return make_uint2(a.x - b, a.y - b); }
+inline __host__ __device__ uint2 operator-(uint b, uint2 a) { return make_uint2(b - a.x, b - a.y); }
+inline __host__ __device__ void operator-=(uint2 &a, uint b) { a.x -= b; a.y -= b; }
+// float3
+inline __host__ __device__ float3 operator-(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+inline __host__ __device__ void operator-=(float3 &a, float3 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; }
+inline __host__ __device__ float3 operator-(float3 a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
+inline __host__ __device__ float3 operator-(float b, float3 a) { return make_float3(b - a.x, b - a.y, b - a.z); }
+inline __host__ __device__ void operator-=(float3 &a, float b) { a.x -= b; a.y -= b; a.z -= b; }
+// int3
+inline __host__ __device__ int3 operator-(int3 a, int3 b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
+inline __host__ __device__ void operator-=(int3 &a, int3 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; }
+inline __host__ __device__ int3 operator-(int3 a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
+inline __host__ __device__ int3 operator-(int b, int3 a) { return make_int3(b - a.x, b - a.y, b - a.z); }
+inline __host__ __device__ void operator-=(int3 &a, int b) { a.x -= b; a.y -= b; a.z -= b; }
+// uint3
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; }
+inline __host__ __device__ uint3 operator-(uint3 a, uint b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
+inline __host__ __device__ uint3 operator-(uint b, uint3 a) { return make_uint3(b - a.x, b - a.y, b - a.z); }
+inline __host__ __device__ void operator-=(uint3 &a, uint b) { a.x -= b; a.y -= b; a.z -= b; }
+// float4
+inline __host__ __device__ float4 operator-(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+inline __host__ __device__ void operator-=(float4 &a, float4 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; }
+inline __host__ __device__ float4 operator-(float4 a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
+inline __host__ __device__ void operator-=(float4 &a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; }
+// int4
+inline __host__ __device__ int4 operator-(int4 a, int4 b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+inline __host__ __device__ void operator-=(int4 &a, int4 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; }
+inline __host__ __device__ int4 operator-(int4 a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
+inline __host__ __device__ int4 operator-(int b, int4 a) { return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); }
+inline __host__ __device__ void operator-=(int4 &a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; }
+// uint4
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; }
+inline __host__ __device__ uint4 operator-(uint4 a, uint b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
+inline __host__ __device__ uint4 operator-(uint b, uint4 a) { return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); }
+inline __host__ __device__ void operator-=(uint4 &a, uint b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; }
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise multiplication. For each vector type: vector * vector,
+// vector * scalar, scalar * vector, and the in-place *= forms, which modify
+// their first operand.
+
+// float2
+inline __host__ __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+inline __host__ __device__ void operator*=(float2 &a, float2 b) { a.x *= b.x; a.y *= b.y; }
+inline __host__ __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); }
+inline __host__ __device__ float2 operator*(float b, float2 a) { return make_float2(b * a.x, b * a.y); }
+inline __host__ __device__ void operator*=(float2 &a, float b) { a.x *= b; a.y *= b; }
+// int2
+inline __host__ __device__ int2 operator*(int2 a, int2 b) { return make_int2(a.x * b.x, a.y * b.y); }
+inline __host__ __device__ void operator*=(int2 &a, int2 b) { a.x *= b.x; a.y *= b.y; }
+inline __host__ __device__ int2 operator*(int2 a, int b) { return make_int2(a.x * b, a.y * b); }
+inline __host__ __device__ int2 operator*(int b, int2 a) { return make_int2(b * a.x, b * a.y); }
+inline __host__ __device__ void operator*=(int2 &a, int b) { a.x *= b; a.y *= b; }
+// uint2
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) { return make_uint2(a.x * b.x, a.y * b.y); }
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b) { a.x *= b.x; a.y *= b.y; }
+inline __host__ __device__ uint2 operator*(uint2 a, uint b) { return make_uint2(a.x * b, a.y * b); }
+inline __host__ __device__ uint2 operator*(uint b, uint2 a) { return make_uint2(b * a.x, b * a.y); }
+inline __host__ __device__ void operator*=(uint2 &a, uint b) { a.x *= b; a.y *= b; }
+// float3
+inline __host__ __device__ float3 operator*(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+inline __host__ __device__ void operator*=(float3 &a, float3 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; }
+inline __host__ __device__ float3 operator*(float3 a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
+inline __host__ __device__ float3 operator*(float b, float3 a) { return make_float3(b * a.x, b * a.y, b * a.z); }
+inline __host__ __device__ void operator*=(float3 &a, float b) { a.x *= b; a.y *= b; a.z *= b; }
+// int3
+inline __host__ __device__ int3 operator*(int3 a, int3 b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
+inline __host__ __device__ void operator*=(int3 &a, int3 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; }
+inline __host__ __device__ int3 operator*(int3 a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
+inline __host__ __device__ int3 operator*(int b, int3 a) { return make_int3(b * a.x, b * a.y, b * a.z); }
+inline __host__ __device__ void operator*=(int3 &a, int b) { a.x *= b; a.y *= b; a.z *= b; }
+// uint3
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; }
+inline __host__ __device__ uint3 operator*(uint3 a, uint b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
+inline __host__ __device__ uint3 operator*(uint b, uint3 a) { return make_uint3(b * a.x, b * a.y, b * a.z); }
+inline __host__ __device__ void operator*=(uint3 &a, uint b) { a.x *= b; a.y *= b; a.z *= b; }
+// float4
+inline __host__ __device__ float4 operator*(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+inline __host__ __device__ void operator*=(float4 &a, float4 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; }
+inline __host__ __device__ float4 operator*(float4 a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
+inline __host__ __device__ float4 operator*(float b, float4 a) { return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); }
+inline __host__ __device__ void operator*=(float4 &a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; }
+// int4
+inline __host__ __device__ int4 operator*(int4 a, int4 b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+inline __host__ __device__ void operator*=(int4 &a, int4 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; }
+inline __host__ __device__ int4 operator*(int4 a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
+inline __host__ __device__ int4 operator*(int b, int4 a) { return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); }
+inline __host__ __device__ void operator*=(int4 &a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; }
+// uint4
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; }
+inline __host__ __device__ uint4 operator*(uint4 a, uint b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
+inline __host__ __device__ uint4 operator*(uint b, uint4 a) { return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); }
+inline __host__ __device__ void operator*=(uint4 &a, uint b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; }
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise division for the float vector types: vector / vector,
+// vector / scalar, scalar / vector (the scalar is the dividend of every
+// component), and the in-place /= forms, which modify their first operand.
+
+// float2
+inline __host__ __device__ float2 operator/(float2 a, float2 b) { return make_float2(a.x / b.x, a.y / b.y); }
+inline __host__ __device__ void operator/=(float2 &a, float2 b) { a.x /= b.x; a.y /= b.y; }
+inline __host__ __device__ float2 operator/(float2 a, float b) { return make_float2(a.x / b, a.y / b); }
+inline __host__ __device__ void operator/=(float2 &a, float b) { a.x /= b; a.y /= b; }
+inline __host__ __device__ float2 operator/(float b, float2 a) { return make_float2(b / a.x, b / a.y); }
+// float3
+inline __host__ __device__ float3 operator/(float3 a, float3 b) { return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); }
+inline __host__ __device__ void operator/=(float3 &a, float3 b) { a.x /= b.x; a.y /= b.y; a.z /= b.z; }
+inline __host__ __device__ float3 operator/(float3 a, float b) { return make_float3(a.x / b, a.y / b, a.z / b); }
+inline __host__ __device__ void operator/=(float3 &a, float b) { a.x /= b; a.y /= b; a.z /= b; }
+inline __host__ __device__ float3 operator/(float b, float3 a) { return make_float3(b / a.x, b / a.y, b / a.z); }
+// float4
+inline __host__ __device__ float4 operator/(float4 a, float4 b) { return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
+inline __host__ __device__ void operator/=(float4 &a, float4 b) { a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w; }
+inline __host__ __device__ float4 operator/(float4 a, float b) { return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); }
+inline __host__ __device__ void operator/=(float4 &a, float b) { a.x /= b; a.y /= b; a.z /= b; a.w /= b; }
+inline __host__ __device__ float4 operator/(float b, float4 a) { return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); }
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise minimum: fminf for the float vector types, min for the
+// int and uint vector types. Each component is reduced with the scalar
+// function of the same name.
+inline __host__ __device__ float2 fminf(float2 a, float2 b) { return make_float2(fminf(a.x, b.x), fminf(a.y, b.y)); }
+inline __host__ __device__ float3 fminf(float3 a, float3 b) { return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z)); }
+inline __host__ __device__ float4 fminf(float4 a, float4 b) { return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); }
+inline __host__ __device__ int2 min(int2 a, int2 b) { return make_int2(min(a.x, b.x), min(a.y, b.y)); }
+inline __host__ __device__ int3 min(int3 a, int3 b) { return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+inline __host__ __device__ int4 min(int4 a, int4 b) { return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+inline __host__ __device__ uint2 min(uint2 a, uint2 b) { return make_uint2(min(a.x, b.x), min(a.y, b.y)); }
+inline __host__ __device__ uint3 min(uint3 a, uint3 b) { return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+inline __host__ __device__ uint4 min(uint4 a, uint4 b) { return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise maximum of two float2 vectors.
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
+{
+    const float hi_x = fmaxf(a.x, b.x);
+    const float hi_y = fmaxf(a.y, b.y);
+    return make_float2(hi_x, hi_y);
+}
+// Component-wise maximum of two float3 vectors.
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+    const float hi_x = fmaxf(a.x, b.x);
+    const float hi_y = fmaxf(a.y, b.y);
+    const float hi_z = fmaxf(a.z, b.z);
+    return make_float3(hi_x, hi_y, hi_z);
+}
+// Component-wise maximum of two float4 vectors.
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
+{
+    const float hi_x = fmaxf(a.x, b.x);
+    const float hi_y = fmaxf(a.y, b.y);
+    const float hi_z = fmaxf(a.z, b.z);
+    const float hi_w = fmaxf(a.w, b.w);
+    return make_float4(hi_x, hi_y, hi_z, hi_w);
+}
+// Component-wise maximum for the signed integer vector types.
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+    return make_int2(
+        max(a.x, b.x),
+        max(a.y, b.y));
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+    return make_int3(
+        max(a.x, b.x),
+        max(a.y, b.y),
+        max(a.z, b.z));
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+    return make_int4(
+        max(a.x, b.x),
+        max(a.y, b.y),
+        max(a.z, b.z),
+        max(a.w, b.w));
+}
+// Component-wise maximum for the unsigned integer vector types.
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+    return make_uint2(
+        max(a.x, b.x),
+        max(a.y, b.y));
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+    return make_uint3(
+        max(a.x, b.x),
+        max(a.y, b.y),
+        max(a.z, b.z));
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+    return make_uint4(
+        max(a.x, b.x),
+        max(a.y, b.y),
+        max(a.z, b.z),
+        max(a.w, b.w));
+}
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+// Scalar interpolation: yields a at t == 0 and b at t == 1.
+inline __device__ __host__ float lerp(float a, float b, float t)
+{
+    const float span = b - a;
+    return a + t * span;
+}
+// Vector overloads: one scalar weight t is applied to every component.
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+    const float2 span = b - a;
+    return a + t * span;
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
+{
+    const float3 span = b - a;
+    return a + t * span;
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
+{
+    const float4 span = b - a;
+    return a + t * span;
+}
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+// Scalar clamp: f is first capped from above by b, then raised to at least a.
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    const float capped = fminf(f, b);
+    return fmaxf(a, capped);
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+    return max(
+        a,
+        min(f, b));
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+    return max(
+        a,
+        min(f, b));
+}
+// float2 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+    return make_float2(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b));
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+    return make_float2(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y));
+}
+// float3 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    return make_float3(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b));
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    return make_float3(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z));
+}
+// float4 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ float4 clamp(float4 v, float a, float b)
+{
+    return make_float4(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b),
+        clamp(v.w, a, b));
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
+{
+    return make_float4(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z),
+        clamp(v.w, a.w, b.w));
+}
+// int2 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+    return make_int2(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+    return make_int2(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y));
+}
+// int3 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+    return make_int3(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+    return make_int3(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z));
+}
+// int4 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+    return make_int4(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b),
+        clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+    return make_int4(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z),
+        clamp(v.w, a.w, b.w));
+}
+// uint2 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+    return make_uint2(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+    return make_uint2(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y));
+}
+// uint3 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+    return make_uint3(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+    return make_uint3(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z));
+}
+// uint4 clamp: shared scalar bounds, then per-component bounds.
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+    return make_uint4(
+        clamp(v.x, a, b),
+        clamp(v.y, a, b),
+        clamp(v.z, a, b),
+        clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+    return make_uint4(
+        clamp(v.x, a.x, b.x),
+        clamp(v.y, a.y, b.y),
+        clamp(v.z, a.z, b.z),
+        clamp(v.w, a.w, b.w));
+}
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+// Sum of component-wise products, float vectors.
+// Each sum is kept as a single expression, accumulated left to right.
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+    return a.x * b.x
+         + a.y * b.y;
+}
+inline __host__ __device__ float dot(float3 a, float3 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z;
+}
+inline __host__ __device__ float dot(float4 a, float4 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z
+         + a.w * b.w;
+}
+// Sum of component-wise products, signed integer vectors.
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+    return a.x * b.x
+         + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z
+         + a.w * b.w;
+}
+// Sum of component-wise products, unsigned integer vectors.
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+    return a.x * b.x
+         + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+    return a.x * b.x
+         + a.y * b.y
+         + a.z * b.z
+         + a.w * b.w;
+}
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+// Euclidean length: square root of the vector's dot product with itself.
+inline __host__ __device__ float length(float2 v)
+{
+    const float len_sq = dot(v, v);
+    return sqrtf(len_sq);
+}
+inline __host__ __device__ float length(float3 v)
+{
+    const float len_sq = dot(v, v);
+    return sqrtf(len_sq);
+}
+inline __host__ __device__ float length(float4 v)
+{
+    const float len_sq = dot(v, v);
+    return sqrtf(len_sq);
+}
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+// Scales v by rsqrtf(dot(v, v)).
+// No guard against a zero-length input is applied here.
+inline __host__ __device__ float2 normalize(float2 v)
+{
+    const float len_sq = dot(v, v);
+    return v * rsqrtf(len_sq);
+}
+inline __host__ __device__ float3 normalize(float3 v)
+{
+    const float len_sq = dot(v, v);
+    return v * rsqrtf(len_sq);
+}
+inline __host__ __device__ float4 normalize(float4 v)
+{
+    const float len_sq = dot(v, v);
+    return v * rsqrtf(len_sq);
+}
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+// Applies the scalar floorf to each component.
+inline __host__ __device__ float2 floorf(float2 v)
+{
+    const float fx = floorf(v.x);
+    const float fy = floorf(v.y);
+    return make_float2(fx, fy);
+}
+inline __host__ __device__ float3 floorf(float3 v)
+{
+    const float fx = floorf(v.x);
+    const float fy = floorf(v.y);
+    const float fz = floorf(v.z);
+    return make_float3(fx, fy, fz);
+}
+inline __host__ __device__ float4 floorf(float4 v)
+{
+    const float fx = floorf(v.x);
+    const float fy = floorf(v.y);
+    const float fz = floorf(v.z);
+    const float fw = floorf(v.w);
+    return make_float4(fx, fy, fz, fw);
+}
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+// Scalar: v minus floorf(v).
+inline __host__ __device__ float fracf(float v)
+{
+    const float whole = floorf(v);
+    return v - whole;
+}
+// Vector overloads apply the scalar fracf to each component.
+inline __host__ __device__ float2 fracf(float2 v)
+{
+    return make_float2(
+        fracf(v.x),
+        fracf(v.y));
+}
+inline __host__ __device__ float3 fracf(float3 v)
+{
+    return make_float3(
+        fracf(v.x),
+        fracf(v.y),
+        fracf(v.z));
+}
+inline __host__ __device__ float4 fracf(float4 v)
+{
+    return make_float4(
+        fracf(v.x),
+        fracf(v.y),
+        fracf(v.z),
+        fracf(v.w));
+}
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+// Applies the scalar fmodf to matching components of a and b.
+inline __host__ __device__ float2 fmodf(float2 a, float2 b)
+{
+    const float rx = fmodf(a.x, b.x);
+    const float ry = fmodf(a.y, b.y);
+    return make_float2(rx, ry);
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b)
+{
+    const float rx = fmodf(a.x, b.x);
+    const float ry = fmodf(a.y, b.y);
+    const float rz = fmodf(a.z, b.z);
+    return make_float3(rx, ry, rz);
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b)
+{
+    const float rx = fmodf(a.x, b.x);
+    const float ry = fmodf(a.y, b.y);
+    const float rz = fmodf(a.z, b.z);
+    const float rw = fmodf(a.w, b.w);
+    return make_float4(rx, ry, rz, rw);
+}
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+// Applies the scalar fabs to each component of a float vector.
+inline __host__ __device__ float2 fabs(float2 v)
+{
+    return make_float2(
+        fabs(v.x),
+        fabs(v.y));
+}
+inline __host__ __device__ float3 fabs(float3 v)
+{
+    return make_float3(
+        fabs(v.x),
+        fabs(v.y),
+        fabs(v.z));
+}
+inline __host__ __device__ float4 fabs(float4 v)
+{
+    return make_float4(
+        fabs(v.x),
+        fabs(v.y),
+        fabs(v.z),
+        fabs(v.w));
+}
+// Applies the scalar abs to each component of a signed integer vector.
+inline __host__ __device__ int2 abs(int2 v)
+{
+    return make_int2(
+        abs(v.x),
+        abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+    return make_int3(
+        abs(v.x),
+        abs(v.y),
+        abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+    return make_int4(
+        abs(v.x),
+        abs(v.y),
+        abs(v.z),
+        abs(v.w));
+}
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+// Computes i - 2 * n * dot(n, i).
+inline __host__ __device__ float3 reflect(float3 i, float3 n)
+{
+    const float3 twice_projection = 2.0f * n * dot(n, i);
+    return i - twice_projection;
+}
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+// Cross product a x b of two float3 vectors.
+inline __host__ __device__ float3 cross(float3 a, float3 b)
+{
+    return make_float3(
+        a.y*b.z - a.z*b.y,
+        a.z*b.x - a.x*b.z,
+        a.x*b.y - a.y*b.x);
+}
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+// Scalar: clamps (x - a) / (b - a) to [0, 1] and evaluates y*y*(3 - 2*y).
+// No guard for a == b is applied here.
+inline __device__ __host__ float smoothstep(float a, float b, float x)
+{
+    const float ratio = (x - a) / (b - a);
+    const float y = clamp(ratio, 0.0f, 1.0f);
+    return (y*y*(3.0f - (2.0f*y)));
+}
+// Vector overloads evaluate the same polynomial with component-wise operators.
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
+{
+    const float2 ratio = (x - a) / (b - a);
+    const float2 y = clamp(ratio, 0.0f, 1.0f);
+    return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
+{
+    const float3 ratio = (x - a) / (b - a);
+    const float3 y = clamp(ratio, 0.0f, 1.0f);
+    return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
+{
+    const float4 ratio = (x - a) / (b - a);
+    const float4 y = clamp(ratio, 0.0f, 1.0f);
+    return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
+}
+#endif

dva/mvp/extensions/mvpraymarch/makefile ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ all:
2	+ python setup.py build_ext --inplace

dva/mvp/extensions/mvpraymarch/mvpraymarch.cpp ADDED Viewed

	@@ -0,0 +1,405 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
+#include <vector>
+// Forward declaration only; the definition is not in this file
+// (presumably in the extension's CUDA sources - confirm).
+// Called by compute_morton() below with N = primpos.size(0), K = primpos.size(1).
+void compute_morton_cuda(
+        int N, int K,
+        float * primpos,
+        int * code,
+        int algorithm,
+        cudaStream_t stream);
+// Forward declaration only; the definition is not in this file
+// (presumably in the extension's CUDA sources - confirm).
+// Called by build_tree() below with N = sortedcode.size(0), K = sortedcode.size(1).
+void build_tree_cuda(
+        int N, int K,
+        int * sortedcode,
+        int * nodechildren,
+        int * nodeparent,
+        cudaStream_t stream);
+// Forward declaration only; the definition is not in this file
+// (presumably in the extension's CUDA sources - confirm).
+// compute_aabb() below passes nullptr for primrot / primscale when the
+// corresponding optional tensor is absent.
+void compute_aabb_cuda(
+        int N, int K,
+        float * primpos,
+        float * primrot,
+        float * primscale,
+        int * sortedobjid,
+        int * nodechildren,
+        int * nodeparent,
+        float * nodeaabb,
+        int algorithm,
+        cudaStream_t stream);
+// Forward declaration only; the definition is not in this file
+// (presumably in the extension's CUDA sources - confirm).
+// raymarch_forward() below passes nullptr for every optional tensor that is
+// absent (sortedobjid, nodechildren, nodeaabb, primrot, primscale, warp,
+// raysatim, raytermim) and 0 for WD/WH/WW when there is no warp tensor.
+void raymarch_forward_cuda(
+        int N, int H, int W, int K,
+        float * rayposim,
+        float * raydirim,
+        float stepsize,
+        float * tminmaxim,
+        int * sortedobjid,
+        int * nodechildren,
+        float * nodeaabb,
+        float * primpos,
+        float * primrot,
+        float * primscale,
+        int TD, int TH, int TW,
+        float * tplate,
+        int WD, int WH, int WW,
+        float * warp,
+        float * rayrgbaim,
+        float * raysatim,
+        int * raytermim,
+        int algorithm, bool sortboxes, int maxhitboxes, bool synchitboxes,
+        bool chlast, float fadescale, float fadeexp, int accum, float termthresh,
+        int griddim, int blocksizex, int blocksizey,
+        cudaStream_t stream);
+// Forward declaration only; the definition is not in this file
+// (presumably in the extension's CUDA sources - confirm).
+// Same argument layout as raymarch_forward_cuda, with a grad_* buffer
+// following each of primpos, primrot, primscale, tplate, warp and rayrgbaim.
+// raymarch_backward() below passes nullptr for absent optional tensors.
+void raymarch_backward_cuda(
+        int N, int H, int W, int K,
+        float * rayposim,
+        float * raydirim,
+        float stepsize,
+        float * tminmaxim,
+        int * sortedobjid,
+        int * nodechildren,
+        float * nodeaabb,
+        float * primpos,
+        float * grad_primpos,
+        float * primrot,
+        float * grad_primrot,
+        float * primscale,
+        float * grad_primscale,
+        int TD, int TH, int TW,
+        float * tplate,
+        float * grad_tplate,
+        int WD, int WH, int WW,
+        float * warp,
+        float * grad_warp,
+        float * rayrgbaim,
+        float * grad_rayrgba,
+        float * raysatim,
+        int * raytermim,
+        int algorithm, bool sortboxes, int maxhitboxes, bool synchitboxes,
+        bool chlast, float fadescale, float fadeexp, int accum, float termthresh,
+        int griddim, int blocksizex, int blocksizey,
+        cudaStream_t stream);
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA((x)); CHECK_CONTIGUOUS((x))
+// Python-facing wrapper around compute_morton_cuda.
+// primpos and code must be contiguous CUDA tensors. Their storage is
+// reinterpreted as float* / int* without a dtype check, so the caller is
+// responsible for passing float32 / int32 data. code is written in place;
+// the returned vector is always empty.
+std::vector<torch::Tensor> compute_morton(
+        torch::Tensor primpos,
+        torch::Tensor code,
+        int algorithm) {
+    CHECK_INPUT(primpos);
+    CHECK_INPUT(code);
+    int N = primpos.size(0);
+    int K = primpos.size(1);
+    // Launch on PyTorch's current stream instead of the hard-coded stream 0,
+    // so the kernel is ordered with the ops that produce/consume these tensors.
+    cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();
+    compute_morton_cuda(
+            N, K,
+            reinterpret_cast<float *>(primpos.data_ptr()),
+            reinterpret_cast<int *>(code.data_ptr()),
+            algorithm,
+            stream);
+    return {};
+}
+// Python-facing wrapper around build_tree_cuda.
+// All three tensors must be contiguous CUDA tensors; their storage is
+// reinterpreted as int* without a dtype check. nodechildren and nodeparent
+// are passed as output buffers; the returned vector is always empty.
+std::vector<torch::Tensor> build_tree(
+        torch::Tensor sortedcode,
+        torch::Tensor nodechildren,
+        torch::Tensor nodeparent) {
+    CHECK_INPUT(sortedcode);
+    CHECK_INPUT(nodechildren);
+    CHECK_INPUT(nodeparent);
+    int N = sortedcode.size(0);
+    int K = sortedcode.size(1);
+    // Launch on PyTorch's current stream instead of the hard-coded stream 0,
+    // so the kernel is ordered with the ops that produce/consume these tensors.
+    cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();
+    build_tree_cuda(N, K,
+            reinterpret_cast<int *>(sortedcode.data_ptr()),
+            reinterpret_cast<int *>(nodechildren.data_ptr()),
+            reinterpret_cast<int *>(nodeparent.data_ptr()),
+            stream);
+    return {};
+}
+// Python-facing wrapper around compute_aabb_cuda.
+// primrot and primscale are optional; nullptr is forwarded when absent.
+// Every tensor that is present must be a contiguous CUDA tensor; storage is
+// reinterpreted as float* / int* without a dtype check. nodeaabb is passed
+// as the output buffer; the returned vector is always empty.
+std::vector<torch::Tensor> compute_aabb(
+        torch::Tensor primpos,
+        torch::optional<torch::Tensor> primrot,
+        torch::optional<torch::Tensor> primscale,
+        torch::Tensor sortedobjid,
+        torch::Tensor nodechildren,
+        torch::Tensor nodeparent,
+        torch::Tensor nodeaabb,
+        int algorithm) {
+    CHECK_INPUT(sortedobjid);
+    CHECK_INPUT(primpos);
+    if (primrot) { CHECK_INPUT(*primrot); }
+    if (primscale) { CHECK_INPUT(*primscale); }
+    CHECK_INPUT(nodechildren);
+    CHECK_INPUT(nodeparent);
+    CHECK_INPUT(nodeaabb);
+    int N = primpos.size(0);
+    int K = primpos.size(1);
+    // Launch on PyTorch's current stream instead of the hard-coded stream 0,
+    // so the kernel is ordered with the ops that produce/consume these tensors.
+    cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();
+    compute_aabb_cuda(N, K,
+            reinterpret_cast<float *>(primpos.data_ptr()),
+            primrot ? reinterpret_cast<float *>(primrot->data_ptr()) : nullptr,
+            primscale ? reinterpret_cast<float *>(primscale->data_ptr()) : nullptr,
+            reinterpret_cast<int *>(sortedobjid.data_ptr()),
+            reinterpret_cast<int *>(nodechildren.data_ptr()),
+            reinterpret_cast<int *>(nodeparent.data_ptr()),
+            reinterpret_cast<float *>(nodeaabb.data_ptr()),
+            algorithm,
+            stream);
+    return {};
+}
+// Python-facing wrapper around raymarch_forward_cuda.
+// Validates that every tensor that is present is a contiguous CUDA tensor,
+// derives the size arguments from the tensor shapes and forwards raw
+// pointers (nullptr for absent optionals). Storage is reinterpreted as
+// float* / int* without a dtype check. rayrgbaim, raysatim and raytermim are
+// passed as output buffers; the returned vector is always empty.
+std::vector<torch::Tensor> raymarch_forward(
+        torch::Tensor rayposim,
+        torch::Tensor raydirim,
+        float stepsize,
+        torch::Tensor tminmaxim,
+        torch::optional<torch::Tensor> sortedobjid,
+        torch::optional<torch::Tensor> nodechildren,
+        torch::optional<torch::Tensor> nodeaabb,
+        torch::Tensor primpos,
+        torch::optional<torch::Tensor> primrot,
+        torch::optional<torch::Tensor> primscale,
+        torch::Tensor tplate,
+        torch::optional<torch::Tensor> warp,
+        torch::Tensor rayrgbaim,
+        torch::optional<torch::Tensor> raysatim,
+        torch::optional<torch::Tensor> raytermim,
+        int algorithm=0,
+        bool sortboxes=true,
+        int maxhitboxes=512,
+        bool synchitboxes=false,
+        bool chlast=false,
+        float fadescale=8.f,
+        float fadeexp=8.f,
+        int accum=0,
+        float termthresh=0.f,
+        int griddim=3,
+        int blocksizex=8,
+        int blocksizey=16) {
+    CHECK_INPUT(rayposim);
+    CHECK_INPUT(raydirim);
+    CHECK_INPUT(tminmaxim);
+    if (sortedobjid) { CHECK_INPUT(*sortedobjid); }
+    if (nodechildren) { CHECK_INPUT(*nodechildren); }
+    if (nodeaabb) { CHECK_INPUT(*nodeaabb); }
+    CHECK_INPUT(tplate);
+    if (warp) { CHECK_INPUT(*warp); }
+    CHECK_INPUT(primpos);
+    if (primrot) { CHECK_INPUT(*primrot); }
+    if (primscale) { CHECK_INPUT(*primscale); }
+    CHECK_INPUT(rayrgbaim);
+    if (raysatim) { CHECK_INPUT(*raysatim); }
+    if (raytermim) { CHECK_INPUT(*raytermim); }
+    int N = rayposim.size(0);
+    int H = rayposim.size(1);
+    int W = rayposim.size(2);
+    int K = primpos.size(1);
+    // Template grid extents: dims 2..4 when chlast is set, dims 3..5 otherwise.
+    int TD, TH, TW;
+    if (chlast) {
+        TD = tplate.size(2); TH = tplate.size(3); TW = tplate.size(4);
+    } else {
+        TD = tplate.size(3); TH = tplate.size(4); TW = tplate.size(5);
+    }
+    // Warp grid extents follow the same layout rule and stay 0 without a warp.
+    int WD = 0, WH = 0, WW = 0;
+    if (warp) {
+        if (chlast) {
+            WD = warp->size(2); WH = warp->size(3); WW = warp->size(4);
+        } else {
+            WD = warp->size(3); WH = warp->size(4); WW = warp->size(5);
+        }
+    }
+    // Launch on PyTorch's current stream instead of the hard-coded stream 0,
+    // so the kernel is ordered with the ops that produce/consume these tensors.
+    cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();
+    raymarch_forward_cuda(N, H, W, K,
+            reinterpret_cast<float *>(rayposim.data_ptr()),
+            reinterpret_cast<float *>(raydirim.data_ptr()),
+            stepsize,
+            reinterpret_cast<float *>(tminmaxim.data_ptr()),
+            sortedobjid ? reinterpret_cast<int *>(sortedobjid->data_ptr()) : nullptr,
+            nodechildren ? reinterpret_cast<int *>(nodechildren->data_ptr()) : nullptr,
+            nodeaabb ? reinterpret_cast<float *>(nodeaabb->data_ptr()) : nullptr,
+            // prim transforms
+            reinterpret_cast<float *>(primpos.data_ptr()),
+            primrot ? reinterpret_cast<float *>(primrot->data_ptr()) : nullptr,
+            primscale ? reinterpret_cast<float *>(primscale->data_ptr()) : nullptr,
+            // prim sampler
+            TD, TH, TW,
+            reinterpret_cast<float *>(tplate.data_ptr()),
+            WD, WH, WW,
+            warp ? reinterpret_cast<float *>(warp->data_ptr()) : nullptr,
+            // prim accumulator
+            reinterpret_cast<float *>(rayrgbaim.data_ptr()),
+            raysatim ? reinterpret_cast<float *>(raysatim->data_ptr()) : nullptr,
+            raytermim ? reinterpret_cast<int *>(raytermim->data_ptr()) : nullptr,
+            // options
+            algorithm, sortboxes, maxhitboxes, synchitboxes, chlast, fadescale, fadeexp, accum, termthresh,
+            griddim, blocksizex, blocksizey,
+            stream);
+    return {};
+}
+// Python-facing wrapper around raymarch_backward_cuda.
+// Validates that every tensor that is present is a contiguous CUDA tensor,
+// derives the size arguments from the tensor shapes and forwards raw
+// pointers (nullptr for absent optionals). Storage is reinterpreted as
+// float* / int* without a dtype check. The grad_* tensors are passed as
+// buffers for the launcher; the returned vector is always empty.
+std::vector<torch::Tensor> raymarch_backward(
+        torch::Tensor rayposim,
+        torch::Tensor raydirim,
+        float stepsize,
+        torch::Tensor tminmaxim,
+        torch::optional<torch::Tensor> sortedobjid,
+        torch::optional<torch::Tensor> nodechildren,
+        torch::optional<torch::Tensor> nodeaabb,
+        torch::Tensor primpos,
+        torch::Tensor grad_primpos,
+        torch::optional<torch::Tensor> primrot,
+        torch::optional<torch::Tensor> grad_primrot,
+        torch::optional<torch::Tensor> primscale,
+        torch::optional<torch::Tensor> grad_primscale,
+        torch::Tensor tplate,
+        torch::Tensor grad_tplate,
+        torch::optional<torch::Tensor> warp,
+        torch::optional<torch::Tensor> grad_warp,
+        torch::Tensor rayrgbaim,
+        torch::Tensor grad_rayrgba,
+        torch::optional<torch::Tensor> raysatim,
+        torch::optional<torch::Tensor> raytermim,
+        int algorithm=0,
+        bool sortboxes=true,
+        int maxhitboxes=512,
+        bool synchitboxes=false,
+        bool chlast=false,
+        float fadescale=8.f,
+        float fadeexp=8.f,
+        int accum=0,
+        float termthresh=0.f,
+        int griddim=3,
+        int blocksizex=8,
+        int blocksizey=16) {
+    CHECK_INPUT(rayposim);
+    CHECK_INPUT(raydirim);
+    CHECK_INPUT(tminmaxim);
+    if (sortedobjid) { CHECK_INPUT(*sortedobjid); }
+    if (nodechildren) { CHECK_INPUT(*nodechildren); }
+    if (nodeaabb) { CHECK_INPUT(*nodeaabb); }
+    CHECK_INPUT(tplate);
+    if (warp) { CHECK_INPUT(*warp); }
+    CHECK_INPUT(primpos);
+    if (primrot) { CHECK_INPUT(*primrot); }
+    if (primscale) { CHECK_INPUT(*primscale); }
+    CHECK_INPUT(rayrgbaim);
+    if (raysatim) { CHECK_INPUT(*raysatim); }
+    if (raytermim) { CHECK_INPUT(*raytermim); }
+    CHECK_INPUT(grad_rayrgba);
+    CHECK_INPUT(grad_tplate);
+    if (grad_warp) { CHECK_INPUT(*grad_warp); }
+    CHECK_INPUT(grad_primpos);
+    if (grad_primrot) { CHECK_INPUT(*grad_primrot); }
+    if (grad_primscale) { CHECK_INPUT(*grad_primscale); }
+    int N = rayposim.size(0);
+    int H = rayposim.size(1);
+    int W = rayposim.size(2);
+    int K = primpos.size(1);
+    // Template grid extents: dims 2..4 when chlast is set, dims 3..5 otherwise.
+    int TD, TH, TW;
+    if (chlast) {
+        TD = tplate.size(2); TH = tplate.size(3); TW = tplate.size(4);
+    } else {
+        TD = tplate.size(3); TH = tplate.size(4); TW = tplate.size(5);
+    }
+    // Warp grid extents follow the same layout rule and stay 0 without a warp.
+    int WD = 0, WH = 0, WW = 0;
+    if (warp) {
+        if (chlast) {
+            WD = warp->size(2); WH = warp->size(3); WW = warp->size(4);
+        } else {
+            WD = warp->size(3); WH = warp->size(4); WW = warp->size(5);
+        }
+    }
+    // Launch on PyTorch's current stream instead of the hard-coded stream 0,
+    // so the kernel is ordered with the ops that produce/consume these tensors.
+    cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();
+    raymarch_backward_cuda(N, H, W, K,
+            reinterpret_cast<float *>(rayposim.data_ptr()),
+            reinterpret_cast<float *>(raydirim.data_ptr()),
+            stepsize,
+            reinterpret_cast<float *>(tminmaxim.data_ptr()),
+            sortedobjid ? reinterpret_cast<int *>(sortedobjid->data_ptr()) : nullptr,
+            nodechildren ? reinterpret_cast<int *>(nodechildren->data_ptr()) : nullptr,
+            nodeaabb ? reinterpret_cast<float *>(nodeaabb->data_ptr()) : nullptr,
+            // prim transforms and their gradient buffers
+            reinterpret_cast<float *>(primpos.data_ptr()),
+            reinterpret_cast<float *>(grad_primpos.data_ptr()),
+            primrot ? reinterpret_cast<float *>(primrot->data_ptr()) : nullptr,
+            grad_primrot ? reinterpret_cast<float *>(grad_primrot->data_ptr()) : nullptr,
+            primscale ? reinterpret_cast<float *>(primscale->data_ptr()) : nullptr,
+            grad_primscale ? reinterpret_cast<float *>(grad_primscale->data_ptr()) : nullptr,
+            // prim sampler and its gradient buffers
+            TD, TH, TW,
+            reinterpret_cast<float *>(tplate.data_ptr()),
+            reinterpret_cast<float *>(grad_tplate.data_ptr()),
+            WD, WH, WW,
+            warp ? reinterpret_cast<float *>(warp->data_ptr()) : nullptr,
+            grad_warp ? reinterpret_cast<float *>(grad_warp->data_ptr()) : nullptr,
+            // prim accumulator
+            reinterpret_cast<float *>(rayrgbaim.data_ptr()),
+            reinterpret_cast<float *>(grad_rayrgba.data_ptr()),
+            raysatim ? reinterpret_cast<float *>(raysatim->data_ptr()) : nullptr,
+            raytermim ? reinterpret_cast<int *>(raytermim->data_ptr()) : nullptr,
+            // options
+            algorithm, sortboxes, maxhitboxes, synchitboxes, chlast, fadescale, fadeexp, accum, termthresh,
+            griddim, blocksizex, blocksizey,
+            stream);
+    return {};
+}
+// Python bindings for the five wrappers defined above.
+// No py::arg annotations are attached, so Python callers pass every
+// argument positionally.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // BVH construction
+    m.def(
+        "compute_morton",
+        &compute_morton,
+        "compute morton codes (CUDA)");
+    m.def(
+        "build_tree",
+        &build_tree,
+        "build BVH tree (CUDA)");
+    m.def(
+        "compute_aabb",
+        &compute_aabb,
+        "compute AABB sizes (CUDA)");
+    // raymarching
+    m.def(
+        "raymarch_forward",
+        &raymarch_forward,
+        "raymarch forward (CUDA)");
+    m.def(
+        "raymarch_backward",
+        &raymarch_backward,
+        "raymarch backward (CUDA)");
+}

dva/mvp/extensions/mvpraymarch/mvpraymarch.py ADDED Viewed

	@@ -0,0 +1,559 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+import torch.nn.functional as F
+try:
+    from . import mvpraymarchlib
+except:
+    import mvpraymarchlib
+def build_accel(primtransfin, algo, fixedorder=False):
+    """build bvh structure given primitive centers and sizes
+    Parameters:
+    ----------
+    primtransfin : tuple[tensor, tensor, tensor]
+        primitive transform tensors
+    algo : int
+        raymarching algorithm
+    fixedorder : optional[str]
+        True means the bvh builder will not reorder primitives and will
+        use a trivial tree structure. Likely to be slow for arbitrary
+        configurations of primitives.
+    """
+    primpos, primrot, primscale = primtransfin
+    N = primpos.size(0)
+    K = primpos.size(1)
+    dev = primpos.device
+    # compute and sort morton codes
+    if fixedorder:
+        sortedobjid = (torch.arange(N*K, dtype=torch.int32, device=dev) % K).view(N, K)
+    else:
+        cmax = primpos.max(dim=1, keepdim=True)[0]
+        cmin = primpos.min(dim=1, keepdim=True)[0]
+        centers_norm = (primpos - cmin) / (cmax - cmin).clamp(min=1e-8)
+        mortoncode = torch.empty((N, K), dtype=torch.int32, device=dev)
+        mvpraymarchlib.compute_morton(centers_norm, mortoncode, algo)
+        sortedcode, sortedobjid_long = torch.sort(mortoncode, dim=-1)
+        sortedobjid = sortedobjid_long.int()
+    if fixedorder:
+        nodechildren = torch.cat([
+            torch.arange(1, (K - 1) * 2 + 1, dtype=torch.int32, device=dev),
+            torch.div(torch.arange(-2, -(K * 2 + 1) - 1, -1, dtype=torch.int32, device=dev), 2, rounding_mode="floor")],
+        dim=0).view(1, K + K - 1, 2).repeat(N, 1, 1)
+        nodeparent = (
+            torch.div(torch.arange(-1, K * 2 - 2, dtype=torch.int32, device=dev), 2, rounding_mode="floor")
+               .view(1, -1).repeat(N, 1))
+    else:
+        nodechildren = torch.empty((N, K + K - 1, 2), dtype=torch.int32, device=dev)
+        nodeparent = torch.full((N, K + K - 1), -1, dtype=torch.int32, device=dev)
+        mvpraymarchlib.build_tree(sortedcode, nodechildren, nodeparent)
+    nodeaabb = torch.empty((N, K + K - 1, 2, 3), dtype=torch.float32, device=dev)
+    mvpraymarchlib.compute_aabb(*primtransfin, sortedobjid, nodechildren, nodeparent, nodeaabb, algo)
+    return sortedobjid, nodechildren, nodeaabb
+class MVPRaymarch(Function):
+    """Custom Function for raymarching Mixture of Volumetric Primitives."""
+    @staticmethod
+    def forward(self, raypos, raydir, stepsize, tminmax,
+            primpos, primrot, primscale,
+            template, warp,
+            rayterm, gradmode, options):
+        algo = options["algo"]
+        usebvh = options["usebvh"]
+        sortprims = options["sortprims"]
+        randomorder = options["randomorder"]
+        maxhitboxes = options["maxhitboxes"]
+        synchitboxes = options["synchitboxes"]
+        chlast = options["chlast"]
+        fadescale = options["fadescale"]
+        fadeexp = options["fadeexp"]
+        accum = options["accum"]
+        termthresh = options["termthresh"]
+        griddim = options["griddim"]
+        if isinstance(options["blocksize"], tuple):
+            blocksizex, blocksizey = options["blocksize"]
+        else:
+            blocksizex = options["blocksize"]
+            blocksizey = 1
+        assert raypos.is_contiguous() and raypos.size(3) == 3
+        assert raydir.is_contiguous() and raydir.size(3) == 3
+        assert tminmax.is_contiguous() and tminmax.size(3) == 2
+        assert primpos is None or primpos.is_contiguous() and primpos.size(2) == 3
+        assert primrot is None or primrot.is_contiguous() and primrot.size(2) == 3
+        assert primscale is None or primscale.is_contiguous() and primscale.size(2) == 3
+        if chlast:
+            assert template.is_contiguous() and len(template.size()) == 6 and template.size(-1) == 4
+            assert warp is None or (warp.is_contiguous() and warp.size(-1) == 3)
+        else:
+            assert template.is_contiguous() and len(template.size()) == 6 and template.size(2) == 4
+            assert warp is None or (warp.is_contiguous() and warp.size(2) == 3)
+        primtransfin = (primpos, primrot, primscale)
+        # Build bvh
+        if usebvh is not False:
+            # compute radius of primitives
+            sortedobjid, nodechildren, nodeaabb = build_accel(primtransfin,
+                    algo, fixedorder=usebvh=="fixedorder")
+            assert sortedobjid.is_contiguous()
+            assert nodechildren.is_contiguous()
+            assert nodeaabb.is_contiguous()
+            if randomorder:
+                sortedobjid = sortedobjid[torch.randperm(len(sortedobjid))]
+        else:
+            _, sortedobjid, nodechildren, nodeaabb = None, None, None, None
+        # march through boxes
+        N, H, W = raypos.size(0), raypos.size(1), raypos.size(2)
+        rayrgba = torch.empty((N, H, W, 4), device=raypos.device)
+        if gradmode:
+            raysat = torch.full((N, H, W, 3), -1, dtype=torch.float32, device=raypos.device)
+            rayterm = None
+        else:
+            raysat = None
+            rayterm = None
+        mvpraymarchlib.raymarch_forward(
+                raypos, raydir, stepsize, tminmax,
+                sortedobjid, nodechildren, nodeaabb,
+                *primtransfin,
+                template, warp,
+                rayrgba, raysat, rayterm,
+                algo, sortprims, maxhitboxes, synchitboxes, chlast,
+                fadescale, fadeexp,
+                accum, termthresh,
+                griddim, blocksizex, blocksizey)
+        self.save_for_backward(
+                raypos, raydir, tminmax,
+                sortedobjid, nodechildren, nodeaabb,
+                primpos, primrot, primscale,
+                template, warp,
+                rayrgba, raysat, rayterm)
+        self.options = options
+        self.stepsize = stepsize
+        return rayrgba
+    @staticmethod
+    def backward(self, grad_rayrgba):
+        (raypos, raydir, tminmax,
+            sortedobjid, nodechildren, nodeaabb,
+            primpos, primrot, primscale,
+            template, warp,
+            rayrgba, raysat, rayterm) = self.saved_tensors
+        algo = self.options["algo"]
+        usebvh = self.options["usebvh"]
+        sortprims = self.options["sortprims"]
+        maxhitboxes = self.options["maxhitboxes"]
+        synchitboxes = self.options["synchitboxes"]
+        chlast = self.options["chlast"]
+        fadescale = self.options["fadescale"]
+        fadeexp = self.options["fadeexp"]
+        accum = self.options["accum"]
+        termthresh = self.options["termthresh"]
+        griddim = self.options["griddim"]
+        if isinstance(self.options["bwdblocksize"], tuple):
+            blocksizex, blocksizey = self.options["bwdblocksize"]
+        else:
+            blocksizex = self.options["bwdblocksize"]
+            blocksizey = 1
+        stepsize = self.stepsize
+        grad_primpos = torch.zeros_like(primpos)
+        grad_primrot = torch.zeros_like(primrot)
+        grad_primscale = torch.zeros_like(primscale)
+        primtransfin = (primpos, grad_primpos, primrot, grad_primrot, primscale, grad_primscale)
+        grad_template = torch.zeros_like(template)
+        grad_warp = torch.zeros_like(warp) if warp is not None else None
+        mvpraymarchlib.raymarch_backward(raypos, raydir, stepsize, tminmax,
+                sortedobjid, nodechildren, nodeaabb,
+                *primtransfin,
+                template, grad_template, warp, grad_warp,
+                rayrgba, grad_rayrgba.contiguous(), raysat, rayterm,
+                algo, sortprims, maxhitboxes, synchitboxes, chlast,
+                fadescale, fadeexp,
+                accum, termthresh,
+                griddim, blocksizex, blocksizey)
+        return (None, None, None, None,
+                grad_primpos, grad_primrot, grad_primscale,
+                grad_template, grad_warp,
+                None, None, None)
+def mvpraymarch(raypos, raydir, stepsize, tminmax,
+            primtransf,
+            template, warp,
+            rayterm=None,
+            algo=0, usebvh="fixedorder",
+            sortprims=False, randomorder=False,
+            maxhitboxes=512, synchitboxes=True,
+            chlast=True, fadescale=8., fadeexp=8.,
+            accum=0, termthresh=0.,
+            griddim=3, blocksize=(8, 16), bwdblocksize=(8, 16)):
+    """Main entry point for raymarching MVP.
+    Parameters:
+    ----------
+    raypos: N x H x W x 3 tensor of ray origins
+    raydir: N x H x W x 3 tensor of ray directions
+    stepsize: raymarching step size
+    tminmax: N x H x W x 2 tensor of raymarching min/max bounds
+    template: N x K x 4 x TD x TH x TW tensor of K RGBA primitives
+    warp: N x K x 3 x TD x TH x TW tensor of K warp fields (optional)
+    primpos: N x K x 3 tensor of primitive centers
+    primrot: N x K x 3 x 3 tensor of primitive orientations
+    primscale: N x K x 3 tensor of primitive inverse dimension lengths
+    algo: algorithm for raymarching (valid values: 0, 1). algo=0 is the fastest.
+        Currently algo=0 has a limit of 512 primitives per ray, so problems can
+        occur if there are many more boxes. all sortprims=True options have
+        this limitation, but you can use (algo=1, sortprims=False,
+        usebvh="fixedorder") which works correctly and has no primitive number
+        limitation (but is slightly slower).
+    usebvh: True to use bvh, "fixedorder" for a simple BVH, False for no bvh
+    sortprims: True to sort overlapping primitives at a sample point. Must
+        be True for gradients to match the PyTorch gradients. Seems unstable
+        if False but also not a big performance bottleneck.
+    chlast: whether template is provided as channels last or not. True tends
+        to be faster.
+    fadescale: Opacity is faded at the borders of the primitives by the equation
+        exp(-fadescale * x ** fadeexp) where x is the normalized coordinates of
+        the primitive.
+    fadeexp: Opacity is faded at the borders of the primitives by the equation
+        exp(-fadescale * x ** fadeexp) where x is the normalized coordinates of
+        the primitive.
+    griddim: CUDA grid dimensionality.
+    blocksize: blocksize of CUDA kernels. Should be 2-element tuple if
+        griddim>1, or integer if griddim==1."""
+    if isinstance(primtransf, tuple):
+        primpos, primrot, primscale = primtransf
+    else:
+        primpos, primrot, primscale = (
+                primtransf[:, :, 0, :].contiguous(),
+                primtransf[:, :, 1:4, :].contiguous(),
+                primtransf[:, :, 4, :].contiguous())
+    primtransfin = (primpos, primrot, primscale)
+    out = MVPRaymarch.apply(raypos, raydir, stepsize, tminmax,
+            *primtransfin,
+            template, warp,
+            rayterm, torch.is_grad_enabled(),
+            {"algo": algo, "usebvh": usebvh, "sortprims": sortprims, "randomorder": randomorder,
+                "maxhitboxes": maxhitboxes, "synchitboxes": synchitboxes,
+                "chlast": chlast, "fadescale": fadescale, "fadeexp": fadeexp,
+                "accum": accum, "termthresh": termthresh,
+                "griddim": griddim, "blocksize": blocksize, "bwdblocksize": bwdblocksize})
+    return out
+class Rodrigues(nn.Module):
+    def __init__(self):
+        super(Rodrigues, self).__init__()
+    def forward(self, rvec):
+        theta = torch.sqrt(1e-5 + torch.sum(rvec ** 2, dim=1))
+        rvec = rvec / theta[:, None]
+        costh = torch.cos(theta)
+        sinth = torch.sin(theta)
+        return torch.stack((
+            rvec[:, 0] ** 2 + (1. - rvec[:, 0] ** 2) * costh,
+            rvec[:, 0] * rvec[:, 1] * (1. - costh) - rvec[:, 2] * sinth,
+            rvec[:, 0] * rvec[:, 2] * (1. - costh) + rvec[:, 1] * sinth,
+            rvec[:, 0] * rvec[:, 1] * (1. - costh) + rvec[:, 2] * sinth,
+            rvec[:, 1] ** 2 + (1. - rvec[:, 1] ** 2) * costh,
+            rvec[:, 1] * rvec[:, 2] * (1. - costh) - rvec[:, 0] * sinth,
+            rvec[:, 0] * rvec[:, 2] * (1. - costh) - rvec[:, 1] * sinth,
+            rvec[:, 1] * rvec[:, 2] * (1. - costh) + rvec[:, 0] * sinth,
+            rvec[:, 2] ** 2 + (1. - rvec[:, 2] ** 2) * costh), dim=1).view(-1, 3, 3)
+def gradcheck(usebvh=True, sortprims=True, maxhitboxes=512, synchitboxes=False,
+        dowarp=False, chlast=False, fadescale=8., fadeexp=8.,
+        accum=0, termthresh=0., algo=0, griddim=2, blocksize=(8, 16), bwdblocksize=(8, 16)):
+    N = 2
+    H = 65
+    W = 65
+    k3 = 4
+    K = k3*k3*k3
+    M = 32
+    print("=================================================================")
+    print("usebvh={}, sortprims={}, maxhb={}, synchb={}, dowarp={}, chlast={}, "
+        "fadescale={}, fadeexp={}, accum={}, termthresh={}, algo={}, griddim={}, "
+        "blocksize={}, bwdblocksize={}".format(
+        usebvh, sortprims, maxhitboxes, synchitboxes, dowarp, chlast,
+        fadescale, fadeexp, accum, termthresh, algo, griddim, blocksize,
+        bwdblocksize))
+    # generate random inputs
+    torch.manual_seed(1112)
+    coherent_rays = True
+    if not coherent_rays:
+        _raypos = torch.randn(N, H, W, 3).to("cuda")
+        _raydir = torch.randn(N, H, W, 3).to("cuda")
+        _raydir /= torch.sqrt(torch.sum(_raydir ** 2, dim=-1, keepdim=True))
+    else:
+        focal = torch.tensor([[W*4.0, W*4.0] for n in range(N)])
+        princpt = torch.tensor([[W*0.5, H*0.5] for n in range(N)])
+        pixely, pixelx = torch.meshgrid(torch.arange(H).float(), torch.arange(W).float())
+        pixelcoords = torch.stack([pixelx, pixely], dim=-1)[None, :, :, :].repeat(N, 1, 1, 1)
+        raydir = (pixelcoords - princpt[:, None, None, :]) / focal[:, None, None, :]
+        raydir = torch.cat([raydir, torch.ones_like(raydir[:, :, :, 0:1])], dim=-1)
+        raydir = raydir / torch.sqrt(torch.sum(raydir ** 2, dim=-1, keepdim=True))
+        _raypos = torch.tensor([-0.0, 0.0, -4.])[None, None, None, :].repeat(N, H, W, 1).to("cuda")
+        _raydir = raydir.to("cuda")
+        _raydir /= torch.sqrt(torch.sum(_raydir ** 2, dim=-1, keepdim=True))
+    max_len = 6.0
+    _stepsize = max_len / 15.386928
+    _tminmax = max_len*torch.arange(2, dtype=torch.float32)[None, None, None, :].repeat(N, H, W, 1).to("cuda") + \
+            torch.rand(N, H, W, 2, device="cuda") * 1.
+    _template = torch.randn(N, K, 4, M, M, M, requires_grad=True)
+    _template.data[:, :, -1, :, :, :] -= 3.5
+    _template = _template.contiguous().detach().clone()
+    _template.requires_grad = True
+    gridxyz = torch.stack(torch.meshgrid(
+        torch.linspace(-1., 1., M//2),
+        torch.linspace(-1., 1., M//2),
+        torch.linspace(-1., 1., M//2))[::-1], dim=0).contiguous()
+    _warp = (torch.randn(N, K, 3, M//2, M//2, M//2) * 0.01 + gridxyz[None, None, :, :, :, :]).contiguous().detach().clone()
+    _warp.requires_grad = True
+    _primpos = torch.randn(N, K, 3, requires_grad=True)
+    _primpos = torch.randn(N, K, 3, requires_grad=True)
+    coherent_centers = True
+    if coherent_centers:
+        ns = k3
+        #assert ns*ns*ns==K
+        grid3d = torch.stack(torch.meshgrid(
+            torch.linspace(-1., 1., ns),
+            torch.linspace(-1., 1., ns),
+            torch.linspace(-1., 1., K//(ns*ns)))[::-1], dim=0)[None]
+        _primpos = ((
+            grid3d.permute((0, 2, 3, 4, 1)).reshape(1, K, 3).expand(N, -1, -1) +
+            0.1 * torch.randn(N, K, 3, requires_grad=True)
+            )).contiguous().detach().clone()
+        _primpos.requires_grad = True
+    scale_ws = 1.
+    _primrot = torch.randn(N, K, 3)
+    rodrigues = Rodrigues()
+    _primrot = rodrigues(_primrot.view(-1, 3)).view(N, K, 3, 3).contiguous().detach().clone()
+    _primrot.requires_grad = True
+    _primscale = torch.randn(N, K, 3, requires_grad=True)
+    _primscale.data *= 0.0
+    if dowarp:
+        params = [_template, _warp, _primscale, _primrot, _primpos]
+        paramnames = ["template", "warp", "primscale", "primrot", "primpos"]
+    else:
+        params = [_template, _primscale, _primrot, _primpos]
+        paramnames = ["template", "primscale", "primrot", "primpos"]
+    termthreshorig = termthresh
+    ########################### run pytorch version ###########################
+    raypos = _raypos
+    raydir = _raydir
+    stepsize = _stepsize
+    tminmax = _tminmax
+    #template = F.softplus(_template.to("cuda") * 1.5)
+    template = F.softplus(_template.to("cuda") * 1.5) if algo != 2 else _template.to("cuda") * 1.5
+    warp = _warp.to("cuda")
+    primpos = _primpos.to("cuda") * 0.3
+    primrot = _primrot.to("cuda")
+    primscale = scale_ws * torch.exp(0.1 * _primscale.to("cuda"))
+    # python raymarching implementation
+    rayrgba = torch.zeros((N, H, W, 4)).to("cuda")
+    raypos = raypos + raydir * tminmax[:, :, :, 0, None]
+    t = tminmax[:, :, :, 0]
+    step = 0
+    t0 = t.detach().clone()
+    raypos0 = raypos.detach().clone()
+    torch.cuda.synchronize()
+    time0 = time.time()
+    while (t < tminmax[:, :, :, 1]).any():
+        valid2 = torch.ones_like(rayrgba[:, :, :, 3:4])
+        for k in range(K):
+            y0 = torch.bmm(
+                    (raypos - primpos[:, k, None, None, :]).view(raypos.size(0), -1, raypos.size(3)),
+                    primrot[:, k, :, :]).view_as(raypos) * primscale[:, k, None, None, :]
+            fade = torch.exp(-fadescale * torch.sum(torch.abs(y0) ** fadeexp, dim=-1, keepdim=True))
+            if dowarp:
+                y1 = F.grid_sample(
+                        warp[:, k, :, :, :, :],
+                        y0[:, None, :, :, :], align_corners=True)[:, :, 0, :, :].permute(0, 2, 3, 1)
+            else:
+                y1 = y0
+            sample = F.grid_sample(
+                    template[:, k, :, :, :, :],
+                    y1[:, None, :, :, :], align_corners=True)[:, :, 0, :, :].permute(0, 2, 3, 1)
+            valid1 = (
+                torch.prod(y0[:, :, :, :] >= -1., dim=-1, keepdim=True) *
+                torch.prod(y0[:, :, :, :] <= 1., dim=-1, keepdim=True))
+            valid = ((t >= tminmax[:, :, :, 0]) & (t < tminmax[:, :, :, 1])).float()[:, :, :, None]
+            alpha0 = sample[:, :, :, 3:4]
+            rgb = sample[:, :, :, 0:3] * valid * valid1
+            alpha = alpha0 * fade * stepsize * valid * valid1
+            if accum == 0:
+                newalpha = rayrgba[:, :, :, 3:4] + alpha
+                contrib = (newalpha.clamp(max=1.0) - rayrgba[:, :, :, 3:4]) * valid * valid1
+                rayrgba = rayrgba + contrib * torch.cat([rgb, torch.ones_like(alpha)], dim=-1)
+            else:
+                raise
+        step += 1
+        t = t0 + stepsize * step
+        raypos = raypos0 + raydir * stepsize * step
+    print(rayrgba[..., -1].min().item(), rayrgba[..., -1].max().item())
+    sample0 = rayrgba
+    torch.cuda.synchronize()
+    time1 = time.time()
+    sample0.backward(torch.ones_like(sample0))
+    torch.cuda.synchronize()
+    time2 = time.time()
+    print("{:<10} {:>10} {:>10} {:>10}".format("", "fwd", "bwd", "total"))
+    print("{:<10} {:10.5} {:10.5} {:10.5}".format("pytime", time1 - time0, time2 - time1, time2 - time0))
+    grads0 = [p.grad.detach().clone() for p in params]
+    for p in params:
+        p.grad.detach_()
+        p.grad.zero_()
+    ############################## run cuda version ###########################
+    raypos = _raypos
+    raydir = _raydir
+    stepsize = _stepsize
+    tminmax = _tminmax
+    template = F.softplus(_template.to("cuda") * 1.5) if algo != 2 else _template.to("cuda") * 1.5
+    warp = _warp.to("cuda")
+    if chlast:
+        template = template.permute(0, 1, 3, 4, 5, 2).contiguous()
+        warp = warp.permute(0, 1, 3, 4, 5, 2).contiguous()
+    primpos = _primpos.to("cuda") * 0.3
+    primrot = _primrot.to("cuda")
+    primscale = scale_ws * torch.exp(0.1 * _primscale.to("cuda"))
+    niter = 1
+    tf, tb = 0., 0.
+    for i in range(niter):
+        for p in params:
+            try:
+                p.grad.detach_()
+                p.grad.zero_()
+            except:
+                pass
+        t0 = time.time()
+        torch.cuda.synchronize()
+        sample1 = mvpraymarch(raypos, raydir, stepsize, tminmax,
+                (primpos, primrot, primscale),
+                template, warp if dowarp else None,
+                algo=algo, usebvh=usebvh, sortprims=sortprims,
+                maxhitboxes=maxhitboxes, synchitboxes=synchitboxes,
+                chlast=chlast, fadescale=fadescale, fadeexp=fadeexp,
+                accum=accum, termthresh=termthreshorig,
+                griddim=griddim, blocksize=blocksize, bwdblocksize=bwdblocksize)
+        t1 = time.time()
+        torch.cuda.synchronize()
+        sample1.backward(torch.ones_like(sample1), retain_graph=True)
+        torch.cuda.synchronize()
+        t2 = time.time()
+        tf += t1 - t0
+        tb += t2 - t1
+    print("{:<10} {:10.5} {:10.5} {:10.5}".format("time", tf / niter, tb / niter, (tf + tb) / niter))
+    grads1 = [p.grad.detach().clone() for p in params]
+    ############# compare results #############
+    print("-----------------------------------------------------------------")
+    print("{:>10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}".format("", "maxabsdiff", "dp", "||py||", "||cuda||", "index", "py", "cuda"))
+    ind = torch.argmax(torch.abs(sample0 - sample1))
+    print("{:<10} {:>10.5} {:>10.5} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
+        "fwd",
+        torch.max(torch.abs(sample0 - sample1)).item(),
+        (torch.sum(sample0 * sample1) / torch.sqrt(torch.sum(sample0 * sample0) * torch.sum(sample1 * sample1))).item(),
+        torch.sqrt(torch.sum(sample0 * sample0)).item(),
+        torch.sqrt(torch.sum(sample1 * sample1)).item(),
+        ind.item(),
+        sample0.view(-1)[ind].item(),
+        sample1.view(-1)[ind].item()))
+    for p, g0, g1 in zip(paramnames, grads0, grads1):
+        ind = torch.argmax(torch.abs(g0 - g1))
+        print("{:<10} {:>10.5} {:>10.5} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
+                p,
+                torch.max(torch.abs(g0 - g1)).item(),
+                (torch.sum(g0 * g1) / torch.sqrt(torch.sum(g0 * g0) * torch.sum(g1 * g1))).item(),
+                torch.sqrt(torch.sum(g0 * g0)).item(),
+                torch.sqrt(torch.sum(g1 * g1)).item(),
+                ind.item(),
+                g0.view(-1)[ind].item(),
+                g1.view(-1)[ind].item()))
+if __name__ == "__main__":
+    gradcheck(usebvh="fixedorder", sortprims=False, maxhitboxes=512, synchitboxes=True,
+            dowarp=False, chlast=True, fadescale=6.5, fadeexp=7.5, accum=0, algo=0, griddim=3)
+    gradcheck(usebvh="fixedorder", sortprims=False, maxhitboxes=512, synchitboxes=True,
+            dowarp=True, chlast=True, fadescale=6.5, fadeexp=7.5, accum=0, algo=1, griddim=3)

dva/mvp/extensions/mvpraymarch/mvpraymarch_kernel.cu ADDED Viewed

	@@ -0,0 +1,208 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <tuple>
+#include <vector>
+#include "helper_math.h"
+#include "cudadispatch.h"
+#include "utils.h"
+#include "primtransf.h"
+#include "primsampler.h"
+#include "primaccum.h"
+#include "mvpraymarch_subset_kernel.h"
+// Shared-pointer aliases for the base types of the per-launch data structs;
+// the launchers below create the concrete ::Data objects.
+typedef std::shared_ptr<PrimTransfDataBase> PrimTransfDataBase_ptr;
+typedef std::shared_ptr<PrimSamplerDataBase> PrimSamplerDataBase_ptr;
+typedef std::shared_ptr<PrimAccumDataBase> PrimAccumDataBase_ptr;
+// Callable stored in the dispatch maps: grid size, block size and stream,
+// followed by N, H, W, K, the ray/BVH pointers and the three data structs.
+typedef std::function<void(dim3, dim3, cudaStream_t, int, int, int, int,
+        float3*, float3*, float, float2*, int*, int2*, float3*,
+        PrimTransfDataBase_ptr, PrimSamplerDataBase_ptr,
+        PrimAccumDataBase_ptr)> mapfn_t;
+// Ray-subset type used by every kernel instantiation in this file.
+typedef RaySubsetFixedBVH<false, 512, true, PrimTransfSRT> raysubset_t;
+// Host-side launcher for the forward raymarching kernel.
+//
+// Computes the launch grid, packs the primitive transform / sampler /
+// accumulator arguments into their Data structs and invokes the kernel
+// instantiation registered for `algorithm` on `stream`.
+//
+// NOTE: sortboxes, maxhitboxes, synchitboxes, chlast, accum, griddim and
+// raytermim are accepted but not referenced in this function body.
+void raymarch_forward_cuda(
+        int N, int H, int W, int K,
+        float * rayposim,
+        float * raydirim,
+        float stepsize,
+        float * tminmaxim,
+        int * sortedobjid,
+        int * nodechildren,
+        float * nodeaabb,
+        float * primpos,
+        float * primrot,
+        float * primscale,
+        int TD, int TH, int TW,
+        float * tplate,
+        int WD, int WH, int WW,
+        float * warp,
+        float * rayrgbaim,
+        float * raysatim,
+        int * raytermim,
+        int algorithm,
+        bool sortboxes,
+        int maxhitboxes,
+        bool synchitboxes,
+        bool chlast,
+        float fadescale,
+        float fadeexp,
+        int accum,
+        float termthresh,
+        int griddim, int blocksizex, int blocksizey,
+        cudaStream_t stream) {
+    dim3 blocksize(blocksizex, blocksizey);
+    dim3 gridsize;
+    // ceil-divide the image by the block size; one grid z-slice per batch item
+    gridsize = dim3(
+            (W + blocksize.x - 1) / blocksize.x,
+            (H + blocksize.y - 1) / blocksize.y,
+            N);
+    // forward pass: every gradient pointer slot is nullptr
+    std::shared_ptr<PrimTransfDataBase> primtransf_data;
+    primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
+            PrimTransfDataBase{},
+            K, (float3*)primpos, nullptr,
+            K * 3, (float3*)primrot, nullptr,
+            K, (float3*)primscale, nullptr});
+    std::shared_ptr<PrimSamplerDataBase> primsampler_data;
+    // algorithm 1 uses the PrimSamplerTW<true> data and passes the warp field;
+    // every other value uses PrimSamplerTW<false> with the warp slots zeroed
+    if (algorithm == 1) {
+        primsampler_data = std::make_shared<PrimSamplerTW<true>::Data>(PrimSamplerTW<true>::Data{
+            PrimSamplerDataBase{},
+            fadescale, fadeexp,
+            K * TD * TH * TW * 4, TD, TH, TW, tplate, nullptr,
+            K * WD * WH * WW * 3, WD, WH, WW, warp, nullptr});
+    } else {
+        primsampler_data = std::make_shared<PrimSamplerTW<false>::Data>(PrimSamplerTW<false>::Data{
+            PrimSamplerDataBase{},
+            fadescale, fadeexp,
+            K * TD * TH * TW * 4, TD, TH, TW, tplate, nullptr,
+            0, 0, 0, 0, nullptr, nullptr});
+    }
+    // three values after termthresh: H * W, W, 1 (presumably the n/h/w strides
+    // of the output image -- confirm against PrimAccumAdditive::Data)
+    std::shared_ptr<PrimAccumDataBase> primaccum_data = std::make_shared<PrimAccumAdditive::Data>(PrimAccumAdditive::Data{
+            PrimAccumDataBase{},
+            termthresh, H * W, W, 1, (float4*)rayrgbaim, nullptr, (float3*)raysatim});
+    // kernel instantiations keyed by algorithm id
+    std::map<int, mapfn_t> dispatcher = {
+        {0, make_cudacall(raymarch_subset_forward_kernel<512, 4, raysubset_t, PrimTransfSRT, PrimSamplerTW<false>, PrimAccumAdditive>)},
+        {1, make_cudacall(raymarch_subset_forward_kernel<512, 4, raysubset_t, PrimTransfSRT, PrimSamplerTW<true>, PrimAccumAdditive>)}};
+    auto iter = dispatcher.find(algorithm);
+    // NOTE(review): an algorithm id other than 0 or 1 launches nothing and
+    // reports no error, so the output buffers are left untouched
+    if (iter != dispatcher.end()) {
+        (iter->second)(
+            gridsize, blocksize, stream,
+            N, H, W, K,
+            reinterpret_cast<float3 *>(rayposim),
+            reinterpret_cast<float3 *>(raydirim),
+            stepsize,
+            reinterpret_cast<float2 *>(tminmaxim),
+            reinterpret_cast<int    *>(sortedobjid),
+            reinterpret_cast<int2   *>(nodechildren),
+            reinterpret_cast<float3 *>(nodeaabb),
+            primtransf_data,
+            primsampler_data,
+            primaccum_data);
+    }
+}
+// Host-side launcher for the backward raymarching kernel.
+//
+// Same setup as the forward launcher, except that the Data structs also carry
+// the gradient buffers (grad_primpos, grad_primrot, grad_primscale,
+// grad_tplate, grad_warp) and the incoming gradient grad_rayrgba.
+//
+// NOTE: sortboxes, maxhitboxes, synchitboxes, chlast, accum, griddim and
+// raytermim are accepted but not referenced in this function body.
+void raymarch_backward_cuda(
+        int N, int H, int W, int K,
+        float * rayposim,
+        float * raydirim,
+        float stepsize,
+        float * tminmaxim,
+        int * sortedobjid,
+        int * nodechildren,
+        float * nodeaabb,
+        float * primpos,
+        float * grad_primpos,
+        float * primrot,
+        float * grad_primrot,
+        float * primscale,
+        float * grad_primscale,
+        int TD, int TH, int TW,
+        float * tplate,
+        float * grad_tplate,
+        int WD, int WH, int WW,
+        float * warp,
+        float * grad_warp,
+        float * rayrgbaim,
+        float * grad_rayrgba,
+        float * raysatim,
+        int * raytermim,
+        int algorithm, bool sortboxes, int maxhitboxes, bool synchitboxes,
+        bool chlast, float fadescale, float fadeexp, int accum, float termthresh,
+        int griddim, int blocksizex, int blocksizey,
+        cudaStream_t stream) {
+    dim3 blocksize(blocksizex, blocksizey);
+    dim3 gridsize;
+    // ceil-divide the image by the block size; one grid z-slice per batch item
+    gridsize = dim3(
+            (W + blocksize.x - 1) / blocksize.x,
+            (H + blocksize.y - 1) / blocksize.y,
+            N);
+    // each primitive transform input is paired with its gradient buffer
+    std::shared_ptr<PrimTransfDataBase> primtransf_data;
+    primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
+        PrimTransfDataBase{},
+        K, (float3*)primpos, (float3*)grad_primpos,
+        K * 3, (float3*)primrot, (float3*)grad_primrot,
+        K, (float3*)primscale, (float3*)grad_primscale});
+    std::shared_ptr<PrimSamplerDataBase> primsampler_data;
+    // algorithm 1 uses the PrimSamplerTW<true> data and passes warp/grad_warp;
+    // every other value uses PrimSamplerTW<false> with the warp slots zeroed
+    if (algorithm == 1) {
+        primsampler_data = std::make_shared<PrimSamplerTW<true>::Data>(PrimSamplerTW<true>::Data{
+            PrimSamplerDataBase{},
+            fadescale, fadeexp,
+            K * TD * TH * TW * 4, TD, TH, TW, tplate, grad_tplate,
+            K * WD * WH * WW * 3, WD, WH, WW, warp, grad_warp});
+    } else {
+        primsampler_data = std::make_shared<PrimSamplerTW<false>::Data>(PrimSamplerTW<false>::Data{
+            PrimSamplerDataBase{},
+            fadescale, fadeexp,
+            K * TD * TH * TW * 4, TD, TH, TW, tplate, grad_tplate,
+            0, 0, 0, 0, nullptr, nullptr});
+    }
+    // three values after termthresh: H * W, W, 1 (presumably the n/h/w strides
+    // of the image -- confirm against PrimAccumAdditive::Data); the forward
+    // output and its incoming gradient are both passed
+    std::shared_ptr<PrimAccumDataBase> primaccum_data = std::make_shared<PrimAccumAdditive::Data>(PrimAccumAdditive::Data{
+            PrimAccumDataBase{},
+            termthresh, H * W, W, 1, (float4*)rayrgbaim, (float4*)grad_rayrgba, (float3*)raysatim});
+    // kernel instantiations keyed by algorithm id; the leading `true` is the
+    // kernel's forwarddir template argument
+    std::map<int, mapfn_t> dispatcher = {
+        {0, make_cudacall(raymarch_subset_backward_kernel<true, 512, 4, raysubset_t, PrimTransfSRT, PrimSamplerTW<false>, PrimAccumAdditive>)},
+        {1, make_cudacall(raymarch_subset_backward_kernel<true, 512, 4, raysubset_t, PrimTransfSRT, PrimSamplerTW<true>, PrimAccumAdditive>)}};
+    auto iter = dispatcher.find(algorithm);
+    // NOTE(review): an algorithm id other than 0 or 1 launches nothing and
+    // reports no error, so the gradient buffers are left untouched
+    if (iter != dispatcher.end()) {
+        (iter->second)(
+            gridsize, blocksize, stream,
+            N, H, W, K,
+            reinterpret_cast<float3 *>(rayposim),
+            reinterpret_cast<float3 *>(raydirim),
+            stepsize,
+            reinterpret_cast<float2 *>(tminmaxim),
+            reinterpret_cast<int    *>(sortedobjid),
+            reinterpret_cast<int2   *>(nodechildren),
+            reinterpret_cast<float3 *>(nodeaabb),
+            primtransf_data,
+            primsampler_data,
+            primaccum_data);
+    }
+}

dva/mvp/extensions/mvpraymarch/mvpraymarch_subset_kernel.h ADDED Viewed

	@@ -0,0 +1,218 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+// Forward raymarching kernel: one thread per pixel (x -> w, y -> h), one grid
+// z-slice per batch item.
+//
+// Each thread collects the primitives its ray may hit, then steps along the
+// ray in increments of `stepsize`, sampling every collected primitive at each
+// step and accumulating the samples through PrimAccumT. The accumulated
+// result is written out at the end.
+//
+// Template parameters:
+//   maxhitboxes  capacity of the per-ray primitive list
+//   nwarps       > 0: the list lives in shared memory, one slice per warp
+//                (the block may then hold at most nwarps warps);
+//                0: the list is a per-thread local array
+template<
+    int maxhitboxes,
+    int nwarps,
+    class RaySubsetT=RaySubsetFixedBVH<false, 512, true, PrimTransfSRT>,
+    class PrimTransfT=PrimTransfSRT,
+    class PrimSamplerT=PrimSamplerTW<false>,
+    class PrimAccumT=PrimAccumAdditive>
+__global__ void raymarch_subset_forward_kernel(
+        int N, int H, int W, int K,
+        float3 * rayposim,
+        float3 * raydirim,
+        float stepsize,
+        float2 * tminmaxim,
+        int * sortedobjid,
+        int2 * nodechildren,
+        float3 * nodeaabb,
+        typename PrimTransfT::Data primtransf_data,
+        typename PrimSamplerT::Data primsampler_data,
+        typename PrimAccumT::Data primaccum_data
+        ) {
+    int w = blockIdx.x * blockDim.x + threadIdx.x;
+    int h = blockIdx.y * blockDim.y + threadIdx.y;
+    int n = blockIdx.z;
+    // validthread is computed here but not used by the forward kernel
+    bool validthread = (w < W) && (h < H) && (n<N);
+    assert(nwarps == 0 || blockDim.x * blockDim.y / 32 <= nwarps);
+    // warp index within the block, broadcast from lane 0
+    const int warpid = __shfl_sync(0xffffffff, (threadIdx.y * blockDim.x + threadIdx.x) / 32, 0);
+    // every lane of the warp must agree on that index
+    assert(__match_any_sync(0xffffffff, (threadIdx.y * blockDim.x + threadIdx.x) / 32) == 0xffffffff);
+    // warpmask contains the valid threads in the warp
+    unsigned warpmask = 0xffffffff;
+    // out-of-range threads are clamped onto the last valid index instead of
+    // exiting, so all 32 lanes stay active for the full-mask warp intrinsics
+    n = min(N - 1, n);
+    h = min(H - 1, h);
+    w = min(W - 1, w);
+    // advance the per-batch pointers: K sorted ids, 2K - 1 tree nodes and
+    // two float3 per node for the bounding boxes
+    sortedobjid += n * K;
+    nodechildren += n * (K + K - 1);
+    nodeaabb += n * (K + K - 1) * 2;
+    primtransf_data.n_stride(n);
+    primsampler_data.n_stride(n);
+    primaccum_data.n_stride(n, h, w);
+    float3 raypos = rayposim[n * H * W + h * W + w];
+    float3 raydir = raydirim[n * H * W + h * W + w];
+    float2 tminmax = tminmaxim[n * H * W + h * W + w];
+    // primitive list storage: shared slice per warp when nwarps > 0, otherwise
+    // the per-thread array (the unused one is sized 1)
+    int hitboxes[nwarps > 0 ? 1 : maxhitboxes];
+    __shared__ int hitboxes_sh[nwarps > 0 ? maxhitboxes * nwarps : 1];
+    int * hitboxes_ptr = nwarps > 0 ? hitboxes_sh + maxhitboxes * warpid : hitboxes;
+    int nhitboxes = 0;
+    // find raytminmax
+    // rtminmax starts as an empty interval; RaySubsetT::forward is expected to
+    // fill it together with hitboxes_ptr / nhitboxes
+    float2 rtminmax = make_float2(std::numeric_limits<float>::infinity(), -std::numeric_limits<float>::infinity());
+    RaySubsetT::forward(warpmask, K, raypos, raydir, tminmax, rtminmax,
+            sortedobjid, nodechildren, nodeaabb,
+            primtransf_data, hitboxes_ptr, nhitboxes);
+    // restrict the interval to the caller-provided [tmin, tmax]
+    rtminmax.x = max(rtminmax.x, tminmax.x);
+    rtminmax.y = min(rtminmax.y, tminmax.y);
+    __syncwarp(warpmask);
+    // start at tmin, then skip a whole number of steps up to rtminmax.x so
+    // sample positions stay on the tmin + i * stepsize grid
+    float t = tminmax.x;
+    raypos = raypos + raydir * tminmax.x;
+    int incs = floor((rtminmax.x - t) / stepsize);
+    t += incs * stepsize;
+    raypos += raydir * incs * stepsize;
+    PrimAccumT pa;
+    // keep stepping until every lane is past its interval end or is done
+    while (!__all_sync(warpmask, t > rtminmax.y + 1e-5f || pa.is_done())) {
+        for (int ks = 0; ks < nhitboxes; ++ks) {
+            int k = hitboxes_ptr[ks];
+            // compute primitive-relative coordinate
+            PrimTransfT pt;
+            float3 samplepos = pt.forward(primtransf_data, k, raypos);
+            // lanes that already finished keep looping but skip the sampling
+            if (pt.valid(samplepos) && !pa.is_done() && t < rtminmax.y + 1e-5f) {
+                // sample
+                PrimSamplerT ps;
+                float4 sample = ps.forward(primsampler_data, k, samplepos);
+                // accumulate
+                pa.forward_prim(primaccum_data, sample, stepsize);
+            }
+        }
+        // update position
+        t += stepsize;
+        raypos += raydir * stepsize;
+    }
+    pa.write(primaccum_data);
+}
+// Backward raymarching kernel: one thread per pixel (x -> w, y -> h), one grid
+// z-slice per batch item.
+//
+// Repeats the march of the forward kernel; at every sample it recomputes the
+// forward value and propagates the incoming gradient through the accumulator,
+// the sampler and the primitive transform.
+//
+// Template parameters:
+//   forwarddir   true: march from the start of the interval towards its end;
+//                false: start pa.get_nsteps() steps in and march back
+//   maxhitboxes  capacity of the per-ray primitive list
+//   nwarps       > 0: the list lives in shared memory, one slice per warp
+//                (the block may then hold at most nwarps warps);
+//                0: the list is a per-thread local array
+template <
+    bool forwarddir,
+    int maxhitboxes,
+    int nwarps,
+    class RaySubsetT=RaySubsetFixedBVH<false, 512, true, PrimTransfSRT>,
+    class PrimTransfT=PrimTransfSRT,
+    class PrimSamplerT=PrimSamplerTW<false>,
+    class PrimAccumT=PrimAccumAdditive>
+__global__ void raymarch_subset_backward_kernel(
+        int N, int H, int W, int K,
+        float3 * rayposim,
+        float3 * raydirim,
+        float stepsize,
+        float2 * tminmaxim,
+        int * sortedobjid,
+        int2 * nodechildren,
+        float3 * nodeaabb,
+        typename PrimTransfT::Data primtransf_data,
+        typename PrimSamplerT::Data primsampler_data,
+        typename PrimAccumT::Data primaccum_data
+        ) {
+    int w = blockIdx.x * blockDim.x + threadIdx.x;
+    int h = blockIdx.y * blockDim.y + threadIdx.y;
+    int n = blockIdx.z;
+    // validthread is passed to ps.backward / pt.backward below
+    bool validthread = (w < W) && (h < H) && (n<N);
+    assert(nwarps == 0 || blockDim.x * blockDim.y / 32 <= nwarps);
+    // warp index within the block, broadcast from lane 0
+    const int warpid = __shfl_sync(0xffffffff, (threadIdx.y * blockDim.x + threadIdx.x) / 32, 0);
+    // every lane of the warp must agree on that index
+    assert(__match_any_sync(0xffffffff, (threadIdx.y * blockDim.x + threadIdx.x) / 32) == 0xffffffff);
+    // warpmask contains the valid threads in the warp
+    unsigned warpmask = 0xffffffff;
+    // out-of-range threads are clamped onto the last valid index instead of
+    // exiting, so all 32 lanes stay active for the full-mask warp intrinsics
+    n = min(N - 1, n);
+    h = min(H - 1, h);
+    w = min(W - 1, w);
+    // advance the per-batch pointers: K sorted ids, 2K - 1 tree nodes and
+    // two float3 per node for the bounding boxes
+    sortedobjid += n * K;
+    nodechildren += n * (K + K - 1);
+    nodeaabb += n * (K + K - 1) * 2;
+    primtransf_data.n_stride(n);
+    primsampler_data.n_stride(n);
+    primaccum_data.n_stride(n, h, w);
+    float3 raypos = rayposim[n * H * W + h * W + w];
+    float3 raydir = raydirim[n * H * W + h * W + w];
+    float2 tminmax = tminmaxim[n * H * W + h * W + w];
+    // load this pixel's state from primaccum_data before marching
+    PrimAccumT pa;
+    pa.read(primaccum_data);
+    // primitive list storage: shared slice per warp when nwarps > 0, otherwise
+    // the per-thread array (the unused one is sized 1)
+    int hitboxes[nwarps > 0 ? 1 : maxhitboxes];
+    __shared__ int hitboxes_sh[nwarps > 0 ? maxhitboxes * nwarps : 1];
+    int * hitboxes_ptr = nwarps > 0 ? hitboxes_sh + maxhitboxes * warpid : hitboxes;
+    int nhitboxes = 0;
+    // find raytminmax
+    // rtminmax starts as an empty interval; RaySubsetT::forward is expected to
+    // fill it together with hitboxes_ptr / nhitboxes
+    float2 rtminmax = make_float2(std::numeric_limits<float>::infinity(), -std::numeric_limits<float>::infinity());
+    RaySubsetT::forward(warpmask, K, raypos, raydir, tminmax, rtminmax,
+            sortedobjid, nodechildren, nodeaabb,
+            primtransf_data, hitboxes_ptr, nhitboxes);
+    // restrict the interval to the caller-provided [tmin, tmax]
+    rtminmax.x = max(rtminmax.x, tminmax.x);
+    rtminmax.y = min(rtminmax.y, tminmax.y);
+    __syncwarp(warpmask);
+    // set up raymarching position
+    // start at tmin, then skip a whole number of steps up to rtminmax.x so
+    // sample positions stay on the tmin + i * stepsize grid
+    float t = tminmax.x;
+    raypos = raypos + raydir * tminmax.x;
+    int incs = floor((rtminmax.x - t) / stepsize);
+    t += incs * stepsize;
+    raypos += raydir * incs * stepsize;
+    if (!forwarddir) {
+        // reverse traversal begins nsteps steps into the interval
+        int nsteps = pa.get_nsteps();
+        t += nsteps * stepsize;
+        raypos += raydir * nsteps * stepsize;
+    }
+    // keep stepping while any lane is still inside its interval and not done
+    while (__any_sync(warpmask, (
+                    (forwarddir && t < rtminmax.y + 1e-5f ||
+                     !forwarddir && t > rtminmax.x - 1e-5f) &&
+                    !pa.is_done()))) {
+        for (int ks = 0; ks < nhitboxes; ++ks) {
+            // the primitive list is walked in reverse when marching backwards
+            int k = hitboxes_ptr[forwarddir ? ks : nhitboxes - ks - 1];
+            PrimTransfT pt;
+            float3 samplepos = pt.forward(primtransf_data, k, raypos);
+            bool evalprim = pt.valid(samplepos) && !pa.is_done() && t < rtminmax.y + 1e-5f;
+            float3 dL_samplepos = make_float3(0.f);
+            if (evalprim) {
+                // recompute the sample, then push the gradient through the
+                // accumulator and the sampler
+                PrimSamplerT ps;
+                float4 sample = ps.forward(primsampler_data, k, samplepos);
+                float4 dL_sample = pa.forwardbackward_prim(primaccum_data, sample, stepsize);
+                dL_samplepos = ps.backward(primsampler_data, k, samplepos, sample, dL_sample, validthread);
+            }
+            // pt.backward runs for the whole warp if any lane evaluated the
+            // primitive; the last argument flags the lanes that did
+            if (__any_sync(warpmask, evalprim)) {
+                pt.backward(primtransf_data, k, samplepos, dL_samplepos, validthread && evalprim);
+            }
+        }
+        if (forwarddir) {
+            t += stepsize;
+            raypos += raydir * stepsize;
+        } else {
+            t -= stepsize;
+            raypos -= raydir * stepsize;
+        }
+    }
+}

dva/mvp/extensions/mvpraymarch/primaccum.h ADDED Viewed

	@@ -0,0 +1,101 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef MVPRAYMARCHER_PRIMACCUM_H_
+#define MVPRAYMARCHER_PRIMACCUM_H_
+// Empty root type for accumulator data structs (PrimAccumAdditive::Data
+// derives from it); `base` is an alias for this root type.
+struct PrimAccumDataBase {
+    using base = PrimAccumDataBase;
+};
+// Per-ray accumulator: adds each sample's rgb weighted by alpha * stepsize and
+// stops contributing once the accumulated alpha reaches 1 ("saturation").
+struct PrimAccumAdditive {
+    // Image buffers this accumulator reads and writes.
+    struct Data : public PrimAccumDataBase {
+        float termthresh;
+        int nstride, hstride, wstride;
+        float4 * rayrgbaim;       // accumulated rgba, stored by write()
+        float4 * grad_rayrgbaim;  // gradient w.r.t. rayrgbaim, loaded by read()
+        float3 * raysatim;        // optional (may be null): rgb of the saturating sample
+        // Offsets every buffer pointer to element (n, h, w) using the shared strides.
+        __forceinline__ __device__ void n_stride(int n, int h, int w) {
+            rayrgbaim += n * nstride + h * hstride + w * wstride;
+            grad_rayrgbaim += n * nstride + h * hstride + w * wstride;
+            if (raysatim) {
+                raysatim += n * nstride + h * hstride + w * wstride;
+            }
+        }
+    };
+    float4 rayrgba;     // running accumulated rgb (xyz) and alpha (w)
+    float3 raysat;      // rgb of the sample that saturated the ray; x == -1 means "none"
+    bool sat;           // true once accumulated alpha has reached 1
+    float4 dL_rayrgba;  // gradient w.r.t. the final rayrgba; only set by read()
+    // Starts with zero accumulation, not saturated, and the -1 sentinel in raysat.
+    __forceinline__ __device__ PrimAccumAdditive() :
+        rayrgba(make_float4(0.f)),
+        raysat(make_float3(-1.f)),
+        sat(false) {
+    }
+    // The ray needs no further samples once it has saturated.
+    __forceinline__ __device__ bool is_done() const {
+        return sat;
+    }
+    // Always 0 for this accumulator.
+    __forceinline__ __device__ int get_nsteps() const {
+        return 0;
+    }
+    // Stores the accumulated rgba and, when the optional buffer is present,
+    // the saturation color.
+    __forceinline__ __device__ void write(const Data & data) {
+        *data.rayrgbaim = rayrgba;
+        if (data.raysatim) {
+            *data.raysatim = raysat;
+        }
+    }
+    // Loads the incoming gradient and the saturation color for the backward pass.
+    __forceinline__ __device__ void read(const Data & data) {
+        dL_rayrgba = *data.grad_rayrgbaim;
+        // raysatim is optional (n_stride() and write() both null-check it), so
+        // guard the load as well; without the buffer raysat keeps the -1
+        // sentinel, which forwardbackward_prim() treats as "never saturated".
+        if (data.raysatim) {
+            raysat = *data.raysatim;
+        }
+    }
+    // Accumulates one sample (rgb in xyz, density in w) over a step of length
+    // `stepsize`. `data` is not used here.
+    __forceinline__ __device__ void forward_prim(const Data & data, float4 sample, float stepsize) {
+        // accumulate
+        float3 rgb = make_float3(sample);
+        float alpha = sample.w;
+        float newalpha = rayrgba.w + alpha * stepsize;
+        // only the part of this sample that fits below alpha == 1 contributes
+        float contrib = fminf(newalpha, 1.f) - rayrgba.w;
+        rayrgba += make_float4(rgb, 1.f) * contrib;
+        if (newalpha >= 1.f) {
+            // save saturation point
+            if (!sat) {
+                raysat = rgb;
+            }
+            sat = true;
+        }
+    }
+    // Replays one accumulation step and returns the gradient w.r.t. `sample`
+    // (rgb gradient in xyz, alpha gradient in w), using dL_rayrgba and raysat
+    // as loaded by read(). `data` is not used here.
+    __forceinline__ __device__ float4 forwardbackward_prim(const Data & data, float4 sample, float stepsize) {
+        float3 rgb = make_float3(sample);
+        float4 rgb1 = make_float4(rgb, 1.f);
+        sample.w *= stepsize;
+        bool thissat = rayrgba.w + sample.w >= 1.f;
+        sat = sat || thissat;
+        // once saturated the sample only receives the remaining alpha budget
+        float weight = sat ? (1.f - rayrgba.w) : sample.w;
+        float3 dL_rgb = weight * make_float3(dL_rayrgba);
+        // no alpha gradient at or after saturation; before it, the saturating
+        // sample's color (if one was recorded) is subtracted from this sample's
+        float dL_alpha = sat ? 0.f :
+            stepsize * dot(rgb1 - (raysat.x > -1.f ? make_float4(raysat, 1.f) : make_float4(0.f)), dL_rayrgba);
+        rayrgba += make_float4(rgb, 1.f) * weight;
+        return make_float4(dL_rgb, dL_alpha);
+    }
+};
+#endif

dva/mvp/extensions/mvpraymarch/primsampler.h ADDED Viewed

	@@ -0,0 +1,94 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef MVPRAYMARCHER_PRIMSAMPLER_H_
+#define MVPRAYMARCHER_PRIMSAMPLER_H_
+// Empty root type for sampler data structs (PrimSamplerTW::Data derives from
+// it); `base` is an alias for this root type.
+struct PrimSamplerDataBase {
+    using base = PrimSamplerDataBase;
+};
+// Samples a 4-channel template volume for primitive k, optionally looking the
+// position up through a 3-channel warp volume first (when `dowarp` is set).
+// The sampled alpha is scaled by
+//   fade = exp(-fadescale * (|x|^fadeexp + |y|^fadeexp + |z|^fadeexp)).
+// forward() caches state that backward() reuses, so backward() must follow a
+// forward() call on the same object.
+template<
+    bool dowarp,
+    template<typename> class GridSamplerT=GridSamplerChlast>
+struct PrimSamplerTW {
+    struct Data : public PrimSamplerDataBase {
+        float fadescale, fadeexp;
+        int tplate_nstride;
+        int TD, TH, TW;
+        float * tplate;
+        float * grad_tplate;
+        int warp_nstride;
+        int WD, WH, WW;
+        float * warp;
+        float * grad_warp;
+        // Offsets the template and warp buffers (and their gradients) to batch n.
+        __forceinline__ __device__ void n_stride(int n) {
+            tplate += n * tplate_nstride;
+            grad_tplate += n * tplate_nstride;
+            warp += n * warp_nstride;
+            grad_warp += n * warp_nstride;
+        }
+    };
+    // State cached by forward() for backward().
+    float fade;
+    float * tplate_ptr;
+    float * warp_ptr;
+    float3 yy1;
+    // Returns the template sample at y0 for primitive k, with alpha (w)
+    // multiplied by the fade factor.
+    __forceinline__ __device__ float4 forward(
+            const Data & data,
+            int k,
+            float3 y0) {
+        const float powsum =
+            __powf(abs(y0.x), data.fadeexp) +
+            __powf(abs(y0.y), data.fadeexp) +
+            __powf(abs(y0.z), data.fadeexp);
+        fade = __expf(-data.fadescale * powsum);
+        // without warping the template is sampled at y0 directly
+        yy1 = y0;
+        if (dowarp) {
+            warp_ptr = data.warp + (k * 3 * data.WD * data.WH * data.WW);
+            yy1 = GridSamplerT<float3>::forward(3, data.WD, data.WH, data.WW, warp_ptr, y0, false);
+        }
+        tplate_ptr = data.tplate + (k * 4 * data.TD * data.TH * data.TW);
+        float4 rgba = GridSamplerT<float4>::forward(4, data.TD, data.TH, data.TW, tplate_ptr, yy1, false);
+        rgba.w *= fade;
+        return rgba;
+    }
+    // Accumulates gradients into grad_tplate (and grad_warp when warping) and
+    // returns the gradient w.r.t. y0. `sample` is the value forward() returned.
+    // When `validthread` is false the sampler backward calls still run, but
+    // with zero incoming gradients.
+    __forceinline__ __device__ float3 backward(const Data & data, int k, float3 y0,
+            float4 sample, float4 dL_sample, bool validthread) {
+        const float powm1 = data.fadeexp - 1.f;
+        const float3 signedpow = make_float3(
+                    __powf(abs(y0.x), powm1) * (y0.x > 0.f ? 1.f : -1.f),
+                    __powf(abs(y0.y), powm1) * (y0.y > 0.f ? 1.f : -1.f),
+                    __powf(abs(y0.z), powm1) * (y0.z > 0.f ? 1.f : -1.f));
+        float3 dfade_y0 = -(data.fadescale * data.fadeexp) * signedpow;
+        // fade term: uses the alpha gradient before it is rescaled below
+        float3 dL_y0 = dfade_y0 * sample.w * dL_sample.w;
+        dL_sample.w *= fade;
+        float * grad_tplate_ptr = data.grad_tplate + (k * 4 * data.TD * data.TH * data.TW);
+        const float4 tplate_gin = validthread ? dL_sample : make_float4(0.f);
+        float3 dL_y1 = GridSamplerT<float4>::backward(4, data.TD, data.TH, data.TW,
+                tplate_ptr, grad_tplate_ptr, yy1, tplate_gin, false);
+        if (!dowarp) {
+            dL_y0 += dL_y1;
+            return dL_y0;
+        }
+        float * grad_warp_ptr = data.grad_warp + (k * 3 * data.WD * data.WH * data.WW);
+        const float3 warp_gin = validthread ? dL_y1 : make_float3(0.f);
+        dL_y0 += GridSamplerT<float3>::backward(3, data.WD, data.WH, data.WW,
+                warp_ptr, grad_warp_ptr, y0, warp_gin, false);
+        return dL_y0;
+    }
+};
+#endif

dva/mvp/extensions/mvpraymarch/primtransf.h ADDED Viewed

	@@ -0,0 +1,182 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef MVPRAYMARCHER_PRIMTRANSF_H_
+#define MVPRAYMARCHER_PRIMTRANSF_H_
+#include "utils.h"
+// Computes the axis-aligned bounding box [pmin, pmax] of the eight points
+// (+-1, +-1, +-1) / ps, each mapped to
+// (dot(p, pr0), dot(p, pr1), dot(p, pr2)) + pt.
+__forceinline__ __device__ void compute_aabb_srt(
+        float3 pt, float3 pr0, float3 pr1, float3 pr2, float3 ps,
+        float3 & pmin, float3 & pmax) {
+    for (int corner = 0; corner < 8; ++corner) {
+        // bit 0 selects the x sign, bit 1 the y sign, bit 2 the z sign
+        const float3 signs = make_float3(
+                (corner & 1) ? 1.f : -1.f,
+                (corner & 2) ? 1.f : -1.f,
+                (corner & 4) ? 1.f : -1.f);
+        const float3 scaled = signs / ps;
+        const float3 mapped =
+            make_float3(dot(scaled, pr0), dot(scaled, pr1), dot(scaled, pr2)) + pt;
+        if (corner == 0) {
+            // the first corner seeds both bounds
+            pmin = mapped;
+            pmax = mapped;
+        } else {
+            pmin = fminf(pmin, mapped);
+            pmax = fmaxf(pmax, mapped);
+        }
+    }
+}
+// Empty root type for primitive-transform data structs (PrimTransfSRT::Data
+// derives from it); `base` is an alias for this root type.
+struct PrimTransfDataBase {
+    using base = PrimTransfDataBase;
+};
+// Per-primitive transform built from a position, three float3 rotation
+// entries and a per-axis scale:
+//   y0 = (pr0 * d.x + pr1 * d.y + pr2 * d.z) * ps,  with d = x - pt.
+// forward() caches its intermediates so backward() can reuse them.
+struct PrimTransfSRT {
+    struct Data : public PrimTransfDataBase {
+        int primpos_nstride;
+        float3 * primpos;
+        float3 * grad_primpos;
+        int primrot_nstride;
+        float3 * primrot;
+        float3 * grad_primrot;
+        int primscale_nstride;
+        float3 * primscale;
+        float3 * grad_primscale;
+        // Offsets all parameter and gradient pointers to batch n.
+        __forceinline__ __device__ void n_stride(int n) {
+            primpos += n * primpos_nstride;
+            grad_primpos += n * primpos_nstride;
+            primrot += n * primrot_nstride;
+            grad_primrot += n * primrot_nstride;
+            primscale += n * primscale_nstride;
+            grad_primscale += n * primscale_nstride;
+        }
+        // Position of primitive k in batch n (indexes from the current primpos).
+        __forceinline__ __device__ float3 get_center(int n, int k) {
+            const int idx = n * primpos_nstride + k;
+            return primpos[idx];
+        }
+        // Bounding box of primitive k in batch n, via compute_aabb_srt().
+        __forceinline__ __device__ void compute_aabb(int n, int k, float3 & pmin, float3 & pmax) {
+            const int rotbase = n * primrot_nstride + k * 3;
+            const float3 center = primpos[n * primpos_nstride + k];
+            const float3 rot0 = primrot[rotbase + 0];
+            const float3 rot1 = primrot[rotbase + 1];
+            const float3 rot2 = primrot[rotbase + 2];
+            const float3 scale = primscale[n * primscale_nstride + k];
+            compute_aabb_srt(center, rot0, rot1, rot2, scale, pmin, pmax);
+        }
+    };
+    // Intermediates cached by forward() for backward().
+    float3 xmt;   // x - pt
+    float3 pr0;
+    float3 pr1;
+    float3 pr2;
+    float3 rxmt;  // rotated offset, before scaling
+    float3 ps;
+    // True when every component of pos lies strictly inside (-1, 1).
+    static __forceinline__ __device__ bool valid(float3 pos) {
+        const bool x_in = pos.x > -1.f && pos.x < 1.f;
+        const bool y_in = pos.y > -1.f && pos.y < 1.f;
+        const bool z_in = pos.z > -1.f && pos.z < 1.f;
+        return x_in && y_in && z_in;
+    }
+    // Maps x into primitive k's local coordinates and caches the intermediates.
+    __forceinline__ __device__ float3 forward(
+            const Data & data,
+            int k,
+            float3 x) {
+        const float3 center = data.primpos[k];
+        const float3 * rot = data.primrot + (k) * 3;
+        pr0 = rot[0];
+        pr1 = rot[1];
+        pr2 = rot[2];
+        ps = data.primscale[k];
+        xmt = x - center;
+        rxmt = pr0 * xmt.x + pr1 * xmt.y + pr2 * xmt.z;
+        return rxmt * ps;
+    }
+    // Stateless variant that transforms a point r and a direction d together;
+    // the direction is rotated and scaled but not translated.
+    static __forceinline__ __device__ void forward2(
+            const Data & data,
+            int k,
+            float3 r, float3 d, float3 & rout, float3 & dout) {
+        const float3 center = data.primpos[k];
+        const float3 * rot = data.primrot + k * 3;
+        const float3 rot0 = rot[0];
+        const float3 rot1 = rot[1];
+        const float3 rot2 = rot[2];
+        const float3 scale = data.primscale[k];
+        const float3 offset = r - center;
+        const float3 rotated_r = rot0 * offset.x + rot1 * offset.y + rot2 * offset.z;
+        const float3 rotated_d = rot0 * d.x + rot1 * d.y + rot2 * d.z;
+        rout = rotated_r * scale;
+        dout = rotated_d * scale;
+    }
+    // Adds the gradients for primitive k's scale, rotation and position, given
+    // dL_y0 (gradient w.r.t. forward()'s result). Every thread issues every
+    // fastAtomicAdd; threads with `validthread` false contribute 0 instead of
+    // skipping the call. `x` is not used.
+    __forceinline__ __device__ void backward(const Data & data, int k, float3 x, float3 dL_y0, bool validthread) {
+        float * gscale = (float*)data.grad_primscale + k * 3;
+        fastAtomicAdd(gscale + 0, validthread ? rxmt.x * dL_y0.x : 0.f);
+        fastAtomicAdd(gscale + 1, validthread ? rxmt.y * dL_y0.y : 0.f);
+        fastAtomicAdd(gscale + 2, validthread ? rxmt.z * dL_y0.z : 0.f);
+        // from here on dL_y0 is the gradient w.r.t. the unscaled rotated offset
+        dL_y0 *= ps;
+        float * grot0 = (float*)data.grad_primrot + (k * 3 + 0) * 3;
+        const float3 gpr0 = xmt.x * dL_y0;
+        fastAtomicAdd(grot0 + 0, validthread ? gpr0.x : 0.f);
+        fastAtomicAdd(grot0 + 1, validthread ? gpr0.y : 0.f);
+        fastAtomicAdd(grot0 + 2, validthread ? gpr0.z : 0.f);
+        float * grot1 = (float*)data.grad_primrot + (k * 3 + 1) * 3;
+        const float3 gpr1 = xmt.y * dL_y0;
+        fastAtomicAdd(grot1 + 0, validthread ? gpr1.x : 0.f);
+        fastAtomicAdd(grot1 + 1, validthread ? gpr1.y : 0.f);
+        fastAtomicAdd(grot1 + 2, validthread ? gpr1.z : 0.f);
+        float * grot2 = (float*)data.grad_primrot + (k * 3 + 2) * 3;
+        const float3 gpr2 = xmt.z * dL_y0;
+        fastAtomicAdd(grot2 + 0, validthread ? gpr2.x : 0.f);
+        fastAtomicAdd(grot2 + 1, validthread ? gpr2.y : 0.f);
+        fastAtomicAdd(grot2 + 2, validthread ? gpr2.z : 0.f);
+        // the position enters as (x - pt), hence the negated dot products
+        float * gpos = (float*)data.grad_primpos + k * 3;
+        fastAtomicAdd(gpos + 0, validthread ? -dot(pr0, dL_y0) : 0.f);
+        fastAtomicAdd(gpos + 1, validthread ? -dot(pr1, dL_y0) : 0.f);
+        fastAtomicAdd(gpos + 2, validthread ? -dot(pr2, dL_y0) : 0.f);
+    }
+};
+#endif

dva/mvp/extensions/mvpraymarch/setup.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+if __name__ == "__main__":
+    import torch
+    setup(
+        name="mvpraymarch",
+        ext_modules=[
+            CUDAExtension(
+                "mvpraymarchlib",
+                sources=["mvpraymarch.cpp", "mvpraymarch_kernel.cu", "bvh.cu"],
+                extra_compile_args={
+                    "nvcc": [
+                        "-use_fast_math",
+                        "-arch=sm_70",
+                        "-std=c++17",
+                        "-lineinfo",
+                    ]
+                }
+            )
+        ],
+        cmdclass={"build_ext": BuildExtension}
+    )

dva/mvp/extensions/mvpraymarch/utils.h ADDED Viewed

	@@ -0,0 +1,847 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef MVPRAYMARCHER_UTILS_H_
+#define MVPRAYMARCHER_UTILS_H_
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include "helper_math.h"
+// Returns end - start as a float. When end < start it instead returns
+// end + (max long long - start).
+static __forceinline__ __device__ float clock_diff(long long int end, long long int start) {
+    if (end < start) {
+        const long long int max_clock = std::numeric_limits<long long int>::max();
+        return end + float(max_clock - start);
+    }
+    return float(end - start);
+}
+// True when every component of a is >= the matching component of b
+// (the comparison is non-strict despite the name).
+static __forceinline__ __device__
+bool allgt(float3 a, float3 b) {
+    const bool x_ok = a.x >= b.x;
+    const bool y_ok = a.y >= b.y;
+    const bool z_ok = a.z >= b.z;
+    return x_ok && y_ok && z_ok;
+}
+// True when every component of a is <= the matching component of b
+// (the comparison is non-strict despite the name).
+static __forceinline__ __device__
+bool alllt(float3 a, float3 b) {
+    const bool x_ok = a.x <= b.x;
+    const bool y_ok = a.y <= b.y;
+    const bool z_ok = a.z <= b.z;
+    return x_ok && y_ok && z_ok;
+}
+// Component-wise softplus log(1 + exp(v)). Components greater than 20 are
+// returned unchanged.
+static __forceinline__ __device__
+float4 softplus(float4 x) {
+    float4 out = x;
+    if (!(x.x > 20.f)) {
+        out.x = logf(1.f + expf(x.x));
+    }
+    if (!(x.y > 20.f)) {
+        out.y = logf(1.f + expf(x.y));
+    }
+    if (!(x.z > 20.f)) {
+        out.z = logf(1.f + expf(x.z));
+    }
+    if (!(x.w > 20.f)) {
+        out.w = logf(1.f + expf(x.w));
+    }
+    return out;
+}
+// Scalar softplus written as log(1 + exp(-|x|)) + max(x, 0), so the
+// exponential is only ever taken of a non-positive argument.
+static __forceinline__ __device__
+float softplus(float x) {
+    const float logterm = __logf(1.f + __expf(-abs(x)));
+    return logterm + max(x, 0.f);
+}
+// Derivative of softplus, i.e. 1 / (1 + exp(-x)), evaluated through
+// exp(-|x|) and the sign of x.
+static __forceinline__ __device__
+float softplus_grad(float x) {
+    const float e = __expf(-abs(x));
+    const float centered = 0.5f - e / (1.f + e);
+    return centered * copysign(1.f, x) + 0.5f;
+}
+// Component-wise logistic sigmoid 1 / (1 + exp(-v)).
+static __forceinline__ __device__
+float4 sigmoid(float4 x) {
+    float4 out;
+    out.x = 1.f / (1.f + expf(-x.x));
+    out.y = 1.f / (1.f + expf(-x.y));
+    out.z = 1.f / (1.f + expf(-x.z));
+    out.w = 1.f / (1.f + expf(-x.w));
+    return out;
+}
+// Sums `val` over the warp with shuffle-down steps (offsets 16, 8, 4, 2, 1),
+// then only lane 0 performs a single atomicAdd of the total to its `ptr`.
+// The shuffles use the full mask 0xffffffff, so all 32 lanes must call this
+// together. The lane index is derived from threadIdx.x and threadIdx.y only.
+static __forceinline__ __device__ void fastAtomicAdd(float * ptr, float val) {
+    float total = val;
+    for (int offset = 16; offset >= 1; offset >>= 1) {
+        total += __shfl_down_sync(0xffffffff, total, offset);
+    }
+    const int laneid = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
+    if (laneid != 0) {
+        return;
+    }
+    atomicAdd(ptr, total);
+}
+// True when (d, h, w) is a valid index into a D x H x W volume.
+static __forceinline__ __device__
+bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
+    const bool d_ok = d >= 0 && d < D;
+    const bool h_ok = h >= 0 && h < H;
+    const bool w_ok = w >= 0 && w < W;
+    return d_ok && h_ok && w_ok;
+}
+// Atomically adds `delta` to the float at (d, h, w) using strides (sD, sH, sW);
+// does nothing when the index is outside the D x H x W volume.
+static __forceinline__ __device__
+void safe_add_3d(float *data, int d, int h, int w,
+               int sD, int sH, int sW, int D, int H, int W,
+               float delta) {
+    if (!within_bounds_3d(d, h, w, D, H, W)) {
+        return;
+    }
+    float * cell = data + d * sD + h * sH + w * sW;
+    atomicAdd(cell, delta);
+}
+// float3 variant: atomically adds delta.x/y/z to the three consecutive floats
+// of the element at (d, h, w); does nothing when the index is out of bounds.
+static __forceinline__ __device__
+void safe_add_3d(float3 *data, int d, int h, int w,
+               int sD, int sH, int sW, int D, int H, int W,
+               float3 delta) {
+    if (!within_bounds_3d(d, h, w, D, H, W)) {
+        return;
+    }
+    float * cell = (float*)data + (d * sD + h * sH + w * sW) * 3;
+    atomicAdd(cell + 0, delta.x);
+    atomicAdd(cell + 1, delta.y);
+    atomicAdd(cell + 2, delta.z);
+}
+// float4 variant: atomically adds delta.x/y/z/w to the four consecutive floats
+// of the element at (d, h, w); does nothing when the index is out of bounds.
+static __forceinline__ __device__
+void safe_add_3d(float4 *data, int d, int h, int w,
+               int sD, int sH, int sW, int D, int H, int W,
+               float4 delta) {
+    if (!within_bounds_3d(d, h, w, D, H, W)) {
+        return;
+    }
+    float * cell = (float*)data + (d * sD + h * sH + w * sW) * 4;
+    atomicAdd(cell + 0, delta.x);
+    atomicAdd(cell + 1, delta.y);
+    atomicAdd(cell + 2, delta.z);
+    atomicAdd(cell + 3, delta.w);
+}
+// Clamps `in` to the range [0, clip_limit - 1].
+static __forceinline__ __device__
+float clip_coordinates(float in, int clip_limit) {
+    const float upper = static_cast<float>(clip_limit - 1);
+    const float lower_clamped = ::max(in, 0.f);
+    return ::min(upper, lower_clamped);
+}
+// Clamps `in` to [0, clip_limit - 1] and stores the derivative of that clamp
+// in *grad_in: 0 when the value was clipped, 1 when it passed through.
+template <typename scalar_t>
+static __forceinline__ __device__
+float clip_coordinates_set_grad(float in, int clip_limit, scalar_t *grad_in) {
+    if (in < 0.f) {
+        *grad_in = static_cast<scalar_t>(0);
+        return 0.f;
+    }
+    const float upper = static_cast<float>(clip_limit - 1);
+    if (in > upper) {
+        *grad_in = static_cast<scalar_t>(0);
+        return upper;
+    }
+    *grad_in = static_cast<scalar_t>(1);
+    return in;
+}
+// Trilinearly samples a C-channel volume at `pos`.
+//   vals   - indexed as [c][z][y][x] (strides W*H*D, W*H, W, 1)
+//   pos    - coordinates where [-1, 1] maps to [0, size - 1] per axis; the
+//            normalized coordinate is clamped to [-10, 10] before scaling
+//   border - when true, coordinates are clipped to the volume; otherwise
+//            corners outside the volume contribute nothing
+// Channel c is written to the c-th float starting at result.x, so this
+// assumes out_t holds at least C consecutive floats.
+template<typename out_t>
+static __device__ out_t grid_sample_forward(int C, int inp_D, int inp_H,
+        int inp_W, float* vals, float3 pos, bool border) {
+    const int sW = 1, sH = inp_W, sD = inp_W * inp_H, sC = inp_W * inp_H * inp_D;
+    // map [-1, 1] onto [0, size - 1]
+    float ix = max(-10.f, min(10.f, ((pos.x + 1.f) * 0.5f))) * (inp_W - 1);
+    float iy = max(-10.f, min(10.f, ((pos.y + 1.f) * 0.5f))) * (inp_H - 1);
+    float iz = max(-10.f, min(10.f, ((pos.z + 1.f) * 0.5f))) * (inp_D - 1);
+    if (border) {
+        ix = clip_coordinates(ix, inp_W);
+        iy = clip_coordinates(iy, inp_H);
+        iz = clip_coordinates(iz, inp_D);
+    }
+    // lower (0) and upper (1) voxel index of the enclosing cell on each axis
+    const int x0 = static_cast<int>(::floor(ix)), x1 = x0 + 1;
+    const int y0 = static_cast<int>(::floor(iy)), y1 = y0 + 1;
+    const int z0 = static_cast<int>(::floor(iz)), z1 = z0 + 1;
+    // interpolation weights; the suffix digits give the corner's x, y, z
+    // selection (0 = lower index, 1 = upper index)
+    const float w000 = (x1 - ix) * (y1 - iy) * (z1 - iz);
+    const float w100 = (ix - x0) * (y1 - iy) * (z1 - iz);
+    const float w010 = (x1 - ix) * (iy - y0) * (z1 - iz);
+    const float w110 = (ix - x0) * (iy - y0) * (z1 - iz);
+    const float w001 = (x1 - ix) * (y1 - iy) * (iz - z0);
+    const float w101 = (ix - x0) * (y1 - iy) * (iz - z0);
+    const float w011 = (x1 - ix) * (iy - y0) * (iz - z0);
+    const float w111 = (ix - x0) * (iy - y0) * (iz - z0);
+    // bounds tests and offsets do not depend on the channel
+    const bool in000 = within_bounds_3d(z0, y0, x0, inp_D, inp_H, inp_W);
+    const bool in100 = within_bounds_3d(z0, y0, x1, inp_D, inp_H, inp_W);
+    const bool in010 = within_bounds_3d(z0, y1, x0, inp_D, inp_H, inp_W);
+    const bool in110 = within_bounds_3d(z0, y1, x1, inp_D, inp_H, inp_W);
+    const bool in001 = within_bounds_3d(z1, y0, x0, inp_D, inp_H, inp_W);
+    const bool in101 = within_bounds_3d(z1, y0, x1, inp_D, inp_H, inp_W);
+    const bool in011 = within_bounds_3d(z1, y1, x0, inp_D, inp_H, inp_W);
+    const bool in111 = within_bounds_3d(z1, y1, x1, inp_D, inp_H, inp_W);
+    const int off000 = z0 * sD + y0 * sH + x0 * sW;
+    const int off100 = z0 * sD + y0 * sH + x1 * sW;
+    const int off010 = z0 * sD + y1 * sH + x0 * sW;
+    const int off110 = z0 * sD + y1 * sH + x1 * sW;
+    const int off001 = z1 * sD + y0 * sH + x0 * sW;
+    const int off101 = z1 * sD + y0 * sH + x1 * sW;
+    const int off011 = z1 * sD + y1 * sH + x0 * sW;
+    const int off111 = z1 * sD + y1 * sH + x1 * sW;
+    out_t result;
+    float * out = &result.x;
+    const float * chan = vals;
+    for (int c = 0; c < C; ++c, chan += sC) {
+        float acc = 0.f;
+        if (in000) {
+            acc += chan[off000] * w000;
+        }
+        if (in100) {
+            acc += chan[off100] * w100;
+        }
+        if (in010) {
+            acc += chan[off010] * w010;
+        }
+        if (in110) {
+            acc += chan[off110] * w110;
+        }
+        if (in001) {
+            acc += chan[off001] * w001;
+        }
+        if (in101) {
+            acc += chan[off101] * w101;
+        }
+        if (in011) {
+            acc += chan[off011] * w011;
+        }
+        if (in111) {
+            acc += chan[off111] * w111;
+        }
+        out[c] = acc;
+    }
+    return result;
+}
+// Backward pass of grid_sample_forward for one sample position.
+//   vals      - the sampled volume, indexed as [c][z][y][x]
+//   grad_vals - gradient buffer with the same layout; each in-bounds corner
+//               receives weight * grad_out[c] through atomicAdd
+//   pos       - the position used in the forward pass
+//   grad_out  - gradient w.r.t. the forward result; channel c is read from the
+//               c-th float starting at grad_out.x
+//   border    - must match the flag used in the forward pass
+// Returns the gradient w.r.t. `pos`. The gradient of the [-10, 10] clamp on
+// the normalized coordinate is not modelled.
+template<typename out_t>
+static __device__ float3 grid_sample_backward(int C, int inp_D, int inp_H,
+        int inp_W, float* vals, float* grad_vals, float3 pos, out_t grad_out,
+        bool border) {
+    int inp_sW = 1, inp_sH = inp_W, inp_sD = inp_W * inp_H, inp_sC = inp_W * inp_H * inp_D;
+    int gInp_sW = 1, gInp_sH = inp_W, gInp_sD = inp_W * inp_H, gInp_sC = inp_W * inp_H * inp_D;
+    int gOut_sC = 1;
+    // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1]
+    float ix = max(-10.f, min(10.f, ((pos.x + 1.f) * 0.5f))) * (inp_W - 1);
+    float iy = max(-10.f, min(10.f, ((pos.y + 1.f) * 0.5f))) * (inp_H - 1);
+    float iz = max(-10.f, min(10.f, ((pos.z + 1.f) * 0.5f))) * (inp_D - 1);
+    // derivative of the normalization above w.r.t. pos
+    float gix_mult = (inp_W - 1.f) / 2;
+    float giy_mult = (inp_H - 1.f) / 2;
+    float giz_mult = (inp_D - 1.f) / 2;
+    if (border) {
+        // clip coordinates to image borders. The clip derivative (0 or 1) is
+        // multiplied onto the normalization factor; storing it directly into
+        // g*_mult would drop the (size - 1) / 2 scale from the result.
+        float gix_clip, giy_clip, giz_clip;
+        ix = clip_coordinates_set_grad(ix, inp_W, &gix_clip);
+        iy = clip_coordinates_set_grad(iy, inp_H, &giy_clip);
+        iz = clip_coordinates_set_grad(iz, inp_D, &giz_clip);
+        gix_mult *= gix_clip;
+        giy_mult *= giy_clip;
+        giz_mult *= giz_clip;
+    }
+    // get corner pixel values from (x, y, z)
+    // for 4d, we used north-east-south-west
+    // for 5d, we add top-bottom
+    int ix_tnw = static_cast<int>(::floor(ix));
+    int iy_tnw = static_cast<int>(::floor(iy));
+    int iz_tnw = static_cast<int>(::floor(iz));
+    int ix_tne = ix_tnw + 1;
+    int iy_tne = iy_tnw;
+    int iz_tne = iz_tnw;
+    int ix_tsw = ix_tnw;
+    int iy_tsw = iy_tnw + 1;
+    int iz_tsw = iz_tnw;
+    int ix_tse = ix_tnw + 1;
+    int iy_tse = iy_tnw + 1;
+    int iz_tse = iz_tnw;
+    int ix_bnw = ix_tnw;
+    int iy_bnw = iy_tnw;
+    int iz_bnw = iz_tnw + 1;
+    int ix_bne = ix_tnw + 1;
+    int iy_bne = iy_tnw;
+    int iz_bne = iz_tnw + 1;
+    int ix_bsw = ix_tnw;
+    int iy_bsw = iy_tnw + 1;
+    int iz_bsw = iz_tnw + 1;
+    int ix_bse = ix_tnw + 1;
+    int iy_bse = iy_tnw + 1;
+    int iz_bse = iz_tnw + 1;
+    // get surfaces to each neighbor:
+    float tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
+    float tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
+    float tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
+    float tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
+    float bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
+    float bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
+    float bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
+    float bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
+    // gradient w.r.t. the unnormalized coordinates, summed over channels
+    float gix = static_cast<float>(0), giy = static_cast<float>(0), giz = static_cast<float>(0);
+    float *gOut_ptr_NCDHW = &grad_out.x;
+    float *gInp_ptr_NC = grad_vals;
+    float *inp_ptr_NC = vals;
+    // per channel: scatter the value gradient, then accumulate the position gradient
+    for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) {
+      float gOut = *gOut_ptr_NCDHW;
+      // calculate and set grad_input
+      safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut);
+      // calculate grad_grid
+      if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
+        float tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW];
+        gix -= tnw_val * (iy_bse - iy)    * (iz_bse - iz)    * gOut;
+        giy -= tnw_val * (ix_bse - ix)    * (iz_bse - iz)    * gOut;
+        giz -= tnw_val * (ix_bse - ix)    * (iy_bse - iy)    * gOut;
+      }
+      if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
+        float tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW];
+        gix += tne_val * (iy_bsw - iy)    * (iz_bsw - iz)    * gOut;
+        giy -= tne_val * (ix    - ix_bsw) * (iz_bsw - iz)    * gOut;
+        giz -= tne_val * (ix    - ix_bsw) * (iy_bsw - iy)    * gOut;
+      }
+      if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
+        float tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW];
+        gix -= tsw_val * (iy - iy_bne)    * (iz_bne - iz)    * gOut;
+        giy += tsw_val * (ix_bne - ix)    * (iz_bne - iz)    * gOut;
+        giz -= tsw_val * (ix_bne - ix)    * (iy    - iy_bne) * gOut;
+      }
+      if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
+        float tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW];
+        gix += tse_val * (iy - iy_bnw)    * (iz_bnw - iz)    * gOut;
+        giy += tse_val * (ix    - ix_bnw) * (iz_bnw - iz)    * gOut;
+        giz -= tse_val * (ix    - ix_bnw) * (iy    - iy_bnw) * gOut;
+      }
+      if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
+        float bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW];
+        gix -= bnw_val * (iy_tse - iy)    * (iz - iz_tse)    * gOut;
+        giy -= bnw_val * (ix_tse - ix)    * (iz - iz_tse)    * gOut;
+        giz += bnw_val * (ix_tse - ix)    * (iy_tse - iy)    * gOut;
+      }
+      if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
+        float bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW];
+        gix += bne_val * (iy_tsw - iy)    * (iz - iz_tsw)    * gOut;
+        giy -= bne_val * (ix    - ix_tsw) * (iz - iz_tsw)    * gOut;
+        giz += bne_val * (ix    - ix_tsw) * (iy_tsw - iy)    * gOut;
+      }
+      if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
+        float bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW];
+        gix -= bsw_val * (iy - iy_tne)    * (iz - iz_tne)    * gOut;
+        giy += bsw_val * (ix_tne - ix)    * (iz - iz_tne)    * gOut;
+        giz += bsw_val * (ix_tne - ix)    * (iy    - iy_tne) * gOut;
+      }
+      if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
+        float bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW];
+        gix += bse_val * (iy - iy_tnw)    * (iz - iz_tnw)    * gOut;
+        giy += bse_val * (ix    - ix_tnw) * (iz - iz_tnw)    * gOut;
+        giz += bse_val * (ix    - ix_tnw) * (iy    - iy_tnw) * gOut;
+      }
+    }
+    // chain rule back to pos through normalization (and clipping, if enabled)
+    return make_float3(gix_mult * gix, giy_mult * giy, giz_mult * giz);
+}
+// Class-template wrapper around the grid_sample_forward / grid_sample_backward
+// function templates. A class template can be handed over as a template
+// template argument (the `template<typename> class GridSamplerT` form that
+// PrimSamplerTW expects); a function template cannot.
+template<typename out_t>
+struct GridSampler {
+    // Forwards to grid_sample_forward<out_t>.
+    static __forceinline__ __device__ out_t forward(int C, int inp_D, int inp_H, int inp_W,
+            float* vals, float3 pos, bool border) {
+        const out_t sampled = grid_sample_forward<out_t>(C, inp_D, inp_H, inp_W, vals, pos, border);
+        return sampled;
+    }
+    // Forwards to grid_sample_backward<out_t>.
+    static __forceinline__ __device__ float3 backward(int C, int inp_D, int inp_H, int inp_W,
+            float* vals, float* grad_vals, float3 pos, out_t grad_out, bool border) {
+        const float3 grad_pos = grid_sample_backward<out_t>(
+                C, inp_D, inp_H, inp_W, vals, grad_vals, pos, grad_out, border);
+        return grad_pos;
+    }
+};
+//template <typename T>
+//__device__ void cswap ( T& a, T& b ) {
+//    T c(a); a=b; b=c;
+//}
+// Returns the flattened index ((d * H) + h) * W + w when (d, h, w) lies inside
+// the D x H x W volume, and -1 otherwise.
+static __forceinline__ __device__
+int within_bounds_3d_ind(int d, int h, int w, int D, int H, int W) {
+    const bool inside = d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
+    if (!inside) {
+        return -1;
+    }
+    return ((d * H) + h) * W + w;
+}
+// Trilinear lookup into a dense inp_D x inp_H x inp_W volume whose elements are
+// whole out_t values: `vals` is reinterpreted as out_t* and indexed with one
+// out_t per voxel, x varying fastest.
+//   (unnamed int) : receives C from GridSamplerChlast::forward; not used here.
+//   pos           : sample position; each component is mapped from [-1, 1] to
+//                   [0, size-1] along its axis (x -> W, y -> H, z -> D).
+//   border        : when true, the scaled coordinates are passed through
+//                   clip_coordinates (defined elsewhere in this file).
+// Returns the weighted sum of the up-to-eight voxels surrounding the sample.
+// Corners that fail within_bounds_3d are skipped and therefore contribute zero.
+template<class out_t>
+static __device__ out_t grid_sample_chlast_forward(int, int inp_D, int inp_H,
+        int inp_W, float * vals, float3 pos, bool border) {
+    // element strides, in units of out_t
+    int inp_sW = 1, inp_sH = inp_W, inp_sD = inp_W * inp_H;
+    // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1]
+    // The normalized value is clamped to [-100, 100] before scaling.
+    // NOTE(review): presumably to keep the float -> int conversion below in
+    // range for far-away positions -- confirm.
+    float ix = max(-100.f, min(100.f, ((pos.x + 1.f) / 2))) * (inp_W - 1);
+    float iy = max(-100.f, min(100.f, ((pos.y + 1.f) / 2))) * (inp_H - 1);
+    float iz = max(-100.f, min(100.f, ((pos.z + 1.f) / 2))) * (inp_D - 1);
+    if (border) {
+        // clip coordinates to image borders
+        ix = clip_coordinates(ix, inp_W);
+        iy = clip_coordinates(iy, inp_H);
+        iz = clip_coordinates(iz, inp_D);
+    }
+    // get corner pixel values from (x, y, z)
+    // for 4d, we used north-east-south-west
+    // for 5d, we add top-bottom
+    // Naming: t/b = z and z+1, n/s = y and y+1, w/e = x and x+1.
+    int ix_tnw = static_cast<int>(::floor(ix));
+    int iy_tnw = static_cast<int>(::floor(iy));
+    int iz_tnw = static_cast<int>(::floor(iz));
+    int ix_tne = ix_tnw + 1;
+    int iy_tne = iy_tnw;
+    int iz_tne = iz_tnw;
+    int ix_tsw = ix_tnw;
+    int iy_tsw = iy_tnw + 1;
+    int iz_tsw = iz_tnw;
+    int ix_tse = ix_tnw + 1;
+    int iy_tse = iy_tnw + 1;
+    int iz_tse = iz_tnw;
+    int ix_bnw = ix_tnw;
+    int iy_bnw = iy_tnw;
+    int iz_bnw = iz_tnw + 1;
+    int ix_bne = ix_tnw + 1;
+    int iy_bne = iy_tnw;
+    int iz_bne = iz_tnw + 1;
+    int ix_bsw = ix_tnw;
+    int iy_bsw = iy_tnw + 1;
+    int iz_bsw = iz_tnw + 1;
+    int ix_bse = ix_tnw + 1;
+    int iy_bse = iy_tnw + 1;
+    int iz_bse = iz_tnw + 1;
+    // get surfaces to each neighbor:
+    // (the weight of a corner is the volume of the box spanned by the sample
+    // point and the diagonally opposite corner)
+    float tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
+    float tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
+    float tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
+    float tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
+    float bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
+    float bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
+    float bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
+    float bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
+    // accumulator starts as all-zero bytes
+    out_t result;
+    memset(&result, 0, sizeof(out_t));
+    out_t * inp_ptr_NC = (out_t*)vals;
+    out_t * out_ptr_NCDHW = &result;
+    {
+        // add each in-bounds corner scaled by its weight
+        if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
+        }
+        if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
+        }
+        if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
+        }
+        if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
+        }
+        if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
+        }
+        if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
+        }
+        if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
+        }
+        if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
+            *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
+        }
+    }
+    return result;
+}
+// Backward pass matching grid_sample_chlast_forward for a single sample.
+//   (unnamed int) : receives C from GridSamplerChlast::backward; not used here.
+//   vals          : input volume, reinterpreted as out_t* (one out_t per voxel).
+//   grad_vals     : gradient buffer for the volume, same layout as `vals`;
+//                   each corner receives weight * grad_out through safe_add_3d
+//                   (defined elsewhere in this file).
+//                   NOTE(review): presumably a bounds-checked, possibly atomic
+//                   accumulate -- confirm.
+//   pos           : sample position in [-1, 1] per axis.
+//   grad_out      : gradient arriving for the sampled value.
+//   border        : when true, coordinates go through clip_coordinates_set_grad,
+//                   which is also handed a pointer to the per-axis gradient
+//                   multiplier.
+// Returns the gradient with respect to `pos`, with each axis scaled by its
+// multiplier (initially (size - 1) / 2, the slope of the [-1, 1] -> index map).
+template<typename out_t>
+static __device__ float3 grid_sample_chlast_backward(int, int inp_D, int inp_H,
+        int inp_W, float* vals, float* grad_vals, float3 pos, out_t grad_out,
+        bool border) {
+    // element strides, in units of out_t, for the input and its gradient buffer
+    int inp_sW = 1, inp_sH = inp_W, inp_sD = inp_W * inp_H;
+    int gInp_sW = 1, gInp_sH = inp_W, gInp_sD = inp_W * inp_H;
+    // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1]
+    // (same [-100, 100] clamp of the normalized value as in the forward pass)
+    float ix = max(-100.f, min(100.f, ((pos.x + 1.f) / 2))) * (inp_W - 1);
+    float iy = max(-100.f, min(100.f, ((pos.y + 1.f) / 2))) * (inp_H - 1);
+    float iz = max(-100.f, min(100.f, ((pos.z + 1.f) / 2))) * (inp_D - 1);
+    float gix_mult = (inp_W - 1.f) / 2;
+    float giy_mult = (inp_H - 1.f) / 2;
+    float giz_mult = (inp_D - 1.f) / 2;
+    if (border) {
+        // clip coordinates to image borders
+        ix = clip_coordinates_set_grad(ix, inp_W, &gix_mult);
+        iy = clip_coordinates_set_grad(iy, inp_H, &giy_mult);
+        iz = clip_coordinates_set_grad(iz, inp_D, &giz_mult);
+    }
+    // get corner pixel values from (x, y, z)
+    // for 4d, we used north-east-south-west
+    // for 5d, we add top-bottom
+    // Naming: t/b = z and z+1, n/s = y and y+1, w/e = x and x+1.
+    int ix_tnw = static_cast<int>(::floor(ix));
+    int iy_tnw = static_cast<int>(::floor(iy));
+    int iz_tnw = static_cast<int>(::floor(iz));
+    int ix_tne = ix_tnw + 1;
+    int iy_tne = iy_tnw;
+    int iz_tne = iz_tnw;
+    int ix_tsw = ix_tnw;
+    int iy_tsw = iy_tnw + 1;
+    int iz_tsw = iz_tnw;
+    int ix_tse = ix_tnw + 1;
+    int iy_tse = iy_tnw + 1;
+    int iz_tse = iz_tnw;
+    int ix_bnw = ix_tnw;
+    int iy_bnw = iy_tnw;
+    int iz_bnw = iz_tnw + 1;
+    int ix_bne = ix_tnw + 1;
+    int iy_bne = iy_tnw;
+    int iz_bne = iz_tnw + 1;
+    int ix_bsw = ix_tnw;
+    int iy_bsw = iy_tnw + 1;
+    int iz_bsw = iz_tnw + 1;
+    int ix_bse = ix_tnw + 1;
+    int iy_bse = iy_tnw + 1;
+    int iz_bse = iz_tnw + 1;
+    // get surfaces to each neighbor:
+    // (same trilinear weights as the forward pass)
+    float tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
+    float tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
+    float tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
+    float tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
+    float bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
+    float bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
+    float bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
+    float bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
+    float gix = static_cast<float>(0), giy = static_cast<float>(0), giz = static_cast<float>(0);
+    out_t *gOut_ptr_NCDHW = &grad_out;
+    out_t *gInp_ptr_NC = (out_t*)grad_vals;
+    out_t *inp_ptr_NC = (out_t*)vals;
+    // accumulate the gradients for this one sample (nothing is written to an
+    // output pixel in the backward pass)
+    {
+      out_t gOut = *gOut_ptr_NCDHW;
+      // calculate and set grad_input
+      safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut);
+      safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut);
+      // calculate grad_grid
+      // Each in-bounds corner contributes dot(corner value, gOut) times the
+      // partial derivative of its weight with respect to ix / iy / iz; the
+      // sign (+= or -=) is the sign of that derivative.
+      if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
+        out_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW];
+        gix -= (iy_bse - iy)    * (iz_bse - iz)    * dot(tnw_val, gOut);
+        giy -= (ix_bse - ix)    * (iz_bse - iz)    * dot(tnw_val, gOut);
+        giz -= (ix_bse - ix)    * (iy_bse - iy)    * dot(tnw_val, gOut);
+      }
+      if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
+        out_t tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW];
+        gix += (iy_bsw - iy)    * (iz_bsw - iz)    * dot(tne_val, gOut);
+        giy -= (ix    - ix_bsw) * (iz_bsw - iz)    * dot(tne_val, gOut);
+        giz -= (ix    - ix_bsw) * (iy_bsw - iy)    * dot(tne_val, gOut);
+      }
+      if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
+        out_t tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW];
+        gix -= (iy - iy_bne)    * (iz_bne - iz)    * dot(tsw_val, gOut);
+        giy += (ix_bne - ix)    * (iz_bne - iz)    * dot(tsw_val, gOut);
+        giz -= (ix_bne - ix)    * (iy    - iy_bne) * dot(tsw_val, gOut);
+      }
+      if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
+        out_t tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW];
+        gix += (iy - iy_bnw)    * (iz_bnw - iz)    * dot(tse_val, gOut);
+        giy += (ix    - ix_bnw) * (iz_bnw - iz)    * dot(tse_val, gOut);
+        giz -= (ix    - ix_bnw) * (iy    - iy_bnw) * dot(tse_val, gOut);
+      }
+      if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
+        out_t bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW];
+        gix -= (iy_tse - iy)    * (iz - iz_tse)    * dot(bnw_val, gOut);
+        giy -= (ix_tse - ix)    * (iz - iz_tse)    * dot(bnw_val, gOut);
+        giz += (ix_tse - ix)    * (iy_tse - iy)    * dot(bnw_val, gOut);
+      }
+      if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
+        out_t bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW];
+        gix += (iy_tsw - iy)    * (iz - iz_tsw)    * dot(bne_val, gOut);
+        giy -= (ix    - ix_tsw) * (iz - iz_tsw)    * dot(bne_val, gOut);
+        giz += (ix    - ix_tsw) * (iy_tsw - iy)    * dot(bne_val, gOut);
+      }
+      if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
+        out_t bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW];
+        gix -= (iy - iy_tne)    * (iz - iz_tne)    * dot(bsw_val, gOut);
+        giy += (ix_tne - ix)    * (iz - iz_tne)    * dot(bsw_val, gOut);
+        giz += (ix_tne - ix)    * (iy    - iy_tne) * dot(bsw_val, gOut);
+      }
+      if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
+        out_t bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW];
+        gix += (iy - iy_tnw)    * (iz - iz_tnw)    * dot(bse_val, gOut);
+        giy += (ix    - ix_tnw) * (iz - iz_tnw)    * dot(bse_val, gOut);
+        giz += (ix    - ix_tnw) * (iy    - iy_tnw) * dot(bse_val, gOut);
+      }
+    }
+    // chain rule: index-space gradient -> normalized-coordinate gradient
+    return make_float3(gix_mult * gix, giy_mult * giy, giz_mult * giz);
+}
+// Static-member wrapper around the grid_sample_chlast_* function templates,
+// with the same forward/backward signatures as GridSampler.
+template<typename out_t>
+struct GridSamplerChlast {
+    // Forwards every argument unchanged to grid_sample_chlast_forward<out_t>.
+    static __forceinline__ __device__ out_t forward(
+            int C, int inp_D, int inp_H, int inp_W,
+            float* vals, float3 pos, bool border) {
+        return grid_sample_chlast_forward<out_t>(
+                C, inp_D, inp_H, inp_W,
+                vals, pos, border);
+    }
+
+    // Forwards every argument unchanged to grid_sample_chlast_backward<out_t>.
+    static __forceinline__ __device__ float3 backward(
+            int C, int inp_D, int inp_H, int inp_W,
+            float* vals, float* grad_vals, float3 pos, out_t grad_out, bool border) {
+        return grid_sample_chlast_backward<out_t>(
+                C, inp_D, inp_H, inp_W,
+                vals, grad_vals, pos, grad_out, border);
+    }
+};
+// Smallest of the three components of `a`.
+inline __host__ __device__ float min_component(float3 a) {
+    const float xy = fminf(a.x, a.y);
+    return fminf(xy, a.z);
+}
+// Largest of the three components of `a`.
+inline __host__ __device__ float max_component(float3 a) {
+    const float xy = fmaxf(a.x, a.y);
+    return fmaxf(xy, a.z);
+}
+// Component-wise absolute value of a float3.
+// fabsf is used explicitly: an unqualified abs() on a float can resolve to the
+// C library's int abs(int) in host code when no floating-point overload is
+// visible, which would silently truncate the component.
+inline __host__ __device__ float3 abs(float3 a) {
+    return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+}
+// Slab test of the line raypos + t * raydir against the box [p0, p1].
+// Returns true when the largest per-axis entry parameter does not exceed the
+// smallest per-axis exit parameter. The sign of t is not examined.
+__forceinline__ __device__ bool ray_aabb_hit(float3 p0, float3 p1, float3 raypos, float3 raydir) {
+    const float3 ta = (p0 - raypos) / raydir;
+    const float3 tb = (p1 - raypos) / raydir;
+    const float t_enter = max_component(fminf(ta, tb));
+    const float t_exit = min_component(fmaxf(ta, tb));
+    return t_enter <= t_exit;
+}
+// Same slab test as ray_aabb_hit, but the caller supplies `ird`, which is
+// multiplied where ray_aabb_hit divides by the direction.
+__forceinline__ __device__ bool ray_aabb_hit_ird(float3 p0, float3 p1, float3 raypos, float3 ird) {
+    const float3 ta = (p0 - raypos) * ird;
+    const float3 tb = (p1 - raypos) * ird;
+    const float t_enter = max_component(fminf(ta, tb));
+    const float t_exit = min_component(fmaxf(ta, tb));
+    return t_enter <= t_exit;
+}
+// Slab intersection of the line raypos + t / ird against the box [p0, p1],
+// reporting the interval instead of a boolean.
+//   otmin : receives the largest per-axis entry parameter.
+//   otmax : receives the smallest per-axis exit parameter.
+// The box is hit exactly when otmin <= otmax (the test ray_aabb_hit_ird makes).
+__forceinline__ __device__ void ray_aabb_hit_ird_tminmax(float3 p0, float3 p1,
+        float3 raypos, float3 ird, float &otmin, float &otmax) {
+    float3 t0 = (p0 - raypos) * ird;
+    float3 t1 = (p1 - raypos) * ird;
+    // per-axis near / far parameters, computed once
+    float3 tmin = fminf(t0,t1), tmax = fmaxf(t0,t1);
+    otmin = max_component(tmin);
+    otmax = min_component(tmax);
+}
+// Slab intersection of the line r0 + t * rd with the box [p0, p1].
+// tmin / tmax are written as the test proceeds, so on a `false` return they
+// hold whatever the axes processed so far produced. On `true` they hold the
+// entry and exit parameters over all three axes.
+inline  __device__ bool aabb_intersect(float3 p0, float3 p1, float3 r0, float3 rd, float &tmin, float &tmax) {
+    const float3 corner[2] = {p0, p1};
+    float3 inv = 1.0f/rd;
+    // per axis: index of the corner reached first (1 when travelling in -axis)
+    const int nx = (inv.x<0) ? 1 : 0;
+    const int ny = (inv.y<0) ? 1 : 0;
+    const int nz = (inv.z<0) ? 1 : 0;
+
+    // x slab
+    tmin = (corner[nx].x - r0.x) * inv.x;
+    tmax = (corner[1-nx].x - r0.x) * inv.x;
+
+    // y slab
+    const float y_near = (corner[ny].y - r0.y) * inv.y;
+    const float y_far = (corner[1-ny].y - r0.y) * inv.y;
+    if ((tmin > y_far) || (y_near > tmax)) {
+        return false;
+    }
+    tmin = (y_near > tmin) ? y_near : tmin;
+    tmax = (y_far < tmax) ? y_far : tmax;
+
+    // z slab
+    const float z_near = (corner[nz].z - r0.z) * inv.z;
+    const float z_far = (corner[1-nz].z - r0.z) * inv.z;
+    if ((tmin > z_far) || (z_near > tmax)) {
+        return false;
+    }
+    tmin = (z_near > tmin) ? z_near : tmin;
+    tmax = (z_far < tmax) ? z_far : tmax;
+    return true;
+}
+// Collects the primitives whose unit box a ray hits by walking an implicit,
+// array-laid-out binary tree: the children of node n are 2n+1 and 2n+2, and
+// every node index >= K-1 is a leaf for primitive k = node - (K-1).
+//
+// Template parameters:
+//   sortboxes   : keep `hitboxes` in ascending primitive index (insertion sort)
+//                 instead of traversal order.
+//   maxhitboxes : capacity of `hitboxes`; hits found once `num` has reached
+//                 this value are dropped without any indication.
+//   sync        : make every traversal decision warp-wide via
+//                 __any_sync(warpmask, ...).
+//   PrimTransfT : supplies forward2(), called here to obtain the ray origin and
+//                 direction (r0, rd) used for the test against primitive k.
+//
+// Parameters:
+//   nodeaabb    : two float3 per node, read at [2*node] and [2*node + 1].
+//   rtminmax    : widened in place to include [trmin, trmax] of each leaf this
+//                 thread's own ray hits.
+//   hitboxes/num: output list of primitive indices and its length.
+//   tminmax, sortedobjid, nodechildren are not read by this function.
+template<bool sortboxes, int maxhitboxes, bool sync, class PrimTransfT>
+static __forceinline__ __device__ void ray_subset_fixedbvh(
+        unsigned warpmask,
+        int K,
+        float3 raypos,
+        float3 raydir,
+        float2 tminmax,
+        float2 &rtminmax,
+        int * sortedobjid,
+        int2 * nodechildren,
+        float3 * nodeaabb,
+        const typename PrimTransfT::Data & primtransf_data,
+        int *hitboxes,
+        int & num) {
+    float3 iraydir = 1.0f/raydir;
+    // Manual traversal stack; the -1 pushed first is the sentinel that ends
+    // the loop once popped. Capacity is fixed at 64 entries and not checked.
+    int stack[64];
+    int* stack_ptr = stack;
+    *stack_ptr++ = -1;
+    int node = 0;
+    do {
+        // check if we're in a leaf
+        if (node >= (K - 1)) {
+            {
+                int k = node - (K - 1);
+                // ray as returned by forward2 for primitive k, tested against
+                // the cube [-1, 1]^3 with the slab method
+                float3 r0, rd;
+                PrimTransfT::forward2(primtransf_data, k, raypos, raydir, r0, rd);
+                float3 ird = 1.0f/rd;
+                float3 t0 = (-1.f - r0) * ird;
+                float3 t1 = (1.f - r0) * ird;
+                float3 tmin = fminf(t0,t1), tmax = fmaxf(t0,t1);
+                float trmin = max_component(tmin);
+                float trmax = min_component(tmax);
+                bool intersection = trmin <= trmax;
+                if (intersection) {
+                    // hit
+                    // (uses this thread's own result, before the warp vote)
+                    rtminmax.x = fminf(rtminmax.x, trmin);
+                    rtminmax.y = fmaxf(rtminmax.y, trmax);
+                }
+                if (sync) {
+                    // from here on, `intersection` means "hit by any thread
+                    // named in warpmask"
+                    intersection = __any_sync(warpmask, intersection);
+                }
+                if (intersection) {
+                    if (sortboxes) {
+                        if (num < maxhitboxes) {
+                            // insertion sort step: shift larger indices up by
+                            // one slot, then place k
+                            int j = num - 1;
+                            while (j >= 0 && hitboxes[j] > k) {
+                                hitboxes[j + 1] = hitboxes[j];
+                                j = j - 1;
+                            }
+                            hitboxes[j + 1] = k;
+                            num++;
+                        }
+                    } else {
+                        if (num < maxhitboxes) {
+                            hitboxes[num++] = k;
+                        }
+                    }
+                }
+            }
+            // leaf done: pop the next pending node (or the -1 sentinel)
+            node = *--stack_ptr;
+        } else {
+            int2 children = make_int2(node * 2 + 1, node * 2 + 2);
+            // check if we're in each child's bbox
+            // children.y == children.x + 1, so the right child's two corners
+            // directly follow the left child's in nodeaabb
+            float3 * nodeaabb_ptr = nodeaabb + children.x * 2;
+            bool traverse_l = ray_aabb_hit_ird(nodeaabb_ptr[0], nodeaabb_ptr[1], raypos, iraydir);
+            bool traverse_r = ray_aabb_hit_ird(nodeaabb_ptr[2], nodeaabb_ptr[3], raypos, iraydir);
+            if (sync) {
+                traverse_l = __any_sync(warpmask, traverse_l);
+                traverse_r = __any_sync(warpmask, traverse_r);
+            }
+            // update stack
+            if (!traverse_l && !traverse_r) {
+                node = *--stack_ptr;
+            } else {
+                // descend left first when both are hit, deferring the right child
+                node = traverse_l ? children.x : children.y;
+                if (traverse_l && traverse_r) {
+                    *stack_ptr++ = children.y;
+                }
+            }
+            if (sync) {
+                __syncwarp(warpmask);
+            }
+        }
+    } while (node != -1);
+}
+// Static-member wrapper around ray_subset_fixedbvh with the same template
+// arguments; forward() passes every argument through unchanged.
+template<bool sortboxes, int maxhitboxes, bool sync, class PrimTransfT>
+struct RaySubsetFixedBVH {
+    static __forceinline__ __device__ void forward(
+        unsigned warpmask,
+        int K,
+        float3 raypos,
+        float3 raydir,
+        float2 tminmax,
+        float2 &rtminmax,
+        int * sortedobjid,
+        int2 * nodechildren,
+        float3 * nodeaabb,
+        const typename PrimTransfT::Data & primtransf_data,
+        int *hitboxes,
+        int & num) {
+        ray_subset_fixedbvh<sortboxes, maxhitboxes, sync, PrimTransfT>(
+                warpmask,
+                K,
+                raypos, raydir,
+                tminmax, rtminmax,
+                sortedobjid, nodechildren, nodeaabb,
+                primtransf_data,
+                hitboxes, num);
+    }
+};
+#endif

dva/mvp/extensions/utils/helper_math.h ADDED Viewed

	@@ -0,0 +1,1453 @@

+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+#include "cuda_runtime.h"
+typedef unsigned int uint;
+typedef unsigned short ushort;
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+#ifndef __CUDACC__
+#include <math.h>
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+// Host stand-in: a when a < b, otherwise b.
+inline float fminf(float a, float b)
+{
+    if (a < b)
+    {
+        return a;
+    }
+    return b;
+}
+// Host stand-in: a when a > b, otherwise b.
+inline float fmaxf(float a, float b)
+{
+    if (a > b)
+    {
+        return a;
+    }
+    return b;
+}
+// Host stand-in: larger of two ints (b on ties).
+inline int max(int a, int b)
+{
+    if (a > b)
+    {
+        return a;
+    }
+    return b;
+}
+// Host stand-in: smaller of two ints (b on ties).
+inline int min(int a, int b)
+{
+    if (a < b)
+    {
+        return a;
+    }
+    return b;
+}
+// Host stand-in: reciprocal square root, computed as 1 / sqrtf(x).
+inline float rsqrtf(float x)
+{
+    const float root = sqrtf(x);
+    return 1.0f / root;
+}
+#endif
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+// ---- two-component constructors --------------------------------------------
+// Overloads of make_float2 / make_int2 / make_uint2 that forward to the
+// two-scalar constructors of the same name.
+
+// float2: broadcast one scalar to both components.
+inline __host__ __device__ float2 make_float2(float s)
+{
+    return make_float2(s, s);
+}
+// float2: keep x and y of a float3, dropping z.
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+    return make_float2(a.x, a.y);
+}
+// float2: component-wise conversion from int2.
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+// float2: component-wise conversion from uint2.
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+// int2: broadcast one scalar to both components.
+inline __host__ __device__ int2 make_int2(int s)
+{
+    return make_int2(s, s);
+}
+// int2: keep x and y of an int3, dropping z.
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+    return make_int2(a.x, a.y);
+}
+// int2: component-wise int(...) cast from uint2.
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+// int2: component-wise int(...) cast from float2 (the cast discards the
+// fractional part).
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+// uint2: broadcast one scalar to both components.
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+    return make_uint2(s, s);
+}
+// uint2: keep x and y of a uint3, dropping z.
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+    return make_uint2(a.x, a.y);
+}
+// uint2: component-wise uint(...) cast from int2 (negative inputs wrap, as for
+// any signed -> unsigned conversion).
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+// ---- three-component constructors ------------------------------------------
+// Overloads of make_float3 / make_int3 / make_uint3 that forward to the
+// three-scalar constructors of the same name.
+
+// float3: broadcast one scalar to all components.
+inline __host__ __device__ float3 make_float3(float s)
+{
+    return make_float3(s, s, s);
+}
+// float3: extend a float2 with z = 0.
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+// float3: extend a float2 with z = s.
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+    return make_float3(a.x, a.y, s);
+}
+// float3: keep x, y, z of a float4, dropping w.
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+    return make_float3(a.x, a.y, a.z);
+}
+// float3: component-wise conversion from int3.
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+// float3: component-wise conversion from uint3.
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+// int3: broadcast one scalar to all components.
+inline __host__ __device__ int3 make_int3(int s)
+{
+    return make_int3(s, s, s);
+}
+// int3: extend an int2 with z = 0.
+inline __host__ __device__ int3 make_int3(int2 a)
+{
+    return make_int3(a.x, a.y, 0);
+}
+// int3: extend an int2 with z = s.
+inline __host__ __device__ int3 make_int3(int2 a, int s)
+{
+    return make_int3(a.x, a.y, s);
+}
+// int3: component-wise int(...) cast from uint3.
+inline __host__ __device__ int3 make_int3(uint3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+// int3: component-wise int(...) cast from float3 (the cast discards the
+// fractional part).
+inline __host__ __device__ int3 make_int3(float3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+// uint3: broadcast one scalar to all components.
+inline __host__ __device__ uint3 make_uint3(uint s)
+{
+    return make_uint3(s, s, s);
+}
+// uint3: extend a uint2 with z = 0.
+inline __host__ __device__ uint3 make_uint3(uint2 a)
+{
+    return make_uint3(a.x, a.y, 0);
+}
+// uint3: extend a uint2 with z = s.
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
+{
+    return make_uint3(a.x, a.y, s);
+}
+// uint3: keep x, y, z of a uint4, dropping w.
+inline __host__ __device__ uint3 make_uint3(uint4 a)
+{
+    return make_uint3(a.x, a.y, a.z);
+}
+// uint3: component-wise uint(...) cast from int3 (negative inputs wrap).
+inline __host__ __device__ uint3 make_uint3(int3 a)
+{
+    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+// ---- four-component constructors -------------------------------------------
+// Overloads of make_float4 / make_int4 / make_uint4 that forward to the
+// four-scalar constructors of the same name.
+
+// float4: broadcast one scalar to all components.
+inline __host__ __device__ float4 make_float4(float s)
+{
+    return make_float4(s, s, s, s);
+}
+// float4: extend a float3 with w = 0.
+inline __host__ __device__ float4 make_float4(float3 a)
+{
+    return make_float4(a.x, a.y, a.z, 0.0f);
+}
+// float4: extend a float3 with the given w.
+inline __host__ __device__ float4 make_float4(float3 a, float w)
+{
+    return make_float4(a.x, a.y, a.z, w);
+}
+// float4: component-wise conversion from int4.
+inline __host__ __device__ float4 make_float4(int4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+// float4: component-wise conversion from uint4.
+inline __host__ __device__ float4 make_float4(uint4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+// int4: broadcast one scalar to all components.
+inline __host__ __device__ int4 make_int4(int s)
+{
+    return make_int4(s, s, s, s);
+}
+// int4: extend an int3 with w = 0.
+inline __host__ __device__ int4 make_int4(int3 a)
+{
+    return make_int4(a.x, a.y, a.z, 0);
+}
+// int4: extend an int3 with the given w.
+inline __host__ __device__ int4 make_int4(int3 a, int w)
+{
+    return make_int4(a.x, a.y, a.z, w);
+}
+// int4: component-wise int(...) cast from uint4.
+inline __host__ __device__ int4 make_int4(uint4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+// int4: component-wise int(...) cast from float4 (the cast discards the
+// fractional part).
+inline __host__ __device__ int4 make_int4(float4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+// uint4: broadcast one scalar to all components.
+inline __host__ __device__ uint4 make_uint4(uint s)
+{
+    return make_uint4(s, s, s, s);
+}
+// uint4: extend a uint3 with w = 0.
+inline __host__ __device__ uint4 make_uint4(uint3 a)
+{
+    return make_uint4(a.x, a.y, a.z, 0);
+}
+// uint4: extend a uint3 with the given w.
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
+{
+    return make_uint4(a.x, a.y, a.z, w);
+}
+// uint4: component-wise uint(...) cast from int4 (negative inputs wrap).
+inline __host__ __device__ uint4 make_uint4(int4 a)
+{
+    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+// Unary minus for the float and int vector types: negates every component.
+// The operand is taken by const reference so that temporaries and const
+// values (for example -(a + b)) can be negated as well as named variables;
+// a non-const reference cannot bind to either.
+inline __host__ __device__ float2 operator-(const float2 &a)
+{
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(const int2 &a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(const float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(const int3 &a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(const float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(const int4 &a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+// ---- addition, two-component types -----------------------------------------
+// For each of float2 / int2 / uint2: vector + vector, vector += vector,
+// vector + scalar, scalar + vector and vector += scalar. Scalar operands are
+// added to every component.
+
+// float2
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+    return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+}
+// int2
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+    return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+}
+// uint2
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+    return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+}
+// ---- addition, three-component types ---------------------------------------
+// For each of float3 / int3 / uint3: vector + vector, vector += vector,
+// vector + scalar and vector += scalar. The scalar + vector forms for all
+// three types are grouped at the end of this section.
+
+// float3
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+// int3
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+// uint3
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+// scalar + vector (scalar on the left) for int3, uint3 and float3
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+// ---- addition, four-component types ----------------------------------------
+// For each of float4 / int4 / uint4: vector + vector, vector += vector,
+// vector + scalar, scalar + vector and vector += scalar. Scalar operands are
+// added to every component.
+
+// float4
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+// int4
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+// uint4
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+// ---- subtraction, two-component types --------------------------------------
+// For each of float2 / int2 / uint2: vector - vector, vector -= vector,
+// vector - scalar, scalar - vector and vector -= scalar. In the
+// scalar - vector form the scalar is the minuend of every component (b - a.x).
+
+// float2
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+    return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)
+{
+    return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)
+{
+    return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+// int2
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+    return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+    return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+    return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+// uint2 (unsigned arithmetic: a result below zero wraps around)
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+    return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+    return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+    return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+    return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ float3 operator-(float b, float3 a)
+{
+    return make_float3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+// Component-wise subtraction for int3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    return a;
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+    a = make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    return a;
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+    a.x = b - a.x;
+    a.y = b - a.y;
+    a.z = b - a.z;
+    return a;
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+    a = make_int3(a.x - b, a.y - b, a.z - b);
+}
+// Component-wise subtraction for uint3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    return a;
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+    a = make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    return a;
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+    a.x = b - a.x;
+    a.y = b - a.y;
+    a.z = b - a.z;
+    return a;
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+    a = make_uint3(a.x - b, a.y - b, a.z - b);
+}
+// Component-wise subtraction for float4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ float4 operator-(float4 a, float b)
+{
+    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+// Scalar-minus-vector form. The float2, float3, int4 and uint4 families in
+// this header all provide it; float4 was the only one without it.
+inline __host__ __device__ float4 operator-(float b, float4 a)
+{
+    return make_float4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+// Component-wise subtraction for int4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+    return a;
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+    a = make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+    return a;
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+    a.x = b - a.x;
+    a.y = b - a.y;
+    a.z = b - a.z;
+    a.w = b - a.w;
+    return a;
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+    a = make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+// Component-wise subtraction for uint4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+    return a;
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+    a = make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+    return a;
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+    a.x = b - a.x;
+    a.y = b - a.y;
+    a.z = b - a.z;
+    a.w = b - a.w;
+    return a;
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+    a = make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise multiplication for float2: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(float2 &a, float2 b)
+{
+    a = make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ float2 operator*(float2 a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+inline __host__ __device__ float2 operator*(float b, float2 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(float2 &a, float b)
+{
+    a = make_float2(a.x * b, a.y * b);
+}
+// Component-wise multiplication for int2: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+    a = make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+    a = make_int2(a.x * b, a.y * b);
+}
+// Component-wise multiplication for uint2: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+    a = make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+    a = make_uint2(a.x * b, a.y * b);
+}
+// Component-wise multiplication for float3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(float3 &a, float3 b)
+{
+    a = make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ float3 operator*(float3 a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+inline __host__ __device__ float3 operator*(float b, float3 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(float3 &a, float b)
+{
+    a = make_float3(a.x * b, a.y * b, a.z * b);
+}
+// Component-wise multiplication for int3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+    a = make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+    a = make_int3(a.x * b, a.y * b, a.z * b);
+}
+// Component-wise multiplication for uint3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+    a = make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+    a = make_uint3(a.x * b, a.y * b, a.z * b);
+}
+// Component-wise multiplication for float4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(float4 &a, float4 b)
+{
+    a = make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ float4 operator*(float4 a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+inline __host__ __device__ float4 operator*(float b, float4 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    a.w = b * a.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(float4 &a, float b)
+{
+    a = make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+// Component-wise multiplication for int4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+    a = make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    a.w = b * a.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+    a = make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+// Component-wise multiplication for uint4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants.
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+    a = make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+    a.x = b * a.x;
+    a.y = b * a.y;
+    a.z = b * a.z;
+    a.w = b * a.w;
+    return a;
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+    a = make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise division for float2: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants. No zero check is made.
+inline __host__ __device__ float2 operator/(float2 a, float2 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    return a;
+}
+inline __host__ __device__ void operator/=(float2 &a, float2 b)
+{
+    a = make_float2(a.x / b.x, a.y / b.y);
+}
+inline __host__ __device__ float2 operator/(float2 a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    return a;
+}
+inline __host__ __device__ void operator/=(float2 &a, float b)
+{
+    a = make_float2(a.x / b, a.y / b);
+}
+inline __host__ __device__ float2 operator/(float b, float2 a)
+{
+    a.x = b / a.x;
+    a.y = b / a.y;
+    return a;
+}
+// Component-wise division for float3: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants. No zero check is made.
+inline __host__ __device__ float3 operator/(float3 a, float3 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+    return a;
+}
+inline __host__ __device__ void operator/=(float3 &a, float3 b)
+{
+    a = make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ float3 operator/(float3 a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    return a;
+}
+inline __host__ __device__ void operator/=(float3 &a, float b)
+{
+    a = make_float3(a.x / b, a.y / b, a.z / b);
+}
+inline __host__ __device__ float3 operator/(float b, float3 a)
+{
+    a.x = b / a.x;
+    a.y = b / a.y;
+    a.z = b / a.z;
+    return a;
+}
+// Component-wise division for float4: vector-vector, vector-scalar and
+// scalar-vector forms, plus the in-place variants. No zero check is made.
+inline __host__ __device__ float4 operator/(float4 a, float4 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+    a.w /= b.w;
+    return a;
+}
+inline __host__ __device__ void operator/=(float4 &a, float4 b)
+{
+    a = make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+inline __host__ __device__ float4 operator/(float4 a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    a.w /= b;
+    return a;
+}
+inline __host__ __device__ void operator/=(float4 &a, float b)
+{
+    a = make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
+}
+inline __host__ __device__ float4 operator/(float b, float4 a)
+{
+    a.x = b / a.x;
+    a.y = b / a.y;
+    a.z = b / a.z;
+    a.w = b / a.w;
+    return a;
+}
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise minimum. Float vectors go through the scalar fminf, integer
+// vectors through the scalar min.
+inline __host__ __device__ float2 fminf(float2 a, float2 b)
+{
+    a.x = fminf(a.x, b.x);
+    a.y = fminf(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ float3 fminf(float3 a, float3 b)
+{
+    a.x = fminf(a.x, b.x);
+    a.y = fminf(a.y, b.y);
+    a.z = fminf(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ float4 fminf(float4 a, float4 b)
+{
+    a.x = fminf(a.x, b.x);
+    a.y = fminf(a.y, b.y);
+    a.z = fminf(a.z, b.z);
+    a.w = fminf(a.w, b.w);
+    return a;
+}
+inline __host__ __device__ int2 min(int2 a, int2 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ int3 min(int3 a, int3 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    a.z = min(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ int4 min(int4 a, int4 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    a.z = min(a.z, b.z);
+    a.w = min(a.w, b.w);
+    return a;
+}
+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    a.z = min(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
+{
+    a.x = min(a.x, b.x);
+    a.y = min(a.y, b.y);
+    a.z = min(a.z, b.z);
+    a.w = min(a.w, b.w);
+    return a;
+}
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise maximum. Float vectors go through the scalar fmaxf, integer
+// vectors through the scalar max.
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
+{
+    a.x = fmaxf(a.x, b.x);
+    a.y = fmaxf(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+    a.x = fmaxf(a.x, b.x);
+    a.y = fmaxf(a.y, b.y);
+    a.z = fmaxf(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
+{
+    a.x = fmaxf(a.x, b.x);
+    a.y = fmaxf(a.y, b.y);
+    a.z = fmaxf(a.z, b.z);
+    a.w = fmaxf(a.w, b.w);
+    return a;
+}
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    a.z = max(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    a.z = max(a.z, b.z);
+    a.w = max(a.w, b.w);
+    return a;
+}
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    a.z = max(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+    a.x = max(a.x, b.x);
+    a.y = max(a.y, b.y);
+    a.z = max(a.z, b.z);
+    a.w = max(a.w, b.w);
+    return a;
+}
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+// Linear interpolation: returns a at t == 0 and b at t == 1. t is not clamped,
+// so values outside [0, 1] extrapolate.
+inline __device__ __host__ float lerp(float a, float b, float t)
+{
+    const float delta = b - a;
+    return a + t*delta;
+}
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+    const float2 delta = b - a;
+    return a + t*delta;
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
+{
+    const float3 delta = b - a;
+    return a + t*delta;
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
+{
+    const float4 delta = b - a;
+    return a + t*delta;
+}
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+// Scalar clamps: cap the value at b first, then raise it to at least a.
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    const float capped = fminf(f, b);
+    return fmaxf(a, capped);
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+    const int capped = min(f, b);
+    return max(a, capped);
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+    const uint capped = min(f, b);
+    return max(a, capped);
+}
+// Vector clamps: each component is clamped independently, either against
+// shared scalar bounds or against the matching component of vector bounds.
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    return v;
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    return v;
+}
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    return v;
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    return v;
+}
+inline __device__ __host__ float4 clamp(float4 v, float a, float b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    v.w = clamp(v.w, a, b);
+    return v;
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    v.w = clamp(v.w, a.w, b.w);
+    return v;
+}
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    return v;
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    return v;
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    return v;
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    return v;
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    v.w = clamp(v.w, a, b);
+    return v;
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    v.w = clamp(v.w, a.w, b.w);
+    return v;
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    return v;
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    return v;
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    return v;
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    return v;
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+    v.x = clamp(v.x, a, b);
+    v.y = clamp(v.y, a, b);
+    v.z = clamp(v.z, a, b);
+    v.w = clamp(v.w, a, b);
+    return v;
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+    v.x = clamp(v.x, a.x, b.x);
+    v.y = clamp(v.y, a.y, b.y);
+    v.z = clamp(v.z, a.z, b.z);
+    v.w = clamp(v.w, a.w, b.w);
+    return v;
+}
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+// Dot product: sum of the component-wise products, accumulated left to right
+// in a single expression. The result has the vectors' element type.
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+    return (a.x * b.x) + (a.y * b.y);
+}
+inline __host__ __device__ float dot(float3 a, float3 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z);
+}
+inline __host__ __device__ float dot(float4 a, float4 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w);
+}
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+    return (a.x * b.x) + (a.y * b.y);
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z);
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w);
+}
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+    return (a.x * b.x) + (a.y * b.y);
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z);
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+    return (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w);
+}
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+// Euclidean length: square root of the vector's dot product with itself.
+inline __host__ __device__ float length(float2 v)
+{
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+inline __host__ __device__ float length(float3 v)
+{
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+inline __host__ __device__ float length(float4 v)
+{
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+// Scales v by the reciprocal square root of dot(v, v). There is no guard for
+// a zero-length input.
+inline __host__ __device__ float2 normalize(float2 v)
+{
+    return v * rsqrtf(dot(v, v));
+}
+inline __host__ __device__ float3 normalize(float3 v)
+{
+    return v * rsqrtf(dot(v, v));
+}
+inline __host__ __device__ float4 normalize(float4 v)
+{
+    return v * rsqrtf(dot(v, v));
+}
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise floor, applying the scalar floorf to each element.
+inline __host__ __device__ float2 floorf(float2 v)
+{
+    v.x = floorf(v.x);
+    v.y = floorf(v.y);
+    return v;
+}
+inline __host__ __device__ float3 floorf(float3 v)
+{
+    v.x = floorf(v.x);
+    v.y = floorf(v.y);
+    v.z = floorf(v.z);
+    return v;
+}
+inline __host__ __device__ float4 floorf(float4 v)
+{
+    v.x = floorf(v.x);
+    v.y = floorf(v.y);
+    v.z = floorf(v.z);
+    v.w = floorf(v.w);
+    return v;
+}
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+// Fractional part, computed as v - floorf(v); the vector overloads apply the
+// scalar version to each component.
+inline __host__ __device__ float fracf(float v)
+{
+    const float whole = floorf(v);
+    return v - whole;
+}
+inline __host__ __device__ float2 fracf(float2 v)
+{
+    v.x = fracf(v.x);
+    v.y = fracf(v.y);
+    return v;
+}
+inline __host__ __device__ float3 fracf(float3 v)
+{
+    v.x = fracf(v.x);
+    v.y = fracf(v.y);
+    v.z = fracf(v.z);
+    return v;
+}
+inline __host__ __device__ float4 fracf(float4 v)
+{
+    v.x = fracf(v.x);
+    v.y = fracf(v.y);
+    v.z = fracf(v.z);
+    v.w = fracf(v.w);
+    return v;
+}
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise floating-point remainder, applying the scalar fmodf to each
+// pair of matching elements.
+inline __host__ __device__ float2 fmodf(float2 a, float2 b)
+{
+    a.x = fmodf(a.x, b.x);
+    a.y = fmodf(a.y, b.y);
+    return a;
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b)
+{
+    a.x = fmodf(a.x, b.x);
+    a.y = fmodf(a.y, b.y);
+    a.z = fmodf(a.z, b.z);
+    return a;
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b)
+{
+    a.x = fmodf(a.x, b.x);
+    a.y = fmodf(a.y, b.y);
+    a.z = fmodf(a.z, b.z);
+    a.w = fmodf(a.w, b.w);
+    return a;
+}
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+// Component-wise absolute value: fabs for float vectors, abs for int vectors.
+inline __host__ __device__ float2 fabs(float2 v)
+{
+    v.x = fabs(v.x);
+    v.y = fabs(v.y);
+    return v;
+}
+inline __host__ __device__ float3 fabs(float3 v)
+{
+    v.x = fabs(v.x);
+    v.y = fabs(v.y);
+    v.z = fabs(v.z);
+    return v;
+}
+inline __host__ __device__ float4 fabs(float4 v)
+{
+    v.x = fabs(v.x);
+    v.y = fabs(v.y);
+    v.z = fabs(v.z);
+    v.w = fabs(v.w);
+    return v;
+}
+inline __host__ __device__ int2 abs(int2 v)
+{
+    v.x = abs(v.x);
+    v.y = abs(v.y);
+    return v;
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+    v.x = abs(v.x);
+    v.y = abs(v.y);
+    v.z = abs(v.z);
+    return v;
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+    v.x = abs(v.x);
+    v.y = abs(v.y);
+    v.z = abs(v.z);
+    v.w = abs(v.w);
+    return v;
+}
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+// Reflects i about n by subtracting twice its projection onto n.
+inline __host__ __device__ float3 reflect(float3 i, float3 n)
+{
+    const float projection = dot(n, i);
+    return i - (2.0f * n) * projection;
+}
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+// Cross product a x b, with each output component computed separately.
+inline __host__ __device__ float3 cross(float3 a, float3 b)
+{
+    const float cx = a.y*b.z - a.z*b.y;
+    const float cy = a.z*b.x - a.x*b.z;
+    const float cz = a.x*b.y - a.y*b.x;
+    return make_float3(cx, cy, cz);
+}
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+// Cubic Hermite step: x is rescaled from [a, b] to [0, 1], clamped, and then
+// passed through t*t*(3 - 2*t). There is no guard for a == b.
+inline __device__ __host__ float smoothstep(float a, float b, float x)
+{
+    const float t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (3.0f - (2.0f * t));
+}
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
+{
+    const float2 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float2(3.0f) - (make_float2(2.0f) * t));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
+{
+    const float3 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float3(3.0f) - (make_float3(2.0f) * t));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
+{
+    const float4 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float4(3.0f) - (make_float4(2.0f) * t));
+}
+#endif

dva/mvp/extensions/utils/makefile ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ all:
2	+ python setup.py build_ext --inplace

dva/mvp/extensions/utils/setup.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+if __name__ == "__main__":
+    import torch
+    setup(
+        name="utils",
+        ext_modules=[
+            CUDAExtension(
+                "utilslib",
+                sources=["utils.cpp", "utils_kernel.cu"],
+                extra_compile_args={
+                    "nvcc": [
+                        "-arch=sm_70",
+                        "-std=c++14",
+                        "-lineinfo",
+                    ]
+                }
+            )
+        ],
+        cmdclass={"build_ext": BuildExtension}
+    )

dva/mvp/extensions/utils/utils.cpp ADDED Viewed

	@@ -0,0 +1,137 @@

+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
+#include <vector>
+void compute_raydirs_forward_cuda(  // forward launcher; defined outside this file (presumably utils_kernel.cu — confirm)
+        int N, int H, int W,        // the wrapper in this file passes viewposim.size(0) as N, plus the requested H and W
+        float * viewposim,
+        float * viewrotim,
+        float * focalim,
+        float * princptim,
+        float * pixelcoordsim,      // nullptr when the caller supplies no pixel-coordinate tensor
+        float volradius,
+        float * raypos,             // raypos/raydir/tminmax: caller-allocated buffers, presumably written by the kernel — confirm
+        float * raydir,
+        float * tminmax,
+        cudaStream_t stream);
+void compute_raydirs_backward_cuda(  // backward launcher; same leading arguments as the forward launcher
+        int N, int H, int W,
+        float * viewposim,
+        float * viewrotim,
+        float * focalim,
+        float * princptim,
+        float * pixelcoordsim,      // nullptr when the caller supplies no pixel-coordinate tensor
+        float volradius,
+        float * raypos,
+        float * raydir,
+        float * tminmax,
+        float * grad_viewposim,     // grad_*: caller-provided gradient buffers, presumably outputs — confirm against the kernel
+        float * grad_viewrotim,
+        float * grad_focalim,
+        float * grad_princptim,
+        cudaStream_t stream);
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA((x)); CHECK_CONTIGUOUS((x))
+// Host wrapper for the forward ray-direction kernel.
+//
+// Checks that every tensor is a contiguous CUDA tensor, then passes the raw
+// float pointers to compute_raydirs_forward_cuda. rayposim, raydirim and
+// tminmaxim are caller-allocated buffers handed to the kernel. pixelcoordsim
+// is optional; when present its dims 1 and 2 must equal H and W, and when
+// absent the kernel receives nullptr.
+//
+// Always returns an empty vector. Throws c10::Error on a failed check or on a
+// tensor whose dtype is not float32 (data_ptr<float>() enforces the dtype).
+std::vector<torch::Tensor> compute_raydirs_forward(
+        torch::Tensor viewposim,
+        torch::Tensor viewrotim,
+        torch::Tensor focalim,
+        torch::Tensor princptim,
+        torch::optional<torch::Tensor> pixelcoordsim,
+        int W, int H,
+        float volradius,
+        torch::Tensor rayposim,
+        torch::Tensor raydirim,
+        torch::Tensor tminmaxim) {
+    CHECK_INPUT(viewposim);
+    CHECK_INPUT(viewrotim);
+    CHECK_INPUT(focalim);
+    CHECK_INPUT(princptim);
+    if (pixelcoordsim) { CHECK_INPUT(*pixelcoordsim); }
+    CHECK_INPUT(rayposim);
+    CHECK_INPUT(raydirim);
+    CHECK_INPUT(tminmaxim);
+    int N = viewposim.size(0);
+    // The previous assert() called .size() on the optional itself, which is
+    // ill-formed and only built because NDEBUG removed it; check for real.
+    TORCH_CHECK(!pixelcoordsim ||
+            (pixelcoordsim->size(1) == H && pixelcoordsim->size(2) == W),
+            "pixelcoordsim dims 1 and 2 must equal H and W");
+    // Launch on PyTorch's current stream so the kernel is ordered with the
+    // surrounding tensor operations, also under a non-default stream.
+    compute_raydirs_forward_cuda(N, H, W,
+            viewposim.data_ptr<float>(),
+            viewrotim.data_ptr<float>(),
+            focalim.data_ptr<float>(),
+            princptim.data_ptr<float>(),
+            pixelcoordsim ? pixelcoordsim->data_ptr<float>() : nullptr,
+            volradius,
+            rayposim.data_ptr<float>(),
+            raydirim.data_ptr<float>(),
+            tminmaxim.data_ptr<float>(),
+            c10::cuda::getCurrentCUDAStream());
+    return {};
+}
+// Host wrapper for the backward ray-direction kernel.
+//
+// Checks that every tensor, including the four gradient buffers, is a
+// contiguous CUDA tensor, then passes the raw float pointers to
+// compute_raydirs_backward_cuda. pixelcoordsim is optional; when present its
+// dims 1 and 2 must equal H and W, and when absent the kernel receives
+// nullptr.
+//
+// Always returns an empty vector. Throws c10::Error on a failed check or on a
+// tensor whose dtype is not float32 (data_ptr<float>() enforces the dtype).
+std::vector<torch::Tensor> compute_raydirs_backward(
+        torch::Tensor viewposim,
+        torch::Tensor viewrotim,
+        torch::Tensor focalim,
+        torch::Tensor princptim,
+        torch::optional<torch::Tensor> pixelcoordsim,
+        int W, int H,
+        float volradius,
+        torch::Tensor rayposim,
+        torch::Tensor raydirim,
+        torch::Tensor tminmaxim,
+        torch::Tensor grad_viewpos,
+        torch::Tensor grad_viewrot,
+        torch::Tensor grad_focal,
+        torch::Tensor grad_princpt) {
+    CHECK_INPUT(viewposim);
+    CHECK_INPUT(viewrotim);
+    CHECK_INPUT(focalim);
+    CHECK_INPUT(princptim);
+    if (pixelcoordsim) { CHECK_INPUT(*pixelcoordsim); }
+    CHECK_INPUT(rayposim);
+    CHECK_INPUT(raydirim);
+    CHECK_INPUT(tminmaxim);
+    CHECK_INPUT(grad_viewpos);
+    CHECK_INPUT(grad_viewrot);
+    CHECK_INPUT(grad_focal);
+    CHECK_INPUT(grad_princpt);
+    int N = viewposim.size(0);
+    // The previous assert() called .size() on the optional itself, which is
+    // ill-formed and only built because NDEBUG removed it; check for real.
+    TORCH_CHECK(!pixelcoordsim ||
+            (pixelcoordsim->size(1) == H && pixelcoordsim->size(2) == W),
+            "pixelcoordsim dims 1 and 2 must equal H and W");
+    // Launch on PyTorch's current stream so the kernel is ordered with the
+    // surrounding tensor operations, also under a non-default stream.
+    compute_raydirs_backward_cuda(N, H, W,
+            viewposim.data_ptr<float>(),
+            viewrotim.data_ptr<float>(),
+            focalim.data_ptr<float>(),
+            princptim.data_ptr<float>(),
+            pixelcoordsim ? pixelcoordsim->data_ptr<float>() : nullptr,
+            volradius,
+            rayposim.data_ptr<float>(),
+            raydirim.data_ptr<float>(),
+            tminmaxim.data_ptr<float>(),
+            grad_viewpos.data_ptr<float>(),
+            grad_viewrot.data_ptr<float>(),
+            grad_focal.data_ptr<float>(),
+            grad_princpt.data_ptr<float>(),
+            c10::cuda::getCurrentCUDAStream());
+    return {};
+}
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings: exposes the two host wrappers defined in this file
+    m.def("compute_raydirs_forward",  &compute_raydirs_forward,  "raydirs forward (CUDA)");
+    m.def("compute_raydirs_backward", &compute_raydirs_backward, "raydirs backward (CUDA)");
+}

dva/mvp/extensions/utils/utils.py ADDED Viewed

	@@ -0,0 +1,211 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+import torch.nn.functional as F
+try:
+    from . import utilslib
+except:
+    import utilslib
+class ComputeRaydirs(Function):
+    @staticmethod
+    def forward(self, viewpos, viewrot, focal, princpt, pixelcoords, volradius):
+        for tensor in [viewpos, viewrot, focal, princpt, pixelcoords]:
+            assert tensor.is_contiguous()
+        N = viewpos.size(0)
+        if isinstance(pixelcoords, tuple):
+            W, H = pixelcoords
+            pixelcoords = None
+        else:
+            H = pixelcoords.size(1)
+            W = pixelcoords.size(2)
+        raypos = torch.empty((N, H, W, 3), device=viewpos.device)
+        raydirs = torch.empty((N, H, W, 3), device=viewpos.device)
+        tminmax = torch.empty((N, H, W, 2), device=viewpos.device)
+        utilslib.compute_raydirs_forward(viewpos, viewrot, focal, princpt,
+                pixelcoords, W, H, volradius, raypos, raydirs, tminmax)
+        return raypos, raydirs, tminmax
+    @staticmethod
+    def backward(self, grad_raydirs, grad_tminmax):
+        return None, None, None, None, None, None
+def compute_raydirs(viewpos, viewrot, focal, princpt, pixelcoords, volradius):
+    raypos, raydirs, tminmax = ComputeRaydirs.apply(viewpos, viewrot, focal, princpt, pixelcoords, volradius)
+    return raypos, raydirs, tminmax
+class Rodrigues(nn.Module):
+    def __init__(self):
+        super(Rodrigues, self).__init__()
+    def forward(self, rvec):
+        theta = torch.sqrt(1e-5 + torch.sum(rvec ** 2, dim=1))
+        rvec = rvec / theta[:, None]
+        costh = torch.cos(theta)
+        sinth = torch.sin(theta)
+        return torch.stack((
+            rvec[:, 0] ** 2 + (1. - rvec[:, 0] ** 2) * costh,
+            rvec[:, 0] * rvec[:, 1] * (1. - costh) - rvec[:, 2] * sinth,
+            rvec[:, 0] * rvec[:, 2] * (1. - costh) + rvec[:, 1] * sinth,
+            rvec[:, 0] * rvec[:, 1] * (1. - costh) + rvec[:, 2] * sinth,
+            rvec[:, 1] ** 2 + (1. - rvec[:, 1] ** 2) * costh,
+            rvec[:, 1] * rvec[:, 2] * (1. - costh) - rvec[:, 0] * sinth,
+            rvec[:, 0] * rvec[:, 2] * (1. - costh) - rvec[:, 1] * sinth,
+            rvec[:, 1] * rvec[:, 2] * (1. - costh) + rvec[:, 0] * sinth,
+            rvec[:, 2] ** 2 + (1. - rvec[:, 2] ** 2) * costh), dim=1).view(-1, 3, 3)
+def gradcheck():
+    N = 2
+    H = 64
+    W = 64
+    k3 = 4
+    K = k3*k3*k3
+    M = 32
+    volradius = 1.
+    # generate random inputs
+    torch.manual_seed(1113)
+    rodrigues = Rodrigues()
+    _viewpos = torch.tensor([[-0.0, 0.0, -4.] for n in range(N)], device="cuda") + torch.randn(N, 3, device="cuda") * 0.1
+    viewrvec = torch.randn(N, 3, device="cuda") * 0.01
+    _viewrot = rodrigues(viewrvec)
+    _focal = torch.tensor([[W*4.0, W*4.0] for n in range(N)], device="cuda")
+    _princpt = torch.tensor([[W*0.5, H*0.5] for n in range(N)], device="cuda")
+    pixely, pixelx = torch.meshgrid(torch.arange(H, device="cuda").float(), torch.arange(W, device="cuda").float())
+    _pixelcoords = torch.stack([pixelx, pixely], dim=-1)[None, :, :, :].repeat(N, 1, 1, 1)
+    _viewpos = _viewpos.contiguous().detach().clone()
+    _viewpos.requires_grad = True
+    _viewrot = _viewrot.contiguous().detach().clone()
+    _viewrot.requires_grad = True
+    _focal = _focal.contiguous().detach().clone()
+    _focal.requires_grad = True
+    _princpt = _princpt.contiguous().detach().clone()
+    _princpt.requires_grad = True
+    _pixelcoords = _pixelcoords.contiguous().detach().clone()
+    _pixelcoords.requires_grad = True
+    max_len = 6.0
+    _stepsize = max_len / 15.5
+    params = [_viewpos, _viewrot, _focal, _princpt]
+    paramnames = ["viewpos", "viewrot", "focal", "princpt"]
+    ########################### run pytorch version ###########################
+    viewpos = _viewpos
+    viewrot = _viewrot
+    focal = _focal
+    princpt = _princpt
+    pixelcoords = _pixelcoords
+    raypos = viewpos[:, None, None, :].repeat(1, H, W, 1)
+    raydir = (pixelcoords - princpt[:, None, None, :]) / focal[:, None, None, :]
+    raydir = torch.cat([raydir, torch.ones_like(raydir[:, :, :, 0:1])], dim=-1)
+    raydir = torch.sum(viewrot[:, None, None, :, :] * raydir[:, :, :, :, None], dim=-2)
+    raydir = raydir / torch.sqrt(torch.sum(raydir ** 2, dim=-1, keepdim=True))
+    t1 = (-1. - viewpos[:, None, None, :]) / raydir
+    t2 = ( 1. - viewpos[:, None, None, :]) / raydir
+    tmin = torch.max(torch.min(t1[..., 0], t2[..., 0]),
+           torch.max(torch.min(t1[..., 1], t2[..., 1]),
+                     torch.min(t1[..., 2], t2[..., 2]))).clamp(min=0.)
+    tmax = torch.min(torch.max(t1[..., 0], t2[..., 0]),
+           torch.min(torch.max(t1[..., 1], t2[..., 1]),
+                     torch.max(t1[..., 2], t2[..., 2])))
+    tminmax = torch.stack([tmin, tmax], dim=-1)
+    sample0 = raydir
+    torch.cuda.synchronize()
+    time1 = time.time()
+    sample0.backward(torch.ones_like(sample0))
+    torch.cuda.synchronize()
+    time2 = time.time()
+    grads0 = [p.grad.detach().clone() if p.grad is not None else None for p in params]
+    for p in params:
+        if p.grad is not None:
+            p.grad.detach_()
+            p.grad.zero_()
+    ############################## run cuda version ###########################
+    viewpos = _viewpos
+    viewrot = _viewrot
+    focal = _focal
+    princpt = _princpt
+    pixelcoords = _pixelcoords
+    niter = 1
+    for p in params:
+        if p.grad is not None:
+            p.grad.detach_()
+            p.grad.zero_()
+    t0 = time.time()
+    torch.cuda.synchronize()
+    sample1 = compute_raydirs(viewpos, viewrot, focal, princpt, pixelcoords, volradius)[1]
+    t1 = time.time()
+    torch.cuda.synchronize()
+    print("-----------------------------------------------------------------")
+    print("{:>10} {:>10} {:>10} {:>10} {:>10} {:>10}".format("", "maxabsdiff", "dp", "index", "py", "cuda"))
+    ind = torch.argmax(torch.abs(sample0 - sample1))
+    print("{:<10} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
+        "fwd",
+        torch.max(torch.abs(sample0 - sample1)).item(),
+        (torch.sum(sample0 * sample1) / torch.sqrt(torch.sum(sample0 * sample0) * torch.sum(sample1 * sample1))).item(),
+        ind.item(),
+        sample0.view(-1)[ind].item(),
+        sample1.view(-1)[ind].item()))
+    sample1.backward(torch.ones_like(sample1), retain_graph=True)
+    torch.cuda.synchronize()
+    t2 = time.time()
+    print("{:<10} {:10.5} {:10.5} {:10.5}".format("time", tf / niter, tb / niter, (tf + tb) / niter))
+    grads1 = [p.grad.detach().clone() if p.grad is not None else None for p in params]
+    ############# compare results #############
+    for p, g0, g1 in zip(paramnames, grads0, grads1):
+        ind = torch.argmax(torch.abs(g0 - g1))
+        print("{:<10} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
+                p,
+                torch.max(torch.abs(g0 - g1)).item(),
+                (torch.sum(g0 * g1) / torch.sqrt(torch.sum(g0 * g0) * torch.sum(g1 * g1))).item(),
+                ind.item(),
+                g0.view(-1)[ind].item(),
+                g1.view(-1)[ind].item()))
+if __name__ == "__main__":
+    gradcheck()