diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/.zgroup b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/.zgroup
new file mode 100644
index 0000000000000000000000000000000000000000..3b7daf227c1687f28bc23b69f183e27ce9a475c1
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/.zgroup	
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/.zgroup b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/.zgroup
new file mode 100644
index 0000000000000000000000000000000000000000..3b7daf227c1687f28bc23b69f183e27ce9a475c1
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/.zgroup	
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/1.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..a13fd0c98da93a0ed75adf81c50a14a4e3e3ff76
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/1.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/12.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/12.0
new file mode 100644
index 0000000000000000000000000000000000000000..3f11f3badc34ab4687eb6237d716d766619c0137
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/12.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/15.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/15.0
new file mode 100644
index 0000000000000000000000000000000000000000..45097370bdc51863cb9eed4e0b96f8af5716b5a7
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/15.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/16.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/16.0
new file mode 100644
index 0000000000000000000000000000000000000000..dec6b43d6b882a1eb058badfd711e308da67e6af
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/16.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/19.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/19.0
new file mode 100644
index 0000000000000000000000000000000000000000..485ecf836383ef7fafa59b5b9c125e15b7e0c360
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/19.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/2.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..b51b93f0aeda1f918e86e2fc59785c5bc3942983
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/2.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/4.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/4.0
new file mode 100644
index 0000000000000000000000000000000000000000..9de28b9840d56ca35e0437b127c9b27af04fe54f
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/4.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/8.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/8.0
new file mode 100644
index 0000000000000000000000000000000000000000..256f46821b5c3a4ffeae50c12d5c088c89d84ccb
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/data/full_state/8.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/.zgroup b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/.zgroup
new file mode 100644
index 0000000000000000000000000000000000000000..3b7daf227c1687f28bc23b69f183e27ce9a475c1
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/.zgroup	
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/.zarray b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..60914cf77bdc3cc1ad411eea64d76bf1f23dfa50
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/.zarray	
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        10
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": 0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/0
new file mode 100644
index 0000000000000000000000000000000000000000..e242873752168b238a2521f6d5d2b29f1a9edc81
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_door-unlock_expert.zarr/meta/episode_ends/0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/15.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/15.0
new file mode 100644
index 0000000000000000000000000000000000000000..16ae6f9e29e6bb0e0dddfa90d0462de4654841a3
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/15.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/16.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/16.0
new file mode 100644
index 0000000000000000000000000000000000000000..7f792bd75bb8824df221c7efe6bf95accf1c8aaf
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/16.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/18.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/18.0
new file mode 100644
index 0000000000000000000000000000000000000000..434d7b4906e98c404d2ca2d3c96292ef75d2cd3a
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/18.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/2.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..57d301f016dfafc2103f65987ca8d06ec9912c9d
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/2.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/7.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/7.0
new file mode 100644
index 0000000000000000000000000000000000000000..a0683122ae4a472ac2a0a5065f0911f08de35dd8
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/7.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/8.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/8.0
new file mode 100644
index 0000000000000000000000000000000000000000..e3eca0d6edafd93d4326002fcdcf48f9eb01829e
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/8.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/9.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/9.0
new file mode 100644
index 0000000000000000000000000000000000000000..3bbe79239ef25449651829d32f8aa770dc3cfc3f
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/action/9.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/.zarray b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..eaf649916e85393eb516477b73da093067487d61
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/.zarray	
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        100,
+        39
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<f4",
+    "fill_value": 0.0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        39
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/13.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/13.0
new file mode 100644
index 0000000000000000000000000000000000000000..0a88fd24b3ac891ac26c891bfffb10cb74261016
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/13.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/17.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/17.0
new file mode 100644
index 0000000000000000000000000000000000000000..7fbfd1203c3cba561fc5329c122e009cb3be3944
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/17.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/6.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/6.0
new file mode 100644
index 0000000000000000000000000000000000000000..4f8da58e7384bc925e53fcd1e289f42fc7acfc59
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/6.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/7.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/7.0
new file mode 100644
index 0000000000000000000000000000000000000000..2030cf8251b5b90089902a1e07c59b37c29f9415
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/full_state/7.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/.zarray b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..39941777ab13262b64b7ad08ab941729bc8e92c6
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/.zarray	
@@ -0,0 +1,26 @@
+{
+    "chunks": [
+        100,
+        128,
+        128,
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|u1",
+    "fill_value": 0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        128,
+        128,
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/0.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/0.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..d43a88f36494e75f0e051c5fee18dfe269dd8bdb
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/0.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/1.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/1.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..fd27bfa7f395b986f616fb0d4485f6f2d442d6ad
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/1.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/10.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/10.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..1976733710ccd7e2244919e6ce028cacfb852fe1
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/10.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/11.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/11.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..5c3fc72800f1d62090d25e0678f2da3b8932c61f
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/11.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/12.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/12.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..344d604558d3610a4af0e74ddbc1d86962e5e6ae
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/12.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/13.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/13.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..6a1ce4ad96b0466462b9c8e8132118b227372d63
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/13.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/14.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/14.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..94b61c5fc8785028fa87531edf83a316ee3d8f05
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/14.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/15.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/15.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..1fea3bae18fc11b61852583bc4a11411602fc74f
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/15.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/16.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/16.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..800209b0b052a5b6262f0f93d22aeca3fe0ea39e
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/16.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/17.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/17.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..651623cf545e258cb5be28dab738880d41571654
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/17.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/18.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/18.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..3429d45ec71defbdd12ffa25a07cd8fdc333c09b
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/18.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/19.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/19.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..7ff1c39b21bd28442ca8098564253c73ffd9e227
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/19.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/2.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/2.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..1e0032a87dcf4f4e6b9232dc223a9b6541411d7d
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/2.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/4.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/4.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..3147fdff9d4aeb8efa2903fa09c79e6e899b9d97
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/4.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/5.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/5.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..2f5096ea9fd4231b5c69e630d8ae9da70136de20
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/5.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/6.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/6.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..7dfaa7bef085ed47b2a00205153d9573aab55e1c
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/6.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/7.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/7.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..d64127577be54c14fff94f7ecb4d880555a69f07
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/7.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/8.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/8.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..9bc68e2e69cd7a2fe385165bb4d89e34f6b15749
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/8.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/9.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/9.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..2361c0b45f69238fda83d77ec19f7ba4dd5369f5
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/img/9.0.0.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/point_cloud/.zarray b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/point_cloud/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..8369a4d15f2993e1ea345f4ba0a01bced0d7cfec
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/point_cloud/.zarray	
@@ -0,0 +1,24 @@
+{
+    "chunks": [
+        100,
+        1024,
+        6
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<f4",
+    "fill_value": 0.0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        1024,
+        6
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/.zarray b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0b166d46e2eb281b2402273f8e7bbf9690b2aad3
--- /dev/null
+++ b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/.zarray	
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        100,
+        9
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<f4",
+    "fill_value": 0.0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        9
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/13.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/13.0
new file mode 100644
index 0000000000000000000000000000000000000000..0d01979f6cb0647a7d7a8753044cdddc273b537a
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/13.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/16.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/16.0
new file mode 100644
index 0000000000000000000000000000000000000000..9fdac531ffce4eac99351ce630c48c70c9fb88f3
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/16.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/17.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/17.0
new file mode 100644
index 0000000000000000000000000000000000000000..2b436003a749124d170eb2d36381915752fb72f2
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/17.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/3.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..4aa0e9b817c02de9bfa718a2dc31473c223472b3
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/3.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/4.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/4.0
new file mode 100644
index 0000000000000000000000000000000000000000..b341d6d96bff45da08f9ebeb48df77f3b300880d
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/4.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/5.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/5.0
new file mode 100644
index 0000000000000000000000000000000000000000..9e73645153a1da43ff55f594239047baa576f655
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/5.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/8.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/8.0
new file mode 100644
index 0000000000000000000000000000000000000000..b2749b7252e41e2da6c05502f72e965e3040f390
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/8.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/9.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/9.0
new file mode 100644
index 0000000000000000000000000000000000000000..0e0d9347ecb0235f097f051c67b32c1c8bf3350f
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/data/state/9.0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/meta/episode_ends/0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/meta/episode_ends/0
new file mode 100644
index 0000000000000000000000000000000000000000..e242873752168b238a2521f6d5d2b29f1a9edc81
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_drawer-close_expert.zarr/meta/episode_ends/0 differ
diff --git a/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_faucet-close_expert.zarr/data/img/7.0.0.0 b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_faucet-close_expert.zarr/data/img/7.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..b7da29463b4b9b52ab2ac75068161158731cea6a
Binary files /dev/null and b/Metaworld/zarr_path: /data/haojun/datasets/3d-dp/metaworld_faucet-close_expert.zarr/data/img/7.0.0.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/.zarray b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..eaf649916e85393eb516477b73da093067487d61
--- /dev/null
+++ b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/.zarray	
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        100,
+        39
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<f4",
+    "fill_value": 0.0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        39
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/17.0 b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/17.0
new file mode 100644
index 0000000000000000000000000000000000000000..4d39870feab4ed625d884f0bc94390c77fc114fb
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/data/full_state/17.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/meta/episode_ends/.zarray b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/meta/episode_ends/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..60914cf77bdc3cc1ad411eea64d76bf1f23dfa50
--- /dev/null
+++ b/Metaworld/zarr_path: data/metaworld_disassemble_expert.zarr/meta/episode_ends/.zarray	
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        10
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": 0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/.zarray b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..39941777ab13262b64b7ad08ab941729bc8e92c6
--- /dev/null
+++ b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/.zarray	
@@ -0,0 +1,26 @@
+{
+    "chunks": [
+        100,
+        128,
+        128,
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 3,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|u1",
+    "fill_value": 0,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        2000,
+        128,
+        128,
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/10.0.0.0 b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/10.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..2a2a7136e4ad4f1bebeb20a00f40d1a43213cfe8
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/10.0.0.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/12.0.0.0 b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/12.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..f8b284ace9439e68f4a011ccd4b70fc098cd2f1a
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/12.0.0.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/2.0.0.0 b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/2.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..829f14375971dad28ba7dcb40e212e40cacb37b4
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/2.0.0.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/3.0.0.0 b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/3.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..81dbf8eac67b3871e54d3b11c45dc0ac3943605e
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/3.0.0.0 differ
diff --git a/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/5.0.0.0 b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/5.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..f5f60cb787dce3f1fcf7e135e767685336e8ce0b
Binary files /dev/null and b/Metaworld/zarr_path: data/metaworld_door-close_expert.zarr/data/img/5.0.0.0 differ
diff --git a/mujoco-py-2.1.2.14/docs/build/doctrees/environment.pickle b/mujoco-py-2.1.2.14/docs/build/doctrees/environment.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..41d19393641016b8e089ace1cdbe7704235ed4f5
--- /dev/null
+++ b/mujoco-py-2.1.2.14/docs/build/doctrees/environment.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbab0806fb7a0b1c5492675ffb630f96f989d658af4903cb58574cfa4d6ec489
+size 45615
diff --git a/pytorch3d_simplified/pytorch3d/__pycache__/__init__.cpython-38.pyc b/pytorch3d_simplified/pytorch3d/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d40e18c7da49b26e057c63f4f26ee40a97eca267
Binary files /dev/null and b/pytorch3d_simplified/pytorch3d/__pycache__/__init__.cpython-38.pyc differ
diff --git a/pytorch3d_simplified/pytorch3d/csrc/ext.cpp b/pytorch3d_simplified/pytorch3d/csrc/ext.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e908a9260ba4115262a265f09dfcf6f873c4949
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/ext.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// clang-format off
+
+#include <torch/extension.h>
+#include "sample_farthest_points/sample_farthest_points.h"
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Register FarthestPointSampling under the Python name given below.
+  m.def("sample_farthest_points", &FarthestPointSampling);
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.h b/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b613d358880936863c2a56b82dee77d93d777f9
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include <torch/extension.h>
+#include <tuple>
+#include "utils/pytorch3d_cutils.h"
+
+// Iterative farthest point sampling algorithm [1] to subsample a set of
+// K points from a given pointcloud. At each iteration, a point is selected
+// which has the largest nearest neighbor distance to any of the
+// already selected points.
+
+// Farthest point sampling provides more uniform coverage of the input
+// point cloud compared to uniform random sampling.
+
+// [1] Charles R. Qi et al, "PointNet++: Deep Hierarchical Feature Learning
+//     on Point Sets in a Metric Space", NeurIPS 2017.
+
+// Args:
+//     points: (N, P, D) float32 Tensor containing the batch of pointclouds.
+//     lengths: (N,) long Tensor giving the number of points in each pointcloud
+//        (to support heterogeneous batches of pointclouds).
+//     K: a tensor of length (N,) giving the number of
+//        samples to select for each element in the batch.
+//        The number of samples is typically << P.
+//     start_idxs: (N,) long Tensor giving the index of the first point to
+//        sample. Default is all 0. When a random start point is required,
+//        start_idxs should be set to a random value in [0, lengths[n])
+//        for batch element n.
+// Returns:
+//     selected_indices: (N, K) array of selected indices. If the values in
+//        K are not all the same, then the shape will be (N, max(K)), and
+//        padded with -1 for batch elements where k_i < max(K). The selected
+//        points are gathered in the pytorch autograd wrapper.
+
+at::Tensor FarthestPointSamplingCuda(
+    const at::Tensor& points,
+    const at::Tensor& lengths,
+    const at::Tensor& K,
+    const at::Tensor& start_idxs);
+
+at::Tensor FarthestPointSamplingCpu(
+    const at::Tensor& points,
+    const at::Tensor& lengths,
+    const at::Tensor& K,
+    const at::Tensor& start_idxs);
+
+// Exposed implementation; inline so that this header is safe in several TUs.
+inline at::Tensor FarthestPointSampling(
+    const at::Tensor& points,
+    const at::Tensor& lengths,
+    const at::Tensor& K,
+    const at::Tensor& start_idxs) {
+  if (points.is_cuda() || lengths.is_cuda() || K.is_cuda()) {
+#ifdef WITH_CUDA
+    CHECK_CUDA(points);
+    CHECK_CUDA(lengths);
+    CHECK_CUDA(K);
+    CHECK_CUDA(start_idxs);
+    return FarthestPointSamplingCuda(points, lengths, K, start_idxs);
+#else
+    AT_ERROR("Not compiled with GPU support.");
+#endif
+  }
+  return FarthestPointSamplingCpu(points, lengths, K, start_idxs);
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points_cpu.cpp b/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd533825f4da75e232bc493c28a0872e477d6db7
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/sample_farthest_points/sample_farthest_points_cpu.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h>
+#include <iterator>
+#include <random>
+#include <vector>
+
+// CPU farthest point sampling: returns (N, max(K)) int64 indices padded with
+// -1. Clouds with lengths[n] == 0 or K[n] <= 0 are skipped (row stays -1);
+// otherwise start_idxs[n] must lie in [0, lengths[n]).
+at::Tensor FarthestPointSamplingCpu(
+    const at::Tensor& points,
+    const at::Tensor& lengths,
+    const at::Tensor& K,
+    const at::Tensor& start_idxs) {
+  // Get constants
+  const int64_t N = points.size(0);
+  const int64_t P = points.size(1);
+  const int64_t D = points.size(2);
+  const int64_t max_K = torch::max(K).item<int64_t>();
+
+  // Output array for the sampled indices, shape (N, max_K), filled with -1
+  auto long_opts = lengths.options();
+  torch::Tensor sampled_indices = torch::full({N, max_K}, -1, long_opts);
+
+  // Create accessors for all tensors
+  auto points_a = points.accessor<float, 3>();
+  auto lengths_a = lengths.accessor<int64_t, 1>();
+  auto k_a = K.accessor<int64_t, 1>();
+  auto sampled_indices_a = sampled_indices.accessor<int64_t, 2>();
+  auto start_idxs_a = start_idxs.accessor<int64_t, 1>();
+
+  // Mask of already selected points (nonzero == selected) and, per point,
+  // the squared distance to its nearest selected point. Both reset per batch.
+  std::vector<unsigned char> selected_points_mask;
+  std::vector<float> dists;
+
+  for (int64_t n = 0; n < N; ++n) {
+    const int64_t len = lengths_a[n];
+    TORCH_CHECK(len >= 0 && len <= P, "lengths[n] must be in [0, P]");
+
+    // For heterogeneous pointclouds, sample at most len points. With nothing
+    // to sample, skip: writing slot 0 below would be out of bounds.
+    const int64_t batch_k = std::min(len, k_a[n]);
+    if (batch_k <= 0) {
+      continue;
+    }
+
+    // Reset points mask and distances for each batch
+    selected_points_mask.assign(len, false);
+    dists.assign(len, std::numeric_limits<float>::max());
+
+    // Get the starting point index, validate it and save it
+    int64_t last_idx = start_idxs_a[n];
+    TORCH_CHECK(last_idx >= 0 && last_idx < len, "start_idxs out of range");
+    sampled_indices_a[n][0] = last_idx;
+
+    // Mark the starting point as selected
+    selected_points_mask[last_idx] = true;
+
+    // Iteratively select batch_k points per batch
+    for (int64_t k = 1; k < batch_k; ++k) {
+      // Iterate through all the points
+      for (int64_t p = 0; p < len; ++p) {
+        if (selected_points_mask[p]) {
+          // For already selected points set the distance to 0.0
+          dists[p] = 0.0;
+          continue;
+        }
+
+        // Calculate the distance to the last selected point
+        float dist2 = 0.0;
+        for (int64_t d = 0; d < D; ++d) {
+          float diff = points_a[n][last_idx][d] - points_a[n][p][d];
+          dist2 += diff * diff;
+        }
+
+        // Keep the smallest squared distance from this point to any of the
+        // points selected so far
+        dists[p] = std::min(dists[p], dist2);
+      }
+
+      // The aim is to pick the point that has the largest
+      // nearest neighbour distance to any of the already selected points
+      auto itr = std::max_element(dists.begin(), dists.end());
+      last_idx = std::distance(dists.begin(), itr);
+
+      // Save selected point
+      sampled_indices_a[n][k] = last_idx;
+
+      // Set the mask value to true to prevent duplicates.
+      selected_points_mask[last_idx] = true;
+    }
+  }
+
+  return sampled_indices;
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/utils/dispatch.cuh b/pytorch3d_simplified/pytorch3d/csrc/utils/dispatch.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..83f3d69ff40907c396e3d175402d5cf4561142b5
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/utils/dispatch.cuh
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// This file provides utilities for dispatching to specialized versions of
+// functions. This is especially useful for CUDA kernels, since specializing
+// them to particular input sizes can often allow the compiler to unroll loops
+// and place arrays into registers, which can give huge performance speedups.
+//
+// As an example, suppose we have the following function which is specialized
+// based on a compile-time int64_t value:
+//
+// template<typename T, int64_t x>
+// struct SquareOffset {
+//   static void run(T y) {
+//     T val = x * x + y;
+//     std::cout << val << std::endl;
+//   }
+// }
+//
+// This function takes one compile-time argument x, and one run-time argument y.
+// We might want to compile specialized versions of this for x=0, x=1, etc and
+// then dispatch to the correct one based on the runtime value of x.
+// One simple way to achieve this is with a lookup table:
+//
+// template<typename T>
+// void DispatchSquareOffset(const int64_t x, T y) {
+//   if (x == 0) {
+//     SquareOffset<T, 0>::run(y);
+//   } else if (x == 1) {
+//     SquareOffset<T, 1>::run(y);
+//   } else if (x == 2) {
+//     SquareOffset<T, 2>::run(y);
+//   }
+// }
+//
+// This function takes both x and y as run-time arguments, and dispatches to
+// different specialized versions of SquareOffset based on the run-time value
+// of x. This works, but it's tedious and error-prone. If we want to change the
+// set of x values for which we provide compile-time specializations, then we
+// will need to do a lot of tedious editing of the dispatch function. Also, if we
+// want to provide compile-time specializations for another function other than
+// SquareOffset, we will need to duplicate the entire lookup table.
+//
+// To solve these problems, we can use the DispatchKernel1D function provided by
+// this file instead:
+//
+// template<typename T>
+// void DispatchSquareOffset(const int64_t x, T y) {
+//     constexpr int64_t xmin = 0;
+//     constexpr int64_t xmax = 2;
+//     DispatchKernel1D<SquareOffset, T, xmin, xmax>(x, y);
+// }
+//
+// DispatchKernel1D uses template metaprogramming to compile specialized
+// versions of SquareOffset for all values of x with xmin <= x <= xmax, and
+// then dispatches to the correct one based on the run-time value of x. If we
+// want to change the range of x values for which SquareOffset is specialized
+// at compile-time, then all we have to do is change the values of the
+// compile-time constants xmin and xmax.
+//
+// This file also allows us to similarly dispatch functions that depend on two
+// compile-time int64_t values, using the DispatchKernel2D function like this:
+//
+// template<typename T, int64_t x, int64_t y>
+// struct Sum {
+//   static void run(T z, T w) {
+//     T val = x + y + z + w;
+//     std::cout << val << std::endl;
+//   }
+// }
+//
+// template<typename T>
+// void DispatchSum(const int64_t x, const int64_t y, int z, int w) {
+//   constexpr int64_t xmin = 1;
+//   constexpr int64_t xmax = 3;
+//   constexpr int64_t ymin = 2;
+//   constexpr int64_t ymax = 5;
+//   DispatchKernel2D<Sum, T, xmin, xmax, ymin, ymax>(x, y, z, w);
+// }
+//
+// Like its 1D counterpart, DispatchKernel2D uses template metaprogramming to
+// compile specialized versions of sum for all values of (x, y) with
+// xmin <= x <= xmax and ymin <= y <= ymax, then dispatches to the correct
+// specialized version based on the runtime values of x and y.
+
+// Define some helper structs in an anonymous namespace.
+namespace {
+
+// 1D dispatch: general case.
+// Kernel is the function we want to dispatch to; it should take a typename and
+// an int64_t as template args, and it should define a static void function
+// run which takes any number of arguments of any type.
+// In order to dispatch, we will take an additional template argument curN,
+// and increment it via template recursion until it is equal to the run-time
+// argument N.
+template <
+    template <typename, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t curN,
+    typename... Args>
+struct DispatchKernelHelper1D {
+  static void run(const int64_t N, Args... args) {
+    if (curN == N) {
+      // The compile-time value curN is equal to the run-time value N, so we
+      // can dispatch to the run method of the Kernel.
+      Kernel<T, curN>::run(args...);
+    } else if (curN < N) {
+      // Increment curN via template recursion
+      DispatchKernelHelper1D<Kernel, T, minN, maxN, curN + 1, Args...>::run(
+          N, args...);
+    }
+    // When curN > N neither branch runs: nothing is dispatched, no error.
+  }
+};
+
+// 1D dispatch: Specialization when curN == maxN
+// We need this base case to avoid infinite template recursion.
+template <
+    template <typename, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    typename... Args>
+struct DispatchKernelHelper1D<Kernel, T, minN, maxN, maxN, Args...> {
+  static void run(const int64_t N, Args... args) {
+    if (N == maxN) {
+      Kernel<T, maxN>::run(args...);
+    }
+    // Any other N is ignored: nothing is dispatched and no error is raised.
+  }
+};
+
+// 2D dispatch, general case.
+// This is similar to the 1D case: we take additional template args curN and
+// curM, and increment them via template recursion until they are equal to
+// the run-time values of N and M, at which point we dispatch to the run
+// method of the kernel.
+template <
+    template <typename, int64_t, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t curN,
+    int64_t minM,
+    int64_t maxM,
+    int64_t curM,
+    typename... Args>
+struct DispatchKernelHelper2D {
+  static void run(const int64_t N, const int64_t M, Args... args) {
+    if (curN == N && curM == M) {
+      Kernel<T, curN, curM>::run(args...);
+    } else if (curN < N && curM < M) {
+      // Increment both curN and curM. This isn't strictly necessary; we could
+      // just increment one or the other at each step. But this helps to cut
+      // down on the number of recursive calls we make.
+      DispatchKernelHelper2D<
+          Kernel,
+          T,
+          minN,
+          maxN,
+          curN + 1,
+          minM,
+          maxM,
+          curM + 1,
+          Args...>::run(N, M, args...);
+    } else if (curN < N) {
+      // Increment curN only
+      DispatchKernelHelper2D<
+          Kernel,
+          T,
+          minN,
+          maxN,
+          curN + 1,
+          minM,
+          maxM,
+          curM,
+          Args...>::run(N, M, args...);
+    } else if (curM < M) {
+      // Increment curM only
+      DispatchKernelHelper2D<
+          Kernel,
+          T,
+          minN,
+          maxN,
+          curN,
+          minM,
+          maxM,
+          curM + 1,
+          Args...>::run(N, M, args...);
+    }
+  }
+};
+
+// 2D dispatch, specialization for curN == maxN
+template <
+    template <typename, int64_t, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t minM,
+    int64_t maxM,
+    int64_t curM,
+    typename... Args>
+struct DispatchKernelHelper2D<
+    Kernel,
+    T,
+    minN,
+    maxN,
+    maxN,
+    minM,
+    maxM,
+    curM,
+    Args...> {
+  static void run(const int64_t N, const int64_t M, Args... args) {
+    if (maxN == N && curM == M) {
+      Kernel<T, maxN, curM>::run(args...);
+    } else if (curM < maxM) {
+      DispatchKernelHelper2D<
+          Kernel,
+          T,
+          minN,
+          maxN,
+          maxN,
+          minM,
+          maxM,
+          curM + 1,
+          Args...>::run(N, M, args...);
+    }
+    // A mismatched (N, M) is never reported: no error is raised on this path.
+  }
+};
+
+// 2D dispatch, specialization for curM == maxM
+template <
+    template <typename, int64_t, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t curN,
+    int64_t minM,
+    int64_t maxM,
+    typename... Args>
+struct DispatchKernelHelper2D<
+    Kernel,
+    T,
+    minN,
+    maxN,
+    curN,
+    minM,
+    maxM,
+    maxM,
+    Args...> {
+  static void run(const int64_t N, const int64_t M, Args... args) {
+    if (curN == N && maxM == M) {
+      Kernel<T, curN, maxM>::run(args...);
+    } else if (curN < maxN) {
+      DispatchKernelHelper2D<
+          Kernel,
+          T,
+          minN,
+          maxN,
+          curN + 1,
+          minM,
+          maxM,
+          maxM,
+          Args...>::run(N, M, args...);
+    }
+    // A mismatched (N, M) is never reported: no error is raised on this path.
+  }
+};
+
+// 2D dispatch, specialization for curN == maxN, curM == maxM
+template <
+    template <typename, int64_t, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t minM,
+    int64_t maxM,
+    typename... Args>
+struct DispatchKernelHelper2D<
+    Kernel,
+    T,
+    minN,
+    maxN,
+    maxN,
+    minM,
+    maxM,
+    maxM,
+    Args...> {
+  static void run(const int64_t N, const int64_t M, Args... args) {
+    if (maxN == N && maxM == M) {
+      Kernel<T, maxN, maxM>::run(args...);
+    }
+    // Any other (N, M) is ignored: nothing is dispatched, no error is raised.
+  }
+};
+
+} // namespace
+
+// This is the function we expect users to call to dispatch to 1D functions
+template <
+    template <typename, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    typename... Args>
+void DispatchKernel1D(const int64_t N, Args... args) {
+  if (minN <= N && N <= maxN) {
+    // Kick off the template recursion by calling the Helper with curN = minN
+    DispatchKernelHelper1D<Kernel, T, minN, maxN, minN, Args...>::run(
+        N, args...);
+  }
+  // N outside [minN, maxN] is ignored: no kernel runs and no error is raised.
+}
+
+// This is the function we expect users to call to dispatch to 2D functions
+template <
+    template <typename, int64_t, int64_t>
+    class Kernel,
+    typename T,
+    int64_t minN,
+    int64_t maxN,
+    int64_t minM,
+    int64_t maxM,
+    typename... Args>
+void DispatchKernel2D(const int64_t N, const int64_t M, Args... args) {
+  if (minN <= N && N <= maxN && minM <= M && M <= maxM) {
+    // Kick off the template recursion by calling the Helper with curN = minN
+    // and curM = minM
+    DispatchKernelHelper2D<
+        Kernel,
+        T,
+        minN,
+        maxN,
+        minN,
+        minM,
+        maxM,
+        minM,
+        Args...>::run(N, M, args...);
+  }
+  // (N, M) outside the specified ranges is ignored: no kernel runs, no error.
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/utils/float_math.cuh b/pytorch3d_simplified/pytorch3d/csrc/utils/float_math.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e48e960e96544fd901655ce1d0217513d300187b
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/utils/float_math.cuh
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include <thrust/tuple.h>
+
+// Set epsilon
+#ifdef _MSC_VER
+#define vEpsilon 1e-8f
+#else
+const auto vEpsilon = 1e-8;
+#endif
+
+// Common functions and operators for float2.
+
+__device__ inline float2 operator-(const float2& a, const float2& b) {
+  return make_float2(a.x - b.x, a.y - b.y);
+}
+
+__device__ inline float2 operator+(const float2& a, const float2& b) {
+  return make_float2(a.x + b.x, a.y + b.y);
+}
+
+__device__ inline float2 operator/(const float2& a, const float2& b) {
+  return make_float2(a.x / b.x, a.y / b.y);
+}
+
+__device__ inline float2 operator/(const float2& a, const float b) {
+  return make_float2(a.x / b, a.y / b);
+}
+
+__device__ inline float2 operator*(const float2& a, const float2& b) {
+  return make_float2(a.x * b.x, a.y * b.y);
+}
+
+__device__ inline float2 operator*(const float a, const float2& b) {
+  return make_float2(a * b.x, a * b.y);
+}
+
+__device__ inline float FloatMin3(const float a, const float b, const float c) {
+  return fminf(a, fminf(b, c));
+}
+
+__device__ inline float FloatMax3(const float a, const float b, const float c) {
+  return fmaxf(a, fmaxf(b, c));
+}
+
+__device__ inline float dot(const float2& a, const float2& b) {
+  return a.x * b.x + a.y * b.y;
+}
+
+// Backward pass for the dot product.
+// Args:
+//     a, b: Coordinates of two points.
+//     grad_dot: Upstream gradient for the output.
+//
+// Returns:
+//    tuple of gradients for each of the input points:
+//      (float2 grad_a, float2 grad_b)
+//
+__device__ inline thrust::tuple<float2, float2>
+DotBackward(const float2& a, const float2& b, const float& grad_dot) {
+  return thrust::make_tuple(grad_dot * b, grad_dot * a);
+}
+
+__device__ inline float sum(const float2& a) {
+  return a.x + a.y;
+}
+
+// Common functions and operators for float3.
+
+__device__ inline float3 operator-(const float3& a, const float3& b) {
+  return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+
+__device__ inline float3 operator+(const float3& a, const float3& b) {
+  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+
+__device__ inline float3 operator/(const float3& a, const float3& b) {
+  return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+
+__device__ inline float3 operator/(const float3& a, const float b) {
+  return make_float3(a.x / b, a.y / b, a.z / b);
+}
+
+__device__ inline float3 operator*(const float3& a, const float3& b) {
+  return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+
+__device__ inline float3 operator*(const float a, const float3& b) {
+  return make_float3(a * b.x, a * b.y, a * b.z);
+}
+
+__device__ inline float dot(const float3& a, const float3& b) {
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+__device__ inline float sum(const float3& a) {
+  return a.x + a.y + a.z;
+}
+
+__device__ inline float3 cross(const float3& a, const float3& b) {
+  return make_float3(
+      a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+
+__device__ inline thrust::tuple<float3, float3>
+cross_backward(const float3& a, const float3& b, const float3& grad_cross) {
+  const float grad_ax = -grad_cross.y * b.z + grad_cross.z * b.y;
+  const float grad_ay = grad_cross.x * b.z - grad_cross.z * b.x;
+  const float grad_az = -grad_cross.x * b.y + grad_cross.y * b.x;
+  const float3 grad_a = make_float3(grad_ax, grad_ay, grad_az);
+
+  const float grad_bx = grad_cross.y * a.z - grad_cross.z * a.y;
+  const float grad_by = -grad_cross.x * a.z + grad_cross.z * a.x;
+  const float grad_bz = grad_cross.x * a.y - grad_cross.y * a.x;
+  const float3 grad_b = make_float3(grad_bx, grad_by, grad_bz);
+
+  return thrust::make_tuple(grad_a, grad_b);
+}
+
+__device__ inline float norm(const float3& a) {
+  return sqrt(dot(a, a));
+}
+
+__device__ inline float3 normalize(const float3& a) {
+  return a / (norm(a) + vEpsilon);
+}
+
+__device__ inline float3 normalize_backward(
+    const float3& a,
+    const float3& grad_normz) {
+  const float a_norm = norm(a) + vEpsilon;
+  const float3 out = a / a_norm;
+
+  const float grad_ax = grad_normz.x * (1.0f - out.x * out.x) / a_norm +
+      grad_normz.y * (-out.x * out.y) / a_norm +
+      grad_normz.z * (-out.x * out.z) / a_norm;
+  const float grad_ay = grad_normz.x * (-out.x * out.y) / a_norm +
+      grad_normz.y * (1.0f - out.y * out.y) / a_norm +
+      grad_normz.z * (-out.y * out.z) / a_norm;
+  const float grad_az = grad_normz.x * (-out.x * out.z) / a_norm +
+      grad_normz.y * (-out.y * out.z) / a_norm +
+      grad_normz.z * (1.0f - out.z * out.z) / a_norm;
+  return make_float3(grad_ax, grad_ay, grad_az);
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.cuh b/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..66aee7fc7bcd3495bc7dbba56d89995d383b655e
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.cuh
@@ -0,0 +1,792 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <cstdio>
+#include "float_math.cuh"
+
+// Set epsilon for preventing floating point errors and division by 0.
+#ifdef _MSC_VER
+#define kEpsilon 1e-8f
+#else
+const auto kEpsilon = 1e-8;
+#endif
+
+// ************************************************************* //
+//                          vec2 utils                           //
+// ************************************************************* //
+
+// Determines whether a point p is on the right side of a 2D line segment
+// given by the end points v0, v1.
+//
+// Args:
+//     p: vec2 Coordinates of a point.
+//     v0, v1: vec2 Coordinates of the end points of the edge.
+//
+// Returns:
+//     area: The signed area of the parallelogram given by the vectors
+//           A = p - v0
+//           B = v1 - v0
+//
+__device__ inline float
+EdgeFunctionForward(const float2& p, const float2& v0, const float2& v1) {
+  return (p.x - v0.x) * (v1.y - v0.y) - (p.y - v0.y) * (v1.x - v0.x);
+}
+
+// Backward pass for the edge function returning partial derivatives for each
+// of the input points.
+//
+// Args:
+//     p: vec2 Coordinates of a point.
+//     v0, v1: vec2 Coordinates of the end points of the edge.
+//     grad_edge: Upstream gradient for output from edge function.
+//
+// Returns:
+//     tuple of gradients for each of the input points:
+//     (float2 d_edge_dp, float2 d_edge_dv0, float2 d_edge_dv1)
+//
+__device__ inline thrust::tuple<float2, float2, float2> EdgeFunctionBackward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float& grad_edge) {
+  const float2 dedge_dp = make_float2(v1.y - v0.y, v0.x - v1.x);
+  const float2 dedge_dv0 = make_float2(p.y - v1.y, v1.x - p.x);
+  const float2 dedge_dv1 = make_float2(v0.y - p.y, p.x - v0.x);
+  return thrust::make_tuple(
+      grad_edge * dedge_dp, grad_edge * dedge_dv0, grad_edge * dedge_dv1);
+}
+
+// The forward pass for computing the barycentric coordinates of a point
+// relative to a triangle.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the triangle vertices.
+//
+// Returns
+//     bary: (w0, w1, w2) barycentric coordinates, not clamped to [0, 1].
+//
+__device__ inline float3 BarycentricCoordsForward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float2& v2) {
+  const float area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
+  const float w0 = EdgeFunctionForward(p, v1, v2) / area;
+  const float w1 = EdgeFunctionForward(p, v2, v0) / area;
+  const float w2 = EdgeFunctionForward(p, v0, v1) / area;
+  return make_float3(w0, w1, w2);
+}
+
+// The backward pass for computing the barycentric coordinates of a point
+// relative to a triangle.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: (x, y) coordinates of the triangle vertices.
+//     grad_bary_upstream: vec3<T> Upstream gradient for each of the
+//                         barycentric coordinates [grad_w0, grad_w1, grad_w2].
+//
+// Returns
+//    tuple of gradients for the point and each of the triangle vertices:
+//    (float2 grad_p, float2 grad_v0, float2 grad_v1, float2 grad_v2)
+//
+__device__ inline thrust::tuple<float2, float2, float2, float2>
+BarycentricCoordsBackward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float2& v2,
+    const float3& grad_bary_upstream) {
+  const float area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
+  const float area2 = pow(area, 2.0f);
+  const float e0 = EdgeFunctionForward(p, v1, v2);
+  const float e1 = EdgeFunctionForward(p, v2, v0);
+  const float e2 = EdgeFunctionForward(p, v0, v1);
+
+  const float grad_w0 = grad_bary_upstream.x;
+  const float grad_w1 = grad_bary_upstream.y;
+  const float grad_w2 = grad_bary_upstream.z;
+
+  // Calculate component of the gradient from each of w0, w1 and w2.
+  // e.g. for w0:
+  // dloss/dw0_v = dl/dw0 * dw0/dw0_top * dw0_top/dv
+  //               + dl/dw0 * dw0/dw0_bot * dw0_bot/dv
+  const float dw0_darea = -e0 / (area2);
+  const float dw0_e0 = 1 / area;
+  const float dloss_d_w0area = grad_w0 * dw0_darea;
+  const float dloss_e0 = grad_w0 * dw0_e0;
+  auto de0_dv = EdgeFunctionBackward(p, v1, v2, dloss_e0);
+  auto dw0area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w0area);
+  const float2 dw0_p = thrust::get<0>(de0_dv);
+  const float2 dw0_dv0 = thrust::get<1>(dw0area_dv);
+  const float2 dw0_dv1 = thrust::get<1>(de0_dv) + thrust::get<2>(dw0area_dv);
+  const float2 dw0_dv2 = thrust::get<2>(de0_dv) + thrust::get<0>(dw0area_dv);
+
+  const float dw1_darea = -e1 / (area2);
+  const float dw1_e1 = 1 / area;
+  const float dloss_d_w1area = grad_w1 * dw1_darea;
+  const float dloss_e1 = grad_w1 * dw1_e1;
+  auto de1_dv = EdgeFunctionBackward(p, v2, v0, dloss_e1);
+  auto dw1area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w1area);
+  const float2 dw1_p = thrust::get<0>(de1_dv);
+  const float2 dw1_dv0 = thrust::get<2>(de1_dv) + thrust::get<1>(dw1area_dv);
+  const float2 dw1_dv1 = thrust::get<2>(dw1area_dv);
+  const float2 dw1_dv2 = thrust::get<1>(de1_dv) + thrust::get<0>(dw1area_dv);
+
+  const float dw2_darea = -e2 / (area2);
+  const float dw2_e2 = 1 / area;
+  const float dloss_d_w2area = grad_w2 * dw2_darea;
+  const float dloss_e2 = grad_w2 * dw2_e2;
+  auto de2_dv = EdgeFunctionBackward(p, v0, v1, dloss_e2);
+  auto dw2area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w2area);
+  const float2 dw2_p = thrust::get<0>(de2_dv);
+  const float2 dw2_dv0 = thrust::get<1>(de2_dv) + thrust::get<1>(dw2area_dv);
+  const float2 dw2_dv1 = thrust::get<2>(de2_dv) + thrust::get<2>(dw2area_dv);
+  const float2 dw2_dv2 = thrust::get<0>(dw2area_dv);
+
+  const float2 dbary_p = dw0_p + dw1_p + dw2_p;
+  const float2 dbary_dv0 = dw0_dv0 + dw1_dv0 + dw2_dv0;
+  const float2 dbary_dv1 = dw0_dv1 + dw1_dv1 + dw2_dv1;
+  const float2 dbary_dv2 = dw0_dv2 + dw1_dv2 + dw2_dv2;
+
+  return thrust::make_tuple(dbary_p, dbary_dv0, dbary_dv1, dbary_dv2);
+}
+
+// Perspective-corrects screen-space barycentric coordinates (forward pass).
+//
+// Each coordinate is weighted by the product of the z-values of the two
+// *other* vertices and the result is divided by the sum of the three
+// weighted terms. That sum is clamped from below by kEpsilon.
+//
+// Args:
+//     bary: Screen-space barycentric coordinates for a point
+//     z0, z1, z2: Camera-space z-coordinates of the triangle vertices
+//
+// Returns
+//     Perspective-corrected barycentric coordinates
+__device__ inline float3 BarycentricPerspectiveCorrectionForward(
+    const float3& bary,
+    const float z0,
+    const float z1,
+    const float z2) {
+  const float num_x = bary.x * z1 * z2;
+  const float num_y = z0 * bary.y * z2;
+  const float num_z = z0 * z1 * bary.z;
+  const float total = fmaxf(num_x + num_y + num_z, kEpsilon);
+  return make_float3(num_x / total, num_y / total, num_z / total);
+}
+
+// Backward pass of the perspective correction of barycentric coordinates.
+//
+// Args:
+//     bary: Screen-space barycentric coordinates for a point
+//     z0, z1, z2: Camera-space z-coordinates of the triangle vertices
+//     grad_out: Upstream gradient of the loss with respect to the corrected
+//               barycentric coordinates.
+//
+// Returns a tuple of:
+//      grad_bary: Downstream gradient of the loss with respect to the
+//                 uncorrected barycentric coordinates.
+//      grad_z0, grad_z1, grad_z2: Downstream gradient of the loss with respect
+//                                 to the z-coordinates of the triangle verts
+__device__ inline thrust::tuple<float3, float, float, float>
+BarycentricPerspectiveCorrectionBackward(
+    const float3& bary,
+    const float z0,
+    const float z1,
+    const float z2,
+    const float3& grad_out) {
+  // Forward quantities needed by the derivative.
+  const float num_x = bary.x * z1 * z2;
+  const float num_y = z0 * bary.y * z2;
+  const float num_z = z0 * z1 * bary.z;
+  const float total = fmaxf(num_x + num_y + num_z, kEpsilon);
+
+  // Every output is divided by `total`, so its gradient sums all three terms.
+  const float d_total_num =
+      -num_x * grad_out.x - num_y * grad_out.y - num_z * grad_out.z;
+  const float d_total = d_total_num / (total * total);
+  const float d_num_x = d_total + grad_out.x / total;
+  const float d_num_y = d_total + grad_out.y / total;
+  const float d_num_z = d_total + grad_out.z / total;
+  // Chain through the numerators to the barycentric inputs ...
+  const float3 grad_bary =
+      make_float3(d_num_x * z1 * z2, d_num_y * z0 * z2, d_num_z * z0 * z1);
+  // ... and to the z-coordinates.
+  const float grad_z0 = d_num_y * bary.y * z2 + d_num_z * bary.z * z1;
+  const float grad_z1 = d_num_x * bary.x * z2 + d_num_z * bary.z * z0;
+  const float grad_z2 = d_num_x * bary.x * z1 + d_num_y * bary.y * z0;
+  return thrust::make_tuple(grad_bary, grad_z0, grad_z1, grad_z2);
+}
+
+// Clip negative barycentric coordinates to 0.0 and renormalize so
+// the barycentric coordinates for a point sum to 1. When the blur_radius
+// is greater than 0, a face will still be recorded as overlapping a pixel
+// if the pixel is outside the face. In this case at least one of the
+// barycentric coordinates for the pixel relative to the face will be negative.
+// Clipping will ensure that the texture and z buffer are interpolated
+// correctly.
+//
+//  Args
+//     bary: (w0, w1, w2) barycentric coordinates which can be outside the
+//            range [0, 1].
+//
+//  Returns
+//     bary: (w0, w1, w2) barycentric coordinates in the range [0, 1]. They
+//           sum to 1.0 unless the clamped sum is below 1e-5, in which case
+//           the clamped values are divided by 1e-5 instead.
+__device__ inline float3 BarycentricClipForward(const float3 bary) {
+  // Clamp the lower bound only; values above 1 shrink when renormalizing.
+  // Float intrinsics and literals keep the computation in single precision
+  // instead of promoting the operands to double.
+  float3 w = make_float3(
+      fmaxf(bary.x, 0.0f), fmaxf(bary.y, 0.0f), fmaxf(bary.z, 0.0f));
+  // Guard the normalization against an (almost) all-zero clamp result.
+  const float w_sum = fmaxf(w.x + w.y + w.z, 1e-5f);
+  w.x /= w_sum;
+  w.y /= w_sum;
+  w.z /= w_sum;
+
+  return w;
+}
+
+// Backward pass for barycentric coordinate clipping.
+//
+//  Args
+//     bary: (w0, w1, w2) unclipped barycentric coordinates, which can be
+//            outside the range [0, 1].
+//     grad_baryclip_upstream: Upstream gradient for each of the clipped
+//                         barycentric coordinates [grad_w0, grad_w1, grad_w2].
+//
+// Returns
+//    float3 of gradients for the unclipped barycentric coordinates:
+//    (grad_w0, grad_w1, grad_w2)
+//
+__device__ inline float3 BarycentricClipBackward(
+    const float3 bary,
+    const float3 grad_baryclip_upstream) {
+  // Shorthand for the upstream gradient.
+  const float3& up = grad_baryclip_upstream;
+
+  // Recompute the clamped (not yet normalized) coordinates and their sum.
+  float3 clamped = make_float3(0.0f, 0.0f, 0.0f);
+  clamped.x = max(bary.x, 0.0);
+  clamped.y = max(bary.y, 0.0);
+  clamped.z = max(bary.z, 0.0);
+  float total = clamped.x + clamped.y + clamped.z;
+
+  // The sum is floored at 1e-5. While that floor is active the normalizer
+  // is a constant, so the gradient that would flow through the sum is
+  // gated off.
+  float sum_gate = 1.0f;
+  if (total < 1e-5) {
+    sum_gate = 0.0f;
+    total = 1e-5;
+  }
+
+  // A coordinate that was clamped (bary < 0) passes no gradient through.
+  const float3 pass = make_float3(
+      (bary.x < 0.0f) ? 0.0f : 1.0f,
+      (bary.y < 0.0f) ? 0.0f : 1.0f,
+      (bary.z < 0.0f) ? 0.0f : 1.0f);
+
+  // d(clamped_i / total) / d(total) = -clamped_i / total^2, multiplied by
+  // sum_gate so that it vanishes while the floor is active.
+  const auto total_sq = pow(total, 2.0f);
+  const float3 via_sum = make_float3(
+      -clamped.x / total_sq * sum_gate,
+      -clamped.y / total_sq * sum_gate,
+      -clamped.z / total_sq * sum_gate);
+
+  // Assemble the result: the direct 1 / total term for the matching output
+  // plus the cross terms that reach every output through the shared sum.
+  float3 grad_bary = make_float3(1.0f, 1.0f, 1.0f);
+
+  grad_bary.x = pass.x *
+      (up.x * (1.0f / total + via_sum.x) +
+       up.y * via_sum.y +
+       up.z * via_sum.z);
+
+  grad_bary.y = pass.y *
+      (up.y * (1.0f / total + via_sum.y) +
+       up.x * via_sum.x +
+       up.z * via_sum.z);
+
+  grad_bary.z = pass.z *
+      (up.z * (1.0f / total + via_sum.z) +
+       up.x * via_sum.x +
+       up.y * via_sum.y);
+
+  return grad_bary;
+}
+
+// Return the minimum squared distance between point p and the line segment
+// (a, b). A segment with squared length <= kEpsilon is treated as point b.
+//
+// Args:
+//     p: Coordinates of a point.
+//     a, b: Coordinates of the end points of the line segment.
+//
+// Returns:
+//     squared distance from p to the closest point on the segment.
+__device__ inline float
+PointLineDistanceForward(const float2& p, const float2& a, const float2& b) {
+  const float2 ba = b - a;
+  const float l2 = dot(ba, ba);
+  // Degenerate segment (a ~= b): handle it before dividing by l2.
+  if (l2 <= kEpsilon) {
+    return dot(p - b, p - b);
+  }
+  const float t = __saturatef(dot(ba, p - a) / l2); // clamp to [+0.0, 1.0]
+  const float2 p_proj = a + t * ba;
+  const float2 d = (p_proj - p);
+  return dot(d, d); // squared distance
+}
+
+// Backward pass for the squared point to line segment distance in 2D.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1: Coordinates of the end points of the line segment.
+//     grad_dist: Upstream gradient for the squared distance.
+//
+// Returns:
+//    tuple of gradients for each of the input points:
+//      (float2 grad_p, float2 grad_v0, float2 grad_v1)
+//
+__device__ inline thrust::tuple<float2, float2, float2>
+PointLineDistanceBackward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float& grad_dist) {
+  const float2 v1v0 = v1 - v0;
+  const float t_bot = dot(v1v0, v1v0);
+  if (t_bot <= kEpsilon) {
+    // Degenerate segment: the forward pass returns dot(p - v1, p - v1), so
+    // only p and v1 receive a gradient (and no division by ~0 takes place).
+    const float2 grad_p = grad_dist * 2.0f * (p - v1);
+    return thrust::make_tuple(
+        grad_p, make_float2(0.0f, 0.0f), -1.0f * grad_p);
+  }
+  // Redo the forward projection of p onto the segment.
+  const float tt = __saturatef(dot(v1v0, p - v0) / t_bot);
+  const float2 p_proj = (1.0f - tt) * v0 + tt * v1;
+  const float2 grad_p = -1.0f * grad_dist * 2.0f * (p_proj - p);
+  const float2 grad_v0 = grad_dist * (1.0f - tt) * 2.0f * (p_proj - p);
+  const float2 grad_v1 = grad_dist * tt * 2.0f * (p_proj - p);
+  return thrust::make_tuple(grad_p, grad_v0, grad_v1);
+}
+
+// Forward pass: shortest squared distance between a 2D point and the
+// boundary (the three edges) of a triangle.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the three triangle vertices.
+//
+// Returns:
+//     smallest of the squared distances from p to the three edges.
+//
+__device__ inline float PointTriangleDistanceForward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float2& v2) {
+  // Squared distance from p to each of the three edges.
+  const float d01 = PointLineDistanceForward(p, v0, v1);
+  const float d02 = PointLineDistanceForward(p, v0, v2);
+  const float d12 = PointLineDistanceForward(p, v1, v2);
+  // The closest edge determines the result.
+  return fminf(fminf(d01, d02), d12);
+}
+
+// Backward pass for the 2D point to triangle distance.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the three triangle vertices.
+//     grad_dist: Upstream gradient for the (squared) distance.
+//
+// Returns:
+//    tuple of gradients for the point and each of the triangle vertices:
+//      (float2 grad_p, float2 grad_v0, float2 grad_v1, float2 grad_v2)
+//
+__device__ inline thrust::tuple<float2, float2, float2, float2>
+PointTriangleDistanceBackward(
+    const float2& p,
+    const float2& v0,
+    const float2& v1,
+    const float2& v2,
+    const float& grad_dist) {
+  // Squared distance from p to each edge, exactly as in the forward pass.
+  const float d01 = PointLineDistanceForward(p, v0, v1);
+  const float d02 = PointLineDistanceForward(p, v0, v2);
+  const float d12 = PointLineDistanceForward(p, v1, v2);
+
+  // Ties are resolved in the order (v0, v1), (v0, v2), (v1, v2).
+  const bool pick_01 = d01 <= d02 && d01 <= d12;
+  const bool pick_02 = d02 <= d01 && d02 <= d12;
+  const bool pick_12 = d12 <= d01 && d12 <= d02;
+
+  // Vertices that do not belong to the selected edge keep a zero gradient.
+  float2 grad_p = make_float2(0.0f, 0.0f);
+  float2 grad_v0 = make_float2(0.0f, 0.0f);
+  float2 grad_v1 = make_float2(0.0f, 0.0f);
+  float2 grad_v2 = make_float2(0.0f, 0.0f);
+
+  if (pick_01) {
+    const auto g = PointLineDistanceBackward(p, v0, v1, grad_dist);
+    grad_p = thrust::get<0>(g);
+    grad_v0 = thrust::get<1>(g);
+    grad_v1 = thrust::get<2>(g);
+  } else if (pick_02) {
+    const auto g = PointLineDistanceBackward(p, v0, v2, grad_dist);
+    grad_p = thrust::get<0>(g);
+    grad_v0 = thrust::get<1>(g);
+    grad_v2 = thrust::get<2>(g);
+  } else if (pick_12) {
+    const auto g = PointLineDistanceBackward(p, v1, v2, grad_dist);
+    grad_p = thrust::get<0>(g);
+    grad_v1 = thrust::get<1>(g);
+    grad_v2 = thrust::get<2>(g);
+  }
+
+  return thrust::make_tuple(grad_p, grad_v0, grad_v1, grad_v2);
+}
+
+// ************************************************************* //
+//                          vec3 utils                           //
+// ************************************************************* //
+
+// Computes the area of the 3D triangle (v0, v1, v2) as half the length of
+// the cross product of the edges (v1 - v0) and (v2 - v0).
+//
+// Args:
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//
+// Returns
+//     area: float: The area of the triangle
+__device__ inline float
+AreaOfTriangle(const float3& v0, const float3& v1, const float3& v2) {
+  const float3 e1 = v1 - v0;
+  const float3 e2 = v2 - v0;
+  // Components of the cross product e1 x e2.
+  const float cx = e1.y * e2.z - e1.z * e2.y;
+  const float cy = e1.z * e2.x - e1.x * e2.z;
+  const float cz = e1.x * e2.y - e1.y * e2.x;
+  // The triangle area is half the length of that cross product.
+  const float len = hypot(cx, hypot(cy, cz));
+  return len / 2.0;
+}
+
+// Computes the barycentric coordinates of a point p relative
+// to a triangle (v0, v1, v2), i.e. p = w0 * v0 + w1 * v1 + w2 * v2
+// s.t. w0 + w1 + w2 = 1.0
+//
+// NOTE that this function assumes that p lies in the plane spanned
+// by (v0, v1, v2).
+// TODO(gkioxari) explicitly check whether p is coplanar with (v0, v1, v2)
+// and throw an error if check fails
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//
+// Returns
+//     bary: (w0, w1, w2) barycentric coordinates
+//
+__device__ inline float3 BarycentricCoords3Forward(
+    const float3& p,
+    const float3& v0,
+    const float3& v1,
+    const float3& v2) {
+  // Edge vectors from v0 and the offset of p from v0.
+  const float3 e1 = v1 - v0;
+  const float3 e2 = v2 - v0;
+  const float3 q = p - v0;
+  // Entries of the 2x2 Gram system that expresses q in the (e1, e2) basis.
+  const float g11 = dot(e1, e1);
+  const float g12 = dot(e1, e2);
+  const float g22 = dot(e2, e2);
+  const float r1 = dot(q, e1);
+  const float r2 = dot(q, e2);
+
+  // kEpsilon is added to the determinant before it is used as a divisor.
+  const float det = g11 * g22 - g12 * g12 + kEpsilon;
+  const float w1 = (g22 * r1 - g12 * r2) / det;
+  const float w2 = (g11 * r2 - g12 * r1) / det;
+  return make_float3(1.0f - w1 - w2, w1, w2);
+}
+
+// Checks whether the point p is inside the triangle (v0, v1, v2).
+// A point is inside the triangle, if all barycentric coordinates
+// wrt the triangle are >= 0 & <= 1.
+// If the triangle is degenerate, aka line or point, then return False.
+//
+// NOTE that this function assumes that p lies in the plane spanned
+// by (v0, v1, v2).
+// TODO(gkioxari) explicitly check whether p is coplanar with (v0, v1, v2)
+// and throw an error if check fails
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//     min_triangle_area: triangles with an area below this value are
+//     treated as points/lines, for which IsInsideTriangle returns False
+//
+// Returns:
+//     inside: bool indicating whether p is inside the triangle
+//
+__device__ inline bool IsInsideTriangle(
+    const float3& p,
+    const float3& v0,
+    const float3& v1,
+    const float3& v2,
+    const double min_triangle_area) {
+  // Degenerate (point- or line-like) triangles never contain a point.
+  if (AreaOfTriangle(v0, v1, v2) < min_triangle_area) {
+    return false;
+  }
+
+  // Inside means every barycentric coordinate lies in [0, 1].
+  const float3 w = BarycentricCoords3Forward(p, v0, v1, v2);
+  const bool x_ok = 0.0f <= w.x && w.x <= 1.0f;
+  const bool y_ok = 0.0f <= w.y && w.y <= 1.0f;
+  const bool z_ok = 0.0f <= w.z && w.z <= 1.0f;
+  return x_ok && y_ok && z_ok;
+}
+
+// Computes the minimum squared Euclidean distance between the point p
+// and the segment spanned by (v0, v1).
+// To find this we parametrize the segment as: x(t) = v0 + t * (v1 - v0)
+// and find the t in [0, 1] which minimizes (x(t) - p) ^ 2.
+// Note that p does not need to live in the space spanned by (v0, v1)
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1: vec3 coordinates of start and end of segment
+//
+// Returns:
+//     dist: the minimum squared distance of p from segment (v0, v1)
+//
+
+__device__ inline float
+PointLine3DistanceForward(const float3& p, const float3& v0, const float3& v1) {
+  const float3 seg = v1 - v0;
+  const float3 rel = p - v0;
+  const float seg_len2 = dot(seg, seg);
+  // Parameter of the projection of p on the line through v0 and v1. For a
+  // (near) zero-length segment the parameter is fixed to 0, i.e. v0.
+  float s = 0.0f;
+  if (!(seg_len2 < kEpsilon)) {
+    s = dot(rel, seg) / seg_len2;
+  }
+  s = __saturatef(s); // clamps to [0, 1]
+
+  const float3 offset = p - (v0 + s * seg);
+  return dot(offset, offset);
+}
+
+// Gradient of PointLine3DistanceForward, i.e. of the minimum squared
+// Euclidean distance between the point p and the line segment (v0, v1).
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1: vec3 coordinates of start and end of segment
+//     grad_dist: Float of the gradient wrt dist
+//
+// Returns:
+//    tuple of gradients for the point and line segment (v0, v1):
+//      (float3 grad_p, float3 grad_v0, float3 grad_v1)
+
+__device__ inline thrust::tuple<float3, float3, float3>
+PointLine3DistanceBackward(
+    const float3& p,
+    const float3& v0,
+    const float3& v1,
+    const float& grad_dist) {
+  const float3 seg = v1 - v0;
+  const float3 rel = p - v0;
+  const float seg_len2 = dot(seg, seg);
+  const float proj = dot(seg, rel);
+  // Unclamped segment parameter; only consulted when seg_len2 >= kEpsilon.
+  const float s = proj / seg_len2;
+  float3 grad_p = make_float3(0.0f, 0.0f, 0.0f);
+  float3 grad_v0 = make_float3(0.0f, 0.0f, 0.0f);
+  float3 grad_v1 = make_float3(0.0f, 0.0f, 0.0f);
+
+  if (seg_len2 < kEpsilon) {
+    // v0 == v1: dist is treated as 0.5 * dot(pv0, pv0) + 0.5 * dot(pv1, pv1),
+    // so both end points share the gradient equally.
+    grad_p = grad_dist * 2.0f * rel;
+    grad_v0 = -0.5f * grad_p;
+    grad_v1 = grad_v0;
+  } else if (s < 0.0f) {
+    // Closest point is v0; v1 gets no gradient.
+    grad_p = grad_dist * 2.0f * rel;
+    grad_v0 = -1.0f * grad_p;
+  } else if (s > 1.0f) {
+    // Closest point is v1; v0 gets no gradient.
+    grad_p = grad_dist * 2.0f * (p - v1);
+    grad_v1 = -1.0f * grad_p;
+  } else {
+    // Closest point is the projection of p within the segment (0 <= s <= 1).
+    const float3 foot = v0 + s * seg;
+    const float3 base = grad_dist * 2.0f * (p - foot);
+    const float along = dot(base, seg);
+    grad_p = base - along * seg / seg_len2;
+    const float3 ds_dv0 = (-1.0f * seg - rel + 2.0f * s * seg) / seg_len2;
+    grad_v0 = (-1.0f + s) * base - along * ds_dv0;
+    const float3 ds_dv1 = (rel - 2.0f * s * seg) / seg_len2;
+    grad_v1 = -along * ds_dv1 - s * base;
+  }
+
+  return thrust::make_tuple(grad_p, grad_v0, grad_v1);
+}
+
+// Squared distance between a 3D point p and the triangle (v0, v1, v2).
+//
+// p is first projected onto the plane of the triangle. When that projection
+// falls inside the triangle (and the triangle normal is not degenerate), the
+// result is the squared distance between p and the projection. Otherwise it
+// is the smallest squared distance from p to the three edge segments.
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//     min_triangle_area: triangles with an area below this value are
+//     treated as points/lines, for which IsInsideTriangle returns False
+//
+// Returns:
+//     dist: Float of the squared distance
+//
+
+__device__ inline float PointTriangle3DistanceForward(
+    const float3& p,
+    const float3& v0,
+    const float3& v1,
+    const float3& v2,
+    const double min_triangle_area) {
+  // Un-normalized triangle normal, its length, and the unit normal.
+  float3 raw_normal = cross(v2 - v0, v1 - v0);
+  const float raw_len = norm(raw_normal);
+  const float3 unit = normalize(raw_normal);
+
+  // Signed offset t along the unit normal such that foot = p + t * unit lies
+  // in the plane of the triangle, i.e. (foot - v0) is orthogonal to unit.
+  const float t = dot(v0 - p, unit);
+  const float3 foot = p + t * unit;
+
+  // The plane distance only applies when the projection is inside.
+  const bool inside = IsInsideTriangle(foot, v0, v1, v2, min_triangle_area);
+  if (inside && (raw_len > kEpsilon)) {
+    // The projection falls inside the triangle, so the distance is
+    // norm(foot - p)^2 = t^2.
+    return t * t;
+  }
+
+  // Otherwise the result is the squared distance to the nearest edge.
+  const float d01 = PointLine3DistanceForward(p, v0, v1);
+  const float d02 = PointLine3DistanceForward(p, v0, v2);
+  const float d12 = PointLine3DistanceForward(p, v1, v2);
+
+  float dist = (d01 > d02) ? d02 : d01;
+  dist = (dist > d12) ? d12 : dist;
+  return dist;
+}
+
+// Backward pass of PointTriangle3DistanceForward: gradient of the squared
+// distance between the point p and the triangle (v0, v1, v2).
+//
+// Args:
+//     p: xyz coordinates of a point
+//     v0, v1, v2: xyz coordinates of the triangle vertices
+//     grad_dist: Float of the gradient wrt dist
+//     min_triangle_area: triangles with an area below this value are
+//     treated as points/lines, for which IsInsideTriangle returns False
+//
+// Returns:
+//     tuple of gradients for the point and triangle:
+//        (float3 grad_p, float3 grad_v0, float3 grad_v1, float3 grad_v2)
+//
+
+__device__ inline thrust::tuple<float3, float3, float3, float3>
+PointTriangle3DistanceBackward(
+    const float3& p,
+    const float3& v0,
+    const float3& v1,
+    const float3& v2,
+    const float& grad_dist,
+    const double min_triangle_area) {
+  const float3 e2 = v2 - v0;
+  const float3 e1 = v1 - v0;
+  const float3 to_v0 = v0 - p;
+  float3 raw_normal = cross(e2, e1);
+  const float raw_len = norm(raw_normal);
+  float3 unit = normalize(raw_normal);
+
+  // foot is the projection of p on the plane spanned by (v0, v1, v2),
+  // i.e. foot = p + t * unit, s.t. (foot - v0) is orthogonal to unit.
+  const float t = dot(v0 - p, unit);
+  const float3 offset = t * unit;
+  const float3 foot = p + t * unit;
+
+  const bool inside = IsInsideTriangle(foot, v0, v1, v2, min_triangle_area);
+
+  float3 grad_p = make_float3(0.0f, 0.0f, 0.0f);
+  float3 grad_v0 = make_float3(0.0f, 0.0f, 0.0f);
+  float3 grad_v1 = make_float3(0.0f, 0.0f, 0.0f);
+  float3 grad_v2 = make_float3(0.0f, 0.0f, 0.0f);
+
+  if (inside && (raw_len > kEpsilon)) {
+    // Plane case (dist = t * t). Gradient wrt p:
+    grad_p = -2.0f * grad_dist * t * unit;
+    // ... wrt the unit normal:
+    const float3 g_unit = 2.0f * grad_dist * t * (to_v0 + offset);
+    // ... wrt the un-normalized normal:
+    const float3 g_raw = normalize_backward(raw_normal, g_unit);
+    // ... wrt the two edge vectors fed to the cross product:
+    const auto g_cross = cross_backward(e2, e1, g_raw);
+    const float3 g_e2 = thrust::get<0>(g_cross);
+    const float3 g_e1 = thrust::get<1>(g_cross);
+    // v0 enters through t directly and (negatively) through both edges.
+    grad_v0 = grad_dist * 2.0f * t * unit - (g_e2 + g_e1);
+    grad_v1 = g_e1;
+    grad_v2 = g_e2;
+  } else {
+    // Edge case: differentiate the distance to the nearest edge only.
+    const float d01 = PointLine3DistanceForward(p, v0, v1);
+    const float d02 = PointLine3DistanceForward(p, v0, v2);
+    const float d12 = PointLine3DistanceForward(p, v1, v2);
+    if ((d01 <= d02) && (d01 <= d12)) {
+      // nearest edge is (v0, v1)
+      const auto g = PointLine3DistanceBackward(p, v0, v1, grad_dist);
+      grad_p = thrust::get<0>(g);
+      grad_v0 = thrust::get<1>(g);
+      grad_v1 = thrust::get<2>(g);
+    } else if ((d02 <= d01) && (d02 <= d12)) {
+      // nearest edge is (v0, v2)
+      const auto g = PointLine3DistanceBackward(p, v0, v2, grad_dist);
+      grad_p = thrust::get<0>(g);
+      grad_v0 = thrust::get<1>(g);
+      grad_v2 = thrust::get<2>(g);
+    } else if ((d12 <= d01) && (d12 <= d02)) {
+      // nearest edge is (v1, v2)
+      const auto g = PointLine3DistanceBackward(p, v1, v2, grad_dist);
+      grad_p = thrust::get<0>(g);
+      grad_v1 = thrust::get<1>(g);
+      grad_v2 = thrust::get<2>(g);
+    }
+  }
+
+  return thrust::make_tuple(grad_p, grad_v0, grad_v1, grad_v2);
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.h b/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad9f7ff3f34dde7b119ea708eb0901cb826794d7
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/utils/geometry_utils.h
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <algorithm>
+#include <tuple>
+#include <type_traits>
+#include "vec2.h"
+#include "vec3.h"
+
+// Epsilon (a double literal) guarding against float errors and division by 0.
+const auto kEpsilon = 1e-8;
+
+// Determines whether a point p is on the right side of a 2D line segment
+// given by the end points v0, v1.
+//
+// Args:
+//     p: vec2 Coordinates of a point.
+//     v0, v1: vec2 Coordinates of the end points of the edge.
+//
+// Returns:
+//     area: The signed area of the parallelogram given by the vectors
+//           A = p - v0
+//           B = v1 - v0
+//
+//                 v1 ________
+//                   /\      /
+//               A  /  \    /
+//                 /    \  /
+//             v0 /______\/
+//                   B    p
+//
+//          The area can also be interpreted as the cross product A x B.
+//          If the sign of the area is positive, the point p is on the
+//          right side of the edge. Negative area indicates the point is on
+//          the left side of the edge. i.e. for an edge v1 - v0:
+//
+//                      v1
+//                     /
+//                    /
+//             -     /    +
+//                  /
+//                 /
+//               v0
+//
+template <typename T>
+T EdgeFunctionForward(const vec2<T>& p, const vec2<T>& v0, const vec2<T>& v1) {
+  // 2D cross product (p - v0) x (v1 - v0), i.e. the signed parallelogram area.
+  return (p.x - v0.x) * (v1.y - v0.y) - (p.y - v0.y) * (v1.x - v0.x);
+}
+
+// Backward pass for the edge function: partial derivatives of the edge value
+// with respect to each input point, scaled by the upstream gradient.
+//
+// Args:
+//     p: vec2 Coordinates of a point.
+//     v0, v1: vec2 Coordinates of the end points of the edge.
+//     grad_edge: Upstream gradient for output from edge function.
+//
+// Returns:
+//     tuple of gradients for each of the input points:
+//     (vec2<T> d_edge_dp, vec2<T> d_edge_dv0, vec2<T> d_edge_dv1)
+//
+template <typename T>
+inline std::tuple<vec2<T>, vec2<T>, vec2<T>> EdgeFunctionBackward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const T grad_edge) {
+  // Partial derivatives of the edge value, each scaled by the upstream grad.
+  const vec2<T> grad_p = grad_edge * vec2<T>(v1.y - v0.y, v0.x - v1.x);
+  const vec2<T> grad_v0 = grad_edge * vec2<T>(p.y - v1.y, v1.x - p.x);
+  const vec2<T> grad_v1 = grad_edge * vec2<T>(v0.y - p.y, p.x - v0.x);
+  return std::make_tuple(grad_p, grad_v0, grad_v1);
+}
+
+// Forward pass: barycentric coordinates of a 2D point relative to a triangle.
+// Each coordinate is an edge function of p divided by the triangle's signed
+// area term (offset by kEpsilon). Ref:
+// https://www.scratchapixel.com/lessons/3d-basic-rendering/ray-tracing-rendering-a-triangle/barycentric-coordinates
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the triangle vertices.
+//
+// Returns
+//     bary: (w0, w1, w2) barycentric coordinates; they lie in [0, 1] when p
+//           is inside the triangle.
+template <typename T>
+vec3<T> BarycentricCoordinatesForward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const vec2<T>& v2) {
+  const T area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
+  const T e0 = EdgeFunctionForward(p, v1, v2);
+  const T e1 = EdgeFunctionForward(p, v2, v0);
+  const T e2 = EdgeFunctionForward(p, v0, v1);
+  return vec3<T>(e0 / area, e1 / area, e2 / area);
+}
+
+// The backward pass for computing the barycentric coordinates of a point
+// relative to a triangle.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: (x, y) coordinates of the triangle vertices.
+//     grad_bary_upstream: vec3<T> Upstream gradient for each of the
+//                         barycentric coordinates [grad_w0, grad_w1, grad_w2].
+//
+// Returns
+//    tuple of gradients for the point and each of the triangle vertices:
+//    (vec2<T> grad_p, vec2<T> grad_v0, vec2<T> grad_v1, vec2<T> grad_v2)
+//
+template <typename T>
+inline std::tuple<vec2<T>, vec2<T>, vec2<T>, vec2<T>> BarycentricCoordsBackward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const vec2<T>& v2,
+    const vec3<T>& grad_bary_upstream) {
+  const T area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
+  const T area2 = pow(area, 2.0f);
+  const T area_inv = 1.0f / area;
+  const T e0 = EdgeFunctionForward(p, v1, v2);
+  const T e1 = EdgeFunctionForward(p, v2, v0);
+  const T e2 = EdgeFunctionForward(p, v0, v1);
+
+  const T grad_w0 = grad_bary_upstream.x;
+  const T grad_w1 = grad_bary_upstream.y;
+  const T grad_w2 = grad_bary_upstream.z;
+
+  // Calculate component of the gradient from each of w0, w1 and w2.
+  // e.g. for w0 = e0 / area: dloss/dv = dl/dw0 * dw0/de0 * de0/dv
+  //                                   + dl/dw0 * dw0/darea * darea/dv
+  // EdgeFunctionBackward returns its gradients in call-argument order.
+  const T dw0_darea = -e0 / (area2);
+  const T dw0_e0 = area_inv;
+  const T dloss_d_w0area = grad_w0 * dw0_darea;
+  const T dloss_e0 = grad_w0 * dw0_e0;
+  auto de0_dv = EdgeFunctionBackward(p, v1, v2, dloss_e0);
+  auto dw0area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w0area);
+  const vec2<T> dw0_p = std::get<0>(de0_dv);
+  const vec2<T> dw0_dv0 = std::get<1>(dw0area_dv);
+  const vec2<T> dw0_dv1 = std::get<1>(de0_dv) + std::get<2>(dw0area_dv);
+  const vec2<T> dw0_dv2 = std::get<2>(de0_dv) + std::get<0>(dw0area_dv);
+
+  const T dw1_darea = -e1 / (area2);
+  const T dw1_e1 = area_inv;
+  const T dloss_d_w1area = grad_w1 * dw1_darea;
+  const T dloss_e1 = grad_w1 * dw1_e1;
+  auto de1_dv = EdgeFunctionBackward(p, v2, v0, dloss_e1);
+  auto dw1area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w1area);
+  const vec2<T> dw1_p = std::get<0>(de1_dv);
+  const vec2<T> dw1_dv0 = std::get<2>(de1_dv) + std::get<1>(dw1area_dv);
+  const vec2<T> dw1_dv1 = std::get<2>(dw1area_dv);
+  const vec2<T> dw1_dv2 = std::get<1>(de1_dv) + std::get<0>(dw1area_dv);
+
+  const T dw2_darea = -e2 / (area2);
+  const T dw2_e2 = area_inv;
+  const T dloss_d_w2area = grad_w2 * dw2_darea;
+  const T dloss_e2 = grad_w2 * dw2_e2;
+  auto de2_dv = EdgeFunctionBackward(p, v0, v1, dloss_e2);
+  auto dw2area_dv = EdgeFunctionBackward(v2, v0, v1, dloss_d_w2area);
+  const vec2<T> dw2_p = std::get<0>(de2_dv);
+  const vec2<T> dw2_dv0 = std::get<1>(de2_dv) + std::get<1>(dw2area_dv);
+  const vec2<T> dw2_dv1 = std::get<2>(de2_dv) + std::get<2>(dw2area_dv);
+  const vec2<T> dw2_dv2 = std::get<0>(dw2area_dv);
+
+  const vec2<T> dbary_p = dw0_p + dw1_p + dw2_p;
+  const vec2<T> dbary_dv0 = dw0_dv0 + dw1_dv0 + dw2_dv0;
+  const vec2<T> dbary_dv1 = dw0_dv1 + dw1_dv1 + dw2_dv1;
+  const vec2<T> dbary_dv2 = dw0_dv2 + dw1_dv2 + dw2_dv2;
+
+  return std::make_tuple(dbary_p, dbary_dv0, dbary_dv1, dbary_dv2);
+}
+
+// Perspective-corrects screen-space barycentric coordinates (forward pass).
+//
+// Args:
+//     bary: Screen-space barycentric coordinates for a point
+//     z0, z1, z2: Camera-space z-coordinates of the triangle vertices
+//
+// Returns
+//     Perspective-corrected barycentric coordinates
+//
+template <typename T>
+inline vec3<T> BarycentricPerspectiveCorrectionForward(
+    const vec3<T>& bary,
+    const T z0,
+    const T z1,
+    const T z2) {
+  // Weight each coordinate by the z-values of the two other vertices.
+  const T num_x = bary.x * z1 * z2;
+  const T num_y = bary.y * z0 * z2;
+  const T num_z = bary.z * z0 * z1;
+
+  // Normalize by the sum of the weighted terms, floored at kEpsilon.
+  const T total = std::max<T>(num_x + num_y + num_z, kEpsilon);
+  return vec3<T>(num_x / total, num_y / total, num_z / total);
+}
+
+// Backward pass of the perspective correction of barycentric coordinates.
+//
+// Args:
+//     bary: Screen-space barycentric coordinates for a point
+//     z0, z1, z2: Camera-space z-coordinates of the triangle vertices
+//     grad_out: Upstream gradient of the loss with respect to the corrected
+//               barycentric coordinates.
+//
+// Returns a tuple of:
+//      grad_bary: Downstream gradient of the loss with respect to the
+//                 uncorrected barycentric coordinates.
+//      grad_z0, grad_z1, grad_z2: Downstream gradient of the loss with respect
+//                                 to the z-coordinates of the triangle verts
+template <typename T>
+inline std::tuple<vec3<T>, T, T, T> BarycentricPerspectiveCorrectionBackward(
+    const vec3<T>& bary,
+    const T z0,
+    const T z1,
+    const T z2,
+    const vec3<T>& grad_out) {
+  // Forward quantities needed by the derivative.
+  const T num_x = bary.x * z1 * z2;
+  const T num_y = bary.y * z0 * z2;
+  const T num_z = bary.z * z0 * z1;
+  const T total = std::max<T>(num_x + num_y + num_z, kEpsilon);
+
+  // Every output is divided by `total`, so its gradient sums all three terms.
+  const T d_total_num =
+      -num_x * grad_out.x - num_y * grad_out.y - num_z * grad_out.z;
+  const T d_total = d_total_num / (total * total);
+  const T d_num_x = d_total + grad_out.x / total;
+  const T d_num_y = d_total + grad_out.y / total;
+  const T d_num_z = d_total + grad_out.z / total;
+  // Chain through the numerators to the barycentric inputs ...
+  const vec3<T> grad_bary(
+      d_num_x * z1 * z2, d_num_y * z0 * z2, d_num_z * z0 * z1);
+  // ... and to the z-coordinates.
+  const T grad_z0 = d_num_y * bary.y * z2 + d_num_z * bary.z * z1;
+  const T grad_z1 = d_num_x * bary.x * z2 + d_num_z * bary.z * z0;
+  const T grad_z2 = d_num_x * bary.x * z1 + d_num_y * bary.y * z0;
+  return std::make_tuple(grad_bary, grad_z0, grad_z1, grad_z2);
+}
+
+// Clip negative barycentric coordinates to 0.0 and renormalize so
+// the barycentric coordinates for a point sum to 1. When the blur_radius
+// is greater than 0, a face will still be recorded as overlapping a pixel
+// if the pixel is outside the face. In this case at least one of the
+// barycentric coordinates for the pixel relative to the face will be negative.
+// Clipping will ensure that the texture and z buffer are interpolated
+// correctly.
+//
+//  Args
+//     bary: (w0, w1, w2) barycentric coordinates which can contain values < 0.
+//
+//  Returns
+//     bary: (w0, w1, w2) barycentric coordinates in the range [0, 1] which
+//           satisfy the condition: sum(w0, w1, w2) = 1.0.
+//
+template <typename T>
+vec3<T> BarycentricClipForward(const vec3<T> bary) {
+  // T-typed constants: std::max deduces for any T, nothing narrows to float.
+  const T zero = static_cast<T>(0);
+  const T min_sum = static_cast<T>(1e-5);
+  // Only clamp negative values to 0.0; values > 1.0 will be renormalized.
+  vec3<T> w(
+      std::max(bary.x, zero), std::max(bary.y, zero), std::max(bary.z, zero));
+  // Floor the sum so that an all-clamped input does not divide by zero.
+  const T w_sum = std::max(w.x + w.y + w.z, min_sum);
+  w.x /= w_sum;
+  w.y /= w_sum;
+  w.z /= w_sum;
+  return w;
+}
+
+// Backward pass for barycentric coordinate clipping.
+//
+//  Args
+//     bary: (w0, w1, w2) barycentric coordinates which can contain values < 0.
+//     grad_baryclip_upstream: vec3<T> Upstream gradient for each of the clipped
+//                         barycentric coordinates [grad_w0, grad_w1, grad_w2].
+//
+// Returns
+//    vec3<T> of gradients for the unclipped barycentric coordinates:
+//    (grad_w0, grad_w1, grad_w2)
+//
+template <typename T>
+vec3<T> BarycentricClipBackward(
+    const vec3<T> bary,
+    const vec3<T> grad_baryclip_upstream) {
+  // Constants are typed as T so std::max deduces for any T and the sum is
+  // not narrowed to float when T is double.
+  const T zero = static_cast<T>(0);
+  const T one = static_cast<T>(1);
+  const T min_sum = static_cast<T>(1e-5);
+
+  // Redo the forward clamp: w_i = max(bary_i, 0).
+  const T wx = std::max(bary.x, zero);
+  const T wy = std::max(bary.y, zero);
+  const T wz = std::max(bary.z, zero);
+  T w_sum = wx + wy + wz;
+
+  // If the forward pass floored the sum at min_sum, the normalizer was a
+  // constant there, so no gradient flows through the sum.
+  T grad_sum_clip = one;
+  if (w_sum < min_sum) {
+    grad_sum_clip = zero;
+    w_sum = min_sum;
+  }
+
+  // A coordinate that was clamped gets no gradient. Only negative values
+  // are clamped to 0.0.
+  const T clip_x = (bary.x < zero) ? zero : one;
+  const T clip_y = (bary.y < zero) ? zero : one;
+  const T clip_z = (bary.z < zero) ? zero : one;
+
+  // Gradients of the sum: d(w_i / w_sum) / d(w_sum) = -w_i / w_sum^2.
+  // A plain product replaces pow(w_sum, 2.0f) so the result does not depend
+  // on which pow overload an unqualified call happens to select.
+  const T sum_sq = w_sum * w_sum;
+  const T gs_x = -wx / sum_sq * grad_sum_clip;
+  const T gs_y = -wy / sum_sq * grad_sum_clip;
+  const T gs_z = -wz / sum_sq * grad_sum_clip;
+  const T inv_sum = one / w_sum;
+
+  // Gradients for each of the bary coordinates including the cross terms
+  // from the sum.
+  const vec3<T>& up = grad_baryclip_upstream;
+  vec3<T> grad_bary(one, one, one);
+  grad_bary.x = clip_x *
+      (up.x * (inv_sum + gs_x) +
+       up.y * gs_y +
+       up.z * gs_z);
+  grad_bary.y = clip_y *
+      (up.y * (inv_sum + gs_y) +
+       up.x * gs_x +
+       up.z * gs_z);
+  grad_bary.z = clip_z *
+      (up.z * (inv_sum + gs_z) +
+       up.x * gs_x +
+       up.y * gs_y);
+
+  return grad_bary;
+}
+
+// Calculate minimum distance between a line segment (v1 - v0) and point p.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1: Coordinates of the end points of the line segment.
+//
+// Returns:
+//     squared distance of the point to the line.
+//
+// Consider the line extending the segment - this can be parameterized as:
+// v0 + t (v1 - v0).
+//
+// First find the projection of point p onto the line. It falls where:
+// t = [(p - v0) . (v1 - v0)] / |v1 - v0|^2
+// where . is the dot product.
+//
+// The parameter t is clamped from [0, 1] to handle points outside the
+// segment (v1 - v0).
+//
+// Once the projection of the point on the segment is known, the distance from
+// p to the projection gives the minimum distance to the segment.
+//
+template <typename T>
+T PointLineDistanceForward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1) {
+  const vec2<T> v1v0 = v1 - v0;
+  const T l2 = dot(v1v0, v1v0);
+  if (l2 <= kEpsilon) {
+    return dot(p - v1, p - v1); // Degenerate segment: distance to v1.
+  }
+  // Clamp the projection parameter to [0, 1]. std::min<T>/std::max<T> are
+  // explicit so the bounds convert to T and deduction works for any T.
+  const T tt = std::min<T>(std::max<T>(dot(v1v0, p - v0) / l2, 0), 1);
+  const vec2<T> p_proj = v0 + tt * v1v0;
+  return dot(p - p_proj, p - p_proj);
+}
+
+template <typename T>
+T PointLine3DistanceForward(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1) {
+  const vec3<T> v1v0 = v1 - v0;
+  const T l2 = dot(v1v0, v1v0);
+  if (l2 <= kEpsilon) {
+    return dot(p - v1, p - v1); // Degenerate segment: distance to v1.
+  }
+  // Clamp the projection parameter to [0, 1]. std::min<T>/std::max<T> are
+  // explicit so the bounds convert to T and deduction works for any T.
+  const T tt = std::min<T>(std::max<T>(dot(v1v0, p - v0) / l2, 0), 1);
+  const vec3<T> p_proj = v0 + tt * v1v0;
+  return dot(p - p_proj, p - p_proj);
+}
+
+// Backward pass for point to line distance in 2D.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1: Coordinates of the end points of the line segment.
+//     grad_dist: Upstream gradient for the distance.
+//
+// Returns:
+//    tuple of gradients for each of the input points:
+//      (vec2<T> grad_p, vec2<T> grad_v0, vec2<T> grad_v1)
+//
+template <typename T>
+inline std::tuple<vec2<T>, vec2<T>, vec2<T>> PointLineDistanceBackward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const T& grad_dist) {
+  const vec2<T> v1v0 = v1 - v0;
+  const vec2<T> pv0 = p - v0;
+  const T t_bot = dot(v1v0, v1v0);
+  const T t_top = dot(v1v0, pv0);
+  // For a degenerate segment the forward pass returns |p - v1|^2; t = 1
+  // reproduces that case and avoids dividing by ~0 (NaN/inf gradients).
+  const T t = (t_bot <= kEpsilon) ? static_cast<T>(1) : t_top / t_bot;
+  const T tt = std::min<T>(std::max<T>(t, 0), 1);
+  const vec2<T> p_proj = (1.0f - tt) * v0 + tt * v1;
+
+  const vec2<T> grad_v0 = grad_dist * (1.0f - tt) * 2.0f * (p_proj - p);
+  const vec2<T> grad_v1 = grad_dist * tt * 2.0f * (p_proj - p);
+  const vec2<T> grad_p = -1.0f * grad_dist * 2.0f * (p_proj - p);
+  return std::make_tuple(grad_p, grad_v0, grad_v1);
+}
+
+template <typename T>
+std::tuple<vec3<T>, vec3<T>, vec3<T>> PointLine3DistanceBackward(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1,
+    const T& grad_dist) {
+  // Gradients of the squared point-to-segment distance, split by where the
+  // closest point on the segment lies.
+  const vec3<T> seg = v1 - v0;
+  const vec3<T> rel = p - v0;
+  const T seg_len2 = dot(seg, seg);
+  const T s = dot(seg, rel) / seg_len2;
+  const vec3<T> zeros{0.0f, 0.0f, 0.0f};
+
+  if (seg_len2 < kEpsilon) {
+    // Degenerate segment (v0 == v1): dist is treated as
+    // 0.5 * dot(pv0, pv0) + 0.5 * dot(pv1, pv1), so v0 and v1 each take
+    // half of the (negated) gradient of p.
+    const vec3<T> grad_p = grad_dist * 2.0f * rel;
+    const vec3<T> grad_half = -0.5f * grad_p;
+    return std::make_tuple(grad_p, grad_half, grad_half);
+  }
+  if (s < 0.0f) {
+    // Closest point is v0; no gradients wrt v1.
+    const vec3<T> grad_p = grad_dist * 2.0f * rel;
+    return std::make_tuple(grad_p, -1.0f * grad_p, zeros);
+  }
+  if (s > 1.0f) {
+    // Closest point is v1; no gradients wrt v0.
+    const vec3<T> grad_p = grad_dist * 2.0f * (p - v1);
+    return std::make_tuple(grad_p, zeros, -1.0f * grad_p);
+  }
+
+  // The projection falls within the segment: differentiate through it.
+  const vec3<T> p_proj = v0 + s * seg;
+  const vec3<T> grad_base = grad_dist * 2.0f * (p - p_proj);
+  const T along = dot(grad_base, seg);
+  const vec3<T> grad_p = grad_base - along * seg / seg_len2;
+  const vec3<T> ds_dv0 = (-1.0f * seg - rel + 2.0f * s * seg) / seg_len2;
+  const vec3<T> ds_dv1 = (rel - 2.0f * s * seg) / seg_len2;
+  const vec3<T> grad_v0 = (-1.0f + s) * grad_base - along * ds_dv0;
+  const vec3<T> grad_v1 = -along * ds_dv1 - s * grad_base;
+  return std::make_tuple(grad_p, grad_v0, grad_v1);
+}
+
+// The forward pass for calculating the shortest distance between a point
+// and a triangle.
+// Ref: https://www.randygaul.net/2014/07/23/distance-point-to-line-segment/
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the three triangle vertices.
+//
+// Returns:
+//     shortest squared distance from a point to a triangle.
+//
+//
+template <typename T>
+T PointTriangleDistanceForward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const vec2<T>& v2) {
+  // Squared distance from p to each of the three edges.
+  const T d01 = PointLineDistanceForward(p, v0, v1);
+  const T d02 = PointLineDistanceForward(p, v0, v2);
+  const T d12 = PointLineDistanceForward(p, v1, v2);
+
+  // The triangle distance is the smallest of the three.
+  const T nearest_v0_edge = std::min(d01, d02);
+  return std::min(nearest_v0_edge, d12);
+}
+
+// Backward pass for point triangle distance.
+//
+// The forward distance is the minimum over the three edges, so only the
+// closest edge contributes: its PointLineDistanceBackward result is returned.
+//
+// Args:
+//     p: Coordinates of a point.
+//     v0, v1, v2: Coordinates of the three triangle vertices.
+//     grad_dist: Upstream gradient for the distance.
+//
+// Returns:
+//    tuple of gradients for the point and each of the triangle vertices:
+//      (vec2<T> grad_p, vec2<T> grad_v0, vec2<T> grad_v1, vec2<T> grad_v2)
+//    The gradient of the vertex that is not on the closest edge is left at
+//    its initial value of (0, 0).
+//
+template <typename T>
+inline std::tuple<vec2<T>, vec2<T>, vec2<T>, vec2<T>>
+PointTriangleDistanceBackward(
+    const vec2<T>& p,
+    const vec2<T>& v0,
+    const vec2<T>& v1,
+    const vec2<T>& v2,
+    const T& grad_dist) {
+  // Squared distance from p to every edge, recomputed to find the edge
+  // that produced the forward result.
+  const T d01 = PointLineDistanceForward(p, v0, v1);
+  const T d02 = PointLineDistanceForward(p, v0, v2);
+  const T d12 = PointLineDistanceForward(p, v1, v2);
+
+  // Everything starts at zero; the selected branch overwrites three of them.
+  vec2<T> grad_p(0.0f, 0.0f);
+  vec2<T> grad_v0(0.0f, 0.0f);
+  vec2<T> grad_v1(0.0f, 0.0f);
+  vec2<T> grad_v2(0.0f, 0.0f);
+
+  // Ties resolve in the order (v0, v1), (v0, v2), (v1, v2). If no test
+  // holds (only possible when a comparison involves NaN), all stay zero.
+  if (d01 <= d02 && d01 <= d12) {
+    // Closest edge is v1 - v0.
+    std::tie(grad_p, grad_v0, grad_v1) =
+        PointLineDistanceBackward(p, v0, v1, grad_dist);
+  } else if (d02 <= d01 && d02 <= d12) {
+    // Closest edge is v2 - v0.
+    std::tie(grad_p, grad_v0, grad_v2) =
+        PointLineDistanceBackward(p, v0, v2, grad_dist);
+  } else if (d12 <= d01 && d12 <= d02) {
+    // Closest edge is v2 - v1.
+    std::tie(grad_p, grad_v1, grad_v2) =
+        PointLineDistanceBackward(p, v1, v2, grad_dist);
+  }
+
+  return std::make_tuple(grad_p, grad_v0, grad_v1, grad_v2);
+}
+
+// Computes the area of a triangle (v0, v1, v2).
+// Args:
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//
+// Returns:
+//     area: float: the area of the triangle
+//
+template <typename T>
+T AreaOfTriangle(const vec3<T>& v0, const vec3<T>& v1, const vec3<T>& v2) {
+  const vec3<T> p0 = v1 - v0;
+  const vec3<T> p1 = v2 - v0;
+
+  // Norm of the cross product (p0 x p1), i.e. twice the area, via nested
+  // std::hypot. Held in T (not float) so double inputs are not narrowed.
+  const T cross_norm = std::hypot(
+      p0.y * p1.z - p0.z * p1.y,
+      std::hypot(p0.z * p1.x - p0.x * p1.z, p0.x * p1.y - p0.y * p1.x));
+  return cross_norm / 2.0;
+}
+
+// Computes the squared distance of a point p relative to a triangle (v0, v1,
+// v2). If the point's projection p0 on the plane spanned by (v0, v1, v2) is
+// inside the triangle with vertices (v0, v1, v2), then the returned value is
+// the squared distance of p to its projection p0. Otherwise, the returned value
+// is the smallest squared distance of p from the line segments (v0, v1), (v0,
+// v2) and (v1, v2).
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//
+// Returns:
+//     dist: Float of the squared distance
+//
+
+const float vEpsilon = 1e-8; // Added to vector norms before dividing by them.
+
+template <typename T>
+vec3<T> BarycentricCoords3Forward(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1,
+    const vec3<T>& v2) {
+  // Work relative to v0: the two edges leaving v0 and the offset of p.
+  const vec3<T> edge1 = v1 - v0;
+  const vec3<T> edge2 = v2 - v0;
+  const vec3<T> rel = p - v0;
+  // Pairwise dot products of the edges and of the offset with each edge.
+  const T e11 = dot(edge1, edge1);
+  const T e12 = dot(edge1, edge2);
+  const T e22 = dot(edge2, edge2);
+  const T r1 = dot(rel, edge1);
+  const T r2 = dot(rel, edge2);
+  // Solve for (w1, w2); kEpsilon is added to the determinant before dividing.
+  const T det = e11 * e22 - e12 * e12 + kEpsilon;
+  const T w1 = (e22 * r1 - e12 * r2) / det;
+  const T w2 = (e11 * r2 - e12 * r1) / det;
+  const T w0 = 1.0f - w1 - w2;
+  return vec3<T>(w0, w1, w2);
+}
+
+// Checks whether the point p is inside the triangle (v0, v1, v2).
+// A point is inside the triangle, if all barycentric coordinates
+// wrt the triangle are >= 0 & <= 1.
+// If the triangle is degenerate, aka line or point, then return False.
+//
+// NOTE that this function assumes that p lives on the space spanned
+// by (v0, v1, v2).
+// TODO(gkioxari) explicitly check whether p is coplanar with (v0, v1, v2)
+// and throw an error if check fails
+//
+// Args:
+//     p: vec3 coordinates of a point
+//     v0, v1, v2: vec3 coordinates of the triangle vertices
+//     min_triangle_area: triangles less than this size are considered
+//     points/lines, IsInsideTriangle returns False
+//
+// Returns:
+//     inside: bool indicating whether p is inside triangle
+//
+template <typename T>
+static bool IsInsideTriangle(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1,
+    const vec3<T>& v2,
+    const double min_triangle_area) {
+  // Triangles below the area threshold are treated as points/lines and
+  // never contain p.
+  if (AreaOfTriangle(v0, v1, v2) < min_triangle_area) {
+    return false;
+  }
+  // Inside means every barycentric coordinate of p lies in [0, 1].
+  const vec3<T> w = BarycentricCoords3Forward(p, v0, v1, v2);
+  const bool x_ok = 0.0f <= w.x && w.x <= 1.0f;
+  const bool y_ok = 0.0f <= w.y && w.y <= 1.0f;
+  const bool z_ok = 0.0f <= w.z && w.z <= 1.0f;
+  return x_ok && y_ok && z_ok;
+}
+
+template <typename T>
+T PointTriangle3DistanceForward(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1,
+    const vec3<T>& v2,
+    const double min_triangle_area) {
+  // Unit normal of the triangle's plane; vEpsilon is added to the norm so
+  // a vanishing cross product does not divide by zero.
+  vec3<T> normal = cross(v2 - v0, v1 - v0);
+  const T norm_normal = norm(normal);
+  normal = normal / (norm_normal + vEpsilon);
+
+  // p0 is the projection of p on the plane spanned by (v0, v1, v2)
+  // i.e. p0 = p + t * normal, s.t. (p0 - v0) is orthogonal to normal
+  const T t = dot(v0 - p, normal);
+  const vec3<T> p0 = p + t * normal;
+
+  if (IsInsideTriangle(p0, v0, v1, v2, min_triangle_area) &&
+      (norm_normal > kEpsilon)) {
+    // if projection p0 is inside triangle spanned by (v0, v1, v2)
+    // then distance is equal to norm(p0 - p)^2
+    return t * t;
+  }
+
+  // Otherwise use the smallest squared distance to the three edges. The
+  // edge distances are held in T (not float) so that double inputs are not
+  // narrowed, matching PointTriangle3DistanceBackward.
+  const T e01 = PointLine3DistanceForward(p, v0, v1);
+  const T e02 = PointLine3DistanceForward(p, v0, v2);
+  const T e12 = PointLine3DistanceForward(p, v1, v2);
+  const T e_min = (e01 > e02) ? e02 : e01;
+  return (e_min > e12) ? e12 : e_min;
+}
+
+template <typename T>
+std::tuple<vec3<T>, vec3<T>>
+cross_backward(const vec3<T>& a, const vec3<T>& b, const vec3<T>& grad_cross) {
+  // Gradient wrt a; components stay in T (not float) so double is not narrowed.
+  const vec3<T> grad_a(
+      -grad_cross.y * b.z + grad_cross.z * b.y,
+      grad_cross.x * b.z - grad_cross.z * b.x,
+      -grad_cross.x * b.y + grad_cross.y * b.x);
+  // Gradient wrt b: the same pattern with a in place of b and signs flipped.
+  const vec3<T> grad_b(
+      grad_cross.y * a.z - grad_cross.z * a.y,
+      -grad_cross.x * a.z + grad_cross.z * a.x,
+      grad_cross.x * a.y - grad_cross.y * a.x);
+  return std::make_tuple(grad_a, grad_b);
+}
+
+template <typename T>
+vec3<T> normalize_backward(const vec3<T>& a, const vec3<T>& grad_normz) {
+  // Uses d out_i / d a_j = (delta_ij - out_i * out_j) / a_norm, out = a / a_norm.
+  const T a_norm = norm(a) + vEpsilon; // T, not float: no narrowing for double
+  const vec3<T> out = a / a_norm;
+  const T grad_ax = grad_normz.x * (1.0f - out.x * out.x) / a_norm +
+      grad_normz.y * (-out.x * out.y) / a_norm +
+      grad_normz.z * (-out.x * out.z) / a_norm;
+  const T grad_ay = grad_normz.x * (-out.x * out.y) / a_norm +
+      grad_normz.y * (1.0f - out.y * out.y) / a_norm +
+      grad_normz.z * (-out.y * out.z) / a_norm;
+  const T grad_az = grad_normz.x * (-out.x * out.z) / a_norm +
+      grad_normz.y * (-out.y * out.z) / a_norm +
+      grad_normz.z * (1.0f - out.z * out.z) / a_norm;
+  return vec3<T>(grad_ax, grad_ay, grad_az);
+}
+
+// The backward pass for computing the squared distance of a point
+// to the triangle (v0, v1, v2).
+//
+// Args:
+//     p: xyz coordinates of a point
+//     v0, v1, v2: xyz coordinates of the triangle vertices
+//     grad_dist: Float of the gradient wrt dist
+//     min_triangle_area: triangles less than this size are considered
+//     points/lines, IsInsideTriangle returns False
+//
+// Returns:
+//     tuple of gradients for the point and triangle:
+//        (float3 grad_p, float3 grad_v0, float3 grad_v1, float3 grad_v2)
+//
+
+template <typename T>
+static std::tuple<vec3<T>, vec3<T>, vec3<T>, vec3<T>>
+PointTriangle3DistanceBackward(
+    const vec3<T>& p,
+    const vec3<T>& v0,
+    const vec3<T>& v1,
+    const vec3<T>& v2,
+    const T& grad_dist,
+    const double min_triangle_area) {
+  // Recompute the forward quantities: the two edges leaving v0 and the raw
+  // and normalized plane normal.
+  const vec3<T> edge_v2 = v2 - v0;
+  const vec3<T> edge_v1 = v1 - v0;
+  const vec3<T> to_v0 = v0 - p;
+  const vec3<T> raw_normal = cross(edge_v2, edge_v1);
+  const T norm_normal = norm(raw_normal);
+  const vec3<T> normal = raw_normal / (norm_normal + vEpsilon);
+
+  // p0 is the projection of p on the plane spanned by (v0, v1, v2)
+  // i.e. p0 = p + t * normal, s.t. (p0 - v0) is orthogonal to normal
+  const T t = dot(v0 - p, normal);
+  const vec3<T> offset = t * normal;
+  const vec3<T> p0 = p + offset;
+
+  vec3<T> grad_p(0.0f, 0.0f, 0.0f);
+  vec3<T> grad_v0(0.0f, 0.0f, 0.0f);
+  vec3<T> grad_v1(0.0f, 0.0f, 0.0f);
+  vec3<T> grad_v2(0.0f, 0.0f, 0.0f);
+
+  const bool use_plane =
+      IsInsideTriangle(p0, v0, v1, v2, min_triangle_area) &&
+      (norm_normal > kEpsilon);
+  if (use_plane) {
+    // Same case as the forward pass's dist = t * t.
+    // derivative of dist wrt p
+    grad_p = -2.0f * grad_dist * t * normal;
+    // derivative of dist wrt normal
+    const vec3<T> grad_normal = 2.0f * grad_dist * t * (to_v0 + offset);
+    // derivative of dist wrt raw_normal
+    const vec3<T> grad_raw_normal = normalize_backward(raw_normal, grad_normal);
+    // derivative of dist wrt the two edges (v2 - v0) and (v1 - v0)
+    const auto grad_edges = cross_backward(edge_v2, edge_v1, grad_raw_normal);
+    const vec3<T> grad_edge_v2 = std::get<0>(grad_edges);
+    const vec3<T> grad_edge_v1 = std::get<1>(grad_edges);
+    // v0 enters through t directly and, negated, through both edges.
+    grad_v0 =
+        grad_dist * 2.0f * t * normal - (grad_edge_v2 + grad_edge_v1);
+    grad_v1 = grad_edge_v1;
+    grad_v2 = grad_edge_v2;
+  } else {
+    // Otherwise use the gradient of the closest edge's squared distance.
+    const T e01 = PointLine3DistanceForward(p, v0, v1);
+    const T e02 = PointLine3DistanceForward(p, v0, v2);
+    const T e12 = PointLine3DistanceForward(p, v1, v2);
+
+    if ((e01 <= e02) && (e01 <= e12)) {
+      // e01 is smallest
+      std::tie(grad_p, grad_v0, grad_v1) =
+          PointLine3DistanceBackward(p, v0, v1, grad_dist);
+    } else if ((e02 <= e01) && (e02 <= e12)) {
+      // e02 is smallest
+      std::tie(grad_p, grad_v0, grad_v2) =
+          PointLine3DistanceBackward(p, v0, v2, grad_dist);
+    } else if ((e12 <= e01) && (e12 <= e02)) {
+      // e12 is smallest
+      std::tie(grad_p, grad_v1, grad_v2) =
+          PointLine3DistanceBackward(p, v1, v2, grad_dist);
+    }
+  }
+
+  return std::make_tuple(grad_p, grad_v0, grad_v1, grad_v2);
+}
diff --git a/pytorch3d_simplified/pytorch3d/csrc/utils/warp_reduce.cuh b/pytorch3d_simplified/pytorch3d/csrc/utils/warp_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3c903019debf5db594a6c71e1296ccd764991736
--- /dev/null
+++ b/pytorch3d_simplified/pytorch3d/csrc/utils/warp_reduce.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <cstdio>
+
+// Helper functions WarpReduceMin and WarpReduceMax used in .cu files
+// Starting in Volta, instructions are no longer synchronous within a warp.
+// We need to call __syncwarp() to sync the 32 threads in the warp
+// instead of all the threads in the block.
+
+template <typename scalar_t>
+__device__ void
+WarpReduceMin(scalar_t* min_dists, int64_t* min_idxs, const size_t tid) {
+  // s = 32: if slot tid + 32 holds a smaller distance, copy its (dist, idx).
+  if (min_dists[tid] > min_dists[tid + 32]) {
+    min_idxs[tid] = min_idxs[tid + 32];
+    min_dists[tid] = min_dists[tid + 32];
+  }
+  __syncwarp();
+  // s = 16: the same fold with the stride halved; repeated down to s = 1.
+  if (min_dists[tid] > min_dists[tid + 16]) {
+    min_idxs[tid] = min_idxs[tid + 16];
+    min_dists[tid] = min_dists[tid + 16];
+  }
+  __syncwarp();
+  // s = 8: __syncwarp() separates each step from the next one.
+  if (min_dists[tid] > min_dists[tid + 8]) {
+    min_idxs[tid] = min_idxs[tid + 8];
+    min_dists[tid] = min_dists[tid + 8];
+  }
+  __syncwarp();
+  // s = 4
+  if (min_dists[tid] > min_dists[tid + 4]) {
+    min_idxs[tid] = min_idxs[tid + 4];
+    min_dists[tid] = min_dists[tid + 4];
+  }
+  __syncwarp();
+  // s = 2
+  if (min_dists[tid] > min_dists[tid + 2]) {
+    min_idxs[tid] = min_idxs[tid + 2];
+    min_dists[tid] = min_dists[tid + 2];
+  }
+  __syncwarp();
+  // s = 1: last step. Ties keep the entry already in slot tid (strict >).
+  if (min_dists[tid] > min_dists[tid + 1]) {
+    min_idxs[tid] = min_idxs[tid + 1];
+    min_dists[tid] = min_dists[tid + 1];
+  }
+  __syncwarp();
+}
+
+template <typename scalar_t>
+__device__ void WarpReduceMax(
+    volatile scalar_t* dists, // values compared; slot tid keeps the larger one
+    volatile int64_t* dists_idx, // index paired with each entry of dists
+    const size_t tid) { // caller's slot; slots up to tid + 32 are read
+  if (dists[tid] < dists[tid + 32]) { // s = 32: take the larger (dist, idx)
+    dists[tid] = dists[tid + 32];
+    dists_idx[tid] = dists_idx[tid + 32];
+  }
+  __syncwarp(); // sync the warp before the next, halved, stride
+  if (dists[tid] < dists[tid + 16]) { // s = 16
+    dists[tid] = dists[tid + 16];
+    dists_idx[tid] = dists_idx[tid + 16];
+  }
+  __syncwarp();
+  if (dists[tid] < dists[tid + 8]) { // s = 8
+    dists[tid] = dists[tid + 8];
+    dists_idx[tid] = dists_idx[tid + 8];
+  }
+  __syncwarp();
+  if (dists[tid] < dists[tid + 4]) { // s = 4
+    dists[tid] = dists[tid + 4];
+    dists_idx[tid] = dists_idx[tid + 4];
+  }
+  __syncwarp();
+  if (dists[tid] < dists[tid + 2]) { // s = 2
+    dists[tid] = dists[tid + 2];
+    dists_idx[tid] = dists_idx[tid + 2];
+  }
+  __syncwarp();
+  if (dists[tid] < dists[tid + 1]) { // s = 1: ties keep slot tid (strict <)
+    dists[tid] = dists[tid + 1];
+    dists_idx[tid] = dists_idx[tid + 1];
+  }
+  __syncwarp();
+}