dxm21 commited on
Commit
8c48cce
·
verified ·
1 Parent(s): 0325522

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes CHANGED
@@ -1,35 +1,26 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
3
+ 2403.20309v6.pdf filter=lfs diff=lfs merge=lfs -text
4
+ 2601.09499v1.pdf filter=lfs diff=lfs merge=lfs -text
5
+ gs/training_progress.mp4 filter=lfs diff=lfs merge=lfs -text
6
+ vdpm/examples/videos/camel.mp4 filter=lfs diff=lfs merge=lfs -text
7
+ vdpm/examples/videos/car.mp4 filter=lfs diff=lfs merge=lfs -text
8
+ vdpm/examples/videos/figure1.mp4 filter=lfs diff=lfs merge=lfs -text
9
+ vdpm/examples/videos/figure2.mp4 filter=lfs diff=lfs merge=lfs -text
10
+ vdpm/examples/videos/figure3.mp4 filter=lfs diff=lfs merge=lfs -text
11
+ vdpm/examples/videos/goldfish.mp4 filter=lfs diff=lfs merge=lfs -text
12
+ vdpm/examples/videos/horse.mp4 filter=lfs diff=lfs merge=lfs -text
13
+ vdpm/examples/videos/paragliding.mp4 filter=lfs diff=lfs merge=lfs -text
14
+ vdpm/examples/videos/pstudio.mp4 filter=lfs diff=lfs merge=lfs -text
15
+ vdpm/examples/videos/stroller.mp4 filter=lfs diff=lfs merge=lfs -text
16
+ vdpm/examples/videos/swing.mp4 filter=lfs diff=lfs merge=lfs -text
17
+ vdpm/examples/videos/tennis.mp4 filter=lfs diff=lfs merge=lfs -text
18
+ vdpm/examples/videos/tesla.mp4 filter=lfs diff=lfs merge=lfs -text
19
+ vdpm/input_images_20260128_014417_015976/images/000000.png filter=lfs diff=lfs merge=lfs -text
20
+ vdpm/input_images_20260128_014417_015976/images/000001.png filter=lfs diff=lfs merge=lfs -text
21
+ vdpm/input_images_20260128_014417_015976/images/000002.png filter=lfs diff=lfs merge=lfs -text
22
+ vdpm/input_images_20260128_014417_015976/images/000003.png filter=lfs diff=lfs merge=lfs -text
23
+ vdpm/input_images_20260128_014417_015976/output_4d.npz filter=lfs diff=lfs merge=lfs -text
24
+ vdpm/input_images_20260128_014417_015976/poses.npz filter=lfs diff=lfs merge=lfs -text
25
+ vdpm/input_images_20260128_014417_015976/reconstruction_data.zip filter=lfs diff=lfs merge=lfs -text
26
+ vdpm/input_images_20260128_014417_015976/tracks.npz filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # User requested ignores
2
+ output/
3
+ mv-video/
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # Virtual Environments
30
+ venv/
31
+ env/
32
+ ENV/
33
+ env.bak/
34
+ venv.bak/
35
+
36
+ # VS Code
37
+ .vscode/
38
+
39
+ # Gradio
40
+ .gradio/
2403.20309v6.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd8415f171a0353126dcb1029126f48b805c3c24f65706bcc930c55dfc5dcc2e
3
+ size 8417471
2601.09499v1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09bad1eec73fad7ab1cc4d9c4da01305d3c4cebb3094c06924cd4df088065738
3
+ size 13097134
README.md CHANGED
@@ -1,12 +1,113 @@
1
- ---
2
- title: 4dgs Dpm
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.4.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: 4dgs-dpm
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.17.1
6
+ ---
7
+ # DPM-Splat: Video → 4D Gaussian Splats
8
+
9
+ End-to-end pipeline combining **V-DPM** (Video Dynamic Point Maps) with **3D Gaussian Splatting** for dynamic 4D scene reconstruction from multi-view video.
10
+
11
+ ![Pipeline](https://img.shields.io/badge/Pipeline-VDPM%20→%203DGS-blue)
12
+ ![License](https://img.shields.io/badge/License-MIT-green)
13
+
14
+ ## Features
15
+
16
+ - **Feed-forward reconstruction**: No per-scene optimization needed for initial point cloud
17
+ - **Multi-view support**: 1-4 synchronized video inputs
18
+ - **Temporal consistency**: Dynamic point tracking across frames
19
+ - **Memory efficient**: BF16/FP16 quantization, flash attention support
20
+ - **Co-visibility filtering**: Reduces redundant points (InstantSplat-inspired)
21
+ - **Gradio demo**: Easy-to-use web interface
22
+
23
+ ## Demo
24
+
25
+ Run the interactive demo:
26
+ ```bash
27
+ python app.py
28
+ ```
29
+
30
+ Or try the hosted version on [Hugging Face Spaces](https://huggingface.co/spaces/YOUR_USERNAME/dpm-splat)
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ # Create environment
36
+ conda create -n 4dgs-dpm python=3.10
37
+ conda activate 4dgs-dpm
38
+
39
+ # Install PyTorch with CUDA
40
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
41
+
42
+ # Install dependencies
43
+ pip install -r requirements.txt
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ### Web Interface (Recommended)
49
+ ```bash
50
+ python app.py
51
+ ```
52
+ Upload videos, adjust settings, and download results as ZIP.
53
+
54
+ ### Command Line
55
+ ```bash
56
+ # Run VDPM inference
57
+ python vdpm/visualise.py --input mv-video/your-video --output output/vdpm
58
+
59
+ # Train 3DGS from VDPM output
60
+ python -m gs.train_vdpm --input output/vdpm --output output/splats --iterations 1000
61
+ ```
62
+
63
+ ## Pipeline
64
+
65
+ 1. **Video Processing**: Extract and interleave frames from multi-view videos
66
+ 2. **VDPM Inference**: Generate dynamic point maps and camera poses using VGGT backbone
67
+ 3. **3DGS Training**: Train per-frame Gaussian splats initialized from point maps
68
+ 4. **Animation Rendering**: Generate GIF from interpolated camera viewpoint
69
+
70
+ ## Output
71
+
72
+ The pipeline generates:
73
+ - `splats/frame_XXXX.ply` - Gaussian splat for each timestep
74
+ - `renders/` - Training progress images
75
+ - `animation.gif` - Rendered animation from average camera
76
+ - `tracks.npz` - 3D point tracks
77
+ - `poses.npz` - Camera poses
78
+
79
+ ## Requirements
80
+
81
+ - NVIDIA GPU with 8GB+ VRAM (tested on RTX 3070 Ti)
82
+ - CUDA 11.8+
83
+ - Python 3.10+
84
+
85
+ ## TO-DO
86
+
87
+ - [x] VGGT Quantization (BF16/FP16)
88
+ - [x] Co-visibility check to reduce points
89
+ - [x] Dynamic point tracking
90
+ - [x] Per-frame 3DGS training
91
+ - [x] Gradio demo with GIF rendering
92
+ - [ ] Flash Attention for VGGT
93
+ - [ ] Dynamic/Static segmentation
94
+ - [ ] 3DGS with dynamic deformation field
95
+ - [ ] 4DGS primitive support
96
+
97
+ ## Citation
98
+
99
+ ```bibtex
100
+ @misc{dpmsplat2026,
101
+ title={DPM-Splat: Video to 4D Gaussian Splats via Dynamic Point Maps},
102
+ author={Your Name},
103
+ year={2026},
104
+ url={https://github.com/YOUR_USERNAME/4dgs-dpm}
105
+ }
106
+ ```
107
+
108
+ ## Acknowledgements
109
+
110
+ - [VGGT](https://github.com/facebookresearch/vggt) - Visual Geometry Grounded Transformer
111
+ - [3D Gaussian Splatting](https://github.com/graphdeco-inria/gaussian-splatting)
112
+ - [NVIDIA Warp](https://github.com/NVIDIA/warp)
113
+
app.py ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DPM-Splat: End-to-end pipeline for Video → 4D Gaussian Splats
3
+ Combines VDPM inference with 3DGS training in a single Gradio interface.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import shutil
9
+ import zipfile
10
+ import gc
11
+ import json
12
+ import glob
13
+ import time
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+
17
+ import cv2
18
+ import numpy as np
19
+ import gradio as gr
20
+ import torch
21
+ import imageio
22
+
23
+ # Set memory optimization
24
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
25
+
26
+ # Add paths
27
+ sys.path.insert(0, str(Path(__file__).parent / "vdpm"))
28
+ sys.path.insert(0, str(Path(__file__).parent / "gs"))
29
+
30
+ # Check GPU availability
31
+ device = "cuda" if torch.cuda.is_available() else "cpu"
32
+
33
+ if device == "cuda":
34
+ torch.backends.cuda.matmul.allow_tf32 = True
35
+ torch.backends.cudnn.allow_tf32 = True
36
+ gpu_name = torch.cuda.get_device_name(0)
37
+ gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
38
+ print(f"✓ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
39
+ else:
40
+ print("⚠ No GPU detected - running on CPU (will be slow)")
41
+
42
+ # Configuration
43
+ VIDEO_SAMPLE_HZ = 1.0
44
+ MAX_FRAMES = 8 if device == "cuda" else 4
45
+
46
+ # Global model cache
47
+ _vdpm_model = None
48
+
49
+
50
def get_vdpm_model():
    """Load the VDPM model, downloading its weights on first use.

    The instance is cached in the module-level ``_vdpm_model`` global, so
    repeated calls within one process reuse the same model. Weights are
    cached on disk under ``~/.cache/vdpm``. On CUDA devices the model is
    cast to BF16 (compute capability >= 8) or FP16 to reduce VRAM usage.

    Returns:
        The VDPM model in eval mode, placed on the global ``device``.
    """
    global _vdpm_model

    if _vdpm_model is not None:
        print("✓ Using cached VDPM model")
        return _vdpm_model

    print("Loading VDPM model...")
    sys.stdout.flush()

    # Deferred imports: hydra and the VDPM package are heavy and only
    # needed on the first load.
    from hydra import compose, initialize
    from hydra.core.global_hydra import GlobalHydra
    from dpm.model import VDPM

    # Hydra refuses to re-initialize; clear any prior global state first.
    if GlobalHydra.instance().is_initialized():
        GlobalHydra.instance().clear()

    with initialize(config_path="vdpm/configs"):
        cfg = compose(config_name="visualise")

    model = VDPM(cfg).to(device)

    # Load weights (downloaded once, then served from the local cache).
    cache_dir = os.path.expanduser("~/.cache/vdpm")
    os.makedirs(cache_dir, exist_ok=True)
    model_path = os.path.join(cache_dir, "vdpm_model.pt")

    _URL = "https://huggingface.co/edgarsucar/vdpm/resolve/main/model.pt"

    if not os.path.exists(model_path):
        # Fix: dropped the stray f-prefix on a string with no placeholders.
        print("Downloading VDPM model...")
        sd = torch.hub.load_state_dict_from_url(_URL, file_name="vdpm_model.pt", progress=True, map_location=device)
        torch.save(sd, model_path)
    else:
        print(f"✓ Loading cached model from {model_path}")
        sd = torch.load(model_path, map_location=device)

    model.load_state_dict(sd, strict=True)
    model.eval()

    # Half precision: BF16 where supported (Ampere+), otherwise FP16.
    if device == "cuda":
        if torch.cuda.get_device_capability()[0] >= 8:
            model = model.to(torch.bfloat16)
            print("✓ Using BF16 precision")
        else:
            model = model.half()
            print("✓ Using FP16 precision")

    _vdpm_model = model
    return model
102
+
103
+
104
def process_videos(video_files, target_dir):
    """Decode the uploaded videos and write interleaved frames to disk.

    Frames are sampled at roughly ``VIDEO_SAMPLE_HZ`` per video and written
    as ``images/NNNNNN.png`` in round-robin (view-interleaved) order. A
    ``meta.json`` recording the number of views is written alongside.

    Args:
        video_files: Uploaded file objects (or path-likes), one per view.
        target_dir: Run directory (``Path``) to populate.

    Returns:
        Tuple of (list of saved frame paths, number of views).
    """
    images_dir = target_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    num_views = len(video_files)
    readers = []
    sample_steps = []

    for upload in video_files:
        src = upload.name if hasattr(upload, 'name') else str(upload)
        cap = cv2.VideoCapture(src)
        fps = float(cap.get(cv2.CAP_PROP_FPS) or 30.0)
        # Keep one frame out of every `step` to hit the target sample rate.
        step = max(int(fps / max(VIDEO_SAMPLE_HZ, 1e-6)), 1)
        readers.append(cap)
        sample_steps.append(step)

    # Round-robin across all readers until every stream is exhausted, so
    # saved frames alternate between views.
    frame_num = 0
    step_count = 0
    image_paths = []
    any_alive = True

    while any_alive:
        any_alive = False
        for view, cap in enumerate(readers):
            if not cap.isOpened():
                continue
            ok, frame = cap.read()
            if not ok:
                cap.release()
                continue
            any_alive = True
            if step_count % sample_steps[view] == 0:
                out_path = images_dir / f"{frame_num:06d}.png"
                cv2.imwrite(str(out_path), frame)
                image_paths.append(str(out_path))
                frame_num += 1
        step_count += 1

    for cap in readers:
        if cap.isOpened():
            cap.release()

    # Persist the view count for the downstream inference step.
    with open(target_dir / "meta.json", "w") as f:
        json.dump({"num_views": num_views}, f)

    return image_paths, num_views
154
+
155
+
156
def run_vdpm_inference(target_dir, progress):
    """Run VDPM inference over the extracted frames in ``target_dir``.

    Loads frames from ``target_dir/images``, groups them into (camera, time)
    views, runs the cached VDPM model, and writes ``tracks.npz`` (world
    points + confidences) and, when available, ``poses.npz`` (camera pose
    encodings) back into ``target_dir``.

    Args:
        target_dir: Run directory (``Path``) with ``images/`` and ``meta.json``.
        progress: Gradio progress callback.

    Returns:
        Tuple (num_timesteps, num_views).

    Raises:
        ValueError: If no images are found under ``target_dir/images``.
    """
    from vggt.utils.load_fn import load_and_preprocess_images

    model = get_vdpm_model()

    image_names = sorted(glob.glob(os.path.join(target_dir, "images", "*")))
    if not image_names:
        raise ValueError("No images found")

    # Number of camera views, recorded by process_videos().
    meta_path = target_dir / "meta.json"
    num_views = 1
    if meta_path.exists():
        with open(meta_path) as f:
            num_views = json.load(f).get("num_views", 1)

    # Cap the frame count, rounding down to a multiple of num_views so each
    # timestep keeps a complete set of views.
    if len(image_names) > MAX_FRAMES:
        limit = (MAX_FRAMES // num_views) * num_views
        if limit == 0:
            limit = num_views
        print(f"⚠ Limiting to {limit} frames")
        image_names = image_names[:limit]

    progress(0.15, desc=f"Loading {len(image_names)} images...")
    images = load_and_preprocess_images(image_names).to(device)

    # One view dict per image: frames were interleaved per camera, so the
    # camera index cycles and the time index advances every num_views frames.
    views = []
    for i in range(len(image_names)):
        t_idx = i // num_views
        cam_idx = i % num_views
        views.append({
            "img": images[i].unsqueeze(0),
            "view_idxs": torch.tensor([[cam_idx, t_idx]], device=device, dtype=torch.long)
        })

    progress(0.2, desc="Running VDPM forward pass...")
    print(f"Running inference on {len(image_names)} images...")
    sys.stdout.flush()

    with torch.no_grad():
        # Fix: the original hard-coded autocast('cuda'), which warns and
        # silently disables autocast on CPU-only hosts. Use the actual
        # device and only enable mixed precision on CUDA.
        with torch.amp.autocast(device_type=device, enabled=(device == "cuda")):
            predictions = model.inference(views=views)

    # Extract results and move them off the GPU immediately.
    pts_list = [pm["pts3d"].detach().cpu().numpy() for pm in predictions["pointmaps"]]
    conf_list = [pm["conf"].detach().cpu().numpy() for pm in predictions["pointmaps"]]

    pose_enc = None
    if "pose_enc" in predictions:
        pose_enc = predictions["pose_enc"].detach().cpu().numpy()

    # Free the prediction graph before the large numpy concatenations.
    del predictions
    torch.cuda.empty_cache()

    world_points_raw = np.concatenate(pts_list, axis=0)
    world_points_conf_raw = np.concatenate(conf_list, axis=0)

    T = world_points_raw.shape[0]
    S = world_points_raw.shape[1]
    num_timesteps = T

    # Multi-view case: each timestep's row contains all views' maps stacked
    # along axis 1; slice out the block belonging to that timestep.
    if num_views > 1 and S == num_views * T:
        world_points_list = []
        world_points_conf_list = []
        for t in range(T):
            start_idx = t * num_views
            end_idx = start_idx + num_views
            world_points_list.append(world_points_raw[t, start_idx:end_idx])
            world_points_conf_list.append(world_points_conf_raw[t, start_idx:end_idx])
        world_points = np.stack(world_points_list, axis=0)
        world_points_conf = np.stack(world_points_conf_list, axis=0)
    else:
        # Single-view: drop a leading singleton batch dim if present.
        if world_points_raw.ndim == 5 and world_points_raw.shape[0] == 1:
            world_points = world_points_raw[0]
            world_points_conf = world_points_conf_raw[0]
        else:
            world_points = world_points_raw
            world_points_conf = world_points_conf_raw

    progress(0.35, desc="Saving VDPM outputs...")

    # Save outputs for the 3DGS training stage.
    np.savez_compressed(
        target_dir / "tracks.npz",
        world_points=world_points,
        world_points_conf=world_points_conf,
        num_views=num_views,
        num_timesteps=num_timesteps
    )

    if pose_enc is not None:
        np.savez_compressed(target_dir / "poses.npz", pose_enc=pose_enc)

    print(f"✓ VDPM complete: {num_timesteps} timesteps, {num_views} views")
    sys.stdout.flush()

    return num_timesteps, num_views
257
+
258
+
259
def run_3dgs_training(target_dir, output_dir, iterations, conf_threshold, progress):
    """Train one 3D Gaussian splat per timestep from the VDPM outputs.

    For each timestep a ``VDPM3DGSTrainer`` is built from the point maps,
    then optimized for ``iterations`` steps with a manual render → L1 loss →
    backward → Adam update loop implemented on NVIDIA Warp, and the final
    splat is exported as a PLY.

    Args:
        target_dir: Run directory holding the VDPM outputs (tracks/poses).
        output_dir: Directory to write per-frame PLYs and checkpoints into.
        iterations: Optimization steps per frame (0 exports the raw init).
        conf_threshold: Point-confidence percentile filter for the trainer.
        progress: Gradio progress callback (mapped to the 0.4–0.9 range).

    Returns:
        List of PLY file paths, one per trained timestep.
    """
    import warp as wp
    from train_vdpm import load_vdpm_data, VDPM3DGSTrainer
    # Fix: these five imports used to live inside the innermost training
    # loop and were re-executed every iteration; hoisted here (imports are
    # cached, so behavior is identical — this is purely idiomatic).
    from forward import render_gaussians
    from loss import l1_loss, compute_image_gradients
    from backward import backward
    from optimizer import adam_update
    from config import DEVICE

    wp.init()

    data = load_vdpm_data(str(target_dir))
    num_timesteps = data['T']

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    all_ply_files = []

    for frame_idx in range(num_timesteps):
        # Training covers the 0.4–0.9 span of the overall pipeline progress.
        frame_progress = 0.4 + (0.5 * frame_idx / num_timesteps)
        progress(frame_progress, desc=f"Training frame {frame_idx + 1}/{num_timesteps}...")

        print(f"\n{'='*50}")
        print(f"[Frame {frame_idx + 1}/{num_timesteps}]")
        print(f"{'='*50}")
        sys.stdout.flush()

        trainer = VDPM3DGSTrainer(
            data=data,
            frame_idx=frame_idx,
            output_path=str(output_path),
            conf_threshold=conf_threshold
        )

        print(f"Training for {iterations} iterations...")
        sys.stdout.flush()

        trainer.save(0)  # Initial state

        for it in range(iterations):
            trainer.zero_grad()

            # Optimize against a randomly chosen training view each step.
            cam_idx = np.random.randint(len(trainer.cameras))
            camera = trainer.cameras[cam_idx]
            target = trainer.images[cam_idx]

            # Forward rasterization; intermediate buffers are kept for the
            # backward pass below.
            rendered, depth, trainer.intermediate_buffers = render_gaussians(
                background=np.array(trainer.config['background_color'], dtype=np.float32),
                means3D=trainer.params['positions'].numpy(),
                colors=None,
                opacity=trainer.params['opacities'].numpy(),
                scales=trainer.params['scales'].numpy(),
                rotations=trainer.params['rotations'].numpy(),
                scale_modifier=1.0,
                viewmatrix=camera['world_to_camera'],
                projmatrix=camera['full_proj_matrix'],
                tan_fovx=camera['tan_fovx'],
                tan_fovy=camera['tan_fovy'],
                image_height=camera['height'],
                image_width=camera['width'],
                sh=trainer.params['shs'].numpy(),
                degree=3,
                campos=camera['camera_center'],
                prefiltered=False,
                antialiasing=True,
            )

            target_wp = wp.array(target.astype(np.float32), dtype=wp.vec3, device=DEVICE)
            loss = l1_loss(rendered, target_wp)
            trainer.losses.append(loss)

            # Per-pixel dL/dpixel buffer (pure L1; SSIM term disabled).
            pixel_grad_buffer = compute_image_gradients(rendered, target_wp, lambda_dssim=0)

            view_matrix = wp.mat44(camera['world_to_camera'].flatten())
            proj_matrix = wp.mat44(camera['full_proj_matrix'].flatten())
            campos = wp.vec3(camera['camera_center'][0], camera['camera_center'][1], camera['camera_center'][2])

            # Repackage the rasterizer's intermediates into the buffer dicts
            # expected by backward().
            geom_buffer = {
                'radii': trainer.intermediate_buffers['radii'],
                'means2D': trainer.intermediate_buffers['points_xy_image'],
                'conic_opacity': trainer.intermediate_buffers['conic_opacity'],
                'rgb': trainer.intermediate_buffers['colors'],
                'clamped': trainer.intermediate_buffers['clamped_state']
            }
            binning_buffer = {'point_list': trainer.intermediate_buffers['point_list']}
            img_buffer = {
                'ranges': trainer.intermediate_buffers['ranges'],
                'final_Ts': trainer.intermediate_buffers['final_Ts'],
                'n_contrib': trainer.intermediate_buffers['n_contrib']
            }

            gradients = backward(
                background=np.array(trainer.config['background_color'], dtype=np.float32),
                means3D=trainer.params['positions'],
                dL_dpixels=pixel_grad_buffer,
                opacity=trainer.params['opacities'],
                shs=trainer.params['shs'],
                scales=trainer.params['scales'],
                rotations=trainer.params['rotations'],
                scale_modifier=trainer.config['scale_modifier'],
                viewmatrix=view_matrix,
                projmatrix=proj_matrix,
                tan_fovx=camera['tan_fovx'],
                tan_fovy=camera['tan_fovy'],
                image_height=camera['height'],
                image_width=camera['width'],
                campos=campos,
                radii=trainer.intermediate_buffers['radii'],
                means2D=trainer.intermediate_buffers['points_xy_image'],
                conic_opacity=trainer.intermediate_buffers['conic_opacity'],
                rgb=trainer.intermediate_buffers['colors'],
                cov3Ds=trainer.intermediate_buffers['cov3Ds'],
                clamped=trainer.intermediate_buffers['clamped_state'],
                geom_buffer=geom_buffer,
                binning_buffer=binning_buffer,
                img_buffer=img_buffer,
                degree=trainer.config['sh_degree'],
                debug=False
            )

            wp.copy(trainer.grads['positions'], gradients['dL_dmean3D'])
            wp.copy(trainer.grads['scales'], gradients['dL_dscale'])
            wp.copy(trainer.grads['rotations'], gradients['dL_drot'])
            wp.copy(trainer.grads['opacities'], gradients['dL_dopacity'])
            wp.copy(trainer.grads['shs'], gradients['dL_dshs'])

            # Exponentially decayed LR (×0.1 over the run); per-parameter
            # multipliers follow below in the adam_update launch.
            lr = 0.001 * (0.1 ** (it / iterations))
            wp.launch(adam_update, dim=trainer.num_points, inputs=[
                trainer.params['positions'], trainer.params['scales'],
                trainer.params['rotations'], trainer.params['opacities'], trainer.params['shs'],
                trainer.grads['positions'], trainer.grads['scales'],
                trainer.grads['rotations'], trainer.grads['opacities'], trainer.grads['shs'],
                trainer.adam_m['positions'], trainer.adam_m['scales'],
                trainer.adam_m['rotations'], trainer.adam_m['opacities'], trainer.adam_m['shs'],
                trainer.adam_v['positions'], trainer.adam_v['scales'],
                trainer.adam_v['rotations'], trainer.adam_v['opacities'], trainer.adam_v['shs'],
                trainer.num_points, lr, lr*5, lr*5, lr*2, lr*5,
                0.9, 0.999, 1e-8, it
            ])

            # Progress logging
            if (it + 1) % 100 == 0:
                print(f"  Iter {it+1}/{iterations} | Loss: {loss:.4f}")
                sys.stdout.flush()

            # Checkpoints
            if (it + 1) % 500 == 0 or it == iterations - 1:
                trainer.save(it + 1)

        ply_path = trainer.save_final()
        all_ply_files.append(str(ply_path))
        print(f"✓ Frame {frame_idx} complete: {ply_path}")
        sys.stdout.flush()

    return all_ply_files
418
+
419
+
420
def render_animation_gif(ply_files, data, output_path, progress, fps=10):
    """Render the per-frame splats into a looping GIF.

    A single virtual camera is used for all frames: the mean of every
    estimated camera center, paired with the first camera's orientation and
    intrinsics. Each PLY is rasterized against a white background and the
    frames are assembled with imageio.

    Args:
        ply_files: List of PLY file paths for each frame
        data: VDPM data dict with camera info
        output_path: Path to save the GIF
        progress: Gradio progress callback
        fps: Frames per second for GIF
    """
    import warp as wp
    from forward import render_gaussians
    from utils.point_cloud_utils import load_ply
    from utils.math_utils import projection_matrix
    from train_vdpm import decode_poses

    if not ply_files:
        return None

    print("Rendering animation GIF...")
    sys.stdout.flush()

    images = data['images']
    img_H, img_W = images.shape[1:3]

    # Recover cameras from pose encodings; fall back to identity extrinsics
    # with a crude focal estimate when no poses were produced.
    pose_enc = data.get('pose_enc')
    if pose_enc is not None:
        extrinsics, intrinsics = decode_poses(pose_enc, (img_H, img_W))
    else:
        N = data['T'] * data['V']
        extrinsics = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))
        fx = fy = max(img_H, img_W)
        K = np.array([[fx, 0, img_W/2], [0, fy, img_H/2], [0, 0, 1]], dtype=np.float32)
        intrinsics = np.tile(K, (N, 1, 1))

    # Average camera center over every view (camera center = -R^T t).
    centers = [-E[:3, :3].T @ E[:3, 3] for E in extrinsics]
    avg_center = np.mean(centers, axis=0)

    # Borrow the first camera's orientation and intrinsics.
    R = extrinsics[0][:3, :3]
    intrinsic = intrinsics[0]
    fx, fy = intrinsic[0, 0], intrinsic[1, 1]

    # Translation placing that orientation at the averaged center.
    t = -R @ avg_center

    # Camera matrices, transposed for the Warp/OpenGL convention.
    world_to_camera = np.eye(4, dtype=np.float32)
    world_to_camera[:3, :3] = R
    world_to_camera[:3, 3] = t
    world_to_camera = world_to_camera.T

    fov_x = 2 * np.arctan(img_W / (2 * fx))
    fov_y = 2 * np.arctan(img_H / (2 * fy))

    proj_matrix = projection_matrix(fovx=fov_x, fovy=fov_y, znear=0.01, zfar=100.0).T
    full_proj_matrix = world_to_camera @ proj_matrix

    tan_fovx = np.tan(fov_x / 2)
    tan_fovy = np.tan(fov_y / 2)

    background = np.array([1.0, 1.0, 1.0], dtype=np.float32)  # White background
    rendered_frames = []

    for i, ply_path in enumerate(ply_files):
        if not Path(ply_path).exists():
            continue

        progress(0.9 + 0.05 * (i / len(ply_files)), desc=f"Rendering GIF frame {i+1}/{len(ply_files)}...")

        # Load this frame's splat parameters from disk.
        splat = load_ply(ply_path)

        rendered, _, _ = render_gaussians(
            background=background,
            means3D=splat['positions'],
            colors=None,
            opacity=splat['opacities'],
            scales=splat['scales'],
            rotations=splat['rotations'],
            scale_modifier=1.0,
            viewmatrix=world_to_camera,
            projmatrix=full_proj_matrix,
            tan_fovx=tan_fovx,
            tan_fovy=tan_fovy,
            image_height=img_H,
            image_width=img_W,
            sh=splat['shs'],
            degree=3,
            campos=avg_center,
            prefiltered=False,
            antialiasing=True,
        )

        # Warp buffer → uint8 RGB frame.
        frame = wp.to_torch(rendered).cpu().numpy()
        rendered_frames.append(np.clip(frame * 255, 0, 255).astype(np.uint8))

    if not rendered_frames:
        return None

    gif_path = Path(output_path)
    imageio.mimsave(str(gif_path), rendered_frames, fps=fps, loop=0)
    print(f"✓ Animation GIF saved: {gif_path}")
    sys.stdout.flush()

    return str(gif_path)
548
+
549
+
550
def run_pipeline(video_files, iterations, conf_threshold, progress=gr.Progress()):
    """Run the full VDPM → 3DGS pipeline"""

    def _banner(title):
        # Section banner to keep the console log scannable.
        print("=" * 50)
        print(title)
        print("=" * 50)
        sys.stdout.flush()

    if not video_files:
        return None, None, None, "❌ Please upload video file(s)"

    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = Path(f"output/pipeline/run_{timestamp}")
    run_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Step 1: Process videos
        progress(0.05, desc="Processing uploaded videos...")
        _banner("Processing Videos")

        image_paths, num_views = process_videos(video_files, run_dir)
        print(f"✓ Extracted {len(image_paths)} frames from {num_views} videos")
        sys.stdout.flush()

        # Step 2: VDPM inference
        progress(0.1, desc="Running VDPM inference...")
        _banner("Running VDPM Inference")

        num_timesteps, num_views = run_vdpm_inference(run_dir, progress)

        # Drop the cached model so 3DGS training has the VRAM to itself.
        global _vdpm_model
        _vdpm_model = None
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
            print(f"✓ Cleared VRAM: {torch.cuda.memory_allocated()/1024**3:.2f} GB in use")
            sys.stdout.flush()

        # Step 3: 3DGS training
        progress(0.4, desc="Training 3D Gaussian Splats...")
        _banner("Training 3D Gaussian Splats")

        splat_dir = run_dir / "splats"
        all_ply_files = run_3dgs_training(
            run_dir, splat_dir, int(iterations), float(conf_threshold), progress
        )

        # Step 4: Render animation GIF from average camera
        progress(0.9, desc="Rendering animation GIF...")
        _banner("Rendering Animation GIF")

        gif_path = None
        if all_ply_files:
            from train_vdpm import load_vdpm_data
            data = load_vdpm_data(str(run_dir))
            gif_path = render_animation_gif(
                all_ply_files, data, run_dir / "animation.gif", progress
            )

        # Step 5: Package results
        progress(0.95, desc="Packaging results...")

        zip_path = run_dir / "results.zip"
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            # Trained splats.
            for ply in all_ply_files:
                if ply and Path(ply).exists():
                    zf.write(ply, f"splats/{Path(ply).name}")

            # Checkpoint renders from every frame/iteration.
            for render_dir in splat_dir.glob("frame_*/iter_*"):
                for img in render_dir.glob("*.png"):
                    rel_path = img.relative_to(splat_dir)
                    zf.write(img, f"renders/{rel_path}")

            # VDPM outputs at the archive root.
            for name in ["tracks.npz", "poses.npz", "meta.json"]:
                src = run_dir / name
                if src.exists():
                    zf.write(src, name)

            # Original extracted frames.
            images_dir = run_dir / "images"
            if images_dir.exists():
                for img in images_dir.glob("*"):
                    zf.write(img, f"images/{img.name}")

            # The rendered animation, when available.
            if gif_path and Path(gif_path).exists():
                zf.write(gif_path, "animation.gif")

        progress(1.0, desc="Complete!")

        # First frame's splat is used for the 3D preview widget.
        preview_ply = all_ply_files[0] if all_ply_files else None

        status = f"""✅ Pipeline Complete!

📊 Results:
• {len(all_ply_files)} PLY files generated
• {num_timesteps} timesteps × {num_views} views
• Animation GIF rendered

📁 Output: {run_dir}
📦 Download the ZIP for all files"""

        return preview_ply, str(zip_path), gif_path, status

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, None, f"❌ Error: {str(e)}"
674
+
675
+
676
# ===== Gradio Interface =====
# Two-column layout: inputs/settings on the left, 3D preview + GIF + download
# on the right. Widget creation order defines the on-page layout.
with gr.Blocks(title="DPM-Splat: 4D Gaussian Splatting", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🎬 DPM-Splat: Video → 4D Gaussian Splats

    End-to-end pipeline combining **V-DPM** (Video Dynamic Point Maps) with **3D Gaussian Splatting**.
    Upload multi-view synchronized videos to generate temporally consistent 4D reconstructions.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Multiple uploads = multiple synchronized camera views.
            video_input = gr.File(
                label="📹 Upload Videos",
                file_count="multiple",
                file_types=[".mp4", ".mov", ".avi", ".webm"]
            )

            gr.Markdown("*Upload 1-4 synchronized video files for best results*")

            with gr.Accordion("⚙️ Settings", open=True):
                # Passed through to run_3dgs_training as `iterations`.
                iterations = gr.Slider(
                    minimum=0, maximum=10000, value=1000, step=100,
                    label="Training Iterations",
                    info="0 = export raw point cloud only, more = better quality"
                )
                # Passed through to run_3dgs_training as `conf_threshold`.
                conf_threshold = gr.Slider(
                    minimum=0, maximum=100, value=0, step=5,
                    label="Confidence Threshold (%)",
                    info="0% keeps all points, higher = filter low confidence"
                )

            run_btn = gr.Button("🚀 Run Pipeline", variant="primary", size="lg")

            # Mirrors the status string returned by run_pipeline.
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                lines=6,
                value="Upload videos and click 'Run Pipeline' to begin."
            )

        with gr.Column(scale=2):
            with gr.Row():
                # Shows the first frame's PLY returned by run_pipeline.
                model_viewer = gr.Model3D(
                    label="3D Preview (First Frame)",
                    clear_color=[1.0, 1.0, 1.0, 1.0],
                    height=400
                )
                gif_viewer = gr.Image(
                    label="🎞️ Animation (Average Camera)",
                    height=400
                )
            download_btn = gr.File(label="📦 Download Results (ZIP)")

    gr.Markdown("""
    ---
    ### 📋 Output Contents

    The downloaded ZIP contains:
    - `splats/frame_XXXX.ply` - Gaussian splat for each timestep
    - `renders/` - Training progress images (target vs rendered)
    - `animation.gif` - Rendered animation from average camera
    - `tracks.npz` - 3D point tracks
    - `poses.npz` - Camera poses
    - `images/` - Input frames

    **Local runs**: Results saved to `output/pipeline/run_TIMESTAMP/`
    """)

    # Output order matches run_pipeline's 4-tuple:
    # (preview_ply, zip_path, gif_path, status).
    run_btn.click(
        fn=run_pipeline,
        inputs=[video_input, iterations, conf_threshold],
        outputs=[model_viewer, download_btn, gif_viewer, status_text]
    )

if __name__ == "__main__":
    # queue() serializes GPU jobs; share=True exposes a public Gradio link.
    app.queue().launch(share=True, show_error=True)
gs/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
gs/.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by venv; see https://docs.python.org/3/library/venv.html
2
+
3
+ data/*
4
+ output/*
5
+ lib/*
6
+ lib64/*
7
+ data_/*
8
+ colmap_0/*
9
+ bin/*
10
+ share/*
11
+ __pycache__/*
12
+ utils/__pycache__/*
13
+ .DS_Store
gs/backward.py ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warp as wp
2
+ import math
3
+ from utils.wp_utils import to_warp_array, wp_vec3_mul_element, wp_vec3_add_element, wp_vec3_sqrt, wp_vec3_div_element, wp_vec3_clamp
4
+ from config import * # Assuming TILE_M, TILE_N, VEC6, DEVICE are defined here
5
+
6
+ # Initialize Warp if not already done elsewhere
7
+ # wp.init()
8
+
9
+ # --- Spherical Harmonics Constants ---
10
+ SH_C0 = 0.28209479177387814
11
+ SH_C1 = 0.4886025119029199
12
+
13
+ @wp.func
14
+ def dnormvdv(v: wp.vec3, dv: wp.vec3) -> wp.vec3:
15
+ """
16
+ Computes the gradient of normalize(v) with respect to v, scaled by dv.
17
+ This is a direct port of the CUDA implementation.
18
+
19
+ Args:
20
+ v: The input vector to be normalized
21
+ dv: The gradient vector to scale the result by
22
+
23
+ Returns:
24
+ The gradient vector
25
+ """
26
+ sum2 = v[0] * v[0] + v[1] * v[1] + v[2] * v[2]
27
+
28
+ # Avoid division by zero
29
+ if sum2 < 1e-10:
30
+ return wp.vec3(0.0, 0.0, 0.0)
31
+
32
+ invsum32 = 1.0 / wp.sqrt(sum2 * sum2 * sum2)
33
+
34
+ result = wp.vec3(
35
+ ((sum2 - v[0] * v[0]) * dv[0] - v[1] * v[0] * dv[1] - v[2] * v[0] * dv[2]) * invsum32,
36
+ (-v[0] * v[1] * dv[0] + (sum2 - v[1] * v[1]) * dv[1] - v[2] * v[1] * dv[2]) * invsum32,
37
+ (-v[0] * v[2] * dv[0] - v[1] * v[2] * dv[1] + (sum2 - v[2] * v[2]) * dv[2]) * invsum32
38
+ )
39
+
40
+ return result
41
+
42
+ # --- Backward Kernels ---
43
+ @wp.kernel
44
+ def sh_backward_kernel(
45
+ # --- Inputs ---
46
+ num_points: int, # Number of Gaussian points
47
+ degree: int, # SH degree used in forward
48
+ means: wp.array(dtype=wp.vec3), # 3D positions (N, 3)
49
+ shs: wp.array(dtype=wp.vec3), # Flattened SH coeffs (N * 16, 3)
50
+ radii: wp.array(dtype=int), # Radii computed in forward (N,) - used for skipping
51
+ campos: wp.vec3, # Camera position (3,)
52
+ clamped_state: wp.array(dtype=wp.vec3), # Clamping state {0,1} from forward pass (N, 3)
53
+ dL_dcolor: wp.array(dtype=wp.vec3), # Grad L w.r.t. *final* gaussian color (N, 3)
54
+
55
+ # --- Outputs (Accumulate) ---
56
+ dL_dmeans: wp.array(dtype=wp.vec3), # Accumulate mean grads here (N, 3)
57
+ dL_dshs: wp.array(dtype=wp.vec3) # Accumulate SH grads here (N * 16, 3)
58
+ ):
59
+ idx = wp.tid()
60
+
61
+ if idx >= num_points or radii[idx] <= 0: # Skip if not rendered
62
+ return
63
+
64
+ mean = means[idx]
65
+ base_sh_idx = idx * 16
66
+
67
+ # --- Recompute view direction ---
68
+ dir_orig = mean - campos
69
+ dir_len = wp.length(dir_orig)
70
+ # Skip if direction length is too small (matches CUDA implementation)
71
+ if dir_len < 1e-8:
72
+ return
73
+
74
+ # Normalize direction
75
+ dir = dir_orig / dir_len
76
+ x = dir[0]; y = dir[1]; z = dir[2]
77
+
78
+ # --- Apply clamping mask to input gradient ---
79
+ dL_dRGB = dL_dcolor[idx]
80
+ dL_dRGB = wp_vec3_mul_element(dL_dRGB, wp_vec3_add_element(wp.vec3(1.0, 1.0, 1.0), -1.0 * clamped_state[idx]))
81
+
82
+ # Initialize gradients w.r.t. direction components (dRawColor/ddir)
83
+ dRGBdx = wp.vec3(0.0, 0.0, 0.0)
84
+ dRGBdy = wp.vec3(0.0, 0.0, 0.0)
85
+ dRGBdz = wp.vec3(0.0, 0.0, 0.0)
86
+
87
+ # --- Degree 0 ---
88
+ # Direct assignment for clarity (matching CUDA style)
89
+ dRGBdsh0 = SH_C0
90
+ dL_dshs[base_sh_idx] = dRGBdsh0 * dL_dRGB
91
+
92
+ # --- Degree 1 ---
93
+ if degree > 0:
94
+ sh1 = shs[base_sh_idx + 1]
95
+ sh2 = shs[base_sh_idx + 2]
96
+ sh3 = shs[base_sh_idx + 3]
97
+
98
+ # Exactly match CUDA computation order
99
+ dRGBdsh1 = -SH_C1 * y
100
+ dRGBdsh2 = SH_C1 * z
101
+ dRGBdsh3 = -SH_C1 * x
102
+
103
+ dL_dshs[base_sh_idx + 1] = dRGBdsh1 * dL_dRGB
104
+ dL_dshs[base_sh_idx + 2] = dRGBdsh2 * dL_dRGB
105
+ dL_dshs[base_sh_idx + 3] = dRGBdsh3 * dL_dRGB
106
+
107
+ # Gradient components w.r.t. direction
108
+ dRGBdx = -SH_C1 * sh3
109
+ dRGBdy = -SH_C1 * sh1
110
+ dRGBdz = SH_C1 * sh2
111
+ # --- Degree 2 ---
112
+ if degree > 1:
113
+ xx = x*x; yy = y*y; zz = z*z
114
+ xy = x*y; yz = y*z; xz = x*z
115
+
116
+ sh4 = shs[base_sh_idx + 4]; sh5 = shs[base_sh_idx + 5]
117
+ sh6 = shs[base_sh_idx + 6]; sh7 = shs[base_sh_idx + 7]
118
+ sh8 = shs[base_sh_idx + 8]
119
+
120
+ # Hardcoded C2 values (same as CUDA SH_C2)
121
+ C2_0 = 1.0925484305920792
122
+ C2_1 = -1.0925484305920792
123
+ C2_2 = 0.31539156525252005
124
+ C2_3 = -1.0925484305920792
125
+ C2_4 = 0.5462742152960396
126
+
127
+ # Compute gradients for degree 2 (matching CUDA)
128
+ dRGBdsh4 = C2_0 * xy
129
+ dRGBdsh5 = C2_1 * yz
130
+ dRGBdsh6 = C2_2 * (2.0 * zz - xx - yy)
131
+ dRGBdsh7 = C2_3 * xz
132
+ dRGBdsh8 = C2_4 * (xx - yy)
133
+
134
+ dL_dshs[base_sh_idx + 4] = dRGBdsh4 * dL_dRGB
135
+ dL_dshs[base_sh_idx + 5] = dRGBdsh5 * dL_dRGB
136
+ dL_dshs[base_sh_idx + 6] = dRGBdsh6 * dL_dRGB
137
+ dL_dshs[base_sh_idx + 7] = dRGBdsh7 * dL_dRGB
138
+ dL_dshs[base_sh_idx + 8] = dRGBdsh8 * dL_dRGB
139
+
140
+ # Accumulate gradients w.r.t. direction (exactly matching CUDA)
141
+ dRGBdx += C2_0 * y * sh4 + C2_2 * 2.0 * -x * sh6 + C2_3 * z * sh7 + C2_4 * 2.0 * x * sh8
142
+ dRGBdy += C2_0 * x * sh4 + C2_1 * z * sh5 + C2_2 * 2.0 * -y * sh6 + C2_4 * 2.0 * -y * sh8
143
+ dRGBdz += C2_1 * y * sh5 + C2_2 * 2.0 * 2.0 * z * sh6 + C2_3 * x * sh7
144
+
145
+ # --- Degree 3 ---
146
+ if degree > 2:
147
+ sh9 = shs[base_sh_idx + 9]; sh10 = shs[base_sh_idx + 10]
148
+ sh11 = shs[base_sh_idx + 11]; sh12 = shs[base_sh_idx + 12]
149
+ sh13 = shs[base_sh_idx + 13]; sh14 = shs[base_sh_idx + 14]
150
+ sh15 = shs[base_sh_idx + 15]
151
+
152
+ # Hardcoded C3 values (same as CUDA SH_C3)
153
+ C3_0 = -0.5900435899266435
154
+ C3_1 = 2.890611442640554
155
+ C3_2 = -0.4570457994644658
156
+ C3_3 = 0.3731763325901154
157
+ C3_4 = -0.4570457994644658
158
+ C3_5 = 1.445305721320277
159
+ C3_6 = -0.5900435899266435
160
+
161
+ # Direct computation of degree 3 gradients (matching CUDA)
162
+ dRGBdsh9 = C3_0 * y * (3.0 * xx - yy)
163
+ dRGBdsh10 = C3_1 * xy * z
164
+ dRGBdsh11 = C3_2 * y * (4.0 * zz - xx - yy)
165
+ dRGBdsh12 = C3_3 * z * (2.0 * zz - 3.0 * xx - 3.0 * yy)
166
+ dRGBdsh13 = C3_4 * x * (4.0 * zz - xx - yy)
167
+ dRGBdsh14 = C3_5 * z * (xx - yy)
168
+ dRGBdsh15 = C3_6 * x * (xx - 3.0 * yy)
169
+
170
+ dL_dshs[base_sh_idx + 9] = dRGBdsh9 * dL_dRGB
171
+ dL_dshs[base_sh_idx + 10] = dRGBdsh10 * dL_dRGB
172
+ dL_dshs[base_sh_idx + 11] = dRGBdsh11 * dL_dRGB
173
+ dL_dshs[base_sh_idx + 12] = dRGBdsh12 * dL_dRGB
174
+ dL_dshs[base_sh_idx + 13] = dRGBdsh13 * dL_dRGB
175
+ dL_dshs[base_sh_idx + 14] = dRGBdsh14 * dL_dRGB
176
+ dL_dshs[base_sh_idx + 15] = dRGBdsh15 * dL_dRGB
177
+
178
+ # Accumulate dRGBdx (matching CUDA's expression structure)
179
+ dRGBdx += (
180
+ C3_0 * sh9 * 3.0 * 2.0 * xy +
181
+ C3_1 * sh10 * yz +
182
+ C3_2 * sh11 * -2.0 * xy +
183
+ C3_3 * sh12 * -3.0 * 2.0 * xz +
184
+ C3_4 * sh13 * (-3.0 * xx + 4.0 * zz - yy) +
185
+ C3_5 * sh14 * 2.0 * xz +
186
+ C3_6 * sh15 * 3.0 * (xx - yy)
187
+ )
188
+
189
+ # Accumulate dRGBdy (matching CUDA's expression structure)
190
+ dRGBdy += (
191
+ C3_0 * sh9 * 3.0 * (xx - yy) +
192
+ C3_1 * sh10 * xz +
193
+ C3_2 * sh11 * (-3.0 * yy + 4.0 * zz - xx) +
194
+ C3_3 * sh12 * -3.0 * 2.0 * yz +
195
+ C3_4 * sh13 * -2.0 * xy +
196
+ C3_5 * sh14 * -2.0 * yz +
197
+ C3_6 * sh15 * -3.0 * 2.0 * xy
198
+ )
199
+
200
+ # Accumulate dRGBdz (matching CUDA's expression structure)
201
+ dRGBdz += (
202
+ C3_1 * sh10 * xy +
203
+ C3_2 * sh11 * 4.0 * 2.0 * yz +
204
+ C3_3 * sh12 * 3.0 * (2.0 * zz - xx - yy) +
205
+ C3_4 * sh13 * 4.0 * 2.0 * xz +
206
+ C3_5 * sh14 * (xx - yy)
207
+ )
208
+
209
+ # --- Compute gradient w.r.t. view direction (dL/ddir) ---
210
+ dL_ddir = wp.vec3(wp.dot(dRGBdx, dL_dRGB),
211
+ wp.dot(dRGBdy, dL_dRGB),
212
+ wp.dot(dRGBdz, dL_dRGB))
213
+
214
+ # --- Propagate gradient from direction to mean position (dL/dmean) ---
215
+ dL_dmeans_local = dnormvdv(dir_orig, dL_ddir)
216
+
217
+ # --- Accumulate gradients to global arrays ---
218
+ dL_dmeans[idx] += dL_dmeans_local
219
+
220
+
221
+ @wp.kernel
222
+ def compute_cov2d_backward_kernel(
223
+ # --- Inputs ---
224
+ num_points: int, # Number of Gaussian points
225
+ means: wp.array(dtype=wp.vec3), # 3D positions (N, 3)
226
+ cov3Ds: wp.array(dtype=VEC6), # Packed 3D cov (N, 6)
227
+ radii: wp.array(dtype=int), # Radii computed in forward (N,) - used for skipping
228
+ h_x: float, h_y: float, # Focal lengths
229
+ tan_fovx: float, tan_fovy: float, # Tangent of FOV
230
+ view_matrix: wp.mat44, # World->View matrix (4, 4)
231
+ dL_dconics: wp.array(dtype=wp.vec4), # Grad L w.r.t. conic (a, b, c) (N, 3)
232
+
233
+ # --- Outputs (Accumulate) ---
234
+ dL_dmeans: wp.array(dtype=wp.vec3), # Accumulate mean grads here (N, 3)
235
+ dL_dcov3Ds: wp.array(dtype=VEC6) # Accumulate 3D cov grads here (N, 6)
236
+ ):
237
+ idx = wp.tid()
238
+ if idx >= num_points or radii[idx] <= 0: # Skip if not rendered
239
+ # Zero out dL_dcov3Ds to ensure we don't keep old values
240
+ dL_dcov3Ds[idx] = VEC6(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
241
+ return
242
+
243
+ mean = means[idx]
244
+ cov3D_packed = cov3Ds[idx] # VEC6
245
+
246
+ dL_dconic = wp.vec3(dL_dconics[idx][0], dL_dconics[idx][1], dL_dconics[idx][3])
247
+
248
+
249
+ t = wp.vec4(mean[0], mean[1], mean[2], 1.0) * view_matrix
250
+
251
+ limx = 1.3 * tan_fovx
252
+ limy = 1.3 * tan_fovy
253
+ tz = t[2]
254
+ inv_tz = 1.0 / tz
255
+ txtz = t[0] * inv_tz
256
+ tytz = t[1] * inv_tz
257
+
258
+ x_clamped_flag = (txtz < -limx) or (txtz > limx)
259
+ y_clamped_flag = (tytz < -limy) or (tytz > limy)
260
+ x_grad_mul = 1.0 - float(x_clamped_flag) # 1.0 if not clamped, 0.0 if clamped
261
+ y_grad_mul = 1.0 - float(y_clamped_flag)
262
+
263
+ tx = wp.min(limx, wp.max(-limx, txtz)) * tz
264
+ ty = wp.min(limy, wp.max(-limy, tytz)) * tz
265
+ inv_tz2 = inv_tz * inv_tz
266
+ inv_tz3 = inv_tz2 * inv_tz
267
+
268
+ J00 = h_x * inv_tz
269
+ J11 = h_y * inv_tz
270
+ J02 = -h_x * tx * inv_tz2
271
+ J12 = -h_y * ty * inv_tz2
272
+
273
+ J = wp.transpose(wp.mat33(
274
+ J00, 0.0, J02,
275
+ 0.0, J11, J12,
276
+ 0.0, 0.0, 0.0
277
+ ))
278
+
279
+
280
+
281
+ W = wp.mat33(
282
+ view_matrix[0,0], view_matrix[0,1], view_matrix[0,2],
283
+ view_matrix[1,0], view_matrix[1,1], view_matrix[1,2],
284
+ view_matrix[2,0], view_matrix[2,1], view_matrix[2,2]
285
+ )
286
+
287
+ T = W * J
288
+ c0 = cov3D_packed[0]; c1 = cov3D_packed[1]; c2 = cov3D_packed[2]
289
+ c11 = cov3D_packed[3]; c12 = cov3D_packed[4]; c22 = cov3D_packed[5]
290
+ Vrk = wp.mat33(c0, c1, c2, c1, c11, c12, c2, c12, c22) # Assumes VEC6 stores upper triangle row-wise
291
+
292
+ cov2D_mat = wp.transpose(T) * wp.transpose(Vrk) * T
293
+
294
+ a_noblr = cov2D_mat[0,0]
295
+ b_noblr = cov2D_mat[0,1]
296
+ c_noblr = cov2D_mat[1,1]
297
+ a = a_noblr + 0.3
298
+ b = b_noblr
299
+ c = c_noblr + 0.3
300
+
301
+ denom = a * c - b * b
302
+ dL_da = 0.0; dL_db = 0.0; dL_dc = 0.0
303
+
304
+ # --- Calculate Gradients ---
305
+ if denom != 0.0:
306
+ # Use a small epsilon to prevent division by zero
307
+ denom2inv = 1.0 / (denom * denom + 1e-7)
308
+ dL_da = denom2inv * (-c * c * dL_dconic[0] + 2.0 * b * c * dL_dconic[1] + (denom - a * c) * dL_dconic[2])
309
+ dL_dc = denom2inv * (-a * a * dL_dconic[2] + 2.0 * a * b * dL_dconic[1] + (denom - a * c) * dL_dconic[0])
310
+ dL_db = denom2inv * 2.0 * (b * c * dL_dconic[0] - (denom + 2.0 * b * b) * dL_dconic[1] + a * b * dL_dconic[2])
311
+
312
+ dL_dcov3Ds[idx] = VEC6(
313
+ # Diagonal elements
314
+ T[0][0] * T[0][0] * dL_da + T[0][0] * T[0][1] * dL_db + T[0][1] * T[0][1] * dL_dc, # c00
315
+ 2.0 * T[0][0] * T[1][0] * dL_da + (T[0][0] * T[1][1] + T[1][0] * T[0][1]) * dL_db + 2.0 * T[0][1] * T[1][1] * dL_dc, # c01
316
+ 2.0 * T[0][0] * T[2][0] * dL_da + (T[0][0] * T[2][1] + T[2][0] * T[0][1]) * dL_db + 2.0 * T[0][1] * T[2][1] * dL_dc, # c02
317
+ T[1][0] * T[1][0] * dL_da + T[1][0] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc, # c11
318
+ 2.0 * T[2][0] * T[1][0] * dL_da + (T[1][0] * T[2][1] + T[2][0] * T[1][1]) * dL_db + 2.0 * T[1][1] * T[2][1] * dL_dc, # c12
319
+ T[2][0] * T[2][0] * dL_da + T[2][0] * T[2][1] * dL_db + T[2][1] * T[2][1] * dL_dc # c22
320
+ )
321
+
322
+ dL_dT00 = 2.0 * (T[0][0] * Vrk[0][0] + T[1][0] * Vrk[1][0] + T[2][0] * Vrk[2][0]) * dL_da + \
323
+ (T[0][1] * Vrk[0][0] + T[1][1] * Vrk[1][0] + T[2][1] * Vrk[2][0]) * dL_db
324
+ dL_dT01 = 2.0 * (T[0][0] * Vrk[0][1] + T[1][0] * Vrk[1][1] + T[2][0] * Vrk[2][1]) * dL_da + \
325
+ (T[0][1] * Vrk[0][1] + T[1][1] * Vrk[1][1] + T[2][1] * Vrk[2][1]) * dL_db
326
+ dL_dT02 = 2.0 * (T[0][0] * Vrk[0][2] + T[1][0] * Vrk[1][2] + T[2][0] * Vrk[2][2]) * dL_da + \
327
+ (T[0][1] * Vrk[0][2] + T[1][1] * Vrk[1][2] + T[2][1] * Vrk[2][2]) * dL_db
328
+ dL_dT10 = 2.0 * (T[0][1] * Vrk[0][0] + T[1][1] * Vrk[1][0] + T[2][1] * Vrk[2][0]) * dL_dc + \
329
+ (T[0][0] * Vrk[0][0] + T[1][0] * Vrk[1][0] + T[2][0] * Vrk[2][0]) * dL_db
330
+ dL_dT11 = 2.0 * (T[0][1] * Vrk[0][1] + T[1][1] * Vrk[1][1] + T[2][1] * Vrk[2][1]) * dL_dc + \
331
+ (T[0][0] * Vrk[0][1] + T[1][0] * Vrk[1][1] + T[2][0] * Vrk[2][1]) * dL_db
332
+ dL_dT12 = 2.0 * (T[0][1] * Vrk[0][2] + T[1][1] * Vrk[1][2] + T[2][1] * Vrk[2][2]) * dL_dc + \
333
+ (T[0][0] * Vrk[0][2] + T[1][0] * Vrk[1][2] + T[2][0] * Vrk[2][2]) * dL_db
334
+
335
+ dL_dJ00 = W[0,0] * dL_dT00 + W[1,0] * dL_dT01 + W[2,0] * dL_dT02
336
+ dL_dJ02 = W[0,2] * dL_dT00 + W[1,2] * dL_dT01 + W[2,2] * dL_dT02
337
+ dL_dJ11 = W[0,1] * dL_dT10 + W[1,1] * dL_dT11 + W[2,1] * dL_dT12
338
+ dL_dJ12 = W[0,2] * dL_dT10 + W[1,2] * dL_dT11 + W[2,2] * dL_dT12
339
+
340
+ dL_dtx = -h_x * inv_tz2 * dL_dJ02
341
+ dL_dty = -h_y * inv_tz2 * dL_dJ12
342
+ dL_dtz = -h_x * inv_tz2 * dL_dJ00 - h_y * inv_tz2 * dL_dJ11 + \
343
+ 2.0 * h_x * tx * inv_tz3 * dL_dJ02 + 2.0 * h_y * ty * inv_tz3 * dL_dJ12
344
+
345
+ dL_dt = wp.vec3(dL_dtx * x_grad_mul, dL_dty * y_grad_mul, dL_dtz)
346
+
347
+ dL_dmean_from_cov = wp.vec4(dL_dt[0], dL_dt[1], dL_dt[2], 1.0) * wp.transpose(view_matrix)
348
+ dL_dmeans[idx] += wp.vec3(dL_dmean_from_cov[0], dL_dmean_from_cov[1], dL_dmean_from_cov[2])
349
+
350
+
351
+ @wp.kernel
352
+ def compute_cov3d_backward_kernel(
353
+ # --- Inputs ---
354
+ num_points: int, # Number of Gaussian points
355
+ scales: wp.array(dtype=wp.vec3), # Scale parameters (N, 3)
356
+ rotations: wp.array(dtype=wp.vec4), # Quaternions (x, y, z, w) (N, 4)
357
+ radii: wp.array(dtype=int), # Radii computed in forward (N,) - used for skipping
358
+ scale_modifier: float, # Global scale modifier
359
+ dL_dcov3Ds: wp.array(dtype=VEC6), # Grad L w.r.t packed 3D cov (N, 6)
360
+
361
+ # --- Outputs ---
362
+ dL_dscales: wp.array(dtype=wp.vec3), # Write scale grads here (N, 3)
363
+ dL_drots: wp.array(dtype=wp.vec4) # Write rot grads here (N, 4)
364
+ ):
365
+ idx = wp.tid()
366
+ # Skip if not rendered OR if grad input is zero (e.g., from compute_cov2d_backward)
367
+ if idx >= num_points or radii[idx] <= 0:
368
+ dL_dscales[idx] = wp.vec3(0.0, 0.0, 0.0)
369
+ dL_drots[idx] = wp.vec4(0.0, 0.0, 0.0, 0.0)
370
+ return
371
+
372
+ # --- Recompute intermediates ---
373
+ scale_vec = scales[idx]
374
+ rot_quat = rotations[idx] # (x, y, z, w) in Warp
375
+
376
+ # Extract quaternion components to match CUDA convention (r, x, y, z)
377
+ r = rot_quat[3] # Real part is w in Warp
378
+ x = rot_quat[0]
379
+ y = rot_quat[1]
380
+ z = rot_quat[2]
381
+
382
+ # 1. Construct rotation matrix R manually as in CUDA
383
+ R = wp.mat33(
384
+ 1.0 - 2.0 * (y * y + z * z), 2.0 * (x * y - r * z), 2.0 * (x * z + r * y),
385
+ 2.0 * (x * y + r * z), 1.0 - 2.0 * (x * x + z * z), 2.0 * (y * z - r * x),
386
+ 2.0 * (x * z - r * y), 2.0 * (y * z + r * x), 1.0 - 2.0 * (x * x + y * y)
387
+ )
388
+
389
+ # 2. Create scaling matrix S
390
+ s_vec = scale_modifier * scale_vec
391
+ S = wp.mat33(
392
+ s_vec[0], 0.0, 0.0,
393
+ 0.0, s_vec[1], 0.0,
394
+ 0.0, 0.0, s_vec[2]
395
+ )
396
+
397
+ # 3. M = S * R (match CUDA multiplication order)
398
+ M = S * R
399
+
400
+ # --- Extract gradient w.r.t. 3D covariance ---
401
+ dL_dcov3D_packed = dL_dcov3Ds[idx]
402
+
403
+
404
+ # Convert per-element covariance loss gradients to matrix form
405
+ dL_dSigma = wp.mat33(
406
+ dL_dcov3D_packed[0], 0.5 * dL_dcov3D_packed[1], 0.5 * dL_dcov3D_packed[2],
407
+ 0.5 * dL_dcov3D_packed[1], dL_dcov3D_packed[3], 0.5 * dL_dcov3D_packed[4],
408
+ 0.5 * dL_dcov3D_packed[2], 0.5 * dL_dcov3D_packed[4], dL_dcov3D_packed[5]
409
+ )
410
+
411
+ # --- Calculate Gradients ---
412
+ # 1. Gradient w.r.t. M: dL/dM = 2 * M * dL/dSigma
413
+ dL_dM = 2.0 * M * dL_dSigma
414
+
415
+ # 2. Transpose of matrices for gradient calculations
416
+ Rt = wp.transpose(R)
417
+ dL_dMt = wp.transpose(dL_dM)
418
+
419
+ # 3. Gradient w.r.t. scales - matching CUDA directly
420
+ dL_dscale = wp.vec3(
421
+ wp.dot(Rt[0], dL_dMt[0]),
422
+ wp.dot(Rt[1], dL_dMt[1]),
423
+ wp.dot(Rt[2], dL_dMt[2])
424
+ )
425
+ dL_dscales[idx] = dL_dscale * scale_modifier
426
+
427
+ # 4. Scale dL_dMt by scale factors for quaternion gradient calculation
428
+ dL_dMt_scaled = wp.mat33(
429
+ dL_dMt[0, 0] * s_vec[0], dL_dMt[0, 1] * s_vec[0], dL_dMt[0, 2] * s_vec[0],
430
+ dL_dMt[1, 0] * s_vec[1], dL_dMt[1, 1] * s_vec[1], dL_dMt[1, 2] * s_vec[1],
431
+ dL_dMt[2, 0] * s_vec[2], dL_dMt[2, 1] * s_vec[2], dL_dMt[2, 2] * s_vec[2]
432
+ )
433
+
434
+ # 5. Gradients of loss w.r.t. quaternion components
435
+ dL_dr = 2.0 * (z * (dL_dMt_scaled[0, 1] - dL_dMt_scaled[1, 0]) +
436
+ y * (dL_dMt_scaled[2, 0] - dL_dMt_scaled[0, 2]) +
437
+ x * (dL_dMt_scaled[1, 2] - dL_dMt_scaled[2, 1]))
438
+
439
+ dL_dx = 2.0 * (y * (dL_dMt_scaled[1, 0] + dL_dMt_scaled[0, 1]) +
440
+ z * (dL_dMt_scaled[2, 0] + dL_dMt_scaled[0, 2]) +
441
+ r * (dL_dMt_scaled[1, 2] - dL_dMt_scaled[2, 1])) - \
442
+ 4.0 * x * (dL_dMt_scaled[2, 2] + dL_dMt_scaled[1, 1])
443
+
444
+ dL_dy = 2.0 * (x * (dL_dMt_scaled[1, 0] + dL_dMt_scaled[0, 1]) +
445
+ r * (dL_dMt_scaled[2, 0] - dL_dMt_scaled[0, 2]) +
446
+ z * (dL_dMt_scaled[1, 2] + dL_dMt_scaled[2, 1])) - \
447
+ 4.0 * y * (dL_dMt_scaled[2, 2] + dL_dMt_scaled[0, 0])
448
+
449
+ dL_dz = 2.0 * (r * (dL_dMt_scaled[0, 1] - dL_dMt_scaled[1, 0]) +
450
+ x * (dL_dMt_scaled[2, 0] + dL_dMt_scaled[0, 2]) +
451
+ y * (dL_dMt_scaled[1, 2] + dL_dMt_scaled[2, 1])) - \
452
+ 4.0 * z * (dL_dMt_scaled[1, 1] + dL_dMt_scaled[0, 0])
453
+
454
+ # 6. Convert back to Warp's quaternion ordering (x, y, z, r/w)
455
+ dL_drots[idx] = wp.vec4(dL_dx, dL_dy, dL_dz, dL_dr)
456
+
457
+ @wp.kernel
458
+ def wp_render_backward_kernel(
459
+ # --- Inputs ---
460
+ # Tile/Range data
461
+ ranges: wp.array(dtype=wp.vec2i), # Range of point indices for each tile (start, end)
462
+ point_list: wp.array(dtype=int), # Sorted point indices
463
+
464
+ # Image parameters
465
+ W: int, # Image width
466
+ H: int, # Image height
467
+ bg_color: wp.vec3, # Background color
468
+ tile_grid: wp.vec3, # Tile grid dimensions
469
+
470
+ # Gaussian parameters
471
+ points_xy_image: wp.array(dtype=wp.vec2), # 2D projected positions
472
+ conic_opacity: wp.array(dtype=wp.vec4), # Conic matrices and opacities (a, b, c, opacity)
473
+ colors: wp.array(dtype=wp.vec3), # RGB colors
474
+
475
+ # Forward pass results
476
+ final_Ts: wp.array2d(dtype=float), # Final transparency values
477
+ n_contrib: wp.array2d(dtype=int), # Number of Gaussians contributing to each pixel
478
+ dL_dpixels: wp.array2d(dtype=wp.vec3), # Gradient of loss w.r.t. output pixels
479
+
480
+ # --- Outputs ---
481
+ dL_dmean2D: wp.array(dtype=wp.vec3), # Gradient w.r.t. 2D mean positions
482
+ dL_dconic2D: wp.array(dtype=wp.vec4), # Gradient w.r.t. conic matrices
483
+ dL_dopacity: wp.array(dtype=float), # Gradient w.r.t. opacity
484
+ dL_dcolors: wp.array(dtype=wp.vec3), # Gradient w.r.t. colors
485
+ ):
486
+ """
487
+ Backward version of the rendering procedure, computing gradients of the loss with respect
488
+ to Gaussian parameters based on gradients of the loss with respect to output pixels.
489
+
490
+ This kernel is launched per pixel and processes Gaussians in back-to-front order,
491
+ similar to the forward rendering pass but accumulating gradients.
492
+ """
493
+ # Get pixel coordinates
494
+ tile_x, tile_y, tid_x, tid_y = wp.tid()
495
+
496
+ # Calculate pixel position
497
+ pix_x = tile_x * TILE_M + tid_x
498
+ pix_y = tile_y * TILE_N + tid_y
499
+
500
+ # Skip if pixel is outside image bounds
501
+ inside = (pix_x < W) and (pix_y < H)
502
+ if not inside:
503
+ return
504
+
505
+ # Convert to float coordinates for calculations
506
+ pixf_x = float(pix_x)
507
+ pixf_y = float(pix_y)
508
+
509
+ # Get tile range (start/end indices in point_list)
510
+ tile_id = tile_y * int(tile_grid[0]) + tile_x
511
+
512
+ range_start = ranges[tile_id][0]
513
+ range_end = ranges[tile_id][1]
514
+
515
+ # Get final transparency value and number of contributors from forward pass
516
+ T_final = final_Ts[pix_y, pix_x]
517
+ last_contributor = n_contrib[pix_y, pix_x]
518
+
519
+ # first_kept = max(range_start, range_end - last_contributor) # = range_end-N
520
+ last_kept = min(range_end, range_start + last_contributor)
521
+
522
+ # Initialize working variables
523
+ T = T_final # Current accumulated transparency
524
+ accum_rec = wp.vec3(0.0, 0.0, 0.0) # Accumulated color
525
+ last_alpha = float(0.0) # Alpha from the last processed Gaussian
526
+ last_color = wp.vec3(0.0, 0.0, 0.0) # Color from the last processed Gaussian
527
+
528
+ # Get gradients
529
+ dL_dpixel = dL_dpixels[pix_y, pix_x]
530
+
531
+ # Gradient of pixel coordinate w.r.t. normalized screen-space coordinates
532
+ ddelx_dx = 0.5 * float(W)
533
+ ddely_dy = 0.5 * float(H)
534
+ for i in range(last_kept - 1, range_start - 1, -1):
535
+ gaussian_id = point_list[i]
536
+ xy = points_xy_image[gaussian_id]
537
+ con_o = conic_opacity[gaussian_id] # (a, b, c, opacity)
538
+ color = colors[gaussian_id]
539
+
540
+ # Compute distance to pixel center
541
+ d_x = xy[0] - pixf_x
542
+ d_y = xy[1] - pixf_y
543
+
544
+ # Compute Gaussian power
545
+ power = -0.5 * (con_o[0] * d_x * d_x + con_o[2] * d_y * d_y) - con_o[1] * d_x * d_y
546
+
547
+ # Skip if power is positive (too far away)
548
+ if power > 0.0:
549
+ continue
550
+
551
+ # Compute Gaussian value and alpha
552
+ G = wp.exp(power)
553
+ alpha = wp.min(0.99, con_o[3] * G)
554
+
555
+ # Skip if alpha is too small
556
+ if alpha < (1.0 / 255.0):
557
+ continue
558
+
559
+ T = T / (1.0 - alpha)
560
+
561
+ # Gradient factor for color contribution
562
+ dchannel_dcolor = alpha * T
563
+
564
+ # Compute gradient w.r.t. alpha
565
+ dL_dalpha = 0.0
566
+
567
+ # Update color accumulation and compute color gradients
568
+ accum_rec = last_alpha * last_color + (1.0 - last_alpha) * accum_rec
569
+ dL_dchannel = dL_dpixel
570
+ last_color = color
571
+
572
+ dL_dalpha = wp.dot(color - accum_rec, dL_dpixel)
573
+ wp.atomic_add(dL_dcolors, gaussian_id, dchannel_dcolor * dL_dchannel)
574
+
575
+ # Scale dL_dalpha by T
576
+ dL_dalpha *= T
577
+ last_alpha = alpha
578
+
579
+ # Account for background color contribution
580
+ bg_dot_dpixel = wp.dot(bg_color, dL_dpixel)
581
+ dL_dalpha += (-T_final / (1.0 - alpha)) * bg_dot_dpixel
582
+
583
+ # Helpful temporary variables
584
+ dL_dG = con_o[3] * dL_dalpha
585
+ gdx = G * d_x
586
+ gdy = G * d_y
587
+ dG_ddelx = -gdx * con_o[0] - gdy * con_o[1]
588
+ dG_ddely = -gdy * con_o[2] - gdx * con_o[1]
589
+
590
+
591
+ # Update gradients w.r.t. 2D mean position
592
+ wp.atomic_add(dL_dmean2D, gaussian_id, wp.vec3(
593
+ dL_dG * dG_ddelx * ddelx_dx,
594
+ dL_dG * dG_ddely * ddely_dy,
595
+ 0.0
596
+ ))
597
+
598
+ # Update gradients w.r.t. 2D conic matrix
599
+ wp.atomic_add(dL_dconic2D, gaussian_id, wp.vec4(
600
+ -0.5 * gdx * d_x * dL_dG,
601
+ -0.5 * gdx * d_y * dL_dG,
602
+ 0.0,
603
+ -0.5 * gdy * d_y * dL_dG
604
+ ))
605
+
606
+ # Update gradients w.r.t. opacity
607
+ wp.atomic_add(dL_dopacity, gaussian_id, G * dL_dalpha)
608
+
609
+ @wp.kernel
610
+ def compute_projection_backward_kernel(
611
+ # --- Inputs ---
612
+ num_points: int, # Number of Gaussian points
613
+ means: wp.array(dtype=wp.vec3), # 3D positions (N, 3)
614
+ radii: wp.array(dtype=int), # Radii computed in forward (N,) - used for skipping
615
+ proj_matrix: wp.mat44, # Projection matrix (4, 4)
616
+ dL_dmean2D: wp.array(dtype=wp.vec3), # Grad of loss w.r.t. 2D projected means (N, 2)
617
+
618
+ # --- Outputs (Accumulate) ---
619
+ dL_dmeans: wp.array(dtype=wp.vec3) # Accumulate mean grads here (N, 3)
620
+ ):
621
+ """Compute gradients of 3D means due to projection to 2D.
622
+
623
+ This kernel handles the gradient propagation from 2D projected positions
624
+ back to 3D positions, based on the projection matrix.
625
+ """
626
+ idx = wp.tid()
627
+ if idx >= num_points or radii[idx] <= 0: # Skip if not rendered
628
+ return
629
+
630
+ # Get 3D mean and 2D mean gradient
631
+ mean3D = means[idx]
632
+ dL_dmean2D_val = dL_dmean2D[idx]
633
+
634
+ # Compute homogeneous coordinates
635
+ m_hom = wp.vec4(mean3D[0], mean3D[1], mean3D[2], 1.0)
636
+ m_hom = m_hom * proj_matrix
637
+
638
+ # Division by w (perspective division)
639
+ m_w = 1.0 / (m_hom[3] + 0.0000001)
640
+
641
+ # Compute gradient of loss w.r.t. 3D means due to 2D mean gradients
642
+ # Following the chain rule through the perspective projection
643
+ mul1 = (proj_matrix[0, 0] * mean3D[0] + proj_matrix[1, 0] * mean3D[1] +
644
+ proj_matrix[2, 0] * mean3D[2] + proj_matrix[3, 0]) * m_w * m_w
645
+
646
+ mul2 = (proj_matrix[0, 1] * mean3D[0] + proj_matrix[1, 1] * mean3D[1] +
647
+ proj_matrix[2, 1] * mean3D[2] + proj_matrix[3, 1]) * m_w * m_w
648
+
649
+ dL_dmean = wp.vec3(0.0, 0.0, 0.0)
650
+
651
+ # x component of gradient
652
+ dL_dmean[0] = (proj_matrix[0, 0] * m_w - proj_matrix[0, 3] * mul1) * dL_dmean2D_val[0] + \
653
+ (proj_matrix[0, 1] * m_w - proj_matrix[0, 3] * mul2) * dL_dmean2D_val[1]
654
+
655
+ # y component of gradient
656
+ dL_dmean[1] = (proj_matrix[1, 0] * m_w - proj_matrix[1, 3] * mul1) * dL_dmean2D_val[0] + \
657
+ (proj_matrix[1, 1] * m_w - proj_matrix[1, 3] * mul2) * dL_dmean2D_val[1]
658
+
659
+ # z component of gradient
660
+ dL_dmean[2] = (proj_matrix[2, 0] * m_w - proj_matrix[2, 3] * mul1) * dL_dmean2D_val[0] + \
661
+ (proj_matrix[2, 1] * m_w - proj_matrix[2, 3] * mul2) * dL_dmean2D_val[1]
662
+
663
+
664
+ dL_dmeans[idx] += dL_dmean
665
+
666
+ def backward_preprocess(
667
+ # Camera and model parameters
668
+ num_points: int,
669
+ means: wp.array(dtype=wp.vec3), # 3D means
670
+ means_2d: wp.array(dtype=wp.vec2), # 2D means
671
+ radii: wp.array(dtype=int), # Computed radii
672
+ sh_coeffs: wp.array(dtype=wp.vec3), # SH coefficients
673
+ scales: wp.array(dtype=wp.vec3), # Scale parameters
674
+ rotations: wp.array(dtype=wp.vec4), # Rotation quaternions
675
+ viewmatrix: wp.mat44, # Camera view matrix
676
+ projmatrix: wp.mat44, # Camera projection matrix
677
+ fov_x: float, # Camera horizontal FOV
678
+ fov_y: float, # Camera vertical FOV
679
+ focal_x: float,
680
+ focal_y: float,
681
+
682
+ # Intermediate data from forward
683
+ cov3Ds: wp.array(dtype=wp.mat33), # 3D covariance matrices (or VEC6 depending on packing)
684
+ conic_opacity: wp.array(dtype=wp.vec4), # 2D conics and opacity
685
+ campos: wp.array(dtype=wp.vec3), # View directions (should be campos)
686
+ clamped: wp.array(dtype=wp.uint32), # Clamping states
687
+
688
+ # Incoming gradients from render backward
689
+ dL_dmean2D: wp.array(dtype=wp.vec3), # Grad of loss w.r.t. 2D means
690
+ dL_dconic: wp.array(dtype=wp.vec4), # Grad of loss w.r.t. 2D conics
691
+ dL_dopacity: wp.array(dtype=float), # Grad of loss w.r.t. opacity
692
+ dL_dcolors: wp.array(dtype=wp.vec3), # Grad of loss w.r.t. colors
693
+
694
+ # Output gradient buffers
695
+ dL_dmeans: wp.array(dtype=wp.vec3), # Output grad for 3D means
696
+ dL_dsh: wp.array(dtype=wp.vec3), # Output grad for SH coeffs
697
+ dL_dscales: wp.array(dtype=wp.vec3), # Output grad for scales
698
+ dL_drots: wp.array(dtype=wp.vec4), # Output grad for rotations
699
+
700
+ # Optional parameters
701
+ scale_modifier: float = 1.0,
702
+ sh_degree: int = 3
703
+ ):
704
+ """
705
+ Orchestrates the backward pass for 3D Gaussian Splatting by coordinating several kernel calls.
706
+ """
707
+ # Create buffer for 3D covariance gradients
708
+ dL_dcov3D = wp.zeros(num_points, dtype=VEC6, device=DEVICE)
709
+ # Step 1: Compute gradients for 2D covariance (conic matrix)
710
+ # This also computes gradients w.r.t. 3D means due to conic computation
711
+ wp.launch(
712
+ kernel=compute_cov2d_backward_kernel,
713
+ dim=num_points,
714
+ inputs=[
715
+ num_points, # P
716
+ means, # means3D
717
+ cov3Ds, # cov3Ds
718
+ radii, # radii
719
+ focal_x, # focal_x
720
+ focal_y, # focal_y
721
+ fov_x, # tan_fovx
722
+ fov_y, # tan_fovy
723
+ viewmatrix, # viewmatrix
724
+ dL_dconic, # dL_dconic
725
+ dL_dmeans, # dL_dmean3D (outputs)
726
+ dL_dcov3D # dL_dcov3D (outputs)
727
+ ],
728
+ device=DEVICE
729
+ )
730
+
731
+ dL_dmeans_np = dL_dmeans.numpy()
732
+ # Step 2: Compute gradients for 3D means due to projection
733
+ wp.launch(
734
+ kernel=compute_projection_backward_kernel,
735
+ dim=num_points,
736
+ inputs=[
737
+ num_points,
738
+ means,
739
+ radii,
740
+ projmatrix,
741
+ dL_dmean2D,
742
+ dL_dmeans # Accumulate to final means gradients
743
+ ],
744
+ device=DEVICE
745
+ )
746
+
747
+ # Step 3: Compute gradients for SH coefficients
748
+ wp.launch(
749
+ kernel=sh_backward_kernel,
750
+ dim=num_points,
751
+ inputs=[
752
+ num_points,
753
+ sh_degree,
754
+ means,
755
+ sh_coeffs,
756
+ radii,
757
+ campos,
758
+ clamped,
759
+ dL_dcolors,
760
+ dL_dmeans,
761
+ dL_dsh
762
+ ],
763
+
764
+ device=DEVICE
765
+ )
766
+ dL_dmeans_np = dL_dmeans.numpy()
767
+ # Step 4: Compute gradients for scales and rotations
768
+ wp.launch(
769
+ kernel=compute_cov3d_backward_kernel,
770
+ dim=num_points,
771
+ inputs=[
772
+ num_points,
773
+ scales,
774
+ rotations,
775
+ radii,
776
+ scale_modifier,
777
+ dL_dcov3D,
778
+ dL_dscales, # Output scale gradients
779
+ dL_drots # Output rotation gradients
780
+ ],
781
+ device=DEVICE
782
+ )
783
+
784
+ return dL_dmeans, dL_dsh, dL_dscales, dL_drots
785
+
786
def backward_render(
    ranges,
    point_list,
    width,
    height,
    bg_color,
    tile_grid,
    points_xy_image,
    conic_opacity,
    colors,
    final_Ts,
    n_contrib,
    dL_dpixels,
    dL_dmean2D,
    dL_dconic2D,
    dL_dopacity,
    dL_dcolors,
):
    """
    Orchestrates the backward rendering process by launching the backward kernel.

    Args:
        ranges: Range of point indices for each tile
        point_list: Sorted list of point indices
        width, height: Image dimensions
        bg_color: Background color
        tile_grid: Tile grid dimensions (passed through to the kernel)
        points_xy_image: 2D positions of Gaussians
        conic_opacity: Conic matrices and opacities
        colors: RGB colors
        final_Ts: Final transparency values from forward pass
        n_contrib: Number of contributors per pixel
        dL_dpixels: Gradient of loss w.r.t. output pixels
        dL_dmean2D: Output gradient w.r.t. 2D mean positions
        dL_dconic2D: Output gradient w.r.t. conic matrices
        dL_dopacity: Output gradient w.r.t. opacity
        dL_dcolors: Output gradient w.r.t. colors
    """
    # Ceil-divide so partially covered border tiles are included.
    tile_grid_x = (width + TILE_M - 1) // TILE_M
    tile_grid_y = (height + TILE_N - 1) // TILE_N

    # NOTE: a previous `ranges.numpy()` debug call was removed here; it
    # forced a device->host copy on every backward pass and its result
    # was never used.

    # Launch the backward rendering kernel: one thread per pixel, grouped
    # by (tile_x, tile_y, thread_x_in_tile, thread_y_in_tile).
    wp.launch(
        kernel=wp_render_backward_kernel,
        dim=(tile_grid_x, tile_grid_y, TILE_M, TILE_N),
        inputs=[
            ranges,
            point_list,
            width,
            height,
            bg_color,
            tile_grid,
            points_xy_image,
            conic_opacity,
            colors,
            final_Ts,
            n_contrib,
            dL_dpixels,
            dL_dmean2D,
            dL_dconic2D,
            dL_dopacity,
            dL_dcolors,
        ],
    )
850
+
851
def backward(
    # --- Core parameters ---
    background,
    means3D,
    dL_dpixels,
    # --- Model parameters ---
    opacity=None,
    shs=None,
    scales=None,
    rotations=None,
    scale_modifier=1.0,
    # --- Camera parameters ---
    viewmatrix=None,
    projmatrix=None,
    tan_fovx=0.5,
    tan_fovy=0.5,
    image_height=256,
    image_width=256,
    campos=None,
    # --- Forward output buffers ---
    radii=None,
    means2D=None,
    conic_opacity=None,
    rgb=None,
    clamped=None,
    cov3Ds=None,
    # --- Internal state buffers ---
    geom_buffer=None,
    binning_buffer=None,
    img_buffer=None,
    # --- Algorithm parameters ---
    degree=3,
    debug=False,
):
    """
    Main backward function for 3D Gaussian Splatting.

    This function orchestrates the entire backward pass by calling two main sub-functions:
    1. backward_render: Computes gradients w.r.t. 2D parameters (mean2D, conic, opacity, color)
    2. backward_preprocess: Computes gradients w.r.t. 3D parameters
       (mean3D, cov3D, SH coefficients, scales, rotations)

    Args:
        background: Background color as numpy array, torch tensor, or wp.vec3 (3,)
        means3D: 3D positions as numpy array, torch tensor, or wp.array (N, 3)
        dL_dpixels: Gradient of loss w.r.t. output pixels (H, W, 3)
        opacity: Opacity values (N, 1) or (N,)
        shs: Spherical harmonics coefficients (N, D, 3) or flattened (N*D, 3)
        scales: Scale parameters (N, 3)
        rotations: Rotation quaternions (N, 4)
        scale_modifier: Global scale modifier (float)
        viewmatrix: View matrix (4, 4)
        projmatrix: Projection matrix (4, 4)
        tan_fovx: Tangent of x field of view
        tan_fovy: Tangent of y field of view
        image_height: Image height
        image_width: Image width
        campos: Camera position (3,)
        radii: Computed radii from forward pass (N,)
        means2D: 2D projected positions from forward pass (N, 2)
        conic_opacity: Conic matrices + opacity from forward pass (N, 4)
        rgb: RGB colors from forward pass (N, 3)
        clamped: Clamping state from forward pass (N, 3)
        cov3Ds: 3D covariance matrices from forward pass (N, 6)
        geom_buffer: Dictionary holding geometric state
        binning_buffer: Dictionary holding binning state
        img_buffer: Dictionary holding image state
        degree: SH degree (0-3)
        debug: Enable debug output

    Returns:
        dict: Dictionary containing gradients for all model parameters:
            - dL_dmean3D: Gradient w.r.t. 3D positions (N, 3)
            - dL_dcolor: Gradient w.r.t. colors (N, 3)
            - dL_dshs: Gradient w.r.t. SH coefficients (N*D, 3)
            - dL_dopacity: Gradient w.r.t. opacity (N,)
            - dL_dscale: Gradient w.r.t. scales (N, 3)
            - dL_drot: Gradient w.r.t. rotations (N, 4)
    """
    # Calculate focal lengths from FoV
    focal_y = image_height / (2.0 * tan_fovy)
    focal_x = image_width / (2.0 * tan_fovx)

    # Convert inputs to warp arrays
    background_warp = background if isinstance(background, wp.vec3) else wp.vec3(background[0], background[1], background[2])
    means3D_warp = to_warp_array(means3D, wp.vec3)
    dL_dpixels_warp = to_warp_array(dL_dpixels, wp.vec3) if not isinstance(dL_dpixels, wp.array) else dL_dpixels

    # Get number of points
    num_points = means3D_warp.shape[0]

    # Convert optional parameters if provided
    # NOTE(review): opacity_warp is currently not forwarded to any kernel in
    # this function; kept for interface stability — confirm whether
    # backward_preprocess should consume it.
    opacity_warp = to_warp_array(opacity, float, flatten=True) if opacity is not None else None

    # SH coefficients need special handling for flattening
    if shs is not None:
        sh_data = shs.reshape(-1, 3) if hasattr(shs, 'reshape') and shs.ndim > 2 else shs
        shs_warp = to_warp_array(sh_data, wp.vec3)
    else:
        shs_warp = None

    # Handle other model parameters
    scales_warp = to_warp_array(scales, wp.vec3) if scales is not None else None

    # Handle rotations differently based on shape (matrices vs quaternions)
    if rotations is not None:
        rot_shape = rotations.shape[-1] if hasattr(rotations, 'shape') else rotations.size(-1)
        if rot_shape == 4:  # Quaternions
            rotations_warp = to_warp_array(rotations, wp.vec4)
        else:  # 3x3 matrices
            rotations_warp = to_warp_array(rotations, wp.mat33)
    else:
        rotations_warp = None

    # Handle camera parameters
    viewmatrix_warp = viewmatrix if isinstance(viewmatrix, wp.mat44) else wp.mat44(viewmatrix.flatten())
    projmatrix_warp = projmatrix if isinstance(projmatrix, wp.mat44) else wp.mat44(projmatrix.flatten())
    campos_warp = campos if isinstance(campos, wp.vec3) else wp.vec3(campos[0], campos[1], campos[2])

    # --- Extract data from buffer dictionaries if provided ---
    # Pre-initialize so a missing buffer yields an explicit None downstream
    # instead of a NameError at the backward_render call site.
    ranges = None
    final_Ts = None
    n_contrib = None
    point_list = None

    if img_buffer is not None:
        ranges = img_buffer.get('ranges')
        final_Ts = img_buffer.get('final_Ts')
        n_contrib = img_buffer.get('n_contrib')

    if binning_buffer is not None:
        point_list = binning_buffer.get('point_list')

    if geom_buffer is not None:
        # Use internal data if not provided directly
        if radii is None:
            radii = geom_buffer.get('radii')
        if means2D is None:
            means2D = geom_buffer.get('means2D')
        if conic_opacity is None:
            conic_opacity = geom_buffer.get('conic_opacity')
        if rgb is None:
            rgb = geom_buffer.get('rgb')
        if clamped is None:
            clamped = geom_buffer.get('clamped_state')

    # Convert forward pass outputs to warp arrays if they're not already
    radii_warp = to_warp_array(radii, int) if radii is not None else None
    means2D_warp = to_warp_array(means2D, wp.vec2) if means2D is not None else None
    conic_opacity_warp = to_warp_array(conic_opacity, wp.vec4) if conic_opacity is not None else None
    rgb_warp = to_warp_array(rgb, wp.vec3) if rgb is not None else None
    clamped_warp = to_warp_array(clamped, wp.uint32) if clamped is not None else None

    # --- Initialize output gradient arrays ---
    dL_dmean2D = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)
    dL_dconic = wp.zeros(num_points, dtype=wp.vec4, device=DEVICE)
    dL_dopacity = wp.zeros(num_points, dtype=float, device=DEVICE)
    dL_dcolor = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)

    dL_dmean3D = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)
    # NOTE(review): backward_preprocess allocates its own internal cov3D
    # gradient buffer, so this array is never written and is returned as
    # zeros — kept only for API compatibility of the returned dict.
    dL_dcov3D = wp.zeros(num_points, dtype=VEC6, device=DEVICE)

    # SH gradients depend on degree: (degree+1)^2 coefficients, capped at 16
    max_sh_coeffs = 16 if degree >= 3 else (degree + 1) * (degree + 1)
    dL_dsh = wp.zeros(num_points * max_sh_coeffs, dtype=wp.vec3, device=DEVICE)

    dL_dscale = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)
    dL_drot = wp.zeros(num_points, dtype=wp.vec4, device=DEVICE)

    # Tile grid (ceil division so border tiles are covered)
    tile_grid = wp.vec3((image_width + TILE_M - 1) // TILE_M,
                        (image_height + TILE_N - 1) // TILE_N,
                        1)

    # --- Step 1: Compute loss gradients w.r.t. 2D parameters ---
    backward_render(
        ranges=ranges,
        point_list=point_list,
        width=image_width,
        height=image_height,
        bg_color=background_warp,
        tile_grid=tile_grid,
        points_xy_image=means2D_warp,
        conic_opacity=conic_opacity_warp,
        colors=rgb_warp,
        final_Ts=final_Ts,
        n_contrib=n_contrib,
        dL_dpixels=dL_dpixels_warp,
        dL_dmean2D=dL_dmean2D,
        dL_dconic2D=dL_dconic,
        dL_dopacity=dL_dopacity,
        dL_dcolors=dL_dcolor,
    )

    # --- Step 2: Compute gradients for 3D parameters ---
    backward_preprocess(
        num_points=num_points,
        means=means3D_warp,
        means_2d=means2D_warp,
        radii=radii_warp,
        sh_coeffs=shs_warp,
        scales=scales_warp,
        rotations=rotations_warp,
        viewmatrix=viewmatrix_warp,
        projmatrix=projmatrix_warp,
        fov_x=tan_fovx,
        fov_y=tan_fovy,
        focal_x=focal_x,
        focal_y=focal_y,
        cov3Ds=cov3Ds,
        conic_opacity=conic_opacity_warp,
        campos=campos_warp,
        clamped=clamped_warp,
        dL_dmean2D=dL_dmean2D,
        dL_dconic=dL_dconic,
        dL_dopacity=dL_dopacity,
        dL_dcolors=dL_dcolor,
        dL_dmeans=dL_dmean3D,
        dL_dsh=dL_dsh,
        dL_dscales=dL_dscale,
        dL_drots=dL_drot,
        sh_degree=degree
    )

    # Return all gradients in a dictionary for easy access
    return {
        'dL_dmean3D': dL_dmean3D,
        'dL_dcolor': dL_dcolor,
        'dL_dshs': dL_dsh,
        'dL_dopacity': dL_dopacity,
        'dL_dscale': dL_dscale,
        'dL_drot': dL_drot,
        # Include 2D gradients for completeness
        'dL_dmean2D': dL_dmean2D,
        'dL_dconic': dL_dconic,
        # Always zero-filled — see NOTE(review) at allocation above.
        'dL_dcov3D': dL_dcov3D
    }
1084
+
gs/config.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration settings and constants for 3D Gaussian Splatting with NeRF datasets.
"""
import warp as wp
import numpy as np
import random

# Fixed RNG seed for reproducible runs (seeds Python's `random` only).
SEED = 42
random.seed(SEED)

# Warp data types and constants (keep capitalized as they are types)
WP_FLOAT16 = wp.float16
WP_FLOAT32 = wp.float32
WP_INT = wp.int32
WP_VEC2 = wp.vec2
WP_VEC2H = wp.vec2h
# 6-component float vector: a symmetric 3x3 covariance stored as its
# upper triangle (xx, xy, xz, yy, yz, zz).
VEC6 = wp.types.vector(length=6, dtype=WP_FLOAT32)
DEVICE = "cuda"  # "cpu"  # Use "cpu" or "cuda"

# Screen-space tile dimensions used by the rasterizer kernels.
TILE_M = wp.constant(16)   # tile width in pixels
TILE_N = wp.constant(16)   # tile height in pixels
TILE_THREADS = wp.constant(256)  # threads per tile = TILE_M * TILE_N
23
+
24
+
25
class GaussianParams:
    """Parameters for 3D Gaussian Splatting.

    All settings are class attributes so they act as a global, mutable
    configuration; use ``update(**kwargs)`` to override values and
    ``get_config_dict()`` to snapshot them.
    """

    # Training parameters
    num_iterations = 3*7000//1  # Default number of training iterations
    num_points = 5000           # Initial number of Gaussian points

    # Simple learning rate scheduler configuration
    use_lr_scheduler = True
    # Learning rate scheduler configuration
    lr_scheduler_config = {
        'lr_pos': 1e-2,          # Initial learning rate for positions
        'lr_scale': 5e-3,        # Initial learning rate for scales
        'lr_rot': 5e-3,          # Initial learning rate for rotations
        'lr_sh': 2e-3,           # Initial learning rate for spherical harmonics
        'lr_opac': 5e-3,         # Initial learning rate for opacities
        'final_lr_factor': 0.01  # Final LR will be 1% of initial LR
    }

    # Optimization parameters
    densification_interval = 100  # Perform densification every N iterations
    pruning_interval = 100        # Perform pruning every N iterations
    opacity_reset_interval = 3000
    save_interval = 300           # Save checkpoint every N iterations
    adam_beta1 = 0.9              # Adam optimizer beta1 parameter
    adam_beta2 = 0.999            # Adam optimizer beta2 parameter
    adam_epsilon = 1e-8           # Adam optimizer epsilon parameter

    densify_grad_threshold = 0.0002
    cull_opacity_threshold = 0.005
    start_prune_iter = 500
    end_prune_iter = 15000
    percent_dense = 0.01
    max_allowed_prune_ratio = 1.0  # no limit on pruning ratio

    # Gaussian parameters
    initial_scale = 0.1   # Initial scale for Gaussian points
    scale_modifier = 1.0  # Scaling factor for Gaussian splats
    sh_degree = 3         # Spherical harmonics degree

    # Scene parameters
    scene_scale = 1.0  # Scale factor for the scene
    background_color = [1.0, 1.0, 1.0]  # White background for NeRF synthetic

    # Loss parameters
    lambda_dssim = 0.0  # Weight for SSIM loss (1.0 means only SSIM, 0.0 means only L1)

    # Depth loss parameters
    depth_l1_weight_init = 0.0   # Initial weight for depth L1 loss
    depth_l1_weight_final = 0.0  # Final weight for depth L1 loss
    depth_l1_delay_steps = 0     # Number of steps to delay depth loss
    depth_l1_delay_mult = 0.0    # Multiplier for delay rate

    near = 0.01  # Default near clipping plane
    far = 100.0  # Default far clipping plane

    @classmethod
    def get_depth_l1_weight(cls, step):
        """Compute the depth L1 loss weight for the current step.

        Args:
            step (int): Current training step

        Returns:
            float: Weight for depth L1 loss
        """
        if step < 0 or (cls.depth_l1_weight_init <= 0.0 and cls.depth_l1_weight_final <= 0.0):
            # Disable depth loss
            return 0.0

        if cls.depth_l1_delay_steps > 0:
            # A kind of reverse cosine decay
            delay_rate = cls.depth_l1_delay_mult + (1 - cls.depth_l1_delay_mult) * np.sin(
                0.5 * np.pi * np.clip(step / cls.depth_l1_delay_steps, 0, 1)
            )
        else:
            delay_rate = 1.0

        # Logarithmic interpolation between initial and final weights.
        # Clamp both weights to a tiny epsilon before taking the log:
        # previously, np.log(0.0) produced -inf (and NaN at t == 1) when
        # exactly one of the two weights was zero.
        eps = 1e-12
        w_init = max(cls.depth_l1_weight_init, eps)
        w_final = max(cls.depth_l1_weight_final, eps)
        t = np.clip(step / cls.num_iterations, 0, 1)
        log_lerp = np.exp(np.log(w_init) * (1 - t) + np.log(w_final) * t)

        return delay_rate * log_lerp

    @classmethod
    def update(cls, **kwargs):
        """Update parameters with new values.

        Raises:
            ValueError: If a keyword does not match an existing attribute.
        """
        for key, value in kwargs.items():
            if hasattr(cls, key):
                setattr(cls, key, value)
            else:
                raise ValueError(f"Unknown parameter: {key}")

    @classmethod
    def get_config_dict(cls):
        """Get parameters as a dictionary."""
        return {
            'num_iterations': cls.num_iterations,
            'num_points': cls.num_points,
            'densification_interval': cls.densification_interval,
            'pruning_interval': cls.pruning_interval,
            'scale_modifier': cls.scale_modifier,
            'sh_degree': cls.sh_degree,
            'background_color': cls.background_color,
            'save_interval': cls.save_interval,
            'adam_beta1': cls.adam_beta1,
            'adam_beta2': cls.adam_beta2,
            'adam_epsilon': cls.adam_epsilon,
            'initial_scale': cls.initial_scale,
            'scene_scale': cls.scene_scale,
            'near': cls.near,
            'far': cls.far,
            'lambda_dssim': cls.lambda_dssim,
            'depth_l1_weight_init': cls.depth_l1_weight_init,
            'depth_l1_weight_final': cls.depth_l1_weight_final,
            'depth_l1_delay_steps': cls.depth_l1_delay_steps,
            'depth_l1_delay_mult': cls.depth_l1_delay_mult,
            'densify_grad_threshold': cls.densify_grad_threshold,
            'cull_opacity_threshold': cls.cull_opacity_threshold,
            'start_prune_iter': cls.start_prune_iter,
            'end_prune_iter': cls.end_prune_iter,
            'use_lr_scheduler': cls.use_lr_scheduler,
            'lr_scheduler_config': cls.lr_scheduler_config,
            'max_allowed_prune_ratio': cls.max_allowed_prune_ratio,
        }
gs/create_training_video.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import glob
5
+ from tqdm import tqdm
6
+
7
def create_training_video(input_pattern, output_path, fps=10):
    """
    Create a video from training iteration images.

    Args:
        input_pattern: Glob pattern matching image files
            (e.g. 'output/steak_is/point_cloud/iteration_*/rendered_view.png'),
            or an already-ordered sequence of image paths.
        output_path: Path to save the output video
        fps: Frames per second for the output video
    """
    import re

    def iteration_number(path):
        # Extract the iteration index with a regex; unlike splitting on '/',
        # this also works with Windows '\\' separators and trailing filenames.
        match = re.search(r"iteration_(\d+)", path)
        return int(match.group(1)) if match else -1

    # Accept either a glob pattern (original behavior) or a pre-built list,
    # as passed by the --reverse path in __main__.
    if isinstance(input_pattern, str):
        image_files = sorted(glob.glob(input_pattern), key=iteration_number)
    else:
        image_files = list(input_pattern)

    if not image_files:
        print(f"No images found matching pattern: {input_pattern}")
        return

    print(f"Found {len(image_files)} image files")

    # Read first image to get dimensions; bail out clearly if unreadable
    # (cv2.imread returns None instead of raising).
    first_img = cv2.imread(image_files[0])
    if first_img is None:
        print(f"Could not read first image: {image_files[0]}")
        return
    h, w, _ = first_img.shape

    # Create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (w, h))

    try:
        # Add each image to the video
        for img_path in tqdm(image_files, desc="Creating video"):
            img = cv2.imread(img_path)
            if img is None:
                # Skip unreadable frames instead of crashing mid-video.
                continue

            # Optionally add iteration number as text overlay
            iteration = iteration_number(img_path)
            cv2.putText(img, f"Iteration {iteration}", (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            video.write(img)
    finally:
        # Always release the writer so a partially written video is flushed.
        video.release()
    print(f"Video created successfully: {output_path}")
48
+
49
# Add a simple UI to select images and set options
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Create a video from training iteration images')
    parser.add_argument('--input', default='output/steak_is/point_cloud/iteration_*/rendered_view.png',
                        help='Pattern to match image files')
    parser.add_argument('--output', default='training_progress.mp4',
                        help='Path to save the output video')
    parser.add_argument('--fps', type=int, default=10,
                        help='Frames per second for the output video')
    parser.add_argument('--reverse', action='store_true',
                        help='Reverse the order of images (show latest first)')

    args = parser.parse_args()

    if args.reverse:
        # Find all matching image files and sort them in reverse order
        # by the numeric iteration index embedded in each path.
        image_files = sorted(glob.glob(args.input),
                             key=lambda x: int(x.split('iteration_')[1].split('/')[0]),
                             reverse=True)
        if image_files:
            # NOTE(review): this passes a *list* of paths, but
            # create_training_video's documented parameter is a glob
            # pattern string — confirm the function accepts sequences.
            create_training_video(image_files, args.output, args.fps)
    else:
        create_training_video(args.input, args.output, args.fps)
gs/dataset_reader.py ADDED
@@ -0,0 +1 @@
 
 
1
+
gs/forward.py ADDED
@@ -0,0 +1,804 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import warp as wp
from utils.wp_utils import to_warp_array
from config import *
# Initialize Warp at import time (module has an intentional side effect).
wp.init()
print("Warp devices:", wp.get_devices())
# Define spherical harmonics constants
SH_C0 = 0.28209479177387814  # degree-0 SH basis constant, 1/(2*sqrt(pi))
SH_C1 = 0.4886025119029199   # degree-1 SH basis constant


# NOTE(review): duplicate of the `import warp as wp` at the top of the
# file — harmless but redundant.
import warp as wp
13
+
14
# Define the CUDA code snippets for bit reinterpretation
# (reinterprets the raw bits of a float as an unsigned 32-bit integer,
# without numeric conversion).
float_to_uint32_snippet = """
return reinterpret_cast<uint32_t&>(x);
"""

# Native warp function: the body is supplied by the C++/CUDA snippet
# above, so the Python body is intentionally just `...`.
@wp.func_native(float_to_uint32_snippet)
def float_bits_to_uint32(x: float) -> wp.uint32:
    ...
22
+
23
@wp.func
def ndc2pix(x: float, size: float) -> float:
    # Map a normalized-device coordinate in [-1, 1] to a continuous
    # pixel coordinate in [-0.5, size - 0.5].
    return ((x + 1.0) * size - 1.0) * 0.5
26
+
27
@wp.func
def get_rect(p: wp.vec2, max_radius: float, tile_grid: wp.vec3):
    # Compute the (min, max) tile rectangle touched by a splat centered at
    # pixel `p` with screen-space radius `max_radius`, clamped to the grid.
    # Extract grid dimensions
    grid_size_x = tile_grid[0]
    grid_size_y = tile_grid[1]

    # Inclusive lower-left tile: floor((p - r) / tile_size), clamped to [0, grid).
    rect_min_x = wp.min(wp.int32(grid_size_x), wp.int32(wp.max(wp.int32(0), wp.int32((p[0] - max_radius) / float(TILE_M)))))
    rect_min_y = wp.min(wp.int32(grid_size_y), wp.int32(wp.max(wp.int32(0), wp.int32((p[1] - max_radius) / float(TILE_N)))))

    # Exclusive upper-right tile: ceil((p + r) / tile_size) via the
    # (x + tile - 1) / tile trick, clamped the same way.
    rect_max_x = wp.min(wp.int32(grid_size_x), wp.int32(wp.max(wp.int32(0), wp.int32((p[0] + max_radius + float(TILE_M) - 1.0) / float(TILE_M)))))
    rect_max_y = wp.min(wp.int32(grid_size_y), wp.int32(wp.max(wp.int32(0), wp.int32((p[1] + max_radius + float(TILE_N) - 1.0) / float(TILE_N)))))

    return rect_min_x, rect_min_y, rect_max_x, rect_max_y
41
+
42
+
43
@wp.func
def compute_cov2d(p_orig: wp.vec3, cov3d: VEC6, view_matrix: wp.mat44,
                  tan_fovx: float, tan_fovy: float, width: float, height: float) -> wp.vec3:
    # Project a 3D covariance (upper triangle in `cov3d`) into screen space
    # (EWA splatting). Returns the symmetric 2D covariance as (xx, xy, yy).

    # Position in camera space.
    t = wp.vec4(p_orig[0], p_orig[1], p_orig[2], 1.0) * view_matrix
    # 1.3 factor widens the clamp slightly beyond the view frustum.
    limx = 1.3 * tan_fovx
    limy = 1.3 * tan_fovy
    # Clamp X/Y to stay inside frustum
    txtz = t[0] / t[2]
    tytz = t[1] / t[2]
    t[0] = min(limx, max(-limx, txtz)) * t[2]
    t[1] = min(limy, max(-limy, tytz)) * t[2]

    focal_x = width / (2.0 * tan_fovx)
    focal_y = height / (2.0 * tan_fovy)
    # compute Jacobian of the perspective projection at t
    # (third row zeroed: only the 2D screen covariance is needed).
    J = wp.mat33(
        focal_x / t[2], 0.0, -(focal_x * t[0]) / (t[2] * t[2]),
        0.0, focal_y / t[2], -(focal_y * t[1]) / (t[2] * t[2]),
        0.0, 0.0, 0.0
    )

    # Rotational part of the view matrix (world -> camera).
    W = wp.mat33(
        view_matrix[0, 0], view_matrix[0, 1], view_matrix[0, 2],
        view_matrix[1, 0], view_matrix[1, 1], view_matrix[1, 2],
        view_matrix[2, 0], view_matrix[2, 1], view_matrix[2, 2]
    )

    T = J * W

    # Expand the packed upper triangle into the full symmetric 3x3 matrix.
    Vrk = wp.mat33(
        cov3d[0], cov3d[1], cov3d[2],
        cov3d[1], cov3d[3], cov3d[4],
        cov3d[2], cov3d[4], cov3d[5]
    )

    # cov = T * Vrk^T * T^T (Vrk is symmetric, so transposing it is a no-op).
    cov = T * wp.transpose(Vrk) * wp.transpose(T)

    return wp.vec3(cov[0, 0], cov[0, 1], cov[1, 1])
82
+
83
@wp.func
def compute_cov3d(scale: wp.vec3, scale_mod: float, rot: wp.vec4) -> VEC6:
    # Build a 3D covariance from per-axis scales and a rotation quaternion,
    # returned as the upper triangle (xx, xy, xz, yy, yz, zz).
    # Create scaling matrix with modifier applied
    S = wp.mat33(
        scale_mod * scale[0], 0.0, 0.0,
        0.0, scale_mod * scale[1], 0.0,
        0.0, 0.0, scale_mod * scale[2]
    )
    # NOTE(review): quaternion components are passed as (rot[0..3]) directly;
    # confirm the stored layout matches wp.quaternion's (x, y, z, w) order.
    R = wp.quat_to_matrix(wp.quaternion(rot[0], rot[1], rot[2], rot[3]))
    M = R * S

    # Compute 3D covariance matrix: Sigma = M * M^T
    sigma = M * wp.transpose(M)

    return VEC6(sigma[0, 0], sigma[0, 1], sigma[0, 2], sigma[1, 1], sigma[1, 2], sigma[2, 2])
98
+
99
@wp.kernel
def wp_preprocess(
    # Per-Gaussian model parameters
    orig_points: wp.array(dtype=wp.vec3),
    scales: wp.array(dtype=wp.vec3),
    scale_modifier: float,
    rotations: wp.array(dtype=wp.vec4),

    opacities: wp.array(dtype=float),
    shs: wp.array(dtype=wp.vec3),   # flattened SH coeffs, 16 vec3 per Gaussian
    degree: int,                    # active SH degree (0-3)
    clamped: bool,                  # if True, clamp negative RGB to 0

    # Camera parameters
    view_matrix: wp.mat44,
    proj_matrix: wp.mat44,
    cam_pos: wp.vec3,

    W: int,
    H: int,

    tan_fovx: float,
    tan_fovy: float,

    focal_x: float,
    focal_y: float,

    # Per-Gaussian outputs
    radii: wp.array(dtype=int),
    points_xy_image: wp.array(dtype=wp.vec2),
    depths: wp.array(dtype=float),
    cov3Ds: wp.array(dtype=VEC6),
    rgb: wp.array(dtype=wp.vec3),
    conic_opacity: wp.array(dtype=wp.vec4),
    tile_grid: wp.vec3,
    tiles_touched: wp.array(dtype=int),
    clamped_state: wp.array(dtype=wp.vec3),

    # NOTE(review): prefiltered and antialiasing are never referenced in
    # this kernel body — confirm whether they are placeholders.
    prefiltered: bool,
    antialiasing: bool
):
    # One thread per Gaussian.
    # Get thread indices
    i = wp.tid()

    # For each Gaussian
    p_orig = orig_points[i]
    p_view = wp.vec4(p_orig[0], p_orig[1], p_orig[2], 1.0) * view_matrix

    # Near-plane cull: skip Gaussians closer than 0.2 in view depth.
    if p_view[2] < 0.2:
        return

    p_hom = wp.vec4(p_orig[0], p_orig[1], p_orig[2], 1.0) * proj_matrix

    # Perspective divide with a small epsilon to avoid division by zero.
    p_w = 1.0 / (p_hom[3] + 0.0000001)
    p_proj = wp.vec3(p_hom[0] * p_w, p_hom[1] * p_w, p_hom[2] * p_w)

    cov3d = compute_cov3d(scales[i], scale_modifier, rotations[i])

    cov3Ds[i] = cov3d
    # Compute 2D covariance matrix
    cov2d = compute_cov2d(p_orig, cov3d, view_matrix, tan_fovx, tan_fovy, float(W), float(H))

    # Constants
    h_var = 0.3  # screen-space dilation added to the diagonal
    W_float = float(W)
    H_float = float(H)
    C = 3  # RGB channels

    # Add blur/antialiasing factor to covariance
    # (det_cov, the undilated determinant, is computed but unused below —
    # presumably left over from an antialiasing correction; verify.)
    det_cov = cov2d[0] * cov2d[2] - cov2d[1] * cov2d[1]
    cov_with_blur = wp.vec3(cov2d[0] + h_var, cov2d[1], cov2d[2] + h_var)
    det_cov_plus_h_cov = cov_with_blur[0] * cov_with_blur[2] - cov_with_blur[1] * cov_with_blur[1]

    # Invert covariance (EWA algorithm)
    det = det_cov_plus_h_cov
    if det == 0.0:
        return

    # Conic = inverse of the dilated 2D covariance, stored as (a, b, c)
    # for the quadratic form a*dx^2 + 2*b*dx*dy + c*dy^2.
    det_inv = 1.0 / det
    conic = wp.vec3(
        cov_with_blur[2] * det_inv,
        -cov_with_blur[1] * det_inv,
        cov_with_blur[0] * det_inv
    )
    # Compute eigenvalues of covariance matrix to find screen-space extent
    mid = 0.5 * (cov_with_blur[0] + cov_with_blur[2])
    lambda1 = mid + wp.sqrt(wp.max(0.1, mid * mid - det))
    lambda2 = mid - wp.sqrt(wp.max(0.1, mid * mid - det))
    # 3-sigma radius of the largest eigenvalue, rounded up.
    my_radius = wp.ceil(3.0 * wp.sqrt(wp.max(lambda1, lambda2)))
    # Convert to pixel coordinates
    point_image = wp.vec2(ndc2pix(p_proj[0], W_float), ndc2pix(p_proj[1], H_float))

    # Get rectangle of affected tiles
    rect_min_x, rect_min_y, rect_max_x, rect_max_y = get_rect(point_image, my_radius, tile_grid)

    # Skip if rectangle has 0 area
    if (rect_max_x - rect_min_x) * (rect_max_y - rect_min_y) == 0:
        return
    # Compute color from spherical harmonics, evaluated along the
    # normalized view direction from the camera to the Gaussian.
    pos = p_orig
    dir_orig = pos - cam_pos
    dir = wp.normalize(dir_orig)
    x, y, z = dir[0], dir[1], dir[2]

    # Base offset for this Gaussian's SH coefficients
    base_idx = i * 16  # assuming degree 3 (16 coefficients)

    # Start with the DC component (degree 0)
    result = SH_C0 * shs[base_idx]

    # Add higher degree terms if requested
    if degree > 0:
        # Degree 1 terms
        result = result - SH_C1 * y * shs[base_idx + 1] + SH_C1 * z * shs[base_idx + 2] - SH_C1 * x * shs[base_idx + 3]

        if degree > 1:
            # Degree 2 terms
            xx = x*x
            yy = y*y
            zz = z*z
            xy = x*y
            yz = y*z
            xz = x*z

            # Degree 2 terms with hardcoded SH basis constants
            result = result + 1.0925484305920792 * xy * shs[base_idx + 4]
            result = result + (-1.0925484305920792) * yz * shs[base_idx + 5]
            result = result + 0.31539156525252005 * (2.0 * zz - xx - yy) * shs[base_idx + 6]
            result = result + (-1.0925484305920792) * xz * shs[base_idx + 7]
            result = result + 0.5462742152960396 * (xx - yy) * shs[base_idx + 8]

            if degree > 2:
                # Degree 3 terms with hardcoded SH basis constants
                result = result + (-0.5900435899266435) * y * (3.0 * xx - yy) * shs[base_idx + 9]
                result = result + 2.890611442640554 * xy * z * shs[base_idx + 10]
                result = result + (-0.4570457994644658) * y * (4.0 * zz - xx - yy) * shs[base_idx + 11]
                result = result + 0.3731763325901154 * z * (2.0 * zz - 3.0 * xx - 3.0 * yy) * shs[base_idx + 12]
                result = result + (-0.4570457994644658) * x * (4.0 * zz - xx - yy) * shs[base_idx + 13]
                result = result + 1.445305721320277 * z * (xx - yy) * shs[base_idx + 14]
                result = result + (-0.5900435899266435) * x * (xx - 3.0 * yy) * shs[base_idx + 15]

    # Shift SH output from roughly [-0.5, 0.5] to [0, 1].
    result = result + wp.vec3(0.5, 0.5, 0.5)

    # Track which color channels are clamped (using wp.vec3 instead of
    # separate uint32 values): store 1.0 if clamped, 0.0 if not. The
    # backward pass uses this to zero gradients for clamped channels.
    r_clamped = 0.0
    g_clamped = 0.0
    b_clamped = 0.0

    if result[0] < 0.0:
        r_clamped = 1.0
    if result[1] < 0.0:
        g_clamped = 1.0
    if result[2] < 0.0:
        b_clamped = 1.0

    clamped_state[i] = wp.vec3(r_clamped, g_clamped, b_clamped)

    if clamped:
        # RGB colors are clamped to positive values
        result = wp.vec3(
            wp.max(result[0], 0.0),
            wp.max(result[1], 0.0),
            wp.max(result[2], 0.0)
        )

    rgb[i] = result

    # Store computed data
    depths[i] = p_view[2]
    radii[i] = int(my_radius)
    points_xy_image[i] = point_image

    # Pack conic and opacity into single vec4
    conic_opacity[i] = wp.vec4(conic[0], conic[1], conic[2], opacities[i])
    # Store tile information (number of tiles this Gaussian overlaps)
    tiles_touched[i] = (rect_max_y - rect_min_y) * (rect_max_x - rect_min_x)
274
+
275
@wp.kernel
def wp_render_gaussians(
    # Output buffers
    rendered_image: wp.array2d(dtype=wp.vec3),
    depth_image: wp.array2d(dtype=float),

    # Tile data
    ranges: wp.array(dtype=wp.vec2i),
    point_list: wp.array(dtype=int),

    # Image parameters
    W: int,
    H: int,

    # Gaussian data
    points_xy_image: wp.array(dtype=wp.vec2),
    colors: wp.array(dtype=wp.vec3),
    conic_opacity: wp.array(dtype=wp.vec4),
    depths: wp.array(dtype=float),

    # Background color
    background: wp.vec3,

    # Tile grid info
    tile_grid: wp.vec3,

    # Track additional data
    final_Ts: wp.array2d(dtype=float),
    n_contrib: wp.array2d(dtype=int),
):
    """Alpha-blend the depth-sorted Gaussians of each tile, one thread per pixel.

    Launched with dim=(tiles_x, tiles_y, TILE_M, TILE_N): the first two tid
    components select the tile, the last two the pixel within it.
    ``ranges[tile_id]`` is the [start, end) slice of ``point_list`` holding the
    indices of the Gaussians overlapping that tile, already sorted
    front-to-back by the radix sort in the host driver.

    Outputs per pixel: blended color plus background weighted by the residual
    transmittance, accumulated inverse depth, the final transmittance in
    ``final_Ts`` and the index of the last blended Gaussian in ``n_contrib``.
    """
    tile_x, tile_y, tid_x, tid_y = wp.tid()

    # Guard against the launch grid over-covering the image in y.
    # NOTE(review): there is no matching tile_x guard — presumably the launch
    # dim in x always equals tile_grid[0]; confirm against the caller.
    if tile_y >= (H + TILE_N - 1) // TILE_N:
        return

    # Calculate pixel boundaries for this tile
    pix_min_x = tile_x * TILE_M
    pix_min_y = tile_y * TILE_N
    pix_max_x = wp.min(pix_min_x + TILE_M, W)
    pix_max_y = wp.min(pix_min_y + TILE_N, H)

    # Calculate pixel position for this thread
    pix_x = pix_min_x + tid_x
    pix_y = pix_min_y + tid_y

    # Check if this thread processes a valid pixel (edge tiles overhang)
    inside = (pix_x < W) and (pix_y < H)
    if not inside:
        return

    pixf_x = float(pix_x)
    pixf_y = float(pix_y)

    # Get start/end range of IDs to process for this tile
    tile_id = tile_y * int(tile_grid[0]) + tile_x
    range_start = ranges[tile_id][0]
    range_end = ranges[tile_id][1]

    # Initialize blending variables
    T = float(1.0)  # Transmittance (fraction of light still reaching the eye)
    r, g, b = float(0.0), float(0.0), float(0.0)  # Accumulated color
    expected_inv_depth = float(0.0)  # For depth calculation

    # Track the number of contributors to this pixel
    contributor_count = int(0)
    last_contributor = int(0)

    # Iterate over all Gaussians influencing this tile, front to back
    for i in range(range_start, range_end):
        # Get Gaussian ID
        gaussian_id = point_list[i]

        # Get Gaussian data
        xy = points_xy_image[gaussian_id]
        con_o = conic_opacity[gaussian_id]  # (conic_a, conic_b, conic_c, opacity)
        color = colors[gaussian_id]

        # Compute distance to Gaussian center
        d_x = xy[0] - pixf_x
        d_y = xy[1] - pixf_y

        # Increment contributor count for this pixel
        contributor_count += 1

        # Compute Gaussian power (exponent of the 2D Gaussian at this pixel)
        power = -0.5 * (con_o[0] * d_x * d_x + con_o[2] * d_y * d_y) - con_o[1] * d_x * d_y

        # Skip if power is positive (numerically degenerate conic)
        if power > 0.0:
            continue

        # Compute alpha from power and opacity (capped below full opacity)
        alpha = wp.min(0.99, con_o[3] * wp.exp(power))

        # Skip if alpha is too small to matter at 8-bit precision
        if alpha < (1.0 / 255.0):
            continue

        # Test if we're close to fully opaque
        test_T = T * (1.0 - alpha)
        if test_T < 0.0001:
            break  # Early termination if pixel is almost opaque

        # Accumulate color contribution (front-to-back "over" compositing)
        r += color[0] * alpha * T
        g += color[1] * alpha * T
        b += color[2] * alpha * T

        # Accumulate inverse depth with the same blend weights
        expected_inv_depth += (1.0 / depths[gaussian_id]) * alpha * T

        # Update transmittance
        T = test_T

        last_contributor = contributor_count

    # Store final transmittance (T) and contributor count
    final_Ts[pix_y, pix_x] = T
    n_contrib[pix_y, pix_x] = last_contributor

    # Write final color to output buffer (color + residual background)
    rendered_image[pix_y, pix_x] = wp.vec3(
        r + T * background[0],
        g + T * background[1],
        b + T * background[2]
    )

    # Write accumulated inverse depth to output buffer
    depth_image[pix_y, pix_x] = expected_inv_depth
407
+
408
@wp.kernel
def wp_duplicate_with_keys(
    points_xy_image: wp.array(dtype=wp.vec2),
    depths: wp.array(dtype=float),
    point_offsets: wp.array(dtype=int),
    point_list_keys_unsorted: wp.array(dtype=wp.int64),
    point_list_unsorted: wp.array(dtype=int),
    radii: wp.array(dtype=int),
    tile_grid: wp.vec3
):
    """Emit one (key, gaussian_id) pair for every tile each visible Gaussian touches.

    The 64-bit key packs the tile id in the high 32 bits and the depth's
    IEEE-754 bit pattern in the low 32 bits, so one radix sort over the keys
    groups entries by tile and orders them front-to-back within each tile.
    NOTE(review): the bit-pattern trick matches numeric ordering only for
    non-negative depths — confirm culling guarantees depth > 0 upstream.
    """
    tid = wp.tid()

    if tid >= points_xy_image.shape[0]:
        return

    # Gaussians culled during preprocessing have radius <= 0 and emit nothing.
    r = radii[tid]
    if r <= 0:
        return

    # Find the global offset into key/value buffers.
    # point_offsets is the inclusive prefix sum of tiles_touched, so the
    # previous element is this Gaussian's starting write position.
    offset = 0
    if tid > 0:
        offset = point_offsets[tid - 1]

    pos = points_xy_image[tid]
    depth_val = depths[tid]

    # Tile-space bounding rectangle of this Gaussian's screen footprint.
    rect_min_x, rect_min_y, rect_max_x, rect_max_y = get_rect(pos, float(r), tile_grid)

    for y in range(rect_min_y, rect_max_y):
        for x in range(rect_min_x, rect_max_x):
            tile_id = y * int(tile_grid[0]) + x
            # Convert to int64 to avoid overflow during bit shift
            tile_id_64 = wp.int64(tile_id)
            shifted = tile_id_64 << wp.int64(32)
            depth_bits = wp.int64(float_bits_to_uint32(depth_val))
            # Combine tile ID and depth into single key
            key = wp.int64(shifted) | depth_bits

            point_list_keys_unsorted[offset] = key
            point_list_unsorted[offset] = tid
            offset += 1
450
+
451
@wp.kernel
def wp_identify_tile_ranges(
    num_rendered: int,
    point_list_keys: wp.array(dtype=wp.int64),
    ranges: wp.array(dtype=wp.vec2i)  # Each range is (start, end)
):
    """Find each tile's [start, end) slice in the sorted key list.

    Keys carry the tile id in their high 32 bits, so after sorting every
    tile's entries are contiguous; a boundary exists wherever adjacent keys
    have different tile ids. ``ranges`` must arrive zero-initialized: tiles
    with no entries keep (0, 0), which the renderer treats as empty.
    """
    idx = wp.tid()

    if idx >= num_rendered:
        return

    key = point_list_keys[idx]
    # Recover the tile id from the high 32 bits of the key.
    curr_tile = int(key >> wp.int64(32))

    # Set start of range if first element or tile changed
    if idx == 0:
        ranges[curr_tile][0] = 0
    else:
        prev_key = point_list_keys[idx - 1]
        prev_tile = int(prev_key >> wp.int64(32))
        if curr_tile != prev_tile:
            # Close the previous tile's range and open this one's.
            ranges[prev_tile][1] = idx
            ranges[curr_tile][0] = idx

    # Set end of range if last element
    if idx == num_rendered - 1:
        ranges[curr_tile][1] = num_rendered
478
+
479
+
480
@wp.kernel
def wp_prefix_sum(input_array: wp.array(dtype=int),
                  output_array: wp.array(dtype=int)):
    """Serial inclusive prefix sum: output[i] = input[0] + ... + input[i].

    This is O(n) sequential work and MUST be launched with dim=1.
    NOTE(review): only the first assignment is guarded by ``tid == 0`` — the
    loop below runs on every thread, so a launch with dim > 1 would race.
    """
    tid = wp.tid()

    if tid == 0:
        output_array[0] = input_array[0]

    # Perform prefix sum; each element depends on the one just written,
    # hence the single-thread requirement.
    for i in range(1, input_array.shape[0]):
        output_array[i] = output_array[i-1] + input_array[i]
491
+
492
+
493
@wp.kernel
def wp_copy_int64(src: wp.array(dtype=wp.int64), dst: wp.array(dtype=wp.int64), count: int):
    """Copy the first `count` int64 elements of `src` into `dst`, one per thread."""
    tid = wp.tid()
    if tid >= count:
        return
    dst[tid] = src[tid]
498
+
499
@wp.kernel
def wp_copy_int(src: wp.array(dtype=int), dst: wp.array(dtype=int), count: int):
    """Copy the first `count` int elements of `src` into `dst`, one per thread."""
    tid = wp.tid()
    if tid >= count:
        return
    dst[tid] = src[tid]
504
+
505
@wp.kernel
def track_pixel_stats(
    rendered_image: wp.array2d(dtype=wp.vec3),
    depth_image: wp.array2d(dtype=float),
    background: wp.vec3,
    final_Ts: wp.array2d(dtype=float),
    n_contrib: wp.array2d(dtype=int),
    W: int,
    H: int
):
    """Fallback pass: populate final_Ts / n_contrib for pixels the renderer left at 0.

    This is a heuristic, not an exact computation: a pixel "has content" if it
    differs from the background by more than 0.01 in any channel, and its
    transmittance is then approximated from that difference. Pixels already
    written by wp_render_gaussians (final_Ts != 0) are left untouched.
    """
    x, y = wp.tid()

    if x >= W or y >= H:
        return

    # Get the rendered pixel
    pixel = rendered_image[y, x]

    # Calculate approximate alpha transparency by checking for background contribution
    # If the pixel has no contribution from background, final_T should be close to 0
    # If it's mostly background, final_T will be close to 1
    diff_r = abs(pixel[0] - background[0])
    diff_g = abs(pixel[1] - background[1])
    diff_b = abs(pixel[2] - background[2])
    has_content = (diff_r > 0.01) or (diff_g > 0.01) or (diff_b > 0.01)

    if has_content:
        # Approximate final_T - in a real scenario this should already be tracked during rendering
        # We're just making sure it's populated for existing renderings
        if final_Ts[y, x] == 0.0:
            # If final_Ts hasn't been set during rendering, approximate it
            # Higher difference from background means lower T
            max_diff = max(diff_r, max(diff_g, diff_b))
            final_Ts[y, x] = 1.0 - min(0.99, max_diff)

        # Set n_contrib to 1 if we know the pixel has content but no contributor count
        if n_contrib[y, x] == 0:
            n_contrib[y, x] = 1
544
+
545
def render_gaussians(
    background,
    means3D,
    colors=None,
    opacity=None,
    scales=None,
    rotations=None,
    scale_modifier=1.0,
    viewmatrix=None,
    projmatrix=None,
    tan_fovx=0.5,
    tan_fovy=0.5,
    image_height=256,
    image_width=256,
    sh=None,
    degree=3,
    campos=None,
    prefiltered=False,
    antialiasing=False,
    clamped=True,
    debug=False,
):
    """Render 3D Gaussians using Warp.

    Pipeline (mirrors the tile-based 3DGS rasterizer):
      1. wp_preprocess: project Gaussians, compute radii/conics/colors.
      2. wp_prefix_sum over tiles_touched -> per-Gaussian output offsets.
      3. wp_duplicate_with_keys: one (tile|depth) key per touched tile.
      4. radix sort of the keys -> per-tile, front-to-back ordering.
      5. wp_identify_tile_ranges: per-tile [start, end) slices.
      6. wp_render_gaussians: per-pixel alpha blending.
      7. track_pixel_stats: fallback population of final_Ts / n_contrib.

    Args:
        background: Background color tensor of shape (3,)
        means3D: 3D positions tensor of shape (N, 3)
        colors: Optional RGB colors tensor of shape (N, 3); only consulted for
            the debug summary here — shading comes from the SH coefficients
            inside wp_preprocess
        opacity: Opacity values tensor of shape (N, 1) or (N,)
        scales: Scales tensor of shape (N, 3)
        rotations: Rotation quaternions of shape (N, 4)
        scale_modifier: Global scale modifier (float)
        viewmatrix: View matrix tensor of shape (4, 4)
        projmatrix: Projection matrix tensor of shape (4, 4)
        tan_fovx: Tangent of the horizontal field of view
        tan_fovy: Tangent of the vertical field of view
        image_height: Height of the output image
        image_width: Width of the output image
        sh: Spherical harmonics coefficients tensor of shape (N, D, 3)
        degree: Degree of spherical harmonics
        campos: Camera position tensor of shape (3,)
        prefiltered: Whether input Gaussians are prefiltered
        antialiasing: Whether to apply antialiasing
        clamped: Whether to clamp the colors
        debug: Whether to print debug information

    Returns:
        Tuple of (rendered_image, depth_image, intermediate_buffers)

    Note:
        Requires torch for the prefix-sum readback (``wp.to_torch``).
    """
    rendered_image = wp.zeros((image_height, image_width), dtype=wp.vec3, device=DEVICE)
    depth_image = wp.zeros((image_height, image_width), dtype=float, device=DEVICE)

    # Create additional buffers for tracking transparency and contributors
    final_Ts = wp.zeros((image_height, image_width), dtype=float, device=DEVICE)
    n_contrib = wp.zeros((image_height, image_width), dtype=int, device=DEVICE)

    background_warp = wp.vec3(background[0], background[1], background[2])
    points_warp = to_warp_array(means3D, wp.vec3)
    # SH coefficients should be shape (n, 16, 3)
    # Convert to a flattened array but preserve the structure
    sh_data = sh.reshape(-1, 3) if hasattr(sh, 'reshape') else sh
    shs_warp = to_warp_array(sh_data, wp.vec3)

    # Handle other parameters
    opacities_warp = to_warp_array(opacity, float, flatten=True)
    scales_warp = to_warp_array(scales, wp.vec3)
    rotations_warp = to_warp_array(rotations, wp.vec4)

    # Handle camera parameters (accept either wp types or flattenable arrays)
    view_matrix_warp = wp.mat44(viewmatrix.flatten()) if not isinstance(viewmatrix, wp.mat44) else viewmatrix
    proj_matrix_warp = wp.mat44(projmatrix.flatten()) if not isinstance(projmatrix, wp.mat44) else projmatrix
    campos_warp = wp.vec3(campos[0], campos[1], campos[2]) if not isinstance(campos, wp.vec3) else campos

    # Calculate tile grid for spatial optimization (ceil-divide image by tile size)
    tile_grid = wp.vec3((image_width + TILE_M - 1) // TILE_M,
                        (image_height + TILE_N - 1) // TILE_N,
                        1)

    # Preallocate buffers for preprocessed data (one entry per Gaussian)
    num_points = points_warp.shape[0]
    radii = wp.zeros(num_points, dtype=int, device=DEVICE)
    points_xy_image = wp.zeros(num_points, dtype=wp.vec2, device=DEVICE)
    depths = wp.zeros(num_points, dtype=float, device=DEVICE)
    cov3Ds = wp.zeros(num_points, dtype=VEC6, device=DEVICE)
    rgb = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)
    conic_opacity = wp.zeros(num_points, dtype=wp.vec4, device=DEVICE)
    tiles_touched = wp.zeros(num_points, dtype=int, device=DEVICE)

    # Add clamped_state buffer to track which color channels are clamped
    clamped_state = wp.zeros(num_points, dtype=wp.vec3, device=DEVICE)

    if debug:
        print(f"\nWARP RENDERING: {image_width}x{image_height} image, {num_points} gaussians")
        print(f"Colors: {'from SH' if colors is None else 'provided'}, SH degree: {degree}")
        print(f"Antialiasing: {antialiasing}, Prefiltered: {prefiltered}")

    # Launch preprocessing kernel: projection, covariance, SH shading, tile counts
    wp.launch(
        kernel=wp_preprocess,
        dim=(num_points,),
        inputs=[
            points_warp,  # orig_points
            scales_warp,  # scales
            scale_modifier,  # scale_modifier
            rotations_warp,  # rotations_quat
            opacities_warp,  # opacities
            shs_warp,  # shs
            degree,
            clamped,  # clamped
            view_matrix_warp,  # view_matrix
            proj_matrix_warp,  # proj_matrix
            campos_warp,  # cam_pos
            image_width,  # W
            image_height,  # H
            tan_fovx,  # tan_fovx
            tan_fovy,  # tan_fovy
            image_width / (2.0 * tan_fovx),  # focal_x
            image_height / (2.0 * tan_fovy),  # focal_y
            radii,  # radii
            points_xy_image,  # points_xy_image
            depths,  # depths
            cov3Ds,  # cov3Ds
            rgb,  # rgb
            conic_opacity,  # conic_opacity
            tile_grid,  # tile_grid
            tiles_touched,  # tiles_touched
            clamped_state,  # clamped_state - now using wp.vec3
            prefiltered,  # prefiltered
            antialiasing  # antialiasing
        ],
    )
    # Inclusive prefix sum of per-Gaussian tile counts gives each Gaussian's
    # write offset for key duplication (wp_prefix_sum is serial, hence dim=1).
    point_offsets = wp.zeros(num_points, dtype=int, device=DEVICE)
    wp.launch(
        kernel=wp_prefix_sum,
        dim=1,
        inputs=[
            tiles_touched,
            point_offsets
        ]
    )
    num_rendered = int(wp.to_torch(point_offsets)[-1].item())  # total number of duplicated entries
    if num_rendered > (1 << 30):
        # radix sort needs 2x memory
        raise ValueError("Number of rendered points exceeds the maximum supported by Warp.")

    point_list_keys_unsorted = wp.zeros(num_rendered, dtype=wp.int64, device=DEVICE)
    point_list_unsorted = wp.zeros(num_rendered, dtype=int, device=DEVICE)
    point_list_keys = wp.zeros(num_rendered, dtype=wp.int64, device=DEVICE)
    point_list = wp.zeros(num_rendered, dtype=int, device=DEVICE)
    wp.launch(
        kernel=wp_duplicate_with_keys,
        dim=num_points,
        inputs=[
            points_xy_image,
            depths,
            point_offsets,
            point_list_keys_unsorted,
            point_list_unsorted,
            radii,
            tile_grid
        ]
    )
    # wp.utils.radix_sort_pairs requires arrays with 2x capacity as scratch
    # space, so copy into padded buffers before sorting.
    point_list_keys_unsorted_padded = wp.zeros(num_rendered * 2, dtype=wp.int64, device=DEVICE)
    point_list_unsorted_padded = wp.zeros(num_rendered * 2, dtype=int, device=DEVICE)

    # Copy data to padded arrays
    wp.copy(point_list_keys_unsorted_padded, point_list_keys_unsorted)
    wp.copy(point_list_unsorted_padded, point_list_unsorted)
    wp.utils.radix_sort_pairs(
        point_list_keys_unsorted_padded,  # keys to sort
        point_list_unsorted_padded,  # values to sort along with keys
        num_rendered  # number of elements to sort
    )

    # Copy the sorted first halves back into tight buffers.
    wp.launch(
        kernel=wp_copy_int64,
        dim=num_rendered,
        inputs=[
            point_list_keys_unsorted_padded,
            point_list_keys,
            num_rendered
        ]
    )

    wp.launch(
        kernel=wp_copy_int,
        dim=num_rendered,
        inputs=[
            point_list_unsorted_padded,
            point_list,
            num_rendered
        ]
    )

    # Per-tile [start, end) ranges into the sorted point list; must be
    # zero-initialized so empty tiles read as (0, 0).
    tile_count = int(tile_grid[0] * tile_grid[1])
    ranges = wp.zeros(tile_count, dtype=wp.vec2i, device=DEVICE)  # each is (start, end)

    if num_rendered > 0:
        wp.launch(
            kernel=wp_identify_tile_ranges,
            dim=num_rendered,
            inputs=[
                num_rendered,
                point_list_keys,
                ranges
            ]
        )

    # Main rasterization: one thread per pixel, grouped by tile.
    wp.launch(
        kernel=wp_render_gaussians,
        dim=(int(tile_grid[0]), int(tile_grid[1]), TILE_M, TILE_N),
        inputs=[
            rendered_image,  # Output color image
            depth_image,  # Output depth image
            ranges,  # Tile ranges
            point_list,  # Sorted point indices
            image_width,  # Image width
            image_height,  # Image height
            points_xy_image,  # 2D points
            rgb,  # Precomputed colors
            conic_opacity,  # Conic matrices and opacities
            depths,  # Depth values
            background_warp,  # Background color
            tile_grid,  # Tile grid configuration
            final_Ts,  # Final transparency values
            n_contrib,  # Number of contributors per pixel
        ]
    )

    # Launch the pixel stats tracking kernel as a fallback
    # to make sure final_Ts and n_contrib are populated
    # This is especially important for existing rendered pixels
    wp.launch(
        kernel=track_pixel_stats,
        dim=(image_width, image_height),
        inputs=[
            rendered_image,
            depth_image,
            background_warp,
            final_Ts,
            n_contrib,
            image_width,
            image_height
        ]
    )

    return rendered_image, depth_image, {
        "radii": radii,
        "point_offsets": point_offsets,
        "points_xy_image": points_xy_image,
        "depths": depths,
        "colors": rgb,
        "cov3Ds": cov3Ds,
        "conic_opacity": conic_opacity,
        "point_list": point_list,
        "ranges": ranges,
        "final_Ts": final_Ts,  # Add final_Ts to intermediate buffers
        "n_contrib": n_contrib,  # Add contributor count to intermediate buffers
        "clamped_state": clamped_state  # Add clamped state to intermediate buffers
    }
gs/lib64 ADDED
@@ -0,0 +1 @@
 
 
1
+ lib
gs/loss.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warp as wp
2
+ import numpy as np
3
+ from config import DEVICE
4
+ from utils.wp_utils import wp_vec3_mul_element
5
+
6
+ # Constants for SSIM calculation
7
+ C1 = 0.01 ** 2
8
+ C2 = 0.03 ** 2
9
+ WINDOW_SIZE = 11
10
+
11
@wp.kernel
def l1_loss_kernel(
    rendered: wp.array2d(dtype=wp.vec3),
    target: wp.array2d(dtype=wp.vec3),
    loss_buffer: wp.array(dtype=float),
    width: int,
    height: int
):
    """Accumulate the summed per-channel |rendered - target| into loss_buffer[0].

    Launched with dim=(width, height); images are indexed [row, col] = [j, i].
    The caller normalizes the accumulated sum (see l1_loss).
    """
    i, j = wp.tid()
    if i >= width or j >= height:
        return

    # Compute L1 difference for each pixel component
    rendered_pixel = rendered[j, i]
    target_pixel = target[j, i]
    diff = wp.abs(rendered_pixel - target_pixel)
    l1_diff = diff[0] + diff[1] + diff[2]

    # Atomic add into the single global accumulator cell
    wp.atomic_add(loss_buffer, 0, l1_diff)
31
+
32
+
33
@wp.kernel
def gaussian_kernel(
    kernel: wp.array(dtype=float),
    sigma: float,
    kernel_size: int
):
    """Fill `kernel` with unnormalized 1D Gaussian weights.

    kernel[i] = exp(-(i - center)^2 / (2 * sigma^2)) with
    center = kernel_size // 2, so the peak value 1.0 sits at the center
    index. No normalization is applied here: ssim_kernel divides by the
    accumulated weight sum instead.
    """
    i = wp.tid()
    if i >= kernel_size:
        return

    center = kernel_size // 2
    x = i - center
    kernel[i] = wp.exp(-1.0 * float(x * x) / (2.0 * sigma * sigma))
46
+
47
@wp.kernel
def ssim_kernel(
    rendered: wp.array2d(dtype=wp.vec3),
    target: wp.array2d(dtype=wp.vec3),
    gaussian_weights: wp.array(dtype=float),
    ssim_buffer: wp.array(dtype=float),
    width: int,
    height: int,
    window_size: int
):
    """Accumulate the per-pixel SSIM (averaged over RGB) into ssim_buffer[0].

    For each pixel, Gaussian-weighted means, variances and covariance are
    computed over a window of `window_size`, then the standard SSIM formula
    is applied per channel. The caller divides the accumulated sum by the
    pixel count to obtain the mean SSIM.

    `gaussian_weights` is the 1D window produced by `gaussian_kernel`: its
    peak is at index window_size // 2 and it is symmetric, so the weight for
    a pixel at distance d from the center is gaussian_weights[center + d].
    The 2D weight is the separable product of the two 1D lookups.
    """
    i, j = wp.tid()
    if i >= width or j >= height:
        return

    # Constants for numerical stability (standard SSIM C1/C2 with L = 1)
    c1 = 0.01 * 0.01
    c2 = 0.03 * 0.03

    # We'll compute SSIM in a local window around each pixel
    half_window = window_size // 2

    # Initialize accumulators
    mu1 = wp.vec3(0.0, 0.0, 0.0)
    mu2 = wp.vec3(0.0, 0.0, 0.0)
    sigma1 = wp.vec3(0.0, 0.0, 0.0)
    sigma2 = wp.vec3(0.0, 0.0, 0.0)
    sigma12 = wp.vec3(0.0, 0.0, 0.0)
    weight_sum = float(0.0)

    # Calculate weighted means and variances over the (image-clamped) window
    for y in range(max(0, j - half_window), min(height, j + half_window + 1)):
        for x in range(max(0, i - half_window), min(width, i + half_window + 1)):
            # Distance from the window center along each axis
            wy = abs(y - j)
            wx = abs(x - i)
            if wx <= half_window and wy <= half_window:
                # BUGFIX: the weight table peaks at index half_window, so a
                # distance d maps to index half_window + d (symmetric table).
                # Indexing by the raw distance inverted the window, weighting
                # the periphery most and the center pixel least.
                w = gaussian_weights[half_window + wx] * gaussian_weights[half_window + wy]

                # Get pixels
                p1 = rendered[y, x]
                p2 = target[y, x]

                # Accumulate weighted values
                mu1 += p1 * w
                mu2 += p2 * w
                sigma1 += wp_vec3_mul_element(p1, p1) * w
                sigma2 += wp_vec3_mul_element(p2, p2) * w
                sigma12 += wp_vec3_mul_element(p1, p2) * w
                weight_sum += w

    # Normalize by weights (window may be clipped at image borders)
    if weight_sum > 0.0:
        mu1 /= weight_sum
        mu2 /= weight_sum
        sigma1 /= weight_sum
        sigma2 /= weight_sum
        sigma12 /= weight_sum

    # Calculate variance and covariance: E[X^2] - E[X]^2, E[XY] - E[X]E[Y]
    sigma1 = sigma1 - wp_vec3_mul_element(mu1, mu1)
    sigma2 = sigma2 - wp_vec3_mul_element(mu2, mu2)
    sigma12 = sigma12 - wp_vec3_mul_element(mu1, mu2)

    # Calculate SSIM for each channel
    ssim_r = ((2.0 * mu1[0] * mu2[0] + c1) * (2.0 * sigma12[0] + c2)) / ((mu1[0] * mu1[0] + mu2[0] * mu2[0] + c1) * (sigma1[0] + sigma2[0] + c2))
    ssim_g = ((2.0 * mu1[1] * mu2[1] + c1) * (2.0 * sigma12[1] + c2)) / ((mu1[1] * mu1[1] + mu2[1] * mu2[1] + c1) * (sigma1[1] + sigma2[1] + c2))
    ssim_b = ((2.0 * mu1[2] * mu2[2] + c1) * (2.0 * sigma12[2] + c2)) / ((mu1[2] * mu1[2] + mu2[2] * mu2[2] + c1) * (sigma1[2] + sigma2[2] + c2))

    # Average SSIM across channels
    ssim_val = (ssim_r + ssim_g + ssim_b) / 3.0

    # Atomic add to SSIM buffer
    wp.atomic_add(ssim_buffer, 0, ssim_val)
120
+
121
@wp.kernel
def backprop_l1_pixel_gradients(
    rendered: wp.array2d(dtype=wp.vec3),
    target: wp.array2d(dtype=wp.vec3),
    pixel_grad: wp.array2d(dtype=wp.vec3),
    width: int,
    height: int,
    l1_weight: float
):
    """Write the per-pixel gradient of the weighted L1 loss into pixel_grad.

    d|r - t|/dr = sign(r - t) per channel, scaled by l1_weight (the caller
    folds the loss normalization into this weight). Overwrites pixel_grad
    rather than accumulating into it.
    """
    i, j = wp.tid()
    if i >= width or j >= height:
        return

    # Compute gradient (sign function for L1 loss)
    rendered_pixel = rendered[j, i]
    target_pixel = target[j, i]

    # Sign function for L1 gradient
    l1_grad = wp.vec3(
        l1_weight * wp.sign(rendered_pixel[0] - target_pixel[0]),
        l1_weight * wp.sign(rendered_pixel[1] - target_pixel[1]),
        l1_weight * wp.sign(rendered_pixel[2] - target_pixel[2])
    )

    # Store L1 gradients
    pixel_grad[j, i] = l1_grad
147
+
148
def l1_loss(rendered, target):
    """Compute the mean L1 loss between rendered and target images.

    Args:
        rendered: (H, W) wp.array2d of wp.vec3, or host data convertible
            to one.
        target: Same shape/type as `rendered`.

    Returns:
        float: sum of per-channel absolute differences, normalized by
        width * height * 3 (mean over pixels and channels).
    """
    height, width = rendered.shape[0], rendered.shape[1]

    # Upload to device only when the caller passed host data.
    d_rendered = rendered if isinstance(rendered, wp.array) else wp.array(rendered, dtype=wp.vec3, device=DEVICE)
    d_target = target if isinstance(target, wp.array) else wp.array(target, dtype=wp.vec3, device=DEVICE)

    # Single-cell accumulator; the kernel atomically adds per-pixel sums.
    loss_buffer = wp.zeros(1, dtype=float, device=DEVICE)

    wp.launch(
        kernel=l1_loss_kernel,
        dim=(width, height),
        inputs=[d_rendered, d_target, loss_buffer, width, height]
    )

    # Single device->host readback, normalized by pixel and channel count.
    # (The previous version kept an unused `np_loss_buffer` local that forced
    # a second redundant .numpy() transfer.)
    return float(loss_buffer.numpy()[0]) / (width * height * 3)
177
+
178
def ssim(rendered, target):
    """Compute the mean SSIM between rendered and target images.

    Args:
        rendered: (H, W) wp.array2d of wp.vec3, or host data convertible
            to one.
        target: Same shape/type as `rendered`.

    Returns:
        float: per-pixel SSIM (averaged over RGB channels) averaged over
        all pixels.
    """
    height, width = rendered.shape[0], rendered.shape[1]

    # Upload to device only when the caller passed host data.
    d_rendered = rendered if isinstance(rendered, wp.array) else wp.array(rendered, dtype=wp.vec3, device=DEVICE)
    d_target = target if isinstance(target, wp.array) else wp.array(target, dtype=wp.vec3, device=DEVICE)

    # Precompute the 1D Gaussian window (sigma = 1.5, the standard SSIM setup)
    kernel_size = WINDOW_SIZE
    gaussian_weights = wp.zeros(kernel_size, dtype=float, device=DEVICE)
    wp.launch(
        gaussian_kernel,
        dim=kernel_size,
        inputs=[gaussian_weights, 1.5, kernel_size]
    )

    # Single-cell accumulator; the kernel atomically adds per-pixel SSIM.
    # (Removed the unused `pixel_count` device allocation the previous
    # version created but never passed to any kernel.)
    ssim_buffer = wp.zeros(1, dtype=float, device=DEVICE)

    wp.launch(
        ssim_kernel,
        dim=(width, height),
        inputs=[d_rendered, d_target, gaussian_weights, ssim_buffer, width, height, kernel_size]
    )

    # Average over all pixels (single device->host readback).
    return float(ssim_buffer.numpy()[0]) / (width * height)
216
+
217
def compute_image_gradients(rendered, target, lambda_dssim=0.2):
    """Compute per-pixel gradients of the combined L1 + SSIM loss.

    Only the L1 term is implemented so far; the SSIM gradient remains a TODO.

    Args:
        rendered: (H, W) wp.array2d of wp.vec3 (or host data convertible).
        target: Same shape/type as `rendered`.
        lambda_dssim: Weight of the (not yet implemented) SSIM term; the L1
            term is scaled by (1 - lambda_dssim).

    Returns:
        (H, W) wp.array2d of wp.vec3 holding the per-pixel loss gradient.
    """
    height = rendered.shape[0]
    width = rendered.shape[1]

    # Promote host data to device arrays; device arrays pass through as-is.
    d_rendered = rendered if isinstance(rendered, wp.array) else wp.array(rendered, dtype=wp.vec3, device=DEVICE)
    d_target = target if isinstance(target, wp.array) else wp.array(target, dtype=wp.vec3, device=DEVICE)

    # Output gradient image.
    pixel_grad = wp.zeros((height, width), dtype=wp.vec3, device=DEVICE)

    # L1 weight folds in the mean normalization over pixels and channels.
    l1_weight = (1.0 - lambda_dssim) / (height * width * 3.0)
    wp.launch(
        backprop_l1_pixel_gradients,
        dim=(width, height),
        inputs=[d_rendered, d_target, pixel_grad, width, height, l1_weight]
    )

    # TODO: add the SSIM gradient contribution (lambda_dssim term).
    return pixel_grad
245
+
246
+
247
@wp.kernel
def depth_loss_kernel(
    rendered_depth: wp.array2d(dtype=float),
    target_depth: wp.array2d(dtype=float),
    depth_mask: wp.array2d(dtype=float),
    loss_buffer: wp.array(dtype=float),
    width: int,
    height: int
):
    """Accumulate the masked L1 inverse-depth difference into loss_buffer[0].

    `depth_mask` is a per-pixel multiplicative weight (0.0 disables a pixel).
    Per the variable names, both maps hold inverse depths; the L1 difference
    is taken directly on the stored values.
    """
    i, j = wp.tid()
    if i >= width or j >= height:
        return

    # Get depths and mask
    rendered_inv_depth = rendered_depth[j, i]
    target_inv_depth = target_depth[j, i]
    mask = depth_mask[j, i]

    # Compute L1 difference for inverse depths, weighted by the mask
    diff = wp.abs(rendered_inv_depth - target_inv_depth) * mask

    # Atomic add into the single global accumulator cell
    wp.atomic_add(loss_buffer, 0, diff)
270
+
271
def depth_loss(rendered_depth, target_depth, depth_mask):
    """Masked mean L1 loss between rendered and target inverse-depth maps.

    Args:
        rendered_depth: (H, W) float wp.array2d (or host data convertible).
        target_depth: Same shape/type as `rendered_depth`.
        depth_mask: (H, W) float per-pixel weights (0 disables a pixel).

    Returns:
        float: accumulated masked L1 difference normalized by pixel count.
    """
    height = rendered_depth.shape[0]
    width = rendered_depth.shape[1]

    # Promote host data to device arrays; device arrays pass through as-is.
    d_rendered_depth = rendered_depth if isinstance(rendered_depth, wp.array) else wp.array(rendered_depth, dtype=float, device=DEVICE)
    d_target_depth = target_depth if isinstance(target_depth, wp.array) else wp.array(target_depth, dtype=float, device=DEVICE)
    d_depth_mask = depth_mask if isinstance(depth_mask, wp.array) else wp.array(depth_mask, dtype=float, device=DEVICE)

    # Single-cell accumulator filled by atomic adds in the kernel.
    loss_buffer = wp.zeros(1, dtype=float, device=DEVICE)

    wp.launch(
        kernel=depth_loss_kernel,
        dim=(width, height),
        inputs=[d_rendered_depth, d_target_depth, d_depth_mask, loss_buffer, width, height]
    )

    # Normalize the accumulated sum by the pixel count.
    return float(loss_buffer.numpy()[0]) / (width * height)
gs/optimizer.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warp as wp
2
+ from utils.wp_utils import to_warp_array, wp_vec3_mul_element, wp_vec3_add_element, wp_vec3_sqrt, wp_vec3_div_element, wp_vec3_clamp
3
+ from config import *
4
+
5
+ @wp.kernel
6
+ def adam_update(
7
+ # Parameters
8
+ positions: wp.array(dtype=wp.vec3),
9
+ scales: wp.array(dtype=wp.vec3),
10
+ rotations: wp.array(dtype=wp.vec4),
11
+ opacities: wp.array(dtype=float),
12
+ shs: wp.array(dtype=wp.vec3),
13
+
14
+ # Gradients
15
+ pos_grads: wp.array(dtype=wp.vec3),
16
+ scale_grads: wp.array(dtype=wp.vec3),
17
+ rot_grads: wp.array(dtype=wp.vec4),
18
+ opacity_grads: wp.array(dtype=float),
19
+ sh_grads: wp.array(dtype=wp.vec3),
20
+
21
+ # First moments (m)
22
+ m_positions: wp.array(dtype=wp.vec3),
23
+ m_scales: wp.array(dtype=wp.vec3),
24
+ m_rotations: wp.array(dtype=wp.vec4),
25
+ m_opacities: wp.array(dtype=float),
26
+ m_shs: wp.array(dtype=wp.vec3),
27
+
28
+ # Second moments (v)
29
+ v_positions: wp.array(dtype=wp.vec3),
30
+ v_scales: wp.array(dtype=wp.vec3),
31
+ v_rotations: wp.array(dtype=wp.vec4),
32
+ v_opacities: wp.array(dtype=float),
33
+ v_shs: wp.array(dtype=wp.vec3),
34
+
35
+ num_points: int,
36
+ lr_pos: float,
37
+ lr_scale: float,
38
+ lr_rot: float,
39
+ lr_opac: float,
40
+ lr_sh: float,
41
+ beta1: float,
42
+ beta2: float,
43
+ epsilon: float,
44
+ iteration: int
45
+ ):
46
+ i = wp.tid()
47
+ if i >= num_points:
48
+ return
49
+
50
+ # Bias correction terms
51
+ bias_correction1 = 1.0 - wp.pow(beta1, float(iteration + 1))
52
+ bias_correction2 = 1.0 - wp.pow(beta2, float(iteration + 1))
53
+
54
+ # Update positions
55
+ m_positions[i] = beta1 * m_positions[i] + (1.0 - beta1) * pos_grads[i]
56
+ # Use the helper function for element-wise multiplication
57
+ v_positions[i] = beta2 * v_positions[i] + (1.0 - beta2) * wp_vec3_mul_element(pos_grads[i], pos_grads[i])
58
+ # Use distinct names for corrected moments per parameter type
59
+ m_pos_corrected = m_positions[i] / bias_correction1
60
+ v_pos_corrected = v_positions[i] / bias_correction2
61
+ # Use the helper function for element-wise sqrt and division
62
+ denominator_pos = wp_vec3_sqrt(v_pos_corrected) + wp.vec3(epsilon, epsilon, epsilon)
63
+ positions[i] = positions[i] - lr_pos * wp_vec3_div_element(m_pos_corrected, denominator_pos)
64
+
65
+ # Update scales (with some constraints to keep them positive)
66
+ m_scales[i] = beta1 * m_scales[i] + (1.0 - beta1) * scale_grads[i]
67
+ # Use the helper function for element-wise multiplication
68
+ v_scales[i] = beta2 * v_scales[i] + (1.0 - beta2) * wp_vec3_mul_element(scale_grads[i], scale_grads[i])
69
+ # Use distinct names for corrected moments per parameter type
70
+ m_scale_corrected = m_scales[i] / bias_correction1
71
+ v_scale_corrected = v_scales[i] / bias_correction2
72
+ # Use the helper function for element-wise sqrt and division
73
+ denominator_scale = wp_vec3_sqrt(v_scale_corrected) + wp.vec3(epsilon, epsilon, epsilon)
74
+ scale_update = lr_scale * wp_vec3_div_element(m_scale_corrected, denominator_scale)
75
+ scales[i] = wp.vec3(
76
+ wp.max(scales[i][0] - scale_update[0], 0.001),
77
+ wp.max(scales[i][1] - scale_update[1], 0.001),
78
+ wp.max(scales[i][2] - scale_update[2], 0.001)
79
+ )
80
+
81
+ # Update rotations
82
+ m_rotations[i] = beta1 * m_rotations[i] + (1.0 - beta1) * rot_grads[i]
83
+ # Element-wise multiplication for quaternions
84
+ v_rotations[i] = beta2 * v_rotations[i] + (1.0 - beta2) * wp.vec4(
85
+ rot_grads[i][0] * rot_grads[i][0],
86
+ rot_grads[i][1] * rot_grads[i][1],
87
+ rot_grads[i][2] * rot_grads[i][2],
88
+ rot_grads[i][3] * rot_grads[i][3]
89
+ )
90
+ m_rot_corrected = m_rotations[i] / bias_correction1
91
+ v_rot_corrected = v_rotations[i] / bias_correction2
92
+ # Element-wise sqrt and division for quaternions
93
+ denominator_rot = wp.vec4(
94
+ wp.sqrt(v_rot_corrected[0]) + epsilon,
95
+ wp.sqrt(v_rot_corrected[1]) + epsilon,
96
+ wp.sqrt(v_rot_corrected[2]) + epsilon,
97
+ wp.sqrt(v_rot_corrected[3]) + epsilon
98
+ )
99
+ rot_update = wp.vec4(
100
+ lr_rot * m_rot_corrected[0] / denominator_rot[0],
101
+ lr_rot * m_rot_corrected[1] / denominator_rot[1],
102
+ lr_rot * m_rot_corrected[2] / denominator_rot[2],
103
+ lr_rot * m_rot_corrected[3] / denominator_rot[3]
104
+ )
105
+ rotations[i] = rotations[i] - rot_update
106
+
107
+ # Normalize quaternion to ensure it's a valid rotation
108
+ quat_length = wp.sqrt(rotations[i][0]*rotations[i][0] +
109
+ rotations[i][1]*rotations[i][1] +
110
+ rotations[i][2]*rotations[i][2] +
111
+ rotations[i][3]*rotations[i][3])
112
+
113
+ if quat_length > 0.0:
114
+ rotations[i] = wp.vec4(
115
+ rotations[i][0] / quat_length,
116
+ rotations[i][1] / quat_length,
117
+ rotations[i][2] / quat_length,
118
+ rotations[i][3] / quat_length
119
+ )
120
+
121
+ # Update opacity (with clamping to [0,1])
122
+ m_opacities[i] = beta1 * m_opacities[i] + (1.0 - beta1) * opacity_grads[i]
123
+ # Opacity is scalar, direct multiplication is fine
124
+ v_opacities[i] = beta2 * v_opacities[i] + (1.0 - beta2) * (opacity_grads[i] * opacity_grads[i])
125
+ # Use distinct names for corrected moments per parameter type
126
+ m_opacity_corrected = m_opacities[i] / bias_correction1
127
+ v_opacity_corrected = v_opacities[i] / bias_correction2
128
+ # Opacity is scalar, direct wp.sqrt is fine here
129
+ opacity_update = lr_opac * m_opacity_corrected / (wp.sqrt(v_opacity_corrected) + epsilon)
130
+ opacities[i] = wp.max(wp.min(opacities[i] - opacity_update, 1.0), 0.0)
131
+
132
+ # Update SH coefficients
133
+ for j in range(16):
134
+ idx = i * 16 + j
135
+ m_shs[idx] = beta1 * m_shs[idx] + (1.0 - beta1) * sh_grads[idx]
136
+ # Use the helper function for element-wise multiplication
137
+ v_shs[idx] = beta2 * v_shs[idx] + (1.0 - beta2) * wp_vec3_mul_element(sh_grads[idx], sh_grads[idx])
138
+ # Use distinct names for corrected moments per parameter type
139
+ m_sh_corrected = m_shs[idx] / bias_correction1
140
+ v_sh_corrected = v_shs[idx] / bias_correction2
141
+ # Use the helper function for element-wise sqrt and division
142
+ denominator_sh = wp_vec3_sqrt(v_sh_corrected) + wp.vec3(epsilon, epsilon, epsilon)
143
+ shs[idx] = shs[idx] - lr_sh * wp_vec3_div_element(m_sh_corrected, denominator_sh)
144
+
145
+
146
@wp.kernel
def reset_opacities(
    opacities: wp.array(dtype=float),
    max_opacity: float,
    num_points: int
):
    """Reset opacities to prevent oversaturation.

    Writes ``max_opacity`` into every entry of ``opacities``.
    NOTE(review): the parameter is named ``max_opacity`` while the inline
    comment calls it "a small value" — confirm callers pass the intended
    reset constant (the reference 3DGS resets to min(opacity, small_value)).
    """
    i = wp.tid()
    # Guard against launch grids padded beyond the point count.
    if i >= num_points:
        return

    # Reset opacity to a small value
    opacities[i] = max_opacity
159
+
160
@wp.kernel
def reset_densification_stats(
    xyz_gradient_accum: wp.array(dtype=float),
    denom: wp.array(dtype=float),
    max_radii2D: wp.array(dtype=float),
    num_points: int
):
    """Reset densification statistics after parameter count changes.

    Zeroes, per Gaussian: the accumulated position-gradient magnitude,
    its accumulation count, and the largest observed 2D radius.
    """
    i = wp.tid()
    # Guard against launch grids padded beyond the point count.
    if i >= num_points:
        return

    xyz_gradient_accum[i] = 0.0
    denom[i] = 0.0
    max_radii2D[i] = 0.0
175
+
176
+
177
@wp.kernel
def mark_split_candidates(
    grads: wp.array(dtype=float),
    scales: wp.array(dtype=wp.vec3),
    grad_threshold: float,
    scene_extent: float,
    percent_dense: float,
    split_mask: wp.array(dtype=int),
    num_points: int
):
    """Mark large Gaussians with high gradients for splitting.

    Writes 1 into ``split_mask[i]`` when Gaussian i has both a gradient
    magnitude >= ``grad_threshold`` and a maximum axis scale strictly
    greater than ``percent_dense * scene_extent``; writes 0 otherwise.
    Every in-range entry of the mask is always written.
    """
    i = wp.tid()
    if i >= num_points:
        return

    # Check if gradient exceeds threshold
    high_grad = grads[i] >= grad_threshold

    # Check if Gaussian is large (max scale > threshold)
    max_scale = wp.max(wp.max(scales[i][0], scales[i][1]), scales[i][2])
    scale_threshold = percent_dense * scene_extent
    large_gaussian = max_scale > scale_threshold

    # Mark for splitting if both conditions are met
    if (high_grad and large_gaussian):
        split_mask[i] = 1
    else:
        split_mask[i] = 0
205
+
206
@wp.kernel
def mark_clone_candidates(
    grads: wp.array(dtype=float),
    scales: wp.array(dtype=wp.vec3),
    grad_threshold: float,
    scene_extent: float,
    percent_dense: float,
    clone_mask: wp.array(dtype=int),
    num_points: int
):
    """Mark small Gaussians with high gradients for cloning.

    Complement of ``mark_split_candidates``: writes 1 into
    ``clone_mask[i]`` when the gradient magnitude is >= ``grad_threshold``
    AND the maximum axis scale is <= ``percent_dense * scene_extent``
    (small Gaussian); writes 0 otherwise. Together the two kernels
    partition high-gradient Gaussians by size.
    """
    i = wp.tid()
    if i >= num_points:
        return

    # Check if gradient exceeds threshold
    high_grad = grads[i] >= grad_threshold

    # Check if Gaussian is small (max scale <= threshold)
    max_scale = wp.max(wp.max(scales[i][0], scales[i][1]), scales[i][2])
    scale_threshold = percent_dense * scene_extent
    small_gaussian = max_scale <= scale_threshold

    # Mark for cloning if both conditions are met
    if (high_grad and small_gaussian):
        clone_mask[i] = 1
    else:
        clone_mask[i] = 0
234
+
235
@wp.kernel
def split_gaussians(
    split_mask: wp.array(dtype=int),
    prefix_sum: wp.array(dtype=int),
    positions: wp.array(dtype=wp.vec3),
    scales: wp.array(dtype=wp.vec3),
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    shs: wp.array(dtype=wp.vec3),
    N_split: int,
    scale_factor: float,
    offset: int,
    out_positions: wp.array(dtype=wp.vec3),
    out_scales: wp.array(dtype=wp.vec3),
    out_rotations: wp.array(dtype=wp.vec4),
    out_opacities: wp.array(dtype=float),
    out_shs: wp.array(dtype=wp.vec3)
):
    """Split large Gaussians into multiple smaller ones.

    Thread i first copies Gaussian i unchanged into the output arrays,
    then, when ``split_mask[i]`` is set, writes ``N_split`` shrunken,
    position-jittered copies starting at output index
    ``offset + prefix_sum[i] * N_split``. ``prefix_sum`` is the exclusive
    scan of ``split_mask`` (number of marked Gaussians before i), and
    ``offset`` is the original point count.

    NOTE(review): ``len()`` on a wp.array inside a kernel — confirm the
    Warp version in use supports it (``arr.shape[0]`` is the documented
    form).
    """
    i = wp.tid()

    # Copy original Gaussians first
    if i < len(positions):
        out_positions[i] = positions[i]
        out_scales[i] = scales[i]
        out_rotations[i] = rotations[i]
        out_opacities[i] = opacities[i]

        # Copy SH coefficients (16 vec3 coefficients per Gaussian, flattened)
        for j in range(16):
            out_shs[i * 16 + j] = shs[i * 16 + j]

    # Handle splits
    if i >= len(positions):
        return

    if split_mask[i] == 1:
        # Find where to write new Gaussians
        split_idx = prefix_sum[i]

        # Create N_split new Gaussians
        for j in range(N_split):
            new_idx = offset + split_idx * N_split + j
            if new_idx < len(out_positions):
                # Scale down the original Gaussian
                scaled_scales = wp.vec3(
                    scales[i][0] * scale_factor,
                    scales[i][1] * scale_factor,
                    scales[i][2] * scale_factor
                )

                # Add small random offset for position
                # (seeded by output index -> deterministic per slot)
                random_offset = wp.vec3(
                    ((wp.randf(wp.uint32(new_idx * 3))) * 2.0 - 1.0) * 0.01,
                    ((wp.randf(wp.uint32(new_idx * 3 + 1))) * 2.0 - 1.0) * 0.01,
                    ((wp.randf(wp.uint32(new_idx * 3 + 2))) * 2.0 - 1.0) * 0.01
                )

                out_positions[new_idx] = positions[i] + random_offset
                out_scales[new_idx] = scaled_scales
                out_rotations[new_idx] = rotations[i]
                # NOTE(review): opacity is copied unchanged; the reference
                # 3DGS split rescales opacity — confirm this is intentional.
                out_opacities[new_idx] = opacities[i]

                # Copy SH coefficients
                for k in range(16):
                    out_shs[new_idx * 16 + k] = shs[i * 16 + k]
301
+
302
+
303
@wp.kernel
def clone_gaussians(
    clone_mask: wp.array(dtype=int),
    prefix_sum: wp.array(dtype=int),
    positions: wp.array(dtype=wp.vec3),
    scales: wp.array(dtype=wp.vec3),
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    shs: wp.array(dtype=wp.vec3),  # shape: [N * 16]

    noise_scale: float,
    offset: int,  # where to start writing new points
    out_positions: wp.array(dtype=wp.vec3),
    out_scales: wp.array(dtype=wp.vec3),
    out_rotations: wp.array(dtype=wp.vec4),
    out_opacities: wp.array(dtype=float),
    out_shs: wp.array(dtype=wp.vec3),
):
    """Duplicate marked Gaussians with a small positional jitter.

    Thread i copies Gaussian i unchanged into slot i of the output arrays;
    when ``clone_mask[i]`` is set it additionally writes a noised copy at
    ``offset + prefix_sum[i]``. ``offset`` is the original point count and
    doubles as the launch bound; ``prefix_sum`` is the exclusive scan of
    ``clone_mask``.
    """
    i = wp.tid()
    # ``offset`` == original number of Gaussians, so this is the bound check.
    if i >= offset:
        return

    # Copy original to out[i]
    out_positions[i] = positions[i]
    out_scales[i] = scales[i]
    out_rotations[i] = rotations[i]
    out_opacities[i] = opacities[i]
    for j in range(16):
        out_shs[i * 16 + j] = shs[i * 16 + j]

    if clone_mask[i] == 1:
        # Destination slot for the clone, after all original Gaussians.
        base_idx = prefix_sum[i] + offset
        pos = positions[i]
        scale = scales[i]
        rot = rotations[i]
        opac = opacities[i]


        # Index-seeded jitter -> deterministic per source Gaussian.
        noise = wp.vec3(
            wp.randf(wp.uint32(i * 3)) * noise_scale,
            wp.randf(wp.uint32(i * 3 + 1)) * noise_scale,
            wp.randf(wp.uint32(i * 3 + 2)) * noise_scale
        )

        out_positions[base_idx] = pos + noise
        out_scales[base_idx] = scale
        out_rotations[base_idx] = rot
        out_opacities[base_idx] = opac

        # The clone shares the source's SH coefficients.
        for j in range(16):
            out_shs[base_idx * 16 + j] = shs[i * 16 + j]
354
+
355
@wp.kernel
def prune_gaussians(
    opacities: wp.array(dtype=float),
    opacity_threshold: float,
    valid_mask: wp.array(dtype=int),
    num_points: int
):
    """Mark Gaussians to keep (1) or prune (0) by opacity.

    A Gaussian survives only when its opacity is strictly greater than
    ``opacity_threshold``. The mask feeds the prefix scan used by
    ``compact_gaussians``.
    """
    i = wp.tid()
    if i >= num_points:
        return
    # Mark Gaussians for keeping or removal
    if opacities[i] > opacity_threshold:
        valid_mask[i] = 1
    else:
        valid_mask[i] = 0
370
+
371
@wp.kernel
def compact_gaussians(
    valid_mask: wp.array(dtype=int),
    prefix_sum: wp.array(dtype=int),
    positions: wp.array(dtype=wp.vec3),
    scales: wp.array(dtype=wp.vec3),
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    shs: wp.array(dtype=wp.vec3),  # shape: [N * 16]

    out_positions: wp.array(dtype=wp.vec3),
    out_scales: wp.array(dtype=wp.vec3),
    out_rotations: wp.array(dtype=wp.vec4),
    out_opacities: wp.array(dtype=float),
    out_shs: wp.array(dtype=wp.vec3)
):
    """Scatter surviving Gaussians into a dense, compacted layout.

    Thread i is a no-op for pruned Gaussians; otherwise it copies Gaussian
    i to output slot ``prefix_sum[i]`` (exclusive scan of ``valid_mask``,
    i.e. the number of survivors before i).

    Note: there is no explicit bound check — the kernel relies on being
    launched with dim == number of input Gaussians.
    """
    i = wp.tid()
    if valid_mask[i] == 0:
        return

    new_i = prefix_sum[i]

    out_positions[new_i] = positions[i]
    out_scales[new_i] = scales[i]
    out_rotations[new_i] = rotations[i]
    out_opacities[new_i] = opacities[i]

    # SH coefficients move with their Gaussian (16 vec3 per point).
    for j in range(16):
        out_shs[new_i * 16 + j] = shs[i * 16 + j]
gs/render.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import warp as wp
3
+ import matplotlib.pyplot as plt
4
+ import math
5
+ from forward import render_gaussians
6
+ from utils.math_utils import world_to_view, projection_matrix
7
+
8
+ # Initialize Warp
9
+ wp.init()
10
+
11
def setup_example_scene(image_width=1800, image_height=1800, fovx=45.0, fovy=45.0, znear=0.01, zfar=100.0):
    """Setup example scene with camera and Gaussians for testing and debugging.

    Returns:
        Tuple ``(pts, shs, scales, colors, rotations, opacities,
        camera_params)``: three Gaussians on a line at z = -10, hard-coded
        SH coefficients, random scales/rotations, and a dict of camera
        matrices, intrinsics and image dimensions.

    NOTE(review): fovx/fovy default to 45.0, which reads as degrees, but
    they are fed straight into math.tan below, which expects radians —
    confirm the unit convention here and in projection_matrix().
    """
    # Camera setup
    T = np.array([0, 0, 5], dtype=np.float32)
    R = np.array([[1, 0, 0], [0, 1, 0], [0, 0, -1]], dtype=np.float32)
    world_to_camera = np.eye(4, dtype=np.float32)
    world_to_camera[:3, :3] = R
    world_to_camera[:3, 3] = T
    # Transposed here: the rest of the pipeline uses the row-vector
    # (points @ matrix) convention.
    world_to_camera = world_to_camera.T

    # Compute matrices
    view_matrix = world_to_view(R=R, t=T)
    proj_matrix = projection_matrix(fovx=fovx, fovy=fovy, znear=znear, zfar=zfar).T
    full_proj_matrix = world_to_camera @ proj_matrix

    # Camera position in world space (translation row of the inverse).
    camera_center = np.linalg.inv(world_to_camera)[3, :3]

    # Compute FOV parameters
    tan_fovx = math.tan(fovx * 0.5)
    tan_fovy = math.tan(fovy * 0.5)

    focal_x = image_width / (2 * tan_fovx)
    focal_y = image_height / (2 * tan_fovy)

    camera_params = {
        'R': R,
        'T': T,
        'camera_center': camera_center,
        'view_matrix': view_matrix,
        'proj_matrix': proj_matrix,
        'world_to_camera': world_to_camera,
        'full_proj_matrix': full_proj_matrix,
        'tan_fovx': tan_fovx,
        'tan_fovy': tan_fovy,
        'focal_x': focal_x,
        'focal_y': focal_y,
        'width': image_width,
        'height': image_height
    }

    # Gaussian setup - 3 points in a line
    pts = np.array([[-5, 0, -10], [0, 2, -10], [5, 0, -10]], dtype=np.float32)
    n = len(pts)

    # Hard-coded SHs for debugging (the same 16 coefficients repeated for
    # each of the n Gaussians, reshaped to (n, 16, 3)).
    shs = np.array([[0.71734341, 0.91905449, 0.49961076],
                    [0.08068483, 0.82132256, 0.01301602],
                    [0.8335743, 0.31798138, 0.19709007],
                    [0.82589597, 0.28206231, 0.790489 ],
                    [0.24008527, 0.21312673, 0.53132892],
                    [0.19493135, 0.37989934, 0.61886235],
                    [0.98106522, 0.28960672, 0.57313965],
                    [0.92623716, 0.46034381, 0.5485369 ],
                    [0.81660616, 0.7801104, 0.27813915],
                    [0.96114063, 0.69872817, 0.68313804],
                    [0.95464185, 0.21984855, 0.92912192],
                    [0.23503135, 0.29786121, 0.24999751],
                    [0.29844887, 0.6327788, 0.05423596],
                    [0.08934335, 0.11851827, 0.04186001],
                    [0.59331831, 0.919777, 0.71364335],
                    [0.83377388, 0.40242542, 0.8792624 ]]*n).reshape(n, 16, 3)



    opacities = np.ones((n, 1), dtype=np.float32)

    # Random anisotropic scales (each axis in [0.2, 1.7))
    scales = (0.2 + 1.5 * np.random.rand(n, 3)).astype(np.float32)

    # Random rotations as unit quaternions
    q = np.random.randn(n, 4).astype(np.float32)
    rotations = q / np.linalg.norm(q, axis=1, keepdims=True)

    colors = np.ones((n, 3), dtype=np.float32)

    return pts, shs, scales, colors, rotations, opacities, camera_params
87
+
88
if __name__ == "__main__":
    # --- Rendering configuration ---------------------------------------
    image_width = 1800
    image_height = 1800
    background = np.array([0.0, 0.0, 0.0], dtype=np.float32)  # Black background
    scale_modifier = 1.0
    sh_degree = 3
    prefiltered = False
    antialiasing = False
    clamped = True

    # --- Build the synthetic debug scene -------------------------------
    pts, shs, scales, colors, rotations, opacities, camera_params = setup_example_scene(
        image_width=image_width,
        image_height=image_height,
    )
    n = len(pts)
    print(f"Created example scene with {n} Gaussians")

    # --- Rasterize the Gaussians for the configured camera -------------
    rendered_image, depth_image, _ = render_gaussians(
        background=background,
        means3D=pts,
        colors=colors,
        opacity=opacities,
        scales=scales,
        rotations=rotations,
        scale_modifier=scale_modifier,
        viewmatrix=camera_params['view_matrix'],
        projmatrix=camera_params['full_proj_matrix'],
        tan_fovx=camera_params['tan_fovx'],
        tan_fovy=camera_params['tan_fovy'],
        image_height=image_height,
        image_width=image_width,
        sh=shs,
        degree=sh_degree,
        campos=camera_params['camera_center'],
        prefiltered=prefiltered,
        antialiasing=antialiasing,
        clamped=clamped,
        debug=False,
    )

    print("Rendering completed")

    # Bring the frame back to host memory for plotting.
    rendered_array = wp.to_torch(rendered_image).cpu().numpy()

    # Display and save the frame with matplotlib.
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(rendered_array)
    plt.axis('off')
    plt.savefig("example_render.png", bbox_inches='tight', dpi=150)
    print("Rendered image saved to example_render.png")
gs/scheduler.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
class LRScheduler:
    """Exponentially decays a learning rate from an initial to a final value."""

    def __init__(self, initial_lr, final_lr_factor=0.01):
        """
        Args:
            initial_lr: Starting learning rate
            final_lr_factor: Final LR as fraction of initial (e.g., 0.01 means final_lr = 0.01 * initial_lr)
        """
        self.initial_lr = initial_lr
        self.final_lr = initial_lr * final_lr_factor

    def get_lr(self, iteration, total_iterations):
        """Return the decayed learning rate for the given iteration."""
        # Degenerate schedules never decay.
        if total_iterations <= 1:
            return self.initial_lr

        # Fractional progress through training, clamped to [0, 1].
        t = min(iteration / (total_iterations - 1), 1.0)

        # Geometric interpolation: lr = initial * (final / initial) ** t
        decay_ratio = self.final_lr / self.initial_lr
        return self.initial_lr * (decay_ratio ** t)
gs/train.py ADDED
@@ -0,0 +1,1044 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import warp as wp
5
+ import imageio
6
+ import json
7
+ from tqdm import tqdm
8
+ from pathlib import Path
9
+ import argparse
10
+
11
+ from forward import render_gaussians
12
+ from backward import backward
13
+ from optimizer import prune_gaussians, adam_update, clone_gaussians, compact_gaussians, mark_split_candidates, mark_clone_candidates, split_gaussians, reset_opacities, reset_densification_stats
14
+ from config import *
15
+ from utils.camera_utils import load_camera
16
+ from utils.point_cloud_utils import save_ply
17
+ from loss import l1_loss, compute_image_gradients
18
+ from scheduler import LRScheduler
19
+
20
+ # Initialize Warp
21
+ wp.init()
22
+
23
+ # Kernels for parameter updates
24
@wp.kernel
def init_gaussian_params(
    positions: wp.array(dtype=wp.vec3),
    scales: wp.array(dtype=wp.vec3),
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    shs: wp.array(dtype=wp.vec3),
    num_points: int,
    init_scale: float
):
    """Initialize all Gaussian parameters for training from scratch.

    Positions are placed uniformly in [-1.3, 1.3]^3 (index-seeded, so the
    layout is deterministic), scales are isotropic at ``init_scale``,
    rotations start at the identity quaternion, opacities at 0.1, and SH
    coefficients are zero except for a small constant DC term.
    """
    i = wp.tid()
    if i >= num_points:
        return

    # Initialize positions with random values
    # Generate random positions using warp random (seeded by point index
    # -> deterministic across runs)
    offset = wp.vec3(
        (wp.randf(wp.uint32(i * 3)) * 2.6 - 1.3),
        (wp.randf(wp.uint32(i * 3 + 1)) * 2.6 - 1.3),
        (wp.randf(wp.uint32(i * 3 + 2)) * 2.6 - 1.3)
    )
    # camera_center
    positions[i] = offset

    # Initialize scales (isotropic)
    scales[i] = wp.vec3(init_scale, init_scale, init_scale)

    # Initialize rotations to the identity quaternion (1, 0, 0, 0)
    rotations[i] = wp.vec4(1.0, 0.0, 0.0, 0.0)

    # Initialize opacities
    opacities[i] = 0.1

    # Initialize SH coefficients (just DC term for now)
    for j in range(16):  # degree=3, total 16 coefficients
        idx = i * 16 + j
        # DC term gets a small constant; all higher-order terms start at 0.
        # NOTE(review): the constant is negative (-0.007) although an older
        # comment claimed a "positive bias" — confirm the intended sign.
        if j == 0:
            shs[idx] = wp.vec3(-0.007, -0.007, -0.007)
        else:
            shs[idx] = wp.vec3(0.0, 0.0, 0.0)
65
+
66
@wp.kernel
def zero_gradients(
    pos_grad: wp.array(dtype=wp.vec3),
    scale_grad: wp.array(dtype=wp.vec3),
    rot_grad: wp.array(dtype=wp.vec4),
    opacity_grad: wp.array(dtype=float),
    sh_grad: wp.array(dtype=wp.vec3),
    num_points: int
):
    """Zero every per-Gaussian gradient buffer before a backward pass."""
    i = wp.tid()
    if i >= num_points:
        return

    pos_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    scale_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    rot_grad[i] = wp.vec4(0.0, 0.0, 0.0, 0.0)
    opacity_grad[i] = 0.0

    # Zero SH gradients (16 vec3 coefficients per Gaussian, flattened)
    for j in range(16):
        idx = i * 16 + j
        sh_grad[idx] = wp.vec3(0.0, 0.0, 0.0)
88
+
89
+
90
+
91
+ class NeRFGaussianSplattingTrainer:
92
    def __init__(self, dataset_path, output_path, config=None):
        """Initialize the 3D Gaussian Splatting trainer using pure Warp for NeRF dataset.

        Args:
            dataset_path: Directory containing transforms_{split}.json files
                and the referenced images.
            output_path: Directory for training outputs; created if missing.
            config: Optional dict of overrides merged over the
                GaussianParams defaults.
        """
        self.dataset_path = Path(dataset_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)

        # Initialize configuration from GaussianParams defaults, then apply
        # caller overrides.
        self.config = GaussianParams.get_config_dict()

        if config is not None:
            self.config.update(config)

        # Initialize learning rate scheduler (None when disabled in config)
        self.lr_scheduler = self.create_lr_scheduler()
        print(f"Learning rate scheduler: {'Enabled' if self.lr_scheduler else 'Disabled'}")

        # For tracking learning rates over training, per parameter group
        self.learning_rate_history = {
            'positions': [],
            'scales': [],
            'rotations': [],
            'shs': [],
            'opacities': []
        }
        # Load NeRF dataset (all three splits up front)
        print(f"Loading NeRF dataset from {self.dataset_path}")
        self.cameras, self.image_paths = self.load_nerf_data("train")
        self.val_cameras, self.val_image_paths = self.load_nerf_data("val")
        self.test_cameras, self.test_image_paths = self.load_nerf_data("test")
        print(f"Loaded {len(self.cameras)} train cameras and {len(self.image_paths)} train images")
        print(f"Loaded {len(self.val_cameras)} val cameras and {len(self.val_image_paths)} val images")
        print(f"Loaded {len(self.test_cameras)} test cameras and {len(self.test_image_paths)} test images")

        # Calculate scene extent for densification thresholds
        self.scene_extent = self.calculate_scene_extent()
        print(f"Calculated scene extent: {self.scene_extent}")

        # Initialize Gaussian parameters on-device
        self.num_points = self.config['num_points']
        self.params = self.initialize_parameters()

        # Create gradient arrays (same layout as the parameters)
        self.grads = self.create_gradient_arrays()

        # Create optimizer state: Adam first and second moments
        self.adam_m = self.create_gradient_arrays()  # First moment
        self.adam_v = self.create_gradient_arrays()  # Second moment

        # Initialize densification state tracking
        self.init_densification_state()

        # For tracking loss across iterations
        self.losses = []

        # Initialize intermediate buffers dictionary (filled lazily)
        self.intermediate_buffers = {}

        # Track iteration for opacity reset; -32768 is a sentinel meaning
        # "no reset has happened yet".
        self.opacity_reset_at = -32768
151
+
152
+ def create_lr_scheduler(self):
153
+ """Create simple learning rate schedulers for each parameter type."""
154
+ if not self.config['use_lr_scheduler']:
155
+ return None
156
+
157
+ config = self.config['lr_scheduler_config']
158
+ final_factor = config['final_lr_factor']
159
+
160
+ schedulers = {
161
+ 'positions': LRScheduler(config['lr_pos'], final_factor),
162
+ 'scales': LRScheduler(config['lr_scale'], final_factor),
163
+ 'rotations': LRScheduler(config['lr_rot'], final_factor),
164
+ 'shs': LRScheduler(config['lr_sh'], final_factor),
165
+ 'opacities': LRScheduler(config['lr_opac'], final_factor)
166
+ }
167
+
168
+ return schedulers
169
+
170
+ def initialize_parameters(self):
171
+ """Initialize Gaussian parameters."""
172
+ positions = wp.zeros(self.num_points, dtype=wp.vec3)
173
+ scales = wp.zeros(self.num_points, dtype=wp.vec3)
174
+ rotations = wp.zeros(self.num_points, dtype=wp.vec4)
175
+ opacities = wp.zeros(self.num_points, dtype=float)
176
+ shs = wp.zeros(self.num_points * 16, dtype=wp.vec3) # 16 coeffs per point
177
+ # Launch kernel to initialize parameters
178
+ wp.launch(
179
+ init_gaussian_params,
180
+ dim=self.num_points,
181
+ inputs=[positions, scales, rotations, opacities, shs, self.num_points, self.config['initial_scale']]
182
+ )
183
+
184
+ # Return parameters as dictionary
185
+ return {
186
+ 'positions': positions,
187
+ 'scales': scales,
188
+ 'rotations': rotations,
189
+ 'opacities': opacities,
190
+ 'shs': shs
191
+ }
192
+
193
+ def create_gradient_arrays(self):
194
+ """Create arrays for gradients or optimizer state."""
195
+ positions = wp.zeros(self.num_points, dtype=wp.vec3)
196
+ scales = wp.zeros(self.num_points, dtype=wp.vec3)
197
+ rotations = wp.zeros(self.num_points, dtype=wp.vec4)
198
+ opacities = wp.zeros(self.num_points, dtype=float)
199
+ shs = wp.zeros(self.num_points * 16, dtype=wp.vec3)
200
+
201
+ # Return a dictionary of arrays
202
+ return {
203
+ 'positions': positions,
204
+ 'scales': scales,
205
+ 'rotations': rotations,
206
+ 'opacities': opacities,
207
+ 'shs': shs
208
+ }
209
+
210
+ def calculate_scene_extent(self):
211
+ """Calculate the extent of the scene based on camera positions."""
212
+ if not self.cameras:
213
+ return 1.0 # Default fallback
214
+
215
+ # Extract camera positions
216
+ camera_positions = []
217
+ for camera in self.cameras:
218
+ camera_positions.append(camera['camera_center'])
219
+
220
+ camera_positions = np.array(camera_positions)
221
+
222
+ # Calculate the centroid of all camera positions
223
+ scene_center = np.mean(camera_positions, axis=0)
224
+
225
+ # Calculate the maximum distance from any camera to the scene center
226
+ max_distance_to_center = 0.0
227
+ for pos in camera_positions:
228
+ distance = np.linalg.norm(pos - scene_center)
229
+ max_distance_to_center = max(max_distance_to_center, distance)
230
+
231
+ # The scene extent is the radius of the bounding sphere
232
+ # Use default factor if extent is too small
233
+ extent = max_distance_to_center * self.config.get('camera_extent_factor', 1.0)
234
+ return max(extent, 1.0)
235
+
236
+ def init_densification_state(self):
237
+ """Initialize state tracking for densification."""
238
+ self.xyz_gradient_accum = wp.zeros(self.num_points, dtype=float, device=DEVICE)
239
+ self.denom = wp.zeros(self.num_points, dtype=float, device=DEVICE)
240
+ self.max_radii2D = wp.zeros(self.num_points, dtype=float, device=DEVICE)
241
+
242
+ def load_nerf_data(self, datasplit):
243
+ """Load camera parameters and images from a NeRF dataset."""
244
+ # Read transforms_train.json
245
+ transforms_path = self.dataset_path / f"transforms_{datasplit}.json"
246
+ if not transforms_path.exists():
247
+ raise FileNotFoundError(f"No transforms_train.json found in {self.dataset_path}")
248
+
249
+ with open(transforms_path, 'r') as f:
250
+ transforms = json.load(f)
251
+
252
+ # Get image dimensions from the first image if available
253
+ first_frame = transforms['frames'][0]
254
+ first_img_path = str(self.dataset_path / f"{first_frame['file_path']}.png")
255
+ if os.path.exists(first_img_path):
256
+ # Load first image to get dimensions
257
+ img = imageio.imread(first_img_path)
258
+ width = img.shape[1]
259
+ height = img.shape[0]
260
+ print(f"Using image dimensions from dataset: {width}x{height}")
261
+ else:
262
+ # Use default dimensions from config if image not found
263
+ width = self.config['width']
264
+ height = self.config['height']
265
+ print(f"Using default dimensions: {width}x{height}")
266
+
267
+ # Update config with actual dimensions
268
+ self.config['width'] = width
269
+ self.config['height'] = height
270
+
271
+ self.config['camera_angle_x'] = transforms['camera_angle_x']
272
+
273
+ # Calculate focal length
274
+ focal = 0.5 * width / np.tan(0.5 * self.config['camera_angle_x'])
275
+
276
+ cameras = []
277
+ image_paths = []
278
+
279
+
280
+ # Process each frame
281
+ for i, frame in enumerate(transforms['frames']):
282
+ camera_info = {
283
+ "camera_id": i,
284
+ "camera_to_world": frame['transform_matrix'],
285
+ "width": width,
286
+ "height": height,
287
+ "focal": focal,
288
+ }
289
+
290
+ # Load camera parameters using existing function
291
+ camera_params = load_camera(camera_info)
292
+
293
+
294
+ if camera_params is not None:
295
+ cameras.append(camera_params)
296
+ image_paths.append(str(self.dataset_path / f"{frame['file_path']}.png"))
297
+
298
+ return cameras, image_paths
299
+
300
+ def load_image(self, path):
301
+ """Load an image as a numpy array."""
302
+ if os.path.exists(path):
303
+ img = imageio.imread(path)
304
+ # Convert to float and normalize to [0, 1]
305
+ img_np = img.astype(np.float32) / 255.0
306
+ # Ensure image is RGB (discard alpha channel if present)
307
+ if img_np.shape[2] == 4:
308
+ img_np = img_np[:, :, :3] # Keep only R, G, B channels
309
+ return img_np
310
+ else:
311
+ raise FileNotFoundError(f"Image not found: {path}")
312
+
313
+ def zero_grad(self):
314
+ """Zero out all gradients."""
315
+ wp.launch(
316
+ zero_gradients,
317
+ dim=self.num_points,
318
+ inputs=[
319
+ self.grads['positions'],
320
+ self.grads['scales'],
321
+ self.grads['rotations'],
322
+ self.grads['opacities'],
323
+ self.grads['shs'],
324
+ self.num_points
325
+ ]
326
+ )
327
+
328
+ def densification_and_pruning(self, iteration):
329
+ """Perform sophisticated densification and pruning of Gaussians."""
330
+
331
+ # Check if we should do densification
332
+ densify_from_iter = self.config.get('densify_from_iter', 500)
333
+ densify_until_iter = self.config.get('densify_until_iter', 15000)
334
+ densification_interval = self.config.get('densification_interval', 100)
335
+ opacity_reset_interval = self.config.get('opacity_reset_interval', 3000)
336
+
337
+ # Skip densification if outside iteration range
338
+ if iteration > densify_from_iter and iteration < densify_until_iter and iteration % densification_interval == 0:
339
+ print(f"Iteration {iteration}: Performing sophisticated densification and pruning")
340
+
341
+ # For simplified implementation, use position gradients as proxy for viewspace gradients
342
+ pos_grads = self.grads['positions']
343
+ avg_grads = wp.zeros(self.num_points, dtype=float, device=DEVICE)
344
+
345
+ @wp.kernel
346
+ def compute_grad_norms(pos_grad: wp.array(dtype=wp.vec3),
347
+ grad_norms: wp.array(dtype=float),
348
+ num_points: int):
349
+ i = wp.tid()
350
+ if i >= num_points:
351
+ return
352
+ grad_norms[i] = wp.length(pos_grad[i])
353
+
354
+ wp.launch(compute_grad_norms, dim=self.num_points,
355
+ inputs=[pos_grads, avg_grads, self.num_points])
356
+
357
+ # Configuration
358
+ grad_threshold = self.config.get('densify_grad_threshold', 0.0002)
359
+ percent_dense = self.config.get('percent_dense', 0.01)
360
+
361
+ # --- Step 1: Clone small Gaussians with high gradients ---
362
+ clone_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)
363
+ wp.launch(
364
+ mark_clone_candidates,
365
+ dim=self.num_points,
366
+ inputs=[
367
+ avg_grads,
368
+ self.params['scales'],
369
+ grad_threshold,
370
+ self.scene_extent,
371
+ percent_dense,
372
+ clone_mask,
373
+ self.num_points
374
+ ]
375
+ )
376
+
377
+ # Perform cloning
378
+ clone_prefix_sum = wp.zeros_like(clone_mask)
379
+ wp.utils.array_scan(clone_mask, clone_prefix_sum, inclusive=False)
380
+ total_to_clone = int(clone_prefix_sum.numpy()[-1])
381
+
382
+ if total_to_clone > 0:
383
+ print(f"[Clone] Cloning {total_to_clone} small Gaussians")
384
+ N = self.num_points
385
+ new_N = N + total_to_clone
386
+
387
+ # Allocate output arrays
388
+ out_params = {
389
+ 'positions': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
390
+ 'scales': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
391
+ 'rotations': wp.zeros(new_N, dtype=wp.vec4, device=DEVICE),
392
+ 'opacities': wp.zeros(new_N, dtype=float, device=DEVICE),
393
+ 'shs': wp.zeros(new_N * 16, dtype=wp.vec3, device=DEVICE)
394
+ }
395
+
396
+ # Clone Gaussians
397
+ wp.launch(
398
+ clone_gaussians,
399
+ dim=N,
400
+ inputs=[
401
+ clone_mask,
402
+ clone_prefix_sum,
403
+ self.params['positions'],
404
+ self.params['scales'],
405
+ self.params['rotations'],
406
+ self.params['opacities'],
407
+ self.params['shs'],
408
+ 0.01, # noise_scale
409
+ N, # offset
410
+ out_params['positions'],
411
+ out_params['scales'],
412
+ out_params['rotations'],
413
+ out_params['opacities'],
414
+ out_params['shs']
415
+ ]
416
+ )
417
+
418
+ # Update parameters and state
419
+ self.params = out_params
420
+ self.num_points = new_N
421
+ self.grads = self.create_gradient_arrays()
422
+ self.adam_m = self.create_gradient_arrays()
423
+ self.adam_v = self.create_gradient_arrays()
424
+
425
+ # --- Step 2: Split large Gaussians with high gradients ---
426
+ split_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)
427
+ wp.launch(
428
+ mark_split_candidates,
429
+ dim=self.num_points,
430
+ inputs=[
431
+ avg_grads,
432
+ self.params['scales'],
433
+ grad_threshold,
434
+ self.scene_extent,
435
+ percent_dense,
436
+ split_mask,
437
+ self.num_points
438
+ ]
439
+ )
440
+
441
+ # Perform splitting
442
+ split_prefix_sum = wp.zeros_like(split_mask)
443
+ wp.utils.array_scan(split_mask, split_prefix_sum, inclusive=False)
444
+ total_to_split = int(split_prefix_sum.numpy()[-1])
445
+
446
+ if total_to_split > 0:
447
+ print(f"[Split] Splitting {total_to_split} large Gaussians")
448
+ N = self.num_points
449
+ N_split = 2 # Split each Gaussian into 2
450
+ new_N = N + total_to_split * N_split
451
+
452
+ # Allocate output arrays
453
+ out_params = {
454
+ 'positions': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
455
+ 'scales': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
456
+ 'rotations': wp.zeros(new_N, dtype=wp.vec4, device=DEVICE),
457
+ 'opacities': wp.zeros(new_N, dtype=float, device=DEVICE),
458
+ 'shs': wp.zeros(new_N * 16, dtype=wp.vec3, device=DEVICE)
459
+ }
460
+
461
+ # Split Gaussians
462
+ wp.launch(
463
+ split_gaussians,
464
+ dim=N,
465
+ inputs=[
466
+ split_mask,
467
+ split_prefix_sum,
468
+ self.params['positions'],
469
+ self.params['scales'],
470
+ self.params['rotations'],
471
+ self.params['opacities'],
472
+ self.params['shs'],
473
+ N_split, # Number of splits per Gaussian
474
+ 0.8, # scale_factor
475
+ N, # offset
476
+ out_params['positions'],
477
+ out_params['scales'],
478
+ out_params['rotations'],
479
+ out_params['opacities'],
480
+ out_params['shs']
481
+ ]
482
+ )
483
+
484
+ # Update parameters and state
485
+ self.params = out_params
486
+ self.num_points = new_N
487
+ self.grads = self.create_gradient_arrays()
488
+ self.adam_m = self.create_gradient_arrays()
489
+ self.adam_v = self.create_gradient_arrays()
490
+
491
+ # Remove original split Gaussians
492
+ prune_filter = wp.zeros(self.num_points, dtype=int, device=DEVICE)
493
+
494
+ @wp.kernel
495
+ def mark_split_originals_for_removal(
496
+ split_mask: wp.array(dtype=int),
497
+ prune_filter: wp.array(dtype=int),
498
+ offset: int,
499
+ num_points: int
500
+ ):
501
+ i = wp.tid()
502
+ if i >= num_points:
503
+ return
504
+ if i < offset and split_mask[i] == 1:
505
+ prune_filter[i] = 1 # Mark for removal
506
+ else:
507
+ prune_filter[i] = 0 # Keep
508
+
509
+ wp.launch(mark_split_originals_for_removal, dim=self.num_points,
510
+ inputs=[split_mask, prune_filter, N, self.num_points])
511
+
512
+ # Invert mask to get valid mask
513
+ valid_mask = wp.zeros_like(prune_filter)
514
+
515
+ @wp.kernel
516
+ def invert_mask(prune: wp.array(dtype=int), valid: wp.array(dtype=int), n: int):
517
+ i = wp.tid()
518
+ if i >= n:
519
+ return
520
+ valid[i] = 1 - prune[i]
521
+
522
+ wp.launch(invert_mask, dim=self.num_points,
523
+ inputs=[prune_filter, valid_mask, self.num_points])
524
+
525
+ # Count valid points and compact
526
+ prefix_sum = wp.zeros_like(valid_mask)
527
+ wp.utils.array_scan(valid_mask, prefix_sum, inclusive=False)
528
+ valid_count = int(prefix_sum.numpy()[-1])
529
+
530
+ if valid_count < self.num_points:
531
+ print(f"[Split] Removing {self.num_points - valid_count} original split Gaussians")
532
+
533
+ # Allocate compacted output
534
+ compact_params = {
535
+ 'positions': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
536
+ 'scales': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
537
+ 'rotations': wp.zeros(valid_count, dtype=wp.vec4, device=DEVICE),
538
+ 'opacities': wp.zeros(valid_count, dtype=float, device=DEVICE),
539
+ 'shs': wp.zeros(valid_count * 16, dtype=wp.vec3, device=DEVICE)
540
+ }
541
+
542
+ wp.launch(
543
+ compact_gaussians,
544
+ dim=self.num_points,
545
+ inputs=[
546
+ valid_mask,
547
+ prefix_sum,
548
+ self.params['positions'],
549
+ self.params['scales'],
550
+ self.params['rotations'],
551
+ self.params['opacities'],
552
+ self.params['shs'],
553
+ compact_params['positions'],
554
+ compact_params['scales'],
555
+ compact_params['rotations'],
556
+ compact_params['opacities'],
557
+ compact_params['shs']
558
+ ]
559
+ )
560
+
561
+ # Update parameters and state
562
+ self.params = compact_params
563
+ self.num_points = valid_count
564
+ self.grads = self.create_gradient_arrays()
565
+ self.adam_m = self.create_gradient_arrays()
566
+ self.adam_v = self.create_gradient_arrays()
567
+
568
+ # --- Step 3: Enhanced Pruning ---
569
+ print(f"[Prune] Performing enhanced pruning")
570
+
571
+ valid_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)
572
+
573
+ # Use opacity-based pruning for now
574
+ wp.launch(
575
+ prune_gaussians,
576
+ dim=self.num_points,
577
+ inputs=[
578
+ self.params['opacities'],
579
+ self.config.get('cull_opacity_threshold', 0.005),
580
+ valid_mask,
581
+ self.num_points
582
+ ]
583
+ )
584
+
585
+ # Count valid points
586
+ prefix_sum = wp.zeros_like(valid_mask)
587
+ wp.utils.array_scan(valid_mask, prefix_sum, inclusive=False)
588
+ valid_count = int(prefix_sum.numpy()[-1])
589
+
590
+ # Check pruning constraints
591
+ min_valid_points = self.config.get('min_valid_points', 1000)
592
+ max_valid_points = self.config.get('max_valid_points', 1000000)
593
+ max_prune_ratio = self.config.get('max_allowed_prune_ratio', 0.5)
594
+
595
+ prune_count = self.num_points - valid_count
596
+ prune_ratio = prune_count / self.num_points if self.num_points > 0 else 0
597
+
598
+ if (valid_count >= min_valid_points and
599
+ valid_count <= max_valid_points and
600
+ prune_ratio <= max_prune_ratio and
601
+ valid_count < self.num_points):
602
+
603
+ print(f"[Prune] Compacting from {self.num_points} → {valid_count} points")
604
+
605
+ # Allocate compacted output
606
+ out_params = {
607
+ 'positions': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
608
+ 'scales': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
609
+ 'rotations': wp.zeros(valid_count, dtype=wp.vec4, device=DEVICE),
610
+ 'opacities': wp.zeros(valid_count, dtype=float, device=DEVICE),
611
+ 'shs': wp.zeros(valid_count * 16, dtype=wp.vec3, device=DEVICE)
612
+ }
613
+
614
+ wp.launch(
615
+ compact_gaussians,
616
+ dim=self.num_points,
617
+ inputs=[
618
+ valid_mask,
619
+ prefix_sum,
620
+ self.params['positions'],
621
+ self.params['scales'],
622
+ self.params['rotations'],
623
+ self.params['opacities'],
624
+ self.params['shs'],
625
+ out_params['positions'],
626
+ out_params['scales'],
627
+ out_params['rotations'],
628
+ out_params['opacities'],
629
+ out_params['shs']
630
+ ]
631
+ )
632
+
633
+ # Update parameters and state
634
+ self.params = out_params
635
+ self.num_points = valid_count
636
+ self.grads = self.create_gradient_arrays()
637
+ self.adam_m = self.create_gradient_arrays()
638
+ self.adam_v = self.create_gradient_arrays()
639
+ else:
640
+ print(f"[Prune] Skipping pruning: valid={valid_count}, ratio={prune_ratio:.3f}")
641
+
642
+
643
+ # Opacity reset - updated logic to match reference implementation
644
+ background_is_white = all(c == 1.0 for c in self.config['background_color'])
645
+ should_reset_opacity = (
646
+ iteration % opacity_reset_interval == 0 or
647
+ (background_is_white and iteration == densify_from_iter)
648
+ )
649
+
650
+ if should_reset_opacity:
651
+ print(f"Iteration {iteration}: Resetting opacities")
652
+ wp.launch(
653
+ reset_opacities,
654
+ dim=self.num_points,
655
+ inputs=[
656
+ self.params['opacities'],
657
+ 0.01, # max_opacity
658
+ self.num_points
659
+ ]
660
+ )
661
+
662
+
663
    def optimizer_step(self, iteration):
        """Perform one Adam optimization step over all Gaussian parameters.

        Resolves per-parameter-group learning rates (scheduled if a scheduler
        exists, otherwise static values from the config), records them for
        later inspection, then launches a single fused `adam_update` kernel
        that updates positions, scales, rotations, opacities and SH
        coefficients in place.

        Args:
            iteration: Current training iteration; also forwarded to the
                kernel for Adam bias correction.
        """

        # Get learning rates from scheduler or use config defaults
        if self.lr_scheduler:
            lr_pos = self.lr_scheduler['positions'].get_lr(iteration, self.config['num_iterations'])
            lr_scale = self.lr_scheduler['scales'].get_lr(iteration, self.config['num_iterations'])
            lr_rot = self.lr_scheduler['rotations'].get_lr(iteration, self.config['num_iterations'])
            lr_sh = self.lr_scheduler['shs'].get_lr(iteration, self.config['num_iterations'])
            lr_opac = self.lr_scheduler['opacities'].get_lr(iteration, self.config['num_iterations'])

            # Track learning rate history (only recorded on the scheduled path)
            self.learning_rate_history['positions'].append(lr_pos)
            self.learning_rate_history['scales'].append(lr_scale)
            self.learning_rate_history['rotations'].append(lr_rot)
            self.learning_rate_history['shs'].append(lr_sh)
            self.learning_rate_history['opacities'].append(lr_opac)

            # Log learning rates occasionally
            if iteration % 1000 == 0:
                print(f"Iteration {iteration} learning rates:")
                print(f"  positions: {lr_pos:.6f}")
                print(f"  scales: {lr_scale:.6f}")
                print(f"  rotations: {lr_rot:.6f}")
                print(f"  shs: {lr_sh:.6f}")
                print(f"  opacities: {lr_opac:.6f}")
        else:
            # Use static learning rates from config
            lr_pos = self.config['lr_pos']
            lr_scale = self.config['lr_scale']
            lr_rot = self.config['lr_rot']
            lr_sh = self.config['lr_sh']
            lr_opac = self.config['lr_opac']

        # NOTE: the input list below is positional — its order must match the
        # `adam_update` kernel signature exactly.
        wp.launch(
            adam_update,
            dim=self.num_points,
            inputs=[
                # Parameters
                self.params['positions'],
                self.params['scales'],
                self.params['rotations'],
                self.params['opacities'],
                self.params['shs'],

                # Gradients
                self.grads['positions'],
                self.grads['scales'],
                self.grads['rotations'],
                self.grads['opacities'],
                self.grads['shs'],

                # First moments (m)
                self.adam_m['positions'],
                self.adam_m['scales'],
                self.adam_m['rotations'],
                self.adam_m['opacities'],
                self.adam_m['shs'],

                # Second moments (v)
                self.adam_v['positions'],
                self.adam_v['scales'],
                self.adam_v['rotations'],
                self.adam_v['opacities'],
                self.adam_v['shs'],

                # Optimizer parameters with dynamic learning rates
                self.num_points,
                lr_pos,     # Dynamic learning rate for positions
                lr_scale,   # Dynamic learning rate for scales
                lr_rot,     # Dynamic learning rate for rotations
                lr_sh,      # Dynamic learning rate for SH coefficients
                lr_opac,    # Dynamic learning rate for opacities
                self.config['adam_beta1'],
                self.config['adam_beta2'],
                self.config['adam_epsilon'],
                iteration
            ]
        )
742
+
743
    def save_checkpoint(self, iteration):
        """Save the current point cloud and training state.

        Writes, under the output directory:
        - `point_cloud/iteration_{iteration}/point_cloud.ply` — current Gaussians
        - `loss.txt` and `loss_plot.png` — full loss history so far
        - `point_cloud/iteration_{iteration}/rendered_view.png` — render from
          camera 0 for visual progress checks

        Args:
            iteration: Training iteration used to name the checkpoint folder.
        """
        checkpoint_dir = self.output_path / "point_cloud" / f"iteration_{iteration}"
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Save point cloud as PLY
        ply_path = checkpoint_dir / "point_cloud.ply"
        save_ply(self.params, ply_path, self.num_points)

        # Save loss history (one value per line, overwritten each checkpoint)
        loss_path = self.output_path / "loss.txt"
        with open(loss_path, 'w') as f:
            for loss in self.losses:
                f.write(f"{loss}\n")

        # Save loss plot
        plt.figure(figsize=(10, 5))
        plt.plot(self.losses)
        plt.title('Training Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.savefig(self.output_path / "loss_plot.png")
        plt.close()

        # Save a rendered view (antialiasing enabled here, unlike the training
        # renders in train(), which use antialiasing=False)
        camera_idx = 0  # Front view
        rendered_image, _, _ = render_gaussians(
            background=np.array(self.config['background_color'], dtype=np.float32),
            means3D=self.params['positions'].numpy(),
            colors=None,  # Use SH coefficients instead
            opacity=self.params['opacities'].numpy(),
            scales=self.params['scales'].numpy(),
            rotations=self.params['rotations'].numpy(),
            scale_modifier=self.config['scale_modifier'],
            viewmatrix=self.cameras[camera_idx]['world_to_camera'],
            projmatrix=self.cameras[camera_idx]['full_proj_matrix'],
            tan_fovx=self.cameras[camera_idx]['tan_fovx'],
            tan_fovy=self.cameras[camera_idx]['tan_fovy'],
            image_height=self.cameras[camera_idx]['height'],
            image_width=self.cameras[camera_idx]['width'],
            sh=self.params['shs'].numpy(),  # Pass SH coefficients
            degree=self.config['sh_degree'],
            campos=self.cameras[camera_idx]['camera_center'],
            prefiltered=False,
            antialiasing=True,
            clamped=True
        )
        # Save rendered view as image
        rendered_array = wp.to_torch(rendered_image).cpu().numpy()
        # Handle case where rendered_array has shape (3, H, W) - transpose to (H, W, 3)
        if rendered_array.shape[0] == 3 and len(rendered_array.shape) == 3:
            rendered_array = np.transpose(rendered_array, (1, 2, 0))
        img8 = (np.clip(rendered_array, 0, 1) * 255).astype(np.uint8)
        imageio.imwrite(checkpoint_dir / "rendered_view.png", img8)
797
+
798
+
799
    def debug_log_and_save_images(
        self,
        rendered_image,   # np.float32 H×W×3 (range 0-1); NOTE(review): train() passes (3, H, W) — save_rgb handles both layouts
        target_image,     # np.float32
        depth_image,      # wp.array2d(float) – optional but unused here
        camera_idx: int,
        it: int
    ):
        """Print per-iteration rasterizer diagnostics and dump debug images.

        Reads `self.intermediate_buffers` (populated by the most recent
        forward pass) to report duplicate counts, radii, opacity statistics
        and visible-Gaussian counts, then writes render/target PNGs plus a
        projected-point scatter and a depth histogram to the output directory.
        """

        # ------ quick numeric read-out -----------------------------------
        radii = wp.to_torch(self.intermediate_buffers["radii"]).cpu().numpy()
        # conic_opacity stores (conic_a, conic_b, conic_c, alpha); take alpha.
        alphas = wp.to_torch(self.intermediate_buffers["conic_opacity"]).cpu().numpy()[:, 3]
        offs = wp.to_torch(self.intermediate_buffers["point_offsets"]).cpu().numpy()
        num_dup = int(offs[-1]) if len(offs) else 0
        r_med = np.median(radii[radii > 0]) if (radii > 0).any() else 0

        # Count visible Gaussians
        xy_image = wp.to_torch(self.intermediate_buffers["points_xy_image"]).cpu().numpy()
        W = self.cameras[camera_idx]['width']
        H = self.cameras[camera_idx]['height']
        visible_gaussians = np.sum(
            (xy_image[:, 0] >= 0) & (xy_image[:, 0] < W) &
            (xy_image[:, 1] >= 0) & (xy_image[:, 1] < H) &
            np.isfinite(xy_image).all(axis=1) &
            (radii > 0)  # Only count Gaussians with positive radius
        )

        print(
            f"[it {it:05d}]  dup={num_dup:<6}  "
            f"r_med={r_med:5.1f}  α∈[{alphas.min():.3f},"
            f"{np.median(alphas):.3f},{alphas.max():.3f}]  "
            f"visible={visible_gaussians}/{len(xy_image)}"
        )

        # ------ save render / target PNG ---------------------------------
        def save_rgb(arr_f32, stem):
            # Handle case where arr_f32 has shape (3, H, W) - transpose to (H, W, 3)
            if arr_f32.shape[0] == 3 and len(arr_f32.shape) == 3:
                arr_f32 = np.transpose(arr_f32, (1, 2, 0))
            img8 = (np.clip(arr_f32, 0, 1) * 255).astype(np.uint8)
            imageio.imwrite(self.output_path / f"{stem}_{it:06d}.png", img8)

        save_rgb(rendered_image if isinstance(rendered_image, np.ndarray) else wp.to_torch(rendered_image).cpu().numpy(), "render")
        save_rgb(target_image, "target")

        # ------ make 2-D projection scatter ------------------------------
        xy = wp.to_torch(self.intermediate_buffers["points_xy_image"]).cpu().numpy()
        depth = wp.to_torch(self.intermediate_buffers["depths"]).cpu().numpy()
        H, W = self.config["height"], self.config["width"]

        mask = (
            (xy[:, 0] >= 0) & (xy[:, 0] < W) &
            (xy[:, 1] >= 0) & (xy[:, 1] < H) &
            np.isfinite(xy).all(axis=1) &
            (radii > 0)  # Only include Gaussians with positive radius
        )
        if mask.any():
            plt.figure(figsize=(6, 6))
            plt.scatter(xy[mask, 0], xy[mask, 1],
                        s=4, c=depth[mask], cmap="turbo", alpha=.7)
            plt.gca().invert_yaxis()
            plt.xlim(0, W); plt.ylim(H, 0)
            plt.title(f"Projected Gaussians (iter {it}): {np.sum(mask)}/{len(xy)} visible")
            plt.colorbar(label="depth(z)")
            plt.tight_layout()
            plt.savefig(self.output_path / f"proj_{it:06d}.png", dpi=250)
            plt.close()

            # depth histogram
            plt.figure(figsize=(5, 3))
            plt.hist(depth[mask], bins=40, color="steelblue")
            plt.xlabel("depth (camera-z)")
            plt.ylabel("count")
            plt.title(f"Depth hist – {mask.sum()} pts")
            plt.tight_layout()
            plt.savefig(self.output_path / f"depth_hist_{it:06d}.png", dpi=250)
            plt.close()
876
+
877
    def train(self):
        """Train the 3D Gaussian Splatting model.

        Main optimization loop. Per iteration: pick a random training camera,
        render the current Gaussians, compute an L1 image loss, run the
        custom backward pass to fill gradient buffers, apply an Adam step,
        then run densification/pruning and periodic checkpointing.
        """
        num_iterations = self.config['num_iterations']

        # Main training loop
        with tqdm(total=num_iterations) as pbar:
            for iteration in range(num_iterations):
                # Select a random camera and corresponding image
                camera_idx = np.random.randint(0, len(self.cameras))
                image_path = self.image_paths[camera_idx]
                target_image = self.load_image(image_path)

                # Zero gradients
                self.zero_grad()
                # Render the view; intermediate_buffers are kept for the
                # backward pass and for debug logging below.
                rendered_image, depth_image, self.intermediate_buffers = render_gaussians(
                    background=np.array(self.config['background_color'], dtype=np.float32),
                    means3D=self.params['positions'].numpy(),
                    colors=None,  # Use SH coefficients instead
                    opacity=self.params['opacities'].numpy(),
                    scales=self.params['scales'].numpy(),
                    rotations=self.params['rotations'].numpy(),
                    scale_modifier=self.config['scale_modifier'],
                    viewmatrix=self.cameras[camera_idx]['world_to_camera'],
                    projmatrix=self.cameras[camera_idx]['full_proj_matrix'],
                    tan_fovx=self.cameras[camera_idx]['tan_fovx'],
                    tan_fovy=self.cameras[camera_idx]['tan_fovy'],
                    image_height=self.cameras[camera_idx]['height'],
                    image_width=self.cameras[camera_idx]['width'],
                    sh=self.params['shs'].numpy(),  # Pass SH coefficients
                    degree=self.config['sh_degree'],
                    campos=self.cameras[camera_idx]['camera_center'],
                    prefiltered=False,
                    antialiasing=False,
                    clamped=True
                )

                radii = wp.to_torch(self.intermediate_buffers["radii"]).cpu().numpy()
                np_rendered_image = wp.to_torch(rendered_image).cpu().numpy()
                # (H, W, 3) -> (3, H, W) for the debug image dump
                np_rendered_image = np_rendered_image.transpose(2, 0, 1)

                if iteration % self.config['save_interval'] == 0:
                    self.debug_log_and_save_images(np_rendered_image, target_image, depth_image, camera_idx, iteration)

                # Calculate L1 loss
                l1_val = l1_loss(rendered_image, target_image)

                # # Calculate SSIM, not used
                # ssim_val = ssim(rendered_image, target_image)
                # # Combined loss with weighted SSIM
                # lambda_dssim = self.config['lambda_dssim']
                # # loss = (1 - λ) * L1 + λ * (1 - SSIM)
                # loss = (1.0 - lambda_dssim) * l1_val + lambda_dssim * (1.0 - ssim_val)

                loss = l1_val
                self.losses.append(loss)
                # Compute pixel gradients for image loss (dL/dColor)
                pixel_grad_buffer = compute_image_gradients(
                    rendered_image, target_image, lambda_dssim=0
                )

                # Prepare camera parameters
                camera = self.cameras[camera_idx]
                view_matrix = wp.mat44(camera['world_to_camera'].flatten())
                proj_matrix = wp.mat44(camera['full_proj_matrix'].flatten())
                campos = wp.vec3(camera['camera_center'][0], camera['camera_center'][1], camera['camera_center'][2])

                # Create appropriate buffer dictionaries for the backward pass
                geom_buffer = {
                    'radii': self.intermediate_buffers['radii'],
                    'means2D': self.intermediate_buffers['points_xy_image'],
                    'conic_opacity': self.intermediate_buffers['conic_opacity'],
                    'rgb': self.intermediate_buffers['colors'],
                    'clamped': self.intermediate_buffers['clamped_state']
                }

                binning_buffer = {
                    'point_list': self.intermediate_buffers['point_list']
                }

                img_buffer = {
                    'ranges': self.intermediate_buffers['ranges'],
                    'final_Ts': self.intermediate_buffers['final_Ts'],
                    'n_contrib': self.intermediate_buffers['n_contrib']
                }

                gradients = backward(
                    # Core parameters
                    background=np.array(self.config['background_color'], dtype=np.float32),
                    means3D=self.params['positions'],
                    dL_dpixels=pixel_grad_buffer,

                    # Model parameters (pass directly from self.params)
                    opacity=self.params['opacities'],
                    shs=self.params['shs'],
                    scales=self.params['scales'],
                    rotations=self.params['rotations'],
                    scale_modifier=self.config['scale_modifier'],

                    # Camera parameters
                    viewmatrix=view_matrix,
                    projmatrix=proj_matrix,
                    tan_fovx=camera['tan_fovx'],
                    tan_fovy=camera['tan_fovy'],
                    image_height=camera['height'],
                    image_width=camera['width'],
                    campos=campos,

                    # Forward output buffers
                    radii=self.intermediate_buffers['radii'],
                    means2D=self.intermediate_buffers['points_xy_image'],
                    conic_opacity=self.intermediate_buffers['conic_opacity'],
                    rgb=self.intermediate_buffers['colors'],
                    cov3Ds=self.intermediate_buffers['cov3Ds'],
                    clamped=self.intermediate_buffers['clamped_state'],

                    # Internal state buffers
                    geom_buffer=geom_buffer,
                    binning_buffer=binning_buffer,
                    img_buffer=img_buffer,

                    # Algorithm parameters
                    degree=self.config['sh_degree'],
                    debug=False
                )

                # 3. Copy gradients from backward result to the optimizer's gradient buffers
                wp.copy(self.grads['positions'], gradients['dL_dmean3D'])
                wp.copy(self.grads['scales'], gradients['dL_dscale'])
                wp.copy(self.grads['rotations'], gradients['dL_drot'])
                wp.copy(self.grads['opacities'], gradients['dL_dopacity'])
                wp.copy(self.grads['shs'], gradients['dL_dshs'])

                # Update parameters
                self.optimizer_step(iteration)

                # Update progress bar
                pbar.update(1)
                pbar.set_description(f"Loss: {loss:.6f}")

                # Densify/prune AFTER the optimizer step, using this
                # iteration's gradients as the densification signal.
                self.densification_and_pruning(iteration)

                # Save checkpoint
                if iteration % self.config['save_interval'] == 0 or iteration == num_iterations - 1:
                    self.save_checkpoint(iteration)

        print("Training complete!")
1024
+
1025
+
1026
def main():
    """Command-line entry point: parse arguments and run training."""
    arg_parser = argparse.ArgumentParser(
        description="Train 3D Gaussian Splatting model with NeRF dataset"
    )
    arg_parser.add_argument(
        "--dataset",
        type=str,
        default="./data/nerf_synthetic/lego",
        help="Path to NeRF dataset directory (default: Lego dataset)",
    )
    arg_parser.add_argument(
        "--output",
        type=str,
        default="./output",
        help="Output directory",
    )
    opts = arg_parser.parse_args()

    # Build the trainer and run the full optimization loop.
    trainer = NeRFGaussianSplattingTrainer(
        dataset_path=opts.dataset,
        output_path=opts.output,
    )
    trainer.train()
1041
+
1042
+
1043
+ if __name__ == "__main__":
1044
+ main()
gs/train_colmap.py ADDED
@@ -0,0 +1,1586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import warp as wp
5
+ import imageio
6
+ import json
7
+ from tqdm import tqdm
8
+ from pathlib import Path
9
+ import argparse
10
+
11
+ from forward import render_gaussians
12
+ from backward import backward
13
+ from optimizer import prune_gaussians, adam_update, clone_gaussians, compact_gaussians, mark_split_candidates, mark_clone_candidates, split_gaussians, reset_opacities, reset_densification_stats
14
+ from config import *
15
+ from utils.camera_utils import load_camera, load_camera_colmap
16
+ from utils.point_cloud_utils import save_ply
17
+ from loss import l1_loss, compute_image_gradients
18
+ from scheduler import LRScheduler
19
+ from utils.math_utils import quaternion_to_rotation_matrix
20
+ from plyfile import PlyData, PlyElement
21
+ from scipy.spatial import cKDTree # Add this import
22
+ # Initialize Warp
23
+ wp.init()
24
+
25
+ # Kernels for parameter updates
26
# One-time initialization of per-Gaussian rotation and opacity defaults.
# Positions, scales and SH coefficients are intentionally NOT set here: they
# are pre-filled on the host (e.g. from the COLMAP points3D.ply point cloud),
# so this kernel only initializes the remaining parameters.
@wp.kernel
def init_gaussian_params(
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    num_points: int
):
    i = wp.tid()
    # Guard against launch dims exceeding the logical point count.
    if i >= num_points:
        return

    # Identity quaternion (w=1, x=y=z=0): no initial rotation.
    rotations[i] = wp.vec4(1.0, 0.0, 0.0, 0.0)

    # Low starting opacity; optimization/densification raises it as needed.
    opacities[i] = 0.1
67
+
68
# Reset all per-Gaussian gradient buffers to zero before a backward pass.
# SH gradients use a flattened layout: 16 vec3 coefficients per Gaussian.
@wp.kernel
def zero_gradients(
    pos_grad: wp.array(dtype=wp.vec3),
    scale_grad: wp.array(dtype=wp.vec3),
    rot_grad: wp.array(dtype=wp.vec4),
    opacity_grad: wp.array(dtype=float),
    sh_grad: wp.array(dtype=wp.vec3),
    num_points: int
):
    i = wp.tid()
    if i >= num_points:
        return

    pos_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    scale_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    rot_grad[i] = wp.vec4(0.0, 0.0, 0.0, 0.0)
    opacity_grad[i] = 0.0

    # Zero SH gradients (stride 16 per Gaussian in the flat sh_grad array)
    for j in range(16):
        idx = i * 16 + j
        sh_grad[idx] = wp.vec3(0.0, 0.0, 0.0)
90
+
91
+
92
+
93
+ class NeRFGaussianSplattingTrainer:
94
    def __init__(self, dataset_path, output_path, config=None):
        """Initialize the 3D Gaussian Splatting trainer using pure Warp for NeRF dataset.

        Args:
            dataset_path: Root of the COLMAP dataset (expects sparse/0/ with
                reconstruction files).
            output_path: Directory for checkpoints, debug images and plots;
                created if missing.
            config: Optional dict of overrides merged on top of the defaults
                from GaussianParams.get_config_dict().
        """
        self.dataset_path = Path(dataset_path)
        self.output_path = Path(output_path)

        # Create output directories
        self.output_path.mkdir(parents=True, exist_ok=True)
        (self.output_path / "proj").mkdir(exist_ok=True)
        (self.output_path / "render").mkdir(exist_ok=True)
        (self.output_path / "target").mkdir(exist_ok=True)
        (self.output_path / "depth_hist").mkdir(exist_ok=True)
        (self.output_path / "point_cloud").mkdir(exist_ok=True)

        # Initialize configuration from GaussianParams
        self.config = GaussianParams.get_config_dict()

        if config is not None:
            self.config.update(config)

        # Set default number of points (will be updated if points3D.ply is loaded)
        self.num_points = self.config.get('num_points', 50000)

        # Initialize learning rate scheduler
        self.lr_scheduler = self.create_lr_scheduler()
        print(f"Learning rate scheduler: {'Enabled' if self.lr_scheduler else 'Disabled'}")

        # For tracking learning rates (appended per step in optimizer_step)
        self.learning_rate_history = {
            'positions': [],
            'scales': [],
            'rotations': [],
            'shs': [],
            'opacities': []
        }

        # Load dataset
        print(f"Loading COLMAP dataset from {self.dataset_path}")
        self.cameras, self.image_paths = self.load_colmap("train")
        self.test_cameras, self.test_image_paths = self.load_colmap("test")

        print(f"Loaded {len(self.cameras)} train cameras and {len(self.image_paths)} train images")
        print(f"Loaded {len(self.test_cameras)} test cameras and {len(self.test_image_paths)} test images")

        # Calculate scene extent for densification
        self.scene_extent = self.calculate_scene_extent()
        print(f"Calculated scene extent: {self.scene_extent}")

        # Initialize parameters (this may update self.num_points if points3D.ply is found)
        self.params = self.initialize_parameters()
        print(f"Initialized {self.num_points} Gaussians")

        # Create gradient arrays
        self.grads = self.create_gradient_arrays()

        # Create optimizer state (Adam first/second moments, same layout as grads)
        self.adam_m = self.create_gradient_arrays()
        self.adam_v = self.create_gradient_arrays()

        # Initialize densification state tracking
        self.init_densification_state()

        # For tracking loss
        self.losses = []

        # Intermediate rasterizer buffers from the latest forward pass;
        # populated by render_gaussians() during training.
        self.intermediate_buffers = {}

        # Track iteration for opacity reset (sentinel = "never reset yet")
        self.opacity_reset_at = -32768


        # Call after loading data
        #self.visualize_camera_points_alignment()
167
+
168
+ def create_lr_scheduler(self):
169
+ """Create simple learning rate schedulers for each parameter type."""
170
+ if not self.config['use_lr_scheduler']:
171
+ return None
172
+
173
+ config = self.config['lr_scheduler_config']
174
+ final_factor = config['final_lr_factor']
175
+
176
+ schedulers = {
177
+ 'positions': LRScheduler(config['lr_pos'], final_factor),
178
+ 'scales': LRScheduler(config['lr_scale'], final_factor),
179
+ 'rotations': LRScheduler(config['lr_rot'], final_factor),
180
+ 'shs': LRScheduler(config['lr_sh'], final_factor),
181
+ 'opacities': LRScheduler(config['lr_opac'], final_factor)
182
+ }
183
+
184
+ return schedulers
185
+
186
    def initialize_parameters(self):
        """Initialize Gaussian parameters, seeding from points3D.ply when available.

        Positions and colors are read from ``<dataset>/sparse/0/points3D.ply``
        if present; otherwise positions fall back to zeros and colors to gray.
        Per-point scales are derived from nearest-neighbor distances.

        Returns:
            dict of warp arrays: 'positions' (vec3), 'scales' (vec3),
            'rotations' (vec4), 'opacities' (float), 'shs' (num_points*16 vec3).

        Side effects:
            May overwrite ``self.num_points`` with the PLY point count.
        """
        # Try to load the sparse reconstruction's point cloud.
        points3d_path = self.dataset_path / "sparse/0/points3D.ply"
        initial_positions_np = None  # (N, 3) float32 once loaded
        initial_colors_np = None     # (N, 3) float32 in [0, 1] once loaded

        if points3d_path.exists():
            try:
                plydata = PlyData.read(str(points3d_path))
                vertices = plydata['vertex']
                if 'x' in vertices and 'y' in vertices and 'z' in vertices:
                    positions_data = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T
                    initial_positions_np = positions_data.astype(np.float32)

                    if 'red' in vertices and 'green' in vertices and 'blue' in vertices:
                        # PLY stores 8-bit colors; normalize to [0, 1].
                        colors_data = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T
                        initial_colors_np = (colors_data / 255.0).astype(np.float32)
                    else:
                        print("Warning: Color attributes (red, green, blue) not found in points3D.ply.")

                    # The configured num_points is superseded by the PLY count.
                    self.num_points = len(initial_positions_np)
                    print(f"Loaded {self.num_points} points from points3D.ply")
            except Exception as e:
                # Any parse failure falls back to the zero/gray defaults below.
                print(f"Warning: Could not load points3D.ply: {e}")
                initial_positions_np = None
                initial_colors_np = None

        if initial_positions_np is None:
            # Fallback if points3D.ply is not loaded or doesn't have positions.
            print(f"Warning: Initial positions not loaded. Initializing {self.num_points} positions to zeros (or expect random init if uncommented in kernel).")
            # self.num_points is already set from config or updated if PLY was partially read.
            initial_positions_np = np.zeros((self.num_points, 3), dtype=np.float32)

        # Derive initial per-point scales from nearest-neighbor distances.
        scales_np = np.zeros((self.num_points, 3), dtype=np.float32)
        if initial_positions_np is not None and self.num_points > 3:  # kdtree query needs k <= num_points
            try:
                print("Calculating initial scales using cKDTree...")
                kdtree = cKDTree(initial_positions_np)
                # k=2 returns the self-point plus its single nearest neighbor.
                # NOTE(review): the original comment claimed "3 nearest neighbors";
                # with k=2 the mean below is simply the 1-NN distance — confirm
                # whether k=4 (reference 3DGS behavior) was intended.
                k = 2
                distances, _ = kdtree.query(initial_positions_np, k=k, workers=-1)  # all CPU cores

                # distances[:, 0] is the distance to self (0.0), so skip it.
                radius_np = np.mean(distances[:, 1:], axis=1)
                # Isotropic scale: replicate the radius across x/y/z.
                scales_np = np.tile(radius_np[:, np.newaxis], (1, 3))
                print(f"Initial scales calculated. Min radius: {radius_np.min()}, Max radius: {radius_np.max()}, Mean radius: {radius_np.mean()}")
            except Exception as e:
                print(f"Error during cKDTree scale initialization: {e}. Falling back to default scale.")
                default_scale_val = self.config['initial_scale']
                scales_np = np.full((self.num_points, 3), default_scale_val, dtype=np.float32)
        else:
            default_scale_val = self.config['initial_scale']
            print(f"Not enough points for cKDTree or initial_positions_np is None. Using default scale: {default_scale_val}")
            scales_np = np.full((self.num_points, 3), default_scale_val, dtype=np.float32)

        # Upload to device arrays of the final size.
        positions = wp.array(initial_positions_np, dtype=wp.vec3, device=DEVICE)
        scales = wp.array(scales_np, dtype=wp.vec3, device=DEVICE)  # calculated or default scales
        rotations = wp.zeros(self.num_points, dtype=wp.vec4, device=DEVICE)
        opacities = wp.zeros(self.num_points, dtype=float, device=DEVICE)

        # Spherical-harmonics DC setup: color = C0 * sh_dc + 0.5, inverted here.
        C0 = 0.28209479177387814  # Y_00 basis constant
        shs_np_data = np.zeros((self.num_points * 16, 3), dtype=np.float32)
        if initial_colors_np is not None and initial_colors_np.shape[0] == self.num_points:
            # Every 16th row is the DC coefficient of one Gaussian.
            shs_np_data[::16] = (initial_colors_np - 0.5) / C0
        else:
            # Default to gray if colors are not available or mismatch.
            gray_color_sh = (np.array([0.5, 0.5, 0.5]) - 0.5) / C0
            shs_np_data[::16] = np.tile(gray_color_sh, (self.num_points, 1))
        shs = wp.array(shs_np_data, dtype=wp.vec3, device=DEVICE)

        # Kernel fills only rotations and opacities; scales and shs were
        # already initialized host-side above.
        wp.launch(
            init_gaussian_params,
            dim=self.num_points,
            inputs=[rotations, opacities, self.num_points]
        )

        return {
            'positions': positions,
            'scales': scales,
            'rotations': rotations,
            'opacities': opacities,
            'shs': shs
        }
276
+ def create_gradient_arrays(self):
277
+ """Create arrays for gradients or optimizer state."""
278
+ positions = wp.zeros(self.num_points, dtype=wp.vec3)
279
+ scales = wp.zeros(self.num_points, dtype=wp.vec3)
280
+ rotations = wp.zeros(self.num_points, dtype=wp.vec4)
281
+ opacities = wp.zeros(self.num_points, dtype=float)
282
+ shs = wp.zeros(self.num_points * 16, dtype=wp.vec3)
283
+
284
+ # Return a dictionary of arrays
285
+ return {
286
+ 'positions': positions,
287
+ 'scales': scales,
288
+ 'rotations': rotations,
289
+ 'opacities': opacities,
290
+ 'shs': shs
291
+ }
292
+
293
+ def calculate_scene_extent(self):
294
+ """Calculate the extent of the scene based on camera positions."""
295
+ if not self.cameras:
296
+ return 1.0 # Default fallback
297
+
298
+ # Extract camera positions
299
+ camera_positions = []
300
+ for camera in self.cameras:
301
+ camera_positions.append(camera['camera_center'])
302
+
303
+ camera_positions = np.array(camera_positions)
304
+
305
+ # Calculate the centroid of all camera positions
306
+ scene_center = np.mean(camera_positions, axis=0)
307
+
308
+ # Calculate the maximum distance from any camera to the scene center
309
+ max_distance_to_center = 0.0
310
+ for pos in camera_positions:
311
+ distance = np.linalg.norm(pos - scene_center)
312
+ max_distance_to_center = max(max_distance_to_center, distance)
313
+
314
+ # The scene extent is the radius of the bounding sphere
315
+ # Use default factor if extent is too small
316
+ extent = max_distance_to_center * self.config.get('camera_extent_factor', 1.0)
317
+ return max(extent, 1.0)
318
+
319
    def init_densification_state(self):
        """(Re)allocate per-Gaussian accumulator buffers for densification.

        NOTE(review): the accumulators are only allocated here; their update
        sites are not in this chunk. Names suggest xyz_gradient_accum holds a
        running gradient-magnitude sum, denom its event count, and max_radii2D
        the largest observed screen-space radius — verify against the kernels
        that write them.
        """
        self.xyz_gradient_accum = wp.zeros(self.num_points, dtype=float, device=DEVICE)
        self.denom = wp.zeros(self.num_points, dtype=float, device=DEVICE)
        self.max_radii2D = wp.zeros(self.num_points, dtype=float, device=DEVICE)
325
    def load_colmap(self, datasplit="train", llffhold=8):
        """Parse a COLMAP text export and build camera/image pairs for a split.

        Reads ``sparse/0/cameras.txt`` (intrinsics; PINHOLE and SIMPLE_PINHOLE
        only) and ``sparse/0/images.txt`` (extrinsics), then applies an
        LLFF-style holdout: every ``llffhold``-th image goes to "test", the
        rest to "train"; any other ``datasplit`` value returns everything.

        Side effects:
            On success, writes width/height/fx/fy/focal of the FIRST selected
            camera into ``self.config``.

        Returns:
            (cameras, image_paths) lists, or ([], []) when nothing matched.
        """
        colmap_dir = self.dataset_path / "sparse/0"
        images_dir = self.dataset_path / "images"
        intrinsics = {}  # cam_id -> (fx, fy, w, h, cx, cy)

        with open(colmap_dir / "cameras.txt") as f:
            for line in f:
                if line.startswith("#"): continue
                vals = line.strip().split()
                if len(vals) < 4: continue

                cam_id, model, w, h = int(vals[0]), vals[1], int(vals[2]), int(vals[3])

                if model == "PINHOLE":
                    # PINHOLE has 4 parameters: fx, fy, cx, cy
                    if len(vals) >= 8:  # 4 basic + 4 params
                        fx, fy, cx, cy = float(vals[4]), float(vals[5]), float(vals[6]), float(vals[7])
                    else:
                        continue
                elif model == "SIMPLE_PINHOLE":
                    # SIMPLE_PINHOLE has 3 parameters: f, cx, cy
                    if len(vals) >= 7:  # 4 basic + 3 params
                        # NOTE(review): rebinding `f` shadows the file handle;
                        # iteration continues (the loop holds the iterator) but
                        # the name is misleading.
                        f, cx, cy = float(vals[4]), float(vals[5]), float(vals[6])
                        fx = fy = f  # same focal length for both axes
                    else:
                        continue
                else:
                    print(f"Unsupported camera model: {model}")
                    continue

                intrinsics[cam_id] = (fx, fy, w, h, cx, cy)

        extrinsics = []
        with open(colmap_dir / "images.txt") as f:
            # NOTE(review): standard COLMAP images.txt alternates image lines
            # with POINTS2D lines. The `len(parts) < 10` guard only skips
            # SHORT points lines; a points line with >=4 features has >=12
            # tokens and would reach int(parts[8]) with a float string,
            # raising an uncaught ValueError. Works only if the export has
            # empty POINTS2D lines — confirm for the target datasets.
            for line in f:
                if line.startswith("#"): continue
                parts = line.strip().split()
                if len(parts) < 10: continue

                # COLMAP images.txt format: IMAGE_ID QW QX QY QZ TX TY TZ CAMERA_ID NAME
                img_id, qw, qx, qy, qz, tx, ty, tz, cam_id, img_name = parts[:10]
                cam_id = int(cam_id)

                if cam_id not in intrinsics:
                    print(f"Warning: Camera ID {cam_id} not found in intrinsics")
                    continue

                fx, fy, w, h, cx, cy = intrinsics[cam_id]

                # Normalize the (w, x, y, z) quaternion before conversion.
                q = np.array([float(qw), float(qx), float(qy), float(qz)])
                q = q / np.linalg.norm(q)

                t = np.array([float(tx), float(ty), float(tz)])
                R = quaternion_to_rotation_matrix(q)

                # R, T are COLMAP's world-to-camera convention; any conversion
                # to camera-to-world is left to load_camera_colmap.
                cam_info = {
                    "camera_id": int(img_id),
                    "width": w,
                    "height": h,
                    "fx": fx,
                    "fy": fy,
                    "cx": cx,
                    "cy": cy,
                    "R": R,
                    "T": t
                }

                camera = load_camera_colmap(cam_info)
                if camera:
                    extrinsics.append((camera, str(images_dir / img_name)))

        # LLFF-style holdout split: every llffhold-th image is held out.
        if datasplit == "train":
            selected = [c for i, c in enumerate(extrinsics) if i % llffhold != 0]
        elif datasplit == "test":
            selected = [c for i, c in enumerate(extrinsics) if i % llffhold == 0]
        else:
            selected = extrinsics

        if selected:
            cameras, image_paths = zip(*selected)
            width = cameras[0]['width']
            height = cameras[0]['height']
            fx = cameras[0]['fx']
            fy = cameras[0]['fy']

            # Field of view of the first camera.
            # NOTE(review): camera_angle_x/y are computed but never used or
            # stored — dead locals kept for reference?
            camera_angle_x = 2 * np.arctan(0.5 * width / fx)
            camera_angle_y = 2 * np.arctan(0.5 * height / fy)

            # Cache the first camera's geometry globally; assumes all
            # selected cameras share the same intrinsics.
            self.config['width'] = width
            self.config['height'] = height
            self.config['fx'] = fx
            self.config['fy'] = fy
            self.config['focal'] = fx  # fx as primary focal length

            return list(cameras), list(image_paths)

        return [], []
433
+ def load_image(self, path):
434
+ """Load an image as a numpy array."""
435
+ if os.path.exists(path):
436
+ img = imageio.imread(path)
437
+ # Convert to float and normalize to [0, 1]
438
+ img_np = img.astype(np.float32) / 255.0
439
+ # Ensure image is RGB (discard alpha channel if present)
440
+ if img_np.shape[2] == 4:
441
+ img_np = img_np[:, :, :3] # Keep only R, G, B channels
442
+ return img_np
443
+ else:
444
+ raise FileNotFoundError(f"Image not found: {path}")
445
+
446
    def zero_grad(self):
        """Reset all gradient buffers to zero before rendering/backprop.

        Launches the ``zero_gradients`` warp kernel (defined elsewhere in
        this file) over every Gaussian, clearing all five gradient arrays
        in a single pass.
        """
        wp.launch(
            zero_gradients,
            dim=self.num_points,
            inputs=[
                self.grads['positions'],
                self.grads['scales'],
                self.grads['rotations'],
                self.grads['opacities'],
                self.grads['shs'],
                self.num_points
            ]
        )
461
    def densification_and_pruning(self, iteration):
        """Adaptively clone/split/prune Gaussians and periodically reset opacity.

        Inside the configured iteration window (``densify_from_iter`` <
        iteration < ``densify_until_iter``, every ``densification_interval``
        iterations) this:
          1. clones small Gaussians whose position-gradient norm exceeds the
             threshold (under-reconstruction),
          2. splits oversized high-gradient Gaussians into two smaller ones
             and removes the originals,
          3. prunes near-transparent Gaussians (opacity-based, with safety
             bounds on how much may be removed).
        Independently, opacities are reset on ``opacity_reset_interval``.
        Every step that changes the point count reallocates the gradient and
        Adam moment buffers (optimizer state is NOT carried over).

        Args:
            iteration: Current training iteration, drives the schedule.
        """

        # Schedule parameters (defaults follow the reference 3DGS setup).
        densify_from_iter = self.config.get('densify_from_iter', 500)
        densify_until_iter = self.config.get('densify_until_iter', 15000)
        densification_interval = self.config.get('densification_interval', 100)
        opacity_reset_interval = self.config.get('opacity_reset_interval', 3000)

        # Densify only inside the window and on the configured interval.
        if iteration > densify_from_iter and iteration < densify_until_iter and iteration % densification_interval == 0:
            print(f"Iteration {iteration}: Performing sophisticated densification and pruning")

            # Simplification: the accumulated view-space gradient of the
            # reference implementation is approximated by the current 3D
            # position-gradient norm.
            pos_grads = self.grads['positions']
            avg_grads = wp.zeros(self.num_points, dtype=float, device=DEVICE)

            # Per-point L2 norm of the position gradient.
            @wp.kernel
            def compute_grad_norms(pos_grad: wp.array(dtype=wp.vec3),
                                   grad_norms: wp.array(dtype=float),
                                   num_points: int):
                i = wp.tid()
                if i >= num_points:
                    return
                grad_norms[i] = wp.length(pos_grad[i])

            wp.launch(compute_grad_norms, dim=self.num_points,
                      inputs=[pos_grads, avg_grads, self.num_points])

            # Densification thresholds.
            grad_threshold = self.config.get('densify_grad_threshold', 0.0002)
            percent_dense = self.config.get('percent_dense', 0.01)

            # --- Step 1: Clone small Gaussians with high gradients ---
            clone_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)
            wp.launch(
                mark_clone_candidates,
                dim=self.num_points,
                inputs=[
                    avg_grads,
                    self.params['scales'],
                    grad_threshold,
                    self.scene_extent,
                    percent_dense,
                    clone_mask,
                    self.num_points
                ]
            )

            # Exclusive prefix sum turns the mask into destination indices.
            # NOTE(review): with an EXCLUSIVE scan, prefix_sum[-1] omits the
            # last element's mask value — if the final Gaussian is marked,
            # this total is one short. Verify against clone_gaussians'
            # indexing (an inclusive scan or `+ mask[-1]` may be intended).
            clone_prefix_sum = wp.zeros_like(clone_mask)
            wp.utils.array_scan(clone_mask, clone_prefix_sum, inclusive=False)
            total_to_clone = int(clone_prefix_sum.numpy()[-1])

            if total_to_clone > 0:
                print(f"[Clone] Cloning {total_to_clone} small Gaussians")
                N = self.num_points
                new_N = N + total_to_clone

                # Allocate output arrays sized for originals + clones.
                out_params = {
                    'positions': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
                    'scales': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
                    'rotations': wp.zeros(new_N, dtype=wp.vec4, device=DEVICE),
                    'opacities': wp.zeros(new_N, dtype=float, device=DEVICE),
                    'shs': wp.zeros(new_N * 16, dtype=wp.vec3, device=DEVICE)
                }

                # Copy originals and append jittered clones.
                wp.launch(
                    clone_gaussians,
                    dim=N,
                    inputs=[
                        clone_mask,
                        clone_prefix_sum,
                        self.params['positions'],
                        self.params['scales'],
                        self.params['rotations'],
                        self.params['opacities'],
                        self.params['shs'],
                        0.01,  # noise_scale for the cloned positions
                        N,     # offset where clones start in the output
                        out_params['positions'],
                        out_params['scales'],
                        out_params['rotations'],
                        out_params['opacities'],
                        out_params['shs']
                    ]
                )

                # Swap in the grown arrays; optimizer state starts from zero.
                self.params = out_params
                self.num_points = new_N
                self.grads = self.create_gradient_arrays()
                self.adam_m = self.create_gradient_arrays()
                self.adam_v = self.create_gradient_arrays()

            # --- Step 2: Split large Gaussians with high gradients ---
            # NOTE(review): avg_grads was sized/computed BEFORE cloning; if
            # cloning grew num_points, the kernel below reads past the old
            # gradient data for the appended clones — confirm intent.
            split_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)
            wp.launch(
                mark_split_candidates,
                dim=self.num_points,
                inputs=[
                    avg_grads,
                    self.params['scales'],
                    grad_threshold,
                    self.scene_extent,
                    percent_dense,
                    split_mask,
                    self.num_points
                ]
            )

            # Same exclusive-scan caveat as for cloning above.
            split_prefix_sum = wp.zeros_like(split_mask)
            wp.utils.array_scan(split_mask, split_prefix_sum, inclusive=False)
            total_to_split = int(split_prefix_sum.numpy()[-1])

            if total_to_split > 0:
                print(f"[Split] Splitting {total_to_split} large Gaussians")
                N = self.num_points
                N_split = 2  # children per split Gaussian
                new_N = N + total_to_split * N_split

                # Allocate output arrays sized for originals + children.
                out_params = {
                    'positions': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
                    'scales': wp.zeros(new_N, dtype=wp.vec3, device=DEVICE),
                    'rotations': wp.zeros(new_N, dtype=wp.vec4, device=DEVICE),
                    'opacities': wp.zeros(new_N, dtype=float, device=DEVICE),
                    'shs': wp.zeros(new_N * 16, dtype=wp.vec3, device=DEVICE)
                }

                # Copy originals and append the shrunken children.
                wp.launch(
                    split_gaussians,
                    dim=N,
                    inputs=[
                        split_mask,
                        split_prefix_sum,
                        self.params['positions'],
                        self.params['scales'],
                        self.params['rotations'],
                        self.params['opacities'],
                        self.params['shs'],
                        N_split,  # number of children per split Gaussian
                        0.8,      # scale_factor applied to children
                        N,        # offset where children start in the output
                        out_params['positions'],
                        out_params['scales'],
                        out_params['rotations'],
                        out_params['opacities'],
                        out_params['shs']
                    ]
                )

                # Swap in the grown arrays; optimizer state starts from zero.
                self.params = out_params
                self.num_points = new_N
                self.grads = self.create_gradient_arrays()
                self.adam_m = self.create_gradient_arrays()
                self.adam_v = self.create_gradient_arrays()

                # The parents of split Gaussians must now be removed (their
                # children replace them).
                prune_filter = wp.zeros(self.num_points, dtype=int, device=DEVICE)

                # Mark indices < offset whose split_mask was set.
                @wp.kernel
                def mark_split_originals_for_removal(
                    split_mask: wp.array(dtype=int),
                    prune_filter: wp.array(dtype=int),
                    offset: int,
                    num_points: int
                ):
                    i = wp.tid()
                    if i >= num_points:
                        return
                    if i < offset and split_mask[i] == 1:
                        prune_filter[i] = 1  # mark parent for removal
                    else:
                        prune_filter[i] = 0  # keep

                wp.launch(mark_split_originals_for_removal, dim=self.num_points,
                          inputs=[split_mask, prune_filter, N, self.num_points])

                # Invert removal mask into a keep mask for compaction.
                valid_mask = wp.zeros_like(prune_filter)

                @wp.kernel
                def invert_mask(prune: wp.array(dtype=int), valid: wp.array(dtype=int), n: int):
                    i = wp.tid()
                    if i >= n:
                        return
                    valid[i] = 1 - prune[i]

                wp.launch(invert_mask, dim=self.num_points,
                          inputs=[prune_filter, valid_mask, self.num_points])

                # Count survivors and compact (same exclusive-scan caveat).
                prefix_sum = wp.zeros_like(valid_mask)
                wp.utils.array_scan(valid_mask, prefix_sum, inclusive=False)
                valid_count = int(prefix_sum.numpy()[-1])

                if valid_count < self.num_points:
                    print(f"[Split] Removing {self.num_points - valid_count} original split Gaussians")

                    # Allocate compacted output.
                    compact_params = {
                        'positions': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
                        'scales': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
                        'rotations': wp.zeros(valid_count, dtype=wp.vec4, device=DEVICE),
                        'opacities': wp.zeros(valid_count, dtype=float, device=DEVICE),
                        'shs': wp.zeros(valid_count * 16, dtype=wp.vec3, device=DEVICE)
                    }

                    wp.launch(
                        compact_gaussians,
                        dim=self.num_points,
                        inputs=[
                            valid_mask,
                            prefix_sum,
                            self.params['positions'],
                            self.params['scales'],
                            self.params['rotations'],
                            self.params['opacities'],
                            self.params['shs'],
                            compact_params['positions'],
                            compact_params['scales'],
                            compact_params['rotations'],
                            compact_params['opacities'],
                            compact_params['shs']
                        ]
                    )

                    # Swap in the compacted arrays; optimizer state resets.
                    self.params = compact_params
                    self.num_points = valid_count
                    self.grads = self.create_gradient_arrays()
                    self.adam_m = self.create_gradient_arrays()
                    self.adam_v = self.create_gradient_arrays()

            # --- Step 3: Enhanced Pruning ---
            print(f"[Prune] Performing enhanced pruning")

            valid_mask = wp.zeros(self.num_points, dtype=int, device=DEVICE)

            # Keep only Gaussians above the opacity cull threshold.
            wp.launch(
                prune_gaussians,
                dim=self.num_points,
                inputs=[
                    self.params['opacities'],
                    self.config.get('cull_opacity_threshold', 0.005),
                    valid_mask,
                    self.num_points
                ]
            )

            # Count survivors (same exclusive-scan caveat as above).
            prefix_sum = wp.zeros_like(valid_mask)
            wp.utils.array_scan(valid_mask, prefix_sum, inclusive=False)
            valid_count = int(prefix_sum.numpy()[-1])

            # Safety bounds: never prune below min_valid_points, above
            # max_valid_points, or more than max_allowed_prune_ratio at once.
            min_valid_points = self.config.get('min_valid_points', 1000)
            max_valid_points = self.config.get('max_valid_points', 1000000)
            max_prune_ratio = self.config.get('max_allowed_prune_ratio', 0.5)

            prune_count = self.num_points - valid_count
            prune_ratio = prune_count / self.num_points if self.num_points > 0 else 0

            if (valid_count >= min_valid_points and
                valid_count <= max_valid_points and
                prune_ratio <= max_prune_ratio and
                valid_count < self.num_points):

                print(f"[Prune] Compacting from {self.num_points} → {valid_count} points")

                # Allocate compacted output.
                out_params = {
                    'positions': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
                    'scales': wp.zeros(valid_count, dtype=wp.vec3, device=DEVICE),
                    'rotations': wp.zeros(valid_count, dtype=wp.vec4, device=DEVICE),
                    'opacities': wp.zeros(valid_count, dtype=float, device=DEVICE),
                    'shs': wp.zeros(valid_count * 16, dtype=wp.vec3, device=DEVICE)
                }

                wp.launch(
                    compact_gaussians,
                    dim=self.num_points,
                    inputs=[
                        valid_mask,
                        prefix_sum,
                        self.params['positions'],
                        self.params['scales'],
                        self.params['rotations'],
                        self.params['opacities'],
                        self.params['shs'],
                        out_params['positions'],
                        out_params['scales'],
                        out_params['rotations'],
                        out_params['opacities'],
                        out_params['shs']
                    ]
                )

                # Swap in the compacted arrays; optimizer state resets.
                self.params = out_params
                self.num_points = valid_count
                self.grads = self.create_gradient_arrays()
                self.adam_m = self.create_gradient_arrays()
                self.adam_v = self.create_gradient_arrays()
            else:
                print(f"[Prune] Skipping pruning: valid={valid_count}, ratio={prune_ratio:.3f}")

        # Opacity reset: periodic, plus one extra reset at the start of
        # densification when the background is white (matches the reference
        # implementation's handling of white-background scenes).
        background_is_white = all(c == 1.0 for c in self.config['background_color'])
        should_reset_opacity = (
            iteration % opacity_reset_interval == 0 or
            (background_is_white and iteration == densify_from_iter)
        )

        if should_reset_opacity:
            print(f"Iteration {iteration}: Resetting opacities")
            wp.launch(
                reset_opacities,
                dim=self.num_points,
                inputs=[
                    self.params['opacities'],
                    0.01,  # max_opacity clamp after reset
                    self.num_points
                ]
            )
796
    def optimizer_step(self, iteration):
        """Apply one Adam update to all Gaussian parameter groups.

        Learning rates come from the per-group schedulers when enabled
        (also recorded in ``self.learning_rate_history`` and logged every
        1000 iterations), otherwise from the static config values. The
        actual update runs in the ``adam_update`` warp kernel.

        Args:
            iteration: Current training iteration; fed to the schedulers
                and the kernel (presumably for Adam bias correction —
                confirm in the kernel).
        """

        # Resolve learning rates: scheduled or static.
        if self.lr_scheduler:
            lr_pos = self.lr_scheduler['positions'].get_lr(iteration, self.config['num_iterations'])
            lr_scale = self.lr_scheduler['scales'].get_lr(iteration, self.config['num_iterations'])
            lr_rot = self.lr_scheduler['rotations'].get_lr(iteration, self.config['num_iterations'])
            lr_sh = self.lr_scheduler['shs'].get_lr(iteration, self.config['num_iterations'])
            lr_opac = self.lr_scheduler['opacities'].get_lr(iteration, self.config['num_iterations'])

            # Record the schedule for later plotting/inspection.
            self.learning_rate_history['positions'].append(lr_pos)
            self.learning_rate_history['scales'].append(lr_scale)
            self.learning_rate_history['rotations'].append(lr_rot)
            self.learning_rate_history['shs'].append(lr_sh)
            self.learning_rate_history['opacities'].append(lr_opac)

            # Periodic learning-rate log.
            if iteration % 1000 == 0:
                print(f"Iteration {iteration} learning rates:")
                print(f"  positions: {lr_pos:.6f}")
                print(f"  scales: {lr_scale:.6f}")
                print(f"  rotations: {lr_rot:.6f}")
                print(f"  shs: {lr_sh:.6f}")
                print(f"  opacities: {lr_opac:.6f}")
        else:
            # Static learning rates from config.
            lr_pos = self.config['lr_pos']
            lr_scale = self.config['lr_scale']
            lr_rot = self.config['lr_rot']
            lr_sh = self.config['lr_sh']
            lr_opac = self.config['lr_opac']

        # Single fused kernel: Adam moments + parameter update for all
        # five parameter groups. Argument order must match the kernel
        # signature exactly.
        wp.launch(
            adam_update,
            dim=self.num_points,
            inputs=[
                # Parameters
                self.params['positions'],
                self.params['scales'],
                self.params['rotations'],
                self.params['opacities'],
                self.params['shs'],

                # Gradients
                self.grads['positions'],
                self.grads['scales'],
                self.grads['rotations'],
                self.grads['opacities'],
                self.grads['shs'],

                # First moments (m)
                self.adam_m['positions'],
                self.adam_m['scales'],
                self.adam_m['rotations'],
                self.adam_m['opacities'],
                self.adam_m['shs'],

                # Second moments (v)
                self.adam_v['positions'],
                self.adam_v['scales'],
                self.adam_v['rotations'],
                self.adam_v['opacities'],
                self.adam_v['shs'],

                # Optimizer hyperparameters with the resolved learning rates
                self.num_points,
                lr_pos,    # positions
                lr_scale,  # scales
                lr_rot,    # rotations
                lr_sh,     # SH coefficients
                lr_opac,   # opacities
                self.config['adam_beta1'],
                self.config['adam_beta2'],
                self.config['adam_epsilon'],
                iteration
            ]
        )
876
    def save_checkpoint(self, iteration):
        """Persist the point cloud, loss history/plot, and one rendered view.

        Writes into ``<output>/point_cloud/iteration_<N>/``:
          - point_cloud.ply (current Gaussian parameters)
          - rendered_view.png (render from camera 0)
        and refreshes ``<output>/loss.txt`` and ``<output>/loss_plot.png``.

        Args:
            iteration: Training iteration used to name the checkpoint dir.
        """
        checkpoint_dir = self.output_path / "point_cloud" / f"iteration_{iteration}"
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Save point cloud as PLY.
        ply_path = checkpoint_dir / "point_cloud.ply"
        save_ply(self.params, ply_path, self.num_points)

        # Save loss history (one value per line, overwritten each time).
        loss_path = self.output_path / "loss.txt"
        with open(loss_path, 'w') as f:
            for loss in self.losses:
                f.write(f"{loss}\n")

        # Save loss plot.
        plt.figure(figsize=(10, 5))
        plt.plot(self.losses)
        plt.title('Training Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.savefig(self.output_path / "loss_plot.png")
        plt.close()

        # Render a reference view from the first training camera.
        # NOTE(review): "front view" is an assumption — camera 0 is simply
        # the first camera the COLMAP loader kept.
        camera_idx = 0
        # NOTE(review): antialiasing=True here but False in the training
        # loop's render call — confirm the checkpoint render is meant to
        # differ from what was optimized.
        rendered_image, _, _ = render_gaussians(
            background=np.array(self.config['background_color'], dtype=np.float32),
            means3D=self.params['positions'].numpy(),
            colors=None,  # colors come from SH coefficients below
            opacity=self.params['opacities'].numpy(),
            scales=self.params['scales'].numpy(),
            rotations=self.params['rotations'].numpy(),
            scale_modifier=self.config['scale_modifier'],
            viewmatrix=self.cameras[camera_idx]['world_to_camera'],
            projmatrix=self.cameras[camera_idx]['full_proj_matrix'],
            tan_fovx=self.cameras[camera_idx]['tan_fovx'],
            tan_fovy=self.cameras[camera_idx]['tan_fovy'],
            image_height=self.cameras[camera_idx]['height'],
            image_width=self.cameras[camera_idx]['width'],
            sh=self.params['shs'].numpy(),  # SH coefficients
            degree=self.config['sh_degree'],
            campos=self.cameras[camera_idx]['camera_center'],
            prefiltered=False,
            antialiasing=True,
            clamped=True
        )
        # Save rendered view as an 8-bit PNG.
        rendered_array = wp.to_torch(rendered_image).cpu().numpy()
        # Handle channel-first (3, H, W) output — transpose to (H, W, 3).
        if rendered_array.shape[0] == 3 and len(rendered_array.shape) == 3:
            rendered_array = np.transpose(rendered_array, (1, 2, 0))
        img8 = (np.clip(rendered_array, 0, 1) * 255).astype(np.uint8)
        imageio.imwrite(checkpoint_dir / "rendered_view.png", img8)
932
+ def debug_log_and_save_images(
933
+ self,
934
+ rendered_image, # np.float32 H×W×3 (range 0-1)
935
+ target_image, # np.float32
936
+ depth_image, # wp.array2d(float) – optional but unused here
937
+ camera_idx: int,
938
+ it: int
939
+ ):
940
+ # ------ quick numeric read-out -----------------------------------
941
+ radii = wp.to_torch(self.intermediate_buffers["radii"]).cpu().numpy()
942
+ alphas = wp.to_torch(self.intermediate_buffers["conic_opacity"]).cpu().numpy()[:, 3]
943
+ offs = wp.to_torch(self.intermediate_buffers["point_offsets"]).cpu().numpy()
944
+ num_dup = int(offs[-1]) if len(offs) else 0
945
+ r_med = np.median(radii[radii > 0]) if (radii > 0).any() else 0
946
+
947
+ # Count visible Gaussians
948
+ xy_image = wp.to_torch(self.intermediate_buffers["points_xy_image"]).cpu().numpy()
949
+ W = self.cameras[camera_idx]['width']
950
+ H = self.cameras[camera_idx]['height']
951
+ visible_gaussians = np.sum(
952
+ (xy_image[:, 0] >= 0) & (xy_image[:, 0] < W) &
953
+ (xy_image[:, 1] >= 0) & (xy_image[:, 1] < H) &
954
+ np.isfinite(xy_image).all(axis=1) &
955
+ (radii > 0) # Only count Gaussians with positive radius
956
+ )
957
+
958
+ print(
959
+ f"[it {it:05d}] cam={camera_idx:02d} dup={num_dup:<6} "
960
+ f"r_med={r_med:5.1f} α∈[{alphas.min():.3f},"
961
+ f"{np.median(alphas):.3f},{alphas.max():.3f}] "
962
+ f"visible={visible_gaussians}/{len(xy_image)}"
963
+ )
964
+
965
+ # ------ save render / target PNG ---------------------------------
966
+ def save_rgb(arr_f32, stem):
967
+ # Handle case where arr_f32 has shape (3, H, W) - transpose to (H, W, 3)
968
+ if arr_f32.shape[0] == 3 and len(arr_f32.shape) == 3:
969
+ arr_f32 = np.transpose(arr_f32, (1, 2, 0))
970
+ img8 = (np.clip(arr_f32, 0, 1) * 255).astype(np.uint8)
971
+ # Include camera index in the filename
972
+ imageio.imwrite(self.output_path / f"{stem}" / f"{stem}_{it:06d}_cam{camera_idx:02d}.png", img8)
973
+
974
+ save_rgb(rendered_image if isinstance(rendered_image, np.ndarray) else wp.to_torch(rendered_image).cpu().numpy(), "render")
975
+ save_rgb(target_image, "target")
976
+
977
+ # ------ make 2-D projection scatter ------------------------------
978
+ xy = wp.to_torch(self.intermediate_buffers["points_xy_image"]).cpu().numpy()
979
+ depth = wp.to_torch(self.intermediate_buffers["depths"]).cpu().numpy()
980
+ H, W = self.config["height"], self.config["width"]
981
+
982
+ mask = (
983
+ (xy[:, 0] >= 0) & (xy[:, 0] < W) &
984
+ (xy[:, 1] >= 0) & (xy[:, 1] < H) &
985
+ np.isfinite(xy).all(axis=1) &
986
+ (radii > 0) # Only include Gaussians with positive radius
987
+ )
988
+ if mask.any():
989
+ plt.figure(figsize=(6, 6))
990
+ plt.scatter(xy[mask, 0], xy[mask, 1],
991
+ s=4, c=depth[mask], cmap="turbo", alpha=.7)
992
+ plt.gca().invert_yaxis()
993
+ plt.xlim(0, W); plt.ylim(H, 0)
994
+ plt.title(f"Projected Gaussians (cam {camera_idx}, iter {it}): {np.sum(mask)}/{len(xy)} visible")
995
+ plt.colorbar(label="depth(z)")
996
+ plt.tight_layout()
997
+ # Include camera index in the filename
998
+ plt.savefig(self.output_path / 'proj' / f"proj_{it:06d}_cam{camera_idx:02d}.png", dpi=250)
999
+ plt.close()
1000
+
1001
+ # depth histogram
1002
+ plt.figure(figsize=(5, 3))
1003
+ plt.hist(depth[mask], bins=40, color="steelblue")
1004
+ plt.xlabel("depth (camera-z)")
1005
+ plt.ylabel("count")
1006
+ plt.title(f"Depth hist – cam {camera_idx}, {mask.sum()} pts")
1007
+ plt.tight_layout()
1008
+ # Include camera index in the filename
1009
+ plt.savefig(self.output_path / 'depth_hist' / f"depth_hist_{it:06d}.png", dpi=250)
1010
+ plt.close()
1011
+
1012
+ def train(self):
1013
+ """Train the 3D Gaussian Splatting model."""
1014
+ num_iterations = self.config['num_iterations']
1015
+
1016
+ # Main training loop
1017
+ with tqdm(total=num_iterations) as pbar:
1018
+ for iteration in range(num_iterations):
1019
+ # Select a random camera and corresponding image
1020
+ camera_idx = np.random.randint(0, len(self.cameras))
1021
+ image_path = self.image_paths[camera_idx]
1022
+ target_image = self.load_image(image_path)
1023
+
1024
+ # Zero gradients
1025
+ self.zero_grad()
1026
+ # Render the view
1027
+ rendered_image, depth_image, self.intermediate_buffers = render_gaussians(
1028
+ background=np.array(self.config['background_color'], dtype=np.float32),
1029
+ means3D=self.params['positions'].numpy(),
1030
+ colors=None, # Use SH coefficients instead
1031
+ opacity=self.params['opacities'].numpy(),
1032
+ scales=self.params['scales'].numpy(),
1033
+ rotations=self.params['rotations'].numpy(),
1034
+ scale_modifier=self.config['scale_modifier'],
1035
+ viewmatrix=self.cameras[camera_idx]['world_to_camera'],
1036
+ projmatrix=self.cameras[camera_idx]['full_proj_matrix'],
1037
+ tan_fovx=self.cameras[camera_idx]['tan_fovx'],
1038
+ tan_fovy=self.cameras[camera_idx]['tan_fovy'],
1039
+ image_height=self.cameras[camera_idx]['height'],
1040
+ image_width=self.cameras[camera_idx]['width'],
1041
+ sh=self.params['shs'].numpy(), # Pass SH coefficients
1042
+ degree=self.config['sh_degree'],
1043
+ campos=self.cameras[camera_idx]['camera_center'],
1044
+ prefiltered=False,
1045
+ antialiasing=False,
1046
+ clamped=True
1047
+ )
1048
+
1049
+ radii = wp.to_torch(self.intermediate_buffers["radii"]).cpu().numpy()
1050
+ np_rendered_image = wp.to_torch(rendered_image).cpu().numpy()
1051
+ np_rendered_image = np_rendered_image.transpose(2, 0, 1)
1052
+
1053
+ #if iteration % self.config['save_interval'] == 0:
1054
+ if (
1055
+ iteration < 10 or
1056
+ #(iteration < 50 and iteration % 5 == 0) or
1057
+ #(iteration < 100 and iteration % 10 == 0) or
1058
+ #(iteration < 1000 and iteration % 100 == 0) or
1059
+ (iteration % 1000 == 0) or
1060
+ (iteration == num_iterations - 1)
1061
+ ):
1062
+ self.debug_log_and_save_images(np_rendered_image, target_image, depth_image, camera_idx, iteration)
1063
+
1064
+ # Calculate L1 loss
1065
+ l1_val = l1_loss(rendered_image, target_image)
1066
+
1067
+ # # Calculate SSIM, not used
1068
+ # ssim_val = ssim(rendered_image, target_image)
1069
+ # # Combined loss with weighted SSIM
1070
+ # lambda_dssim = self.config['lambda_dssim']
1071
+ # # loss = (1 - λ) * L1 + λ * (1 - SSIM)
1072
+ # loss = (1.0 - lambda_dssim) * l1_val + lambda_dssim * (1.0 - ssim_val)
1073
+
1074
+ loss = l1_val
1075
+ self.losses.append(loss)
1076
+ # Compute pixel gradients for image loss (dL/dColor)
1077
+ pixel_grad_buffer = compute_image_gradients(
1078
+ rendered_image, target_image, lambda_dssim=0
1079
+ )
1080
+
1081
+ # Prepare camera parameters
1082
+ camera = self.cameras[camera_idx]
1083
+ view_matrix = wp.mat44(camera['world_to_camera'].flatten())
1084
+ proj_matrix = wp.mat44(camera['full_proj_matrix'].flatten())
1085
+ campos = wp.vec3(camera['camera_center'][0], camera['camera_center'][1], camera['camera_center'][2])
1086
+
1087
+ # Create appropriate buffer dictionaries for the backward pass
1088
+ geom_buffer = {
1089
+ 'radii': self.intermediate_buffers['radii'],
1090
+ 'means2D': self.intermediate_buffers['points_xy_image'],
1091
+ 'conic_opacity': self.intermediate_buffers['conic_opacity'],
1092
+ 'rgb': self.intermediate_buffers['colors'],
1093
+ 'clamped': self.intermediate_buffers['clamped_state']
1094
+ }
1095
+
1096
+ binning_buffer = {
1097
+ 'point_list': self.intermediate_buffers['point_list']
1098
+ }
1099
+
1100
+ img_buffer = {
1101
+ 'ranges': self.intermediate_buffers['ranges'],
1102
+ 'final_Ts': self.intermediate_buffers['final_Ts'],
1103
+ 'n_contrib': self.intermediate_buffers['n_contrib']
1104
+ }
1105
+
1106
+ gradients = backward(
1107
+ # Core parameters
1108
+ background=np.array(self.config['background_color'], dtype=np.float32),
1109
+ means3D=self.params['positions'],
1110
+ dL_dpixels=pixel_grad_buffer,
1111
+
1112
+ # Model parameters (pass directly from self.params)
1113
+ opacity=self.params['opacities'],
1114
+ shs=self.params['shs'],
1115
+ scales=self.params['scales'],
1116
+ rotations=self.params['rotations'],
1117
+ scale_modifier=self.config['scale_modifier'],
1118
+
1119
+ # Camera parameters
1120
+ viewmatrix=view_matrix,
1121
+ projmatrix=proj_matrix,
1122
+ tan_fovx=camera['tan_fovx'],
1123
+ tan_fovy=camera['tan_fovy'],
1124
+ image_height=camera['height'],
1125
+ image_width=camera['width'],
1126
+ campos=campos,
1127
+
1128
+ # Forward output buffers
1129
+ radii=self.intermediate_buffers['radii'],
1130
+ means2D=self.intermediate_buffers['points_xy_image'],
1131
+ conic_opacity=self.intermediate_buffers['conic_opacity'],
1132
+ rgb=self.intermediate_buffers['colors'],
1133
+ cov3Ds=self.intermediate_buffers['cov3Ds'],
1134
+ clamped=self.intermediate_buffers['clamped_state'],
1135
+
1136
+ # Internal state buffers
1137
+ geom_buffer=geom_buffer,
1138
+ binning_buffer=binning_buffer,
1139
+ img_buffer=img_buffer,
1140
+
1141
+ # Algorithm parameters
1142
+ degree=self.config['sh_degree'],
1143
+ debug=False
1144
+ )
1145
+
1146
+ # 3. Copy gradients from backward result to the optimizer's gradient buffers
1147
+ wp.copy(self.grads['positions'], gradients['dL_dmean3D'])
1148
+ wp.copy(self.grads['scales'], gradients['dL_dscale'])
1149
+ wp.copy(self.grads['rotations'], gradients['dL_drot'])
1150
+ wp.copy(self.grads['opacities'], gradients['dL_dopacity'])
1151
+ wp.copy(self.grads['shs'], gradients['dL_dshs'])
1152
+
1153
+ # Update parameters
1154
+ self.optimizer_step(iteration)
1155
+
1156
+ # Update progress bar
1157
+ pbar.update(1)
1158
+ pbar.set_description(f"Loss: {loss:.6f}")
1159
+
1160
+ self.densification_and_pruning(iteration)
1161
+
1162
+ # Save checkpoint
1163
+ #if iteration % self.config['save_interval'] == 0 or iteration == num_iterations - 1:
1164
+
1165
+ if (
1166
+ iteration < 10 or
1167
+ # (iteration < 50 and iteration % 5 == 0) or
1168
+ # (iteration < 100 and iteration % 10 == 0) or
1169
+ # (iteration < 1000 and iteration % 100 == 0) or
1170
+ (iteration % 1000 == 0) or
1171
+ (iteration == num_iterations - 1)
1172
+ ):
1173
+ self.save_checkpoint(iteration)
1174
+ print("Training complete!")
1175
+
1176
+
1177
+ def visualize_camera_points_alignment_interactive(self):
1178
+ """Create an interactive 3D visualization with camera frustums and colored points"""
1179
+ try:
1180
+ import plotly.graph_objects as go
1181
+ from plotly.subplots import make_subplots
1182
+ import plotly.express as px
1183
+ except ImportError:
1184
+ print("plotly not found. Install with: pip install plotly")
1185
+ return
1186
+
1187
+ # Get data
1188
+ camera_positions = np.array([cam['camera_center'] for cam in self.cameras])
1189
+ points_np = wp.to_torch(self.params['positions']).cpu().numpy()
1190
+
1191
+ # Get SH coefficients for colors
1192
+ shs_np = wp.to_torch(self.params['shs']).cpu().numpy()
1193
+
1194
+ # Extract base colors from SH coefficients
1195
+ C0 = 0.28209479177387814 # Normalization constant for Y_00
1196
+ point_colors = np.zeros((len(points_np), 3), dtype=np.float32)
1197
+
1198
+ # Get only the DC component (first SH coefficient) for each point
1199
+ for i in range(len(points_np)):
1200
+ sh_dc = shs_np[i * 16] # First SH coefficient for each point
1201
+ rgb = sh_dc * C0 + 0.5
1202
+ point_colors[i] = np.clip(rgb, 0, 1)
1203
+
1204
+ # Sample points for better performance
1205
+ max_points = 5000
1206
+ if len(points_np) > max_points:
1207
+ indices = np.random.choice(len(points_np), max_points, replace=False)
1208
+ points_sample = points_np[indices]
1209
+ colors_sample = point_colors[indices]
1210
+ else:
1211
+ points_sample = points_np
1212
+ colors_sample = point_colors
1213
+
1214
+ # Convert colors to hex format for plotly
1215
+ colors_hex = [f'rgb({int(r*255)},{int(g*255)},{int(b*255)})' for r, g, b in colors_sample]
1216
+
1217
+ # Calculate extents
1218
+ cam_extent = np.max(np.abs(camera_positions))
1219
+ points_extent = np.max(np.abs(points_sample))
1220
+
1221
+ print(f"Camera extent: {cam_extent:.3f}")
1222
+ print(f"Points extent: {points_extent:.3f}")
1223
+ print(f"Scale ratio: {cam_extent/points_extent:.3f}")
1224
+
1225
+ # Create figure
1226
+ fig = make_subplots(
1227
+ rows=1, cols=1,
1228
+ specs=[[{"type": "scene"}]],
1229
+ subplot_titles=['3D Scene with Camera Frustums']
1230
+ )
1231
+
1232
+ # Colors for cameras
1233
+ n_cameras = len(camera_positions)
1234
+ camera_colors = px.colors.qualitative.Bold[:min(n_cameras, 10)]
1235
+ if n_cameras > 10:
1236
+ camera_colors = camera_colors * (n_cameras // 10 + 1)
1237
+
1238
+ # Add point cloud with actual colors
1239
+ fig.add_trace(
1240
+ go.Scatter3d(
1241
+ x=points_sample[:, 0],
1242
+ y=points_sample[:, 1],
1243
+ z=points_sample[:, 2],
1244
+ mode='markers',
1245
+ marker=dict(
1246
+ size=1.5,
1247
+ color=colors_hex,
1248
+ opacity=0.7
1249
+ ),
1250
+ name='Point Cloud',
1251
+ hovertemplate='Point<br>X: %{x:.3f}<br>Y: %{y:.3f}<br>Z: %{z:.3f}<extra></extra>'
1252
+ )
1253
+ )
1254
+
1255
+ # Create camera frustums
1256
+ frustum_scale = cam_extent * 0.2 # Increased frustum size
1257
+
1258
+ for i, (cam, pos) in enumerate(zip(self.cameras, camera_positions)):
1259
+ color = camera_colors[i % len(camera_colors)]
1260
+
1261
+ # Extract camera parameters correctly
1262
+ c2w = cam['camera_to_world']
1263
+
1264
+ # Camera coordinate system in world space
1265
+ right = c2w[:3, 0] # x-axis
1266
+ up = c2w[:3, 1] # y-axis
1267
+
1268
+ # IMPORTANT FIX: FLIP THE DIRECTION for correct frustum orientation
1269
+ # The camera looks along the +Z axis in camera space, which is the 3rd column of c2w
1270
+ forward = c2w[:3, 2] # Use +Z for dust3r/COLMAP convention
1271
+
1272
+ # Debug first few cameras
1273
+ if i < 3:
1274
+ print(f"Camera {i} coordinate system:")
1275
+ print(f" Position: {pos}")
1276
+ print(f" Right: {right}")
1277
+ print(f" Up: {up}")
1278
+ print(f" Forward: {forward}")
1279
+
1280
+ # Calculate frustum parameters
1281
+ aspect_ratio = cam['width'] / cam['height']
1282
+ tan_fov_x = cam['tan_fovx']
1283
+ tan_fov_y = cam['tan_fovy']
1284
+
1285
+ # Frustum corners at near and far planes
1286
+ near_dist = frustum_scale * 0.1
1287
+ far_dist = frustum_scale
1288
+
1289
+ # Near plane corners
1290
+ tl_near = pos + forward * near_dist - right * near_dist * tan_fov_x + up * near_dist * tan_fov_y
1291
+ tr_near = pos + forward * near_dist + right * near_dist * tan_fov_x + up * near_dist * tan_fov_y
1292
+ bl_near = pos + forward * near_dist - right * near_dist * tan_fov_x - up * near_dist * tan_fov_y
1293
+ br_near = pos + forward * near_dist + right * near_dist * tan_fov_x - up * near_dist * tan_fov_y
1294
+
1295
+ # Far plane corners
1296
+ tl_far = pos + forward * far_dist - right * far_dist * tan_fov_x + up * far_dist * tan_fov_y
1297
+ tr_far = pos + forward * far_dist + right * far_dist * tan_fov_x + up * far_dist * tan_fov_y
1298
+ bl_far = pos + forward * far_dist - right * far_dist * tan_fov_x - up * far_dist * tan_fov_y
1299
+ br_far = pos + forward * far_dist + right * far_dist * tan_fov_x - up * far_dist * tan_fov_y
1300
+
1301
+ # Camera position marker
1302
+ fig.add_trace(
1303
+ go.Scatter3d(
1304
+ x=[pos[0]],
1305
+ y=[pos[1]],
1306
+ z=[pos[2]],
1307
+ mode='markers',
1308
+ marker=dict(
1309
+ size=8, # Larger marker
1310
+ color=color,
1311
+ symbol='diamond',
1312
+ ),
1313
+ name=f'Camera {i}',
1314
+ hovertemplate=f'Camera {i}<br>X: %{{x:.3f}}<br>Y: %{{y:.3f}}<br>Z: %{{z:.3f}}<extra></extra>'
1315
+ )
1316
+ )
1317
+
1318
+ # Draw frustum edges
1319
+ lines_x = []
1320
+ lines_y = []
1321
+ lines_z = []
1322
+
1323
+ # Helper to add a line
1324
+ def add_line(p1, p2):
1325
+ lines_x.extend([p1[0], p2[0], None])
1326
+ lines_y.extend([p1[1], p2[1], None])
1327
+ lines_z.extend([p1[2], p2[2], None])
1328
+
1329
+ # Near plane
1330
+ add_line(tl_near, tr_near)
1331
+ add_line(tr_near, br_near)
1332
+ add_line(br_near, bl_near)
1333
+ add_line(bl_near, tl_near)
1334
+
1335
+ # Far plane
1336
+ add_line(tl_far, tr_far)
1337
+ add_line(tr_far, br_far)
1338
+ add_line(br_far, bl_far)
1339
+ add_line(bl_far, tl_far)
1340
+
1341
+ # Connecting edges
1342
+ add_line(tl_near, tl_far)
1343
+ add_line(tr_near, tr_far)
1344
+ add_line(bl_near, bl_far)
1345
+ add_line(br_near, br_far)
1346
+
1347
+ # Camera to near plane corners
1348
+ add_line(pos, tl_near)
1349
+ add_line(pos, tr_near)
1350
+ add_line(pos, bl_near)
1351
+ add_line(pos, br_near)
1352
+
1353
+ # Add frustum lines
1354
+ fig.add_trace(
1355
+ go.Scatter3d(
1356
+ x=lines_x,
1357
+ y=lines_y,
1358
+ z=lines_z,
1359
+ mode='lines',
1360
+ line=dict(
1361
+ color=color,
1362
+ width=2
1363
+ ),
1364
+ name=f'Frustum {i}',
1365
+ showlegend=False,
1366
+ hoverinfo='none'
1367
+ )
1368
+ )
1369
+
1370
+ # Add coordinate system axes with LARGER SIZE
1371
+ axis_length = frustum_scale * 0.15 # Increased from 0.05 to 0.15
1372
+
1373
+ # Right direction (X axis) - Red
1374
+ fig.add_trace(
1375
+ go.Scatter3d(
1376
+ x=[pos[0], pos[0] + right[0] * axis_length],
1377
+ y=[pos[1], pos[1] + right[1] * axis_length],
1378
+ z=[pos[2], pos[2] + right[2] * axis_length],
1379
+ mode='lines',
1380
+ line=dict(color='red', width=4), # Thicker line
1381
+ name='X (Right)' if i == 0 else '',
1382
+ showlegend=i==0,
1383
+ hoverinfo='none'
1384
+ )
1385
+ )
1386
+
1387
+ # Up direction (Y axis) - Green
1388
+ fig.add_trace(
1389
+ go.Scatter3d(
1390
+ x=[pos[0], pos[0] + up[0] * axis_length],
1391
+ y=[pos[1], pos[1] + up[1] * axis_length],
1392
+ z=[pos[2], pos[2] + up[2] * axis_length],
1393
+ mode='lines',
1394
+ line=dict(color='green', width=4), # Thicker line
1395
+ name='Y (Up)' if i == 0 else '',
1396
+ showlegend=i==0,
1397
+ hoverinfo='none'
1398
+ )
1399
+ )
1400
+
1401
+ # Forward direction (Z axis) - Blue
1402
+ fig.add_trace(
1403
+ go.Scatter3d(
1404
+ x=[pos[0], pos[0] + forward[0] * axis_length],
1405
+ y=[pos[1], pos[1] + forward[1] * axis_length],
1406
+ z=[pos[2], pos[2] + forward[2] * axis_length],
1407
+ mode='lines',
1408
+ line=dict(color='blue', width=4), # Thicker line
1409
+ name='Z (Forward)' if i == 0 else '',
1410
+ showlegend=i==0,
1411
+ hoverinfo='none'
1412
+ )
1413
+ )
1414
+
1415
+ # Add cones at the end of each axis for better direction visibility
1416
+ cone_size = axis_length * 0.2 # Size of cone relative to axis length
1417
+
1418
+ for axis_dir, color, axis_name in [
1419
+ (right, 'red', 'X'),
1420
+ (up, 'green', 'Y'),
1421
+ (forward, 'blue', 'Z')
1422
+ ]:
1423
+ # End point of axis
1424
+ end_point = pos + axis_dir * axis_length
1425
+
1426
+ # Add a sphere marker at axis end
1427
+ fig.add_trace(
1428
+ go.Scatter3d(
1429
+ x=[end_point[0]],
1430
+ y=[end_point[1]],
1431
+ z=[end_point[2]],
1432
+ mode='markers',
1433
+ marker=dict(
1434
+ size=6, # Size of endpoint marker
1435
+ color=color,
1436
+ symbol='circle'
1437
+ ),
1438
+ name=f'{axis_name} Axis End' if i == 0 else '',
1439
+ showlegend=False,
1440
+ hoverinfo='none'
1441
+ )
1442
+ )
1443
+
1444
+ # Update layout for better visualization
1445
+ fig.update_layout(
1446
+ title=dict(
1447
+ text=f'Interactive Camera Frustums and Point Cloud<br>'
1448
+ f'<sub>Cameras: {n_cameras}, Points: {len(points_sample)}/{len(points_np)}, '
1449
+ f'Scale ratio: {cam_extent/points_extent:.2f}</sub>',
1450
+ x=0.5
1451
+ ),
1452
+ scene=dict(
1453
+ xaxis_title='X',
1454
+ yaxis_title='Y',
1455
+ zaxis_title='Z',
1456
+ aspectmode='data', # 'cube' or 'data'
1457
+ camera=dict(
1458
+ eye=dict(x=1.8, y=1.8, z=1.8) # Adjusted default view
1459
+ ),
1460
+ annotations=[
1461
+ dict(
1462
+ showarrow=False,
1463
+ x=0.05,
1464
+ y=0.05,
1465
+ z=0.05,
1466
+ text="Camera axes:<br>Red: X (right)<br>Green: Y (up)<br>Blue: Z (forward)",
1467
+ xanchor="left",
1468
+ xshift=10,
1469
+ opacity=0.8,
1470
+ font=dict(size=14)
1471
+ )
1472
+ ]
1473
+ ),
1474
+ height=900,
1475
+ width=1000,
1476
+ margin=dict(l=0, r=0, t=50, b=0)
1477
+ )
1478
+
1479
+ # Add axis legend
1480
+ for color, name in [('red', 'X (Right)'), ('green', 'Y (Up)'), ('blue', 'Z (Forward)')]:
1481
+ fig.add_trace(
1482
+ go.Scatter3d(
1483
+ x=[None], y=[None], z=[None],
1484
+ mode='lines',
1485
+ line=dict(color=color, width=6),
1486
+ name=name,
1487
+ showlegend=True
1488
+ )
1489
+ )
1490
+
1491
+ # Save interactive HTML
1492
+ html_path = self.output_path / 'camera_frustums_visualization.html'
1493
+ fig.write_html(str(html_path))
1494
+ print(f"Interactive visualization saved to: {html_path}")
1495
+
1496
+ # Show in browser if possible
1497
+ try:
1498
+ fig.show()
1499
+ except Exception as e:
1500
+ print(f"Could not display in browser: {e}")
1501
+ print(f"Open {html_path} in your browser to view the interactive plot")
1502
+
1503
+ return fig
1504
+ def debug_camera_and_points_alignment(self):
1505
+ """Enhanced debug function with scale analysis"""
1506
+ # Get camera positions
1507
+ camera_positions = np.array([cam['camera_center'] for cam in self.cameras])
1508
+
1509
+ # Get point cloud positions
1510
+ points_np = wp.to_torch(self.params['positions']).cpu().numpy()
1511
+
1512
+ # Calculate detailed statistics
1513
+ cam_stats = {
1514
+ 'min': np.min(camera_positions, axis=0),
1515
+ 'max': np.max(camera_positions, axis=0),
1516
+ 'mean': np.mean(camera_positions, axis=0),
1517
+ 'extent': np.max(np.abs(camera_positions))
1518
+ }
1519
+
1520
+ points_stats = {
1521
+ 'min': np.min(points_np, axis=0),
1522
+ 'max': np.max(points_np, axis=0),
1523
+ 'mean': np.mean(points_np, axis=0),
1524
+ 'extent': np.max(np.abs(points_np))
1525
+ }
1526
+
1527
+ print("=== ALIGNMENT DEBUG ===")
1528
+ print(f"Cameras ({len(camera_positions)}):")
1529
+ print(f" Min: [{cam_stats['min'][0]:8.3f}, {cam_stats['min'][1]:8.3f}, {cam_stats['min'][2]:8.3f}]")
1530
+ print(f" Max: [{cam_stats['max'][0]:8.3f}, {cam_stats['max'][1]:8.3f}, {cam_stats['max'][2]:8.3f}]")
1531
+ print(f" Mean:[{cam_stats['mean'][0]:8.3f}, {cam_stats['mean'][1]:8.3f}, {cam_stats['mean'][2]:8.3f}]")
1532
+ print(f" Extent: {cam_stats['extent']:.3f}")
1533
+
1534
+ print(f"\nPoints ({len(points_np)}):")
1535
+ print(f" Min: [{points_stats['min'][0]:8.3f}, {points_stats['min'][1]:8.3f}, {points_stats['min'][2]:8.3f}]")
1536
+ print(f" Max: [{points_stats['max'][0]:8.3f}, {points_stats['max'][1]:8.3f}, {points_stats['max'][2]:8.3f}]")
1537
+ print(f" Mean:[{points_stats['mean'][0]:8.3f}, {points_stats['mean'][1]:8.3f}, {points_stats['mean'][2]:8.3f}]")
1538
+ print(f" Extent: {points_stats['extent']:.3f}")
1539
+
1540
+ # Scale analysis
1541
+ scale_ratio = cam_stats['extent'] / points_stats['extent'] if points_stats['extent'] > 0 else float('inf')
1542
+ print(f"\nScale Analysis:")
1543
+ print(f" Scale ratio (cam/points): {scale_ratio:.3f}")
1544
+
1545
+ if scale_ratio > 10:
1546
+ print(" ⚠️ WARNING: Cameras much larger than points - may need to scale points up")
1547
+ print(f" Suggested point scale factor: {scale_ratio/10:.3f}")
1548
+ elif scale_ratio < 0.1:
1549
+ print(" ⚠️ WARNING: Points much larger than cameras - may need to scale points down")
1550
+ print(f" Suggested point scale factor: {scale_ratio*10:.3f}")
1551
+ else:
1552
+ print(" ✅ Scale ratio looks reasonable")
1553
+
1554
+ # Distance analysis
1555
+ center_distance = np.linalg.norm(cam_stats['mean'] - points_stats['mean'])
1556
+ print(f"\nCenter separation: {center_distance:.3f}")
1557
+ if center_distance > max(cam_stats['extent'], points_stats['extent']):
1558
+ print(" ⚠️ WARNING: Camera and point centers are far apart - possible coordinate system issue")
1559
+
1560
+ return cam_stats, points_stats
1561
def main():
    """CLI entry point: parse arguments, run alignment diagnostics and the
    interactive visualization, then train the 3DGS model."""
    arg_parser = argparse.ArgumentParser(description="Train 3D Gaussian Splatting model with Colmap")
    arg_parser.add_argument("--dataset", type=str, default="./data_/scenes/steak_is",
                            help="Path to NeRF dataset directory (default: Lego dataset)")
    arg_parser.add_argument("--output", type=str, default="./output/steak_is", help="Output directory")
    cli_args = arg_parser.parse_args()

    # Create trainer and start training
    trainer = NeRFGaussianSplattingTrainer(
        dataset_path=cli_args.dataset,
        output_path=cli_args.output,
    )

    # Sanity-check camera/point alignment before spending time training
    trainer.debug_camera_and_points_alignment()

    # Create interactive visualization
    trainer.visualize_camera_points_alignment_interactive()

    # Start training
    trainer.train()


if __name__ == "__main__":
    main()
gs/train_vdpm.py ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train 3D Gaussian Splatting from VDPM Output
3
+
4
+ Loads VDPM reconstruction (tracks.npz, poses.npz, images/) and trains 3DGS.
5
+ Supports per-frame training or combined multi-timestep training.
6
+
7
+ Usage (from 4dgs-dpm root):
8
+ python -m gs.train_vdpm --input ./vdpm/input_images_XXXX --output ./output/vdpm_scene
9
+ python -m gs.train_vdpm --input ./vdpm/input_images_XXXX --output ./output --frame 0
10
+
11
+ Or directly:
12
+ cd gs
13
+ python train_vdpm.py --input ../vdpm/input_images_XXXX --output ./output
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import json
19
+ import argparse
20
+ import numpy as np
21
+ from pathlib import Path
22
+ from scipy.spatial import cKDTree
23
+ import imageio
24
+ import torch
25
+ import matplotlib.pyplot as plt
26
+
27
+ # Ensure gs/ modules are importable when running from root
28
+ _gs_dir = Path(__file__).parent.resolve()
29
+ if str(_gs_dir) not in sys.path:
30
+ sys.path.insert(0, str(_gs_dir))
31
+
32
+ import warp as wp
33
+ from tqdm import tqdm
34
+
35
+ from forward import render_gaussians
36
+ from backward import backward
37
+ from optimizer import adam_update, prune_gaussians
38
+ from config import GaussianParams, DEVICE
39
+ from utils.point_cloud_utils import save_ply
40
+ from utils.math_utils import world_to_view, projection_matrix
41
+ from loss import l1_loss, compute_image_gradients
42
+
43
+ wp.init()
44
+
45
+
46
def decode_poses(pose_enc: np.ndarray, image_hw: tuple) -> tuple:
    """
    Decode VGGT pose encodings to extrinsic and intrinsic matrices.

    Falls back to identity extrinsics and a plausible pinhole intrinsic
    when the optional `vggt` package is not installed.

    Args:
        pose_enc: (1, N, 9) pose encoding from VDPM
        image_hw: (H, W) image dimensions

    Returns:
        extrinsics: (N, 4, 4) world-to-camera matrices
        intrinsics: (N, 3, 3) camera intrinsic matrices
    """
    # Keep the try-block minimal: only the optional import should trigger
    # the fallback. Previously the whole decode was inside the try, so an
    # ImportError raised *inside* vggt's decoding would silently produce
    # identity poses instead of surfacing.
    try:
        from vggt.utils.pose_enc import pose_encoding_to_extri_intri
    except ImportError:
        print("Warning: vggt not available. Using identity poses.")
        N = pose_enc.shape[1]
        extrinsics = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))

        H, W = image_hw
        fx = fy = max(H, W)  # rough focal guess: the longer image side
        cx, cy = W / 2, H / 2
        intrinsic = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
        intrinsics = np.tile(intrinsic, (N, 1, 1))

        return extrinsics, intrinsics

    pose_enc_t = torch.from_numpy(pose_enc).float()
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc_t, image_hw)

    # extrinsic is (1, N, 3, 4) camera-from-world
    extrinsic = extrinsic[0].numpy()    # (N, 3, 4)
    intrinsics = intrinsic[0].numpy()   # (N, 3, 3)

    # Append the homogeneous [0, 0, 0, 1] row to each 3x4 extrinsic
    N = extrinsic.shape[0]
    bottom = np.array([0, 0, 0, 1], dtype=np.float32).reshape(1, 1, 4)
    bottom = np.tile(bottom, (N, 1, 1))
    extrinsics_4x4 = np.concatenate([extrinsic, bottom], axis=1)  # (N, 4, 4)

    return extrinsics_4x4, intrinsics
88
+
89
+
90
def load_vdpm_data(input_path: str) -> dict:
    """Load all VDPM outputs from a directory.

    Expects `tracks.npz` (or `output_4d.npz`), an `images/` directory of
    PNG/JPG frames, and optionally `poses.npz` and `meta.json`.

    Args:
        input_path: directory containing the VDPM reconstruction.

    Returns:
        dict with keys:
            world_points: (T, V, H, W, 3) float array
            world_points_conf: (T, V, H, W) confidence per point
            pose_enc: (1, N, 9) pose encodings, or None if absent
            images: (N, H, W, 3) float32 images in [0, 1]
            num_views, num_timesteps, T, V, H, W, meta

    Raises:
        FileNotFoundError: if no point file or no images are found.
    """
    input_path = Path(input_path)

    # Load tracks/points
    tracks_path = input_path / "tracks.npz"
    output_path = input_path / "output_4d.npz"

    if tracks_path.exists():
        data = np.load(tracks_path)
    elif output_path.exists():
        data = np.load(output_path)
    else:
        raise FileNotFoundError(f"No tracks.npz or output_4d.npz in {input_path}")

    world_points = data['world_points']
    world_points_conf = data['world_points_conf']
    num_views = int(data.get('num_views', 1))
    num_timesteps = int(data.get('num_timesteps', world_points.shape[0]))

    # Normalize to the multi-view layout (T, V, H, W, ...)
    if world_points.ndim == 5:
        T, V, H, W, _ = world_points.shape
        print(f"Multi-view: {T} timesteps × {V} views × {H}×{W}")
    else:
        T, H, W, _ = world_points.shape
        V = 1
        world_points = world_points[:, np.newaxis, :, :, :]
        world_points_conf = world_points_conf[:, np.newaxis, :, :]
        print(f"Single-view: {T} timesteps × {H}×{W}")

    # Load camera pose encodings (optional)
    poses_path = input_path / "poses.npz"
    pose_enc = None
    if poses_path.exists():
        pose_data = np.load(poses_path)
        pose_enc = pose_data.get('pose_enc')
        print(f"Loaded poses: {pose_enc.shape if pose_enc is not None else 'None'}")

    # Load images; grayscale is expanded to RGB, alpha is dropped
    images_dir = input_path / "images"
    image_paths = sorted(images_dir.glob("*.png")) + sorted(images_dir.glob("*.jpg"))
    if not image_paths:
        # Fail early with a clear message; np.stack on an empty list would
        # otherwise raise an opaque ValueError below.
        raise FileNotFoundError(f"No .png/.jpg images found in {images_dir}")
    images = []
    for img_path in image_paths:
        img = imageio.imread(img_path)
        if img.ndim == 2:
            img = np.stack([img, img, img], axis=-1)
        elif img.shape[-1] == 4:
            img = img[..., :3]
        images.append(img.astype(np.float32) / 255.0)
    images = np.stack(images, axis=0)  # (N, H, W, 3)
    print(f"Loaded {len(images)} images, shape: {images.shape}")

    # Load metadata (optional)
    meta_path = input_path / "meta.json"
    meta = {}
    if meta_path.exists():
        with open(meta_path) as f:
            meta = json.load(f)

    return {
        'world_points': world_points,            # (T, V, H, W, 3)
        'world_points_conf': world_points_conf,  # (T, V, H, W)
        'pose_enc': pose_enc,
        'images': images,                        # (N, H, W, 3)
        'num_views': num_views,
        'num_timesteps': num_timesteps,
        'T': T, 'V': V, 'H': H, 'W': W,
        'meta': meta,
    }
160
+
161
+
162
def extract_frame_pointcloud(data: dict, frame_idx: int, conf_threshold: float = 50.0):
    """
    Extract point cloud and colors for a specific frame.

    Args:
        data: dict produced by load_vdpm_data().
        frame_idx: timestep index into the (T, V, H, W, ...) arrays.
        conf_threshold: percentile (0-100) of the per-frame confidence
            distribution used as the keep threshold; 0 keeps every
            point with non-negligible confidence.

    Returns:
        positions: (N, 3) XYZ
        colors: (N, 3) RGB [0,1]
        confidence: (N,) confidence scores
    """
    V, H, W = data['V'], data['H'], data['W']

    # Points for this frame, all views merged
    pts = data['world_points'][frame_idx]        # (V, H, W, 3)
    conf = data['world_points_conf'][frame_idx]  # (V, H, W)

    # Flatten across views and pixels
    pts_flat = pts.reshape(-1, 3)   # (V*H*W, 3)
    conf_flat = conf.reshape(-1)    # (V*H*W,)

    # Colors come from the input frames. Images are interleaved:
    # [cam0_t0, cam1_t0, cam0_t1, cam1_t1, ...]
    start_idx = frame_idx * V
    frame_images = data['images'][start_idx:start_idx + V]  # (V, H_img, W_img, 3)

    # Resize images to the point-map resolution if they differ
    img_H, img_W = frame_images.shape[1:3]
    if img_H != H or img_W != W:
        from scipy.ndimage import zoom
        scale_h = H / img_H
        scale_w = W / img_W
        frame_images = np.stack(
            [zoom(frame_images[v], (scale_h, scale_w, 1), order=1) for v in range(V)],
            axis=0,
        )

    colors_flat = frame_images.reshape(-1, 3)  # (V*H*W, 3)

    # Keep points above the requested confidence percentile; always drop
    # near-zero confidence.
    if conf_threshold > 0:
        thresh = np.percentile(conf_flat, conf_threshold)
        mask = (conf_flat >= thresh) & (conf_flat > 1e-5)
    else:
        mask = conf_flat > 1e-5

    # Also drop NaN/Inf positions
    mask &= np.isfinite(pts_flat).all(axis=1)

    return pts_flat[mask], colors_flat[mask], conf_flat[mask]
213
+
214
+
215
def build_cameras(data: dict, frame_idx: int) -> list:
    """
    Build camera dictionaries for training from VDPM data.

    One dict is produced per view of the requested timestep, containing
    the (transposed, Warp/OpenGL-convention) view and projection matrices
    plus intrinsics-derived fields.

    Args:
        data: dict produced by load_vdpm_data().
        frame_idx: timestep index; image index is frame_idx * V + v.

    Returns:
        list of camera dicts compatible with 3DGS training.
    """
    T, V, H, W = data['T'], data['V'], data['H'], data['W']
    images = data['images']
    img_H, img_W = images.shape[1:3]

    # Decode poses
    pose_enc = data.get('pose_enc')
    if pose_enc is not None:
        extrinsics, intrinsics = decode_poses(pose_enc, (img_H, img_W))
    else:
        # Fallback: identity poses with reasonable intrinsics
        N = T * V
        extrinsics = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))
        fx = fy = max(img_H, img_W)
        cx, cy = img_W / 2, img_H / 2
        K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
        intrinsics = np.tile(K, (N, 1, 1))

    cameras = []

    # Get camera indices for this frame
    for v in range(V):
        img_idx = frame_idx * V + v

        if img_idx >= len(extrinsics):
            continue

        extrinsic = extrinsics[img_idx]  # (4, 4) camera-from-world
        intrinsic = intrinsics[img_idx]  # (3, 3)

        # Extract components
        R = extrinsic[:3, :3]
        t = extrinsic[:3, 3]

        fx, fy = intrinsic[0, 0], intrinsic[1, 1]
        cx, cy = intrinsic[0, 2], intrinsic[1, 2]

        # Camera center in world coords: C = -R^T t
        camera_center = -R.T @ t

        # FOV from intrinsics
        fov_x = 2 * np.arctan(img_W / (2 * fx))
        fov_y = 2 * np.arctan(img_H / (2 * fy))

        # Build matrices exactly like render.py does for Warp/OpenGL compatibility
        # Warp uses column-major (OpenGL convention), so matrices must be transposed
        world_to_camera = np.eye(4, dtype=np.float32)
        world_to_camera[:3, :3] = R
        world_to_camera[:3, 3] = t
        world_to_camera = world_to_camera.T  # Transpose for Warp/OpenGL!

        # Projection matrix (transposed for Warp/OpenGL)
        near, far = 0.01, 100.0
        proj_matrix = projection_matrix(fovx=fov_x, fovy=fov_y, znear=near, zfar=far).T

        # Full projection = view @ proj (row-vector convention on the
        # transposed matrices)
        full_proj_matrix = world_to_camera @ proj_matrix

        cameras.append({
            'camera_id': img_idx,
            'width': img_W,
            'height': img_H,
            'world_to_camera': world_to_camera,  # Transposed for Warp/OpenGL
            # NOTE(review): inverse of the *transposed* w2c, i.e. (c2w)^T —
            # consumers indexing columns of this matrix should confirm the
            # convention matches.
            'camera_to_world': np.linalg.inv(world_to_camera),
            'camera_center': camera_center,
            'full_proj_matrix': full_proj_matrix,
            'tan_fovx': np.tan(fov_x / 2),
            'tan_fovy': np.tan(fov_y / 2),
            'fx': fx, 'fy': fy,
            'cx': cx, 'cy': cy,
        })

    return cameras
293
+
294
+
295
# Kernel: initialize each Gaussian with an identity quaternion and a 0.5
# opacity. One thread per Gaussian; the num_points guard protects against
# over-provisioned launch dimensions.
@wp.kernel
def init_rotations_opacities(
    rotations: wp.array(dtype=wp.vec4),
    opacities: wp.array(dtype=float),
    num_points: int
):
    i = wp.tid()
    if i >= num_points:
        return
    rotations[i] = wp.vec4(1.0, 0.0, 0.0, 0.0)  # identity rotation quaternion
    opacities[i] = 0.5
306
+
307
+
308
@wp.kernel
def zero_gradients(
    pos_grad: wp.array(dtype=wp.vec3),
    scale_grad: wp.array(dtype=wp.vec3),
    rot_grad: wp.array(dtype=wp.vec4),
    opacity_grad: wp.array(dtype=float),
    sh_grad: wp.array(dtype=wp.vec3),
    num_points: int
):
    # One thread per Gaussian: clear every per-parameter gradient buffer
    # so the next backward pass starts from zero.
    i = wp.tid()
    if i >= num_points:
        return

    pos_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    scale_grad[i] = wp.vec3(0.0, 0.0, 0.0)
    rot_grad[i] = wp.vec4(0.0, 0.0, 0.0, 0.0)
    opacity_grad[i] = 0.0

    # sh_grad is flattened as 16 SH coefficient vectors per Gaussian
    # (same N*16 layout used for the 'shs' parameter array).
    for j in range(16):
        idx = i * 16 + j
        sh_grad[idx] = wp.vec3(0.0, 0.0, 0.0)
329
+
330
+
331
class VDPM3DGSTrainer:
    """Train 3DGS from VDPM point cloud initialization.

    Owns the per-Gaussian parameter arrays (positions, scales, rotations,
    opacities, SH coefficients) as Warp arrays, plus matching gradient and
    Adam first/second-moment buffers, and runs a render -> backward ->
    Adam-update loop against the V views of a single VDPM timestep.
    """

    def __init__(self, data: dict, frame_idx: int, output_path: str, conf_threshold: float = 50.0):
        """Set up output directories, extract the frame's point cloud,
        build cameras, and initialize parameters and optimizer state.

        Args:
            data: VDPM data dict. Must provide 'V' (views per frame) and
                'images' (flattened so image index = frame_idx * V + view),
                plus whatever extract_frame_pointcloud / build_cameras read.
            frame_idx: Index of the timestep to train.
            output_path: Root output directory (created if missing).
            conf_threshold: Confidence percentile used to filter points.
        """
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)

        # Create output directories for renders
        self.render_dir = self.output_path / f"frame_{frame_idx}" / "renders"
        self.render_dir.mkdir(parents=True, exist_ok=True)

        self.config = GaussianParams.get_config_dict()
        self.frame_idx = frame_idx

        # Extract point cloud for this frame
        print(f"Extracting point cloud for frame {frame_idx}...")
        # NOTE(review): `confidence` is returned but never used here.
        positions, colors, confidence = extract_frame_pointcloud(data, frame_idx, conf_threshold)
        self.num_points = len(positions)
        print(f"Got {self.num_points} points")

        # Build cameras
        self.cameras = build_cameras(data, frame_idx)
        print(f"Built {len(self.cameras)} cameras")

        # Store the V target images belonging to this frame.
        V = data['V']
        start_idx = frame_idx * V
        end_idx = start_idx + V
        self.images = data['images'][start_idx:end_idx]

        # Initialize Gaussian parameters, gradients, and Adam moments
        # (grads / adam_m / adam_v share the same array layout as params).
        self.params = self._init_params(positions, colors)
        self.grads = self._create_grad_arrays()
        self.adam_m = self._create_grad_arrays()
        self.adam_v = self._create_grad_arrays()

        self.losses = []
        self.intermediate_buffers = {}

    def _init_params(self, positions: np.ndarray, colors: np.ndarray):
        """Initialize Gaussian parameters from point cloud."""
        N = self.num_points

        # Positions
        positions_wp = wp.array(positions.astype(np.float32), dtype=wp.vec3, device=DEVICE)

        # Scales from KNN: mean distance to the 3 nearest neighbors
        # (query k=4 because the first hit is the point itself).
        if N > 3:
            tree = cKDTree(positions)
            dists, _ = tree.query(positions, k=4)
            avg_dist = np.mean(dists[:, 1:], axis=1)
            scales_np = np.clip(avg_dist, 0.001, 1.0)[:, np.newaxis] * np.ones((N, 3))
        else:
            scales_np = np.full((N, 3), 0.01, dtype=np.float32)
        scales_wp = wp.array(scales_np.astype(np.float32), dtype=wp.vec3, device=DEVICE)

        # Rotations and opacities, filled on-device by the init kernel.
        rotations_wp = wp.zeros(N, dtype=wp.vec4, device=DEVICE)
        opacities_wp = wp.zeros(N, dtype=float, device=DEVICE)
        wp.launch(init_rotations_opacities, dim=N, inputs=[rotations_wp, opacities_wp, N])

        # SH coefficients from colors: only the DC term (every 16th entry)
        # is set; C0 is the zeroth-order spherical-harmonic constant.
        C0 = 0.28209479177387814
        shs_np = np.zeros((N * 16, 3), dtype=np.float32)
        shs_np[::16] = (colors - 0.5) / C0
        shs_wp = wp.array(shs_np, dtype=wp.vec3, device=DEVICE)

        return {
            'positions': positions_wp,
            'scales': scales_wp,
            'rotations': rotations_wp,
            'opacities': opacities_wp,
            'shs': shs_wp,
        }

    def _create_grad_arrays(self):
        """Allocate a zeroed set of arrays mirroring the parameter layout
        (used for gradients and for the two Adam moment buffers)."""
        N = self.num_points
        return {
            'positions': wp.zeros(N, dtype=wp.vec3, device=DEVICE),
            'scales': wp.zeros(N, dtype=wp.vec3, device=DEVICE),
            'rotations': wp.zeros(N, dtype=wp.vec4, device=DEVICE),
            'opacities': wp.zeros(N, dtype=float, device=DEVICE),
            'shs': wp.zeros(N * 16, dtype=wp.vec3, device=DEVICE),
        }

    def zero_grad(self):
        """Reset all gradient buffers to zero on-device."""
        wp.launch(zero_gradients, dim=self.num_points, inputs=[
            self.grads['positions'], self.grads['scales'],
            self.grads['rotations'], self.grads['opacities'],
            self.grads['shs'], self.num_points
        ])

    def train(self, num_iterations: int = 3000):
        """Train the 3DGS model.

        Each iteration: pick a random camera, render, compute L1 loss and
        pixel gradients, run the backward pass, then apply one Adam step
        with a decaying learning rate. Checkpoints every 500 iterations.
        """
        print(f"Training for {num_iterations} iterations...")

        # Save iteration 0 (initial state before training)
        print("Saving initial state (iteration 0)...")
        self.save(0)

        with tqdm(total=num_iterations) as pbar:
            for it in range(num_iterations):
                self.zero_grad()

                # Pick a random camera
                cam_idx = np.random.randint(len(self.cameras))
                camera = self.cameras[cam_idx]
                target = self.images[cam_idx]

                # Render
                rendered, depth, self.intermediate_buffers = render_gaussians(
                    background=np.array(self.config['background_color'], dtype=np.float32),
                    means3D=self.params['positions'].numpy(),
                    colors=None,
                    opacity=self.params['opacities'].numpy(),
                    scales=self.params['scales'].numpy(),
                    rotations=self.params['rotations'].numpy(),
                    scale_modifier=1.0,
                    viewmatrix=camera['world_to_camera'],
                    projmatrix=camera['full_proj_matrix'],
                    tan_fovx=camera['tan_fovx'],
                    tan_fovy=camera['tan_fovy'],
                    image_height=camera['height'],
                    image_width=camera['width'],
                    sh=self.params['shs'].numpy(),
                    degree=3,
                    campos=camera['camera_center'],
                    prefiltered=False,
                    antialiasing=True,
                )

                # Compute loss
                # NOTE(review): rendered_np is computed here but unused —
                # l1_loss consumes `rendered` directly. Candidate for removal.
                rendered_np = wp.to_torch(rendered).cpu().numpy()
                if rendered_np.shape[0] == 3:
                    rendered_np = np.transpose(rendered_np, (1, 2, 0))

                target_wp = wp.array(target.astype(np.float32), dtype=wp.vec3, device=DEVICE)
                loss = l1_loss(rendered, target_wp)
                self.losses.append(loss)

                # Compute pixel gradients for backward pass
                pixel_grad_buffer = compute_image_gradients(
                    rendered, target_wp, lambda_dssim=0
                )

                # Prepare camera parameters
                view_matrix = wp.mat44(camera['world_to_camera'].flatten())
                proj_matrix = wp.mat44(camera['full_proj_matrix'].flatten())
                campos = wp.vec3(camera['camera_center'][0], camera['camera_center'][1], camera['camera_center'][2])

                # Prepare buffers for backward pass (repackage the forward
                # pass intermediates under the names backward() expects).
                geom_buffer = {
                    'radii': self.intermediate_buffers['radii'],
                    'means2D': self.intermediate_buffers['points_xy_image'],
                    'conic_opacity': self.intermediate_buffers['conic_opacity'],
                    'rgb': self.intermediate_buffers['colors'],
                    'clamped': self.intermediate_buffers['clamped_state']
                }

                binning_buffer = {
                    'point_list': self.intermediate_buffers['point_list']
                }

                img_buffer = {
                    'ranges': self.intermediate_buffers['ranges'],
                    'final_Ts': self.intermediate_buffers['final_Ts'],
                    'n_contrib': self.intermediate_buffers['n_contrib']
                }

                # Backward pass
                gradients = backward(
                    # Core parameters
                    background=np.array(self.config['background_color'], dtype=np.float32),
                    means3D=self.params['positions'],
                    dL_dpixels=pixel_grad_buffer,

                    # Model parameters
                    opacity=self.params['opacities'],
                    shs=self.params['shs'],
                    scales=self.params['scales'],
                    rotations=self.params['rotations'],
                    scale_modifier=self.config['scale_modifier'],

                    # Camera parameters
                    viewmatrix=view_matrix,
                    projmatrix=proj_matrix,
                    tan_fovx=camera['tan_fovx'],
                    tan_fovy=camera['tan_fovy'],
                    image_height=camera['height'],
                    image_width=camera['width'],
                    campos=campos,

                    # Forward output buffers
                    radii=self.intermediate_buffers['radii'],
                    means2D=self.intermediate_buffers['points_xy_image'],
                    conic_opacity=self.intermediate_buffers['conic_opacity'],
                    rgb=self.intermediate_buffers['colors'],
                    cov3Ds=self.intermediate_buffers['cov3Ds'],
                    clamped=self.intermediate_buffers['clamped_state'],

                    # Internal state buffers
                    geom_buffer=geom_buffer,
                    binning_buffer=binning_buffer,
                    img_buffer=img_buffer,

                    # Algorithm parameters
                    degree=self.config['sh_degree'],
                    debug=False
                )

                # Copy gradients to optimizer buffers
                wp.copy(self.grads['positions'], gradients['dL_dmean3D'])
                wp.copy(self.grads['scales'], gradients['dL_dscale'])
                wp.copy(self.grads['rotations'], gradients['dL_drot'])
                wp.copy(self.grads['opacities'], gradients['dL_dopacity'])
                wp.copy(self.grads['shs'], gradients['dL_dshs'])

                # Optimizer step: exponentially decaying base LR (0.001
                # down to 0.0001 over the run), with per-parameter scaling
                # in the adam_update argument list below.
                lr = 0.001 * (0.1 ** (it / num_iterations))
                wp.launch(adam_update, dim=self.num_points, inputs=[
                    self.params['positions'], self.params['scales'],
                    self.params['rotations'], self.params['opacities'], self.params['shs'],
                    self.grads['positions'], self.grads['scales'],
                    self.grads['rotations'], self.grads['opacities'], self.grads['shs'],
                    self.adam_m['positions'], self.adam_m['scales'],
                    self.adam_m['rotations'], self.adam_m['opacities'], self.adam_m['shs'],
                    self.adam_v['positions'], self.adam_v['scales'],
                    self.adam_v['rotations'], self.adam_v['opacities'], self.adam_v['shs'],
                    self.num_points, lr, lr*5, lr*5, lr*2, lr*5,
                    0.9, 0.999, 1e-8, it
                ])

                pbar.set_postfix(loss=f"{loss:.4f}")
                pbar.update(1)

                # Save checkpoint
                if (it + 1) % 500 == 0 or it == num_iterations - 1:
                    self.save(it + 1)

        print("Training complete!")

    def save(self, iteration: int):
        """Save checkpoint with rendered images.

        Writes a PLY of the current Gaussians, per-camera render/target/
        comparison PNGs, and a loss plot into
        output/frame_<idx>/iter_<iteration>/.
        """
        ckpt_dir = self.output_path / f"frame_{self.frame_idx}" / f"iter_{iteration}"
        ckpt_dir.mkdir(parents=True, exist_ok=True)

        # Save PLY
        save_ply(self.params, ckpt_dir / "point_cloud.ply", self.num_points)

        # Render and save images for all cameras
        for cam_idx, camera in enumerate(self.cameras):
            target = self.images[cam_idx]

            # NOTE(review): depth is unused here.
            rendered, depth, _ = render_gaussians(
                background=np.array(self.config['background_color'], dtype=np.float32),
                means3D=self.params['positions'].numpy(),
                colors=None,
                opacity=self.params['opacities'].numpy(),
                scales=self.params['scales'].numpy(),
                rotations=self.params['rotations'].numpy(),
                scale_modifier=1.0,
                viewmatrix=camera['world_to_camera'],
                projmatrix=camera['full_proj_matrix'],
                tan_fovx=camera['tan_fovx'],
                tan_fovy=camera['tan_fovy'],
                image_height=camera['height'],
                image_width=camera['width'],
                sh=self.params['shs'].numpy(),
                degree=3,
                campos=camera['camera_center'],
                prefiltered=False,
                antialiasing=True,
            )

            # Convert rendered to numpy; transpose CHW -> HWC if needed.
            rendered_np = wp.to_torch(rendered).cpu().numpy()
            if rendered_np.shape[0] == 3:
                rendered_np = np.transpose(rendered_np, (1, 2, 0))

            # Save rendered image
            rendered_uint8 = (np.clip(rendered_np, 0, 1) * 255).astype(np.uint8)
            imageio.imwrite(ckpt_dir / f"render_cam{cam_idx}.png", rendered_uint8)

            # Save target image (assumed float in [0, 1] — same convention
            # as the render; TODO confirm against the loader)
            target_uint8 = (np.clip(target, 0, 1) * 255).astype(np.uint8)
            imageio.imwrite(ckpt_dir / f"target_cam{cam_idx}.png", target_uint8)

            # Save side-by-side comparison
            comparison = np.concatenate([target_uint8, rendered_uint8], axis=1)
            imageio.imwrite(ckpt_dir / f"compare_cam{cam_idx}.png", comparison)

        # Save loss plot
        if len(self.losses) > 0:
            plt.figure(figsize=(10, 5))
            plt.plot(self.losses)
            plt.title(f'Training Loss - Frame {self.frame_idx}')
            plt.xlabel('Iteration')
            plt.ylabel('L1 Loss')
            plt.grid(True)
            plt.savefig(ckpt_dir / "loss_plot.png")
            plt.close()

        print(f"Saved checkpoint to {ckpt_dir}")

    def save_final(self):
        """Save final PLY to flat output structure for easy loading.

        Returns the pathlib.Path of the written frame_XXXX.ply.
        """
        final_path = self.output_path / f"frame_{self.frame_idx:04d}.ply"
        save_ply(self.params, final_path, self.num_points)
        return final_path
640
+
641
+
642
def train_single_frame(data: dict, frame_idx: int, output_path: str,
                       conf_threshold: float, iterations: int) -> Path:
    """Train a single frame and return the output PLY path.

    Args:
        data: Loaded VDPM data dict (as produced by load_vdpm_data).
        frame_idx: Timestep index to train.
        output_path: Root output directory for checkpoints and the final PLY.
        conf_threshold: Confidence percentile for point filtering.
        iterations: Number of optimization iterations.

    Returns:
        Path to the flat ``frame_XXXX.ply`` written by ``save_final()``.
        (The annotation previously claimed ``str``, but ``save_final``
        returns a ``pathlib.Path``.)
    """
    trainer = VDPM3DGSTrainer(
        data=data,
        frame_idx=frame_idx,
        output_path=output_path,
        conf_threshold=conf_threshold,
    )
    trainer.train(num_iterations=iterations)
    return trainer.save_final()
653
+
654
+
655
def main():
    """CLI entry point: train 3DGS for one frame or for every frame."""
    parser = argparse.ArgumentParser(description="Train 3DGS from VDPM output")
    parser.add_argument("--input", "-i", required=True, help="Path to VDPM output directory")
    parser.add_argument("--output", "-o", required=True, help="Output directory")
    parser.add_argument("--frame", "-f", type=int, default=None,
                        help="Single frame index to train (default: train ALL frames)")
    parser.add_argument("--conf", type=float, default=50.0, help="Confidence threshold percentile")
    parser.add_argument("--iterations", "-n", type=int, default=3000, help="Training iterations per frame")

    args = parser.parse_args()

    # Load data
    print(f"Loading VDPM data from {args.input}...")
    data = load_vdpm_data(args.input)

    total_frames = data['T']
    print(f"Found {total_frames} timesteps in data")

    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    rule = "=" * 60

    if args.frame is None:
        # No frame given: sweep every timestep in order.
        print(f"\n{rule}")
        print(f"Training ALL {total_frames} frames")
        print(f"Output: {out_dir}/frame_XXXX.ply")
        print(rule)

        results = []
        for idx in range(total_frames):
            print(f"\n[Frame {idx+1}/{total_frames}]")
            results.append(
                train_single_frame(data, idx, args.output, args.conf, args.iterations)
            )

        print(f"\n{rule}")
        print(f"✓ Training complete! Generated {len(results)} PLY files:")
        for ply in results:
            print(f"  {ply}")
        print(rule)
    else:
        # Train only the requested frame, after bounds-checking it.
        if args.frame >= total_frames:
            raise ValueError(f"Frame {args.frame} out of range (0-{total_frames-1})")

        print(f"\n{rule}")
        print(f"Training frame {args.frame}/{total_frames-1}")
        print(rule)

        result = train_single_frame(
            data, args.frame, args.output, args.conf, args.iterations
        )
        print(f"\n✓ Saved: {result}")


if __name__ == "__main__":
    main()
gs/training_progress.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:052cff51cf3d320a299faea9c795fad53349ed193b8bc222ee37860afd7def99
3
+ size 306145
gs/utils/analyze_scales.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from plyfile import PlyData, PlyElement
7
+
8
def analyze_scales(input_ply, output_ply=None, threshold=None, show_plot=True):
    """
    Analyze scales in a PLY file and optionally filter out large splats.

    Args:
        input_ply (str): Path to input PLY file
        output_ply (str, optional): Path to save filtered PLY file
        threshold (float, optional): Maximum scale value to keep
        show_plot (bool): Whether to display the histogram plot

    Raises:
        FileNotFoundError: If the input PLY cannot be found.
    """
    # Convert input path to absolute path if it's relative
    repo_root = Path(__file__).parent.parent  # Go up one level from utils
    input_ply = Path(repo_root) / input_ply if not os.path.isabs(input_ply) else Path(input_ply)

    if not input_ply.exists():
        raise FileNotFoundError(f"Could not find PLY file: {input_ply}")

    print(f"Reading PLY file: {input_ply}")
    plydata = PlyData.read(str(input_ply))
    vertex_data = plydata['vertex']

    # Extract scale values - assuming log-space encoding in PLY
    scales = np.vstack([
        np.exp(vertex_data['scale_0']),
        np.exp(vertex_data['scale_1']),
        np.exp(vertex_data['scale_2'])
    ]).T

    # Per-Gaussian statistic: the largest of its three axis scales.
    max_scales = np.max(scales, axis=1)
    mean_scale = np.mean(max_scales)
    median_scale = np.median(max_scales)

    print(f"Scale statistics:")
    print(f"Mean scale: {mean_scale:.6f}")
    print(f"Median scale: {median_scale:.6f}")
    print(f"Min scale: {np.min(max_scales):.6f}")
    print(f"Max scale: {np.max(max_scales):.6f}")

    # Plot histogram
    if show_plot:
        plt.figure(figsize=(10, 6))
        plt.hist(max_scales, bins=100, edgecolor='black')
        plt.title('Histogram of Maximum Scales per Gaussian')
        plt.xlabel('Scale')
        plt.ylabel('Count')
        if threshold is not None:
            plt.axvline(x=threshold, color='r', linestyle='--',
                        label=f'Threshold ({threshold})')
            plt.legend()
        plt.savefig(Path(input_ply).parent / 'scale_histogram.png')
        plt.show()

    # Filter and save new PLY if threshold is provided
    if threshold is not None and output_ply is not None:
        # Create mask for Gaussians to keep
        keep_mask = max_scales <= threshold
        num_removed = np.sum(~keep_mask)
        print(f"Removing {num_removed} Gaussians ({(num_removed/len(keep_mask))*100:.2f}%)")

        # Vectorized filtering: index the element's underlying structured
        # array with the boolean mask instead of rebuilding rows in a
        # Python loop. This also fixes two bugs in the previous version:
        # `describe` lives on PlyElement (not PlyData), and
        # `vertex_data.dtype` is a method, not a dtype attribute.
        new_vertex_array = vertex_data.data[keep_mask]
        new_vertex_element = PlyElement.describe(new_vertex_array, 'vertex')
        PlyData([new_vertex_element], text=True).write(output_ply)
        print(f"Saved filtered PLY to: {output_ply}")
82
+
83
def main():
    """Parse CLI arguments and run the scale analysis/filter."""
    parser = argparse.ArgumentParser(description='Analyze and filter Gaussian scales in PLY file')
    parser.add_argument('input_ply', help='Input PLY file path')
    parser.add_argument('--output', '-o', help='Output PLY file path')
    parser.add_argument('--threshold', '-t', type=float, help='Maximum scale threshold')
    parser.add_argument('--no-plot', action='store_true', help='Disable histogram plot')
    args = parser.parse_args()

    plot_enabled = not args.no_plot
    analyze_scales(args.input_ply, args.output, args.threshold, show_plot=plot_enabled)


if __name__ == "__main__":
    main()
gs/utils/camera_utils.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import numpy as np
4
+
5
+ from utils.math_utils import world_to_view, projection_matrix
6
+
7
+ # Y down, Z forward
8
def load_camera(camera_info):
    """Load camera parameters from camera info dictionary.

    Expects an OpenGL/Blender-style ``camera_to_world`` (Y up, Z back)
    plus ``width``, ``height``, ``focal`` and optional camera-model /
    distortion fields. Returns a dict of render-ready parameters; note
    that ``world_to_camera`` (and the matrices derived from it) are
    stored TRANSPOSED, i.e. column-major for Warp/OpenGL consumption.
    """
    # Extract camera parameters
    # NOTE(review): camera_id is read but never used or returned.
    camera_id = camera_info["camera_id"]
    camera_to_world = np.asarray(camera_info["camera_to_world"], dtype=np.float64)

    # Change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
    camera_to_world[:3, 1:3] *= -1

    # Calculate world to camera transform
    world_to_camera = np.linalg.inv(camera_to_world).astype(np.float32)


    # Extract rotation and translation.
    # NOTE: these are views taken BEFORE the transpose below, so they hold
    # the row-major world->camera rotation/translation.
    R = world_to_camera[:3, :3]
    T = world_to_camera[:3, 3]


    world_to_camera[3, 3] = 1.
    # Transpose for Warp/OpenGL column-major convention.
    world_to_camera = world_to_camera.T


    width = camera_info.get("width")
    height = camera_info.get("height")
    # Single 'focal' value is used for both axes (square pixels assumed);
    # principal point defaults to the image center.
    fx = camera_info.get("focal")
    fy = camera_info.get("focal")
    cx = width / 2
    cy = height / 2

    # Calculate field of view from focal length
    fovx = 2 * np.arctan(width / (2 * fx))
    fovy = 2 * np.arctan(height / (2 * fy))

    # Create view matrix
    view_matrix = world_to_view(R=R, t=T)

    # Create projection matrix (also transposed, matching world_to_camera)
    znear = 0.01
    zfar = 100.0
    proj_matrix = projection_matrix(fovx=fovx, fovy=fovy, znear=znear, zfar=zfar).T
    full_proj_matrix = world_to_camera @ proj_matrix

    # Calculate other parameters
    tan_fovx = np.tan(fovx * 0.5)
    tan_fovy = np.tan(fovy * 0.5)

    # world_to_camera is transposed here, so its inverse is the transposed
    # camera_to_world: row 3 (first three entries) is the camera center.
    camera_center = np.linalg.inv(world_to_camera)[3, :3]

    # Handle camera type and distortion
    camera_model = camera_info.get("camera_model", "OPENCV")
    if camera_model == "OPENCV" or camera_model is None:
        camera_type = 0  # PERSPECTIVE
    elif camera_model == "OPENCV_FISHEYE":
        camera_type = 1  # FISHEYE
    else:
        raise ValueError(f"Unsupported camera_model '{camera_model}'")

    # Get distortion parameters (missing ones default to 0.0)
    distortion_params = []
    for param_name in ["k1", "k2", "p1", "p2", "k3", "k4"]:
        distortion_params.append(camera_info.get(param_name, 0.0))

    camera_params = {
        'R': R,
        'T': T,
        'camera_center': camera_center,
        'view_matrix': view_matrix,
        'proj_matrix': proj_matrix,
        'full_proj_matrix': full_proj_matrix,
        'tan_fovx': tan_fovx,
        'tan_fovy': tan_fovy,
        'fx': fx,
        'fy': fy,
        'cx': cx,
        'cy': cy,
        'width': width,
        'height': height,
        'camera_to_world': camera_to_world,
        'world_to_camera': world_to_camera,
        'camera_type': camera_type,
        'distortion_params': np.array(distortion_params, dtype=np.float32)
    }

    return camera_params
92
+
93
def load_camera_from_json(input_path, camera_id=0):
    """Load camera parameters from camera.json file"""
    base_dir = os.path.dirname(input_path)
    camera_file = os.path.join(base_dir, "cameras.json")

    if not os.path.exists(camera_file):
        print(f"Warning: No cameras.json found in {os.path.dirname(input_path)}, using default camera")
        return None

    try:
        with open(camera_file, 'r') as f:
            camera_list = json.load(f)

        # Prefer the entry whose id matches; otherwise fall back to the first.
        selected = camera_list[0]
        for cam in camera_list:
            if cam["id"] == camera_id:
                selected = cam
                break

        # Delegate parameter processing to load_camera
        return load_camera(selected)

    except Exception as e:
        # Best-effort loader: report and signal "no camera" to the caller.
        print(f"Error loading camera from cameras.json: {e}")
        return None
113
+
114
def load_camera_colmap(cam_info):
    """
    Load camera from COLMAP format (dust3r output) with exact compatibility to original load_camera.

    Args:
        cam_info: Dictionary containing:
            - width, height: image dimensions
            - fx, fy: focal lengths
            - cx, cy: principal point
            - camera_id: unique identifier
            - R: rotation matrix (world-to-camera rotation)
            - T: translation vector (world-to-camera translation)
            - Optional: camera_model, distortion params

    Returns:
        Dict of render-ready camera parameters; ``world_to_camera`` is
        TRANSPOSED (column-major) to match load_camera's Warp/OpenGL
        convention — NOTE(review): see the transpose caveat below.
    """
    # Extract camera parameters
    # NOTE(review): camera_id is read but never used or returned.
    camera_id = cam_info["camera_id"]

    # Use provided R and T directly (COLMAP convention - world to camera)
    R = cam_info['R']
    T = cam_info['T']  # This is world-to-camera translation

    # Build world-to-camera matrix
    world_to_camera = np.eye(4, dtype=np.float64)
    world_to_camera[:3, :3] = R
    world_to_camera[:3, 3] = T

    # Invert to get camera-to-world
    camera_to_world = np.linalg.inv(world_to_camera).astype(np.float64)

    # IMPORTANT FIX: Ensure Z direction is correctly oriented for COLMAP convention
    # COLMAP uses +Z forward, so no need to flip Z axis
    # If frustums are still backwards, uncomment this line:
    # camera_to_world[:3, 2] *= -1  # Flip Z axis if needed

    # Recalculate world_to_camera after any modifications
    # NOTE(review): unlike load_camera, this matrix is NOT transposed
    # before use below — confirm downstream consumers expect row-major
    # here, or whether the .T step was dropped unintentionally.
    world_to_camera = np.linalg.inv(camera_to_world).astype(np.float32)

    # Extract intrinsics (fall back to 'focal', then to a 0.7 * dimension
    # heuristic when no focal information is present)
    width = cam_info.get("width")
    height = cam_info.get("height")
    fx = cam_info.get("fx", cam_info.get("focal", width * 0.7))
    fy = cam_info.get("fy", cam_info.get("focal", height * 0.7))
    cx = cam_info.get("cx", width / 2)
    cy = cam_info.get("cy", height / 2)

    # Calculate field of view from focal length
    fovx = 2 * np.arctan(width / (2 * fx))
    fovy = 2 * np.arctan(height / (2 * fy))

    # Create view matrix using the original R and T
    view_matrix = world_to_view(R=R, t=T)

    # Create projection matrix (transposed, like load_camera)
    znear = 0.01
    zfar = 100.0
    proj_matrix = projection_matrix(fovx=fovx, fovy=fovy, znear=znear, zfar=zfar).T
    full_proj_matrix = world_to_camera @ proj_matrix

    # Calculate other parameters
    tan_fovx = np.tan(fovx * 0.5)
    tan_fovy = np.tan(fovy * 0.5)

    # IMPORTANT FIX: Correctly calculate camera center
    camera_center = camera_to_world[:3, 3]  # Extract translation from c2w matrix

    # Handle camera type and distortion
    camera_model = cam_info.get("camera_model", "OPENCV")
    if camera_model == "OPENCV" or camera_model is None:
        camera_type = 0  # PERSPECTIVE
    elif camera_model == "OPENCV_FISHEYE":
        camera_type = 1  # FISHEYE
    else:
        camera_type = 0  # Default to PERSPECTIVE

    # Get distortion parameters (missing ones default to 0.0)
    distortion_params = []
    for param_name in ["k1", "k2", "p1", "p2", "k3", "k4"]:
        distortion_params.append(cam_info.get(param_name, 0.0))

    # Return camera parameters
    camera_params = {
        'R': R,
        'T': T,
        'camera_center': camera_center,
        'view_matrix': view_matrix,
        'proj_matrix': proj_matrix,
        'full_proj_matrix': full_proj_matrix,
        'tan_fovx': tan_fovx,
        'tan_fovy': tan_fovy,
        'fx': fx,
        'fy': fy,
        'cx': cx,
        'cy': cy,
        'width': width,
        'height': height,
        'camera_to_world': camera_to_world,
        'world_to_camera': world_to_camera,
        'camera_type': camera_type,
        'distortion_params': np.array(distortion_params, dtype=np.float32)
    }

    return camera_params
gs/utils/check_opacities.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Quick diagnostic script: load a trained Gaussian-splat PLY and summarize
# the distribution of its per-splat 'opacity' values.
import numpy as np
from plyfile import PlyData
import os

# Path for output folder that's one level above utils
ply_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                        'output', 'point_cloud', 'iteration_13999', 'point_cloud.ply')
# load the PLY
ply = PlyData.read(ply_path)
opacities = np.array(ply['vertex']['opacity'])

# compute statistics
# NOTE(review): assumes 'opacity' is stored as activated values in [0, 1];
# if the PLY stores pre-sigmoid logits the near-zero/near-one counts below
# are not meaningful — confirm against the writer.
min_o, max_o, mean_o = opacities.min(), opacities.max(), opacities.mean()
near_zero = np.sum(opacities < 1e-3)
near_one = np.sum(opacities > 0.999)

print(f'Loaded {len(opacities)} splats')
print(f'Opacity range: min={min_o:.6f}, max={max_o:.6f}, mean={mean_o:.6f}')
print(f'Count near-zero (<1e-3): {near_zero}')
print(f'Count near-one (>0.999): {near_one}')
print('Sample opacities:', opacities[:100])
gs/utils/math_utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import math
4
+ import warp as wp
5
+
6
+
7
def world_to_view(R, t, translate=None, scale=1.0):
    """Build a 4x4 world-to-view matrix from a rotation and translation.

    Args:
        R: 3x3 rotation; transposed into the view matrix (3DGS convention).
        t: 3-vector translation for the view matrix.
        translate: Optional 3-vector offset added to the camera center
            before scaling (scene recentering). Defaults to no offset.
        scale: Scalar applied to the (recentered) camera center.

    Returns:
        (4, 4) float32 world-to-view matrix.
    """
    # Fix: avoid a mutable ndarray default argument; None means "no offset".
    if translate is None:
        translate = np.array([0.0, 0.0, 0.0])

    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = t
    Rt[3, 3] = 1.0

    # Round-trip through camera-to-world so the recenter/scale is applied
    # to the camera center rather than to the raw translation column.
    C2W = np.linalg.inv(Rt)
    cam_center = C2W[:3, 3]
    cam_center = (cam_center + translate) * scale
    C2W[:3, 3] = cam_center
    Rt = np.linalg.inv(C2W)
    return np.float32(Rt)
19
+
20
def projection_matrix(fovx, fovy, znear, zfar):
    """Build a 4x4 perspective projection matrix (+Z forward, row-major).

    The near-plane half extents are derived from the horizontal/vertical
    fields of view; depth maps the [znear, zfar] range per the standard
    3DGS projection convention.
    """
    half_w = math.tan(fovx / 2) * znear
    half_h = math.tan(fovy / 2) * znear
    left, right = -half_w, half_w
    bottom, top = -half_h, half_h

    proj = np.zeros((4, 4))
    proj[0, 0] = 2.0 * znear / (right - left)
    proj[1, 1] = 2.0 * znear / (top - bottom)
    # Off-center terms are zero for this symmetric frustum but kept for
    # generality.
    proj[0, 2] = (right + left) / (right - left)
    proj[1, 2] = (top + bottom) / (top - bottom)
    proj[2, 2] = zfar / (zfar - znear)
    proj[2, 3] = -(zfar * znear) / (zfar - znear)
    proj[3, 2] = 1.0
    return proj
41
+
42
def matrix_to_quaternion(matrix):
    """
    Convert a 3x3 rotation matrix to a quaternion in (x, y, z, w) format.

    Uses the numerically stable branch selection (Shepperd's method):
    the branch is chosen from the trace / largest diagonal entry so we
    never divide by a small quantity.

    Args:
        matrix: 3x3 rotation matrix

    Returns:
        Quaternion as (x, y, z, w) in numpy array of shape (4,)
    """
    # Debug aid: a proper rotation matrix has determinant 1.
    if np.abs(np.linalg.det(matrix) - 1.0) > 1e-5:
        print(f"Warning: Input matrix determinant is not 1: {np.linalg.det(matrix)}")

    m = matrix
    trace = np.trace(m)

    if trace > 0:
        s = 2.0 * np.sqrt(trace + 1.0)
        w = 0.25 * s
        x = (m[2, 1] - m[1, 2]) / s
        y = (m[0, 2] - m[2, 0]) / s
        z = (m[1, 0] - m[0, 1]) / s
    elif m[0, 0] > m[1, 1] and m[0, 0] > m[2, 2]:
        s = 2.0 * np.sqrt(1.0 + m[0, 0] - m[1, 1] - m[2, 2])
        w = (m[2, 1] - m[1, 2]) / s
        x = 0.25 * s
        y = (m[0, 1] + m[1, 0]) / s
        z = (m[0, 2] + m[2, 0]) / s
    elif m[1, 1] > m[2, 2]:
        s = 2.0 * np.sqrt(1.0 + m[1, 1] - m[0, 0] - m[2, 2])
        w = (m[0, 2] - m[2, 0]) / s
        x = (m[0, 1] + m[1, 0]) / s
        y = 0.25 * s
        z = (m[1, 2] + m[2, 1]) / s
    else:
        s = 2.0 * np.sqrt(1.0 + m[2, 2] - m[0, 0] - m[1, 1])
        w = (m[1, 0] - m[0, 1]) / s
        x = (m[0, 2] + m[2, 0]) / s
        y = (m[1, 2] + m[2, 1]) / s
        z = 0.25 * s

    # (x, y, z, w) ordering to match Warp's convention.
    return np.array([x, y, z, w], dtype=np.float32)
85
+
86
+
87
def quaternion_to_rotation_matrix(q):
    """Convert a (w, x, y, z) quaternion to a 3x3 float32 rotation matrix.

    NOTE(review): the component order here is w-first, while
    matrix_to_quaternion in this module returns (x, y, z, w); callers
    converting in both directions must reorder.
    """
    w, x, y, z = q
    row0 = [1 - 2 * y * y - 2 * z * z, 2 * x * y - 2 * z * w, 2 * x * z + 2 * y * w]
    row1 = [2 * x * y + 2 * z * w, 1 - 2 * x * x - 2 * z * z, 2 * y * z - 2 * x * w]
    row2 = [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y]
    return np.array([row0, row1, row2], dtype=np.float32)
94
+
95
+ # def quaternion_to_rotation_matrix(q):
96
+ # """Convert quaternion to rotation matrix with swapped X and Z axes."""
97
+ # qw, qx, qy, qz = q
98
+
99
+ # # Original conversion
100
+ # R = np.array([
101
+ # [1 - 2*qy*qy - 2*qz*qz, 2*qx*qy - 2*qz*qw, 2*qx*qz + 2*qy*qw],
102
+ # [2*qx*qy + 2*qz*qw, 1 - 2*qx*qx - 2*qz*qz, 2*qy*qz - 2*qx*qw],
103
+ # [2*qx*qz - 2*qy*qw, 2*qy*qz + 2*qx*qw, 1 - 2*qx*qx - 2*qy*qy]
104
+ # ])
105
+
106
+ # # Swap X and Z axes (columns and rows)
107
+ # R_fixed = R.copy()
108
+ # R_fixed[:, [0, 2]] = R[:, [2, 0]] # Swap columns 0 and 2
109
+ # R_fixed[[0, 2], :] = R_fixed[[2, 0], :] # Swap rows 0 and 2
110
+
111
+ # return R_fixed
gs/utils/plot_loss_log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
def plot_loss_log(loss_file="output/steak/loss.txt"):
    """Plot training loss values on a logarithmic scale from loss.txt"""

    # One loss value per non-blank line.
    with open(loss_file, 'r') as fh:
        loss_values = [float(raw.strip()) for raw in fh if raw.strip()]

    # Log-scale plot of the loss curve.
    plt.figure(figsize=(12, 6))
    plt.semilogy(loss_values, label='Training Loss')
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.xlabel('Iteration')
    plt.ylabel('Loss (log scale)')
    plt.title('Training Loss over Time (Log Scale)')
    plt.legend()

    # Save next to the input file, swapping the extension.
    output_path = loss_file.replace('.txt', '_plot_log.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Saved loss plot to: {output_path}")
    print(f"Loss statistics:")
    print(f"  Min: {min(loss_values):.6f}")
    print(f"  Max: {max(loss_values):.6f}")
    print(f"  Mean: {np.mean(loss_values):.6f}")
    print(f"  Final: {loss_values[-1]:.6f}")


if __name__ == "__main__":
    plot_loss_log()
gs/utils/point_cloud_utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ from plyfile import PlyData, PlyElement
4
+ import math
5
+ import os
6
+ import warp as wp
7
+
8
+
9
def load_ply(filepath):
    """
    Load a Gaussian splat PLY file.

    Expects the standard 3DGS vertex layout: x/y/z positions, log-space
    scale_0..2, opacity, rot_0..3 quaternion, f_dc_0..2 and f_rest_0..44
    spherical-harmonic coefficients.

    Args:
        filepath: Path to the PLY file.

    Returns:
        dict with: positions (N, 3), scales (N, 3, linear space),
        rotations (N, 4), opacities (N, 1), shs (N*16, 3), num_points.
    """
    plydata = PlyData.read(filepath)
    vertex = plydata['vertex']

    num_points = len(vertex)

    # Load positions
    positions = np.stack([
        vertex['x'], vertex['y'], vertex['z']
    ], axis=-1).astype(np.float32)

    # Load scales (stored in log space; convert back to linear).
    scales = np.stack([
        np.exp(vertex['scale_0']),
        np.exp(vertex['scale_1']),
        np.exp(vertex['scale_2'])
    ], axis=-1).astype(np.float32)

    # Load opacities
    opacities = vertex['opacity'].astype(np.float32).reshape(-1, 1)

    # Load rotations (quaternion)
    rotations = np.stack([
        vertex['rot_0'], vertex['rot_1'], vertex['rot_2'], vertex['rot_3']
    ], axis=-1).astype(np.float32)

    # SH DC term (degree 0).
    sh_dc = np.stack([
        vertex['f_dc_0'], vertex['f_dc_1'], vertex['f_dc_2']
    ], axis=-1).astype(np.float32)

    # Higher-order SH coefficients: 45 scalars per point -> (N, 15, 3).
    # NOTE: the (15, 3) reshape assumes coefficient-major f_rest layout,
    # which is exactly how save_ply in this file writes them back out.
    sh_rest = np.stack(
        [vertex[f'f_rest_{i}'] for i in range(45)], axis=-1
    ).astype(np.float32).reshape(num_points, 15, 3)

    # Interleave DC and rest into the (N*16, 3) layout expected by the
    # renderer — vectorized instead of a per-point Python loop.
    shs = np.concatenate([sh_dc[:, None, :], sh_rest], axis=1)
    shs = np.ascontiguousarray(
        shs.reshape(num_points * 16, 3), dtype=np.float32
    )

    return {
        'positions': positions,
        'scales': scales,
        'rotations': rotations,
        'opacities': opacities,
        'shs': shs,
        'num_points': num_points
    }
68
+
69
+
70
# Function to save point cloud to PLY file
def save_ply(params, filepath, num_points, colors=None):
    """Save a Gaussian point cloud to a binary PLY file.

    Mirrors load_ply: scales are written back in log space and SH
    coefficients in the same coefficient-major f_rest order.

    Args:
        params: Dict of arrays exposing .numpy() with keys 'positions'
            (N, 3), 'scales' (N, 3, linear), 'rotations' (N, 4),
            'opacities' (N,) or (N, 1), and 'shs' (N*16, 3).
        filepath: Output path; parent directories are created if needed.
        num_points: Number of points N to write.
        colors: Optional (N, 3) RGB array in [0, 1]. If omitted, colors
            are derived from the SH DC term.
    """
    # Get numpy arrays
    positions = params['positions'].numpy()
    scales = params['scales'].numpy()
    rotations = params['rotations'].numpy()
    opacities = params['opacities'].numpy()
    shs = params['shs'].numpy()

    # Handle colors - either provided or computed from SH coefficients
    if colors is not None:
        # Use provided colors (accept torch/warp arrays or plain numpy).
        colors_np = colors.numpy() if hasattr(colors, 'numpy') else colors
    else:
        # Compute colors from the SH DC term (every 16th row of shs),
        # vectorized instead of a per-point Python loop. The +0.5 offset
        # is the usual DC-band SH-to-RGB approximation.
        sh_dc_all = shs.reshape(num_points, 16, 3)[:, 0, :]
        colors_np = np.clip(sh_dc_all + 0.5, 0.0, 1.0).astype(np.float32)

    # Create vertex data
    vertex_data = []
    for i in range(num_points):
        # Basic properties. Scales go back to log space to mirror load_ply.
        # Opacity is flattened to a true scalar so both (N,) and (N, 1)
        # layouts work; a length-1 array inside the tuple would break the
        # structured-array construction below.
        vertex = (
            positions[i][0], positions[i][1], positions[i][2],
            np.log(scales[i][0]), np.log(scales[i][1]), np.log(scales[i][2]),
            float(np.ravel(opacities[i])[0])
        )

        # Add rotation quaternion elements
        quat = rotations[i]
        vertex += (quat[0], quat[1], quat[2], quat[3])

        # Add RGB colors (convert to 0-255 range)
        vertex += (
            int(np.clip(colors_np[i][0] * 255, 0, 255)),
            int(np.clip(colors_np[i][1] * 255, 0, 255)),
            int(np.clip(colors_np[i][2] * 255, 0, 255))
        )

        # Add SH DC coefficients
        vertex += tuple(shs[i * 16][j] for j in range(3))

        # Add remaining SH coefficients (coefficient-major order, matching
        # the reshape in load_ply).
        sh_rest = []
        for j in range(1, 16):
            for c in range(3):
                sh_rest.append(shs[i * 16 + j][c])
        vertex += tuple(sh_rest)

        vertex_data.append(vertex)

    # Define the structure of the PLY file; field order must match the
    # tuple construction above.
    vertex_type = [
        ('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
        ('scale_0', 'f4'), ('scale_1', 'f4'), ('scale_2', 'f4'),
        ('opacity', 'f4')
    ]

    # Add rotation quaternion elements
    vertex_type.extend([('rot_0', 'f4'), ('rot_1', 'f4'), ('rot_2', 'f4'), ('rot_3', 'f4')])

    # Add RGB color fields
    vertex_type.extend([('red', 'u1'), ('green', 'u1'), ('blue', 'u1')])

    # Add SH coefficients
    vertex_type.extend([('f_dc_0', 'f4'), ('f_dc_1', 'f4'), ('f_dc_2', 'f4')])

    # Add remaining SH coefficients
    for i in range(45):  # 15 coeffs * 3 channels
        vertex_type.append((f'f_rest_{i}', 'f4'))

    vertex_array = np.array(vertex_data, dtype=vertex_type)
    el = PlyElement.describe(vertex_array, 'vertex')

    # Create the directory only when the path actually has one;
    # os.makedirs('') raises FileNotFoundError for bare filenames.
    dirname = os.path.dirname(filepath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    # Save the PLY file
    PlyData([el], text=False).write(filepath)
    print(f"Point cloud saved to {filepath}")
gs/utils/wp_utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warp as wp
2
+ from config import DEVICE
3
+
4
+
5
@wp.func
def wp_vec3_mul_element(a: wp.vec3, b: wp.vec3) -> wp.vec3:
    # Hadamard (component-wise) product of two vec3s.
    px = a[0] * b[0]
    py = a[1] * b[1]
    pz = a[2] * b[2]
    return wp.vec3(px, py, pz)
8
+
9
# Element-wise square root of a vec3.
@wp.func
def wp_vec3_sqrt(a: wp.vec3) -> wp.vec3:
    rx = wp.sqrt(a[0])
    ry = wp.sqrt(a[1])
    rz = wp.sqrt(a[2])
    return wp.vec3(rx, ry, rz)
13
+
14
# Element-wise division of two vec3s.
@wp.func
def wp_vec3_div_element(a: wp.vec3, b: wp.vec3) -> wp.vec3:
    # A small epsilon on each denominator component guards against
    # division by zero (Adam's epsilon should mostly handle this already).
    dx = b[0] + 1e-9
    dy = b[1] + 1e-9
    dz = b[2] + 1e-9
    return wp.vec3(a[0] / dx, a[1] / dy, a[2] / dz)
21
+
22
@wp.func
def wp_vec3_add_element(a: wp.vec3, b: wp.vec3) -> wp.vec3:
    # Component-wise sum, kept for symmetry with the other helpers.
    sx = a[0] + b[0]
    sy = a[1] + b[1]
    sz = a[2] + b[2]
    return wp.vec3(sx, sy, sz)
25
+
26
@wp.func
def wp_vec3_clamp(x: wp.vec3, min_val: float, max_val: float) -> wp.vec3:
    # Clamp every component of x into [min_val, max_val].
    cx = wp.clamp(x[0], min_val, max_val)
    cy = wp.clamp(x[1], min_val, max_val)
    cz = wp.clamp(x[2], min_val, max_val)
    return wp.vec3(cx, cy, cz)
33
+
34
def to_warp_array(data, dtype, shape_check=None, flatten=False):
    """Convert array-like data (numpy / torch / warp) to a warp array on DEVICE.

    Args:
        data: Input array. Existing wp.arrays are returned unchanged;
            objects exposing .cpu() and .numpy() (torch tensors) are
            moved to host memory first. None passes through as None.
        dtype: Warp element dtype for the resulting array.
        shape_check: Currently unused; kept for interface compatibility.
        flatten: If True, squeeze (N, 1)-shaped inputs down to (N,).

    Returns:
        A wp.array on DEVICE, or None if data is None.
    """
    if data is None:
        return None
    if isinstance(data, wp.array):
        return data
    # Torch tensors expose both .cpu() and .numpy(); bring them to host.
    is_torch_like = hasattr(data, 'cpu') and hasattr(data, 'numpy')
    if is_torch_like:
        data = data.cpu().numpy()
    needs_squeeze = flatten and data.ndim == 2 and data.shape[1] == 1
    if needs_squeeze:
        data = data.flatten()
    return wp.array(data, dtype=dtype, device=DEVICE)
45
+
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install PyTorch with CUDA 11.8 separately using:
2
+ # pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
3
+ # torch==2.5.1
4
+ # torchvision==0.20.1
5
+
6
+ warp-lang==1.7.0
7
+ numpy==1.26.3
8
+ imageio==2.34.1
9
+ plyfile
10
+ roma
11
+ gradio==5.17.1
12
+ pydantic==2.10.6
13
+ matplotlib==3.9.2
14
+ tqdm==4.66.5
15
+ opencv-python
16
+ pypng
17
+ scipy
18
+ einops
19
+ trimesh
20
+ pyglet<2
21
+ viser
22
+ jaxtyping
23
+ hydra-submitit-launcher
24
+ scikit-learn
25
+ plotly
26
+ git+https://github.com/facebookresearch/vggt.git@44b3afb
vdpm/.gitignore ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data/
2
+ checkpoints/
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ pip-wheel-metadata/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
vdpm/.gitmodules ADDED
File without changes
vdpm/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
vdpm/LICENSE ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Eldar Insafutdinov, Edgar Sucar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
vdpm/LICENSE-VGGT ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ VGGT License
2
+
3
+ v1 Last Updated: July 29, 2025
4
+
5
+ “Acceptable Use Policy” means the Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.
6
+
7
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.
8
+
9
+
10
+ “Documentation” means the specifications, manuals and documentation accompanying
11
+ Research Materials distributed by Meta.
12
+
13
+
14
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
15
+
16
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
17
+ “Research Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.
18
+
19
+ By clicking “I Accept” below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.
20
+
21
+
22
+ 1. License Rights and Redistribution.
23
+
24
+
25
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.
26
+
27
+ b. Redistribution and Use.
28
+
29
+
30
+ i. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
31
+
32
+
33
+ ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.
34
+
35
+
36
+ iii. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
37
+ 2. User Support. Your use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Research Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
38
+
39
+
40
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.
41
+
42
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
43
+
44
+ 5. Intellectual Property.
45
+
46
+
47
+ a. Subject to Meta’s ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
48
+
49
+ b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.
50
+
51
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 5, 6 and 9 shall survive the termination of this Agreement.
52
+
53
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
54
+
55
+
56
+ 8. Modifications and Amendments. Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
57
+
58
+
59
+ Acceptable Use Policy
60
+
61
+ Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.
62
+
63
+ As part of this mission, Meta makes certain research materials available for use in accordance with this Agreement (including the Acceptable Use Policy). Meta is committed to promoting the safe and responsible use of such research materials.
64
+
65
+ Prohibited Uses
66
+
67
+ You agree you will not use, or allow others to use, Research Materials to:
68
+
69
+ Violate the law or others’ rights, including to:
70
+ Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
71
+ Violence or terrorism
72
+ Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
73
+ Human trafficking, exploitation, and sexual violence
74
+ The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
75
+ Sexual solicitation
76
+ Any other criminal activity
77
+
78
+ Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
79
+
80
+ Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
81
+
82
+ Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
83
+
84
+ Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
85
+
86
+ Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using Research Materials
87
+
88
+ Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
89
+
90
+ 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:
91
+
92
+ Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
93
+
94
+ Guns and illegal weapons (including weapon development)
95
+
96
+ Illegal drugs and regulated/controlled substances
97
+ Operation of critical infrastructure, transportation technologies, or heavy machinery
98
+
99
+ Self-harm or harm to others, including suicide, cutting, and eating disorders
100
+ Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
101
+
102
+ 3. Intentionally deceive or mislead others, including use of Research Materials related to the following:
103
+
104
+ Generating, promoting, or furthering fraud or the creation or promotion of disinformation
105
+ Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
106
+
107
+ Generating, promoting, or further distributing spam
108
+
109
+ Impersonating another individual without consent, authorization, or legal right
110
+
111
+ Representing that outputs of research materials or outputs from technology using Research Materials are human-generated
112
+
113
+ Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
114
+
115
+ 4. Fail to appropriately disclose to end users any known dangers of your Research Materials.
vdpm/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: vdpm
3
+ app_file: gradio_demo.py
4
+ sdk: gradio
5
+ sdk_version: 5.17.1
6
+ ---
7
+ <div align="center">
8
+ <h1>V-DPM: 4D Video Reconstruction with Dynamic Point Maps</h1>
9
+
10
+ <a href="https://www.robots.ox.ac.uk/~vgg/research/vdpm/"><img src="https://img.shields.io/badge/Project_Page-green" alt="Project Page"></a>
11
+ <a href="https://huggingface.co/spaces/edgarsucar/vdpm"><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue'></a>
12
+
13
+ **[Visual Geometry Group, University of Oxford](https://www.robots.ox.ac.uk/~vgg/)**
14
+
15
+
16
+ [Edgar Sucar](https://edgarsucar.github.io/)\*, [Eldar Insafutdinov](https://eldar.insafutdinov.com/)\*, [Zihang Lai](https://scholar.google.com/citations?user=31eXgMYAAAAJ), [Andrea Vedaldi](https://www.robots.ox.ac.uk/~vedaldi/)
17
+ </div>
18
+
19
+ ## Setup
20
+
21
+ First, clone the repository and set up a virtual environment with [uv](https://github.com/astral-sh/uv):
22
+
23
+ ```bash
24
+ git clone git@github.com:eldar/vdpm.git
25
+ cd vdpm
26
+ uv venv --python 3.12
27
+ . .venv/bin/activate
28
+
29
+ # Install PyTorch with CUDA 11.8 first
30
+ uv pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
31
+
32
+ # Then install remaining dependencies
33
+ uv pip install -r requirements.txt
34
+ ```
35
+
36
+ ## Viser demo
37
+ ```bash
38
+ python visualise.py ++vis.input_video=examples/videos/camel.mp4
39
+ ```
40
+
41
+ ## Gradio demo
42
+ ```bash
43
+ python gradio_demo.py
44
+ ```
vdpm/check_model_size.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add parent directory to path
6
+ sys.path.insert(0, str(Path(__file__).parent))
7
+
8
def check_model_memory():
    """Estimate VDPM model memory requirements for an 8 GB GPU.

    Builds the model (on CPU), counts its parameters, and prints rough
    weight/activation memory estimates for FP32/FP16/BF16/INT8 along
    with a fit recommendation. Purely informational; no return value.
    """
    # Simple config object: minimal stand-in for the config object the
    # VDPM constructor expects (only decoder_depth is read here).
    class SimpleConfig:
        class ModelConfig:
            decoder_depth = 4
        model = ModelConfig()

    cfg = SimpleConfig()

    # Import after path is set (sys.path is adjusted at module level).
    from dpm.model import VDPM

    # Create model on CPU first to count parameters
    print("Creating model...")
    model = VDPM(cfg)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"\n{'='*60}")
    print(f"MODEL SIZE ANALYSIS FOR RTX 3070 Ti (8GB)")
    print(f"{'='*60}")
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    # Weight memory: bytes-per-parameter times parameter count.
    print(f"\nEstimated model weights memory:")
    print(f"  - FP32 (float32): {total_params * 4 / 1024**3:.2f} GB")
    print(f"  - FP16 (float16): {total_params * 2 / 1024**3:.2f} GB")
    print(f"  - BF16 (bfloat16): {total_params * 2 / 1024**3:.2f} GB")
    print(f"  - INT8 (quantized): {total_params * 1 / 1024**3:.2f} GB  <-- RECOMMENDED for 8GB GPU")

    # Estimate activation memory for typical input
    batch_size = 1
    num_frames = 5  # typical video length
    img_size = 518
    print(f"\nEstimated activation memory (batch={batch_size}, frames={num_frames}, img_size={img_size}):")

    # Input images: [B, S, 3, H, W] at 4 bytes per FP32 element.
    input_mem = batch_size * num_frames * 3 * img_size * img_size * 4 / 1024**3
    print(f"  - Input images (FP32): {input_mem:.2f} GB")

    # Rough estimate for activations (can be 2-4x model size during forward pass)
    activation_mem_estimate = total_params * 2 * 3 / 1024**3  # conservative estimate
    print(f"  - Activations (estimate): {activation_mem_estimate:.2f} GB")

    # Calculate total for different precision modes
    total_fp16 = (total_params * 2 / 1024**3) + input_mem + activation_mem_estimate
    total_int8 = (total_params * 1 / 1024**3) + input_mem + (activation_mem_estimate * 0.6)  # INT8 reduces activations too

    print(f"\nTotal estimated GPU memory needed:")
    print(f"  - With FP16/BF16: {total_fp16:.2f} GB")
    print(f"  - With INT8 quantization: {total_int8:.2f} GB  <-- FITS IN 8GB!")
    print(f"Your RTX 3070 Ti has: 8 GB VRAM")

    # Recommendation: prefer INT8 if it fits, warn when even INT8 is
    # tight, otherwise FP16 suffices.
    if total_int8 <= 8:
        print(f"\n✓ With INT8 quantization, model will fit in GPU memory!")
        print(f"  Set USE_QUANTIZATION = True in gradio_demo.py")
    elif total_fp16 > 8:
        print(f"\n⚠️ WARNING: Even with INT8 ({total_int8:.2f} GB), memory is tight")
        print(f"  Recommendations:")
        print(f"  1. Use INT8 quantization (USE_QUANTIZATION = True)")
        print(f"  2. Reduce number of input frames to {num_frames} or fewer")
        print(f"  3. Clear CUDA cache between batches")
    else:
        print(f"\n✓ Model should fit with FP16!")

    print(f"{'='*60}\n")

    # Check actual GPU memory if CUDA available
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"Current GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Current GPU memory cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
82
+
83
# Run the memory analysis when executed as a script.
if __name__ == "__main__":
    check_model_memory()
85
+
vdpm/configs/config.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _self_
3
+ - hydra: defaults
4
+ - model: dpm
5
+
6
+ config:
7
+ exp_name: "debug"
8
+ file: "config.yaml"
9
+
10
+ data_loader:
11
+ batch_size: 2
12
+ num_workers: 8
13
+ dynamic_batch: false
14
+
15
+ train:
16
+ logging: true
17
+ num_gpus: 4
18
+ amp: bfloat16
19
+ amp_dpt: false
20
+ dry_run: false
21
+ camera_loss_lambda: 5.0
22
+
23
+ optimiser:
24
+ lr: 0.00005 # absolute lr
25
+ blr: 1.5e-4 # base learning rate: absolute_lr = base_lr * total_batch_size / 256
26
+ start_epoch:
27
+ epochs: 70
28
+ accum_iter: 1
29
+ warmup_epochs: 3
30
+ min_lr: 1e-06
31
+
32
+ run:
33
+ resume: false
34
+ dirpath: null
35
+ debug: false
36
+ random_seed: 42
37
+ git_hash: null
38
+ log_frequency: 250
39
+ training_progress_bar: false
40
+ save_freq: 5
41
+ eval_freq: 1
42
+ keep_freq: 5
43
+ print_freq: 20
44
+ num_keep_ckpts: 5
45
+ # Old Dust3r params
46
+ world_size: -1
47
+ local_rank: -1
48
+ dist_url: "env://"
49
+ seed: 0
50
+
vdpm/configs/model/dpm.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: dpm-video
2
+ pretrained: /work/eldar/models/vggt/VGGT-1B.pt
3
+ decoder_depth: 4
vdpm/configs/visualise.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _self_
3
+ - model: dpm
4
+
5
+ hydra:
6
+ output_subdir: null # Disable saving of config files.
7
+ job:
8
+ chdir: False
9
+
10
+ vis:
11
+ port: 8080
12
+ input_video:
13
+
vdpm/dpm/aggregator.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE-VGGT file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.utils.checkpoint import checkpoint
12
+ from typing import Optional, Tuple, Union, List, Dict, Any
13
+
14
+ from vggt.layers import PatchEmbed
15
+ from vggt.layers.block import Block
16
+ from vggt.layers.rope import RotaryPositionEmbedding2D, PositionGetter
17
+ from vggt.layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _RESNET_MEAN = [0.485, 0.456, 0.406]
22
+ _RESNET_STD = [0.229, 0.224, 0.225]
23
+
24
+
25
+ class Aggregator(nn.Module):
26
+ """
27
+ The Aggregator applies alternating-attention over input frames,
28
+ as described in VGGT: Visual Geometry Grounded Transformer.
29
+
30
+
31
+ Args:
32
+ img_size (int): Image size in pixels.
33
+ patch_size (int): Size of each patch for PatchEmbed.
34
+ embed_dim (int): Dimension of the token embeddings.
35
+ depth (int): Number of blocks.
36
+ num_heads (int): Number of attention heads.
37
+ mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
38
+ num_register_tokens (int): Number of register tokens.
39
+ block_fn (nn.Module): The block type used for attention (Block by default).
40
+ qkv_bias (bool): Whether to include bias in QKV projections.
41
+ proj_bias (bool): Whether to include bias in the output projection.
42
+ ffn_bias (bool): Whether to include bias in MLP layers.
43
+ patch_embed (str): Type of patch embed. e.g., "conv" or "dinov2_vitl14_reg".
44
+ aa_order (list[str]): The order of alternating attention, e.g. ["frame", "global"].
45
+ aa_block_size (int): How many blocks to group under each attention type before switching. If not necessary, set to 1.
46
+ qk_norm (bool): Whether to apply QK normalization.
47
+ rope_freq (int): Base frequency for rotary embedding. -1 to disable.
48
+ init_values (float): Init scale for layer scale.
49
+ """
50
+
51
+ def __init__(
52
+ self,
53
+ img_size=518,
54
+ patch_size=14,
55
+ embed_dim=1024,
56
+ depth=24,
57
+ num_heads=16,
58
+ mlp_ratio=4.0,
59
+ num_register_tokens=4,
60
+ block_fn=Block,
61
+ qkv_bias=True,
62
+ proj_bias=True,
63
+ ffn_bias=True,
64
+ patch_embed="dinov2_vitl14_reg",
65
+ aa_order=["frame", "global"],
66
+ aa_block_size=1,
67
+ qk_norm=True,
68
+ rope_freq=100,
69
+ init_values=0.01,
70
+ ):
71
+ super().__init__()
72
+
73
+ self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)
74
+
75
+ # Initialize rotary position embedding if frequency > 0
76
+ self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
77
+ self.position_getter = PositionGetter() if self.rope is not None else None
78
+
79
+ self.frame_blocks = nn.ModuleList(
80
+ [
81
+ block_fn(
82
+ dim=embed_dim,
83
+ num_heads=num_heads,
84
+ mlp_ratio=mlp_ratio,
85
+ qkv_bias=qkv_bias,
86
+ proj_bias=proj_bias,
87
+ ffn_bias=ffn_bias,
88
+ init_values=init_values,
89
+ qk_norm=qk_norm,
90
+ rope=self.rope,
91
+ )
92
+ for _ in range(depth)
93
+ ]
94
+ )
95
+
96
+ self.global_blocks = nn.ModuleList(
97
+ [
98
+ block_fn(
99
+ dim=embed_dim,
100
+ num_heads=num_heads,
101
+ mlp_ratio=mlp_ratio,
102
+ qkv_bias=qkv_bias,
103
+ proj_bias=proj_bias,
104
+ ffn_bias=ffn_bias,
105
+ init_values=init_values,
106
+ qk_norm=qk_norm,
107
+ rope=self.rope,
108
+ )
109
+ for _ in range(depth)
110
+ ]
111
+ )
112
+
113
+ self.depth = depth
114
+ self.aa_order = aa_order
115
+ self.patch_size = patch_size
116
+ self.aa_block_size = aa_block_size
117
+
118
+ # Validate that depth is divisible by aa_block_size
119
+ if self.depth % self.aa_block_size != 0:
120
+ raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")
121
+
122
+ self.aa_block_num = self.depth // self.aa_block_size
123
+
124
+ # Note: We have two camera tokens, one for the first frame and one for the rest
125
+ # The same applies for register tokens
126
+ self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
127
+ self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))
128
+
129
+ # The patch tokens start after the camera and register tokens
130
+ self.patch_start_idx = 1 + num_register_tokens
131
+
132
+ self.time_conditioning_token = nn.Parameter(torch.randn(1, 1, embed_dim))
133
+ self.patch_start_idx += 1
134
+
135
+ # Initialize parameters with small values
136
+ nn.init.normal_(self.camera_token, std=1e-6)
137
+ nn.init.normal_(self.register_token, std=1e-6)
138
+
139
+ # Register normalization constants as buffers
140
+ for name, value in (
141
+ ("_resnet_mean", _RESNET_MEAN),
142
+ ("_resnet_std", _RESNET_STD),
143
+ ):
144
+ self.register_buffer(
145
+ name,
146
+ torch.FloatTensor(value).view(1, 1, 3, 1, 1),
147
+ persistent=False,
148
+ )
149
+
150
+ self.use_reentrant = False # hardcoded to False
151
+
152
+ def __build_patch_embed__(
153
+ self,
154
+ patch_embed,
155
+ img_size,
156
+ patch_size,
157
+ num_register_tokens,
158
+ interpolate_antialias=True,
159
+ interpolate_offset=0.0,
160
+ block_chunks=0,
161
+ init_values=1.0,
162
+ embed_dim=1024,
163
+ ):
164
+ """
165
+ Build the patch embed layer. If 'conv', we use a
166
+ simple PatchEmbed conv layer. Otherwise, we use a vision transformer.
167
+ """
168
+
169
+ if "conv" in patch_embed:
170
+ self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=3, embed_dim=embed_dim)
171
+ else:
172
+ vit_models = {
173
+ "dinov2_vitl14_reg": vit_large,
174
+ "dinov2_vitb14_reg": vit_base,
175
+ "dinov2_vits14_reg": vit_small,
176
+ "dinov2_vitg2_reg": vit_giant2,
177
+ }
178
+
179
+ self.patch_embed = vit_models[patch_embed](
180
+ img_size=img_size,
181
+ patch_size=patch_size,
182
+ num_register_tokens=num_register_tokens,
183
+ interpolate_antialias=interpolate_antialias,
184
+ interpolate_offset=interpolate_offset,
185
+ block_chunks=block_chunks,
186
+ init_values=init_values,
187
+ )
188
+
189
+ # Disable gradient updates for mask token
190
+ if hasattr(self.patch_embed, "mask_token"):
191
+ self.patch_embed.mask_token.requires_grad_(False)
192
+
193
+ def forward(
194
+ self,
195
+ images: torch.Tensor,
196
+ ) -> Tuple[List[torch.Tensor], int]:
197
+ """
198
+ Args:
199
+ images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
200
+ B: batch size, S: sequence length, 3: RGB channels, H: height, W: width
201
+
202
+ Returns:
203
+ (list[torch.Tensor], int):
204
+ The list of outputs from the attention blocks,
205
+ and the patch_start_idx indicating where patch tokens begin.
206
+ """
207
+ B, S, C_in, H, W = images.shape
208
+
209
+ if C_in != 3:
210
+ raise ValueError(f"Expected 3 input channels, got {C_in}")
211
+
212
+ # Normalize images and reshape for patch embed
213
+ images = (images - self._resnet_mean) / self._resnet_std
214
+
215
+ # Reshape to [B*S, C, H, W] for patch embedding
216
+ images = images.view(B * S, C_in, H, W)
217
+ patch_tokens = self.patch_embed(images)
218
+
219
+ if isinstance(patch_tokens, dict):
220
+ patch_tokens = patch_tokens["x_norm_patchtokens"]
221
+
222
+ _, P, C = patch_tokens.shape
223
+
224
+ # Expand camera and register tokens to match batch size and sequence length
225
+ camera_token = slice_expand_and_flatten(self.camera_token, B, S)
226
+ register_token = slice_expand_and_flatten(self.register_token, B, S)
227
+ # do something similar for time_conditioning_token
228
+ time_conditioning_token = slice_expand_and_flatten_single(self.time_conditioning_token, B, S)
229
+ # Concatenate special tokens with patch tokens
230
+ tokens = torch.cat([camera_token, time_conditioning_token, register_token, patch_tokens], dim=1)
231
+
232
+ pos = None
233
+ if self.rope is not None:
234
+ pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)
235
+
236
+ if self.patch_start_idx > 0:
237
+ # do not use position embedding for special tokens (camera and register tokens)
238
+ # so set pos to 0 for the special tokens
239
+ pos = pos + 1
240
+ pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(images.device).to(pos.dtype)
241
+ pos = torch.cat([pos_special, pos], dim=1)
242
+
243
+ # update P because we added special tokens
244
+ _, P, C = tokens.shape
245
+
246
+ frame_idx = 0
247
+ global_idx = 0
248
+ output_list = []
249
+
250
+ for _ in range(self.aa_block_num):
251
+ for attn_type in self.aa_order:
252
+ if attn_type == "frame":
253
+ tokens, frame_idx, frame_intermediates = self._process_frame_attention(
254
+ tokens, B, S, P, C, frame_idx, pos=pos
255
+ )
256
+ elif attn_type == "global":
257
+ tokens, global_idx, global_intermediates = self._process_global_attention(
258
+ tokens, B, S, P, C, global_idx, pos=pos
259
+ )
260
+ else:
261
+ raise ValueError(f"Unknown attention type: {attn_type}")
262
+
263
+ for i in range(len(frame_intermediates)):
264
+ # concat frame and global intermediates, [B x S x P x 2C]
265
+ concat_inter = torch.cat([frame_intermediates[i], global_intermediates[i]], dim=-1)
266
+ output_list.append(concat_inter)
267
+
268
+ del concat_inter
269
+ del frame_intermediates
270
+ del global_intermediates
271
+ return output_list, self.patch_start_idx
272
+
273
+ def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
274
+ """
275
+ Process frame attention blocks. We keep tokens in shape (B*S, P, C).
276
+ """
277
+ # If needed, reshape tokens or positions:
278
+ if tokens.shape != (B * S, P, C):
279
+ tokens = tokens.view(B, S, P, C).view(B * S, P, C)
280
+
281
+ if pos is not None and pos.shape != (B * S, P, 2):
282
+ pos = pos.view(B, S, P, 2).view(B * S, P, 2)
283
+
284
+ intermediates = []
285
+
286
+ # by default, self.aa_block_size=1, which processes one block at a time
287
+ for _ in range(self.aa_block_size):
288
+ if self.training:
289
+ tokens = checkpoint(self.frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
290
+ else:
291
+ tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
292
+ frame_idx += 1
293
+ intermediates.append(tokens.view(B, S, P, C))
294
+
295
+ return tokens, frame_idx, intermediates
296
+
297
+ def _process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None):
298
+ """
299
+ Process global attention blocks. We keep tokens in shape (B, S*P, C).
300
+ """
301
+ if tokens.shape != (B, S * P, C):
302
+ tokens = tokens.view(B, S, P, C).view(B, S * P, C)
303
+
304
+ if pos is not None and pos.shape != (B, S * P, 2):
305
+ pos = pos.view(B, S, P, 2).view(B, S * P, 2)
306
+
307
+ intermediates = []
308
+
309
+ # by default, self.aa_block_size=1, which processes one block at a time
310
+ for _ in range(self.aa_block_size):
311
+ if self.training:
312
+ tokens = checkpoint(self.global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
313
+ else:
314
+ tokens = self.global_blocks[global_idx](tokens, pos=pos)
315
+ global_idx += 1
316
+ intermediates.append(tokens.view(B, S, P, C))
317
+
318
+ return tokens, global_idx, intermediates
319
+
320
+
321
+ def slice_expand_and_flatten(token_tensor, B, S):
322
+ """
323
+ Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
324
+ 1) Uses the first position (index=0) for the first frame only
325
+ 2) Uses the second position (index=1) for all remaining frames (S-1 frames)
326
+ 3) Expands both to match batch size B
327
+ 4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
328
+ followed by (S-1) second-position tokens
329
+ 5) Flattens to (B*S, X, C) for processing
330
+
331
+ Returns:
332
+ torch.Tensor: Processed tokens with shape (B*S, X, C)
333
+ """
334
+
335
+ # Slice out the "query" tokens => shape (1, 1, ...)
336
+ query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
337
+ # Slice out the "other" tokens => shape (1, S-1, ...)
338
+ others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
339
+ # Concatenate => shape (B, S, ...)
340
+ combined = torch.cat([query, others], dim=1)
341
+
342
+ # Finally flatten => shape (B*S, ...)
343
+ combined = combined.view(B * S, *combined.shape[2:])
344
+ return combined
345
+
346
+
347
+ def slice_expand_and_flatten_single(token_tensor, B, S):
348
+ """
349
+ Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
350
+ 1) Uses the first position (index=0) for the first frame only
351
+ 2) Uses the second position (index=1) for all remaining frames (S-1 frames)
352
+ 3) Expands both to match batch size B
353
+ 4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
354
+ followed by (S-1) second-position tokens
355
+ 5) Flattens to (B*S, X, C) for processing
356
+
357
+ Returns:
358
+ torch.Tensor: Processed tokens with shape (B*S, X, C)
359
+ """
360
+
361
+ # Slice out the "query" tokens => shape (1, 1, ...)
362
+ token = token_tensor.expand(B, S, *token_tensor.shape[2:])
363
+
364
+ # Finally flatten => shape (B*S, ...)
365
+ token = token.view(B * S, 1, *token.shape[2:])
366
+ return token
vdpm/dpm/decoder.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE-VGGT file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import torch
9
+ from torch import nn, Tensor
10
+ from torch.utils.checkpoint import checkpoint
11
+ from typing import List, Callable
12
+ from dataclasses import dataclass
13
+
14
+ from einops import repeat
15
+
16
+ from vggt.layers.block import drop_add_residual_stochastic_depth
17
+ from vggt.layers.rope import RotaryPositionEmbedding2D, PositionGetter
18
+
19
+ from vggt.layers.attention import Attention
20
+ from vggt.layers.drop_path import DropPath
21
+ from vggt.layers.layer_scale import LayerScale
22
+ from vggt.layers.mlp import Mlp
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ @dataclass
28
+ class ModulationOut:
29
+ shift: Tensor
30
+ scale: Tensor
31
+ gate: Tensor
32
+
33
+
34
+ class Modulation(nn.Module):
35
+ def __init__(self, dim: int, double: bool):
36
+ super().__init__()
37
+ self.is_double = double
38
+ self.multiplier = 6 if double else 3
39
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
40
+
41
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
42
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
43
+
44
+ return (
45
+ ModulationOut(*out[:3]),
46
+ ModulationOut(*out[3:]) if self.is_double else None,
47
+ )
48
+
49
+
50
+ class ConditionalBlock(nn.Module):
51
+ def __init__(
52
+ self,
53
+ dim: int,
54
+ num_heads: int,
55
+ mlp_ratio: float = 4.0,
56
+ qkv_bias: bool = True,
57
+ proj_bias: bool = True,
58
+ ffn_bias: bool = True,
59
+ drop: float = 0.0,
60
+ attn_drop: float = 0.0,
61
+ init_values=None,
62
+ drop_path: float = 0.0,
63
+ act_layer: Callable[..., nn.Module] = nn.GELU,
64
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
65
+ attn_class: Callable[..., nn.Module] = Attention,
66
+ ffn_layer: Callable[..., nn.Module] = Mlp,
67
+ qk_norm: bool = False,
68
+ fused_attn: bool = True, # use F.scaled_dot_product_attention or not
69
+ rope=None,
70
+ ) -> None:
71
+ super().__init__()
72
+
73
+ self.norm1 = norm_layer(dim, elementwise_affine=False)
74
+ self.modulation = Modulation(dim, double=False)
75
+
76
+ self.attn = attn_class(
77
+ dim,
78
+ num_heads=num_heads,
79
+ qkv_bias=qkv_bias,
80
+ proj_bias=proj_bias,
81
+ attn_drop=attn_drop,
82
+ proj_drop=drop,
83
+ qk_norm=qk_norm,
84
+ fused_attn=fused_attn,
85
+ rope=rope,
86
+ )
87
+
88
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
89
+
90
+ self.norm2 = norm_layer(dim)
91
+ mlp_hidden_dim = int(dim * mlp_ratio)
92
+ self.mlp = ffn_layer(
93
+ in_features=dim,
94
+ hidden_features=mlp_hidden_dim,
95
+ act_layer=act_layer,
96
+ drop=drop,
97
+ bias=ffn_bias,
98
+ )
99
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
100
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
101
+
102
+ self.sample_drop_ratio = drop_path
103
+
104
+ def forward(self, x: Tensor, pos=None, cond=None, is_global=False) -> Tensor:
105
+ B, S = cond.shape[:2]
106
+ C = x.shape[-1]
107
+ if is_global:
108
+ P = x.shape[1] // S
109
+ cond = cond.view(B * S, C)
110
+ mod, _ = self.modulation(cond)
111
+
112
+ def attn_residual_func(x: Tensor, pos=None) -> Tensor:
113
+ """
114
+ conditional attention following DiT implementation from Flux
115
+ https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py#L194-L239
116
+ """
117
+ def prepare_for_mod(y):
118
+ """reshape to modulate the patch tokens with correct conditioning one"""
119
+ return y.view(B, S, P, C).view(B * S, P, C) if is_global else y
120
+ def restore_after_mod(y):
121
+ """reshape back to global sequence"""
122
+ return y.view(B, S, P, C).view(B, S * P, C) if is_global else y
123
+
124
+ x = prepare_for_mod(x)
125
+ x = (1 + mod.scale) * self.norm1(x) + mod.shift
126
+ x = restore_after_mod(x)
127
+
128
+ x = self.attn(x, pos=pos)
129
+
130
+ x = prepare_for_mod(x)
131
+ x = mod.gate * x
132
+ x = restore_after_mod(x)
133
+
134
+ return x
135
+
136
+ def ffn_residual_func(x: Tensor) -> Tensor:
137
+ return self.ls2(self.mlp(self.norm2(x)))
138
+
139
+ if self.training and self.sample_drop_ratio > 0.1:
140
+ # the overhead is compensated only for a drop path rate larger than 0.1
141
+ x = drop_add_residual_stochastic_depth(
142
+ x,
143
+ pos=pos,
144
+ residual_func=attn_residual_func,
145
+ sample_drop_ratio=self.sample_drop_ratio,
146
+ )
147
+ x = drop_add_residual_stochastic_depth(
148
+ x,
149
+ residual_func=ffn_residual_func,
150
+ sample_drop_ratio=self.sample_drop_ratio,
151
+ )
152
+ elif self.training and self.sample_drop_ratio > 0.0:
153
+ x = x + self.drop_path1(attn_residual_func(x, pos=pos))
154
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
155
+ else:
156
+ x = x + attn_residual_func(x, pos=pos)
157
+ x = x + ffn_residual_func(x)
158
+ return x
159
+
160
+
161
+ class Decoder(nn.Module):
162
+ """Attention blocks after encoder per DPT input feature
163
+ to generate point maps at a given time.
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ cfg,
169
+ dim_in: int,
170
+ intermediate_layer_idx: List[int] = [4, 11, 17, 23],
171
+ patch_size=14,
172
+ embed_dim=1024,
173
+ depth=2,
174
+ num_heads=16,
175
+ mlp_ratio=4.0,
176
+ block_fn=ConditionalBlock,
177
+ qkv_bias=True,
178
+ proj_bias=True,
179
+ ffn_bias=True,
180
+ aa_order=["frame", "global"],
181
+ aa_block_size=1,
182
+ qk_norm=True,
183
+ rope_freq=100,
184
+ init_values=0.01,
185
+ ):
186
+ super().__init__()
187
+ self.cfg = cfg
188
+ self.intermediate_layer_idx = intermediate_layer_idx
189
+
190
+ self.depth = depth
191
+ self.aa_order = aa_order
192
+ self.patch_size = patch_size
193
+ self.aa_block_size = aa_block_size
194
+
195
+ # Validate that depth is divisible by aa_block_size
196
+ if self.depth % self.aa_block_size != 0:
197
+ raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")
198
+
199
+ self.aa_block_num = self.depth // self.aa_block_size
200
+
201
+ self.rope = (
202
+ RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
203
+ )
204
+ self.position_getter = PositionGetter() if self.rope is not None else None
205
+
206
+ self.dim_in = dim_in
207
+
208
+ self.old_decoder = False
209
+ if self.old_decoder:
210
+ self.frame_blocks = nn.ModuleList(
211
+ [
212
+ block_fn(
213
+ dim=embed_dim*2,
214
+ num_heads=num_heads,
215
+ mlp_ratio=mlp_ratio,
216
+ qkv_bias=qkv_bias,
217
+ proj_bias=proj_bias,
218
+ ffn_bias=ffn_bias,
219
+ init_values=init_values,
220
+ qk_norm=qk_norm,
221
+ rope=self.rope,
222
+ )
223
+ for _ in range(depth)
224
+ ]
225
+ )
226
+ self.global_blocks = nn.ModuleList(
227
+ [
228
+ block_fn(
229
+ dim=embed_dim*2,
230
+ num_heads=num_heads,
231
+ mlp_ratio=mlp_ratio,
232
+ qkv_bias=qkv_bias,
233
+ proj_bias=proj_bias,
234
+ ffn_bias=ffn_bias,
235
+ init_values=init_values,
236
+ qk_norm=qk_norm,
237
+ rope=self.rope,
238
+ )
239
+ for _ in range(depth)
240
+ ]
241
+ )
242
+ else:
243
+ depths = [depth]
244
+ self.frame_blocks = nn.ModuleList([
245
+ nn.ModuleList([
246
+ block_fn(
247
+ dim=embed_dim*2,
248
+ num_heads=num_heads,
249
+ mlp_ratio=mlp_ratio,
250
+ qkv_bias=qkv_bias,
251
+ proj_bias=proj_bias,
252
+ ffn_bias=ffn_bias,
253
+ init_values=init_values,
254
+ qk_norm=qk_norm,
255
+ rope=self.rope,
256
+ )
257
+ for _ in range(d)
258
+ ])
259
+ for d in depths
260
+ ])
261
+
262
+ self.global_blocks = nn.ModuleList([
263
+ nn.ModuleList([
264
+ block_fn(
265
+ dim=embed_dim*2,
266
+ num_heads=num_heads,
267
+ mlp_ratio=mlp_ratio,
268
+ qkv_bias=qkv_bias,
269
+ proj_bias=proj_bias,
270
+ ffn_bias=ffn_bias,
271
+ init_values=init_values,
272
+ qk_norm=qk_norm,
273
+ rope=self.rope,
274
+ )
275
+ for _ in range(d)
276
+ ])
277
+ for d in depths
278
+ ])
279
+
280
+ self.use_reentrant = False # hardcoded to False
281
+
282
+ def get_condition_tokens(
283
+ self,
284
+ aggregated_tokens_list: List[torch.Tensor],
285
+ cond_view_idxs: torch.Tensor
286
+ ):
287
+ # Use tokens from the last block for conditioning
288
+ tokens_last = aggregated_tokens_list[-1] # [B S N_tok D]
289
+ # Extract the camera tokens
290
+ cond_token_idx = 1
291
+ camera_tokens = tokens_last[:, :, [cond_token_idx]] # [B S D]
292
+
293
+ cond_view_idxs = cond_view_idxs.to(camera_tokens.device)
294
+ cond_view_idxs = repeat(
295
+ cond_view_idxs,
296
+ "b s -> b s c d",
297
+ c=camera_tokens.shape[2],
298
+ d=camera_tokens.shape[3],
299
+ )
300
+ cond_tokens = torch.gather(camera_tokens, 1, cond_view_idxs)
301
+
302
+ return cond_tokens
303
+
304
+ def forward(
305
+ self,
306
+ images: torch.Tensor,
307
+ aggregated_tokens_list: List[torch.Tensor],
308
+ patch_start_idx: int,
309
+ cond_view_idxs: torch.Tensor,
310
+ ):
311
+ B, S, _, H, W = images.shape
312
+
313
+ cond_tokens = self.get_condition_tokens(
314
+ aggregated_tokens_list, cond_view_idxs
315
+ )
316
+
317
+ input_tokens = []
318
+ for k, layer_idx in enumerate(self.intermediate_layer_idx):
319
+ layer_tokens = aggregated_tokens_list[layer_idx].clone()
320
+ input_tokens.append(layer_tokens)
321
+
322
+ _, _, P, C = input_tokens[0].shape
323
+
324
+ pos = None
325
+ if self.rope is not None:
326
+ pos = self.position_getter(
327
+ B * S, H // self.patch_size, W // self.patch_size, device=images.device
328
+ )
329
+ if patch_start_idx > 0:
330
+ # do not use position embedding for special tokens (camera and register tokens)
331
+ # so set pos to 0 for the special tokens
332
+ pos = pos + 1
333
+ pos_special = torch.zeros(B * S, patch_start_idx, 2).to(images.device).to(pos.dtype)
334
+ pos = torch.cat([pos_special, pos], dim=1)
335
+
336
+ frame_idx = 0
337
+ global_idx = 0
338
+ depth = len(self.frame_blocks[0])
339
+ N = len(input_tokens)
340
+ # stack all intermediate layer tokens along batch dimension
341
+ # they are all processed by the same decoder
342
+ s_tokens = torch.cat(input_tokens)
343
+ s_cond_tokens = torch.cat([cond_tokens] * N, dim=0)
344
+ s_pos = torch.cat([pos] * N, dim=0)
345
+
346
+ # perform time conditioned attention
347
+ for _ in range(depth):
348
+ for attn_type in self.aa_order:
349
+ token_idx = 0
350
+
351
+ if attn_type == "frame":
352
+ s_tokens, frame_idx, _ = self._process_frame_attention(
353
+ s_tokens, s_cond_tokens, B * N, S, P, C, frame_idx, pos=s_pos, token_idx=token_idx
354
+ )
355
+ elif attn_type == "global":
356
+ s_tokens, global_idx, _ = self._process_global_attention(
357
+ s_tokens, s_cond_tokens, B * N, S, P, C, global_idx, pos=s_pos, token_idx=token_idx
358
+ )
359
+ else:
360
+ raise ValueError(f"Unknown attention type: {attn_type}")
361
+ processed = [t.view(B, S, P, C) for t in s_tokens.split(B, dim=0)]
362
+
363
+ return processed
364
+
365
+ def _process_frame_attention(self, tokens, cond_tokens, B, S, P, C, frame_idx, pos=None, token_idx=0):
366
+ """
367
+ Process frame attention blocks. We keep tokens in shape (B*S, P, C).
368
+ """
369
+ # If needed, reshape tokens or positions:
370
+ if tokens.shape != (B * S, P, C):
371
+ tokens = tokens.view(B, S, P, C).view(B * S, P, C)
372
+
373
+ if pos is not None and pos.shape != (B * S, P, 2):
374
+ pos = pos.view(B, S, P, 2).view(B * S, P, 2)
375
+
376
+ intermediates = []
377
+ # by default, self.aa_block_size=1, which processes one block at a time
378
+ for _ in range(self.aa_block_size):
379
+ if self.training:
380
+ tokens = checkpoint(self.frame_blocks[token_idx][frame_idx], tokens, pos, cond_tokens, use_reentrant=self.use_reentrant)
381
+ else:
382
+ if self.old_decoder:
383
+ tokens = self.frame_blocks[frame_idx](tokens, pos=pos, cond=cond_tokens)
384
+ else:
385
+ tokens = self.frame_blocks[0][frame_idx](tokens, pos=pos, cond=cond_tokens)
386
+
387
+ frame_idx += 1
388
+ intermediates.append(tokens.view(B, S, P, C))
389
+
390
+ return tokens, frame_idx, intermediates
391
+
392
+ def _process_global_attention(self, tokens, cond_tokens, B, S, P, C, global_idx, pos=None, token_idx=0):
393
+ """
394
+ Process global attention blocks. We keep tokens in shape (B, S*P, C).
395
+ """
396
+ if tokens.shape != (B, S * P, C):
397
+ tokens = tokens.view(B, S, P, C).view(B, S * P, C)
398
+
399
+ if pos is not None and pos.shape != (B, S * P, 2):
400
+ pos = pos.view(B, S, P, 2).view(B, S * P, 2)
401
+
402
+ intermediates = []
403
+
404
+ # by default, self.aa_block_size=1, which processes one block at a time
405
+ for _ in range(self.aa_block_size):
406
+ if self.training:
407
+ tokens = checkpoint(self.global_blocks[token_idx][global_idx], tokens, pos, cond_tokens, True, use_reentrant=self.use_reentrant)
408
+ else:
409
+ if self.old_decoder:
410
+ tokens = self.global_blocks[global_idx](tokens, pos=pos, cond=cond_tokens, is_global=True)
411
+ else:
412
+ tokens = self.global_blocks[0][global_idx](tokens, pos=pos, cond=cond_tokens, is_global=True)
413
+ global_idx += 1
414
+ intermediates.append(tokens.view(B, S, P, C))
415
+
416
+ return tokens, global_idx, intermediates
vdpm/dpm/model.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from vggt.heads.camera_head import CameraHead
5
+ from vggt.heads.dpt_head import DPTHead
6
+
7
+ from .aggregator import Aggregator
8
+ from .decoder import Decoder
9
+
10
+
11
+ def freeze_all_params(modules):
12
+ for module in modules:
13
+ try:
14
+ for n, param in module.named_parameters():
15
+ param.requires_grad = False
16
+ except AttributeError:
17
+ # module is directly a parameter
18
+ module.requires_grad = False
19
+
20
+
21
+ class VDPM(nn.Module):
22
+ def __init__(self, cfg, img_size=518, patch_size=14, embed_dim=1024):
23
+ super().__init__()
24
+ self.cfg = cfg
25
+
26
+ self.aggregator = Aggregator(
27
+ img_size=img_size,
28
+ patch_size=patch_size,
29
+ embed_dim=embed_dim,
30
+ )
31
+ self.decoder = Decoder(
32
+ cfg,
33
+ dim_in=2*embed_dim,
34
+ embed_dim=embed_dim,
35
+ depth=cfg.model.decoder_depth
36
+ )
37
+ self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1")
38
+
39
+ self.camera_head = CameraHead(dim_in=2 * embed_dim)
40
+ self.set_freeze()
41
+
42
+ def set_freeze(self):
43
+ to_be_frozen = [self.aggregator.patch_embed]
44
+ freeze_all_params(to_be_frozen)
45
+
46
+ def forward(
47
+ self,
48
+ views, autocast_dpt=None
49
+ ):
50
+ images = torch.stack([view["img"] for view in views], dim=1)
51
+ aggregated_tokens_list, patch_start_idx = self.aggregator(images)
52
+
53
+ res_dynamic = dict()
54
+
55
+ if self.decoder is not None:
56
+ cond_view_idxs = torch.stack([view["view_idxs"][:, 1] for view in views], dim=1)
57
+ decoded_tokens = self.decoder(images, aggregated_tokens_list, patch_start_idx, cond_view_idxs)
58
+
59
+ if autocast_dpt is None:
60
+ autocast_dpt = torch.amp.autocast("cuda", enabled=False)
61
+
62
+ with autocast_dpt:
63
+ pts3d, pts3d_conf = self.point_head(
64
+ aggregated_tokens_list, images, patch_start_idx
65
+ )
66
+
67
+ padded_decoded_tokens = [None] * len(aggregated_tokens_list)
68
+ for idx, layer_idx in enumerate(self.point_head.intermediate_layer_idx):
69
+ padded_decoded_tokens[layer_idx] = decoded_tokens[idx]
70
+ pts3d_dyn, pts3d_dyn_conf = self.point_head(
71
+ padded_decoded_tokens, images, patch_start_idx
72
+ )
73
+
74
+ res_dynamic |= {
75
+ "pts3d": pts3d_dyn,
76
+ "conf": pts3d_dyn_conf
77
+ }
78
+
79
+ pose_enc_list = self.camera_head(aggregated_tokens_list)
80
+ res_dynamic |= {"pose_enc_list": pose_enc_list}
81
+
82
+ res_static = dict(
83
+ pts3d=pts3d,
84
+ conf=pts3d_conf
85
+ )
86
+ return res_static, res_dynamic
87
+
88
+ def inference(
89
+ self,
90
+ views,
91
+ images=None,
92
+ num_timesteps=None
93
+ ):
94
+ autocast_amp = torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16)
95
+
96
+ if images is None:
97
+ images = torch.stack([view["img"] for view in views], dim=1)
98
+
99
+ with autocast_amp:
100
+ aggregated_tokens_list, patch_start_idx = self.aggregator(images)
101
+ S = images.shape[1]
102
+
103
+ # Determine number of timesteps to query
104
+ if num_timesteps is None:
105
+ # Default to S if not specified (legacy behavior)
106
+ # But if views has indices, try to infer max time
107
+ if views is not None and "view_idxs" in views[0]:
108
+ try:
109
+ all_idxs = torch.cat([v["view_idxs"][:, 1] for v in views])
110
+ num_timesteps = int(all_idxs.max().item()) + 1
111
+ except:
112
+ num_timesteps = S
113
+ else:
114
+ num_timesteps = S
115
+
116
+ predictions = dict()
117
+ pointmaps = []
118
+ ones = torch.ones(1, S, dtype=torch.int64)
119
+ for time_ in range(num_timesteps):
120
+ cond_view_idxs = ones * time_
121
+
122
+ with autocast_amp:
123
+ decoded_tokens = self.decoder(images, aggregated_tokens_list, patch_start_idx, cond_view_idxs)
124
+ padded_decoded_tokens = [None] * len(aggregated_tokens_list)
125
+ for idx, layer_idx in enumerate(self.point_head.intermediate_layer_idx):
126
+ padded_decoded_tokens[layer_idx] = decoded_tokens[idx]
127
+
128
+ # ... existing code ...
129
+
130
+ pts3d, pts3d_conf = self.point_head(
131
+ padded_decoded_tokens, images, patch_start_idx
132
+ )
133
+
134
+ pointmaps.append(dict(
135
+ pts3d=pts3d,
136
+ conf=pts3d_conf
137
+ ))
138
+
139
+ pose_enc_list = self.camera_head(aggregated_tokens_list)
140
+ predictions["pose_enc"] = pose_enc_list[-1] # pose encoding of the last iteration
141
+ predictions["pose_enc_list"] = pose_enc_list
142
+ predictions["pointmaps"] = pointmaps
143
+ return predictions
144
+
145
+ def load_state_dict(self, ckpt, is_VGGT_static=False, **kw):
146
+ # don't load these VGGT heads as not needed
147
+ exclude = ["depth_head", "track_head"]
148
+ ckpt = {k:v for k, v in ckpt.items() if k.split('.')[0] not in exclude}
149
+ return super().load_state_dict(ckpt, **kw)
vdpm/examples/videos/camel.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db92c240efbd1b97a466565988a9a06687fd422086656dc0a29e12c5b99b9bb
3
+ size 1301172
vdpm/examples/videos/car.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd74efdb4d4d59fc17356fefa5dadd4c5b787641c98ce3172ecd8e5a180e76a6
3
+ size 1015132
vdpm/examples/videos/figure1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae285726e5d247e904bb1ea7887ee96733c0beea913b421abba39150a3299cd5
3
+ size 465850
vdpm/examples/videos/figure2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b2b030dd564cffbb9b2795e7fcdf97fa50e3a518df5b71dfb3dfb36f431dfa4
3
+ size 516209
vdpm/examples/videos/figure3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a4144a53f14bd2dc671376d26ecbb42b06c9b8810e1700f21a16d3e11dfbf5c
3
+ size 559096
vdpm/examples/videos/goldfish.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28912e59d0d9e6b20d26973efee4806e89e115c7f1e63aec7206384ac3d0bf78
3
+ size 668862
vdpm/examples/videos/horse.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8227c7d901a936aeab6a2b41f104dd17e5544315d4cde7dac37f5787319947e7
3
+ size 1223145