File size: 4,547 Bytes
436b829
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Demo: run LiDAR-Perfect Depth on a video sequence with the temporal Kalman
filter looped between frames.

Usage:
    python run_lpd_video.py \
        --sequence /mnt/sig/datasets/eval_video/bonn_rgbd/rgbd_bonn_balloon \
        --dataset bonn \
        --weights checkpoints/ppd.pth \
        --out outputs/bonn_balloon

Requires: PPD weights + DA-V2 weights + RAFT weights under checkpoints/.
"""
from __future__ import annotations

import argparse
import os
from pathlib import Path

import imageio.v2 as imageio
import numpy as np
import torch
from omegaconf import OmegaConf

from ppd.lpd.lpd_train import LiDARPerfectDepth
from ppd.lpd.lpd_video import run_video, VideoInferenceConfig
from ppd.utils.vis_utils import visualize_depth


def _load_pipeline(ppd_weights: str, device: str = "cuda") -> LiDARPerfectDepth:
    """Build the LiDARPerfectDepth model used by this demo.

    The configuration is hard-coded below; only the ``ppd_weights`` entry is
    taken from the caller. The constructed model is moved to ``device`` and
    switched to eval mode before being returned.

    Args:
        ppd_weights: Value stored under the ``"ppd_weights"`` config key
            (presumably a checkpoint path consumed by the model constructor
            -- confirm against ``LiDARPerfectDepth``).
        device: Target passed to ``.to(...)``.

    Returns:
        The model in eval mode on ``device``.
    """
    sparse_settings = {
        "pattern": "hybrid",
        "density": 0.005,
        "n_lines": 64,
        "line_density": 0.5,
        "grid_stride": 32,
        "min_points": 16,
        "measurement_noise_std": 0.0,
    }
    score_model_settings = {
        "depth": 24,
        "hidden_size": 1024,
        "patch_size": 8,
        "num_heads": 16,
        "in_channels": 4,
        "out_channels": 1,
    }
    timestep_settings = {
        "training": {"type": "logitnormal", "loc": 0.0, "scale": 1.0},
        "sampling": {"type": "uniform", "steps": 4},
    }
    diffusion_settings = {
        "schedule": {"type": "lerp", "T": 1000},
        "sampler": {"type": "euler", "prediction_type": "v_lerp"},
        "timesteps": timestep_settings,
    }
    settings = {
        "pretrain": False,
        "semantics_model": "DA2",
        "semantics_pth": "checkpoints/depth_anything_v2_vitl.pth",
        "ppd_weights": ppd_weights,
        "freeze_backbone": True,
        "lambda_anchor": 0.0,
        "R_proj": 0.1,
        "proj_alpha": 0.1,
        "init_P": 1.0,
        "sparse": sparse_settings,
        "score_model": score_model_settings,
        "diffusion": diffusion_settings,
    }

    model = LiDARPerfectDepth(OmegaConf.create(settings))
    model = model.to(device)
    return model.eval()


def _yield_bonn(seq_dir: str, max_frames: int | None = None):
    """Lazily yield per-frame tensors from a sequence directory.

    ``.png`` files under ``<seq_dir>/rgb`` and ``<seq_dir>/depth`` are each
    sorted by path and paired by position; any surplus files in the longer
    listing are ignored.

    Args:
        seq_dir: Directory containing the ``rgb`` and ``depth`` sub-folders.
        max_frames: Optional upper bound on the number of frames yielded.

    Yields:
        A dict with keys:
            ``"image"``: float32 tensor, pixel values divided by 255, with the
                last image axis moved first and a leading batch axis added.
            ``"depth"``: float32 tensor, raw PNG values divided by 5000, with
                two leading singleton axes (presumably metres -- confirm
                against the dataset's depth encoding).
            ``"mask"``: uint8 tensor, 1 where 0.01 < depth < 10.0, same layout
                as ``"depth"``.
            ``"frame_idx"``: zero-based position of the frame.
    """
    root = Path(seq_dir)

    def _sorted_pngs(folder):
        # Only exact ".png" suffixes qualify; sorting gives a stable pairing.
        found = [entry for entry in folder.iterdir() if entry.suffix == ".png"]
        found.sort()
        return found

    color_paths = _sorted_pngs(root / "rgb")
    depth_paths = _sorted_pngs(root / "depth")

    # Stop at the shorter listing, then apply the optional frame cap.
    count = min(len(color_paths), len(depth_paths))
    if max_frames is not None and max_frames < count:
        count = max_frames

    for idx in range(count):
        color = imageio.imread(color_paths[idx]).astype(np.float32) / 255.0
        scaled_depth = imageio.imread(depth_paths[idx]).astype(np.float32) / 5000.0
        valid = (scaled_depth > 0.01) & (scaled_depth < 10.0)

        image_t = torch.from_numpy(color).permute(2, 0, 1)[None]
        depth_t = torch.from_numpy(scaled_depth)[None, None]
        mask_t = torch.from_numpy(valid.astype(np.uint8))[None, None]

        yield {
            "image": image_t,
            "depth": depth_t,
            "mask": mask_t,
            "frame_idx": idx,
        }


def main():
    """Command-line entry point: run the video pipeline and dump visual panels.

    Parses the CLI arguments, builds the model via ``_load_pipeline``, loads
    every frame of the sequence into memory, runs ``run_video`` under bfloat16
    autocast, and writes one PNG per frame to ``--out``. Each PNG is the input
    RGB, the depth visualization and the running Kalman variance
    visualization concatenated along axis 1.

    Raises:
        ValueError: If the dataset name is not handled (argparse ``choices``
            normally rejects such values first).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--sequence", required=True, help="path to a video sequence")
    p.add_argument("--dataset", default="bonn", choices=["bonn"])
    p.add_argument("--weights", default="checkpoints/ppd.pth")
    p.add_argument("--out", default="outputs/lpd_video")
    p.add_argument("--max-frames", type=int, default=None)
    p.add_argument("--device", default="cuda")
    args = p.parse_args()

    os.makedirs(args.out, exist_ok=True)
    print(f"[lpd_video] loading pipeline from {args.weights}")
    pipeline = _load_pipeline(args.weights, args.device)

    if args.dataset == "bonn":
        # Materialized on purpose: the frames are consumed twice (inference
        # below, then again when composing the output panels).
        frames = list(_yield_bonn(args.sequence, args.max_frames))
    else:
        raise ValueError(f"unknown dataset: {args.dataset}")

    print(f"[lpd_video] running on {len(frames)} frames")
    cfg = VideoInferenceConfig()
    # autocast expects the bare backend name ("cuda", "cpu"), not an indexed
    # device such as "cuda:1"; derive it from --device instead of assuming CUDA.
    autocast_device_type = torch.device(args.device).type
    with torch.autocast(device_type=autocast_device_type, dtype=torch.bfloat16):
        outs = run_video(pipeline, frames, config=cfg)

    print(f"[lpd_video] writing visualizations to {args.out}")
    for i, (frame, out) in enumerate(zip(frames, outs)):
        # .float() first: NumPy has no bfloat16 dtype.
        depth_np = out["depth"][0, 0].float().cpu().numpy()
        var_np = out["kalman_variance_running"][0, 0].float().cpu().numpy()
        vis_d = visualize_depth(depth_np)
        # Variance is normalized over its own min/max range.
        vis_v = visualize_depth(var_np, var_np.min(), var_np.max())
        # .cpu() is a no-op for CPU tensors; it only matters if the frame
        # tensor was moved to the accelerator during inference.
        rgb_np = frame["image"][0].permute(1, 2, 0).cpu().numpy()
        panel = np.concatenate([rgb_np, vis_d, vis_v], axis=1)
        imageio.imwrite(os.path.join(args.out, f"{i:05d}.png"), (panel * 255).astype(np.uint8))

    print("[lpd_video] done")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()