Add 7 navigation models (ONNX + inference wrappers) and model card
Browse files- .gitattributes +3 -0
- CityWalker_PG_Official/citywalker.onnx +3 -0
- CityWalker_PG_Official/inference.py +203 -0
- CityWalker_PG_Official/model_info.yaml +13 -0
- GNM_GL_Official/gnm_imagegoal.onnx +3 -0
- GNM_GL_Official/gnm_imagegoal.onnx.data +3 -0
- GNM_GL_Official/inference.py +247 -0
- GNM_GL_Official/model_info.yaml +33 -0
- MBRA_PG_Official/inference.py +177 -0
- MBRA_PG_Official/mbra.onnx +3 -0
- MBRA_PG_Official/model_info.yaml +12 -0
- MIMIC/inference.py +170 -0
- MIMIC/mimic.onnx +3 -0
- MIMIC/model_info.yaml +13 -0
- NoMaD_GL_Official/inference.py +440 -0
- NoMaD_GL_Official/model_info.yaml +40 -0
- NoMaD_GL_Official/nomad_dist_pred.onnx +3 -0
- NoMaD_GL_Official/nomad_dist_pred.onnx.data +0 -0
- NoMaD_GL_Official/nomad_noise_pred.onnx +3 -0
- NoMaD_GL_Official/nomad_vision_encoder.onnx +3 -0
- NoMaD_GL_Official/nomad_vision_encoder.onnx.data +3 -0
- README.md +232 -0
- S2E/inference.py +167 -0
- S2E/model_info.yaml +12 -0
- S2E/s2e.onnx +3 -0
- Vint_GL_Official/inference.py +294 -0
- Vint_GL_Official/model_info.yaml +33 -0
- Vint_GL_Official/vint_imagegoal.onnx +3 -0
- Vint_GL_Official/vint_imagegoal.onnx.data +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
GNM_GL_Official/gnm_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
NoMaD_GL_Official/nomad_vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Vint_GL_Official/vint_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text
|
CityWalker_PG_Official/citywalker.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e8fb9ff081a883d80f0502d5b7e9046ed4b0dfafd46fe96ff5ad16194f0949a
|
| 3 |
+
size 806082192
|
CityWalker_PG_Official/inference.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CityWalker-PG-Official inference utilities.
|
| 3 |
+
|
| 4 |
+
CityWalker uses a NON-STANDARD coordinate system internally:
|
| 5 |
+
model-internal: y = forward, x = right
|
| 6 |
+
our standard: x = forward, y = left
|
| 7 |
+
|
| 8 |
+
This navigator accepts standard-frame inputs (goal_xy in x=forward, y=left)
|
| 9 |
+
and returns standard-frame outputs. The coordinate swap is handled internally.
|
| 10 |
+
|
| 11 |
+
nav = CityWalkerPGNavigator(device="cuda")
|
| 12 |
+
traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
|
| 13 |
+
vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import math
|
| 18 |
+
import numpy as np
|
| 19 |
+
import torch
|
| 20 |
+
import yaml
|
| 21 |
+
import onnxruntime as ort
|
| 22 |
+
from torchvision import transforms
|
| 23 |
+
|
| 24 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 25 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "citywalker.onnx")
|
| 26 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 27 |
+
|
| 28 |
+
# ImageNet normalization (CityWalker uses normalized images)
|
| 29 |
+
IMAGENET_TRANSFORM = transforms.Compose([
|
| 30 |
+
transforms.ToTensor(),
|
| 31 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
| 32 |
+
std=[0.229, 0.224, 0.225]),
|
| 33 |
+
])
|
| 34 |
+
IMG_H, IMG_W = 350, 630
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class PDController:
|
| 38 |
+
MAX_V = 2.5
|
| 39 |
+
MAX_W = 0.65
|
| 40 |
+
|
| 41 |
+
def __init__(self):
|
| 42 |
+
self.last_v = None
|
| 43 |
+
|
| 44 |
+
def reset(self):
|
| 45 |
+
self.last_v = None
|
| 46 |
+
|
| 47 |
+
def __call__(self, waypoints, dt=1.0):
|
| 48 |
+
EPS = 1e-6
|
| 49 |
+
dxr = waypoints[:, 2, 0]
|
| 50 |
+
dyr = waypoints[:, 2, 1]
|
| 51 |
+
idx = 0
|
| 52 |
+
dx = waypoints[:, idx, 0]
|
| 53 |
+
v = dx / dt
|
| 54 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 55 |
+
* dxr.sign() / dt)
|
| 56 |
+
near_zero = dx.abs() < EPS
|
| 57 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 58 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 59 |
+
if self.last_v is not None:
|
| 60 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 61 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 62 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 63 |
+
self.last_v = v
|
| 64 |
+
return v, w
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _std_to_cw(xy):
|
| 68 |
+
"""Standard frame (x=fwd, y=left) → CityWalker frame (y=fwd, x=right).
|
| 69 |
+
|
| 70 |
+
CityWalker model input: [y_cw, -x_cw] where y_cw=forward, x_cw=right.
|
| 71 |
+
Given standard (x_std=fwd, y_std=left):
|
| 72 |
+
y_cw = x_std (forward)
|
| 73 |
+
x_cw = -y_std (right = -left)
|
| 74 |
+
Model expects [y_cw, -x_cw] = [x_std, y_std]
|
| 75 |
+
|
| 76 |
+
Actually from the code: input_traj[..., [1,0]] then negate [0].
|
| 77 |
+
So model wants: col0 = -x_cw = y_std, col1 = y_cw = x_std.
|
| 78 |
+
"""
|
| 79 |
+
out = np.empty_like(xy)
|
| 80 |
+
out[..., 0] = -xy[..., 1] # col0 = y_std (= -x_cw)
|
| 81 |
+
out[..., 1] = xy[..., 0] # col1 = x_std (= y_cw = forward)
|
| 82 |
+
return out
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _cw_to_std(xy):
|
| 86 |
+
"""CityWalker output frame → standard frame.
|
| 87 |
+
|
| 88 |
+
Model output: [y_std, x_std] (reverse of the swap above).
|
| 89 |
+
"""
|
| 90 |
+
out = np.empty_like(xy)
|
| 91 |
+
out[..., 0] = xy[..., 1] # x_std = forward
|
| 92 |
+
out[..., 1] = -xy[..., 0] # y_std = left
|
| 93 |
+
return out
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class CityWalkerPGNavigator:
|
| 97 |
+
"""CityWalker point-goal navigator.
|
| 98 |
+
|
| 99 |
+
All user-facing coordinates are standard frame: x=forward, y=left, meters.
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
context_size = 5
|
| 103 |
+
multimodal = False
|
| 104 |
+
|
| 105 |
+
def __init__(
|
| 106 |
+
self,
|
| 107 |
+
onnx_path: str = ONNX_PATH,
|
| 108 |
+
device: str = "cuda",
|
| 109 |
+
max_v: float = 2.5,
|
| 110 |
+
max_w: float = 0.65,
|
| 111 |
+
dt: float = 1.0,
|
| 112 |
+
):
|
| 113 |
+
self.device = device
|
| 114 |
+
self.dt = dt
|
| 115 |
+
|
| 116 |
+
ort.set_default_logger_severity(3)
|
| 117 |
+
providers = [
|
| 118 |
+
("CUDAExecutionProvider",
|
| 119 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 120 |
+
"CPUExecutionProvider",
|
| 121 |
+
]
|
| 122 |
+
self._session = ort.InferenceSession(onnx_path, providers=providers)
|
| 123 |
+
with open(INFO_PATH, "r") as f:
|
| 124 |
+
self._info = yaml.safe_load(f)
|
| 125 |
+
self._controller = PDController()
|
| 126 |
+
self._controller.MAX_V = max_v
|
| 127 |
+
self._controller.MAX_W = max_w
|
| 128 |
+
self._past_traj = None
|
| 129 |
+
|
| 130 |
+
def reset(self):
|
| 131 |
+
self._controller.reset()
|
| 132 |
+
self._past_traj = None
|
| 133 |
+
|
| 134 |
+
def _preprocess_images(self, obs_rgb):
|
| 135 |
+
"""obs_rgb: (B, C, 3, H, W) float32 [0,1] → ImageNet-normalized."""
|
| 136 |
+
from PIL import Image as PILImage
|
| 137 |
+
B, C = obs_rgb.shape[:2]
|
| 138 |
+
out = np.empty((B, C, 3, IMG_H, IMG_W), dtype=np.float32)
|
| 139 |
+
for b in range(B):
|
| 140 |
+
for c in range(C):
|
| 141 |
+
# CHW [0,1] → HWC uint8
|
| 142 |
+
frame = (obs_rgb[b, c].transpose(1, 2, 0) * 255).astype(np.uint8)
|
| 143 |
+
pil = PILImage.fromarray(frame)
|
| 144 |
+
pil = pil.resize((IMG_W, IMG_H))
|
| 145 |
+
out[b, c] = IMAGENET_TRANSFORM(pil).numpy()
|
| 146 |
+
return out
|
| 147 |
+
|
| 148 |
+
def inference_trajectory(self, obs, goal_xy=None, past_traj_std=None):
|
| 149 |
+
"""Run model → (trajectory, scores).
|
| 150 |
+
|
| 151 |
+
Args:
|
| 152 |
+
obs: (B, context_size, 3, H, W) float32 in [0,1]. 5 frames.
|
| 153 |
+
goal_xy: (2,) or (B, 2) goal in standard frame [x_fwd, y_left] meters.
|
| 154 |
+
If None, uses [5.0, 0.0] (straight ahead 5m).
|
| 155 |
+
past_traj_std: (B, 6, 2) past trajectory in standard frame, or None.
|
| 156 |
+
If None, uses zeros (stationary start).
|
| 157 |
+
|
| 158 |
+
Returns:
|
| 159 |
+
trajectory: np.ndarray (B, 1, 5, 2) in standard frame
|
| 160 |
+
scores: np.ndarray (B, 1)
|
| 161 |
+
"""
|
| 162 |
+
if isinstance(obs, torch.Tensor):
|
| 163 |
+
obs_np = obs.cpu().numpy()
|
| 164 |
+
else:
|
| 165 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 166 |
+
B = obs_np.shape[0]
|
| 167 |
+
|
| 168 |
+
# Preprocess images: resize + ImageNet normalize
|
| 169 |
+
obs_norm = self._preprocess_images(obs_np)
|
| 170 |
+
|
| 171 |
+
# Build trajectory input (past 6 waypoints in CW frame)
|
| 172 |
+
if past_traj_std is None:
|
| 173 |
+
past_std = np.zeros((B, 6, 2), dtype=np.float32)
|
| 174 |
+
else:
|
| 175 |
+
past_std = np.asarray(past_traj_std, dtype=np.float32)
|
| 176 |
+
if past_std.ndim == 2:
|
| 177 |
+
past_std = past_std[np.newaxis]
|
| 178 |
+
past_std[:, -1, 0] = np.random.uniform(5.0, 10.0, size=(B,))
|
| 179 |
+
past_std[:, -1, 1] = np.random.uniform(-2.0, 2.0, size=(B,))
|
| 180 |
+
traj_cw = _std_to_cw(past_std)
|
| 181 |
+
out = self._session.run(None, {
|
| 182 |
+
"obs_images": obs_norm,
|
| 183 |
+
"trajectory": traj_cw,
|
| 184 |
+
})
|
| 185 |
+
wp_cw = out[0] # (B, 5, 2) in CW frame
|
| 186 |
+
|
| 187 |
+
# Convert back to standard frame
|
| 188 |
+
wp_std = _cw_to_std(wp_cw)
|
| 189 |
+
trajectory = wp_std[:, np.newaxis] # (B, 1, 5, 2)
|
| 190 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 191 |
+
return trajectory, scores
|
| 192 |
+
|
| 193 |
+
def inference_vw(self, obs, goal_xy=None, past_traj_std=None):
|
| 194 |
+
"""Run model → (v, w) velocity commands in standard frame.
|
| 195 |
+
|
| 196 |
+
Returns:
|
| 197 |
+
vw: torch.Tensor (B, 2)
|
| 198 |
+
"""
|
| 199 |
+
trajectory, _ = self.inference_trajectory(obs, goal_xy, past_traj_std)
|
| 200 |
+
best_traj = trajectory[:, 0] # (B, 5, 2) standard frame
|
| 201 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 202 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 203 |
+
return torch.stack([v, w], dim=1), best_traj
|
CityWalker_PG_Official/model_info.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CityWalker-PG-Official
|
| 2 |
+
goal_type: point_goal
|
| 3 |
+
image_resolution: [350, 630]
|
| 4 |
+
context_size: 5
|
| 5 |
+
input_fps: 5.0
|
| 6 |
+
obs_normalize: false
|
| 7 |
+
inputs:
|
| 8 |
+
obs_images: {shape: [batch, 5, 3, 350, 630], dtype: float32, description: "5 RGB observation frames in [0,1]"}
|
| 9 |
+
trajectory: {shape: [batch, 6, 2], dtype: float32, description: "6 past trajectory waypoints (x,y) in meters"}
|
| 10 |
+
outputs:
|
| 11 |
+
wp_pred: {shape: [batch, 5, 2], description: "5 predicted waypoints (x,y)"}
|
| 12 |
+
arrive_pred: {shape: [batch, 1], description: "Arrival probability"}
|
| 13 |
+
num_waypoints: 5
|
GNM_GL_Official/gnm_imagegoal.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8914d698a58f0f49ca8da5a2f61d646d759a2178ad6f8a89f0e863b625110c77
|
| 3 |
+
size 619733
|
GNM_GL_Official/gnm_imagegoal.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2764830aaf3d0a6189f1f2a0d5c9d2f4c725aa9cc9638a8beeae9e6c9dd59b8a
|
| 3 |
+
size 34603008
|
GNM_GL_Official/inference.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GNM-GL-Official inference wrapper (NavDP image-goal GNM in goal-free mode).
|
| 3 |
+
|
| 4 |
+
Mirror of Vint_GL_Official/inference.py — same I/O contract, only the ONNX
|
| 5 |
+
backbone and the class name differ. See that file for the action-decode and
|
| 6 |
+
preprocessing rationale.
|
| 7 |
+
|
| 8 |
+
nav = GNMGLOfficialNavigator(device="cuda")
|
| 9 |
+
traj, scores = nav.inference_trajectory(obs) # (B, 1, 5, 2) meters
|
| 10 |
+
vw = nav.inference_vw(obs) # caches nav._last_best_traj
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import math
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import yaml
|
| 18 |
+
import onnxruntime as ort
|
| 19 |
+
from torchvision import transforms
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from urbansim.custom.pp import PurePursuitController
|
| 23 |
+
except Exception:
|
| 24 |
+
PurePursuitController = None # optional — only needed for inference_vw_pp
|
| 25 |
+
|
| 26 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 27 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "gnm_imagegoal.onnx")
|
| 28 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 29 |
+
|
| 30 |
+
IMG_W, IMG_H = 85, 64
|
| 31 |
+
NUM_WAYPOINTS = 5
|
| 32 |
+
CTX_FRAMES = 6
|
| 33 |
+
WAYPOINT_INDEX = 2
|
| 34 |
+
|
| 35 |
+
IMAGE_ASPECT_RATIO = 4 / 3
|
| 36 |
+
|
| 37 |
+
TRAIN_CAM_FX = 272.547
|
| 38 |
+
TRAIN_CAM_FY = 266.358
|
| 39 |
+
TRAIN_CAM_CX = 320.0
|
| 40 |
+
TRAIN_CAM_CY = 220.0
|
| 41 |
+
TRAIN_CAM_W = 640
|
| 42 |
+
TRAIN_CAM_H = 440
|
| 43 |
+
|
| 44 |
+
DEFAULT_SRC_INTRINSICS = {
|
| 45 |
+
"fx": 210.667, "fy": 210.667,
|
| 46 |
+
"cx": 256.0, "cy": 144.0,
|
| 47 |
+
"w": 512, "h": 288,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
IMAGENET_TRANSFORM = transforms.Compose([
|
| 51 |
+
transforms.ToTensor(),
|
| 52 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
| 53 |
+
std=[0.229, 0.224, 0.225]),
|
| 54 |
+
])
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class PDController:
|
| 58 |
+
MAX_V = 2.5
|
| 59 |
+
MAX_W = 0.65
|
| 60 |
+
|
| 61 |
+
def __init__(self):
|
| 62 |
+
self.last_v = None
|
| 63 |
+
|
| 64 |
+
def reset(self):
|
| 65 |
+
self.last_v = None
|
| 66 |
+
|
| 67 |
+
def __call__(self, waypoints, dt=1.0):
|
| 68 |
+
EPS = 1e-6
|
| 69 |
+
idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
|
| 70 |
+
dx = waypoints[:, idx, 0]
|
| 71 |
+
dyr = waypoints[:, -1, 1]
|
| 72 |
+
dxr = waypoints[:, -1, 0]
|
| 73 |
+
v = dx / dt
|
| 74 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 75 |
+
* dxr.sign() / dt)
|
| 76 |
+
near_zero = dx.abs() < EPS
|
| 77 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 78 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 79 |
+
if self.last_v is not None:
|
| 80 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 81 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 82 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 83 |
+
self.last_v = v
|
| 84 |
+
return v, w
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _make_session(path):
|
| 88 |
+
ort.set_default_logger_severity(3)
|
| 89 |
+
opts = ort.SessionOptions()
|
| 90 |
+
opts.log_severity_level = 3
|
| 91 |
+
return ort.InferenceSession(
|
| 92 |
+
path, sess_options=opts,
|
| 93 |
+
providers=[
|
| 94 |
+
("CUDAExecutionProvider",
|
| 95 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 96 |
+
"CPUExecutionProvider"])
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class GNMGLOfficialNavigator:
|
| 100 |
+
"""GNM-GL-Official navigator (NavDP image-goal GNM, goal-free mode)."""
|
| 101 |
+
|
| 102 |
+
context_size = CTX_FRAMES
|
| 103 |
+
multimodal = False
|
| 104 |
+
num_waypoints = NUM_WAYPOINTS
|
| 105 |
+
|
| 106 |
+
def __init__(
|
| 107 |
+
self,
|
| 108 |
+
device: str = "cuda",
|
| 109 |
+
max_v: float = 2.5,
|
| 110 |
+
max_w: float = 0.65,
|
| 111 |
+
dt: float = 1.25,
|
| 112 |
+
center_crop: bool = True,
|
| 113 |
+
use_train_intrinsics: bool = True,
|
| 114 |
+
src_intrinsics: dict = None,
|
| 115 |
+
):
|
| 116 |
+
self.device = device
|
| 117 |
+
self.dt = dt
|
| 118 |
+
self.center_crop = center_crop
|
| 119 |
+
self.use_train_intrinsics = use_train_intrinsics
|
| 120 |
+
self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
|
| 121 |
+
self._train_affine = None
|
| 122 |
+
|
| 123 |
+
self._sess = _make_session(ONNX_PATH)
|
| 124 |
+
with open(INFO_PATH, "r") as f:
|
| 125 |
+
self._info = yaml.safe_load(f)
|
| 126 |
+
|
| 127 |
+
self._normalize_actions = bool(self._info.get("normalize", True))
|
| 128 |
+
self._waypoint_spacing = float(
|
| 129 |
+
self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
|
| 130 |
+
self.model_fps = float(self._info.get("model_fps", 3.0))
|
| 131 |
+
|
| 132 |
+
self._controller = PDController()
|
| 133 |
+
self._controller.MAX_V = max_v
|
| 134 |
+
self._controller.MAX_W = max_w
|
| 135 |
+
|
| 136 |
+
self._last_best_traj = None
|
| 137 |
+
self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
|
| 138 |
+
if PurePursuitController is not None else None)
|
| 139 |
+
|
| 140 |
+
def reset(self):
|
| 141 |
+
self._controller.reset()
|
| 142 |
+
self._last_best_traj = None
|
| 143 |
+
|
| 144 |
+
def _ensure_train_affine(self):
|
| 145 |
+
if self._train_affine is not None:
|
| 146 |
+
return
|
| 147 |
+
fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
|
| 148 |
+
fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
|
| 149 |
+
cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
|
| 150 |
+
cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
|
| 151 |
+
src = self.src_intrinsics
|
| 152 |
+
a = src["fx"] / fx_t
|
| 153 |
+
e = src["fy"] / fy_t
|
| 154 |
+
c = src["cx"] - cx_t * a
|
| 155 |
+
f = src["cy"] - cy_t * e
|
| 156 |
+
self._train_affine = (a, 0.0, c, 0.0, e, f)
|
| 157 |
+
|
| 158 |
+
def _to_model_input_uint8(self, frame_uint8):
|
| 159 |
+
from PIL import Image as PILImage
|
| 160 |
+
pil = PILImage.fromarray(frame_uint8)
|
| 161 |
+
if self.use_train_intrinsics:
|
| 162 |
+
self._ensure_train_affine()
|
| 163 |
+
pil = pil.transform(
|
| 164 |
+
(IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
|
| 165 |
+
resample=PILImage.BILINEAR, fillcolor=0)
|
| 166 |
+
return np.array(pil)
|
| 167 |
+
if self.center_crop:
|
| 168 |
+
w, h = pil.size
|
| 169 |
+
if w > h:
|
| 170 |
+
crop_w = int(h * IMAGE_ASPECT_RATIO)
|
| 171 |
+
x0 = max((w - crop_w) // 2, 0)
|
| 172 |
+
pil = pil.crop((x0, 0, x0 + crop_w, h))
|
| 173 |
+
else:
|
| 174 |
+
crop_h = int(w / IMAGE_ASPECT_RATIO)
|
| 175 |
+
y0 = max((h - crop_h) // 2, 0)
|
| 176 |
+
pil = pil.crop((0, y0, w, y0 + crop_h))
|
| 177 |
+
pil = pil.resize((IMG_W, IMG_H))
|
| 178 |
+
return np.array(pil)
|
| 179 |
+
|
| 180 |
+
def _preprocess_images(self, obs_rgb):
|
| 181 |
+
from PIL import Image as PILImage
|
| 182 |
+
B, T = obs_rgb.shape[:2]
|
| 183 |
+
frames = []
|
| 184 |
+
for b in range(B):
|
| 185 |
+
ch_list = []
|
| 186 |
+
for t in range(T):
|
| 187 |
+
frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
|
| 188 |
+
model_in = self._to_model_input_uint8(frame)
|
| 189 |
+
pil = PILImage.fromarray(model_in)
|
| 190 |
+
ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
|
| 191 |
+
frames.append(np.concatenate(ch_list, axis=0))
|
| 192 |
+
return np.stack(frames)
|
| 193 |
+
|
| 194 |
+
def inference_trajectory(self, obs):
|
| 195 |
+
"""Run GNM → metric waypoints + uniform scores.
|
| 196 |
+
|
| 197 |
+
Returns:
|
| 198 |
+
trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
|
| 199 |
+
scores: np.ndarray (B, 1)
|
| 200 |
+
"""
|
| 201 |
+
if isinstance(obs, torch.Tensor):
|
| 202 |
+
obs_np = obs.cpu().numpy()
|
| 203 |
+
else:
|
| 204 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 205 |
+
B = obs_np.shape[0]
|
| 206 |
+
|
| 207 |
+
obs_in = self._preprocess_images(obs_np)
|
| 208 |
+
fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
|
| 209 |
+
|
| 210 |
+
devnull = os.open(os.devnull, os.O_WRONLY)
|
| 211 |
+
old_stderr = os.dup(2)
|
| 212 |
+
os.dup2(devnull, 2)
|
| 213 |
+
try:
|
| 214 |
+
dist_pred, action_pred = self._sess.run(
|
| 215 |
+
None, {"obs_img": obs_in, "goal_img": fake_goal})
|
| 216 |
+
finally:
|
| 217 |
+
os.dup2(old_stderr, 2)
|
| 218 |
+
os.close(devnull)
|
| 219 |
+
os.close(old_stderr)
|
| 220 |
+
|
| 221 |
+
xy = action_pred[:, :, :2]
|
| 222 |
+
if self._normalize_actions:
|
| 223 |
+
xy = xy * self._waypoint_spacing
|
| 224 |
+
|
| 225 |
+
trajectory = xy[:, None, :, :] # (B, 1, 5, 2)
|
| 226 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 227 |
+
return trajectory.astype(np.float32), scores
|
| 228 |
+
|
| 229 |
+
def inference_vw(self, obs):
|
| 230 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 231 |
+
best_traj = trajectory[:, 0]
|
| 232 |
+
self._last_best_traj = best_traj
|
| 233 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 234 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 235 |
+
return torch.stack([v, w], dim=1)
|
| 236 |
+
|
| 237 |
+
def inference_vw_pp(self, obs, robot):
|
| 238 |
+
if self._pp is None:
|
| 239 |
+
raise RuntimeError(
|
| 240 |
+
"PurePursuitController unavailable — install urbansim or "
|
| 241 |
+
"make sure `urbansim.custom.pp` is on the Python path.")
|
| 242 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 243 |
+
best_traj = trajectory[:, 0]
|
| 244 |
+
self._last_best_traj = best_traj
|
| 245 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 246 |
+
v, w = self._pp.step(waypoints, robot=robot)
|
| 247 |
+
return torch.stack([v, w], dim=1)
|
GNM_GL_Official/model_info.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: GNM-GL-Official
|
| 2 |
+
goal_type: goal_less
|
| 3 |
+
image_resolution: [64, 85]
|
| 4 |
+
context_size: 6
|
| 5 |
+
obs_normalize: true
|
| 6 |
+
normalize: true
|
| 7 |
+
action_stats:
|
| 8 |
+
min: [0.0, -2.0]
|
| 9 |
+
max: [5.0, 2.0]
|
| 10 |
+
model_max_v: 0.8
|
| 11 |
+
model_rate: 3
|
| 12 |
+
metric_waypoint_spacing: 0.8
|
| 13 |
+
model_fps: 3
|
| 14 |
+
len_traj_pred: 5
|
| 15 |
+
learn_angle: true
|
| 16 |
+
inputs:
|
| 17 |
+
obs_img:
|
| 18 |
+
shape: [batch, 18, 64, 85]
|
| 19 |
+
dtype: float32
|
| 20 |
+
description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
|
| 21 |
+
goal_img:
|
| 22 |
+
shape: [batch, 3, 64, 85]
|
| 23 |
+
dtype: float32
|
| 24 |
+
description: Goal image (random noise for goal-free)
|
| 25 |
+
outputs:
|
| 26 |
+
dist_pred:
|
| 27 |
+
shape: [batch, 1]
|
| 28 |
+
description: Predicted distance to goal
|
| 29 |
+
action_pred:
|
| 30 |
+
shape: [batch, 5, 4]
|
| 31 |
+
description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
|
| 32 |
+
sin/cos heading
|
| 33 |
+
onnx: gnm_imagegoal.onnx
|
MBRA_PG_Official/inference.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""MBRA-PG-Official inference utilities.
|
| 3 |
+
|
| 4 |
+
MBRA takes 6 ImageNet-normalized 96x96 frames (stacked as 18 channels)
|
| 5 |
+
and a goal pose [x, y, sin(yaw), cos(yaw)] normalized by metric_spacing=0.25m.
|
| 6 |
+
|
| 7 |
+
User provides goal_xy in standard frame (x=forward, y=left, meters).
|
| 8 |
+
|
| 9 |
+
nav = MBRAPGNavigator(device="cuda")
|
| 10 |
+
traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
|
| 11 |
+
vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import math
|
| 16 |
+
import numpy as np
|
| 17 |
+
import torch
|
| 18 |
+
import onnxruntime as ort
|
| 19 |
+
import yaml
|
| 20 |
+
from torchvision import transforms
|
| 21 |
+
|
| 22 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 23 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "mbra.onnx")
|
| 24 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 25 |
+
METRIC_SPACING = 0.8 # waypoint normalization factor
|
| 26 |
+
|
| 27 |
+
IMAGENET_TRANSFORM = transforms.Compose([
|
| 28 |
+
transforms.ToTensor(),
|
| 29 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
| 30 |
+
std=[0.229, 0.224, 0.225]),
|
| 31 |
+
])
|
| 32 |
+
IMG_SIZE = 96
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class PDController:
|
| 36 |
+
MAX_V = 2.5
|
| 37 |
+
MAX_W = 0.65
|
| 38 |
+
|
| 39 |
+
def __init__(self):
|
| 40 |
+
self.last_v = None
|
| 41 |
+
|
| 42 |
+
def reset(self):
|
| 43 |
+
self.last_v = None
|
| 44 |
+
|
| 45 |
+
def __call__(self, waypoints, dt=1.0):
|
| 46 |
+
EPS = 1e-6
|
| 47 |
+
idx = min(2, waypoints.shape[1] - 1)
|
| 48 |
+
dx = waypoints[:, idx, 0]
|
| 49 |
+
dyr = waypoints[:, -1, 1]
|
| 50 |
+
dxr = waypoints[:, -1, 0]
|
| 51 |
+
v = dx / dt
|
| 52 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 53 |
+
* dxr.sign() / dt)
|
| 54 |
+
near_zero = dx.abs() < EPS
|
| 55 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 56 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 57 |
+
if self.last_v is not None:
|
| 58 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 59 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 60 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 61 |
+
self.last_v = v
|
| 62 |
+
return v, w
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class MBRAPGNavigator:
|
| 66 |
+
"""MBRA point-goal navigator.
|
| 67 |
+
|
| 68 |
+
All user-facing coordinates: x=forward, y=left, meters.
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
context_size = 6
|
| 72 |
+
multimodal = False
|
| 73 |
+
|
| 74 |
+
def __init__(
|
| 75 |
+
self,
|
| 76 |
+
onnx_path: str = ONNX_PATH,
|
| 77 |
+
device: str = "cuda",
|
| 78 |
+
max_v: float = 2.5,
|
| 79 |
+
max_w: float = 0.65,
|
| 80 |
+
dt: float = 2.5,
|
| 81 |
+
):
|
| 82 |
+
self.device = device
|
| 83 |
+
self.dt = dt
|
| 84 |
+
|
| 85 |
+
ort.set_default_logger_severity(3)
|
| 86 |
+
providers = [
|
| 87 |
+
("CUDAExecutionProvider",
|
| 88 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 89 |
+
"CPUExecutionProvider",
|
| 90 |
+
]
|
| 91 |
+
self._session = ort.InferenceSession(onnx_path, providers=providers)
|
| 92 |
+
with open(INFO_PATH, "r") as f:
|
| 93 |
+
self._info = yaml.safe_load(f)
|
| 94 |
+
self._controller = PDController()
|
| 95 |
+
self._controller.MAX_V = max_v
|
| 96 |
+
self._controller.MAX_W = max_w
|
| 97 |
+
|
| 98 |
+
def reset(self):
|
| 99 |
+
self._controller.reset()
|
| 100 |
+
|
| 101 |
+
def _preprocess_images(self, obs_rgb):
|
| 102 |
+
"""obs_rgb: (B, C, 3, H, W) float32 [0,1] → (B, C*3, 96, 96) ImageNet-norm."""
|
| 103 |
+
from PIL import Image as PILImage
|
| 104 |
+
B, C = obs_rgb.shape[:2]
|
| 105 |
+
frames = []
|
| 106 |
+
for b in range(B):
|
| 107 |
+
ch_list = []
|
| 108 |
+
for c in range(C):
|
| 109 |
+
frame = (obs_rgb[b, c].transpose(1, 2, 0) * 255).astype(np.uint8)
|
| 110 |
+
pil = PILImage.fromarray(frame).resize((IMG_SIZE, IMG_SIZE))
|
| 111 |
+
ch_list.append(IMAGENET_TRANSFORM(pil).numpy()) # (3, 96, 96)
|
| 112 |
+
frames.append(np.concatenate(ch_list, axis=0)) # (C*3, 96, 96)
|
| 113 |
+
return np.stack(frames) # (B, 18, 96, 96)
|
| 114 |
+
|
| 115 |
+
@staticmethod
|
| 116 |
+
def _goal_to_pose(goal_xy):
|
| 117 |
+
"""Convert standard goal [x_fwd, y_left] (meters) → MBRA goal_pose [4].
|
| 118 |
+
|
| 119 |
+
MBRA expects [x/spacing, y/spacing, cos(angle), sin(angle)].
|
| 120 |
+
"""
|
| 121 |
+
x, y = float(goal_xy[0]), float(goal_xy[1])
|
| 122 |
+
angle = math.atan2(y, x)
|
| 123 |
+
return np.array([
|
| 124 |
+
x / METRIC_SPACING,
|
| 125 |
+
y / METRIC_SPACING,
|
| 126 |
+
math.cos(angle),
|
| 127 |
+
math.sin(angle),
|
| 128 |
+
], dtype=np.float32)
|
| 129 |
+
|
| 130 |
+
def inference_trajectory(self, obs, goal_xy=None):
|
| 131 |
+
"""Run model → (trajectory, scores).
|
| 132 |
+
|
| 133 |
+
Args:
|
| 134 |
+
obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
|
| 135 |
+
goal_xy: (2,) goal in standard frame [x_fwd, y_left] meters.
|
| 136 |
+
If None, uses [5.0, 0.0].
|
| 137 |
+
|
| 138 |
+
Returns:
|
| 139 |
+
trajectory: np.ndarray (B, 1, 8, 2) waypoints [x, y] meters
|
| 140 |
+
scores: np.ndarray (B, 1)
|
| 141 |
+
"""
|
| 142 |
+
if isinstance(obs, torch.Tensor):
|
| 143 |
+
obs_np = obs.cpu().numpy()
|
| 144 |
+
else:
|
| 145 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 146 |
+
B = obs_np.shape[0]
|
| 147 |
+
|
| 148 |
+
obs_norm = self._preprocess_images(obs_np)
|
| 149 |
+
|
| 150 |
+
if goal_xy is None:
|
| 151 |
+
goal_xy = np.array([np.random.uniform(2.0, 10.0), np.random.uniform(-2.0, 2.0)])
|
| 152 |
+
goal_pose = self._goal_to_pose(goal_xy)
|
| 153 |
+
goal_batch = np.tile(goal_pose, (B, 1))
|
| 154 |
+
|
| 155 |
+
out = self._session.run(None, {
|
| 156 |
+
"obs_images": obs_norm,
|
| 157 |
+
"goal_pose": goal_batch,
|
| 158 |
+
})
|
| 159 |
+
wp_raw = out[0] # (B, 8, 4) — [x, y, sin(yaw), cos(yaw)]
|
| 160 |
+
|
| 161 |
+
# Extract x, y and un-normalize
|
| 162 |
+
wp_xy = wp_raw[:, :, :2] * METRIC_SPACING # (B, 8, 2) meters
|
| 163 |
+
trajectory = wp_xy[:, np.newaxis] # (B, 1, 8, 2)
|
| 164 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 165 |
+
return trajectory, scores
|
| 166 |
+
|
| 167 |
+
def inference_vw(self, obs, goal_xy=None):
|
| 168 |
+
"""Run model → (v, w).
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
vw: torch.Tensor (B, 2)
|
| 172 |
+
"""
|
| 173 |
+
trajectory, _ = self.inference_trajectory(obs, goal_xy)
|
| 174 |
+
best_traj = trajectory[:, 0]
|
| 175 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 176 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 177 |
+
return torch.stack([v, w], dim=1), best_traj
|
MBRA_PG_Official/mbra.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc42919737964d9c4db9428d8a836ad9d95007f1a41fcf5cf95526e1042f17a5
|
| 3 |
+
size 253629409
|
MBRA_PG_Official/model_info.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: MBRA-PG-Official
|
| 2 |
+
goal_type: point_goal
|
| 3 |
+
image_resolution: [96, 96]
|
| 4 |
+
context_size: 6
|
| 5 |
+
input_fps: 5.0
|
| 6 |
+
obs_normalize: true
|
| 7 |
+
inputs:
|
| 8 |
+
obs_images: {shape: [batch, 6, 3, 96, 96], dtype: float32, description: "6 RGB frames x 3 channels, ImageNet normalized"}
|
| 9 |
+
goal_pose: {shape: [batch, 4], dtype: float32, description: "Goal pose [x, y, sin(yaw), cos(yaw)] in meters/rad"}
|
| 10 |
+
outputs:
|
| 11 |
+
waypoints: {shape: [batch, 8, 4], description: "8 waypoints x 4 (x, y, sin(yaw), cos(yaw))"}
|
| 12 |
+
num_waypoints: 8
|
MIMIC/inference.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""MIMIC inference utilities.
|
| 3 |
+
|
| 4 |
+
Mirrors the style of :mod:`models.S2E_PG_Full.inference`, adapted for the
|
| 5 |
+
MIMIC ONNX (``mimic.onnx``) — point-goal, multi-modal, 16-frame context with
|
| 6 |
+
an explicit past-trajectory + past-mask input. Reference single-shot script:
|
| 7 |
+
``urban-sim-human-official/run_v3_b_onnx.py``.
|
| 8 |
+
|
| 9 |
+
The ONNX has four inputs and three outputs::
|
| 10 |
+
|
| 11 |
+
obs : (B, 16, 3, 256, 256) float32 in [0, 1]
|
| 12 |
+
goal : (B, 3) float32 = [dist/200, cos(a), sin(a)]
|
| 13 |
+
past_traj : (B, 15, 2) float32 — past XY in camera frame x 0.25
|
| 14 |
+
(usually zeros at inference)
|
| 15 |
+
past_mask : (B, 16) float32 — 1.0 for valid frames
|
| 16 |
+
|
| 17 |
+
best_reg_mid : (B, 10, 3) — best single trajectory (model-picked)
|
| 18 |
+
poses_reg_mid_denorm : (B, 62, 10, 3) — 62 candidates
|
| 19 |
+
poses_cls_mid : (B, 62) — classification scores
|
| 20 |
+
|
| 21 |
+
Outputs are in *camera frame* (col-0 = forward, col-1 = +left in the model
|
| 22 |
+
output; the reference script's ``[-col1, col0]`` swap turns col1 into the
|
| 23 |
+
controller's right-positive ``x_right``). We rescale by ``waypoint_scale``
|
| 24 |
+
(=0.25) and return waypoints in standard frame ``(x_fwd, y_left)``.
|
| 25 |
+
|
| 26 |
+
nav = MIMICNavigator(device="cuda")
|
| 27 |
+
traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
|
| 28 |
+
vw, best = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
|
| 29 |
+
nav.reset()
|
| 30 |
+
"""
|
| 31 |
+
import os
|
| 32 |
+
import math
|
| 33 |
+
import numpy as np
|
| 34 |
+
import torch
|
| 35 |
+
import yaml
|
| 36 |
+
import onnxruntime as ort
|
| 37 |
+
from urbansim.custom.pp import PurePursuitController
|
| 38 |
+
|
| 39 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 40 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "mimic.onnx")
|
| 41 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 42 |
+
HORIZON_IDX = 13
|
| 43 |
+
Scaler = 0.25
|
| 44 |
+
|
| 45 |
+
class PDController:
|
| 46 |
+
"""Convert predicted waypoints to (v, w) commands."""
|
| 47 |
+
MAX_V = 2.5
|
| 48 |
+
MAX_W = 0.65
|
| 49 |
+
|
| 50 |
+
def __init__(self):
|
| 51 |
+
self.last_v = None
|
| 52 |
+
|
| 53 |
+
def reset(self):
|
| 54 |
+
self.last_v = None
|
| 55 |
+
|
| 56 |
+
def __call__(self, waypoints, dt=1.0):
|
| 57 |
+
EPS = 1e-6
|
| 58 |
+
dxr = waypoints[:, -1, 0]
|
| 59 |
+
dyr = waypoints[:, -1, 1]
|
| 60 |
+
idx = min(6, waypoints.shape[1] - 1)
|
| 61 |
+
dx = waypoints[:, idx, 0]
|
| 62 |
+
v = dx / dt
|
| 63 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 64 |
+
* dxr.sign() / dt)
|
| 65 |
+
near_zero = dx.abs() < EPS
|
| 66 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 67 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 68 |
+
if self.last_v is not None:
|
| 69 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 70 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 71 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 72 |
+
self.last_v = v
|
| 73 |
+
return v, w
|
| 74 |
+
|
| 75 |
+
class MIMICNavigator:
|
| 76 |
+
"""
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
context_size = 16
|
| 80 |
+
multimodal = False
|
| 81 |
+
|
| 82 |
+
def __init__(
|
| 83 |
+
self,
|
| 84 |
+
onnx_path: str = ONNX_PATH,
|
| 85 |
+
device: str = "cuda",
|
| 86 |
+
max_v: float = 2.5,
|
| 87 |
+
max_w: float = 0.65,
|
| 88 |
+
dt: float = 2.0,
|
| 89 |
+
horizon_idx: int = HORIZON_IDX,
|
| 90 |
+
):
|
| 91 |
+
"""
|
| 92 |
+
Args:
|
| 93 |
+
onnx_path: Path to the ONNX model file.
|
| 94 |
+
device: "cuda" or "cpu".
|
| 95 |
+
max_v: Max linear velocity (m/s).
|
| 96 |
+
max_w: Max angular velocity (rad/s).
|
| 97 |
+
dt: Time scaling factor for PD controller.
|
| 98 |
+
horizon_idx: Number of output waypoints to keep (up to 4s).
|
| 99 |
+
"""
|
| 100 |
+
self.device = device
|
| 101 |
+
self.dt = dt
|
| 102 |
+
self.horizon_idx = horizon_idx
|
| 103 |
+
|
| 104 |
+
ort.set_default_logger_severity(3)
|
| 105 |
+
providers = [
|
| 106 |
+
("CUDAExecutionProvider",
|
| 107 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 108 |
+
"CPUExecutionProvider",
|
| 109 |
+
]
|
| 110 |
+
self._session = ort.InferenceSession(onnx_path, providers=providers)
|
| 111 |
+
with open(INFO_PATH, "r") as f:
|
| 112 |
+
self._info = yaml.safe_load(f)
|
| 113 |
+
self._controller = PDController()
|
| 114 |
+
self._controller.MAX_V = max_v
|
| 115 |
+
self._controller.MAX_W = max_w
|
| 116 |
+
|
| 117 |
+
self._pp = PurePursuitController(action_dt=4, waypoint_index=6)
|
| 118 |
+
self._last_best_traj = None
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def reset(self):
|
| 122 |
+
self._controller.reset()
|
| 123 |
+
self._last_best_traj = None
|
| 124 |
+
|
| 125 |
+
def inference_trajectory(self, obs):
|
| 126 |
+
"""Run model → (trajectory, scores).
|
| 127 |
+
|
| 128 |
+
Args:
|
| 129 |
+
obs: (B, 16, 3, 288, 512) float32 in [0,1].
|
| 130 |
+
Note: batch dim is fixed to 1 for this model.
|
| 131 |
+
|
| 132 |
+
Returns:
|
| 133 |
+
trajectory: np.ndarray (B, 1, K, 2) where K = horizon_idx
|
| 134 |
+
scores: np.ndarray (B, 1)
|
| 135 |
+
"""
|
| 136 |
+
if isinstance(obs, torch.Tensor):
|
| 137 |
+
obs_np = obs.cpu().numpy()
|
| 138 |
+
else:
|
| 139 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 140 |
+
B = obs_np.shape[0]
|
| 141 |
+
|
| 142 |
+
if B == 1:
|
| 143 |
+
out = self._session.run(None, {"input": obs_np})
|
| 144 |
+
raw = out[0]
|
| 145 |
+
else:
|
| 146 |
+
raw = np.concatenate(
|
| 147 |
+
[self._session.run(None, {"input": obs_np[i:i+1]})[0] for i in range(B)],
|
| 148 |
+
axis=0,
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# Take x, y columns up to the 4-second horizon
|
| 152 |
+
traj_xy = raw[:, :self.horizon_idx, :2].astype(np.float32)
|
| 153 |
+
# Reshape to (B, 1, K, 2) for uniform interface
|
| 154 |
+
trajectory = traj_xy[:, np.newaxis] * Scaler
|
| 155 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 156 |
+
return trajectory, scores
|
| 157 |
+
|
| 158 |
+
def inference_vw(self, obs):
|
| 159 |
+
"""Run model → (v, w) velocity commands.
|
| 160 |
+
|
| 161 |
+
Returns:
|
| 162 |
+
vw: torch.Tensor (B, 2)
|
| 163 |
+
"""
|
| 164 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 165 |
+
best_traj = trajectory[:, 0] # (B, K, 2)
|
| 166 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 167 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 168 |
+
self._last_best_traj = best_traj
|
| 169 |
+
return torch.stack([v, w], dim=1), best_traj
|
| 170 |
+
|
MIMIC/mimic.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7557512c791b824a1d693368d590ee6741af8a9aeef7974b763eb912df69d765
|
| 3 |
+
size 318192969
|
MIMIC/model_info.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: MIMIC
|
| 2 |
+
goal_type: goal_free
|
| 3 |
+
image_resolution: [288, 512]
|
| 4 |
+
context_size: 16
|
| 5 |
+
input_fps: 5.0
|
| 6 |
+
obs_normalize: false
|
| 7 |
+
inputs:
|
| 8 |
+
input: {shape: [1, 16, 3, 288, 512], dtype: float32, description: "16 RGB frames [t-15, ..., t] in [0,1]"}
|
| 9 |
+
outputs:
|
| 10 |
+
output: {shape: [1, 15, 3], description: "15 waypoints x 3 (x, y, yaw) at non-uniform timestamps [1,2,4,6,7,8,10,12,14,15,17,19,21,23,25] @ 5Hz (0.2s-5.0s)"}
|
| 11 |
+
num_waypoints: 15
|
| 12 |
+
output_timestamps_5hz: [1, 2, 4, 6, 7, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25]
|
| 13 |
+
output_fps: 5.0
|
NoMaD_GL_Official/inference.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""NoMaD-GL-Official inference utilities (3-component diffusion model).
|
| 3 |
+
|
| 4 |
+
NoMaD uses iterative DDPM diffusion (10 steps) with 3 ONNX components:
|
| 5 |
+
1. vision_encoder: obs + goal_img + goal_mask → conditioning vector
|
| 6 |
+
2. noise_pred: single denoising step
|
| 7 |
+
3. dist_pred: distance estimation
|
| 8 |
+
|
| 9 |
+
Goal-free mode: goal_mask=1, goal_img=random noise.
|
| 10 |
+
|
| 11 |
+
Action decode (NavDP / official NoMaD convention):
|
| 12 |
+
Diffusion output is normalized delta-actions clipped to [-1, 1]. To recover
|
| 13 |
+
metric waypoints in the robot frame:
|
| 14 |
+
1. unnormalize: d = (norm + 1)/2 * (max - min) + min (dataset units)
|
| 15 |
+
2. cumsum over time → cumulative positions (dataset units)
|
| 16 |
+
3. multiply by `metric_waypoint_spacing` (m) → meters (robot frame)
|
| 17 |
+
|
| 18 |
+
nav = NoMaDGLNavigator(device="cuda")
|
| 19 |
+
traj, scores = nav.inference_trajectory(obs) # (B, S, 8, 2) meters
|
| 20 |
+
vw = nav.inference_vw(obs) # caches nav._last_best_traj
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import os
|
| 24 |
+
import math
|
| 25 |
+
import numpy as np
|
| 26 |
+
import torch
|
| 27 |
+
import yaml
|
| 28 |
+
import onnxruntime as ort
|
| 29 |
+
from torchvision import transforms
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
from urbansim.custom.pp import PurePursuitController
|
| 33 |
+
except Exception:
|
| 34 |
+
PurePursuitController = None # optional — only needed for inference_vw_pp
|
| 35 |
+
|
| 36 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 37 |
+
VE_PATH = os.path.join(MODEL_DIR, "nomad_vision_encoder.onnx")
|
| 38 |
+
NP_PATH = os.path.join(MODEL_DIR, "nomad_noise_pred.onnx")
|
| 39 |
+
DP_PATH = os.path.join(MODEL_DIR, "nomad_dist_pred.onnx")
|
| 40 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 41 |
+
IMG_SIZE = 96
|
| 42 |
+
NUM_DIFFUSION_STEPS = 10
|
| 43 |
+
NUM_SAMPLES = 8
|
| 44 |
+
WAYPOINT_INDEX = 2 # which waypoint to use for v/w
|
| 45 |
+
IMAGE_ASPECT_RATIO = 4 / 3 # NavDP optional center-crop target
|
| 46 |
+
|
| 47 |
+
# Training camera intrinsics (NoMaD recon dataset, from NavDP data_config.yaml).
|
| 48 |
+
# Used by the optional `use_train_intrinsics=True` rectification path which
|
| 49 |
+
# warps the source pinhole view to look like a recon-camera capture squished
|
| 50 |
+
# to 96x96 — i.e. the geometric input distribution the model saw at training.
|
| 51 |
+
TRAIN_CAM_FX = 272.547
|
| 52 |
+
TRAIN_CAM_FY = 266.358
|
| 53 |
+
TRAIN_CAM_CX = 320.0
|
| 54 |
+
TRAIN_CAM_CY = 220.0
|
| 55 |
+
TRAIN_CAM_W = 640 # = 2 * cx
|
| 56 |
+
TRAIN_CAM_H = 440 # = 2 * cy
|
| 57 |
+
|
| 58 |
+
# Default source intrinsics: the urban-sim undistorted pinhole image emitted
|
| 59 |
+
# by test_inference_forward.py:build_undistort_maps. Override at __init__ time
|
| 60 |
+
# if you change the upstream fisheye→pinhole step.
|
| 61 |
+
DEFAULT_SRC_INTRINSICS = {
|
| 62 |
+
"fx": 210.667, "fy": 210.667,
|
| 63 |
+
"cx": 256.0, "cy": 144.0,
|
| 64 |
+
"w": 512, "h": 288,
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
IMAGENET_TRANSFORM = transforms.Compose([
|
| 68 |
+
transforms.ToTensor(),
|
| 69 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
| 70 |
+
std=[0.229, 0.224, 0.225]),
|
| 71 |
+
])
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class PDController:
|
| 75 |
+
MAX_V = 2.5
|
| 76 |
+
MAX_W = 0.65
|
| 77 |
+
|
| 78 |
+
def __init__(self):
|
| 79 |
+
self.last_v = None
|
| 80 |
+
|
| 81 |
+
def reset(self):
|
| 82 |
+
self.last_v = None
|
| 83 |
+
|
| 84 |
+
def __call__(self, waypoints, dt=1.0):
|
| 85 |
+
EPS = 1e-6
|
| 86 |
+
idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
|
| 87 |
+
dx = waypoints[:, idx, 0]
|
| 88 |
+
dyr = waypoints[:, -1, 1]
|
| 89 |
+
dxr = waypoints[:, -1, 0]
|
| 90 |
+
v = dx / dt
|
| 91 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 92 |
+
* dxr.sign() / dt)
|
| 93 |
+
near_zero = dx.abs() < EPS
|
| 94 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 95 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 96 |
+
if self.last_v is not None:
|
| 97 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 98 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 99 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 100 |
+
self.last_v = v
|
| 101 |
+
return v, w
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _make_session(path):
|
| 105 |
+
ort.set_default_logger_severity(3)
|
| 106 |
+
opts = ort.SessionOptions()
|
| 107 |
+
opts.log_severity_level = 3
|
| 108 |
+
return ort.InferenceSession(
|
| 109 |
+
path, sess_options=opts,
|
| 110 |
+
providers=[
|
| 111 |
+
("CUDAExecutionProvider",
|
| 112 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 113 |
+
"CPUExecutionProvider"])
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class NoMaDGLNavigator:
|
| 117 |
+
"""NoMaD goal-free diffusion navigator.
|
| 118 |
+
|
| 119 |
+
Runs DDPM diffusion loop (10 steps) to generate trajectory samples.
|
| 120 |
+
All coordinates: x=forward, y=left, meters.
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
context_size = 4 # 4 obs frames → 12 channels
|
| 124 |
+
multimodal = True
|
| 125 |
+
|
| 126 |
+
def __init__(
|
| 127 |
+
self,
|
| 128 |
+
device: str = "cuda",
|
| 129 |
+
max_v: float = 2.5,
|
| 130 |
+
max_w: float = 0.65,
|
| 131 |
+
dt: float = 2.0,
|
| 132 |
+
num_samples: int = NUM_SAMPLES,
|
| 133 |
+
num_diffusion_steps: int = NUM_DIFFUSION_STEPS,
|
| 134 |
+
center_crop: bool = True,
|
| 135 |
+
use_train_intrinsics: bool = True,
|
| 136 |
+
src_intrinsics: dict = None,
|
| 137 |
+
):
|
| 138 |
+
self.device = device
|
| 139 |
+
self.dt = dt
|
| 140 |
+
self.num_samples = num_samples
|
| 141 |
+
self.num_steps = num_diffusion_steps
|
| 142 |
+
# NavDP default is center_crop=False (direct resize). Flag exposed for
|
| 143 |
+
# A/B testing: True crops landscape input to 4:3 before resizing.
|
| 144 |
+
self.center_crop = center_crop
|
| 145 |
+
|
| 146 |
+
# Optional virtual rectification to the NoMaD training camera (recon).
|
| 147 |
+
# When True, the source pinhole frame is warped (affine, same math as
|
| 148 |
+
# a homography between two pinhole cameras) so that its content lines
|
| 149 |
+
# up with what a recon-camera frame squished to 96x96 would look like.
|
| 150 |
+
self.use_train_intrinsics = use_train_intrinsics
|
| 151 |
+
self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
|
| 152 |
+
self._train_affine = None # built lazily in _ensure_train_affine
|
| 153 |
+
|
| 154 |
+
self._ve = _make_session(VE_PATH)
|
| 155 |
+
self._np = _make_session(NP_PATH)
|
| 156 |
+
self._dp = _make_session(DP_PATH)
|
| 157 |
+
with open(INFO_PATH, "r") as f:
|
| 158 |
+
self._info = yaml.safe_load(f)
|
| 159 |
+
|
| 160 |
+
# Action decoder params (NavDP convention)
|
| 161 |
+
action_stats = self._info.get("action_stats", {})
|
| 162 |
+
self._act_min = np.asarray(
|
| 163 |
+
action_stats.get("min", [0.0, -4.0]), dtype=np.float32)
|
| 164 |
+
self._act_max = np.asarray(
|
| 165 |
+
action_stats.get("max", [5.0, 4.0]), dtype=np.float32)
|
| 166 |
+
self._normalize_actions = bool(self._info.get("normalize", True))
|
| 167 |
+
self._waypoint_spacing = float(
|
| 168 |
+
self._info.get("metric_waypoint_spacing", 0.25))
|
| 169 |
+
# Model's expected observation frame rate (Hz). NoMaD was trained at
|
| 170 |
+
# 3 Hz; callers should downsample input video to this rate.
|
| 171 |
+
self.model_fps = float(self._info.get("model_fps", 3.0))
|
| 172 |
+
|
| 173 |
+
self._controller = PDController()
|
| 174 |
+
self._controller.MAX_V = max_v
|
| 175 |
+
self._controller.MAX_W = max_w
|
| 176 |
+
|
| 177 |
+
# Cache slot for visualization — last (B, 8, 2) trajectory chosen by
|
| 178 |
+
# the action head. Matches Coco's contract so downstream viewers
|
| 179 |
+
# (`model._last_best_traj`) just work.
|
| 180 |
+
self._last_best_traj = None
|
| 181 |
+
|
| 182 |
+
# Optional pure-pursuit controller for `inference_vw_pp`.
|
| 183 |
+
self._pp = (PurePursuitController(action_dt=3, waypoint_index=5)
|
| 184 |
+
if PurePursuitController is not None else None)
|
| 185 |
+
|
| 186 |
+
# Build DDPM schedule (squaredcos_cap_v2)
|
| 187 |
+
self._build_schedule()
|
| 188 |
+
|
| 189 |
+
def _build_schedule(self):
|
| 190 |
+
"""Pre-compute DDPM alpha/beta schedule (matches diffusers squaredcos_cap_v2)."""
|
| 191 |
+
T = self.num_steps
|
| 192 |
+
|
| 193 |
+
def alpha_bar(t):
|
| 194 |
+
return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
|
| 195 |
+
|
| 196 |
+
betas = []
|
| 197 |
+
for i in range(T):
|
| 198 |
+
t1 = i / T
|
| 199 |
+
t2 = (i + 1) / T
|
| 200 |
+
b = min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999)
|
| 201 |
+
betas.append(b)
|
| 202 |
+
betas = np.array(betas, dtype=np.float64)
|
| 203 |
+
alphas = 1.0 - betas
|
| 204 |
+
alpha_cumprod = np.cumprod(alphas)
|
| 205 |
+
|
| 206 |
+
self._betas = betas
|
| 207 |
+
self._alpha_cumprod = alpha_cumprod
|
| 208 |
+
self._sqrt_alpha = np.sqrt(alphas)
|
| 209 |
+
self._sqrt_one_minus_alpha_cumprod = np.sqrt(1.0 - alpha_cumprod)
|
| 210 |
+
# For x0 prediction clipping
|
| 211 |
+
self._sqrt_recip_alpha_cumprod = np.sqrt(1.0 / alpha_cumprod)
|
| 212 |
+
self._sqrt_recip_alpha_cumprod_m1 = np.sqrt(1.0 / alpha_cumprod - 1.0)
|
| 213 |
+
|
| 214 |
+
def _ddpm_step(self, sample, noise_pred, t):
|
| 215 |
+
"""Single DDPM reverse step: x_{t-1} from x_t and predicted noise."""
|
| 216 |
+
alpha_prod_t = self._alpha_cumprod[t]
|
| 217 |
+
beta_t = self._betas[t]
|
| 218 |
+
|
| 219 |
+
# Predict x0
|
| 220 |
+
x0_pred = (sample - math.sqrt(1.0 - alpha_prod_t) * noise_pred) / math.sqrt(alpha_prod_t)
|
| 221 |
+
x0_pred = np.clip(x0_pred, -1.0, 1.0)
|
| 222 |
+
|
| 223 |
+
# Compute mean of q(x_{t-1} | x_t, x_0)
|
| 224 |
+
if t > 0:
|
| 225 |
+
alpha_prod_prev = self._alpha_cumprod[t - 1]
|
| 226 |
+
beta_prod_t = 1.0 - alpha_prod_t
|
| 227 |
+
beta_prod_prev = 1.0 - alpha_prod_prev
|
| 228 |
+
coeff_x0 = math.sqrt(alpha_prod_prev) * beta_t / beta_prod_t
|
| 229 |
+
coeff_xt = math.sqrt(1.0 - beta_t) * beta_prod_prev / beta_prod_t
|
| 230 |
+
mean = coeff_x0 * x0_pred + coeff_xt * sample
|
| 231 |
+
# Variance
|
| 232 |
+
variance = beta_t * beta_prod_prev / beta_prod_t
|
| 233 |
+
noise = np.random.randn(*sample.shape).astype(np.float32)
|
| 234 |
+
return (mean + math.sqrt(variance) * noise).astype(np.float32)
|
| 235 |
+
else:
|
| 236 |
+
return x0_pred.astype(np.float32)
|
| 237 |
+
|
| 238 |
+
def reset(self):
|
| 239 |
+
self._controller.reset()
|
| 240 |
+
self._last_best_traj = None
|
| 241 |
+
|
| 242 |
+
def _ensure_train_affine(self):
|
| 243 |
+
"""Build PIL affine coeffs mapping output (96x96) pixels → source pixels.
|
| 244 |
+
|
| 245 |
+
Both cameras are pinhole, so the ray (X/Z, Y/Z) for a target pixel
|
| 246 |
+
(u_t, v_t) is ((u_t - cx_t)/fx_t, (v_t - cy_t)/fy_t). Re-projecting that
|
| 247 |
+
ray into the source camera gives (u_s, v_s) = (ray_x * fx_s + cx_s,
|
| 248 |
+
ray_y * fy_s + cy_s). Substituting yields an affine map:
|
| 249 |
+
|
| 250 |
+
u_s = (fx_s / fx_t) * u_t + (cx_s - cx_t * fx_s / fx_t)
|
| 251 |
+
v_s = (fy_s / fy_t) * v_t + (cy_s - cy_t * fy_s / fy_t)
|
| 252 |
+
|
| 253 |
+
PIL.Image.transform expects (a, b, c, d, e, f) where:
|
| 254 |
+
u_s = a*u_t + b*v_t + c
|
| 255 |
+
v_s = d*u_t + e*v_t + f
|
| 256 |
+
"""
|
| 257 |
+
if self._train_affine is not None:
|
| 258 |
+
return
|
| 259 |
+
# Effective recon intrinsics when its 640x440 native frame is squished
|
| 260 |
+
# to 96x96 (which is what the model saw during training).
|
| 261 |
+
fx_t = TRAIN_CAM_FX * IMG_SIZE / TRAIN_CAM_W
|
| 262 |
+
fy_t = TRAIN_CAM_FY * IMG_SIZE / TRAIN_CAM_H
|
| 263 |
+
cx_t = TRAIN_CAM_CX * IMG_SIZE / TRAIN_CAM_W
|
| 264 |
+
cy_t = TRAIN_CAM_CY * IMG_SIZE / TRAIN_CAM_H
|
| 265 |
+
|
| 266 |
+
src = self.src_intrinsics
|
| 267 |
+
a = src["fx"] / fx_t
|
| 268 |
+
e = src["fy"] / fy_t
|
| 269 |
+
c = src["cx"] - cx_t * a
|
| 270 |
+
f = src["cy"] - cy_t * e
|
| 271 |
+
self._train_affine = (a, 0.0, c, 0.0, e, f)
|
| 272 |
+
|
| 273 |
+
def _to_model_input_uint8(self, frame_uint8):
|
| 274 |
+
"""Apply preprocessing to one (H, W, 3) uint8 RGB frame → (96, 96, 3) uint8.
|
| 275 |
+
|
| 276 |
+
Used for both the actual model input and for visualization (panel 2).
|
| 277 |
+
Does NOT apply ImageNet normalization.
|
| 278 |
+
"""
|
| 279 |
+
from PIL import Image as PILImage
|
| 280 |
+
pil = PILImage.fromarray(frame_uint8)
|
| 281 |
+
if self.use_train_intrinsics:
|
| 282 |
+
self._ensure_train_affine()
|
| 283 |
+
pil = pil.transform(
|
| 284 |
+
(IMG_SIZE, IMG_SIZE), PILImage.AFFINE, self._train_affine,
|
| 285 |
+
resample=PILImage.BILINEAR, fillcolor=0)
|
| 286 |
+
return np.array(pil)
|
| 287 |
+
if self.center_crop:
|
| 288 |
+
w, h = pil.size
|
| 289 |
+
if w > h:
|
| 290 |
+
crop_w = int(h * IMAGE_ASPECT_RATIO)
|
| 291 |
+
x0 = max((w - crop_w) // 2, 0)
|
| 292 |
+
pil = pil.crop((x0, 0, x0 + crop_w, h))
|
| 293 |
+
else:
|
| 294 |
+
crop_h = int(w / IMAGE_ASPECT_RATIO)
|
| 295 |
+
y0 = max((h - crop_h) // 2, 0)
|
| 296 |
+
pil = pil.crop((0, y0, w, y0 + crop_h))
|
| 297 |
+
pil = pil.resize((IMG_SIZE, IMG_SIZE))
|
| 298 |
+
return np.array(pil)
|
| 299 |
+
|
| 300 |
+
def _preprocess_images(self, obs_rgb):
|
| 301 |
+
"""obs_rgb: (B, T=4, 3, H, W) float32 [0,1] → (B, 12, 96, 96) ImageNet-norm."""
|
| 302 |
+
from PIL import Image as PILImage
|
| 303 |
+
B, T = obs_rgb.shape[:2]
|
| 304 |
+
frames = []
|
| 305 |
+
for b in range(B):
|
| 306 |
+
ch_list = []
|
| 307 |
+
for t in range(T):
|
| 308 |
+
frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
|
| 309 |
+
model_in = self._to_model_input_uint8(frame)
|
| 310 |
+
pil = PILImage.fromarray(model_in)
|
| 311 |
+
ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
|
| 312 |
+
frames.append(np.concatenate(ch_list, axis=0))
|
| 313 |
+
return np.stack(frames) # (B, 12, 96, 96)
|
| 314 |
+
|
| 315 |
+
def _decode_actions(self, sample):
|
| 316 |
+
"""Decode normalized deltas → metric waypoints (NavDP recipe).
|
| 317 |
+
|
| 318 |
+
sample: (N, T, 2) normalized deltas in [-1, 1]
|
| 319 |
+
Returns: (N, T, 2) cumulative positions in meters (x=fwd, y=left).
|
| 320 |
+
"""
|
| 321 |
+
# 1. Unnormalize from [-1, 1] → [min, max] (dataset units).
|
| 322 |
+
deltas = (sample + 1.0) / 2.0
|
| 323 |
+
deltas = deltas * (self._act_max - self._act_min) + self._act_min
|
| 324 |
+
# 2. Cumulative sum over time.
|
| 325 |
+
positions = np.cumsum(deltas, axis=1)
|
| 326 |
+
# 3. Scale by metric_waypoint_spacing (only when training-time normalize=True).
|
| 327 |
+
if self._normalize_actions:
|
| 328 |
+
positions = positions * self._waypoint_spacing
|
| 329 |
+
return positions.astype(np.float32)
|
| 330 |
+
|
| 331 |
+
def inference_trajectory(self, obs):
|
| 332 |
+
"""Run diffusion model → (trajectories, scores).
|
| 333 |
+
|
| 334 |
+
Args:
|
| 335 |
+
obs: (B, 4, 3, H, W) float32 in [0,1]. 4 frames.
|
| 336 |
+
|
| 337 |
+
Returns:
|
| 338 |
+
trajectory: np.ndarray (B, num_samples, 8, 2) — METERS, robot frame
|
| 339 |
+
(x=forward, y=left). Decoded from normalized deltas via
|
| 340 |
+
unnorm → cumsum → scale.
|
| 341 |
+
scores: np.ndarray (B, num_samples) — uniform (no scoring in goal-free)
|
| 342 |
+
"""
|
| 343 |
+
if isinstance(obs, torch.Tensor):
|
| 344 |
+
obs_np = obs.cpu().numpy()
|
| 345 |
+
else:
|
| 346 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 347 |
+
B = obs_np.shape[0]
|
| 348 |
+
|
| 349 |
+
obs_norm = self._preprocess_images(obs_np) # (B, 12, 96, 96)
|
| 350 |
+
|
| 351 |
+
# Goal-free: random goal image + mask=1
|
| 352 |
+
fake_goal = np.random.randn(B, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
|
| 353 |
+
mask = np.ones(B, dtype=np.int64)
|
| 354 |
+
|
| 355 |
+
# 1. Vision encoder
|
| 356 |
+
cond = self._ve.run(None, {
|
| 357 |
+
"obs_img": obs_norm,
|
| 358 |
+
"goal_img": fake_goal,
|
| 359 |
+
"goal_mask": mask,
|
| 360 |
+
})[0] # (B, 256)
|
| 361 |
+
|
| 362 |
+
# Replicate for num_samples
|
| 363 |
+
cond_rep = np.repeat(cond, self.num_samples, axis=0) # (B*S, 256)
|
| 364 |
+
|
| 365 |
+
# 2. Diffusion denoising loop (output: normalized deltas in [-1,1]).
|
| 366 |
+
BS = B * self.num_samples
|
| 367 |
+
sample = np.random.randn(BS, 8, 2).astype(np.float32)
|
| 368 |
+
|
| 369 |
+
for t in reversed(range(self.num_steps)):
|
| 370 |
+
t_arr = np.array(t, dtype=np.int64)
|
| 371 |
+
# Suppress stderr from ORT during inference
|
| 372 |
+
devnull = os.open(os.devnull, os.O_WRONLY)
|
| 373 |
+
old_stderr = os.dup(2)
|
| 374 |
+
os.dup2(devnull, 2)
|
| 375 |
+
try:
|
| 376 |
+
noise_pred = self._np.run(None, {
|
| 377 |
+
"sample": sample,
|
| 378 |
+
"timestep": t_arr,
|
| 379 |
+
"global_cond": cond_rep,
|
| 380 |
+
})[0]
|
| 381 |
+
finally:
|
| 382 |
+
os.dup2(old_stderr, 2)
|
| 383 |
+
os.close(devnull)
|
| 384 |
+
os.close(old_stderr)
|
| 385 |
+
|
| 386 |
+
sample = self._ddpm_step(sample, noise_pred, t)
|
| 387 |
+
|
| 388 |
+
# 3. Decode normalized deltas → metric waypoints.
|
| 389 |
+
waypoints = self._decode_actions(sample) # (BS, 8, 2) meters
|
| 390 |
+
|
| 391 |
+
trajectory = waypoints.reshape(B, self.num_samples, 8, 2)
|
| 392 |
+
scores = np.ones((B, self.num_samples), dtype=np.float32)
|
| 393 |
+
return trajectory, scores
|
| 394 |
+
|
| 395 |
+
def inference_vw(self, obs):
|
| 396 |
+
"""Run model → (v, w) velocity commands.
|
| 397 |
+
|
| 398 |
+
Picks the first diffusion sample (goal-free has uniform scores, so any
|
| 399 |
+
sample is equally valid), runs the PD controller on its waypoints, and
|
| 400 |
+
caches the chosen trajectory on ``self._last_best_traj`` so visualizers
|
| 401 |
+
can pull it without re-running inference.
|
| 402 |
+
|
| 403 |
+
Args:
|
| 404 |
+
obs: (B, 4, 3, H, W) float32 in [0, 1]
|
| 405 |
+
|
| 406 |
+
Returns:
|
| 407 |
+
vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
|
| 408 |
+
"""
|
| 409 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 410 |
+
best_traj = trajectory[:, 0] # (B, 8, 2)
|
| 411 |
+
self._last_best_traj = best_traj # cached for visualization
|
| 412 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 413 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 414 |
+
return torch.stack([v, w], dim=1), best_traj
|
| 415 |
+
|
| 416 |
+
def inference_vw_pp(self, obs, robot):
|
| 417 |
+
"""Pure-pursuit variant of :meth:`inference_vw`.
|
| 418 |
+
|
| 419 |
+
Picks the first diffusion sample's waypoints and feeds them to the
|
| 420 |
+
urbansim ``PurePursuitController`` instead of the PD head. Same return
|
| 421 |
+
contract as :meth:`inference_vw`.
|
| 422 |
+
|
| 423 |
+
Args:
|
| 424 |
+
obs: (B, 4, 3, H, W) float32 in [0, 1]
|
| 425 |
+
robot: IsaacLab articulation handle (``env.scene["robot"]``).
|
| 426 |
+
|
| 427 |
+
Returns:
|
| 428 |
+
vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
|
| 429 |
+
"""
|
| 430 |
+
if self._pp is None:
|
| 431 |
+
raise RuntimeError(
|
| 432 |
+
"PurePursuitController unavailable — install urbansim or "
|
| 433 |
+
"make sure `urbansim.custom.pp` is on the Python path."
|
| 434 |
+
)
|
| 435 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 436 |
+
best_traj = trajectory[:, 0] # (B, 8, 2)
|
| 437 |
+
self._last_best_traj = best_traj # cached for visualization
|
| 438 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 439 |
+
v, w = self._pp.step(waypoints, robot=robot)
|
| 440 |
+
return torch.stack([v, w], dim=1)
|
NoMaD_GL_Official/model_info.yaml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: NoMaD-GL-Official
|
| 2 |
+
goal_type: goal_less
|
| 3 |
+
image_resolution: [96, 96]
|
| 4 |
+
context_size: 4
|
| 5 |
+
obs_normalize: true
|
| 6 |
+
diffusion_steps: 10
|
| 7 |
+
diffusion_scheduler: DDPMScheduler
|
| 8 |
+
diffusion_config:
|
| 9 |
+
beta_schedule: squaredcos_cap_v2
|
| 10 |
+
clip_sample: true
|
| 11 |
+
prediction_type: epsilon
|
| 12 |
+
inputs:
|
| 13 |
+
obs_img: {shape: [batch, 12, 96, 96], dtype: float32, description: "4 RGB frames x 3 channels, ImageNet normalized"}
|
| 14 |
+
goal_img: {shape: [batch, 3, 96, 96], dtype: float32, description: "Goal image (random noise for goal-free)"}
|
| 15 |
+
goal_mask: {shape: [batch], dtype: int64, description: "1=goal-free, 0=goal-conditioned"}
|
| 16 |
+
outputs:
|
| 17 |
+
trajectories: {shape: [num_samples, 8, 2], description: "8 normalized delta-actions in [-1,1]; decode via unnorm→cumsum→scale"}
|
| 18 |
+
distances: {shape: [batch, 1], description: "Predicted distance to goal"}
|
| 19 |
+
num_waypoints: 8
|
| 20 |
+
num_samples: 8
|
| 21 |
+
waypoint_index: 2
|
| 22 |
+
|
| 23 |
+
# NoMaD action decoder (NavDP convention)
|
| 24 |
+
# 1. unnormalize: d = (norm + 1) / 2 * (max - min) + min
|
| 25 |
+
# 2. cumsum over time → cumulative positions in dataset units
|
| 26 |
+
# 3. multiply by metric_waypoint_spacing → meters in robot frame (x=fwd, y=left)
|
| 27 |
+
normalize: true
|
| 28 |
+
action_stats:
|
| 29 |
+
min: [0.0, -4.0] # [min_dx, min_dy] in dataset waypoint-spacing units
|
| 30 |
+
max: [5.0, 4.0] # [max_dx, max_dy]
|
| 31 |
+
# Per-step metric spacing (meters): model_max_v / model_rate = 0.8 / 3.0
|
| 32 |
+
model_max_v: 0.8
|
| 33 |
+
model_rate: 3.0
|
| 34 |
+
metric_waypoint_spacing: 0.26667
|
| 35 |
+
|
| 36 |
+
# Inference frame rate (Hz). NoMaD was trained at 3 Hz observation rate; the
|
| 37 |
+
# 4-frame context spans ~1.33 s and each predicted waypoint is 1/model_fps s
|
| 38 |
+
# into the future. Upstream callers should downsample their input video to
|
| 39 |
+
# match this — see test_inference_forward.py.
|
| 40 |
+
model_fps: 3.0
|
NoMaD_GL_Official/nomad_dist_pred.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:adab58bef61d7b8d0f2bf04636853f0056229e06634e2536a5e7821d6b9def4f
|
| 3 |
+
size 71682
|
NoMaD_GL_Official/nomad_dist_pred.onnx.data
ADDED
|
Binary file (69.6 kB). View file
|
|
|
NoMaD_GL_Official/nomad_noise_pred.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3588e1ff53ba748240f160d3cd3fec0c9c987f05e59ec64a631c3536e036e34
|
| 3 |
+
size 15554469
|
NoMaD_GL_Official/nomad_vision_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3477772d7ff671dbba9711c58b2df89deb9b1b6a93cae07c530524ab6d47377
|
| 3 |
+
size 47986697
|
NoMaD_GL_Official/nomad_vision_encoder.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57ceee7e8f0777b6e5cb444cddb1e17549e8188c78ba414463696b130031b320
|
| 3 |
+
size 47448064
|
README.md
CHANGED
|
@@ -1,3 +1,235 @@
|
|
| 1 |
---
|
| 2 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- robotics
|
| 5 |
+
- navigation
|
| 6 |
+
- visual-navigation
|
| 7 |
+
- embodied-ai
|
| 8 |
+
- onnx
|
| 9 |
+
pipeline_tag: robotics
|
| 10 |
---
|
| 11 |
+
|
| 12 |
+
# Navigation Model Zoo
|
| 13 |
+
|
| 14 |
+
A collection of vision-based navigation policies exported to **ONNX**, each wrapped in a small,
|
| 15 |
+
uniform Python inference API. Maintained by **Honglin He @ UCLA-VAIL**.
|
| 16 |
+
|
| 17 |
+
Every model takes a short history of RGB frames and predicts a local trajectory (and optionally a
|
| 18 |
+
distance-to-goal / arrival signal); a built-in PD controller turns the trajectory into `(v, ω)`
|
| 19 |
+
velocity commands. All models share the same wrapper interface so they can be swapped and
|
| 20 |
+
benchmarked without per-model glue code.
|
| 21 |
+
|
| 22 |
+
## Models
|
| 23 |
+
|
| 24 |
+
| Folder | Model / paper | Goal mode | Context | Input H×W | Waypoints | Weights |
|
| 25 |
+
|--------|---------------|-----------|:-------:|:---------:|:---------:|---------|
|
| 26 |
+
| [`GNM_GL_Official`](GNM_GL_Official) | [GNM](https://arxiv.org/abs/2210.03370) · ICRA 2023 | goal-free | 6 | 64×85 | 5 | `gnm_imagegoal.onnx` (+`.data`) · 35 MB |
|
| 27 |
+
| [`Vint_GL_Official`](Vint_GL_Official) | [ViNT](https://arxiv.org/abs/2306.14846) · CoRL 2023 | goal-free | 6 | 64×85 | 5 | `vint_imagegoal.onnx` (+`.data`) · 97 MB |
|
| 28 |
+
| [`NoMaD_GL_Official`](NoMaD_GL_Official) | [NoMaD](https://arxiv.org/abs/2310.07896) · ICRA 2024 | goal-free (diffusion) | 4 | 96×96 | 8 ×8 samples | 3× `.onnx` (+`.data`) · 111 MB |
|
| 29 |
+
| [`CityWalker_PG_Official`](CityWalker_PG_Official) | [CityWalker](https://arxiv.org/abs/2411.17820) · CVPR 2025 | point-goal | 5 | 350×630 | 5 | `citywalker.onnx` · 806 MB |
|
| 30 |
+
| [`MBRA_PG_Official`](MBRA_PG_Official) | [MBRA](https://arxiv.org/abs/2505.05592) · RA-L 2025 | point-goal | 6 | 96×96 | 8 | `mbra.onnx` · 254 MB |
|
| 31 |
+
| [`S2E`](S2E) | [S2E](https://arxiv.org/abs/2507.22028) · ICLR 2026 | point-goal / goal-free | 11 | 256×256 | 10 | `s2e.onnx` · 382 MB |
|
| 32 |
+
| [`MIMIC`](MIMIC) | [MIMIC](https://arxiv.org/abs/2603.22527) · ICRA 2026 | goal-free | 16 | 288×512 | 13 | `mimic.onnx` · 318 MB |
|
| 33 |
+
|
| 34 |
+
Suffix legend: `PG` = point-goal, `GL` = goal-less (goal-free). Models with a `.onnx.data` companion
|
| 35 |
+
(GNM, ViNT, NoMaD) use ONNX external weights — keep each `.onnx` and its `.onnx.data` together.
|
| 36 |
+
|
| 37 |
+
## Common interface
|
| 38 |
+
|
| 39 |
+
Each folder is a self-contained module exposing one navigator class. They all follow the same contract:
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
import numpy as np
|
| 43 |
+
from MBRA_PG_Official.inference import MBRAPGNavigator # run from the repo root
|
| 44 |
+
|
| 45 |
+
nav = MBRAPGNavigator(device="cuda") # use device="cpu" if you have no GPU
|
| 46 |
+
|
| 47 |
+
# obs: (B, nav.context_size, 3, H, W) float32 in [0, 1]
|
| 48 |
+
# the wrapper resizes & normalizes to the model's spec internally
|
| 49 |
+
obs = np.random.rand(1, nav.context_size, 3, 96, 96).astype(np.float32)
|
| 50 |
+
|
| 51 |
+
# Point-goal models take goal_xy (standard frame: x=forward, y=left, meters);
|
| 52 |
+
# goal-free models omit it.
|
| 53 |
+
traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2])) # (B, M, W, 2) meters
|
| 54 |
+
vw, best = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2])) # vw: (B, 2) = [v, ω]
|
| 55 |
+
|
| 56 |
+
nav.reset() # clears PD-controller velocity smoothing between episodes
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
Conventions shared by every model:
|
| 60 |
+
|
| 61 |
+
- **Coordinate frame** — all user-facing inputs/outputs are *standard frame*: `x = forward`, `y = left`, in meters. Models with a different internal convention (e.g. CityWalker) convert transparently.
|
| 62 |
+
- **Observations** — `(B, context_size, 3, H, W)`, `float32`, pixel values in `[0, 1]`. The wrapper handles resize and any ImageNet normalization. *(Exception: `MIMIC` expects frames already at 288×512 and does not resize.)*
|
| 63 |
+
- **`inference_trajectory(obs[, goal_xy])`** → `(trajectory, scores)`. `trajectory` is `(B, M, W, 2)` in meters, where `M` is the number of modes (1 for unimodal, 8 for NoMaD) and `W` the waypoint count; `scores` is `(B, M)`.
|
| 64 |
+
- **`inference_vw(obs[, goal_xy])`** → `(vw, best_traj)` where `vw` is a `(B, 2)` torch tensor of `[linear_v, angular_w]`. Tune limits with `max_v` / `max_w` at construction.
|
| 65 |
+
- Goal-free models (`Vint`, `GNM`, `NoMaD`, `MIMIC`) ignore `goal_xy` — call `inference_trajectory(obs)`.
|
| 66 |
+
|
| 67 |
+
## Installation
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
pip install onnxruntime-gpu numpy torch torchvision pyyaml pillow
|
| 71 |
+
# CPU-only: use onnxruntime instead of onnxruntime-gpu
|
| 72 |
+
pip install opencv-python # required by S2E (frame resizing)
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
Optional, lab-internal dependency: `Vint`, `GNM`, and `NoMaD` expose an extra `inference_vw_pp()`
|
| 76 |
+
method that uses `urbansim.custom.pp.PurePursuitController`; it is imported lazily and only needed
|
| 77 |
+
for that method. **`MIMIC` imports `urbansim` at module load**, so its `inference.py` will not import
|
| 78 |
+
without the `urbansim` package on your path.
|
| 79 |
+
|
| 80 |
+
## Model details
|
| 81 |
+
|
| 82 |
+
### GNM_GL_Official — `gnm_imagegoal.onnx` (+ `.onnx.data`)
|
| 83 |
+
**Paper:** *GNM: A General Navigation Model to Drive Any Robot* (ICRA 2023) · [arXiv:2210.03370](https://arxiv.org/abs/2210.03370) · [code](https://github.com/robodhruv/drive-any-robot)
|
| 84 |
+
|
| 85 |
+
Goal-free General Navigation Model — same NavDP image-goal I/O contract as ViNT (`obs_img (B,18,64,85)` + `goal_img (B,3,64,85)` → `dist_pred (B,1)`, `action_pred (B,5,4)`), with a lower top speed. Expects input downsampled to ≈ 3 Hz.
|
| 86 |
+
|
| 87 |
+
### Vint_GL_Official — `vint_imagegoal.onnx` (+ `.onnx.data`)
|
| 88 |
+
**Paper:** *ViNT: A Foundation Model for Visual Navigation* (CoRL 2023) · [arXiv:2306.14846](https://arxiv.org/abs/2306.14846) · [project](https://general-navigation-models.github.io/vint/)
|
| 89 |
+
|
| 90 |
+
Goal-free ViNT (NavDP image-goal backbone run with a random goal image). **ONNX I/O:** `obs_img (B,18,64,85)` (6 ImageNet-normalized frames × 3 ch) + `goal_img (B,3,64,85)` (random noise) → `dist_pred (B,1)`, `action_pred (B,5,4)`. Cumulative `xy` is already baked in; the wrapper scales by the 0.8 m metric spacing. Expects input downsampled to ≈ 3 Hz.
|
| 91 |
+
|
| 92 |
+
### NoMaD_GL_Official — 3× ONNX (diffusion, + `.onnx.data`)
|
| 93 |
+
**Paper:** *NoMaD: Goal Masked Diffusion Policies for Navigation and Exploration* (ICRA 2024) · [arXiv:2310.07896](https://arxiv.org/abs/2310.07896) · [project](https://general-navigation-models.github.io/nomad/)
|
| 94 |
+
|
| 95 |
+
Goal-free diffusion policy. Runs a 10-step DDPM loop (`squaredcos_cap_v2`) over 3 components:
|
| 96 |
+
`nomad_vision_encoder.onnx` (`obs_img (B,12,96,96)` + `goal_img (B,3,96,96)` + `goal_mask (B)` → `cond (B,256)`), `nomad_noise_pred.onnx` (one denoising step), and `nomad_dist_pred.onnx`. Produces **8 trajectory samples** → `trajectory (B,8,8,2)` meters (decode: unnormalize → cumsum → ×0.267 m spacing). This is the only multi-modal model and the slowest (diffusion + multiple samples).
|
| 97 |
+
|
| 98 |
+
### CityWalker_PG_Official — `citywalker.onnx`
|
| 99 |
+
**Paper:** *CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos* (CVPR 2025) · [arXiv:2411.17820](https://arxiv.org/abs/2411.17820) · [project](https://ai4ce.github.io/CityWalker/)
|
| 100 |
+
|
| 101 |
+
Point-goal urban walker. **ONNX I/O:** `obs_images (B,5,3,350,630)` + `trajectory (B,6,2)` past waypoints → `wp_pred (B,5,2)`, `arrive_pred (B,1)` (arrival probability). Images are ImageNet-normalized internally; the model's internal `y=forward, x=right` frame is converted to standard frame by the wrapper. Input rate ≈ 5 Hz.
|
| 102 |
+
|
| 103 |
+
### MBRA_PG_Official — `mbra.onnx`
|
| 104 |
+
**Paper:** *Learning to Drive Anywhere with Model-Based Reannotation* (RA-L 2025) · [arXiv:2505.05592](https://arxiv.org/abs/2505.05592) · [project](https://model-base-reannotation.github.io/)
|
| 105 |
+
|
| 106 |
+
Point-goal policy. **ONNX I/O:** `obs_images (B,6,3,96,96)` ImageNet-normalized + `goal_pose (B,4)` = `[x, y, sin(yaw), cos(yaw)]` → `waypoints (B,8,4)`. Goal is given as `goal_xy` (meters) and converted internally; waypoints are un-normalized by a 0.8 m metric spacing. Input rate ≈ 5 Hz.
|
| 107 |
+
|
| 108 |
+
### S2E — `s2e.onnx`
|
| 109 |
+
**Paper:** *From Seeing to Experiencing: Scaling Navigation Foundation Models with Reinforcement Learning* (ICLR 2026) · [arXiv:2507.22028](https://arxiv.org/abs/2507.22028) · [project](https://metadriverse.github.io/s2e)
|
| 110 |
+
|
| 111 |
+
UCLA-VAIL navigation foundation model; this is the behavior-cloning, point-goal, web-pretrained variant (`S2EBC-PG-Web100`). **ONNX I/O:** `obs_images (B,11,3,256,256)` in `[0,1]` (no ImageNet norm) + `goal (B,3)` = `[norm_dist, cos(θ), sin(θ)]` → `wp_pred (B,10,3)` `[x,y,yaw]`, `wp_pred_score (B,63)` mode scores. Frames are resized to 256×256 with OpenCV.
|
| 112 |
+
|
| 113 |
+
### MIMIC — `mimic.onnx`
|
| 114 |
+
**Paper:** *Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion* (ICRA 2026) · [arXiv:2603.22527](https://arxiv.org/abs/2603.22527) · [project](https://vail-ucla.github.io/MIMIC)
|
| 115 |
+
|
| 116 |
+
UCLA-VAIL goal-free long-context sidewalk policy. **ONNX I/O:** `input (1,16,3,288,512)` in `[0,1]` → `output (1,15,3)` `[x,y,yaw]` at non-uniform timestamps (0.2 s–5.0 s @ 5 Hz). Batch is processed one sample at a time; the wrapper keeps the first 13 waypoints (~4 s) and scales to meters. Requires `urbansim` (see Installation).
|
| 117 |
+
|
| 118 |
+
## Downloading
|
| 119 |
+
|
| 120 |
+
**Full repo** (includes the LFS-tracked ONNX weights):
|
| 121 |
+
```bash
|
| 122 |
+
hf download UCLA-VAIL/Navigation-Model-Zoo-Public --local-dir ./Navigation-Model-Zoo-Public
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**One model** — fetch just its folder, e.g. MBRA:
|
| 126 |
+
```bash
|
| 127 |
+
hf download UCLA-VAIL/Navigation-Model-Zoo-Public \
|
| 128 |
+
--include "MBRA_PG_Official/*" --local-dir .
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
Then run from the repo root: `from MBRA_PG_Official.inference import MBRAPGNavigator`.
|
| 132 |
+
|
| 133 |
+
> **External weights:** GNM, ViNT, and NoMaD ship `*.onnx.data` files — keep each `.onnx` and its
|
| 134 |
+
> `.onnx.data` together in the same folder so ONNX Runtime can resolve the weights.
|
| 135 |
+
|
| 136 |
+
## Intended use & limitations
|
| 137 |
+
|
| 138 |
+
These are **research artifacts** for navigation research, reproduction, and benchmarking — not
|
| 139 |
+
safety-validated for deployment on real robots without additional testing. Each policy's behavior
|
| 140 |
+
is bounded by its training distribution (camera intrinsics, embodiment, frame rate, environment).
|
| 141 |
+
Several wrappers rectify/resize inputs to a specific training camera; mismatched cameras may degrade
|
| 142 |
+
performance.
|
| 143 |
+
|
| 144 |
+
## License
|
| 145 |
+
|
| 146 |
+
Released under **Apache 2.0**. Individual models carry the licenses and terms of their original
|
| 147 |
+
sources (ViNT, GNM, NoMaD, CityWalker, MBRA) — check upstream before commercial use.
|
| 148 |
+
|
| 149 |
+
## Citation
|
| 150 |
+
|
| 151 |
+
If you use a model from this zoo, please cite its original paper.
|
| 152 |
+
|
| 153 |
+
**GNM**
|
| 154 |
+
```bibtex
|
| 155 |
+
@inproceedings{shah2023gnm,
|
| 156 |
+
title={Gnm: A general navigation model to drive any robot},
|
| 157 |
+
author={Shah, Dhruv and Sridhar, Ajay and Bhorkar, Arjun and Hirose, Noriaki and Levine, Sergey},
|
| 158 |
+
booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)},
|
| 159 |
+
pages={7226--7233},
|
| 160 |
+
year={2023},
|
| 161 |
+
organization={IEEE}
|
| 162 |
+
}
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
**ViNT**
|
| 166 |
+
```bibtex
|
| 167 |
+
@article{shah2023vint,
|
| 168 |
+
title={ViNT: A foundation model for visual navigation},
|
| 169 |
+
author={Shah, Dhruv and Sridhar, Ajay and Dashora, Nitish and Stachowicz, Kyle and Black, Kevin and Hirose, Noriaki and Levine, Sergey},
|
| 170 |
+
journal={arXiv preprint arXiv:2306.14846},
|
| 171 |
+
year={2023}
|
| 172 |
+
}
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
**NoMaD**
|
| 176 |
+
```bibtex
|
| 177 |
+
@inproceedings{sridhar2024nomad,
|
| 178 |
+
title={Nomad: Goal masked diffusion policies for navigation and exploration},
|
| 179 |
+
author={Sridhar, Ajay and Shah, Dhruv and Glossop, Catherine and Levine, Sergey},
|
| 180 |
+
booktitle={2024 IEEE International Conference on Robotics and Automation (ICRA)},
|
| 181 |
+
pages={63--70},
|
| 182 |
+
year={2024},
|
| 183 |
+
organization={IEEE}
|
| 184 |
+
}
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
**CityWalker**
|
| 188 |
+
```bibtex
|
| 189 |
+
@inproceedings{liu2025citywalker,
|
| 190 |
+
title={Citywalker: Learning embodied urban navigation from web-scale videos},
|
| 191 |
+
author={Liu, Xinhao and Li, Jintong and Jiang, Yicheng and Sujay, Niranjan and Yang, Zhicheng and Zhang, Juexiao and Abanes, John and Zhang, Jing and Feng, Chen},
|
| 192 |
+
booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},
|
| 193 |
+
pages={6875--6885},
|
| 194 |
+
year={2025}
|
| 195 |
+
}
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
**MBRA**
|
| 199 |
+
```bibtex
|
| 200 |
+
@article{hirose2025learning,
|
| 201 |
+
title={Learning to drive anywhere with model-based reannotation},
|
| 202 |
+
author={Hirose, Noriaki and Ignatova, Lydia and Stachowicz, Kyle and Glossop, Catherine and Levine, Sergey and Shah, Dhruv},
|
| 203 |
+
journal={IEEE Robotics and Automation Letters},
|
| 204 |
+
volume={11},
|
| 205 |
+
number={2},
|
| 206 |
+
pages={1242--1249},
|
| 207 |
+
year={2025},
|
| 208 |
+
publisher={IEEE}
|
| 209 |
+
}
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
**S2E**
|
| 213 |
+
```bibtex
|
| 214 |
+
@article{he2025seeing,
|
| 215 |
+
title={From seeing to experiencing: Scaling navigation foundation models with reinforcement learning},
|
| 216 |
+
author={He, Honglin and Ma, Yukai and Squicciarini, Brad and Wu, Wayne and Zhou, Bolei},
|
| 217 |
+
journal={arXiv preprint arXiv:2507.22028},
|
| 218 |
+
year={2025}
|
| 219 |
+
}
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
**MIMIC**
|
| 223 |
+
```bibtex
|
| 224 |
+
@article{he2026learning,
|
| 225 |
+
title={Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion},
|
| 226 |
+
author={He, Honglin and Ma, Yukai and Squicciarini, Brad and Wu, Wayne and Zhou, Bolei},
|
| 227 |
+
journal={arXiv preprint arXiv:2603.22527},
|
| 228 |
+
year={2026}
|
| 229 |
+
}
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
## Contact
|
| 233 |
+
|
| 234 |
+
Maintained by [UCLA-VAIL](https://vail-ucla.github.io/). Open an issue/discussion on the
|
| 235 |
+
repository page for questions or contributions.
|
S2E/inference.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""S2EBC-PG-Web100 inference utilities.
|
| 3 |
+
|
| 4 |
+
S2EBC-PG takes 11 frames at 256x256 (no ImageNet normalization, [0,1]),
|
| 5 |
+
a goal vector [norm_dist, cos(angle), sin(angle)], and outputs
|
| 6 |
+
10 waypoints x 3 (x, y, yaw). Waypoints are scaled by 1/0.1 = 10x.
|
| 7 |
+
|
| 8 |
+
User provides goal_xy in standard frame (x=forward, y=left, meters).
|
| 9 |
+
|
| 10 |
+
nav = S2EBCPGNavigator(device="cuda")
|
| 11 |
+
traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
|
| 12 |
+
vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import math
|
| 17 |
+
import numpy as np
|
| 18 |
+
import torch
|
| 19 |
+
import onnxruntime as ort
|
| 20 |
+
|
| 21 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "s2e.onnx")
|
| 23 |
+
IMG_SIZE = 256
|
| 24 |
+
WP_SCALE = 0.25 # model outputs are divided by this to get meters
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class PDController:
|
| 28 |
+
MAX_V = 2.5
|
| 29 |
+
MAX_W = 0.65
|
| 30 |
+
|
| 31 |
+
def __init__(self):
|
| 32 |
+
self.last_v = None
|
| 33 |
+
|
| 34 |
+
def reset(self):
|
| 35 |
+
self.last_v = None
|
| 36 |
+
|
| 37 |
+
def __call__(self, waypoints, dt=1.0):
|
| 38 |
+
EPS = 1e-6
|
| 39 |
+
idx = min(4, waypoints.shape[1] - 1)
|
| 40 |
+
dx = waypoints[:, idx, 0]
|
| 41 |
+
dyr = waypoints[:, -1, 1]
|
| 42 |
+
dxr = waypoints[:, -1, 0]
|
| 43 |
+
v = dx / dt
|
| 44 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 45 |
+
* dxr.sign() / dt )
|
| 46 |
+
near_zero = dx.abs() < EPS
|
| 47 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 48 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 49 |
+
if self.last_v is not None:
|
| 50 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 51 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 52 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 53 |
+
self.last_v = v
|
| 54 |
+
return v, w
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class S2ENavigator:
|
| 58 |
+
"""S2E point-goal / goal-free navigator.
|
| 59 |
+
|
| 60 |
+
All user-facing coordinates: x=forward, y=left, meters.
|
| 61 |
+
"""
|
| 62 |
+
|
| 63 |
+
context_size = 11
|
| 64 |
+
multimodal = False
|
| 65 |
+
|
| 66 |
+
def __init__(
|
| 67 |
+
self,
|
| 68 |
+
onnx_path: str = ONNX_PATH,
|
| 69 |
+
device: str = "cuda",
|
| 70 |
+
max_v: float = 2.5,
|
| 71 |
+
max_w: float = 0.65,
|
| 72 |
+
dt: float = 2.0,
|
| 73 |
+
):
|
| 74 |
+
self.device = device
|
| 75 |
+
self.dt = dt
|
| 76 |
+
|
| 77 |
+
ort.set_default_logger_severity(3)
|
| 78 |
+
providers = [
|
| 79 |
+
("CUDAExecutionProvider",
|
| 80 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 81 |
+
"CPUExecutionProvider",
|
| 82 |
+
]
|
| 83 |
+
self._session = ort.InferenceSession(onnx_path, providers=providers)
|
| 84 |
+
self._controller = PDController()
|
| 85 |
+
self._controller.MAX_V = max_v
|
| 86 |
+
self._controller.MAX_W = max_w
|
| 87 |
+
self._last_best_traj = None
|
| 88 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 89 |
+
import yaml
|
| 90 |
+
with open(INFO_PATH, "r") as f:
|
| 91 |
+
self._info = yaml.safe_load(f)
|
| 92 |
+
|
| 93 |
+
def reset(self):
|
| 94 |
+
self._controller.reset()
|
| 95 |
+
self._last_best_traj = None
|
| 96 |
+
|
| 97 |
+
@staticmethod
|
| 98 |
+
def _goal_to_input(goal_xy):
|
| 99 |
+
"""Standard goal [x_fwd, y_left] meters → model input [norm_dist, cos(θ), sin(θ)]."""
|
| 100 |
+
x, y = float(goal_xy[0]), float(goal_xy[1])
|
| 101 |
+
dist = math.sqrt(x * x + y * y)
|
| 102 |
+
norm_dist = max(min(dist, 200.0), 0.1) / 200.0
|
| 103 |
+
angle = math.atan2(y, x)
|
| 104 |
+
return np.array([norm_dist, math.cos(angle), math.sin(angle)],
|
| 105 |
+
dtype=np.float32)
|
| 106 |
+
|
| 107 |
+
def inference_trajectory(self, obs, goal_xy=None):
|
| 108 |
+
"""Run model → (trajectory, scores).
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
obs: (B, 11, 3, H, W) float32 in [0,1]. 11 frames.
|
| 112 |
+
Images are resized internally to 256x256.
|
| 113 |
+
goal_xy: (2,) goal in standard frame, or None → [5.0, 0.0].
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
trajectory: np.ndarray (B, 1, 10, 2) meters
|
| 117 |
+
scores: np.ndarray (B, 1)
|
| 118 |
+
"""
|
| 119 |
+
if isinstance(obs, torch.Tensor):
|
| 120 |
+
obs_np = obs.cpu().numpy()
|
| 121 |
+
else:
|
| 122 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 123 |
+
B = obs_np.shape[0]
|
| 124 |
+
|
| 125 |
+
# Resize to 256x256 if needed (obs is CHW)
|
| 126 |
+
if obs_np.shape[-2:] != (IMG_SIZE, IMG_SIZE):
|
| 127 |
+
import cv2
|
| 128 |
+
resized = np.empty((B, 11, 3, IMG_SIZE, IMG_SIZE), dtype=np.float32)
|
| 129 |
+
for b in range(B):
|
| 130 |
+
for c in range(11):
|
| 131 |
+
frame_hwc = obs_np[b, c].transpose(1, 2, 0) # HWC
|
| 132 |
+
frame_rsz = cv2.resize(frame_hwc, (IMG_SIZE, IMG_SIZE))
|
| 133 |
+
resized[b, c] = frame_rsz.transpose(2, 0, 1)
|
| 134 |
+
obs_np = resized
|
| 135 |
+
|
| 136 |
+
if goal_xy is None:
|
| 137 |
+
goal_xy = np.array([5.0, 0.0])
|
| 138 |
+
goal_input = self._goal_to_input(goal_xy)
|
| 139 |
+
goal_batch = np.tile(goal_input, (B, 1))
|
| 140 |
+
|
| 141 |
+
all_wp_raw = []
|
| 142 |
+
for i in range(B):
|
| 143 |
+
out = self._session.run(None, {
|
| 144 |
+
"obs_images": obs_np[i:i+1], # (1, 11, 3, 256, 256)
|
| 145 |
+
"goal": goal_batch[i:i+1], # (1, 3)
|
| 146 |
+
})
|
| 147 |
+
all_wp_raw.append(out[0][:, :, :2])
|
| 148 |
+
|
| 149 |
+
wp_raw = np.concatenate(all_wp_raw, axis=0) # (B, 10, 2) — x, y (scaled)
|
| 150 |
+
wp_meters = wp_raw * WP_SCALE # un-scale to meters
|
| 151 |
+
|
| 152 |
+
trajectory = wp_meters[:, np.newaxis].astype(np.float32) # (B, 1, 10, 2)
|
| 153 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 154 |
+
self._last_best_traj = trajectory[:, 0]
|
| 155 |
+
return trajectory, scores
|
| 156 |
+
|
| 157 |
+
def inference_vw(self, obs, goal_xy=None):
|
| 158 |
+
"""Run model → (v, w).
|
| 159 |
+
|
| 160 |
+
Returns:
|
| 161 |
+
vw: torch.Tensor (B, 2)
|
| 162 |
+
"""
|
| 163 |
+
trajectory, _ = self.inference_trajectory(obs, goal_xy)
|
| 164 |
+
best_traj = trajectory[:, 0]
|
| 165 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 166 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 167 |
+
return torch.stack([v, w], dim=1), best_traj
|
S2E/model_info.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: S2E
|
| 2 |
+
goal_type: point_goal / goal-free (random goal)
|
| 3 |
+
image_resolution: [256, 256]
|
| 4 |
+
context_size: 11
|
| 5 |
+
obs_normalize: false
|
| 6 |
+
inputs:
|
| 7 |
+
obs_images: {shape: [batch, 11, 3, 256, 256], dtype: float32, description: "11 RGB observation frames in [0,1]"}
|
| 8 |
+
goal: {shape: [batch, 3], dtype: float32, description: "Goal (x, y, yaw) in meters/rad"}
|
| 9 |
+
outputs:
|
| 10 |
+
wp_pred: {shape: [batch, 10, 3], description: "10 waypoints x 3 (x, y, yaw)"}
|
| 11 |
+
wp_pred_score: {shape: [batch, 63], description: "Mode selection scores"}
|
| 12 |
+
num_waypoints: 10
|
S2E/s2e.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee1410ab55a54946cf25323e82b3be2a8cb106e2ccb2ad878e7638839686108a
|
| 3 |
+
size 381993971
|
Vint_GL_Official/inference.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Vint-GL-Official inference wrapper (NavDP image-goal ViNT in goal-free mode).
|
| 3 |
+
|
| 4 |
+
The ONNX has two inputs (obs_img, goal_img) and two outputs (dist, action).
|
| 5 |
+
Goal-free mode: pass random noise as goal_img — same trick NavDP uses in
|
| 6 |
+
its `ViNTAgent.step_nogoal`.
|
| 7 |
+
|
| 8 |
+
nav = VintGLOfficialNavigator(device="cuda")
|
| 9 |
+
traj, scores = nav.inference_trajectory(obs) # (B, 1, 5, 2) meters
|
| 10 |
+
vw = nav.inference_vw(obs) # caches nav._last_best_traj
|
| 11 |
+
|
| 12 |
+
Action decode (NavDP convention):
|
| 13 |
+
The model bakes `torch.cumsum` into its forward, so the ONNX output
|
| 14 |
+
`action_pred[:, :, :2]` is *already* cumulative — we just scale by
|
| 15 |
+
`metric_waypoint_spacing` to get meters in the robot frame (x=fwd, y=left).
|
| 16 |
+
No unnormalize step (unlike NoMaD).
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import math
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
import yaml
|
| 24 |
+
import onnxruntime as ort
|
| 25 |
+
from torchvision import transforms
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
from urbansim.custom.pp import PurePursuitController
|
| 29 |
+
except Exception:
|
| 30 |
+
PurePursuitController = None # optional — only needed for inference_vw_pp
|
| 31 |
+
|
| 32 |
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 33 |
+
ONNX_PATH = os.path.join(MODEL_DIR, "vint_imagegoal.onnx")
|
| 34 |
+
INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
|
| 35 |
+
|
| 36 |
+
# NavDP training image size for ViNT (width, height).
|
| 37 |
+
IMG_W, IMG_H = 85, 64
|
| 38 |
+
NUM_WAYPOINTS = 5
|
| 39 |
+
CTX_FRAMES = 6 # context_size=5 past + 1 current
|
| 40 |
+
WAYPOINT_INDEX = 2
|
| 41 |
+
|
| 42 |
+
IMAGE_ASPECT_RATIO = 4 / 3
|
| 43 |
+
|
| 44 |
+
# Recon training camera (for the optional virtual rectification path).
|
| 45 |
+
TRAIN_CAM_FX = 272.547
|
| 46 |
+
TRAIN_CAM_FY = 266.358
|
| 47 |
+
TRAIN_CAM_CX = 320.0
|
| 48 |
+
TRAIN_CAM_CY = 220.0
|
| 49 |
+
TRAIN_CAM_W = 640
|
| 50 |
+
TRAIN_CAM_H = 440
|
| 51 |
+
|
| 52 |
+
DEFAULT_SRC_INTRINSICS = {
|
| 53 |
+
"fx": 210.667, "fy": 210.667,
|
| 54 |
+
"cx": 256.0, "cy": 144.0,
|
| 55 |
+
"w": 512, "h": 288,
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
IMAGENET_TRANSFORM = transforms.Compose([
|
| 59 |
+
transforms.ToTensor(),
|
| 60 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
| 61 |
+
std=[0.229, 0.224, 0.225]),
|
| 62 |
+
])
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class PDController:
|
| 66 |
+
MAX_V = 2.5
|
| 67 |
+
MAX_W = 0.65
|
| 68 |
+
|
| 69 |
+
def __init__(self):
|
| 70 |
+
self.last_v = None
|
| 71 |
+
|
| 72 |
+
def reset(self):
|
| 73 |
+
self.last_v = None
|
| 74 |
+
|
| 75 |
+
def __call__(self, waypoints, dt=1.0):
|
| 76 |
+
EPS = 1e-6
|
| 77 |
+
idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
|
| 78 |
+
dx = waypoints[:, idx, 0]
|
| 79 |
+
dyr = waypoints[:, -1, 1]
|
| 80 |
+
dxr = waypoints[:, -1, 0]
|
| 81 |
+
v = dx / dt
|
| 82 |
+
w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
|
| 83 |
+
* dxr.sign() / dt)
|
| 84 |
+
near_zero = dx.abs() < EPS
|
| 85 |
+
v = torch.where(near_zero, torch.zeros_like(v), v)
|
| 86 |
+
w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
|
| 87 |
+
if self.last_v is not None:
|
| 88 |
+
v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
|
| 89 |
+
v = v.clamp(-self.MAX_V, self.MAX_V)
|
| 90 |
+
w = w.clamp(-self.MAX_W, self.MAX_W)
|
| 91 |
+
self.last_v = v
|
| 92 |
+
return v, w
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _make_session(path):
|
| 96 |
+
ort.set_default_logger_severity(3)
|
| 97 |
+
opts = ort.SessionOptions()
|
| 98 |
+
opts.log_severity_level = 3
|
| 99 |
+
return ort.InferenceSession(
|
| 100 |
+
path, sess_options=opts,
|
| 101 |
+
providers=[
|
| 102 |
+
("CUDAExecutionProvider",
|
| 103 |
+
{"arena_extend_strategy": "kSameAsRequested"}),
|
| 104 |
+
"CPUExecutionProvider"])
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class VintGLOfficialNavigator:
|
| 108 |
+
"""Vint-GL-Official navigator (NavDP image-goal ViNT, goal-free mode)."""
|
| 109 |
+
|
| 110 |
+
context_size = CTX_FRAMES # 6 frames in obs tensor
|
| 111 |
+
multimodal = False
|
| 112 |
+
num_waypoints = NUM_WAYPOINTS
|
| 113 |
+
|
| 114 |
+
def __init__(
|
| 115 |
+
self,
|
| 116 |
+
device: str = "cuda",
|
| 117 |
+
max_v: float = 2.5,
|
| 118 |
+
max_w: float = 0.65,
|
| 119 |
+
dt: float = 1.25,
|
| 120 |
+
center_crop: bool = True,
|
| 121 |
+
use_train_intrinsics: bool = True,
|
| 122 |
+
src_intrinsics: dict = None,
|
| 123 |
+
):
|
| 124 |
+
self.device = device
|
| 125 |
+
self.dt = dt
|
| 126 |
+
|
| 127 |
+
# NavDP default is center_crop=False (direct resize). Flag exposed for
|
| 128 |
+
# A/B testing: True crops landscape input to 4:3 before resizing.
|
| 129 |
+
self.center_crop = center_crop
|
| 130 |
+
|
| 131 |
+
# Optional virtual rectification to the NoMaD/ViNT training camera
|
| 132 |
+
# (recon dataset). When True, the source pinhole frame is warped
|
| 133 |
+
# (affine, same math as a homography between two pinhole cameras)
|
| 134 |
+
# so its content lines up with what a recon-camera frame squished
|
| 135 |
+
# to 85x64 would look like.
|
| 136 |
+
self.use_train_intrinsics = use_train_intrinsics
|
| 137 |
+
self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
|
| 138 |
+
self._train_affine = None
|
| 139 |
+
|
| 140 |
+
self._sess = _make_session(ONNX_PATH)
|
| 141 |
+
with open(INFO_PATH, "r") as f:
|
| 142 |
+
self._info = yaml.safe_load(f)
|
| 143 |
+
|
| 144 |
+
# Action decoder params (NavDP convention — cumsum already baked
|
| 145 |
+
# into the network, only the metric scale remains here).
|
| 146 |
+
self._normalize_actions = bool(self._info.get("normalize", True))
|
| 147 |
+
self._waypoint_spacing = float(
|
| 148 |
+
self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
|
| 149 |
+
self.model_fps = float(self._info.get("model_fps", 3.0))
|
| 150 |
+
|
| 151 |
+
self._controller = PDController()
|
| 152 |
+
self._controller.MAX_V = max_v
|
| 153 |
+
self._controller.MAX_W = max_w
|
| 154 |
+
|
| 155 |
+
# Cache for visualization (last best trajectory in metric meters).
|
| 156 |
+
self._last_best_traj = None
|
| 157 |
+
|
| 158 |
+
# Optional pure-pursuit controller (used by inference_vw_pp).
|
| 159 |
+
self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
|
| 160 |
+
if PurePursuitController is not None else None)
|
| 161 |
+
|
| 162 |
+
def reset(self):
|
| 163 |
+
self._controller.reset()
|
| 164 |
+
self._last_best_traj = None
|
| 165 |
+
|
| 166 |
+
# ------------------------------------------------------------------
|
| 167 |
+
# Preprocessing — mirrors NoMaD_GL_Official, only the target size differs.
|
| 168 |
+
# ------------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
def _ensure_train_affine(self):
|
| 171 |
+
if self._train_affine is not None:
|
| 172 |
+
return
|
| 173 |
+
# Recon intrinsics scaled to (IMG_W, IMG_H) = (85, 64).
|
| 174 |
+
fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
|
| 175 |
+
fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
|
| 176 |
+
cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
|
| 177 |
+
cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
|
| 178 |
+
|
| 179 |
+
src = self.src_intrinsics
|
| 180 |
+
a = src["fx"] / fx_t
|
| 181 |
+
e = src["fy"] / fy_t
|
| 182 |
+
c = src["cx"] - cx_t * a
|
| 183 |
+
f = src["cy"] - cy_t * e
|
| 184 |
+
self._train_affine = (a, 0.0, c, 0.0, e, f)
|
| 185 |
+
|
| 186 |
+
def _to_model_input_uint8(self, frame_uint8):
|
| 187 |
+
from PIL import Image as PILImage
|
| 188 |
+
pil = PILImage.fromarray(frame_uint8)
|
| 189 |
+
if self.use_train_intrinsics:
|
| 190 |
+
self._ensure_train_affine()
|
| 191 |
+
pil = pil.transform(
|
| 192 |
+
(IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
|
| 193 |
+
resample=PILImage.BILINEAR, fillcolor=0)
|
| 194 |
+
return np.array(pil)
|
| 195 |
+
if self.center_crop:
|
| 196 |
+
w, h = pil.size
|
| 197 |
+
if w > h:
|
| 198 |
+
crop_w = int(h * IMAGE_ASPECT_RATIO)
|
| 199 |
+
x0 = max((w - crop_w) // 2, 0)
|
| 200 |
+
pil = pil.crop((x0, 0, x0 + crop_w, h))
|
| 201 |
+
else:
|
| 202 |
+
crop_h = int(w / IMAGE_ASPECT_RATIO)
|
| 203 |
+
y0 = max((h - crop_h) // 2, 0)
|
| 204 |
+
pil = pil.crop((0, y0, w, y0 + crop_h))
|
| 205 |
+
pil = pil.resize((IMG_W, IMG_H))
|
| 206 |
+
return np.array(pil)
|
| 207 |
+
|
| 208 |
+
def _preprocess_images(self, obs_rgb):
|
| 209 |
+
"""obs_rgb: (B, T=6, 3, H, W) float32 [0,1] → (B, 18, 64, 85) ImageNet-norm."""
|
| 210 |
+
from PIL import Image as PILImage
|
| 211 |
+
B, T = obs_rgb.shape[:2]
|
| 212 |
+
frames = []
|
| 213 |
+
for b in range(B):
|
| 214 |
+
ch_list = []
|
| 215 |
+
for t in range(T):
|
| 216 |
+
frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
|
| 217 |
+
model_in = self._to_model_input_uint8(frame)
|
| 218 |
+
pil = PILImage.fromarray(model_in)
|
| 219 |
+
ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
|
| 220 |
+
frames.append(np.concatenate(ch_list, axis=0))
|
| 221 |
+
return np.stack(frames) # (B, 3*T, IMG_H, IMG_W)
|
| 222 |
+
|
| 223 |
+
# ------------------------------------------------------------------
|
| 224 |
+
# Inference
|
| 225 |
+
# ------------------------------------------------------------------
|
| 226 |
+
|
| 227 |
+
def inference_trajectory(self, obs):
|
| 228 |
+
"""Run ViNT → metric waypoints + uniform scores.
|
| 229 |
+
|
| 230 |
+
Args:
|
| 231 |
+
obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
|
| 232 |
+
|
| 233 |
+
Returns:
|
| 234 |
+
trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
|
| 235 |
+
scores: np.ndarray (B, 1) — uniform (1.0)
|
| 236 |
+
"""
|
| 237 |
+
if isinstance(obs, torch.Tensor):
|
| 238 |
+
obs_np = obs.cpu().numpy()
|
| 239 |
+
else:
|
| 240 |
+
obs_np = np.asarray(obs, dtype=np.float32)
|
| 241 |
+
B = obs_np.shape[0]
|
| 242 |
+
|
| 243 |
+
obs_in = self._preprocess_images(obs_np) # (B, 18, 64, 85)
|
| 244 |
+
# Goal-free: random fake goal (matches NavDP ViNTAgent.step_nogoal).
|
| 245 |
+
fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
|
| 246 |
+
|
| 247 |
+
# Suppress ORT stderr noise during this call.
|
| 248 |
+
devnull = os.open(os.devnull, os.O_WRONLY)
|
| 249 |
+
old_stderr = os.dup(2)
|
| 250 |
+
os.dup2(devnull, 2)
|
| 251 |
+
all_action_pred = []
|
| 252 |
+
try:
|
| 253 |
+
for i in range(B):
|
| 254 |
+
dist_pred, action_pred = self._sess.run(
|
| 255 |
+
None, {"obs_img": obs_in[i:i+1], "goal_img": fake_goal[i:i+1]})
|
| 256 |
+
all_action_pred.append(action_pred)
|
| 257 |
+
finally:
|
| 258 |
+
os.dup2(old_stderr, 2)
|
| 259 |
+
os.close(devnull)
|
| 260 |
+
os.close(old_stderr)
|
| 261 |
+
action_pred = np.concatenate(all_action_pred, axis=0) # (B, 5, 4)
|
| 262 |
+
# action_pred shape (B, 5, 4): xy already cumsum'd, last two are sin/cos.
|
| 263 |
+
xy = action_pred[:, :, :2]
|
| 264 |
+
if self._normalize_actions:
|
| 265 |
+
xy = xy * self._waypoint_spacing # meters
|
| 266 |
+
|
| 267 |
+
trajectory = xy[:, None, :, :] # (B, 1, 5, 2)
|
| 268 |
+
scores = np.ones((B, 1), dtype=np.float32)
|
| 269 |
+
return trajectory.astype(np.float32), scores
|
| 270 |
+
|
| 271 |
+
def inference_vw(self, obs):
|
| 272 |
+
"""Run ViNT → (v, w) velocity commands.
|
| 273 |
+
|
| 274 |
+
Caches the chosen trajectory on ``self._last_best_traj``.
|
| 275 |
+
"""
|
| 276 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 277 |
+
best_traj = trajectory[:, 0] # (B, 5, 2)
|
| 278 |
+
self._last_best_traj = best_traj
|
| 279 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 280 |
+
v, w = self._controller(waypoints, dt=self.dt)
|
| 281 |
+
return torch.stack([v, w], dim=1), best_traj
|
| 282 |
+
|
| 283 |
+
def inference_vw_pp(self, obs, robot):
|
| 284 |
+
"""Pure-pursuit variant of :meth:`inference_vw`."""
|
| 285 |
+
if self._pp is None:
|
| 286 |
+
raise RuntimeError(
|
| 287 |
+
"PurePursuitController unavailable — install urbansim or "
|
| 288 |
+
"make sure `urbansim.custom.pp` is on the Python path.")
|
| 289 |
+
trajectory, _ = self.inference_trajectory(obs)
|
| 290 |
+
best_traj = trajectory[:, 0]
|
| 291 |
+
self._last_best_traj = best_traj
|
| 292 |
+
waypoints = torch.from_numpy(best_traj).float().to(self.device)
|
| 293 |
+
v, w = self._pp.step(waypoints, robot=robot)
|
| 294 |
+
return torch.stack([v, w], dim=1)
|
Vint_GL_Official/model_info.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Vint_GL_Official
|
| 2 |
+
goal_type: goal_less
|
| 3 |
+
image_resolution: [64, 85]
|
| 4 |
+
context_size: 6
|
| 5 |
+
obs_normalize: true
|
| 6 |
+
normalize: true
|
| 7 |
+
action_stats:
|
| 8 |
+
min: [0.0, -2.0]
|
| 9 |
+
max: [5.0, 2.0]
|
| 10 |
+
model_max_v: 2.5
|
| 11 |
+
model_rate: 3
|
| 12 |
+
metric_waypoint_spacing: 0.8
|
| 13 |
+
model_fps: 3
|
| 14 |
+
len_traj_pred: 5
|
| 15 |
+
learn_angle: true
|
| 16 |
+
inputs:
|
| 17 |
+
obs_img:
|
| 18 |
+
shape: [batch, 18, 64, 85]
|
| 19 |
+
dtype: float32
|
| 20 |
+
description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
|
| 21 |
+
goal_img:
|
| 22 |
+
shape: [batch, 3, 64, 85]
|
| 23 |
+
dtype: float32
|
| 24 |
+
description: Goal image (random noise for goal-free)
|
| 25 |
+
outputs:
|
| 26 |
+
dist_pred:
|
| 27 |
+
shape: [batch, 1]
|
| 28 |
+
description: Predicted distance to goal
|
| 29 |
+
action_pred:
|
| 30 |
+
shape: [batch, 5, 4]
|
| 31 |
+
description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
|
| 32 |
+
sin/cos heading
|
| 33 |
+
onnx: vint_imagegoal.onnx
|
Vint_GL_Official/vint_imagegoal.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d480bf0fc28f6c1d84f568bf28733f6b418920218e3affd6f9d59fff1506ecec
|
| 3 |
+
size 1415900
|
Vint_GL_Official/vint_imagegoal.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:399b8d50e7fcc812a69734fcc51f40dd313d964fec79d6fea3429252bc994791
|
| 3 |
+
size 95748096
|