Hollis71025 commited on
Commit
9c1f523
·
verified ·
1 Parent(s): 2587ef0

Add 7 navigation models (ONNX + inference wrappers) and model card

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ GNM_GL_Official/gnm_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text
37
+ NoMaD_GL_Official/nomad_vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
38
+ Vint_GL_Official/vint_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text
CityWalker_PG_Official/citywalker.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e8fb9ff081a883d80f0502d5b7e9046ed4b0dfafd46fe96ff5ad16194f0949a
3
+ size 806082192
CityWalker_PG_Official/inference.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """CityWalker-PG-Official inference utilities.
3
+
4
+ CityWalker uses a NON-STANDARD coordinate system internally:
5
+ model-internal: y = forward, x = right
6
+ our standard: x = forward, y = left
7
+
8
+ This navigator accepts standard-frame inputs (goal_xy in x=forward, y=left)
9
+ and returns standard-frame outputs. The coordinate swap is handled internally.
10
+
11
+ nav = CityWalkerPGNavigator(device="cuda")
12
+ traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
13
+ vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
14
+ """
15
+
16
+ import os
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+ import yaml
21
+ import onnxruntime as ort
22
+ from torchvision import transforms
23
+
24
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
25
+ ONNX_PATH = os.path.join(MODEL_DIR, "citywalker.onnx")
26
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
27
+
28
+ # ImageNet normalization (CityWalker uses normalized images)
29
+ IMAGENET_TRANSFORM = transforms.Compose([
30
+ transforms.ToTensor(),
31
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
32
+ std=[0.229, 0.224, 0.225]),
33
+ ])
34
+ IMG_H, IMG_W = 350, 630
35
+
36
+
37
+ class PDController:
38
+ MAX_V = 2.5
39
+ MAX_W = 0.65
40
+
41
+ def __init__(self):
42
+ self.last_v = None
43
+
44
+ def reset(self):
45
+ self.last_v = None
46
+
47
+ def __call__(self, waypoints, dt=1.0):
48
+ EPS = 1e-6
49
+ dxr = waypoints[:, 2, 0]
50
+ dyr = waypoints[:, 2, 1]
51
+ idx = 0
52
+ dx = waypoints[:, idx, 0]
53
+ v = dx / dt
54
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
55
+ * dxr.sign() / dt)
56
+ near_zero = dx.abs() < EPS
57
+ v = torch.where(near_zero, torch.zeros_like(v), v)
58
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
59
+ if self.last_v is not None:
60
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
61
+ v = v.clamp(-self.MAX_V, self.MAX_V)
62
+ w = w.clamp(-self.MAX_W, self.MAX_W)
63
+ self.last_v = v
64
+ return v, w
65
+
66
+
67
+ def _std_to_cw(xy):
68
+ """Standard frame (x=fwd, y=left) → CityWalker frame (y=fwd, x=right).
69
+
70
+ CityWalker model input: [y_cw, -x_cw] where y_cw=forward, x_cw=right.
71
+ Given standard (x_std=fwd, y_std=left):
72
+ y_cw = x_std (forward)
73
+ x_cw = -y_std (right = -left)
74
+ Model expects [y_cw, -x_cw] = [x_std, y_std]
75
+
76
+ Actually from the code: input_traj[..., [1,0]] then negate [0].
77
+ So model wants: col0 = -x_cw = y_std, col1 = y_cw = x_std.
78
+ """
79
+ out = np.empty_like(xy)
80
+ out[..., 0] = -xy[..., 1] # col0 = y_std (= -x_cw)
81
+ out[..., 1] = xy[..., 0] # col1 = x_std (= y_cw = forward)
82
+ return out
83
+
84
+
85
+ def _cw_to_std(xy):
86
+ """CityWalker output frame → standard frame.
87
+
88
+ Model output: [y_std, x_std] (reverse of the swap above).
89
+ """
90
+ out = np.empty_like(xy)
91
+ out[..., 0] = xy[..., 1] # x_std = forward
92
+ out[..., 1] = -xy[..., 0] # y_std = left
93
+ return out
94
+
95
+
96
+ class CityWalkerPGNavigator:
97
+ """CityWalker point-goal navigator.
98
+
99
+ All user-facing coordinates are standard frame: x=forward, y=left, meters.
100
+ """
101
+
102
+ context_size = 5
103
+ multimodal = False
104
+
105
+ def __init__(
106
+ self,
107
+ onnx_path: str = ONNX_PATH,
108
+ device: str = "cuda",
109
+ max_v: float = 2.5,
110
+ max_w: float = 0.65,
111
+ dt: float = 1.0,
112
+ ):
113
+ self.device = device
114
+ self.dt = dt
115
+
116
+ ort.set_default_logger_severity(3)
117
+ providers = [
118
+ ("CUDAExecutionProvider",
119
+ {"arena_extend_strategy": "kSameAsRequested"}),
120
+ "CPUExecutionProvider",
121
+ ]
122
+ self._session = ort.InferenceSession(onnx_path, providers=providers)
123
+ with open(INFO_PATH, "r") as f:
124
+ self._info = yaml.safe_load(f)
125
+ self._controller = PDController()
126
+ self._controller.MAX_V = max_v
127
+ self._controller.MAX_W = max_w
128
+ self._past_traj = None
129
+
130
+ def reset(self):
131
+ self._controller.reset()
132
+ self._past_traj = None
133
+
134
+ def _preprocess_images(self, obs_rgb):
135
+ """obs_rgb: (B, C, 3, H, W) float32 [0,1] → ImageNet-normalized."""
136
+ from PIL import Image as PILImage
137
+ B, C = obs_rgb.shape[:2]
138
+ out = np.empty((B, C, 3, IMG_H, IMG_W), dtype=np.float32)
139
+ for b in range(B):
140
+ for c in range(C):
141
+ # CHW [0,1] → HWC uint8
142
+ frame = (obs_rgb[b, c].transpose(1, 2, 0) * 255).astype(np.uint8)
143
+ pil = PILImage.fromarray(frame)
144
+ pil = pil.resize((IMG_W, IMG_H))
145
+ out[b, c] = IMAGENET_TRANSFORM(pil).numpy()
146
+ return out
147
+
148
+ def inference_trajectory(self, obs, goal_xy=None, past_traj_std=None):
149
+ """Run model → (trajectory, scores).
150
+
151
+ Args:
152
+ obs: (B, context_size, 3, H, W) float32 in [0,1]. 5 frames.
153
+ goal_xy: (2,) or (B, 2) goal in standard frame [x_fwd, y_left] meters.
154
+ If None, uses [5.0, 0.0] (straight ahead 5m).
155
+ past_traj_std: (B, 6, 2) past trajectory in standard frame, or None.
156
+ If None, uses zeros (stationary start).
157
+
158
+ Returns:
159
+ trajectory: np.ndarray (B, 1, 5, 2) in standard frame
160
+ scores: np.ndarray (B, 1)
161
+ """
162
+ if isinstance(obs, torch.Tensor):
163
+ obs_np = obs.cpu().numpy()
164
+ else:
165
+ obs_np = np.asarray(obs, dtype=np.float32)
166
+ B = obs_np.shape[0]
167
+
168
+ # Preprocess images: resize + ImageNet normalize
169
+ obs_norm = self._preprocess_images(obs_np)
170
+
171
+ # Build trajectory input (past 6 waypoints in CW frame)
172
+ if past_traj_std is None:
173
+ past_std = np.zeros((B, 6, 2), dtype=np.float32)
174
+ else:
175
+ past_std = np.asarray(past_traj_std, dtype=np.float32)
176
+ if past_std.ndim == 2:
177
+ past_std = past_std[np.newaxis]
178
+ past_std[:, -1, 0] = np.random.uniform(5.0, 10.0, size=(B,))
179
+ past_std[:, -1, 1] = np.random.uniform(-2.0, 2.0, size=(B,))
180
+ traj_cw = _std_to_cw(past_std)
181
+ out = self._session.run(None, {
182
+ "obs_images": obs_norm,
183
+ "trajectory": traj_cw,
184
+ })
185
+ wp_cw = out[0] # (B, 5, 2) in CW frame
186
+
187
+ # Convert back to standard frame
188
+ wp_std = _cw_to_std(wp_cw)
189
+ trajectory = wp_std[:, np.newaxis] # (B, 1, 5, 2)
190
+ scores = np.ones((B, 1), dtype=np.float32)
191
+ return trajectory, scores
192
+
193
+ def inference_vw(self, obs, goal_xy=None, past_traj_std=None):
194
+ """Run model → (v, w) velocity commands in standard frame.
195
+
196
+ Returns:
197
+ vw: torch.Tensor (B, 2)
198
+ """
199
+ trajectory, _ = self.inference_trajectory(obs, goal_xy, past_traj_std)
200
+ best_traj = trajectory[:, 0] # (B, 5, 2) standard frame
201
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
202
+ v, w = self._controller(waypoints, dt=self.dt)
203
+ return torch.stack([v, w], dim=1), best_traj
CityWalker_PG_Official/model_info.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CityWalker-PG-Official
2
+ goal_type: point_goal
3
+ image_resolution: [350, 630]
4
+ context_size: 5
5
+ input_fps: 5.0
6
+ obs_normalize: false
7
+ inputs:
8
+ obs_images: {shape: [batch, 5, 3, 350, 630], dtype: float32, description: "5 RGB observation frames in [0,1]"}
9
+ trajectory: {shape: [batch, 6, 2], dtype: float32, description: "6 past trajectory waypoints (x,y) in meters"}
10
+ outputs:
11
+ wp_pred: {shape: [batch, 5, 2], description: "5 predicted waypoints (x,y)"}
12
+ arrive_pred: {shape: [batch, 1], description: "Arrival probability"}
13
+ num_waypoints: 5
GNM_GL_Official/gnm_imagegoal.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8914d698a58f0f49ca8da5a2f61d646d759a2178ad6f8a89f0e863b625110c77
3
+ size 619733
GNM_GL_Official/gnm_imagegoal.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2764830aaf3d0a6189f1f2a0d5c9d2f4c725aa9cc9638a8beeae9e6c9dd59b8a
3
+ size 34603008
GNM_GL_Official/inference.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """GNM-GL-Official inference wrapper (NavDP image-goal GNM in goal-free mode).
3
+
4
+ Mirror of Vint_GL_Official/inference.py — same I/O contract, only the ONNX
5
+ backbone and the class name differ. See that file for the action-decode and
6
+ preprocessing rationale.
7
+
8
+ nav = GNMGLOfficialNavigator(device="cuda")
9
+ traj, scores = nav.inference_trajectory(obs) # (B, 1, 5, 2) meters
10
+ vw = nav.inference_vw(obs) # caches nav._last_best_traj
11
+ """
12
+
13
+ import os
14
+ import math
15
+ import numpy as np
16
+ import torch
17
+ import yaml
18
+ import onnxruntime as ort
19
+ from torchvision import transforms
20
+
21
+ try:
22
+ from urbansim.custom.pp import PurePursuitController
23
+ except Exception:
24
+ PurePursuitController = None # optional — only needed for inference_vw_pp
25
+
26
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
27
+ ONNX_PATH = os.path.join(MODEL_DIR, "gnm_imagegoal.onnx")
28
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
29
+
30
+ IMG_W, IMG_H = 85, 64
31
+ NUM_WAYPOINTS = 5
32
+ CTX_FRAMES = 6
33
+ WAYPOINT_INDEX = 2
34
+
35
+ IMAGE_ASPECT_RATIO = 4 / 3
36
+
37
+ TRAIN_CAM_FX = 272.547
38
+ TRAIN_CAM_FY = 266.358
39
+ TRAIN_CAM_CX = 320.0
40
+ TRAIN_CAM_CY = 220.0
41
+ TRAIN_CAM_W = 640
42
+ TRAIN_CAM_H = 440
43
+
44
+ DEFAULT_SRC_INTRINSICS = {
45
+ "fx": 210.667, "fy": 210.667,
46
+ "cx": 256.0, "cy": 144.0,
47
+ "w": 512, "h": 288,
48
+ }
49
+
50
+ IMAGENET_TRANSFORM = transforms.Compose([
51
+ transforms.ToTensor(),
52
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
53
+ std=[0.229, 0.224, 0.225]),
54
+ ])
55
+
56
+
57
+ class PDController:
58
+ MAX_V = 2.5
59
+ MAX_W = 0.65
60
+
61
+ def __init__(self):
62
+ self.last_v = None
63
+
64
+ def reset(self):
65
+ self.last_v = None
66
+
67
+ def __call__(self, waypoints, dt=1.0):
68
+ EPS = 1e-6
69
+ idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
70
+ dx = waypoints[:, idx, 0]
71
+ dyr = waypoints[:, -1, 1]
72
+ dxr = waypoints[:, -1, 0]
73
+ v = dx / dt
74
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
75
+ * dxr.sign() / dt)
76
+ near_zero = dx.abs() < EPS
77
+ v = torch.where(near_zero, torch.zeros_like(v), v)
78
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
79
+ if self.last_v is not None:
80
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
81
+ v = v.clamp(-self.MAX_V, self.MAX_V)
82
+ w = w.clamp(-self.MAX_W, self.MAX_W)
83
+ self.last_v = v
84
+ return v, w
85
+
86
+
87
+ def _make_session(path):
88
+ ort.set_default_logger_severity(3)
89
+ opts = ort.SessionOptions()
90
+ opts.log_severity_level = 3
91
+ return ort.InferenceSession(
92
+ path, sess_options=opts,
93
+ providers=[
94
+ ("CUDAExecutionProvider",
95
+ {"arena_extend_strategy": "kSameAsRequested"}),
96
+ "CPUExecutionProvider"])
97
+
98
+
99
+ class GNMGLOfficialNavigator:
100
+ """GNM-GL-Official navigator (NavDP image-goal GNM, goal-free mode)."""
101
+
102
+ context_size = CTX_FRAMES
103
+ multimodal = False
104
+ num_waypoints = NUM_WAYPOINTS
105
+
106
+ def __init__(
107
+ self,
108
+ device: str = "cuda",
109
+ max_v: float = 2.5,
110
+ max_w: float = 0.65,
111
+ dt: float = 1.25,
112
+ center_crop: bool = True,
113
+ use_train_intrinsics: bool = True,
114
+ src_intrinsics: dict = None,
115
+ ):
116
+ self.device = device
117
+ self.dt = dt
118
+ self.center_crop = center_crop
119
+ self.use_train_intrinsics = use_train_intrinsics
120
+ self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
121
+ self._train_affine = None
122
+
123
+ self._sess = _make_session(ONNX_PATH)
124
+ with open(INFO_PATH, "r") as f:
125
+ self._info = yaml.safe_load(f)
126
+
127
+ self._normalize_actions = bool(self._info.get("normalize", True))
128
+ self._waypoint_spacing = float(
129
+ self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
130
+ self.model_fps = float(self._info.get("model_fps", 3.0))
131
+
132
+ self._controller = PDController()
133
+ self._controller.MAX_V = max_v
134
+ self._controller.MAX_W = max_w
135
+
136
+ self._last_best_traj = None
137
+ self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
138
+ if PurePursuitController is not None else None)
139
+
140
+ def reset(self):
141
+ self._controller.reset()
142
+ self._last_best_traj = None
143
+
144
+ def _ensure_train_affine(self):
145
+ if self._train_affine is not None:
146
+ return
147
+ fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
148
+ fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
149
+ cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
150
+ cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
151
+ src = self.src_intrinsics
152
+ a = src["fx"] / fx_t
153
+ e = src["fy"] / fy_t
154
+ c = src["cx"] - cx_t * a
155
+ f = src["cy"] - cy_t * e
156
+ self._train_affine = (a, 0.0, c, 0.0, e, f)
157
+
158
+ def _to_model_input_uint8(self, frame_uint8):
159
+ from PIL import Image as PILImage
160
+ pil = PILImage.fromarray(frame_uint8)
161
+ if self.use_train_intrinsics:
162
+ self._ensure_train_affine()
163
+ pil = pil.transform(
164
+ (IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
165
+ resample=PILImage.BILINEAR, fillcolor=0)
166
+ return np.array(pil)
167
+ if self.center_crop:
168
+ w, h = pil.size
169
+ if w > h:
170
+ crop_w = int(h * IMAGE_ASPECT_RATIO)
171
+ x0 = max((w - crop_w) // 2, 0)
172
+ pil = pil.crop((x0, 0, x0 + crop_w, h))
173
+ else:
174
+ crop_h = int(w / IMAGE_ASPECT_RATIO)
175
+ y0 = max((h - crop_h) // 2, 0)
176
+ pil = pil.crop((0, y0, w, y0 + crop_h))
177
+ pil = pil.resize((IMG_W, IMG_H))
178
+ return np.array(pil)
179
+
180
+ def _preprocess_images(self, obs_rgb):
181
+ from PIL import Image as PILImage
182
+ B, T = obs_rgb.shape[:2]
183
+ frames = []
184
+ for b in range(B):
185
+ ch_list = []
186
+ for t in range(T):
187
+ frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
188
+ model_in = self._to_model_input_uint8(frame)
189
+ pil = PILImage.fromarray(model_in)
190
+ ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
191
+ frames.append(np.concatenate(ch_list, axis=0))
192
+ return np.stack(frames)
193
+
194
+ def inference_trajectory(self, obs):
195
+ """Run GNM → metric waypoints + uniform scores.
196
+
197
+ Returns:
198
+ trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
199
+ scores: np.ndarray (B, 1)
200
+ """
201
+ if isinstance(obs, torch.Tensor):
202
+ obs_np = obs.cpu().numpy()
203
+ else:
204
+ obs_np = np.asarray(obs, dtype=np.float32)
205
+ B = obs_np.shape[0]
206
+
207
+ obs_in = self._preprocess_images(obs_np)
208
+ fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
209
+
210
+ devnull = os.open(os.devnull, os.O_WRONLY)
211
+ old_stderr = os.dup(2)
212
+ os.dup2(devnull, 2)
213
+ try:
214
+ dist_pred, action_pred = self._sess.run(
215
+ None, {"obs_img": obs_in, "goal_img": fake_goal})
216
+ finally:
217
+ os.dup2(old_stderr, 2)
218
+ os.close(devnull)
219
+ os.close(old_stderr)
220
+
221
+ xy = action_pred[:, :, :2]
222
+ if self._normalize_actions:
223
+ xy = xy * self._waypoint_spacing
224
+
225
+ trajectory = xy[:, None, :, :] # (B, 1, 5, 2)
226
+ scores = np.ones((B, 1), dtype=np.float32)
227
+ return trajectory.astype(np.float32), scores
228
+
229
+ def inference_vw(self, obs):
230
+ trajectory, _ = self.inference_trajectory(obs)
231
+ best_traj = trajectory[:, 0]
232
+ self._last_best_traj = best_traj
233
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
234
+ v, w = self._controller(waypoints, dt=self.dt)
235
+ return torch.stack([v, w], dim=1)
236
+
237
+ def inference_vw_pp(self, obs, robot):
238
+ if self._pp is None:
239
+ raise RuntimeError(
240
+ "PurePursuitController unavailable — install urbansim or "
241
+ "make sure `urbansim.custom.pp` is on the Python path.")
242
+ trajectory, _ = self.inference_trajectory(obs)
243
+ best_traj = trajectory[:, 0]
244
+ self._last_best_traj = best_traj
245
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
246
+ v, w = self._pp.step(waypoints, robot=robot)
247
+ return torch.stack([v, w], dim=1)
GNM_GL_Official/model_info.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: GNM-GL-Official
2
+ goal_type: goal_less
3
+ image_resolution: [64, 85]
4
+ context_size: 6
5
+ obs_normalize: true
6
+ normalize: true
7
+ action_stats:
8
+ min: [0.0, -2.0]
9
+ max: [5.0, 2.0]
10
+ model_max_v: 0.8
11
+ model_rate: 3
12
+ metric_waypoint_spacing: 0.8
13
+ model_fps: 3
14
+ len_traj_pred: 5
15
+ learn_angle: true
16
+ inputs:
17
+ obs_img:
18
+ shape: [batch, 18, 64, 85]
19
+ dtype: float32
20
+ description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
21
+ goal_img:
22
+ shape: [batch, 3, 64, 85]
23
+ dtype: float32
24
+ description: Goal image (random noise for goal-free)
25
+ outputs:
26
+ dist_pred:
27
+ shape: [batch, 1]
28
+ description: Predicted distance to goal
29
+ action_pred:
30
+ shape: [batch, 5, 4]
31
+ description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
32
+ sin/cos heading
33
+ onnx: gnm_imagegoal.onnx
MBRA_PG_Official/inference.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """MBRA-PG-Official inference utilities.
3
+
4
+ MBRA takes 6 ImageNet-normalized 96x96 frames (stacked as 18 channels)
5
+ and a goal pose [x, y, sin(yaw), cos(yaw)] normalized by metric_spacing=0.25m.
6
+
7
+ User provides goal_xy in standard frame (x=forward, y=left, meters).
8
+
9
+ nav = MBRAPGNavigator(device="cuda")
10
+ traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
11
+ vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
12
+ """
13
+
14
+ import os
15
+ import math
16
+ import numpy as np
17
+ import torch
18
+ import onnxruntime as ort
19
+ import yaml
20
+ from torchvision import transforms
21
+
22
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
23
+ ONNX_PATH = os.path.join(MODEL_DIR, "mbra.onnx")
24
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
25
+ METRIC_SPACING = 0.8 # waypoint normalization factor
26
+
27
+ IMAGENET_TRANSFORM = transforms.Compose([
28
+ transforms.ToTensor(),
29
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
30
+ std=[0.229, 0.224, 0.225]),
31
+ ])
32
+ IMG_SIZE = 96
33
+
34
+
35
+ class PDController:
36
+ MAX_V = 2.5
37
+ MAX_W = 0.65
38
+
39
+ def __init__(self):
40
+ self.last_v = None
41
+
42
+ def reset(self):
43
+ self.last_v = None
44
+
45
+ def __call__(self, waypoints, dt=1.0):
46
+ EPS = 1e-6
47
+ idx = min(2, waypoints.shape[1] - 1)
48
+ dx = waypoints[:, idx, 0]
49
+ dyr = waypoints[:, -1, 1]
50
+ dxr = waypoints[:, -1, 0]
51
+ v = dx / dt
52
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
53
+ * dxr.sign() / dt)
54
+ near_zero = dx.abs() < EPS
55
+ v = torch.where(near_zero, torch.zeros_like(v), v)
56
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
57
+ if self.last_v is not None:
58
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
59
+ v = v.clamp(-self.MAX_V, self.MAX_V)
60
+ w = w.clamp(-self.MAX_W, self.MAX_W)
61
+ self.last_v = v
62
+ return v, w
63
+
64
+
65
+ class MBRAPGNavigator:
66
+ """MBRA point-goal navigator.
67
+
68
+ All user-facing coordinates: x=forward, y=left, meters.
69
+ """
70
+
71
+ context_size = 6
72
+ multimodal = False
73
+
74
+ def __init__(
75
+ self,
76
+ onnx_path: str = ONNX_PATH,
77
+ device: str = "cuda",
78
+ max_v: float = 2.5,
79
+ max_w: float = 0.65,
80
+ dt: float = 2.5,
81
+ ):
82
+ self.device = device
83
+ self.dt = dt
84
+
85
+ ort.set_default_logger_severity(3)
86
+ providers = [
87
+ ("CUDAExecutionProvider",
88
+ {"arena_extend_strategy": "kSameAsRequested"}),
89
+ "CPUExecutionProvider",
90
+ ]
91
+ self._session = ort.InferenceSession(onnx_path, providers=providers)
92
+ with open(INFO_PATH, "r") as f:
93
+ self._info = yaml.safe_load(f)
94
+ self._controller = PDController()
95
+ self._controller.MAX_V = max_v
96
+ self._controller.MAX_W = max_w
97
+
98
+ def reset(self):
99
+ self._controller.reset()
100
+
101
+ def _preprocess_images(self, obs_rgb):
102
+ """obs_rgb: (B, C, 3, H, W) float32 [0,1] → (B, C*3, 96, 96) ImageNet-norm."""
103
+ from PIL import Image as PILImage
104
+ B, C = obs_rgb.shape[:2]
105
+ frames = []
106
+ for b in range(B):
107
+ ch_list = []
108
+ for c in range(C):
109
+ frame = (obs_rgb[b, c].transpose(1, 2, 0) * 255).astype(np.uint8)
110
+ pil = PILImage.fromarray(frame).resize((IMG_SIZE, IMG_SIZE))
111
+ ch_list.append(IMAGENET_TRANSFORM(pil).numpy()) # (3, 96, 96)
112
+ frames.append(np.concatenate(ch_list, axis=0)) # (C*3, 96, 96)
113
+ return np.stack(frames) # (B, 18, 96, 96)
114
+
115
+ @staticmethod
116
+ def _goal_to_pose(goal_xy):
117
+ """Convert standard goal [x_fwd, y_left] (meters) → MBRA goal_pose [4].
118
+
119
+ MBRA expects [x/spacing, y/spacing, cos(angle), sin(angle)].
120
+ """
121
+ x, y = float(goal_xy[0]), float(goal_xy[1])
122
+ angle = math.atan2(y, x)
123
+ return np.array([
124
+ x / METRIC_SPACING,
125
+ y / METRIC_SPACING,
126
+ math.cos(angle),
127
+ math.sin(angle),
128
+ ], dtype=np.float32)
129
+
130
+ def inference_trajectory(self, obs, goal_xy=None):
131
+ """Run model → (trajectory, scores).
132
+
133
+ Args:
134
+ obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
135
+ goal_xy: (2,) goal in standard frame [x_fwd, y_left] meters.
136
+ If None, uses [5.0, 0.0].
137
+
138
+ Returns:
139
+ trajectory: np.ndarray (B, 1, 8, 2) waypoints [x, y] meters
140
+ scores: np.ndarray (B, 1)
141
+ """
142
+ if isinstance(obs, torch.Tensor):
143
+ obs_np = obs.cpu().numpy()
144
+ else:
145
+ obs_np = np.asarray(obs, dtype=np.float32)
146
+ B = obs_np.shape[0]
147
+
148
+ obs_norm = self._preprocess_images(obs_np)
149
+
150
+ if goal_xy is None:
151
+ goal_xy = np.array([np.random.uniform(2.0, 10.0), np.random.uniform(-2.0, 2.0)])
152
+ goal_pose = self._goal_to_pose(goal_xy)
153
+ goal_batch = np.tile(goal_pose, (B, 1))
154
+
155
+ out = self._session.run(None, {
156
+ "obs_images": obs_norm,
157
+ "goal_pose": goal_batch,
158
+ })
159
+ wp_raw = out[0] # (B, 8, 4) — [x, y, sin(yaw), cos(yaw)]
160
+
161
+ # Extract x, y and un-normalize
162
+ wp_xy = wp_raw[:, :, :2] * METRIC_SPACING # (B, 8, 2) meters
163
+ trajectory = wp_xy[:, np.newaxis] # (B, 1, 8, 2)
164
+ scores = np.ones((B, 1), dtype=np.float32)
165
+ return trajectory, scores
166
+
167
+ def inference_vw(self, obs, goal_xy=None):
168
+ """Run model → (v, w).
169
+
170
+ Returns:
171
+ vw: torch.Tensor (B, 2)
172
+ """
173
+ trajectory, _ = self.inference_trajectory(obs, goal_xy)
174
+ best_traj = trajectory[:, 0]
175
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
176
+ v, w = self._controller(waypoints, dt=self.dt)
177
+ return torch.stack([v, w], dim=1), best_traj
MBRA_PG_Official/mbra.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc42919737964d9c4db9428d8a836ad9d95007f1a41fcf5cf95526e1042f17a5
3
+ size 253629409
MBRA_PG_Official/model_info.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: MBRA-PG-Official
2
+ goal_type: point_goal
3
+ image_resolution: [96, 96]
4
+ context_size: 6
5
+ input_fps: 5.0
6
+ obs_normalize: true
7
+ inputs:
8
+ obs_images: {shape: [batch, 6, 3, 96, 96], dtype: float32, description: "6 RGB frames x 3 channels, ImageNet normalized"}
9
+ goal_pose: {shape: [batch, 4], dtype: float32, description: "Goal pose [x, y, sin(yaw), cos(yaw)] in meters/rad"}
10
+ outputs:
11
+ waypoints: {shape: [batch, 8, 4], description: "8 waypoints x 4 (x, y, sin(yaw), cos(yaw))"}
12
+ num_waypoints: 8
MIMIC/inference.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """MIMIC inference utilities.
3
+
4
+ Mirrors the style of :mod:`models.S2E_PG_Full.inference`, adapted for the
5
+ MIMIC ONNX (``mimic.onnx``) — point-goal, multi-modal, 16-frame context with
6
+ an explicit past-trajectory + past-mask input. Reference single-shot script:
7
+ ``urban-sim-human-official/run_v3_b_onnx.py``.
8
+
9
+ The ONNX has four inputs and three outputs::
10
+
11
+ obs : (B, 16, 3, 256, 256) float32 in [0, 1]
12
+ goal : (B, 3) float32 = [dist/200, cos(a), sin(a)]
13
+ past_traj : (B, 15, 2) float32 — past XY in camera frame x 0.25
14
+ (usually zeros at inference)
15
+ past_mask : (B, 16) float32 — 1.0 for valid frames
16
+
17
+ best_reg_mid : (B, 10, 3) — best single trajectory (model-picked)
18
+ poses_reg_mid_denorm : (B, 62, 10, 3) — 62 candidates
19
+ poses_cls_mid : (B, 62) — classification scores
20
+
21
+ Outputs are in *camera frame* (col-0 = forward, col-1 = +left in the model
22
+ output; the reference script's ``[-col1, col0]`` swap turns col1 into the
23
+ controller's right-positive ``x_right``). We rescale by ``waypoint_scale``
24
+ (=0.25) and return waypoints in standard frame ``(x_fwd, y_left)``.
25
+
26
+ nav = MIMICNavigator(device="cuda")
27
+ traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
28
+ vw, best = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
29
+ nav.reset()
30
+ """
31
+ import os
32
+ import math
33
+ import numpy as np
34
+ import torch
35
+ import yaml
36
+ import onnxruntime as ort
37
+ from urbansim.custom.pp import PurePursuitController
38
+
39
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
40
+ ONNX_PATH = os.path.join(MODEL_DIR, "mimic.onnx")
41
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
42
+ HORIZON_IDX = 13
43
+ Scaler = 0.25
44
+
45
+ class PDController:
46
+ """Convert predicted waypoints to (v, w) commands."""
47
+ MAX_V = 2.5
48
+ MAX_W = 0.65
49
+
50
+ def __init__(self):
51
+ self.last_v = None
52
+
53
+ def reset(self):
54
+ self.last_v = None
55
+
56
+ def __call__(self, waypoints, dt=1.0):
57
+ EPS = 1e-6
58
+ dxr = waypoints[:, -1, 0]
59
+ dyr = waypoints[:, -1, 1]
60
+ idx = min(6, waypoints.shape[1] - 1)
61
+ dx = waypoints[:, idx, 0]
62
+ v = dx / dt
63
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
64
+ * dxr.sign() / dt)
65
+ near_zero = dx.abs() < EPS
66
+ v = torch.where(near_zero, torch.zeros_like(v), v)
67
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
68
+ if self.last_v is not None:
69
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
70
+ v = v.clamp(-self.MAX_V, self.MAX_V)
71
+ w = w.clamp(-self.MAX_W, self.MAX_W)
72
+ self.last_v = v
73
+ return v, w
74
+
75
+ class MIMICNavigator:
76
+ """
77
+ """
78
+
79
+ context_size = 16
80
+ multimodal = False
81
+
82
+ def __init__(
83
+ self,
84
+ onnx_path: str = ONNX_PATH,
85
+ device: str = "cuda",
86
+ max_v: float = 2.5,
87
+ max_w: float = 0.65,
88
+ dt: float = 2.0,
89
+ horizon_idx: int = HORIZON_IDX,
90
+ ):
91
+ """
92
+ Args:
93
+ onnx_path: Path to the ONNX model file.
94
+ device: "cuda" or "cpu".
95
+ max_v: Max linear velocity (m/s).
96
+ max_w: Max angular velocity (rad/s).
97
+ dt: Time scaling factor for PD controller.
98
+ horizon_idx: Number of output waypoints to keep (up to 4s).
99
+ """
100
+ self.device = device
101
+ self.dt = dt
102
+ self.horizon_idx = horizon_idx
103
+
104
+ ort.set_default_logger_severity(3)
105
+ providers = [
106
+ ("CUDAExecutionProvider",
107
+ {"arena_extend_strategy": "kSameAsRequested"}),
108
+ "CPUExecutionProvider",
109
+ ]
110
+ self._session = ort.InferenceSession(onnx_path, providers=providers)
111
+ with open(INFO_PATH, "r") as f:
112
+ self._info = yaml.safe_load(f)
113
+ self._controller = PDController()
114
+ self._controller.MAX_V = max_v
115
+ self._controller.MAX_W = max_w
116
+
117
+ self._pp = PurePursuitController(action_dt=4, waypoint_index=6)
118
+ self._last_best_traj = None
119
+
120
+
121
+ def reset(self):
122
+ self._controller.reset()
123
+ self._last_best_traj = None
124
+
125
+ def inference_trajectory(self, obs):
126
+ """Run model → (trajectory, scores).
127
+
128
+ Args:
129
+ obs: (B, 16, 3, 288, 512) float32 in [0,1].
130
+ Note: batch dim is fixed to 1 for this model.
131
+
132
+ Returns:
133
+ trajectory: np.ndarray (B, 1, K, 2) where K = horizon_idx
134
+ scores: np.ndarray (B, 1)
135
+ """
136
+ if isinstance(obs, torch.Tensor):
137
+ obs_np = obs.cpu().numpy()
138
+ else:
139
+ obs_np = np.asarray(obs, dtype=np.float32)
140
+ B = obs_np.shape[0]
141
+
142
+ if B == 1:
143
+ out = self._session.run(None, {"input": obs_np})
144
+ raw = out[0]
145
+ else:
146
+ raw = np.concatenate(
147
+ [self._session.run(None, {"input": obs_np[i:i+1]})[0] for i in range(B)],
148
+ axis=0,
149
+ )
150
+
151
+ # Take x, y columns up to the 4-second horizon
152
+ traj_xy = raw[:, :self.horizon_idx, :2].astype(np.float32)
153
+ # Reshape to (B, 1, K, 2) for uniform interface
154
+ trajectory = traj_xy[:, np.newaxis] * Scaler
155
+ scores = np.ones((B, 1), dtype=np.float32)
156
+ return trajectory, scores
157
+
158
+ def inference_vw(self, obs):
159
+ """Run model → (v, w) velocity commands.
160
+
161
+ Returns:
162
+ vw: torch.Tensor (B, 2)
163
+ """
164
+ trajectory, _ = self.inference_trajectory(obs)
165
+ best_traj = trajectory[:, 0] # (B, K, 2)
166
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
167
+ v, w = self._controller(waypoints, dt=self.dt)
168
+ self._last_best_traj = best_traj
169
+ return torch.stack([v, w], dim=1), best_traj
170
+
MIMIC/mimic.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7557512c791b824a1d693368d590ee6741af8a9aeef7974b763eb912df69d765
3
+ size 318192969
MIMIC/model_info.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: MIMIC
2
+ goal_type: goal_free
3
+ image_resolution: [288, 512]
4
+ context_size: 16
5
+ input_fps: 5.0
6
+ obs_normalize: false
7
+ inputs:
8
+ input: {shape: [1, 16, 3, 288, 512], dtype: float32, description: "16 RGB frames [t-15, ..., t] in [0,1]"}
9
+ outputs:
10
+ output: {shape: [1, 15, 3], description: "15 waypoints x 3 (x, y, yaw) at non-uniform timestamps [1,2,4,6,7,8,10,12,14,15,17,19,21,23,25] @ 5Hz (0.2s-5.0s)"}
11
+ num_waypoints: 15
12
+ output_timestamps_5hz: [1, 2, 4, 6, 7, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25]
13
+ output_fps: 5.0
NoMaD_GL_Official/inference.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """NoMaD-GL-Official inference utilities (3-component diffusion model).
3
+
4
+ NoMaD uses iterative DDPM diffusion (10 steps) with 3 ONNX components:
5
+ 1. vision_encoder: obs + goal_img + goal_mask → conditioning vector
6
+ 2. noise_pred: single denoising step
7
+ 3. dist_pred: distance estimation
8
+
9
+ Goal-free mode: goal_mask=1, goal_img=random noise.
10
+
11
+ Action decode (NavDP / official NoMaD convention):
12
+ Diffusion output is normalized delta-actions clipped to [-1, 1]. To recover
13
+ metric waypoints in the robot frame:
14
+ 1. unnormalize: d = (norm + 1)/2 * (max - min) + min (dataset units)
15
+ 2. cumsum over time → cumulative positions (dataset units)
16
+ 3. multiply by `metric_waypoint_spacing` (m) → meters (robot frame)
17
+
18
+ nav = NoMaDGLNavigator(device="cuda")
19
+ traj, scores = nav.inference_trajectory(obs) # (B, S, 8, 2) meters
20
+ vw = nav.inference_vw(obs) # caches nav._last_best_traj
21
+ """
22
+
23
+ import os
24
+ import math
25
+ import numpy as np
26
+ import torch
27
+ import yaml
28
+ import onnxruntime as ort
29
+ from torchvision import transforms
30
+
31
+ try:
32
+ from urbansim.custom.pp import PurePursuitController
33
+ except Exception:
34
+ PurePursuitController = None # optional — only needed for inference_vw_pp
35
+
36
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
37
+ VE_PATH = os.path.join(MODEL_DIR, "nomad_vision_encoder.onnx")
38
+ NP_PATH = os.path.join(MODEL_DIR, "nomad_noise_pred.onnx")
39
+ DP_PATH = os.path.join(MODEL_DIR, "nomad_dist_pred.onnx")
40
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
41
+ IMG_SIZE = 96
42
+ NUM_DIFFUSION_STEPS = 10
43
+ NUM_SAMPLES = 8
44
+ WAYPOINT_INDEX = 2 # which waypoint to use for v/w
45
+ IMAGE_ASPECT_RATIO = 4 / 3 # NavDP optional center-crop target
46
+
47
+ # Training camera intrinsics (NoMaD recon dataset, from NavDP data_config.yaml).
48
+ # Used by the optional `use_train_intrinsics=True` rectification path which
49
+ # warps the source pinhole view to look like a recon-camera capture squished
50
+ # to 96x96 — i.e. the geometric input distribution the model saw at training.
51
+ TRAIN_CAM_FX = 272.547
52
+ TRAIN_CAM_FY = 266.358
53
+ TRAIN_CAM_CX = 320.0
54
+ TRAIN_CAM_CY = 220.0
55
+ TRAIN_CAM_W = 640 # = 2 * cx
56
+ TRAIN_CAM_H = 440 # = 2 * cy
57
+
58
+ # Default source intrinsics: the urban-sim undistorted pinhole image emitted
59
+ # by test_inference_forward.py:build_undistort_maps. Override at __init__ time
60
+ # if you change the upstream fisheye→pinhole step.
61
+ DEFAULT_SRC_INTRINSICS = {
62
+ "fx": 210.667, "fy": 210.667,
63
+ "cx": 256.0, "cy": 144.0,
64
+ "w": 512, "h": 288,
65
+ }
66
+
67
+ IMAGENET_TRANSFORM = transforms.Compose([
68
+ transforms.ToTensor(),
69
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
70
+ std=[0.229, 0.224, 0.225]),
71
+ ])
72
+
73
+
74
+ class PDController:
75
+ MAX_V = 2.5
76
+ MAX_W = 0.65
77
+
78
+ def __init__(self):
79
+ self.last_v = None
80
+
81
+ def reset(self):
82
+ self.last_v = None
83
+
84
+ def __call__(self, waypoints, dt=1.0):
85
+ EPS = 1e-6
86
+ idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
87
+ dx = waypoints[:, idx, 0]
88
+ dyr = waypoints[:, -1, 1]
89
+ dxr = waypoints[:, -1, 0]
90
+ v = dx / dt
91
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
92
+ * dxr.sign() / dt)
93
+ near_zero = dx.abs() < EPS
94
+ v = torch.where(near_zero, torch.zeros_like(v), v)
95
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
96
+ if self.last_v is not None:
97
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
98
+ v = v.clamp(-self.MAX_V, self.MAX_V)
99
+ w = w.clamp(-self.MAX_W, self.MAX_W)
100
+ self.last_v = v
101
+ return v, w
102
+
103
+
104
+ def _make_session(path):
105
+ ort.set_default_logger_severity(3)
106
+ opts = ort.SessionOptions()
107
+ opts.log_severity_level = 3
108
+ return ort.InferenceSession(
109
+ path, sess_options=opts,
110
+ providers=[
111
+ ("CUDAExecutionProvider",
112
+ {"arena_extend_strategy": "kSameAsRequested"}),
113
+ "CPUExecutionProvider"])
114
+
115
+
116
+ class NoMaDGLNavigator:
117
+ """NoMaD goal-free diffusion navigator.
118
+
119
+ Runs DDPM diffusion loop (10 steps) to generate trajectory samples.
120
+ All coordinates: x=forward, y=left, meters.
121
+ """
122
+
123
+ context_size = 4 # 4 obs frames → 12 channels
124
+ multimodal = True
125
+
126
+ def __init__(
127
+ self,
128
+ device: str = "cuda",
129
+ max_v: float = 2.5,
130
+ max_w: float = 0.65,
131
+ dt: float = 2.0,
132
+ num_samples: int = NUM_SAMPLES,
133
+ num_diffusion_steps: int = NUM_DIFFUSION_STEPS,
134
+ center_crop: bool = True,
135
+ use_train_intrinsics: bool = True,
136
+ src_intrinsics: dict = None,
137
+ ):
138
+ self.device = device
139
+ self.dt = dt
140
+ self.num_samples = num_samples
141
+ self.num_steps = num_diffusion_steps
142
+ # NavDP default is center_crop=False (direct resize). Flag exposed for
143
+ # A/B testing: True crops landscape input to 4:3 before resizing.
144
+ self.center_crop = center_crop
145
+
146
+ # Optional virtual rectification to the NoMaD training camera (recon).
147
+ # When True, the source pinhole frame is warped (affine, same math as
148
+ # a homography between two pinhole cameras) so that its content lines
149
+ # up with what a recon-camera frame squished to 96x96 would look like.
150
+ self.use_train_intrinsics = use_train_intrinsics
151
+ self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
152
+ self._train_affine = None # built lazily in _ensure_train_affine
153
+
154
+ self._ve = _make_session(VE_PATH)
155
+ self._np = _make_session(NP_PATH)
156
+ self._dp = _make_session(DP_PATH)
157
+ with open(INFO_PATH, "r") as f:
158
+ self._info = yaml.safe_load(f)
159
+
160
+ # Action decoder params (NavDP convention)
161
+ action_stats = self._info.get("action_stats", {})
162
+ self._act_min = np.asarray(
163
+ action_stats.get("min", [0.0, -4.0]), dtype=np.float32)
164
+ self._act_max = np.asarray(
165
+ action_stats.get("max", [5.0, 4.0]), dtype=np.float32)
166
+ self._normalize_actions = bool(self._info.get("normalize", True))
167
+ self._waypoint_spacing = float(
168
+ self._info.get("metric_waypoint_spacing", 0.25))
169
+ # Model's expected observation frame rate (Hz). NoMaD was trained at
170
+ # 3 Hz; callers should downsample input video to this rate.
171
+ self.model_fps = float(self._info.get("model_fps", 3.0))
172
+
173
+ self._controller = PDController()
174
+ self._controller.MAX_V = max_v
175
+ self._controller.MAX_W = max_w
176
+
177
+ # Cache slot for visualization — last (B, 8, 2) trajectory chosen by
178
+ # the action head. Matches Coco's contract so downstream viewers
179
+ # (`model._last_best_traj`) just work.
180
+ self._last_best_traj = None
181
+
182
+ # Optional pure-pursuit controller for `inference_vw_pp`.
183
+ self._pp = (PurePursuitController(action_dt=3, waypoint_index=5)
184
+ if PurePursuitController is not None else None)
185
+
186
+ # Build DDPM schedule (squaredcos_cap_v2)
187
+ self._build_schedule()
188
+
189
+ def _build_schedule(self):
190
+ """Pre-compute DDPM alpha/beta schedule (matches diffusers squaredcos_cap_v2)."""
191
+ T = self.num_steps
192
+
193
+ def alpha_bar(t):
194
+ return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
195
+
196
+ betas = []
197
+ for i in range(T):
198
+ t1 = i / T
199
+ t2 = (i + 1) / T
200
+ b = min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999)
201
+ betas.append(b)
202
+ betas = np.array(betas, dtype=np.float64)
203
+ alphas = 1.0 - betas
204
+ alpha_cumprod = np.cumprod(alphas)
205
+
206
+ self._betas = betas
207
+ self._alpha_cumprod = alpha_cumprod
208
+ self._sqrt_alpha = np.sqrt(alphas)
209
+ self._sqrt_one_minus_alpha_cumprod = np.sqrt(1.0 - alpha_cumprod)
210
+ # For x0 prediction clipping
211
+ self._sqrt_recip_alpha_cumprod = np.sqrt(1.0 / alpha_cumprod)
212
+ self._sqrt_recip_alpha_cumprod_m1 = np.sqrt(1.0 / alpha_cumprod - 1.0)
213
+
214
+ def _ddpm_step(self, sample, noise_pred, t):
215
+ """Single DDPM reverse step: x_{t-1} from x_t and predicted noise."""
216
+ alpha_prod_t = self._alpha_cumprod[t]
217
+ beta_t = self._betas[t]
218
+
219
+ # Predict x0
220
+ x0_pred = (sample - math.sqrt(1.0 - alpha_prod_t) * noise_pred) / math.sqrt(alpha_prod_t)
221
+ x0_pred = np.clip(x0_pred, -1.0, 1.0)
222
+
223
+ # Compute mean of q(x_{t-1} | x_t, x_0)
224
+ if t > 0:
225
+ alpha_prod_prev = self._alpha_cumprod[t - 1]
226
+ beta_prod_t = 1.0 - alpha_prod_t
227
+ beta_prod_prev = 1.0 - alpha_prod_prev
228
+ coeff_x0 = math.sqrt(alpha_prod_prev) * beta_t / beta_prod_t
229
+ coeff_xt = math.sqrt(1.0 - beta_t) * beta_prod_prev / beta_prod_t
230
+ mean = coeff_x0 * x0_pred + coeff_xt * sample
231
+ # Variance
232
+ variance = beta_t * beta_prod_prev / beta_prod_t
233
+ noise = np.random.randn(*sample.shape).astype(np.float32)
234
+ return (mean + math.sqrt(variance) * noise).astype(np.float32)
235
+ else:
236
+ return x0_pred.astype(np.float32)
237
+
238
+ def reset(self):
239
+ self._controller.reset()
240
+ self._last_best_traj = None
241
+
242
+ def _ensure_train_affine(self):
243
+ """Build PIL affine coeffs mapping output (96x96) pixels → source pixels.
244
+
245
+ Both cameras are pinhole, so the ray (X/Z, Y/Z) for a target pixel
246
+ (u_t, v_t) is ((u_t - cx_t)/fx_t, (v_t - cy_t)/fy_t). Re-projecting that
247
+ ray into the source camera gives (u_s, v_s) = (ray_x * fx_s + cx_s,
248
+ ray_y * fy_s + cy_s). Substituting yields an affine map:
249
+
250
+ u_s = (fx_s / fx_t) * u_t + (cx_s - cx_t * fx_s / fx_t)
251
+ v_s = (fy_s / fy_t) * v_t + (cy_s - cy_t * fy_s / fy_t)
252
+
253
+ PIL.Image.transform expects (a, b, c, d, e, f) where:
254
+ u_s = a*u_t + b*v_t + c
255
+ v_s = d*u_t + e*v_t + f
256
+ """
257
+ if self._train_affine is not None:
258
+ return
259
+ # Effective recon intrinsics when its 640x440 native frame is squished
260
+ # to 96x96 (which is what the model saw during training).
261
+ fx_t = TRAIN_CAM_FX * IMG_SIZE / TRAIN_CAM_W
262
+ fy_t = TRAIN_CAM_FY * IMG_SIZE / TRAIN_CAM_H
263
+ cx_t = TRAIN_CAM_CX * IMG_SIZE / TRAIN_CAM_W
264
+ cy_t = TRAIN_CAM_CY * IMG_SIZE / TRAIN_CAM_H
265
+
266
+ src = self.src_intrinsics
267
+ a = src["fx"] / fx_t
268
+ e = src["fy"] / fy_t
269
+ c = src["cx"] - cx_t * a
270
+ f = src["cy"] - cy_t * e
271
+ self._train_affine = (a, 0.0, c, 0.0, e, f)
272
+
273
+ def _to_model_input_uint8(self, frame_uint8):
274
+ """Apply preprocessing to one (H, W, 3) uint8 RGB frame → (96, 96, 3) uint8.
275
+
276
+ Used for both the actual model input and for visualization (panel 2).
277
+ Does NOT apply ImageNet normalization.
278
+ """
279
+ from PIL import Image as PILImage
280
+ pil = PILImage.fromarray(frame_uint8)
281
+ if self.use_train_intrinsics:
282
+ self._ensure_train_affine()
283
+ pil = pil.transform(
284
+ (IMG_SIZE, IMG_SIZE), PILImage.AFFINE, self._train_affine,
285
+ resample=PILImage.BILINEAR, fillcolor=0)
286
+ return np.array(pil)
287
+ if self.center_crop:
288
+ w, h = pil.size
289
+ if w > h:
290
+ crop_w = int(h * IMAGE_ASPECT_RATIO)
291
+ x0 = max((w - crop_w) // 2, 0)
292
+ pil = pil.crop((x0, 0, x0 + crop_w, h))
293
+ else:
294
+ crop_h = int(w / IMAGE_ASPECT_RATIO)
295
+ y0 = max((h - crop_h) // 2, 0)
296
+ pil = pil.crop((0, y0, w, y0 + crop_h))
297
+ pil = pil.resize((IMG_SIZE, IMG_SIZE))
298
+ return np.array(pil)
299
+
300
+ def _preprocess_images(self, obs_rgb):
301
+ """obs_rgb: (B, T=4, 3, H, W) float32 [0,1] → (B, 12, 96, 96) ImageNet-norm."""
302
+ from PIL import Image as PILImage
303
+ B, T = obs_rgb.shape[:2]
304
+ frames = []
305
+ for b in range(B):
306
+ ch_list = []
307
+ for t in range(T):
308
+ frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
309
+ model_in = self._to_model_input_uint8(frame)
310
+ pil = PILImage.fromarray(model_in)
311
+ ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
312
+ frames.append(np.concatenate(ch_list, axis=0))
313
+ return np.stack(frames) # (B, 12, 96, 96)
314
+
315
+ def _decode_actions(self, sample):
316
+ """Decode normalized deltas → metric waypoints (NavDP recipe).
317
+
318
+ sample: (N, T, 2) normalized deltas in [-1, 1]
319
+ Returns: (N, T, 2) cumulative positions in meters (x=fwd, y=left).
320
+ """
321
+ # 1. Unnormalize from [-1, 1] → [min, max] (dataset units).
322
+ deltas = (sample + 1.0) / 2.0
323
+ deltas = deltas * (self._act_max - self._act_min) + self._act_min
324
+ # 2. Cumulative sum over time.
325
+ positions = np.cumsum(deltas, axis=1)
326
+ # 3. Scale by metric_waypoint_spacing (only when training-time normalize=True).
327
+ if self._normalize_actions:
328
+ positions = positions * self._waypoint_spacing
329
+ return positions.astype(np.float32)
330
+
331
+ def inference_trajectory(self, obs):
332
+ """Run diffusion model → (trajectories, scores).
333
+
334
+ Args:
335
+ obs: (B, 4, 3, H, W) float32 in [0,1]. 4 frames.
336
+
337
+ Returns:
338
+ trajectory: np.ndarray (B, num_samples, 8, 2) — METERS, robot frame
339
+ (x=forward, y=left). Decoded from normalized deltas via
340
+ unnorm → cumsum → scale.
341
+ scores: np.ndarray (B, num_samples) — uniform (no scoring in goal-free)
342
+ """
343
+ if isinstance(obs, torch.Tensor):
344
+ obs_np = obs.cpu().numpy()
345
+ else:
346
+ obs_np = np.asarray(obs, dtype=np.float32)
347
+ B = obs_np.shape[0]
348
+
349
+ obs_norm = self._preprocess_images(obs_np) # (B, 12, 96, 96)
350
+
351
+ # Goal-free: random goal image + mask=1
352
+ fake_goal = np.random.randn(B, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
353
+ mask = np.ones(B, dtype=np.int64)
354
+
355
+ # 1. Vision encoder
356
+ cond = self._ve.run(None, {
357
+ "obs_img": obs_norm,
358
+ "goal_img": fake_goal,
359
+ "goal_mask": mask,
360
+ })[0] # (B, 256)
361
+
362
+ # Replicate for num_samples
363
+ cond_rep = np.repeat(cond, self.num_samples, axis=0) # (B*S, 256)
364
+
365
+ # 2. Diffusion denoising loop (output: normalized deltas in [-1,1]).
366
+ BS = B * self.num_samples
367
+ sample = np.random.randn(BS, 8, 2).astype(np.float32)
368
+
369
+ for t in reversed(range(self.num_steps)):
370
+ t_arr = np.array(t, dtype=np.int64)
371
+ # Suppress stderr from ORT during inference
372
+ devnull = os.open(os.devnull, os.O_WRONLY)
373
+ old_stderr = os.dup(2)
374
+ os.dup2(devnull, 2)
375
+ try:
376
+ noise_pred = self._np.run(None, {
377
+ "sample": sample,
378
+ "timestep": t_arr,
379
+ "global_cond": cond_rep,
380
+ })[0]
381
+ finally:
382
+ os.dup2(old_stderr, 2)
383
+ os.close(devnull)
384
+ os.close(old_stderr)
385
+
386
+ sample = self._ddpm_step(sample, noise_pred, t)
387
+
388
+ # 3. Decode normalized deltas → metric waypoints.
389
+ waypoints = self._decode_actions(sample) # (BS, 8, 2) meters
390
+
391
+ trajectory = waypoints.reshape(B, self.num_samples, 8, 2)
392
+ scores = np.ones((B, self.num_samples), dtype=np.float32)
393
+ return trajectory, scores
394
+
395
+ def inference_vw(self, obs):
396
+ """Run model → (v, w) velocity commands.
397
+
398
+ Picks the first diffusion sample (goal-free has uniform scores, so any
399
+ sample is equally valid), runs the PD controller on its waypoints, and
400
+ caches the chosen trajectory on ``self._last_best_traj`` so visualizers
401
+ can pull it without re-running inference.
402
+
403
+ Args:
404
+ obs: (B, 4, 3, H, W) float32 in [0, 1]
405
+
406
+ Returns:
407
+ vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
408
+ """
409
+ trajectory, _ = self.inference_trajectory(obs)
410
+ best_traj = trajectory[:, 0] # (B, 8, 2)
411
+ self._last_best_traj = best_traj # cached for visualization
412
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
413
+ v, w = self._controller(waypoints, dt=self.dt)
414
+ return torch.stack([v, w], dim=1), best_traj
415
+
416
+ def inference_vw_pp(self, obs, robot):
417
+ """Pure-pursuit variant of :meth:`inference_vw`.
418
+
419
+ Picks the first diffusion sample's waypoints and feeds them to the
420
+ urbansim ``PurePursuitController`` instead of the PD head. Same return
421
+ contract as :meth:`inference_vw`.
422
+
423
+ Args:
424
+ obs: (B, 4, 3, H, W) float32 in [0, 1]
425
+ robot: IsaacLab articulation handle (``env.scene["robot"]``).
426
+
427
+ Returns:
428
+ vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
429
+ """
430
+ if self._pp is None:
431
+ raise RuntimeError(
432
+ "PurePursuitController unavailable — install urbansim or "
433
+ "make sure `urbansim.custom.pp` is on the Python path."
434
+ )
435
+ trajectory, _ = self.inference_trajectory(obs)
436
+ best_traj = trajectory[:, 0] # (B, 8, 2)
437
+ self._last_best_traj = best_traj # cached for visualization
438
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
439
+ v, w = self._pp.step(waypoints, robot=robot)
440
+ return torch.stack([v, w], dim=1)
NoMaD_GL_Official/model_info.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: NoMaD-GL-Official
2
+ goal_type: goal_less
3
+ image_resolution: [96, 96]
4
+ context_size: 4
5
+ obs_normalize: true
6
+ diffusion_steps: 10
7
+ diffusion_scheduler: DDPMScheduler
8
+ diffusion_config:
9
+ beta_schedule: squaredcos_cap_v2
10
+ clip_sample: true
11
+ prediction_type: epsilon
12
+ inputs:
13
+ obs_img: {shape: [batch, 12, 96, 96], dtype: float32, description: "4 RGB frames x 3 channels, ImageNet normalized"}
14
+ goal_img: {shape: [batch, 3, 96, 96], dtype: float32, description: "Goal image (random noise for goal-free)"}
15
+ goal_mask: {shape: [batch], dtype: int64, description: "1=goal-free, 0=goal-conditioned"}
16
+ outputs:
17
+ trajectories: {shape: [num_samples, 8, 2], description: "8 normalized delta-actions in [-1,1]; decode via unnorm→cumsum→scale"}
18
+ distances: {shape: [batch, 1], description: "Predicted distance to goal"}
19
+ num_waypoints: 8
20
+ num_samples: 8
21
+ waypoint_index: 2
22
+
23
+ # NoMaD action decoder (NavDP convention)
24
+ # 1. unnormalize: d = (norm + 1) / 2 * (max - min) + min
25
+ # 2. cumsum over time → cumulative positions in dataset units
26
+ # 3. multiply by metric_waypoint_spacing → meters in robot frame (x=fwd, y=left)
27
+ normalize: true
28
+ action_stats:
29
+ min: [0.0, -4.0] # [min_dx, min_dy] in dataset waypoint-spacing units
30
+ max: [5.0, 4.0] # [max_dx, max_dy]
31
+ # Per-step metric spacing (meters): model_max_v / model_rate = 0.8 / 3.0
32
+ model_max_v: 0.8
33
+ model_rate: 3.0
34
+ metric_waypoint_spacing: 0.26667
35
+
36
+ # Inference frame rate (Hz). NoMaD was trained at 3 Hz observation rate; the
37
+ # 4-frame context spans ~1.33 s and each predicted waypoint is 1/model_fps s
38
+ # into the future. Upstream callers should downsample their input video to
39
+ # match this — see test_inference_forward.py.
40
+ model_fps: 3.0
NoMaD_GL_Official/nomad_dist_pred.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adab58bef61d7b8d0f2bf04636853f0056229e06634e2536a5e7821d6b9def4f
3
+ size 71682
NoMaD_GL_Official/nomad_dist_pred.onnx.data ADDED
Binary file (69.6 kB). View file
 
NoMaD_GL_Official/nomad_noise_pred.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3588e1ff53ba748240f160d3cd3fec0c9c987f05e59ec64a631c3536e036e34
3
+ size 15554469
NoMaD_GL_Official/nomad_vision_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3477772d7ff671dbba9711c58b2df89deb9b1b6a93cae07c530524ab6d47377
3
+ size 47986697
NoMaD_GL_Official/nomad_vision_encoder.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ceee7e8f0777b6e5cb444cddb1e17549e8188c78ba414463696b130031b320
3
+ size 47448064
README.md CHANGED
@@ -1,3 +1,235 @@
1
  ---
2
  license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ tags:
4
+ - robotics
5
+ - navigation
6
+ - visual-navigation
7
+ - embodied-ai
8
+ - onnx
9
+ pipeline_tag: robotics
10
  ---
11
+
12
+ # Navigation Model Zoo
13
+
14
+ A collection of vision-based navigation policies exported to **ONNX**, each wrapped in a small,
15
+ uniform Python inference API. Maintained by **Honglin He @ UCLA-VAIL**.
16
+
17
+ Every model takes a short history of RGB frames and predicts a local trajectory (and optionally a
18
+ distance-to-goal / arrival signal); a built-in PD controller turns the trajectory into `(v, ω)`
19
+ velocity commands. All models share the same wrapper interface so they can be swapped and
20
+ benchmarked without per-model glue code.
21
+
22
+ ## Models
23
+
24
+ | Folder | Model / paper | Goal mode | Context | Input H×W | Waypoints | Weights |
25
+ |--------|---------------|-----------|:-------:|:---------:|:---------:|---------|
26
+ | [`GNM_GL_Official`](GNM_GL_Official) | [GNM](https://arxiv.org/abs/2210.03370) · ICRA&nbsp;2023 | goal-free | 6 | 64×85 | 5 | `gnm_imagegoal.onnx` (+`.data`) · 35 MB |
27
+ | [`Vint_GL_Official`](Vint_GL_Official) | [ViNT](https://arxiv.org/abs/2306.14846) · CoRL&nbsp;2023 | goal-free | 6 | 64×85 | 5 | `vint_imagegoal.onnx` (+`.data`) · 97 MB |
28
+ | [`NoMaD_GL_Official`](NoMaD_GL_Official) | [NoMaD](https://arxiv.org/abs/2310.07896) · ICRA&nbsp;2024 | goal-free (diffusion) | 4 | 96×96 | 8 ×8 samples | 3× `.onnx` (+`.data`) · 111 MB |
29
+ | [`CityWalker_PG_Official`](CityWalker_PG_Official) | [CityWalker](https://arxiv.org/abs/2411.17820) · CVPR&nbsp;2025 | point-goal | 5 | 350×630 | 5 | `citywalker.onnx` · 806 MB |
30
+ | [`MBRA_PG_Official`](MBRA_PG_Official) | [MBRA](https://arxiv.org/abs/2505.05592) · RA-L&nbsp;2025 | point-goal | 6 | 96×96 | 8 | `mbra.onnx` · 254 MB |
31
+ | [`S2E`](S2E) | [S2E](https://arxiv.org/abs/2507.22028) · ICLR&nbsp;2026 | point-goal / goal-free | 11 | 256×256 | 10 | `s2e.onnx` · 382 MB |
32
+ | [`MIMIC`](MIMIC) | [MIMIC](https://arxiv.org/abs/2603.22527) · ICRA&nbsp;2026 | goal-free | 16 | 288×512 | 13 | `mimic.onnx` · 318 MB |
33
+
34
+ Suffix legend: `PG` = point-goal, `GL` = goal-less (goal-free). Models with a `.onnx.data` companion
35
+ (GNM, ViNT, NoMaD) use ONNX external weights — keep each `.onnx` and its `.onnx.data` together.
36
+
37
+ ## Common interface
38
+
39
+ Each folder is a self-contained module exposing one navigator class. They all follow the same contract:
40
+
41
+ ```python
42
+ import numpy as np
43
+ from MBRA_PG_Official.inference import MBRAPGNavigator # run from the repo root
44
+
45
+ nav = MBRAPGNavigator(device="cuda") # use device="cpu" if you have no GPU
46
+
47
+ # obs: (B, nav.context_size, 3, H, W) float32 in [0, 1]
48
+ # the wrapper resizes & normalizes to the model's spec internally
49
+ obs = np.random.rand(1, nav.context_size, 3, 96, 96).astype(np.float32)
50
+
51
+ # Point-goal models take goal_xy (standard frame: x=forward, y=left, meters);
52
+ # goal-free models omit it.
53
+ traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2])) # (B, M, W, 2) meters
54
+ vw, best = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2])) # vw: (B, 2) = [v, ω]
55
+
56
+ nav.reset() # clears PD-controller velocity smoothing between episodes
57
+ ```
58
+
59
+ Conventions shared by every model:
60
+
61
+ - **Coordinate frame** — all user-facing inputs/outputs are *standard frame*: `x = forward`, `y = left`, in meters. Models with a different internal convention (e.g. CityWalker) convert transparently.
62
+ - **Observations** — `(B, context_size, 3, H, W)`, `float32`, pixel values in `[0, 1]`. The wrapper handles resize and any ImageNet normalization. *(Exception: `MIMIC` expects frames already at 288×512 and does not resize.)*
63
+ - **`inference_trajectory(obs[, goal_xy])`** → `(trajectory, scores)`. `trajectory` is `(B, M, W, 2)` in meters, where `M` is the number of modes (1 for unimodal, 8 for NoMaD) and `W` the waypoint count; `scores` is `(B, M)`.
64
+ - **`inference_vw(obs[, goal_xy])`** → `(vw, best_traj)` where `vw` is a `(B, 2)` torch tensor of `[linear_v, angular_w]`. Tune limits with `max_v` / `max_w` at construction.
65
+ - Goal-free models (`Vint`, `GNM`, `NoMaD`, `MIMIC`) ignore `goal_xy` — call `inference_trajectory(obs)`.
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install onnxruntime-gpu numpy torch torchvision pyyaml pillow
71
+ # CPU-only: use onnxruntime instead of onnxruntime-gpu
72
+ pip install opencv-python # required by S2E (frame resizing)
73
+ ```
74
+
75
+ Optional, lab-internal dependency: `Vint`, `GNM`, and `NoMaD` expose an extra `inference_vw_pp()`
76
+ method that uses `urbansim.custom.pp.PurePursuitController`; it is imported lazily and only needed
77
+ for that method. **`MIMIC` imports `urbansim` at module load**, so its `inference.py` will not import
78
+ without the `urbansim` package on your path.
79
+
80
+ ## Model details
81
+
82
+ ### GNM_GL_Official — `gnm_imagegoal.onnx` (+ `.onnx.data`)
83
+ **Paper:** *GNM: A General Navigation Model to Drive Any Robot* (ICRA 2023) · [arXiv:2210.03370](https://arxiv.org/abs/2210.03370) · [code](https://github.com/robodhruv/drive-any-robot)
84
+
85
+ Goal-free General Navigation Model — same NavDP image-goal I/O contract as ViNT (`obs_img (B,18,64,85)` + `goal_img (B,3,64,85)` → `dist_pred (B,1)`, `action_pred (B,5,4)`), with a lower top speed. Expects input downsampled to ≈ 3 Hz.
86
+
87
+ ### Vint_GL_Official — `vint_imagegoal.onnx` (+ `.onnx.data`)
88
+ **Paper:** *ViNT: A Foundation Model for Visual Navigation* (CoRL 2023) · [arXiv:2306.14846](https://arxiv.org/abs/2306.14846) · [project](https://general-navigation-models.github.io/vint/)
89
+
90
+ Goal-free ViNT (NavDP image-goal backbone run with a random goal image). **ONNX I/O:** `obs_img (B,18,64,85)` (6 ImageNet-normalized frames × 3 ch) + `goal_img (B,3,64,85)` (random noise) → `dist_pred (B,1)`, `action_pred (B,5,4)`. Cumulative `xy` is already baked in; the wrapper scales by the 0.8 m metric spacing. Expects input downsampled to ≈ 3 Hz.
91
+
92
+ ### NoMaD_GL_Official — 3× ONNX (diffusion, + `.onnx.data`)
93
+ **Paper:** *NoMaD: Goal Masked Diffusion Policies for Navigation and Exploration* (ICRA 2024) · [arXiv:2310.07896](https://arxiv.org/abs/2310.07896) · [project](https://general-navigation-models.github.io/nomad/)
94
+
95
+ Goal-free diffusion policy. Runs a 10-step DDPM loop (`squaredcos_cap_v2`) over 3 components:
96
+ `nomad_vision_encoder.onnx` (`obs_img (B,12,96,96)` + `goal_img (B,3,96,96)` + `goal_mask (B)` → `cond (B,256)`), `nomad_noise_pred.onnx` (one denoising step), and `nomad_dist_pred.onnx`. Produces **8 trajectory samples** → `trajectory (B,8,8,2)` meters (decode: unnormalize → cumsum → ×0.267 m spacing). This is the only multi-modal model and the slowest (diffusion + multiple samples).
97
+
98
+ ### CityWalker_PG_Official — `citywalker.onnx`
99
+ **Paper:** *CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos* (CVPR 2025) · [arXiv:2411.17820](https://arxiv.org/abs/2411.17820) · [project](https://ai4ce.github.io/CityWalker/)
100
+
101
+ Point-goal urban walker. **ONNX I/O:** `obs_images (B,5,3,350,630)` + `trajectory (B,6,2)` past waypoints → `wp_pred (B,5,2)`, `arrive_pred (B,1)` (arrival probability). Images are ImageNet-normalized internally; the model's internal `y=forward, x=right` frame is converted to standard frame by the wrapper. Input rate ≈ 5 Hz.
102
+
103
+ ### MBRA_PG_Official — `mbra.onnx`
104
+ **Paper:** *Learning to Drive Anywhere with Model-Based Reannotation* (RA-L 2025) · [arXiv:2505.05592](https://arxiv.org/abs/2505.05592) · [project](https://model-base-reannotation.github.io/)
105
+
106
+ Point-goal policy. **ONNX I/O:** `obs_images (B,6,3,96,96)` ImageNet-normalized + `goal_pose (B,4)` = `[x, y, sin(yaw), cos(yaw)]` → `waypoints (B,8,4)`. Goal is given as `goal_xy` (meters) and converted internally; waypoints are un-normalized by a 0.8 m metric spacing. Input rate ≈ 5 Hz.
107
+
108
+ ### S2E — `s2e.onnx`
109
+ **Paper:** *From Seeing to Experiencing: Scaling Navigation Foundation Models with Reinforcement Learning* (ICLR 2026) · [arXiv:2507.22028](https://arxiv.org/abs/2507.22028) · [project](https://metadriverse.github.io/s2e)
110
+
111
+ UCLA-VAIL navigation foundation model; this is the behavior-cloning, point-goal, web-pretrained variant (`S2EBC-PG-Web100`). **ONNX I/O:** `obs_images (B,11,3,256,256)` in `[0,1]` (no ImageNet norm) + `goal (B,3)` = `[norm_dist, cos(θ), sin(θ)]` → `wp_pred (B,10,3)` `[x,y,yaw]`, `wp_pred_score (B,63)` mode scores. Frames are resized to 256×256 with OpenCV.
112
+
113
+ ### MIMIC — `mimic.onnx`
114
+ **Paper:** *Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion* (ICRA 2026) · [arXiv:2603.22527](https://arxiv.org/abs/2603.22527) · [project](https://vail-ucla.github.io/MIMIC)
115
+
116
+ UCLA-VAIL goal-free long-context sidewalk policy. **ONNX I/O:** `input (1,16,3,288,512)` in `[0,1]` → `output (1,15,3)` `[x,y,yaw]` at non-uniform timestamps (0.2 s–5.0 s @ 5 Hz). Batch is processed one sample at a time; the wrapper keeps the first 13 waypoints (~4 s) and scales to meters. Requires `urbansim` (see Installation).
117
+
118
+ ## Downloading
119
+
120
+ **Full repo** (includes the LFS-tracked ONNX weights):
121
+ ```bash
122
+ hf download UCLA-VAIL/Navigation-Model-Zoo-Public --local-dir ./Navigation-Model-Zoo-Public
123
+ ```
124
+
125
+ **One model** — fetch just its folder, e.g. MBRA:
126
+ ```bash
127
+ hf download UCLA-VAIL/Navigation-Model-Zoo-Public \
128
+ --include "MBRA_PG_Official/*" --local-dir .
129
+ ```
130
+
131
+ Then run from the repo root: `from MBRA_PG_Official.inference import MBRAPGNavigator`.
132
+
133
+ > **External weights:** GNM, ViNT, and NoMaD ship `*.onnx.data` files — keep each `.onnx` and its
134
+ > `.onnx.data` together in the same folder so ONNX Runtime can resolve the weights.
135
+
136
+ ## Intended use & limitations
137
+
138
+ These are **research artifacts** for navigation research, reproduction, and benchmarking — not
139
+ safety-validated for deployment on real robots without additional testing. Each policy's behavior
140
+ is bounded by its training distribution (camera intrinsics, embodiment, frame rate, environment).
141
+ Several wrappers rectify/resize inputs to a specific training camera; mismatched cameras may degrade
142
+ performance.
143
+
144
+ ## License
145
+
146
+ Released under **Apache 2.0**. Individual models carry the licenses and terms of their original
147
+ sources (ViNT, GNM, NoMaD, CityWalker, MBRA) — check upstream before commercial use.
148
+
149
+ ## Citation
150
+
151
+ If you use a model from this zoo, please cite its original paper.
152
+
153
+ **GNM**
154
+ ```bibtex
155
+ @inproceedings{shah2023gnm,
156
+ title={Gnm: A general navigation model to drive any robot},
157
+ author={Shah, Dhruv and Sridhar, Ajay and Bhorkar, Arjun and Hirose, Noriaki and Levine, Sergey},
158
+ booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)},
159
+ pages={7226--7233},
160
+ year={2023},
161
+ organization={IEEE}
162
+ }
163
+ ```
164
+
165
+ **ViNT**
166
+ ```bibtex
167
+ @article{shah2023vint,
168
+ title={ViNT: A foundation model for visual navigation},
169
+ author={Shah, Dhruv and Sridhar, Ajay and Dashora, Nitish and Stachowicz, Kyle and Black, Kevin and Hirose, Noriaki and Levine, Sergey},
170
+ journal={arXiv preprint arXiv:2306.14846},
171
+ year={2023}
172
+ }
173
+ ```
174
+
175
+ **NoMaD**
176
+ ```bibtex
177
+ @inproceedings{sridhar2024nomad,
178
+ title={Nomad: Goal masked diffusion policies for navigation and exploration},
179
+ author={Sridhar, Ajay and Shah, Dhruv and Glossop, Catherine and Levine, Sergey},
180
+ booktitle={2024 IEEE International Conference on Robotics and Automation (ICRA)},
181
+ pages={63--70},
182
+ year={2024},
183
+ organization={IEEE}
184
+ }
185
+ ```
186
+
187
+ **CityWalker**
188
+ ```bibtex
189
+ @inproceedings{liu2025citywalker,
190
+ title={Citywalker: Learning embodied urban navigation from web-scale videos},
191
+ author={Liu, Xinhao and Li, Jintong and Jiang, Yicheng and Sujay, Niranjan and Yang, Zhicheng and Zhang, Juexiao and Abanes, John and Zhang, Jing and Feng, Chen},
192
+ booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},
193
+ pages={6875--6885},
194
+ year={2025}
195
+ }
196
+ ```
197
+
198
+ **MBRA**
199
+ ```bibtex
200
+ @article{hirose2025learning,
201
+ title={Learning to drive anywhere with model-based reannotation},
202
+ author={Hirose, Noriaki and Ignatova, Lydia and Stachowicz, Kyle and Glossop, Catherine and Levine, Sergey and Shah, Dhruv},
203
+ journal={IEEE Robotics and Automation Letters},
204
+ volume={11},
205
+ number={2},
206
+ pages={1242--1249},
207
+ year={2025},
208
+ publisher={IEEE}
209
+ }
210
+ ```
211
+
212
+ **S2E**
213
+ ```bibtex
214
+ @article{he2025seeing,
215
+ title={From seeing to experiencing: Scaling navigation foundation models with reinforcement learning},
216
+ author={He, Honglin and Ma, Yukai and Squicciarini, Brad and Wu, Wayne and Zhou, Bolei},
217
+ journal={arXiv preprint arXiv:2507.22028},
218
+ year={2025}
219
+ }
220
+ ```
221
+
222
+ **MIMIC**
223
+ ```bibtex
224
+ @article{he2026learning,
225
+ title={Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion},
226
+ author={He, Honglin and Ma, Yukai and Squicciarini, Brad and Wu, Wayne and Zhou, Bolei},
227
+ journal={arXiv preprint arXiv:2603.22527},
228
+ year={2026}
229
+ }
230
+ ```
231
+
232
+ ## Contact
233
+
234
+ Maintained by [UCLA-VAIL](https://vail-ucla.github.io/). Open an issue/discussion on the
235
+ repository page for questions or contributions.
S2E/inference.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """S2EBC-PG-Web100 inference utilities.
3
+
4
+ S2EBC-PG takes 11 frames at 256x256 (no ImageNet normalization, [0,1]),
5
+ a goal vector [norm_dist, cos(angle), sin(angle)], and outputs
6
+ 10 waypoints x 3 (x, y, yaw). Waypoints are scaled by 1/0.1 = 10x.
7
+
8
+ User provides goal_xy in standard frame (x=forward, y=left, meters).
9
+
10
+ nav = S2EBCPGNavigator(device="cuda")
11
+ traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
12
+ vw = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
13
+ """
14
+
15
+ import os
16
+ import math
17
+ import numpy as np
18
+ import torch
19
+ import onnxruntime as ort
20
+
21
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
22
+ ONNX_PATH = os.path.join(MODEL_DIR, "s2e.onnx")
23
+ IMG_SIZE = 256
24
+ WP_SCALE = 0.25 # model outputs are divided by this to get meters
25
+
26
+
27
+ class PDController:
28
+ MAX_V = 2.5
29
+ MAX_W = 0.65
30
+
31
+ def __init__(self):
32
+ self.last_v = None
33
+
34
+ def reset(self):
35
+ self.last_v = None
36
+
37
+ def __call__(self, waypoints, dt=1.0):
38
+ EPS = 1e-6
39
+ idx = min(4, waypoints.shape[1] - 1)
40
+ dx = waypoints[:, idx, 0]
41
+ dyr = waypoints[:, -1, 1]
42
+ dxr = waypoints[:, -1, 0]
43
+ v = dx / dt
44
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
45
+ * dxr.sign() / dt )
46
+ near_zero = dx.abs() < EPS
47
+ v = torch.where(near_zero, torch.zeros_like(v), v)
48
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
49
+ if self.last_v is not None:
50
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
51
+ v = v.clamp(-self.MAX_V, self.MAX_V)
52
+ w = w.clamp(-self.MAX_W, self.MAX_W)
53
+ self.last_v = v
54
+ return v, w
55
+
56
+
57
+ class S2ENavigator:
58
+ """S2E point-goal / goal-free navigator.
59
+
60
+ All user-facing coordinates: x=forward, y=left, meters.
61
+ """
62
+
63
+ context_size = 11
64
+ multimodal = False
65
+
66
+ def __init__(
67
+ self,
68
+ onnx_path: str = ONNX_PATH,
69
+ device: str = "cuda",
70
+ max_v: float = 2.5,
71
+ max_w: float = 0.65,
72
+ dt: float = 2.0,
73
+ ):
74
+ self.device = device
75
+ self.dt = dt
76
+
77
+ ort.set_default_logger_severity(3)
78
+ providers = [
79
+ ("CUDAExecutionProvider",
80
+ {"arena_extend_strategy": "kSameAsRequested"}),
81
+ "CPUExecutionProvider",
82
+ ]
83
+ self._session = ort.InferenceSession(onnx_path, providers=providers)
84
+ self._controller = PDController()
85
+ self._controller.MAX_V = max_v
86
+ self._controller.MAX_W = max_w
87
+ self._last_best_traj = None
88
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
89
+ import yaml
90
+ with open(INFO_PATH, "r") as f:
91
+ self._info = yaml.safe_load(f)
92
+
93
+ def reset(self):
94
+ self._controller.reset()
95
+ self._last_best_traj = None
96
+
97
+ @staticmethod
98
+ def _goal_to_input(goal_xy):
99
+ """Standard goal [x_fwd, y_left] meters → model input [norm_dist, cos(θ), sin(θ)]."""
100
+ x, y = float(goal_xy[0]), float(goal_xy[1])
101
+ dist = math.sqrt(x * x + y * y)
102
+ norm_dist = max(min(dist, 200.0), 0.1) / 200.0
103
+ angle = math.atan2(y, x)
104
+ return np.array([norm_dist, math.cos(angle), math.sin(angle)],
105
+ dtype=np.float32)
106
+
107
+ def inference_trajectory(self, obs, goal_xy=None):
108
+ """Run model → (trajectory, scores).
109
+
110
+ Args:
111
+ obs: (B, 11, 3, H, W) float32 in [0,1]. 11 frames.
112
+ Images are resized internally to 256x256.
113
+ goal_xy: (2,) goal in standard frame, or None → [5.0, 0.0].
114
+
115
+ Returns:
116
+ trajectory: np.ndarray (B, 1, 10, 2) meters
117
+ scores: np.ndarray (B, 1)
118
+ """
119
+ if isinstance(obs, torch.Tensor):
120
+ obs_np = obs.cpu().numpy()
121
+ else:
122
+ obs_np = np.asarray(obs, dtype=np.float32)
123
+ B = obs_np.shape[0]
124
+
125
+ # Resize to 256x256 if needed (obs is CHW)
126
+ if obs_np.shape[-2:] != (IMG_SIZE, IMG_SIZE):
127
+ import cv2
128
+ resized = np.empty((B, 11, 3, IMG_SIZE, IMG_SIZE), dtype=np.float32)
129
+ for b in range(B):
130
+ for c in range(11):
131
+ frame_hwc = obs_np[b, c].transpose(1, 2, 0) # HWC
132
+ frame_rsz = cv2.resize(frame_hwc, (IMG_SIZE, IMG_SIZE))
133
+ resized[b, c] = frame_rsz.transpose(2, 0, 1)
134
+ obs_np = resized
135
+
136
+ if goal_xy is None:
137
+ goal_xy = np.array([5.0, 0.0])
138
+ goal_input = self._goal_to_input(goal_xy)
139
+ goal_batch = np.tile(goal_input, (B, 1))
140
+
141
+ all_wp_raw = []
142
+ for i in range(B):
143
+ out = self._session.run(None, {
144
+ "obs_images": obs_np[i:i+1], # (1, 11, 3, 256, 256)
145
+ "goal": goal_batch[i:i+1], # (1, 3)
146
+ })
147
+ all_wp_raw.append(out[0][:, :, :2])
148
+
149
+ wp_raw = np.concatenate(all_wp_raw, axis=0) # (B, 10, 2) — x, y (scaled)
150
+ wp_meters = wp_raw * WP_SCALE # un-scale to meters
151
+
152
+ trajectory = wp_meters[:, np.newaxis].astype(np.float32) # (B, 1, 10, 2)
153
+ scores = np.ones((B, 1), dtype=np.float32)
154
+ self._last_best_traj = trajectory[:, 0]
155
+ return trajectory, scores
156
+
157
+ def inference_vw(self, obs, goal_xy=None):
158
+ """Run model → (v, w).
159
+
160
+ Returns:
161
+ vw: torch.Tensor (B, 2)
162
+ """
163
+ trajectory, _ = self.inference_trajectory(obs, goal_xy)
164
+ best_traj = trajectory[:, 0]
165
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
166
+ v, w = self._controller(waypoints, dt=self.dt)
167
+ return torch.stack([v, w], dim=1), best_traj
S2E/model_info.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: S2E
2
+ goal_type: point_goal / goal-free (random goal)
3
+ image_resolution: [256, 256]
4
+ context_size: 11
5
+ obs_normalize: false
6
+ inputs:
7
+ obs_images: {shape: [batch, 11, 3, 256, 256], dtype: float32, description: "11 RGB observation frames in [0,1]"}
8
+ goal: {shape: [batch, 3], dtype: float32, description: "Goal (x, y, yaw) in meters/rad"}
9
+ outputs:
10
+ wp_pred: {shape: [batch, 10, 3], description: "10 waypoints x 3 (x, y, yaw)"}
11
+ wp_pred_score: {shape: [batch, 63], description: "Mode selection scores"}
12
+ num_waypoints: 10
S2E/s2e.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1410ab55a54946cf25323e82b3be2a8cb106e2ccb2ad878e7638839686108a
3
+ size 381993971
Vint_GL_Official/inference.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Vint-GL-Official inference wrapper (NavDP image-goal ViNT in goal-free mode).
3
+
4
+ The ONNX has two inputs (obs_img, goal_img) and two outputs (dist, action).
5
+ Goal-free mode: pass random noise as goal_img — same trick NavDP uses in
6
+ its `ViNTAgent.step_nogoal`.
7
+
8
+ nav = VintGLOfficialNavigator(device="cuda")
9
+ traj, scores = nav.inference_trajectory(obs) # (B, 1, 5, 2) meters
10
+ vw = nav.inference_vw(obs) # caches nav._last_best_traj
11
+
12
+ Action decode (NavDP convention):
13
+ The model bakes `torch.cumsum` into its forward, so the ONNX output
14
+ `action_pred[:, :, :2]` is *already* cumulative — we just scale by
15
+ `metric_waypoint_spacing` to get meters in the robot frame (x=fwd, y=left).
16
+ No unnormalize step (unlike NoMaD).
17
+ """
18
+
19
+ import os
20
+ import math
21
+ import numpy as np
22
+ import torch
23
+ import yaml
24
+ import onnxruntime as ort
25
+ from torchvision import transforms
26
+
27
+ try:
28
+ from urbansim.custom.pp import PurePursuitController
29
+ except Exception:
30
+ PurePursuitController = None # optional — only needed for inference_vw_pp
31
+
32
+ MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
33
+ ONNX_PATH = os.path.join(MODEL_DIR, "vint_imagegoal.onnx")
34
+ INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
35
+
36
+ # NavDP training image size for ViNT (width, height).
37
+ IMG_W, IMG_H = 85, 64
38
+ NUM_WAYPOINTS = 5
39
+ CTX_FRAMES = 6 # context_size=5 past + 1 current
40
+ WAYPOINT_INDEX = 2
41
+
42
+ IMAGE_ASPECT_RATIO = 4 / 3
43
+
44
+ # Recon training camera (for the optional virtual rectification path).
45
+ TRAIN_CAM_FX = 272.547
46
+ TRAIN_CAM_FY = 266.358
47
+ TRAIN_CAM_CX = 320.0
48
+ TRAIN_CAM_CY = 220.0
49
+ TRAIN_CAM_W = 640
50
+ TRAIN_CAM_H = 440
51
+
52
+ DEFAULT_SRC_INTRINSICS = {
53
+ "fx": 210.667, "fy": 210.667,
54
+ "cx": 256.0, "cy": 144.0,
55
+ "w": 512, "h": 288,
56
+ }
57
+
58
+ IMAGENET_TRANSFORM = transforms.Compose([
59
+ transforms.ToTensor(),
60
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
61
+ std=[0.229, 0.224, 0.225]),
62
+ ])
63
+
64
+
65
+ class PDController:
66
+ MAX_V = 2.5
67
+ MAX_W = 0.65
68
+
69
+ def __init__(self):
70
+ self.last_v = None
71
+
72
+ def reset(self):
73
+ self.last_v = None
74
+
75
+ def __call__(self, waypoints, dt=1.0):
76
+ EPS = 1e-6
77
+ idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
78
+ dx = waypoints[:, idx, 0]
79
+ dyr = waypoints[:, -1, 1]
80
+ dxr = waypoints[:, -1, 0]
81
+ v = dx / dt
82
+ w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
83
+ * dxr.sign() / dt)
84
+ near_zero = dx.abs() < EPS
85
+ v = torch.where(near_zero, torch.zeros_like(v), v)
86
+ w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
87
+ if self.last_v is not None:
88
+ v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
89
+ v = v.clamp(-self.MAX_V, self.MAX_V)
90
+ w = w.clamp(-self.MAX_W, self.MAX_W)
91
+ self.last_v = v
92
+ return v, w
93
+
94
+
95
+ def _make_session(path):
96
+ ort.set_default_logger_severity(3)
97
+ opts = ort.SessionOptions()
98
+ opts.log_severity_level = 3
99
+ return ort.InferenceSession(
100
+ path, sess_options=opts,
101
+ providers=[
102
+ ("CUDAExecutionProvider",
103
+ {"arena_extend_strategy": "kSameAsRequested"}),
104
+ "CPUExecutionProvider"])
105
+
106
+
107
+ class VintGLOfficialNavigator:
108
+ """Vint-GL-Official navigator (NavDP image-goal ViNT, goal-free mode)."""
109
+
110
+ context_size = CTX_FRAMES # 6 frames in obs tensor
111
+ multimodal = False
112
+ num_waypoints = NUM_WAYPOINTS
113
+
114
+ def __init__(
115
+ self,
116
+ device: str = "cuda",
117
+ max_v: float = 2.5,
118
+ max_w: float = 0.65,
119
+ dt: float = 1.25,
120
+ center_crop: bool = True,
121
+ use_train_intrinsics: bool = True,
122
+ src_intrinsics: dict = None,
123
+ ):
124
+ self.device = device
125
+ self.dt = dt
126
+
127
+ # NavDP default is center_crop=False (direct resize). Flag exposed for
128
+ # A/B testing: True crops landscape input to 4:3 before resizing.
129
+ self.center_crop = center_crop
130
+
131
+ # Optional virtual rectification to the NoMaD/ViNT training camera
132
+ # (recon dataset). When True, the source pinhole frame is warped
133
+ # (affine, same math as a homography between two pinhole cameras)
134
+ # so its content lines up with what a recon-camera frame squished
135
+ # to 85x64 would look like.
136
+ self.use_train_intrinsics = use_train_intrinsics
137
+ self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
138
+ self._train_affine = None
139
+
140
+ self._sess = _make_session(ONNX_PATH)
141
+ with open(INFO_PATH, "r") as f:
142
+ self._info = yaml.safe_load(f)
143
+
144
+ # Action decoder params (NavDP convention — cumsum already baked
145
+ # into the network, only the metric scale remains here).
146
+ self._normalize_actions = bool(self._info.get("normalize", True))
147
+ self._waypoint_spacing = float(
148
+ self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
149
+ self.model_fps = float(self._info.get("model_fps", 3.0))
150
+
151
+ self._controller = PDController()
152
+ self._controller.MAX_V = max_v
153
+ self._controller.MAX_W = max_w
154
+
155
+ # Cache for visualization (last best trajectory in metric meters).
156
+ self._last_best_traj = None
157
+
158
+ # Optional pure-pursuit controller (used by inference_vw_pp).
159
+ self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
160
+ if PurePursuitController is not None else None)
161
+
162
+ def reset(self):
163
+ self._controller.reset()
164
+ self._last_best_traj = None
165
+
166
+ # ------------------------------------------------------------------
167
+ # Preprocessing — mirrors NoMaD_GL_Official, only the target size differs.
168
+ # ------------------------------------------------------------------
169
+
170
+ def _ensure_train_affine(self):
171
+ if self._train_affine is not None:
172
+ return
173
+ # Recon intrinsics scaled to (IMG_W, IMG_H) = (85, 64).
174
+ fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
175
+ fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
176
+ cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
177
+ cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
178
+
179
+ src = self.src_intrinsics
180
+ a = src["fx"] / fx_t
181
+ e = src["fy"] / fy_t
182
+ c = src["cx"] - cx_t * a
183
+ f = src["cy"] - cy_t * e
184
+ self._train_affine = (a, 0.0, c, 0.0, e, f)
185
+
186
+ def _to_model_input_uint8(self, frame_uint8):
187
+ from PIL import Image as PILImage
188
+ pil = PILImage.fromarray(frame_uint8)
189
+ if self.use_train_intrinsics:
190
+ self._ensure_train_affine()
191
+ pil = pil.transform(
192
+ (IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
193
+ resample=PILImage.BILINEAR, fillcolor=0)
194
+ return np.array(pil)
195
+ if self.center_crop:
196
+ w, h = pil.size
197
+ if w > h:
198
+ crop_w = int(h * IMAGE_ASPECT_RATIO)
199
+ x0 = max((w - crop_w) // 2, 0)
200
+ pil = pil.crop((x0, 0, x0 + crop_w, h))
201
+ else:
202
+ crop_h = int(w / IMAGE_ASPECT_RATIO)
203
+ y0 = max((h - crop_h) // 2, 0)
204
+ pil = pil.crop((0, y0, w, y0 + crop_h))
205
+ pil = pil.resize((IMG_W, IMG_H))
206
+ return np.array(pil)
207
+
208
+ def _preprocess_images(self, obs_rgb):
209
+ """obs_rgb: (B, T=6, 3, H, W) float32 [0,1] → (B, 18, 64, 85) ImageNet-norm."""
210
+ from PIL import Image as PILImage
211
+ B, T = obs_rgb.shape[:2]
212
+ frames = []
213
+ for b in range(B):
214
+ ch_list = []
215
+ for t in range(T):
216
+ frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
217
+ model_in = self._to_model_input_uint8(frame)
218
+ pil = PILImage.fromarray(model_in)
219
+ ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
220
+ frames.append(np.concatenate(ch_list, axis=0))
221
+ return np.stack(frames) # (B, 3*T, IMG_H, IMG_W)
222
+
223
+ # ------------------------------------------------------------------
224
+ # Inference
225
+ # ------------------------------------------------------------------
226
+
227
+ def inference_trajectory(self, obs):
228
+ """Run ViNT → metric waypoints + uniform scores.
229
+
230
+ Args:
231
+ obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
232
+
233
+ Returns:
234
+ trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
235
+ scores: np.ndarray (B, 1) — uniform (1.0)
236
+ """
237
+ if isinstance(obs, torch.Tensor):
238
+ obs_np = obs.cpu().numpy()
239
+ else:
240
+ obs_np = np.asarray(obs, dtype=np.float32)
241
+ B = obs_np.shape[0]
242
+
243
+ obs_in = self._preprocess_images(obs_np) # (B, 18, 64, 85)
244
+ # Goal-free: random fake goal (matches NavDP ViNTAgent.step_nogoal).
245
+ fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
246
+
247
+ # Suppress ORT stderr noise during this call.
248
+ devnull = os.open(os.devnull, os.O_WRONLY)
249
+ old_stderr = os.dup(2)
250
+ os.dup2(devnull, 2)
251
+ all_action_pred = []
252
+ try:
253
+ for i in range(B):
254
+ dist_pred, action_pred = self._sess.run(
255
+ None, {"obs_img": obs_in[i:i+1], "goal_img": fake_goal[i:i+1]})
256
+ all_action_pred.append(action_pred)
257
+ finally:
258
+ os.dup2(old_stderr, 2)
259
+ os.close(devnull)
260
+ os.close(old_stderr)
261
+ action_pred = np.concatenate(all_action_pred, axis=0) # (B, 5, 4)
262
+ # action_pred shape (B, 5, 4): xy already cumsum'd, last two are sin/cos.
263
+ xy = action_pred[:, :, :2]
264
+ if self._normalize_actions:
265
+ xy = xy * self._waypoint_spacing # meters
266
+
267
+ trajectory = xy[:, None, :, :] # (B, 1, 5, 2)
268
+ scores = np.ones((B, 1), dtype=np.float32)
269
+ return trajectory.astype(np.float32), scores
270
+
271
+ def inference_vw(self, obs):
272
+ """Run ViNT → (v, w) velocity commands.
273
+
274
+ Caches the chosen trajectory on ``self._last_best_traj``.
275
+ """
276
+ trajectory, _ = self.inference_trajectory(obs)
277
+ best_traj = trajectory[:, 0] # (B, 5, 2)
278
+ self._last_best_traj = best_traj
279
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
280
+ v, w = self._controller(waypoints, dt=self.dt)
281
+ return torch.stack([v, w], dim=1), best_traj
282
+
283
+ def inference_vw_pp(self, obs, robot):
284
+ """Pure-pursuit variant of :meth:`inference_vw`."""
285
+ if self._pp is None:
286
+ raise RuntimeError(
287
+ "PurePursuitController unavailable — install urbansim or "
288
+ "make sure `urbansim.custom.pp` is on the Python path.")
289
+ trajectory, _ = self.inference_trajectory(obs)
290
+ best_traj = trajectory[:, 0]
291
+ self._last_best_traj = best_traj
292
+ waypoints = torch.from_numpy(best_traj).float().to(self.device)
293
+ v, w = self._pp.step(waypoints, robot=robot)
294
+ return torch.stack([v, w], dim=1)
Vint_GL_Official/model_info.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Vint_GL_Official
2
+ goal_type: goal_less
3
+ image_resolution: [64, 85]
4
+ context_size: 6
5
+ obs_normalize: true
6
+ normalize: true
7
+ action_stats:
8
+ min: [0.0, -2.0]
9
+ max: [5.0, 2.0]
10
+ model_max_v: 2.5
11
+ model_rate: 3
12
+ metric_waypoint_spacing: 0.8
13
+ model_fps: 3
14
+ len_traj_pred: 5
15
+ learn_angle: true
16
+ inputs:
17
+ obs_img:
18
+ shape: [batch, 18, 64, 85]
19
+ dtype: float32
20
+ description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
21
+ goal_img:
22
+ shape: [batch, 3, 64, 85]
23
+ dtype: float32
24
+ description: Goal image (random noise for goal-free)
25
+ outputs:
26
+ dist_pred:
27
+ shape: [batch, 1]
28
+ description: Predicted distance to goal
29
+ action_pred:
30
+ shape: [batch, 5, 4]
31
+ description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
32
+ sin/cos heading
33
+ onnx: vint_imagegoal.onnx
Vint_GL_Official/vint_imagegoal.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d480bf0fc28f6c1d84f568bf28733f6b418920218e3affd6f9d59fff1506ecec
3
+ size 1415900
Vint_GL_Official/vint_imagegoal.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399b8d50e7fcc812a69734fcc51f40dd313d964fec79d6fea3429252bc994791
3
+ size 95748096