Hollis71025 committed about 13 hours ago

Commit

9c1f523

verified ·

1 Parent(s): 2587ef0

Add 7 navigation models (ONNX + inference wrappers) and model card

Browse files

Files changed (29) hide show

.gitattributes +3 -0
CityWalker_PG_Official/citywalker.onnx +3 -0
CityWalker_PG_Official/inference.py +203 -0
CityWalker_PG_Official/model_info.yaml +13 -0
GNM_GL_Official/gnm_imagegoal.onnx +3 -0
GNM_GL_Official/gnm_imagegoal.onnx.data +3 -0
GNM_GL_Official/inference.py +247 -0
GNM_GL_Official/model_info.yaml +33 -0
MBRA_PG_Official/inference.py +177 -0
MBRA_PG_Official/mbra.onnx +3 -0
MBRA_PG_Official/model_info.yaml +12 -0
MIMIC/inference.py +170 -0
MIMIC/mimic.onnx +3 -0
MIMIC/model_info.yaml +13 -0
NoMaD_GL_Official/inference.py +440 -0
NoMaD_GL_Official/model_info.yaml +40 -0
NoMaD_GL_Official/nomad_dist_pred.onnx +3 -0
NoMaD_GL_Official/nomad_dist_pred.onnx.data +0 -0
NoMaD_GL_Official/nomad_noise_pred.onnx +3 -0
NoMaD_GL_Official/nomad_vision_encoder.onnx +3 -0
NoMaD_GL_Official/nomad_vision_encoder.onnx.data +3 -0
README.md +232 -0
S2E/inference.py +167 -0
S2E/model_info.yaml +12 -0
S2E/s2e.onnx +3 -0
Vint_GL_Official/inference.py +294 -0
Vint_GL_Official/model_info.yaml +33 -0
Vint_GL_Official/vint_imagegoal.onnx +3 -0
Vint_GL_Official/vint_imagegoal.onnx.data +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+GNM_GL_Official/gnm_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text
+NoMaD_GL_Official/nomad_vision_encoder.onnx.data filter=lfs diff=lfs merge=lfs -text
+Vint_GL_Official/vint_imagegoal.onnx.data filter=lfs diff=lfs merge=lfs -text

CityWalker_PG_Official/citywalker.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e8fb9ff081a883d80f0502d5b7e9046ed4b0dfafd46fe96ff5ad16194f0949a
+size 806082192

CityWalker_PG_Official/inference.py ADDED Viewed

	@@ -0,0 +1,203 @@

+#!/usr/bin/env python3
+"""CityWalker-PG-Official inference utilities.
+CityWalker uses a NON-STANDARD coordinate system internally:
+    model-internal:  y = forward, x = right
+    our standard:    x = forward, y = left
+This navigator accepts standard-frame inputs (goal_xy in x=forward, y=left)
+and returns standard-frame outputs. The coordinate swap is handled internally.
+    nav = CityWalkerPGNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
+    vw, best_traj = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
+"""
+import os
+import math
+import numpy as np
+import torch
+import yaml
+import onnxruntime as ort
+from torchvision import transforms
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "citywalker.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+# ImageNet normalization (CityWalker uses normalized images)
+IMAGENET_TRANSFORM = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+IMG_H, IMG_W = 350, 630
+class PDController:
+    """Turn predicted waypoints into clamped (v, w) velocity commands.
+    The linear command is taken from the x coordinate of the first waypoint and
+    the angular command from the bearing of the third waypoint. The previous
+    linear command is remembered so consecutive calls are rate limited; call
+    ``reset()`` to forget it.
+    """
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        """Forget the previous linear command."""
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        """Return ``(v, w)`` for a batch of waypoints.
+        Args:
+            waypoints: tensor indexed as [batch, waypoint, (x, y)].
+            dt: divisor applied to the waypoint offsets.
+        """
+        tiny = 1e-6
+        step_x = waypoints[:, 0, 0]
+        look_x = waypoints[:, 2, 0]
+        look_y = waypoints[:, 2, 1]
+        bearing = torch.atan2(look_y, look_x.abs().clamp(min=tiny)) * look_x.sign()
+        lin = step_x / dt
+        ang = bearing / dt
+        stalled = step_x.abs() < tiny
+        # No forward progress: stop and rotate in place toward the target side.
+        lin = torch.where(stalled, torch.zeros_like(lin), lin)
+        ang = torch.where(stalled, look_y.sign() * (math.pi / 20.0), ang)
+        prev = self.last_v
+        if prev is not None:
+            # Asymmetric rate limit relative to the previous command.
+            lin = torch.clamp(lin, min=prev - 0.5, max=prev + 0.4)
+        lin = torch.clamp(lin, min=-self.MAX_V, max=self.MAX_V)
+        ang = torch.clamp(ang, min=-self.MAX_W, max=self.MAX_W)
+        self.last_v = lin
+        return lin, ang
+def _std_to_cw(xy):
+    """Convert standard-frame points (x=forward, y=left) to the model input frame.
+    Column 0 of the result is ``-y_std`` (the rightward offset) and column 1 is
+    ``x_std`` (the forward offset). The input array is not modified.
+    NOTE(review): the sign convention of column 0 should be confirmed against
+    the CityWalker training pipeline.
+    """
+    rightward = -xy[..., 1]
+    forward = xy[..., 0]
+    return np.stack([rightward, forward], axis=-1)
+def _cw_to_std(xy):
+    """Convert model-frame points back to the standard frame (x=forward, y=left).
+    Column 1 of the input becomes ``x_std`` and the negated column 0 becomes
+    ``y_std``; this is the inverse of ``_std_to_cw``. The input is not modified.
+    """
+    forward = xy[..., 1]
+    leftward = -xy[..., 0]
+    return np.stack([forward, leftward], axis=-1)
+class CityWalkerPGNavigator:
+    """CityWalker point-goal navigator backed by an ONNX Runtime session.
+    All user-facing coordinates are standard frame: x=forward, y=left, meters.
+    """
+    context_size = 5
+    multimodal = False
+    def __init__(
+        self,
+        onnx_path: str = ONNX_PATH,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 1.0,
+    ):
+        """Create the ONNX session, load model_info.yaml and build the controller.
+        Args:
+            onnx_path: Path to the ONNX model file.
+            device: torch device used for the controller tensors.
+            max_v: Clamp applied to the linear command.
+            max_w: Clamp applied to the angular command.
+            dt: Divisor the controller applies to waypoint offsets.
+        """
+        self.device = device
+        self.dt = dt
+        ort.set_default_logger_severity(3)
+        providers = [
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider",
+        ]
+        self._session = ort.InferenceSession(onnx_path, providers=providers)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        self._past_traj = None
+    def reset(self):
+        """Clear the controller's rate-limit state and the cached trajectory."""
+        self._controller.reset()
+        self._past_traj = None
+    def _preprocess_images(self, obs_rgb):
+        """obs_rgb: (B, C, 3, H, W) float32 [0,1] → ImageNet-normalized."""
+        from PIL import Image as PILImage
+        B, C = obs_rgb.shape[:2]
+        out = np.empty((B, C, 3, IMG_H, IMG_W), dtype=np.float32)
+        for b in range(B):
+            for c in range(C):
+                # CHW [0,1] → HWC uint8. Clip first: values slightly outside
+                # [0,1] would otherwise wrap around in the uint8 cast.
+                chw = np.clip(obs_rgb[b, c], 0.0, 1.0)
+                frame = (chw.transpose(1, 2, 0) * 255).astype(np.uint8)
+                pil = PILImage.fromarray(frame)
+                pil = pil.resize((IMG_W, IMG_H))
+                out[b, c] = IMAGENET_TRANSFORM(pil).numpy()
+        return out
+    def inference_trajectory(self, obs, goal_xy=None, past_traj_std=None):
+        """Run model → (trajectory, scores).
+        Args:
+            obs: (B, context_size, 3, H, W) float32 in [0,1]. 5 frames.
+            goal_xy: (2,) or (B, 2) goal in standard frame [x_fwd, y_left] meters.
+                     If None, a goal is sampled uniformly per sample with
+                     x in [5, 10] and y in [-2, 2].
+            past_traj_std: (B, 6, 2) trajectory input in standard frame, or None.
+                           If None, uses zeros. The last row is always replaced
+                           by the goal; the caller's array is not modified.
+        Returns:
+            trajectory: np.ndarray (B, 1, 5, 2) in standard frame
+            scores:     np.ndarray (B, 1)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        # Preprocess images: resize + ImageNet normalize
+        obs_norm = self._preprocess_images(obs_np)
+        if past_traj_std is None:
+            past_std = np.zeros((B, 6, 2), dtype=np.float32)
+        else:
+            # np.array copies, so writing the goal slot below never touches
+            # the array the caller passed in.
+            past_std = np.array(past_traj_std, dtype=np.float32)
+            if past_std.ndim == 2:
+                past_std = past_std[np.newaxis]
+        if goal_xy is None:
+            past_std[:, -1, 0] = np.random.uniform(5.0, 10.0, size=(B,))
+            past_std[:, -1, 1] = np.random.uniform(-2.0, 2.0, size=(B,))
+        else:
+            # Honor the caller's goal; a single (2,) goal broadcasts over the batch.
+            if isinstance(goal_xy, torch.Tensor):
+                goal_xy = goal_xy.detach().cpu().numpy()
+            goal = np.asarray(goal_xy, dtype=np.float32).reshape(-1, 2)
+            past_std[:, -1, :] = goal
+        traj_cw = _std_to_cw(past_std)
+        out = self._session.run(None, {
+            "obs_images": obs_norm,
+            "trajectory": traj_cw,
+        })
+        wp_cw = out[0]  # (B, 5, 2) in CW frame
+        # Convert back to standard frame
+        wp_std = _cw_to_std(wp_cw)
+        trajectory = wp_std[:, np.newaxis]  # (B, 1, 5, 2)
+        scores = np.ones((B, 1), dtype=np.float32)
+        return trajectory, scores
+    def inference_vw(self, obs, goal_xy=None, past_traj_std=None):
+        """Run model → (v, w) velocity commands in standard frame.
+        Returns:
+            (vw, best_traj): vw is a torch.Tensor (B, 2); best_traj is the
+            np.ndarray (B, 5, 2) trajectory the commands were derived from.
+        """
+        trajectory, _ = self.inference_trajectory(obs, goal_xy, past_traj_std)
+        best_traj = trajectory[:, 0]  # (B, 5, 2) standard frame
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1), best_traj

CityWalker_PG_Official/model_info.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+name: CityWalker-PG-Official
+goal_type: point_goal
+image_resolution: [350, 630]
+context_size: 5
+input_fps: 5.0
+obs_normalize: false
+inputs:
+  obs_images: {shape: [batch, 5, 3, 350, 630], dtype: float32, description: "5 RGB observation frames in [0,1]"}
+  trajectory: {shape: [batch, 6, 2], dtype: float32, description: "6 past trajectory waypoints (x,y) in meters"}
+outputs:
+  wp_pred: {shape: [batch, 5, 2], description: "5 predicted waypoints (x,y)"}
+  arrive_pred: {shape: [batch, 1], description: "Arrival probability"}
+num_waypoints: 5

GNM_GL_Official/gnm_imagegoal.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8914d698a58f0f49ca8da5a2f61d646d759a2178ad6f8a89f0e863b625110c77
+size 619733

GNM_GL_Official/gnm_imagegoal.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2764830aaf3d0a6189f1f2a0d5c9d2f4c725aa9cc9638a8beeae9e6c9dd59b8a
+size 34603008

GNM_GL_Official/inference.py ADDED Viewed

	@@ -0,0 +1,247 @@

+#!/usr/bin/env python3
+"""GNM-GL-Official inference wrapper (NavDP image-goal GNM in goal-free mode).
+Mirror of Vint_GL_Official/inference.py — same I/O contract, only the ONNX
+backbone and the class name differ. See that file for the action-decode and
+preprocessing rationale.
+    nav = GNMGLOfficialNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs)   # (B, 1, 5, 2) meters
+    vw = nav.inference_vw(obs)                     # caches nav._last_best_traj
+"""
+import os
+import math
+import numpy as np
+import torch
+import yaml
+import onnxruntime as ort
+from torchvision import transforms
+try:
+    from urbansim.custom.pp import PurePursuitController
+except Exception:
+    PurePursuitController = None  # optional — only needed for inference_vw_pp
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "gnm_imagegoal.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+IMG_W, IMG_H = 85, 64
+NUM_WAYPOINTS = 5
+CTX_FRAMES = 6
+WAYPOINT_INDEX = 2
+IMAGE_ASPECT_RATIO = 4 / 3
+TRAIN_CAM_FX = 272.547
+TRAIN_CAM_FY = 266.358
+TRAIN_CAM_CX = 320.0
+TRAIN_CAM_CY = 220.0
+TRAIN_CAM_W = 640
+TRAIN_CAM_H = 440
+DEFAULT_SRC_INTRINSICS = {
+    "fx": 210.667, "fy": 210.667,
+    "cx": 256.0, "cy": 144.0,
+    "w": 512, "h": 288,
+}
+IMAGENET_TRANSFORM = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+class PDController:
+    """Turn predicted waypoints into clamped (v, w) velocity commands.
+    The linear command uses the waypoint at ``WAYPOINT_INDEX`` (or the last one
+    when fewer are available); the angular command uses the bearing of the
+    final waypoint. The previous linear command rate-limits the next one.
+    """
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        """Forget the previous linear command."""
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        """Return ``(v, w)`` for waypoints indexed as [batch, waypoint, (x, y)]."""
+        tiny = 1e-6
+        pick = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
+        step_x = waypoints[:, pick, 0]
+        end_x = waypoints[:, -1, 0]
+        end_y = waypoints[:, -1, 1]
+        bearing = torch.atan2(end_y, end_x.abs().clamp(min=tiny)) * end_x.sign()
+        lin = step_x / dt
+        ang = bearing / dt
+        stalled = step_x.abs() < tiny
+        # No forward progress: stop and rotate in place toward the target side.
+        lin = torch.where(stalled, torch.zeros_like(lin), lin)
+        ang = torch.where(stalled, end_y.sign() * (math.pi / 20.0), ang)
+        prev = self.last_v
+        if prev is not None:
+            lin = torch.clamp(lin, min=prev - 0.5, max=prev + 0.4)
+        lin = torch.clamp(lin, min=-self.MAX_V, max=self.MAX_V)
+        ang = torch.clamp(ang, min=-self.MAX_W, max=self.MAX_W)
+        self.last_v = lin
+        return lin, ang
+def _make_session(path):
+    """Build an ONNX Runtime session for ``path``, listing CUDA before CPU.
+    Log severity is set to 3 both globally and on the session options.
+    """
+    ort.set_default_logger_severity(3)
+    session_opts = ort.SessionOptions()
+    session_opts.log_severity_level = 3
+    cuda_provider = (
+        "CUDAExecutionProvider",
+        {"arena_extend_strategy": "kSameAsRequested"},
+    )
+    provider_chain = [cuda_provider, "CPUExecutionProvider"]
+    return ort.InferenceSession(
+        path, sess_options=session_opts, providers=provider_chain)
+class GNMGLOfficialNavigator:
+    """GNM-GL-Official navigator (NavDP image-goal GNM, goal-free mode).
+    Frames are converted to IMG_W x IMG_H ImageNet-normalized images stacked
+    along the channel axis; the goal image fed to the network is random noise.
+    """
+    context_size = CTX_FRAMES
+    multimodal = False
+    num_waypoints = NUM_WAYPOINTS
+    def __init__(
+        self,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 1.25,
+        center_crop: bool = True,
+        use_train_intrinsics: bool = True,
+        src_intrinsics: dict = None,
+    ):
+        """Create the ONNX session, read model_info.yaml and build the controllers.
+        Args:
+            device: torch device used for the controller tensors.
+            max_v: Clamp applied to the linear command.
+            max_w: Clamp applied to the angular command.
+            dt: Divisor the PD controller applies to waypoint offsets.
+            center_crop: Crop to IMAGE_ASPECT_RATIO before resizing (only used
+                when use_train_intrinsics is False).
+            use_train_intrinsics: Warp frames with an affine transform derived
+                from the TRAIN_CAM_* and source intrinsics instead of resizing.
+            src_intrinsics: dict with fx, fy, cx, cy (w, h optional) of the
+                source camera; defaults to DEFAULT_SRC_INTRINSICS.
+        """
+        self.device = device
+        self.dt = dt
+        self.center_crop = center_crop
+        self.use_train_intrinsics = use_train_intrinsics
+        self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
+        self._train_affine = None
+        self._sess = _make_session(ONNX_PATH)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        self._normalize_actions = bool(self._info.get("normalize", True))
+        self._waypoint_spacing = float(
+            self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
+        self.model_fps = float(self._info.get("model_fps", 3.0))
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        self._last_best_traj = None
+        self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
+                    if PurePursuitController is not None else None)
+    def reset(self):
+        """Clear the controller's rate-limit state and the cached trajectory."""
+        self._controller.reset()
+        self._last_best_traj = None
+    def _ensure_train_affine(self):
+        """Lazily compute the PIL affine coefficients (output → source pixel)."""
+        if self._train_affine is not None:
+            return
+        # Training-camera intrinsics rescaled to the model input resolution.
+        fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
+        fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
+        cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
+        cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
+        src = self.src_intrinsics
+        a = src["fx"] / fx_t
+        e = src["fy"] / fy_t
+        c = src["cx"] - cx_t * a
+        f = src["cy"] - cy_t * e
+        self._train_affine = (a, 0.0, c, 0.0, e, f)
+    def _to_model_input_uint8(self, frame_uint8):
+        """Warp or crop+resize one HWC uint8 frame to (IMG_H, IMG_W, 3) uint8."""
+        from PIL import Image as PILImage
+        pil = PILImage.fromarray(frame_uint8)
+        if self.use_train_intrinsics:
+            self._ensure_train_affine()
+            pil = pil.transform(
+                (IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
+                resample=PILImage.BILINEAR, fillcolor=0)
+            return np.array(pil)
+        if self.center_crop:
+            w, h = pil.size
+            if w > h:
+                crop_w = int(h * IMAGE_ASPECT_RATIO)
+                x0 = max((w - crop_w) // 2, 0)
+                pil = pil.crop((x0, 0, x0 + crop_w, h))
+            else:
+                crop_h = int(w / IMAGE_ASPECT_RATIO)
+                y0 = max((h - crop_h) // 2, 0)
+                pil = pil.crop((0, y0, w, y0 + crop_h))
+        pil = pil.resize((IMG_W, IMG_H))
+        return np.array(pil)
+    def _preprocess_images(self, obs_rgb):
+        """obs_rgb: (B, T, 3, H, W) float in [0,1] → (B, T*3, IMG_H, IMG_W)."""
+        from PIL import Image as PILImage
+        B, T = obs_rgb.shape[:2]
+        frames = []
+        for b in range(B):
+            ch_list = []
+            for t in range(T):
+                # Clip first: values slightly outside [0,1] would otherwise
+                # wrap around in the uint8 cast.
+                chw = np.clip(obs_rgb[b, t], 0.0, 1.0)
+                frame = (chw.transpose(1, 2, 0) * 255).astype(np.uint8)
+                model_in = self._to_model_input_uint8(frame)
+                pil = PILImage.fromarray(model_in)
+                ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
+            frames.append(np.concatenate(ch_list, axis=0))
+        return np.stack(frames)
+    def inference_trajectory(self, obs):
+        """Run GNM → metric waypoints + uniform scores.
+        Args:
+            obs: (B, T, 3, H, W) tensor or array with values in [0,1].
+        Returns:
+            trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
+            scores:     np.ndarray (B, 1)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        obs_in = self._preprocess_images(obs_np)
+        fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
+        # Silence native stderr output while the session runs. The nested
+        # try/finally blocks guarantee every descriptor opened here is closed
+        # and stderr is restored even if dup/dup2 or the run itself fails.
+        devnull = os.open(os.devnull, os.O_WRONLY)
+        try:
+            old_stderr = os.dup(2)
+            try:
+                os.dup2(devnull, 2)
+                _, action_pred = self._sess.run(
+                    None, {"obs_img": obs_in, "goal_img": fake_goal})
+            finally:
+                os.dup2(old_stderr, 2)
+                os.close(old_stderr)
+        finally:
+            os.close(devnull)
+        xy = action_pred[:, :, :2]
+        if self._normalize_actions:
+            xy = xy * self._waypoint_spacing
+        trajectory = xy[:, None, :, :]  # (B, 1, 5, 2)
+        scores = np.ones((B, 1), dtype=np.float32)
+        return trajectory.astype(np.float32), scores
+    def inference_vw(self, obs):
+        """Run model → (v, w) via the PD controller; caches ``_last_best_traj``.
+        Returns:
+            vw: torch.Tensor (B, 2)
+        """
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]
+        self._last_best_traj = best_traj
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1)
+    def inference_vw_pp(self, obs, robot):
+        """Run model → (v, w) via the pure-pursuit controller.
+        Raises:
+            RuntimeError: if PurePursuitController could not be imported.
+        """
+        if self._pp is None:
+            raise RuntimeError(
+                "PurePursuitController unavailable — install urbansim or "
+                "make sure `urbansim.custom.pp` is on the Python path.")
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]
+        self._last_best_traj = best_traj
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._pp.step(waypoints, robot=robot)
+        return torch.stack([v, w], dim=1)

GNM_GL_Official/model_info.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+name: GNM-GL-Official
+goal_type: goal_less
+image_resolution: [64, 85]
+context_size: 6
+obs_normalize: true
+normalize: true
+action_stats:
+  min: [0.0, -2.0]
+  max: [5.0, 2.0]
+model_max_v: 0.8
+model_rate: 3
+metric_waypoint_spacing: 0.8
+model_fps: 3
+len_traj_pred: 5
+learn_angle: true
+inputs:
+  obs_img:
+    shape: [batch, 18, 64, 85]
+    dtype: float32
+    description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
+  goal_img:
+    shape: [batch, 3, 64, 85]
+    dtype: float32
+    description: Goal image (random noise for goal-free)
+outputs:
+  dist_pred:
+    shape: [batch, 1]
+    description: Predicted distance to goal
+  action_pred:
+    shape: [batch, 5, 4]
+    description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
+      sin/cos heading
+onnx: gnm_imagegoal.onnx

MBRA_PG_Official/inference.py ADDED Viewed

	@@ -0,0 +1,177 @@

+#!/usr/bin/env python3
+"""MBRA-PG-Official inference utilities.
+MBRA takes 6 ImageNet-normalized 96x96 frames (stacked as 18 channels)
+and a goal pose [x, y, sin(yaw), cos(yaw)] normalized by metric_spacing=0.25m.
+User provides goal_xy in standard frame (x=forward, y=left, meters).
+    nav = MBRAPGNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
+    vw, best_traj = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
+"""
+import os
+import math
+import numpy as np
+import torch
+import onnxruntime as ort
+import yaml
+from torchvision import transforms
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "mbra.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+METRIC_SPACING = 0.8  # waypoint normalization factor
+IMAGENET_TRANSFORM = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+IMG_SIZE = 96
+class PDController:
+    """Turn predicted waypoints into clamped (v, w) velocity commands.
+    The linear command uses the third waypoint (or the last one when fewer are
+    available); the angular command uses the bearing of the final waypoint.
+    The previous linear command rate-limits the next one.
+    """
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        """Forget the previous linear command."""
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        """Return ``(v, w)`` for waypoints indexed as [batch, waypoint, (x, y)]."""
+        tiny = 1e-6
+        pick = min(2, waypoints.shape[1] - 1)
+        step_x = waypoints[:, pick, 0]
+        end_x = waypoints[:, -1, 0]
+        end_y = waypoints[:, -1, 1]
+        bearing = torch.atan2(end_y, end_x.abs().clamp(min=tiny)) * end_x.sign()
+        lin = step_x / dt
+        ang = bearing / dt
+        stalled = step_x.abs() < tiny
+        # No forward progress: stop and rotate in place toward the target side.
+        lin = torch.where(stalled, torch.zeros_like(lin), lin)
+        ang = torch.where(stalled, end_y.sign() * (math.pi / 20.0), ang)
+        prev = self.last_v
+        if prev is not None:
+            lin = torch.clamp(lin, min=prev - 0.5, max=prev + 0.4)
+        lin = torch.clamp(lin, min=-self.MAX_V, max=self.MAX_V)
+        ang = torch.clamp(ang, min=-self.MAX_W, max=self.MAX_W)
+        self.last_v = lin
+        return lin, ang
+class MBRAPGNavigator:
+    """MBRA point-goal navigator backed by an ONNX Runtime session.
+    All user-facing coordinates: x=forward, y=left, meters.
+    """
+    context_size = 6
+    multimodal = False
+    def __init__(
+        self,
+        onnx_path: str = ONNX_PATH,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 2.5,
+    ):
+        """Create the ONNX session, load model_info.yaml and build the controller.
+        Args:
+            onnx_path: Path to the ONNX model file.
+            device: torch device used for the controller tensors.
+            max_v: Clamp applied to the linear command.
+            max_w: Clamp applied to the angular command.
+            dt: Divisor the controller applies to waypoint offsets.
+        """
+        self.device = device
+        self.dt = dt
+        ort.set_default_logger_severity(3)
+        providers = [
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider",
+        ]
+        self._session = ort.InferenceSession(onnx_path, providers=providers)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+    def reset(self):
+        """Clear the controller's rate-limit state."""
+        self._controller.reset()
+    def _preprocess_images(self, obs_rgb):
+        """obs_rgb: (B, C, 3, H, W) float32 [0,1] → (B, C*3, 96, 96) ImageNet-norm."""
+        from PIL import Image as PILImage
+        B, C = obs_rgb.shape[:2]
+        frames = []
+        for b in range(B):
+            ch_list = []
+            for c in range(C):
+                # Clip first: values slightly outside [0,1] would otherwise
+                # wrap around in the uint8 cast.
+                chw = np.clip(obs_rgb[b, c], 0.0, 1.0)
+                frame = (chw.transpose(1, 2, 0) * 255).astype(np.uint8)
+                pil = PILImage.fromarray(frame).resize((IMG_SIZE, IMG_SIZE))
+                ch_list.append(IMAGENET_TRANSFORM(pil).numpy())  # (3, 96, 96)
+            frames.append(np.concatenate(ch_list, axis=0))  # (C*3, 96, 96)
+        return np.stack(frames)  # (B, 18, 96, 96)
+    @staticmethod
+    def _goal_to_pose(goal_xy):
+        """Convert standard goal [x_fwd, y_left] (meters) → MBRA goal_pose [4].
+        Produces [x/spacing, y/spacing, cos(angle), sin(angle)] where angle is
+        the bearing atan2(y, x) of the goal.
+        NOTE(review): other descriptions of this model list the last two
+        entries as sin, cos — confirm the order the exported ONNX expects.
+        """
+        x, y = float(goal_xy[0]), float(goal_xy[1])
+        angle = math.atan2(y, x)
+        return np.array([
+            x / METRIC_SPACING,
+            y / METRIC_SPACING,
+            math.cos(angle),
+            math.sin(angle),
+        ], dtype=np.float32)
+    def inference_trajectory(self, obs, goal_xy=None):
+        """Run model → (trajectory, scores).
+        Args:
+            obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
+            goal_xy: (2,) goal shared by the batch, or (B, 2) per-sample goals,
+                     in standard frame [x_fwd, y_left] meters.
+                     If None, one goal is sampled uniformly with x in [2, 10]
+                     and y in [-2, 2] and shared by the whole batch.
+        Returns:
+            trajectory: np.ndarray (B, 1, 8, 2) waypoints [x, y] meters
+            scores:     np.ndarray (B, 1)
+        Raises:
+            ValueError: if per-sample goals do not match the batch size.
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        obs_norm = self._preprocess_images(obs_np)
+        if goal_xy is None:
+            goal_xy = np.array([np.random.uniform(2.0, 10.0), np.random.uniform(-2.0, 2.0)])
+        if isinstance(goal_xy, torch.Tensor):
+            goal_xy = goal_xy.detach().cpu().numpy()
+        goal_arr = np.asarray(goal_xy, dtype=np.float64)
+        if goal_arr.ndim == 1:
+            # Single goal shared by every sample in the batch.
+            goal_batch = np.tile(self._goal_to_pose(goal_arr), (B, 1))
+        elif goal_arr.shape[0] == 1:
+            goal_batch = np.tile(self._goal_to_pose(goal_arr[0]), (B, 1))
+        elif goal_arr.shape[0] == B:
+            goal_batch = np.stack([self._goal_to_pose(g) for g in goal_arr])
+        else:
+            raise ValueError(
+                f"goal_xy has {goal_arr.shape[0]} rows but the batch size is {B}")
+        out = self._session.run(None, {
+            "obs_images": obs_norm,
+            "goal_pose": goal_batch,
+        })
+        wp_raw = out[0]  # (B, 8, 4)
+        # Extract x, y and un-normalize
+        wp_xy = wp_raw[:, :, :2] * METRIC_SPACING  # (B, 8, 2) meters
+        trajectory = wp_xy[:, np.newaxis]  # (B, 1, 8, 2)
+        scores = np.ones((B, 1), dtype=np.float32)
+        return trajectory, scores
+    def inference_vw(self, obs, goal_xy=None):
+        """Run model → (v, w).
+        Returns:
+            (vw, best_traj): vw is a torch.Tensor (B, 2); best_traj is the
+            np.ndarray (B, 8, 2) trajectory the commands were derived from.
+        """
+        trajectory, _ = self.inference_trajectory(obs, goal_xy)
+        best_traj = trajectory[:, 0]
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1), best_traj

MBRA_PG_Official/mbra.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc42919737964d9c4db9428d8a836ad9d95007f1a41fcf5cf95526e1042f17a5
+size 253629409

MBRA_PG_Official/model_info.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+name: MBRA-PG-Official
+goal_type: point_goal
+image_resolution: [96, 96]
+context_size: 6
+input_fps: 5.0
+obs_normalize: true
+inputs:
+  obs_images: {shape: [batch, 6, 3, 96, 96], dtype: float32, description: "6 RGB frames x 3 channels, ImageNet normalized"}
+  goal_pose: {shape: [batch, 4], dtype: float32, description: "Goal pose [x, y, sin(yaw), cos(yaw)] in meters/rad"}
+outputs:
+  waypoints: {shape: [batch, 8, 4], description: "8 waypoints x 4 (x, y, sin(yaw), cos(yaw))"}
+num_waypoints: 8

MIMIC/inference.py ADDED Viewed

	@@ -0,0 +1,170 @@

+#!/usr/bin/env python3
+"""MIMIC inference utilities.
+Mirrors the style of :mod:`models.S2E_PG_Full.inference`, adapted for the
+MIMIC ONNX (``mimic.onnx``) — point-goal, multi-modal, 16-frame context with
+an explicit past-trajectory + past-mask input. Reference single-shot script:
+``urban-sim-human-official/run_v3_b_onnx.py``.
+The ONNX has four inputs and three outputs::
+    obs       : (B, 16, 3, 256, 256)  float32 in [0, 1]
+    goal      : (B, 3)                float32 = [dist/200, cos(a), sin(a)]
+    past_traj : (B, 15, 2)             float32 — past XY in camera frame x 0.25
+                                       (usually zeros at inference)
+    past_mask : (B, 16)                float32 — 1.0 for valid frames
+    best_reg_mid         : (B, 10, 3)       — best single trajectory (model-picked)
+    poses_reg_mid_denorm : (B, 62, 10, 3)   — 62 candidates
+    poses_cls_mid        : (B, 62)          — classification scores
+Outputs are in *camera frame* (col-0 = forward, col-1 = +left in the model
+output; the reference script's ``[-col1, col0]`` swap turns col1 into the
+controller's right-positive ``x_right``). We rescale by ``waypoint_scale``
+(=0.25) and return waypoints in standard frame ``(x_fwd, y_left)``.
+    nav = MIMICNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs)
+    vw, best = nav.inference_vw(obs)
+    nav.reset()
+"""
+import os
+import math
+import numpy as np
+import torch
+import yaml
+import onnxruntime as ort
+from urbansim.custom.pp import PurePursuitController
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "mimic.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+HORIZON_IDX = 13
+Scaler = 0.25
+class PDController:
+    """Convert predicted waypoints to clamped (v, w) commands.
+    The linear command uses the seventh waypoint (or the last one when fewer
+    are available); the angular command uses the bearing of the final
+    waypoint. The previous linear command rate-limits the next one.
+    """
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        """Forget the previous linear command."""
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        """Return ``(v, w)`` for waypoints indexed as [batch, waypoint, (x, y)]."""
+        tiny = 1e-6
+        end_x = waypoints[:, -1, 0]
+        end_y = waypoints[:, -1, 1]
+        pick = min(6, waypoints.shape[1] - 1)
+        step_x = waypoints[:, pick, 0]
+        bearing = torch.atan2(end_y, end_x.abs().clamp(min=tiny)) * end_x.sign()
+        lin = step_x / dt
+        ang = bearing / dt
+        stalled = step_x.abs() < tiny
+        # No forward progress: stop and rotate in place toward the target side.
+        lin = torch.where(stalled, torch.zeros_like(lin), lin)
+        ang = torch.where(stalled, end_y.sign() * (math.pi / 20.0), ang)
+        prev = self.last_v
+        if prev is not None:
+            lin = torch.clamp(lin, min=prev - 0.5, max=prev + 0.4)
+        lin = torch.clamp(lin, min=-self.MAX_V, max=self.MAX_V)
+        ang = torch.clamp(ang, min=-self.MAX_W, max=self.MAX_W)
+        self.last_v = lin
+        return lin, ang
+class MIMICNavigator:
+    """MIMIC navigator backed by an ONNX Runtime session.
+    The session is fed a single ``input`` tensor of observation frames, one
+    sample at a time; the first ``horizon_idx`` waypoints of the output are
+    kept and their (x, y) columns are scaled by ``Scaler``.
+    """
+    context_size = 16
+    multimodal = False
+    def __init__(
+        self,
+        onnx_path: str = ONNX_PATH,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 2.0,
+        horizon_idx: int = HORIZON_IDX,
+    ):
+        """
+        Args:
+            onnx_path: Path to the ONNX model file.
+            device: "cuda" or "cpu".
+            max_v: Max linear velocity (m/s).
+            max_w: Max angular velocity (rad/s).
+            dt: Time scaling factor for PD controller.
+            horizon_idx: Number of output waypoints to keep (up to 4s).
+        """
+        self.device = device
+        self.dt = dt
+        self.horizon_idx = horizon_idx
+        ort.set_default_logger_severity(3)
+        providers = [
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider",
+        ]
+        self._session = ort.InferenceSession(onnx_path, providers=providers)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        self._pp = PurePursuitController(action_dt=4, waypoint_index=6)
+        self._last_best_traj = None
+    def reset(self):
+        """Clear the controller's rate-limit state and the cached trajectory."""
+        self._controller.reset()
+        self._last_best_traj = None
+    def inference_trajectory(self, obs):
+        """Run model → (trajectory, scores).
+        Args:
+            obs: (B, 16, 3, 288, 512) float32 in [0,1].
+                 Note: batch dim is fixed to 1 for this model, so larger
+                 batches are run one sample at a time.
+        Returns:
+            trajectory: np.ndarray (B, 1, K, 2) where K = horizon_idx
+            scores:     np.ndarray (B, 1)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.detach().cpu().numpy()
+        else:
+            obs_np = np.asarray(obs)
+        # The frames go to the session unprocessed, so make sure they are
+        # float32 (a float64/half tensor would otherwise be rejected).
+        obs_np = obs_np.astype(np.float32, copy=False)
+        B = obs_np.shape[0]
+        raw = np.concatenate(
+            [self._session.run(None, {"input": obs_np[i:i+1]})[0] for i in range(B)],
+            axis=0,
+        )
+        # Take x, y columns up to the 4-second horizon
+        traj_xy = raw[:, :self.horizon_idx, :2].astype(np.float32)
+        # Reshape to (B, 1, K, 2) for uniform interface
+        trajectory = traj_xy[:, np.newaxis] * Scaler
+        scores = np.ones((B, 1), dtype=np.float32)
+        return trajectory, scores
+    def inference_vw(self, obs):
+        """Run model → (v, w) velocity commands; caches ``_last_best_traj``.
+        Returns:
+            (vw, best_traj): vw is a torch.Tensor (B, 2); best_traj is the
+            np.ndarray (B, K, 2) trajectory the commands were derived from.
+        """
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]  # (B, K, 2)
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        self._last_best_traj = best_traj
+        return torch.stack([v, w], dim=1), best_traj

MIMIC/mimic.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7557512c791b824a1d693368d590ee6741af8a9aeef7974b763eb912df69d765
+size 318192969

MIMIC/model_info.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+name: MIMIC
+goal_type: goal_free
+image_resolution: [288, 512]
+context_size: 16
+input_fps: 5.0
+obs_normalize: false
+inputs:
+  input: {shape: [1, 16, 3, 288, 512], dtype: float32, description: "16 RGB frames [t-15, ..., t] in [0,1]"}
+outputs:
+  output: {shape: [1, 15, 3], description: "15 waypoints x 3 (x, y, yaw) at non-uniform timestamps [1,2,4,6,7,8,10,12,14,15,17,19,21,23,25] @ 5Hz (0.2s-5.0s)"}
+num_waypoints: 15
+output_timestamps_5hz: [1, 2, 4, 6, 7, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25]
+output_fps: 5.0

NoMaD_GL_Official/inference.py ADDED Viewed

	@@ -0,0 +1,440 @@

+#!/usr/bin/env python3
+"""NoMaD-GL-Official inference utilities (3-component diffusion model).
+NoMaD uses iterative DDPM diffusion (10 steps) with 3 ONNX components:
+  1. vision_encoder: obs + goal_img + goal_mask → conditioning vector
+  2. noise_pred: single denoising step
+  3. dist_pred: distance estimation
+Goal-free mode: goal_mask=1, goal_img=random noise.
+Action decode (NavDP / official NoMaD convention):
+  Diffusion output is normalized delta-actions clipped to [-1, 1]. To recover
+  metric waypoints in the robot frame:
+    1. unnormalize:  d = (norm + 1)/2 * (max - min) + min     (dataset units)
+    2. cumsum over time → cumulative positions                (dataset units)
+    3. multiply by `metric_waypoint_spacing` (m) → meters     (robot frame)
+Usage:
+    nav = NoMaDGLNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs)     # (B, S, 8, 2) meters
+    vw, best_traj = nav.inference_vw(obs)            # also caches nav._last_best_traj
+"""
+import os
+import math
+import numpy as np
+import torch
+import yaml
+import onnxruntime as ort
+from torchvision import transforms
+try:
+    from urbansim.custom.pp import PurePursuitController
+except Exception:
+    PurePursuitController = None  # optional — only needed for inference_vw_pp
+# All model assets are resolved relative to the directory containing this file.
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+VE_PATH = os.path.join(MODEL_DIR, "nomad_vision_encoder.onnx")
+NP_PATH = os.path.join(MODEL_DIR, "nomad_noise_pred.onnx")
+DP_PATH = os.path.join(MODEL_DIR, "nomad_dist_pred.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+IMG_SIZE = 96  # side length (px) of the square frames produced by preprocessing
+NUM_DIFFUSION_STEPS = 10  # default for NoMaDGLNavigator(num_diffusion_steps=...)
+NUM_SAMPLES = 8  # default for NoMaDGLNavigator(num_samples=...)
+WAYPOINT_INDEX = 2  # which waypoint to use for v/w
+IMAGE_ASPECT_RATIO = 4 / 3  # NavDP optional center-crop target
+# Training camera intrinsics (NoMaD recon dataset, from NavDP data_config.yaml).
+# Used by the optional `use_train_intrinsics=True` rectification path which
+# warps the source pinhole view to look like a recon-camera capture squished
+# to 96x96 — i.e. the geometric input distribution the model saw at training.
+TRAIN_CAM_FX = 272.547
+TRAIN_CAM_FY = 266.358
+TRAIN_CAM_CX = 320.0
+TRAIN_CAM_CY = 220.0
+TRAIN_CAM_W = 640   # = 2 * cx
+TRAIN_CAM_H = 440   # = 2 * cy
+# Default source intrinsics: the urban-sim undistorted pinhole image emitted
+# by test_inference_forward.py:build_undistort_maps. Override at __init__ time
+# if you change the upstream fisheye→pinhole step.
+DEFAULT_SRC_INTRINSICS = {
+    "fx": 210.667, "fy": 210.667,
+    "cx": 256.0, "cy": 144.0,
+    "w": 512, "h": 288,
+}
+# Applied to each preprocessed 96x96 frame: PIL image → CHW float tensor,
+# followed by per-channel mean/std normalization (ImageNet statistics).
+IMAGENET_TRANSFORM = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+class PDController:
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        EPS = 1e-6
+        idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
+        dx = waypoints[:, idx, 0]
+        dyr = waypoints[:, -1, 1]
+        dxr = waypoints[:, -1, 0]
+        v = dx / dt
+        w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
+             * dxr.sign() / dt)
+        near_zero = dx.abs() < EPS
+        v = torch.where(near_zero, torch.zeros_like(v), v)
+        w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
+        if self.last_v is not None:
+            v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
+        v = v.clamp(-self.MAX_V, self.MAX_V)
+        w = w.clamp(-self.MAX_W, self.MAX_W)
+        self.last_v = v
+        return v, w
+def _make_session(path):
+    ort.set_default_logger_severity(3)
+    opts = ort.SessionOptions()
+    opts.log_severity_level = 3
+    return ort.InferenceSession(
+        path, sess_options=opts,
+        providers=[
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider"])
+class NoMaDGLNavigator:
+    """NoMaD goal-free diffusion navigator.
+    Runs DDPM diffusion loop (10 steps) to generate trajectory samples.
+    All coordinates: x=forward, y=left, meters.
+    """
+    context_size = 4  # 4 obs frames → 12 channels
+    multimodal = True
+    def __init__(
+        self,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 2.0,
+        num_samples: int = NUM_SAMPLES,
+        num_diffusion_steps: int = NUM_DIFFUSION_STEPS,
+        center_crop: bool = True,
+        use_train_intrinsics: bool = True,
+        src_intrinsics: dict = None,
+    ):
+        """Load the three ONNX sessions and model_info.yaml, then set up decoding and control.
+        Args:
+            device: torch device on which waypoint tensors are placed before
+                being handed to the controllers.
+            max_v: overrides the PD controller's MAX_V (linear velocity clamp).
+            max_w: overrides the PD controller's MAX_W (angular velocity clamp).
+            dt: value passed as `dt` to the PD controller in `inference_vw`.
+            num_samples: number of trajectories sampled per observation.
+            num_diffusion_steps: number of DDPM steps; also the length of the
+                noise schedule built by `_build_schedule`.
+            center_crop: crop frames to 4:3 before resizing. Has no effect
+                while `use_train_intrinsics` is True, because that path
+                returns first in `_to_model_input_uint8`.
+            use_train_intrinsics: warp frames to the training camera's view
+                instead of using crop/resize.
+            src_intrinsics: source pinhole intrinsics; `fx`, `fy`, `cx`, `cy`
+                are read when the affine warp is built. Defaults to a copy of
+                DEFAULT_SRC_INTRINSICS.
+        """
+        self.device = device
+        self.dt = dt
+        self.num_samples = num_samples
+        self.num_steps = num_diffusion_steps
+        # NavDP default is center_crop=False (direct resize). Flag exposed for
+        # A/B testing: True crops landscape input to 4:3 before resizing.
+        # NOTE(review): this wrapper's own default is center_crop=True.
+        self.center_crop = center_crop
+        # Optional virtual rectification to the NoMaD training camera (recon).
+        # When True, the source pinhole frame is warped (affine, same math as
+        # a homography between two pinhole cameras) so that its content lines
+        # up with what a recon-camera frame squished to 96x96 would look like.
+        self.use_train_intrinsics = use_train_intrinsics
+        self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
+        self._train_affine = None  # built lazily in _ensure_train_affine
+        self._ve = _make_session(VE_PATH)
+        self._np = _make_session(NP_PATH)
+        self._dp = _make_session(DP_PATH)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        # Action decoder params (NavDP convention)
+        action_stats = self._info.get("action_stats", {})
+        self._act_min = np.asarray(
+            action_stats.get("min", [0.0, -4.0]), dtype=np.float32)
+        self._act_max = np.asarray(
+            action_stats.get("max", [5.0, 4.0]), dtype=np.float32)
+        self._normalize_actions = bool(self._info.get("normalize", True))
+        self._waypoint_spacing = float(
+            self._info.get("metric_waypoint_spacing", 0.25))
+        # Model's expected observation frame rate (Hz). NoMaD was trained at
+        # 3 Hz; callers should downsample input video to this rate.
+        self.model_fps = float(self._info.get("model_fps", 3.0))
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        # Cache slot for visualization — last (B, 8, 2) trajectory chosen by
+        # the action head. Matches Coco's contract so downstream viewers
+        # (`model._last_best_traj`) just work.
+        self._last_best_traj = None
+        # Optional pure-pursuit controller for `inference_vw_pp`.
+        self._pp = (PurePursuitController(action_dt=3, waypoint_index=5)
+                    if PurePursuitController is not None else None)
+        # Build DDPM schedule (squaredcos_cap_v2)
+        self._build_schedule()
+    def _build_schedule(self):
+        """Pre-compute DDPM alpha/beta schedule (matches diffusers squaredcos_cap_v2)."""
+        T = self.num_steps
+        def alpha_bar(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+        betas = []
+        for i in range(T):
+            t1 = i / T
+            t2 = (i + 1) / T
+            b = min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999)
+            betas.append(b)
+        betas = np.array(betas, dtype=np.float64)
+        alphas = 1.0 - betas
+        alpha_cumprod = np.cumprod(alphas)
+        self._betas = betas
+        self._alpha_cumprod = alpha_cumprod
+        self._sqrt_alpha = np.sqrt(alphas)
+        self._sqrt_one_minus_alpha_cumprod = np.sqrt(1.0 - alpha_cumprod)
+        # For x0 prediction clipping
+        self._sqrt_recip_alpha_cumprod = np.sqrt(1.0 / alpha_cumprod)
+        self._sqrt_recip_alpha_cumprod_m1 = np.sqrt(1.0 / alpha_cumprod - 1.0)
+    def _ddpm_step(self, sample, noise_pred, t):
+        """Single DDPM reverse step: x_{t-1} from x_t and predicted noise."""
+        alpha_prod_t = self._alpha_cumprod[t]
+        beta_t = self._betas[t]
+        # Predict x0
+        x0_pred = (sample - math.sqrt(1.0 - alpha_prod_t) * noise_pred) / math.sqrt(alpha_prod_t)
+        x0_pred = np.clip(x0_pred, -1.0, 1.0)
+        # Compute mean of q(x_{t-1} | x_t, x_0)
+        if t > 0:
+            alpha_prod_prev = self._alpha_cumprod[t - 1]
+            beta_prod_t = 1.0 - alpha_prod_t
+            beta_prod_prev = 1.0 - alpha_prod_prev
+            coeff_x0 = math.sqrt(alpha_prod_prev) * beta_t / beta_prod_t
+            coeff_xt = math.sqrt(1.0 - beta_t) * beta_prod_prev / beta_prod_t
+            mean = coeff_x0 * x0_pred + coeff_xt * sample
+            # Variance
+            variance = beta_t * beta_prod_prev / beta_prod_t
+            noise = np.random.randn(*sample.shape).astype(np.float32)
+            return (mean + math.sqrt(variance) * noise).astype(np.float32)
+        else:
+            return x0_pred.astype(np.float32)
+    def reset(self):
+        self._controller.reset()
+        self._last_best_traj = None
+    def _ensure_train_affine(self):
+        """Build PIL affine coeffs mapping output (96x96) pixels → source pixels.
+        Both cameras are pinhole, so the ray (X/Z, Y/Z) for a target pixel
+        (u_t, v_t) is ((u_t - cx_t)/fx_t, (v_t - cy_t)/fy_t). Re-projecting that
+        ray into the source camera gives (u_s, v_s) = (ray_x * fx_s + cx_s,
+        ray_y * fy_s + cy_s). Substituting yields an affine map:
+            u_s = (fx_s / fx_t) * u_t + (cx_s - cx_t * fx_s / fx_t)
+            v_s = (fy_s / fy_t) * v_t + (cy_s - cy_t * fy_s / fy_t)
+        PIL.Image.transform expects (a, b, c, d, e, f) where:
+            u_s = a*u_t + b*v_t + c
+            v_s = d*u_t + e*v_t + f
+        """
+        if self._train_affine is not None:
+            return
+        # Effective recon intrinsics when its 640x440 native frame is squished
+        # to 96x96 (which is what the model saw during training).
+        fx_t = TRAIN_CAM_FX * IMG_SIZE / TRAIN_CAM_W
+        fy_t = TRAIN_CAM_FY * IMG_SIZE / TRAIN_CAM_H
+        cx_t = TRAIN_CAM_CX * IMG_SIZE / TRAIN_CAM_W
+        cy_t = TRAIN_CAM_CY * IMG_SIZE / TRAIN_CAM_H
+        src = self.src_intrinsics
+        a = src["fx"] / fx_t
+        e = src["fy"] / fy_t
+        c = src["cx"] - cx_t * a
+        f = src["cy"] - cy_t * e
+        self._train_affine = (a, 0.0, c, 0.0, e, f)
+    def _to_model_input_uint8(self, frame_uint8):
+        """Apply preprocessing to one (H, W, 3) uint8 RGB frame → (96, 96, 3) uint8.
+        Used for both the actual model input and for visualization (panel 2).
+        Does NOT apply ImageNet normalization.
+        """
+        from PIL import Image as PILImage
+        pil = PILImage.fromarray(frame_uint8)
+        if self.use_train_intrinsics:
+            self._ensure_train_affine()
+            pil = pil.transform(
+                (IMG_SIZE, IMG_SIZE), PILImage.AFFINE, self._train_affine,
+                resample=PILImage.BILINEAR, fillcolor=0)
+            return np.array(pil)
+        if self.center_crop:
+            w, h = pil.size
+            if w > h:
+                crop_w = int(h * IMAGE_ASPECT_RATIO)
+                x0 = max((w - crop_w) // 2, 0)
+                pil = pil.crop((x0, 0, x0 + crop_w, h))
+            else:
+                crop_h = int(w / IMAGE_ASPECT_RATIO)
+                y0 = max((h - crop_h) // 2, 0)
+                pil = pil.crop((0, y0, w, y0 + crop_h))
+        pil = pil.resize((IMG_SIZE, IMG_SIZE))
+        return np.array(pil)
+    def _preprocess_images(self, obs_rgb):
+        """obs_rgb: (B, T=4, 3, H, W) float32 [0,1] → (B, 12, 96, 96) ImageNet-norm."""
+        from PIL import Image as PILImage
+        B, T = obs_rgb.shape[:2]
+        frames = []
+        for b in range(B):
+            ch_list = []
+            for t in range(T):
+                frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
+                model_in = self._to_model_input_uint8(frame)
+                pil = PILImage.fromarray(model_in)
+                ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
+            frames.append(np.concatenate(ch_list, axis=0))
+        return np.stack(frames)  # (B, 12, 96, 96)
+    def _decode_actions(self, sample):
+        """Decode normalized deltas → metric waypoints (NavDP recipe).
+        sample: (N, T, 2) normalized deltas in [-1, 1]
+        Returns: (N, T, 2) cumulative positions in meters (x=fwd, y=left).
+        """
+        # 1. Unnormalize from [-1, 1] → [min, max] (dataset units).
+        deltas = (sample + 1.0) / 2.0
+        deltas = deltas * (self._act_max - self._act_min) + self._act_min
+        # 2. Cumulative sum over time.
+        positions = np.cumsum(deltas, axis=1)
+        # 3. Scale by metric_waypoint_spacing (only when training-time normalize=True).
+        if self._normalize_actions:
+            positions = positions * self._waypoint_spacing
+        return positions.astype(np.float32)
+    def inference_trajectory(self, obs):
+        """Run diffusion model → (trajectories, scores).
+        Args:
+            obs: (B, 4, 3, H, W) float32 in [0,1]. 4 frames.
+        Returns:
+            trajectory: np.ndarray (B, num_samples, 8, 2) — METERS, robot frame
+                        (x=forward, y=left). Decoded from normalized deltas via
+                        unnorm → cumsum → scale.
+            scores:     np.ndarray (B, num_samples) — uniform (no scoring in goal-free)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        obs_norm = self._preprocess_images(obs_np)  # (B, 12, 96, 96)
+        # Goal-free: random goal image + mask=1
+        fake_goal = np.random.randn(B, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+        mask = np.ones(B, dtype=np.int64)
+        # 1. Vision encoder
+        cond = self._ve.run(None, {
+            "obs_img": obs_norm,
+            "goal_img": fake_goal,
+            "goal_mask": mask,
+        })[0]  # (B, 256)
+        # Replicate for num_samples
+        cond_rep = np.repeat(cond, self.num_samples, axis=0)  # (B*S, 256)
+        # 2. Diffusion denoising loop (output: normalized deltas in [-1,1]).
+        BS = B * self.num_samples
+        sample = np.random.randn(BS, 8, 2).astype(np.float32)
+        for t in reversed(range(self.num_steps)):
+            t_arr = np.array(t, dtype=np.int64)
+            # Suppress stderr from ORT during inference
+            devnull = os.open(os.devnull, os.O_WRONLY)
+            old_stderr = os.dup(2)
+            os.dup2(devnull, 2)
+            try:
+                noise_pred = self._np.run(None, {
+                    "sample": sample,
+                    "timestep": t_arr,
+                    "global_cond": cond_rep,
+                })[0]
+            finally:
+                os.dup2(old_stderr, 2)
+                os.close(devnull)
+                os.close(old_stderr)
+            sample = self._ddpm_step(sample, noise_pred, t)
+        # 3. Decode normalized deltas → metric waypoints.
+        waypoints = self._decode_actions(sample)  # (BS, 8, 2) meters
+        trajectory = waypoints.reshape(B, self.num_samples, 8, 2)
+        scores = np.ones((B, self.num_samples), dtype=np.float32)
+        return trajectory, scores
+    def inference_vw(self, obs):
+        """Run model → (v, w) velocity commands.
+        Picks the first diffusion sample (goal-free has uniform scores, so any
+        sample is equally valid), runs the PD controller on its waypoints, and
+        caches the chosen trajectory on ``self._last_best_traj`` so visualizers
+        can pull it without re-running inference.
+        Args:
+            obs: (B, 4, 3, H, W) float32 in [0, 1]
+        Returns:
+            vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
+        """
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]  # (B, 8, 2)
+        self._last_best_traj = best_traj  # cached for visualization
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1), best_traj
+    def inference_vw_pp(self, obs, robot):
+        """Pure-pursuit variant of :meth:`inference_vw`.
+        Picks the first diffusion sample's waypoints and feeds them to the
+        urbansim ``PurePursuitController`` instead of the PD head. Same return
+        contract as :meth:`inference_vw`.
+        Args:
+            obs:   (B, 4, 3, H, W) float32 in [0, 1]
+            robot: IsaacLab articulation handle (``env.scene["robot"]``).
+        Returns:
+            vw: torch.Tensor (B, 2) — [linear_vel, angular_vel]
+        """
+        if self._pp is None:
+            raise RuntimeError(
+                "PurePursuitController unavailable — install urbansim or "
+                "make sure `urbansim.custom.pp` is on the Python path."
+            )
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]  # (B, 8, 2)
+        self._last_best_traj = best_traj  # cached for visualization
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._pp.step(waypoints, robot=robot)
+        return torch.stack([v, w], dim=1)

NoMaD_GL_Official/model_info.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+name: NoMaD-GL-Official
+goal_type: goal_less
+image_resolution: [96, 96]
+context_size: 4
+obs_normalize: true
+diffusion_steps: 10
+diffusion_scheduler: DDPMScheduler
+diffusion_config:
+  beta_schedule: squaredcos_cap_v2
+  clip_sample: true
+  prediction_type: epsilon
+inputs:
+  obs_img: {shape: [batch, 12, 96, 96], dtype: float32, description: "4 RGB frames x 3 channels, ImageNet normalized"}
+  goal_img: {shape: [batch, 3, 96, 96], dtype: float32, description: "Goal image (random noise for goal-free)"}
+  goal_mask: {shape: [batch], dtype: int64, description: "1=goal-free, 0=goal-conditioned"}
+outputs:
+  trajectories: {shape: [num_samples, 8, 2], description: "8 normalized delta-actions in [-1,1]; decode via unnorm→cumsum→scale"}
+  distances: {shape: [batch, 1], description: "Predicted distance to goal"}
+num_waypoints: 8
+num_samples: 8
+waypoint_index: 2
+# NoMaD action decoder (NavDP convention)
+#   1. unnormalize:  d = (norm + 1) / 2 * (max - min) + min
+#   2. cumsum over time → cumulative positions in dataset units
+#   3. multiply by metric_waypoint_spacing → meters in robot frame (x=fwd, y=left)
+normalize: true
+action_stats:
+  min: [0.0, -4.0]   # [min_dx, min_dy] in dataset waypoint-spacing units
+  max: [5.0, 4.0]     # [max_dx, max_dy]
+# Per-step metric spacing (meters): model_max_v / model_rate = 0.8 / 3.0
+model_max_v: 0.8
+model_rate: 3.0
+metric_waypoint_spacing: 0.26667
+# Inference frame rate (Hz). NoMaD was trained at 3 Hz observation rate; the
+# 4-frame context spans ~1.33 s and each predicted waypoint is 1/model_fps s
+# into the future. Upstream callers should downsample their input video to
+# match this — see test_inference_forward.py.
+model_fps: 3.0

NoMaD_GL_Official/nomad_dist_pred.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adab58bef61d7b8d0f2bf04636853f0056229e06634e2536a5e7821d6b9def4f
+size 71682

NoMaD_GL_Official/nomad_dist_pred.onnx.data ADDED Viewed

Binary file (69.6 kB). View file

NoMaD_GL_Official/nomad_noise_pred.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3588e1ff53ba748240f160d3cd3fec0c9c987f05e59ec64a631c3536e036e34
+size 15554469

NoMaD_GL_Official/nomad_vision_encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3477772d7ff671dbba9711c58b2df89deb9b1b6a93cae07c530524ab6d47377
+size 47986697

NoMaD_GL_Official/nomad_vision_encoder.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57ceee7e8f0777b6e5cb444cddb1e17549e8188c78ba414463696b130031b320
+size 47448064

README.md CHANGED Viewed

@@ -1,3 +1,235 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
+tags:
+  - robotics
+  - navigation
+  - visual-navigation
+  - embodied-ai
+  - onnx
+pipeline_tag: robotics
 ---
+# Navigation Model Zoo
+A collection of vision-based navigation policies exported to **ONNX**, each wrapped in a small,
+uniform Python inference API. Maintained by **Honglin He @ UCLA-VAIL**.
+Every model takes a short history of RGB frames and predicts a local trajectory (and optionally a
+distance-to-goal / arrival signal); a built-in PD controller turns the trajectory into `(v, ω)`
+velocity commands. All models share the same wrapper interface so they can be swapped and
+benchmarked without per-model glue code.
+## Models
+| Folder | Model / paper | Goal mode | Context | Input H×W | Waypoints | Weights |
+|--------|---------------|-----------|:-------:|:---------:|:---------:|---------|
+| [`GNM_GL_Official`](GNM_GL_Official) | [GNM](https://arxiv.org/abs/2210.03370) · ICRA&nbsp;2023 | goal-free | 6 | 64×85 | 5 | `gnm_imagegoal.onnx` (+`.data`) · 35 MB |
+| [`Vint_GL_Official`](Vint_GL_Official) | [ViNT](https://arxiv.org/abs/2306.14846) · CoRL&nbsp;2023 | goal-free | 6 | 64×85 | 5 | `vint_imagegoal.onnx` (+`.data`) · 97 MB |
+| [`NoMaD_GL_Official`](NoMaD_GL_Official) | [NoMaD](https://arxiv.org/abs/2310.07896) · ICRA&nbsp;2024 | goal-free (diffusion) | 4 | 96×96 | 8 ×8 samples | 3× `.onnx` (+`.data`) · 111 MB |
+| [`CityWalker_PG_Official`](CityWalker_PG_Official) | [CityWalker](https://arxiv.org/abs/2411.17820) · CVPR&nbsp;2025 | point-goal | 5 | 350×630 | 5 | `citywalker.onnx` · 806 MB |
+| [`MBRA_PG_Official`](MBRA_PG_Official) | [MBRA](https://arxiv.org/abs/2505.05592) · RA-L&nbsp;2025 | point-goal | 6 | 96×96 | 8 | `mbra.onnx` · 254 MB |
+| [`S2E`](S2E) | [S2E](https://arxiv.org/abs/2507.22028) · ICLR&nbsp;2026 | point-goal / goal-free | 11 | 256×256 | 10 | `s2e.onnx` · 382 MB |
+| [`MIMIC`](MIMIC) | [MIMIC](https://arxiv.org/abs/2603.22527) · ICRA&nbsp;2026 | goal-free | 16 | 288×512 | 13 | `mimic.onnx` · 318 MB |
+Suffix legend: `PG` = point-goal, `GL` = goal-less (goal-free). Models with a `.onnx.data` companion
+(GNM, ViNT, NoMaD) use ONNX external weights — keep each `.onnx` and its `.onnx.data` together.
+## Common interface
+Each folder is a self-contained module exposing one navigator class. They all follow the same contract:
+```python
+import numpy as np
+from MBRA_PG_Official.inference import MBRAPGNavigator   # run from the repo root
+nav = MBRAPGNavigator(device="cuda")          # use device="cpu" if you have no GPU
+# obs: (B, nav.context_size, 3, H, W) float32 in [0, 1]
+#      the wrapper resizes & normalizes to the model's spec internally
+obs = np.random.rand(1, nav.context_size, 3, 96, 96).astype(np.float32)
+# Point-goal models take goal_xy (standard frame: x=forward, y=left, meters);
+# goal-free models omit it.
+traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))  # (B, M, W, 2) meters
+vw, best     = nav.inference_vw(obs,        goal_xy=np.array([5.0, 0.2]))   # vw: (B, 2) = [v, ω]
+nav.reset()   # clears PD-controller velocity smoothing between episodes
+```
+Conventions shared by every model:
+- **Coordinate frame** — all user-facing inputs/outputs are *standard frame*: `x = forward`, `y = left`, in meters. Models with a different internal convention (e.g. CityWalker) convert transparently.
+- **Observations** — `(B, context_size, 3, H, W)`, `float32`, pixel values in `[0, 1]`. The wrapper handles resize and any ImageNet normalization. *(Exception: `MIMIC` expects frames already at 288×512 and does not resize.)*
+- **`inference_trajectory(obs[, goal_xy])`** → `(trajectory, scores)`. `trajectory` is `(B, M, W, 2)` in meters, where `M` is the number of modes (1 for unimodal, 8 for NoMaD) and `W` the waypoint count; `scores` is `(B, M)`.
+- **`inference_vw(obs[, goal_xy])`** → `(vw, best_traj)` where `vw` is a `(B, 2)` torch tensor of `[linear_v, angular_w]`. Tune limits with `max_v` / `max_w` at construction.
+- Goal-free models (`Vint`, `GNM`, `NoMaD`, `MIMIC`) ignore `goal_xy` — call `inference_trajectory(obs)`.
+## Installation
+```bash
+pip install onnxruntime-gpu numpy torch torchvision pyyaml pillow
+# CPU-only: use onnxruntime instead of onnxruntime-gpu
+pip install opencv-python   # required by S2E (frame resizing)
+```
+Optional, lab-internal dependency: `Vint`, `GNM`, and `NoMaD` expose an extra `inference_vw_pp()`
+method that uses `urbansim.custom.pp.PurePursuitController`. The import is optional and only needed
+for that method (in `Vint_GL_Official` it is attempted at module load inside a `try`/`except`, and
+`inference_vw_pp()` raises `RuntimeError` if the controller is unavailable).
+**`MIMIC` imports `urbansim` at module load**, so its `inference.py` will not import
+without the `urbansim` package on your path.
+## Model details
+### GNM_GL_Official — `gnm_imagegoal.onnx` (+ `.onnx.data`)
+**Paper:** *GNM: A General Navigation Model to Drive Any Robot* (ICRA 2023) · [arXiv:2210.03370](https://arxiv.org/abs/2210.03370) · [code](https://github.com/robodhruv/drive-any-robot)
+Goal-free General Navigation Model — same NavDP image-goal I/O contract as ViNT (`obs_img (B,18,64,85)` + `goal_img (B,3,64,85)` → `dist_pred (B,1)`, `action_pred (B,5,4)`), with a lower top speed. Expects input downsampled to ≈ 3 Hz.
+### Vint_GL_Official — `vint_imagegoal.onnx` (+ `.onnx.data`)
+**Paper:** *ViNT: A Foundation Model for Visual Navigation* (CoRL 2023) · [arXiv:2306.14846](https://arxiv.org/abs/2306.14846) · [project](https://general-navigation-models.github.io/vint/)
+Goal-free ViNT (NavDP image-goal backbone run with a random goal image). **ONNX I/O:** `obs_img (B,18,64,85)` (6 ImageNet-normalized frames × 3 ch) + `goal_img (B,3,64,85)` (random noise) → `dist_pred (B,1)`, `action_pred (B,5,4)`. Cumulative `xy` is already baked in; the wrapper scales by the 0.8 m metric spacing. Expects input downsampled to ≈ 3 Hz.
+### NoMaD_GL_Official — 3× ONNX (diffusion, + `.onnx.data`)
+**Paper:** *NoMaD: Goal Masked Diffusion Policies for Navigation and Exploration* (ICRA 2024) · [arXiv:2310.07896](https://arxiv.org/abs/2310.07896) · [project](https://general-navigation-models.github.io/nomad/)
+Goal-free diffusion policy. Runs a 10-step DDPM loop (`squaredcos_cap_v2`) over 3 components:
+`nomad_vision_encoder.onnx` (`obs_img (B,12,96,96)` + `goal_img (B,3,96,96)` + `goal_mask (B)` → `cond (B,256)`), `nomad_noise_pred.onnx` (one denoising step), and `nomad_dist_pred.onnx`. Produces **8 trajectory samples** → `trajectory (B,8,8,2)` meters (decode: unnormalize → cumsum → ×0.267 m spacing). This is the only multi-modal model and the slowest (diffusion + multiple samples).
+### CityWalker_PG_Official — `citywalker.onnx`
+**Paper:** *CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos* (CVPR 2025) · [arXiv:2411.17820](https://arxiv.org/abs/2411.17820) · [project](https://ai4ce.github.io/CityWalker/)
+Point-goal urban walker. **ONNX I/O:** `obs_images (B,5,3,350,630)` + `trajectory (B,6,2)` past waypoints → `wp_pred (B,5,2)`, `arrive_pred (B,1)` (arrival probability). Images are ImageNet-normalized internally; the model's internal `y=forward, x=right` frame is converted to standard frame by the wrapper. Input rate ≈ 5 Hz.
+### MBRA_PG_Official — `mbra.onnx`
+**Paper:** *Learning to Drive Anywhere with Model-Based Reannotation* (RA-L 2025) · [arXiv:2505.05592](https://arxiv.org/abs/2505.05592) · [project](https://model-base-reannotation.github.io/)
+Point-goal policy. **ONNX I/O:** `obs_images (B,6,3,96,96)` ImageNet-normalized + `goal_pose (B,4)` = `[x, y, sin(yaw), cos(yaw)]` → `waypoints (B,8,4)`. Goal is given as `goal_xy` (meters) and converted internally; waypoints are un-normalized by a 0.8 m metric spacing. Input rate ≈ 5 Hz.
+### S2E — `s2e.onnx`
+**Paper:** *From Seeing to Experiencing: Scaling Navigation Foundation Models with Reinforcement Learning* (ICLR 2026) · [arXiv:2507.22028](https://arxiv.org/abs/2507.22028) · [project](https://metadriverse.github.io/s2e)
+UCLA-VAIL navigation foundation model; this is the behavior-cloning, point-goal, web-pretrained variant (`S2EBC-PG-Web100`). **ONNX I/O:** `obs_images (B,11,3,256,256)` in `[0,1]` (no ImageNet norm) + `goal (B,3)` = `[norm_dist, cos(θ), sin(θ)]` → `wp_pred (B,10,3)` `[x,y,yaw]`, `wp_pred_score (B,63)` mode scores. Frames are resized to 256×256 with OpenCV.
+### MIMIC — `mimic.onnx`
+**Paper:** *Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion* (ICRA 2026) · [arXiv:2603.22527](https://arxiv.org/abs/2603.22527) · [project](https://vail-ucla.github.io/MIMIC)
+UCLA-VAIL goal-free long-context sidewalk policy. **ONNX I/O:** `input (1,16,3,288,512)` in `[0,1]` → `output (1,15,3)` `[x,y,yaw]` at non-uniform timestamps (0.2 s–5.0 s @ 5 Hz). Batch is processed one sample at a time; the wrapper keeps the first 13 waypoints (~4 s) and scales to meters. Requires `urbansim` (see Installation).
+## Downloading
+**Full repo** (includes the LFS-tracked ONNX weights):
+```bash
+hf download UCLA-VAIL/Navigation-Model-Zoo-Public --local-dir ./Navigation-Model-Zoo-Public
+```
+**One model** — fetch just its folder, e.g. MBRA:
+```bash
+hf download UCLA-VAIL/Navigation-Model-Zoo-Public \
+  --include "MBRA_PG_Official/*" --local-dir .
+```
+Then run from the repo root: `from MBRA_PG_Official.inference import MBRAPGNavigator`.
+> **External weights:** GNM, ViNT, and NoMaD ship `*.onnx.data` files — keep each `.onnx` and its
+> `.onnx.data` together in the same folder so ONNX Runtime can resolve the weights.
+## Intended use & limitations
+These are **research artifacts** for navigation research, reproduction, and benchmarking — not
+safety-validated for deployment on real robots without additional testing. Each policy's behavior
+is bounded by its training distribution (camera intrinsics, embodiment, frame rate, environment).
+Several wrappers rectify/resize inputs to a specific training camera; mismatched cameras may degrade
+performance.
+## License
+Released under **Apache 2.0**. Individual models carry the licenses and terms of their original
+sources (ViNT, GNM, NoMaD, CityWalker, MBRA) — check upstream before commercial use.
+## Citation
+If you use a model from this zoo, please cite its original paper.
+**GNM**
+```bibtex
+@inproceedings{shah2023gnm,
+  title={Gnm: A general navigation model to drive any robot},
+  author={Shah, Dhruv and Sridhar, Ajay and Bhorkar, Arjun and Hirose, Noriaki and Levine, Sergey},
+  booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)},
+  pages={7226--7233},
+  year={2023},
+  organization={IEEE}
+}
+```
+**ViNT**
+```bibtex
+@article{shah2023vint,
+  title={ViNT: A foundation model for visual navigation},
+  author={Shah, Dhruv and Sridhar, Ajay and Dashora, Nitish and Stachowicz, Kyle and Black, Kevin and Hirose, Noriaki and Levine, Sergey},
+  journal={arXiv preprint arXiv:2306.14846},
+  year={2023}
+}
+```
+**NoMaD**
+```bibtex
+@inproceedings{sridhar2024nomad,
+  title={Nomad: Goal masked diffusion policies for navigation and exploration},
+  author={Sridhar, Ajay and Shah, Dhruv and Glossop, Catherine and Levine, Sergey},
+  booktitle={2024 IEEE International Conference on Robotics and Automation (ICRA)},
+  pages={63--70},
+  year={2024},
+  organization={IEEE}
+}
+```
+**CityWalker**
+```bibtex
+@inproceedings{liu2025citywalker,
+  title={Citywalker: Learning embodied urban navigation from web-scale videos},
+  author={Liu, Xinhao and Li, Jintong and Jiang, Yicheng and Sujay, Niranjan and Yang, Zhicheng and Zhang, Juexiao and Abanes, John and Zhang, Jing and Feng, Chen},
+  booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},
+  pages={6875--6885},
+  year={2025}
+}
+```
+**MBRA**
+```bibtex
+@article{hirose2025learning,
+  title={Learning to drive anywhere with model-based reannotation},
+  author={Hirose, Noriaki and Ignatova, Lydia and Stachowicz, Kyle and Glossop, Catherine and Levine, Sergey and Shah, Dhruv},
+  journal={IEEE Robotics and Automation Letters},
+  volume={11},
+  number={2},
+  pages={1242--1249},
+  year={2025},
+  publisher={IEEE}
+}
+```
+**S2E**
+```bibtex
+@article{he2025seeing,
+  title={From seeing to experiencing: Scaling navigation foundation models with reinforcement learning},
+  author={He, Honglin and Ma, Yukai and Squicciarini, Brad  and Wu, Wayne and Zhou, Bolei},
+  journal={arXiv preprint arXiv:2507.22028},
+  year={2025}
+}
+```
+**MIMIC**
+```bibtex
+@article{he2026learning,
+  title={Learning Sidewalk Autopilot from Multi-Scale Imitation with Corrective Behavior Expansion},
+  author={He, Honglin and Ma, Yukai and Squicciarini, Brad and Wu, Wayne and Zhou, Bolei},
+  journal={arXiv preprint arXiv:2603.22527},
+  year={2026}
+}
+```
+## Contact
+Maintained by [UCLA-VAIL](https://vail-ucla.github.io/). Open an issue/discussion on the
+repository page for questions or contributions.

S2E/inference.py ADDED Viewed

	@@ -0,0 +1,167 @@

+#!/usr/bin/env python3
+"""S2EBC-PG-Web100 inference utilities.
+S2EBC-PG takes 11 frames at 256x256 (no ImageNet normalization, [0,1]),
+a goal vector [norm_dist, cos(angle), sin(angle)], and outputs
+10 waypoints x 3 (x, y, yaw). Raw waypoint x/y are multiplied by WP_SCALE (0.25) to get meters.
+User provides goal_xy in standard frame (x=forward, y=left, meters).
+    nav = S2ENavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs, goal_xy=np.array([5.0, 0.2]))
+    vw, best_traj = nav.inference_vw(obs, goal_xy=np.array([5.0, 0.2]))
+"""
+import os
+import math
+import numpy as np
+import torch
+import onnxruntime as ort
+# Directory containing this file; the ONNX weights and model_info.yaml are
+# resolved relative to it.
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "s2e.onnx")
+# Square side length (pixels) that observation frames are resized to.
+IMG_SIZE = 256
+# Raw model waypoint x/y are multiplied by this factor in
+# S2ENavigator.inference_trajectory to obtain meters.
+# NOTE(review): confirm the factor against the training-time waypoint scaling.
+WP_SCALE = 0.25
+class PDController:
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        EPS = 1e-6
+        idx = min(4, waypoints.shape[1] - 1)
+        dx = waypoints[:, idx, 0]
+        dyr = waypoints[:, -1, 1]
+        dxr = waypoints[:, -1, 0]
+        v = dx / dt
+        w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
+             * dxr.sign() / dt )
+        near_zero = dx.abs() < EPS
+        v = torch.where(near_zero, torch.zeros_like(v), v)
+        w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
+        if self.last_v is not None:
+            v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
+        v = v.clamp(-self.MAX_V, self.MAX_V)
+        w = w.clamp(-self.MAX_W, self.MAX_W)
+        self.last_v = v
+        return v, w
+class S2ENavigator:
+    """S2E point-goal / goal-free navigator.
+    All user-facing coordinates: x=forward, y=left, meters.
+    """
+    context_size = 11
+    multimodal = False
+    def __init__(
+        self,
+        onnx_path: str = ONNX_PATH,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 2.0,
+    ):
+        self.device = device
+        self.dt = dt
+        ort.set_default_logger_severity(3)
+        providers = [
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider",
+        ]
+        self._session = ort.InferenceSession(onnx_path, providers=providers)
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        self._last_best_traj = None
+        INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+        import yaml
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+    def reset(self):
+        self._controller.reset()
+        self._last_best_traj = None
+    @staticmethod
+    def _goal_to_input(goal_xy):
+        """Standard goal [x_fwd, y_left] meters → model input [norm_dist, cos(θ), sin(θ)]."""
+        x, y = float(goal_xy[0]), float(goal_xy[1])
+        dist = math.sqrt(x * x + y * y)
+        norm_dist = max(min(dist, 200.0), 0.1) / 200.0
+        angle = math.atan2(y, x)
+        return np.array([norm_dist, math.cos(angle), math.sin(angle)],
+                        dtype=np.float32)
+    def inference_trajectory(self, obs, goal_xy=None):
+        """Run model → (trajectory, scores).
+        Args:
+            obs: (B, 11, 3, H, W) float32 in [0,1]. 11 frames.
+                 Images are resized internally to 256x256.
+            goal_xy: (2,) goal in standard frame, or None → [5.0, 0.0].
+        Returns:
+            trajectory: np.ndarray (B, 1, 10, 2) meters
+            scores:     np.ndarray (B, 1)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        # Resize to 256x256 if needed (obs is CHW)
+        if obs_np.shape[-2:] != (IMG_SIZE, IMG_SIZE):
+            import cv2
+            resized = np.empty((B, 11, 3, IMG_SIZE, IMG_SIZE), dtype=np.float32)
+            for b in range(B):
+                for c in range(11):
+                    frame_hwc = obs_np[b, c].transpose(1, 2, 0)  # HWC
+                    frame_rsz = cv2.resize(frame_hwc, (IMG_SIZE, IMG_SIZE))
+                    resized[b, c] = frame_rsz.transpose(2, 0, 1)
+            obs_np = resized
+        if goal_xy is None:
+            goal_xy = np.array([5.0, 0.0])
+        goal_input = self._goal_to_input(goal_xy)
+        goal_batch = np.tile(goal_input, (B, 1))
+        all_wp_raw = []
+        for i in range(B):
+            out = self._session.run(None, {
+                "obs_images": obs_np[i:i+1],  # (1, 11, 3, 256, 256)
+                "goal": goal_batch[i:i+1],  # (1, 3)
+            })
+            all_wp_raw.append(out[0][:, :, :2])
+        wp_raw = np.concatenate(all_wp_raw, axis=0)  # (B, 10, 2) — x, y (scaled)
+        wp_meters = wp_raw * WP_SCALE  # un-scale to meters
+        trajectory = wp_meters[:, np.newaxis].astype(np.float32)  # (B, 1, 10, 2)
+        scores = np.ones((B, 1), dtype=np.float32)
+        self._last_best_traj = trajectory[:, 0]
+        return trajectory, scores
+    def inference_vw(self, obs, goal_xy=None):
+        """Run model → (v, w).
+        Returns:
+            vw: torch.Tensor (B, 2)
+        """
+        trajectory, _ = self.inference_trajectory(obs, goal_xy)
+        best_traj = trajectory[:, 0]
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1), best_traj

S2E/model_info.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+name: S2E
+goal_type: point_goal / goal-free (random goal)
+image_resolution: [256, 256]
+context_size: 11
+obs_normalize: false
+inputs:
+  obs_images: {shape: [batch, 11, 3, 256, 256], dtype: float32, description: "11 RGB observation frames in [0,1]"}
+  goal: {shape: [batch, 3], dtype: float32, description: "Goal [norm_dist, cos(theta), sin(theta)]; norm_dist = clip(dist_m, 0.1, 200) / 200, theta = atan2(y, x) in the standard frame"}
+outputs:
+  wp_pred: {shape: [batch, 10, 3], description: "10 waypoints x 3 (x, y, yaw)"}
+  wp_pred_score: {shape: [batch, 63], description: "Mode selection scores"}
+num_waypoints: 10

S2E/s2e.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee1410ab55a54946cf25323e82b3be2a8cb106e2ccb2ad878e7638839686108a
+size 381993971

Vint_GL_Official/inference.py ADDED Viewed

	@@ -0,0 +1,294 @@

+#!/usr/bin/env python3
+"""Vint-GL-Official inference wrapper (NavDP image-goal ViNT in goal-free mode).
+The ONNX has two inputs (obs_img, goal_img) and two outputs (dist, action).
+Goal-free mode: pass random noise as goal_img — same trick NavDP uses in
+its `ViNTAgent.step_nogoal`.
+    nav = VintGLOfficialNavigator(device="cuda")
+    traj, scores = nav.inference_trajectory(obs)   # (B, 1, 5, 2) meters
+    vw, best_traj = nav.inference_vw(obs)          # also caches nav._last_best_traj
+Action decode (NavDP convention):
+  The model bakes `torch.cumsum` into its forward, so the ONNX output
+  `action_pred[:, :, :2]` is *already* cumulative — we just scale by
+  `metric_waypoint_spacing` to get meters in the robot frame (x=fwd, y=left).
+  No unnormalize step (unlike NoMaD).
+"""
+import os
+import math
+import numpy as np
+import torch
+import yaml
+import onnxruntime as ort
+from torchvision import transforms
+try:
+    from urbansim.custom.pp import PurePursuitController
+except Exception:
+    PurePursuitController = None  # optional — only needed for inference_vw_pp
+# Directory containing this file; the ONNX weights and model_info.yaml are
+# resolved relative to it.
+MODEL_DIR = os.path.dirname(os.path.abspath(__file__))
+ONNX_PATH = os.path.join(MODEL_DIR, "vint_imagegoal.onnx")
+INFO_PATH = os.path.join(MODEL_DIR, "model_info.yaml")
+# NavDP training image size for ViNT (width, height).
+IMG_W, IMG_H = 85, 64
+# Exposed as VintGLOfficialNavigator.num_waypoints.
+NUM_WAYPOINTS = 5
+CTX_FRAMES = 6  # context_size=5 past + 1 current
+# Index of the waypoint whose x offset PDController turns into forward speed.
+WAYPOINT_INDEX = 2
+# Target width/height ratio for the optional center crop.
+IMAGE_ASPECT_RATIO = 4 / 3
+# Recon training camera (for the optional virtual rectification path).
+TRAIN_CAM_FX = 272.547
+TRAIN_CAM_FY = 266.358
+TRAIN_CAM_CX = 320.0
+TRAIN_CAM_CY = 220.0
+TRAIN_CAM_W = 640
+TRAIN_CAM_H = 440
+# Source-camera pinhole intrinsics used when the caller passes no
+# `src_intrinsics` to VintGLOfficialNavigator.
+DEFAULT_SRC_INTRINSICS = {
+    "fx": 210.667, "fy": 210.667,
+    "cx": 256.0, "cy": 144.0,
+    "w": 512, "h": 288,
+}
+# Applied to every preprocessed frame: convert to a tensor, then normalize
+# with the ImageNet per-channel mean/std.
+IMAGENET_TRANSFORM = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225]),
+])
+class PDController:
+    MAX_V = 2.5
+    MAX_W = 0.65
+    def __init__(self):
+        self.last_v = None
+    def reset(self):
+        self.last_v = None
+    def __call__(self, waypoints, dt=1.0):
+        EPS = 1e-6
+        idx = min(WAYPOINT_INDEX, waypoints.shape[1] - 1)
+        dx = waypoints[:, idx, 0]
+        dyr = waypoints[:, -1, 1]
+        dxr = waypoints[:, -1, 0]
+        v = dx / dt
+        w = (torch.atan2(dyr, dxr.abs().clamp(min=EPS))
+             * dxr.sign() / dt)
+        near_zero = dx.abs() < EPS
+        v = torch.where(near_zero, torch.zeros_like(v), v)
+        w = torch.where(near_zero, dyr.sign() * (math.pi / 20.0), w)
+        if self.last_v is not None:
+            v = v.clamp(self.last_v - 0.5, self.last_v + 0.4)
+        v = v.clamp(-self.MAX_V, self.MAX_V)
+        w = w.clamp(-self.MAX_W, self.MAX_W)
+        self.last_v = v
+        return v, w
+def _make_session(path):
+    ort.set_default_logger_severity(3)
+    opts = ort.SessionOptions()
+    opts.log_severity_level = 3
+    return ort.InferenceSession(
+        path, sess_options=opts,
+        providers=[
+            ("CUDAExecutionProvider",
+             {"arena_extend_strategy": "kSameAsRequested"}),
+            "CPUExecutionProvider"])
+class VintGLOfficialNavigator:
+    """Vint-GL-Official navigator (NavDP image-goal ViNT, goal-free mode)."""
+    context_size = CTX_FRAMES   # 6 frames in obs tensor
+    multimodal = False
+    num_waypoints = NUM_WAYPOINTS
+    def __init__(
+        self,
+        device: str = "cuda",
+        max_v: float = 2.5,
+        max_w: float = 0.65,
+        dt: float = 1.25,
+        center_crop: bool = True,
+        use_train_intrinsics: bool = True,
+        src_intrinsics: dict = None,
+    ):
+        self.device = device
+        self.dt = dt
+        # NavDP default is center_crop=False (direct resize). Flag exposed for
+        # A/B testing: True crops landscape input to 4:3 before resizing.
+        self.center_crop = center_crop
+        # Optional virtual rectification to the NoMaD/ViNT training camera
+        # (recon dataset). When True, the source pinhole frame is warped
+        # (affine, same math as a homography between two pinhole cameras)
+        # so its content lines up with what a recon-camera frame squished
+        # to 85x64 would look like.
+        self.use_train_intrinsics = use_train_intrinsics
+        self.src_intrinsics = dict(src_intrinsics or DEFAULT_SRC_INTRINSICS)
+        self._train_affine = None
+        self._sess = _make_session(ONNX_PATH)
+        with open(INFO_PATH, "r") as f:
+            self._info = yaml.safe_load(f)
+        # Action decoder params (NavDP convention — cumsum already baked
+        # into the network, only the metric scale remains here).
+        self._normalize_actions = bool(self._info.get("normalize", True))
+        self._waypoint_spacing = float(
+            self._info.get("metric_waypoint_spacing", 0.8 / 3.0))
+        self.model_fps = float(self._info.get("model_fps", 3.0))
+        self._controller = PDController()
+        self._controller.MAX_V = max_v
+        self._controller.MAX_W = max_w
+        # Cache for visualization (last best trajectory in metric meters).
+        self._last_best_traj = None
+        # Optional pure-pursuit controller (used by inference_vw_pp).
+        self._pp = (PurePursuitController(action_dt=3, waypoint_index=2)
+                    if PurePursuitController is not None else None)
+    def reset(self):
+        self._controller.reset()
+        self._last_best_traj = None
+    # ------------------------------------------------------------------
+    # Preprocessing — mirrors NoMaD_GL_Official, only the target size differs.
+    # ------------------------------------------------------------------
+    def _ensure_train_affine(self):
+        if self._train_affine is not None:
+            return
+        # Recon intrinsics scaled to (IMG_W, IMG_H) = (85, 64).
+        fx_t = TRAIN_CAM_FX * IMG_W / TRAIN_CAM_W
+        fy_t = TRAIN_CAM_FY * IMG_H / TRAIN_CAM_H
+        cx_t = TRAIN_CAM_CX * IMG_W / TRAIN_CAM_W
+        cy_t = TRAIN_CAM_CY * IMG_H / TRAIN_CAM_H
+        src = self.src_intrinsics
+        a = src["fx"] / fx_t
+        e = src["fy"] / fy_t
+        c = src["cx"] - cx_t * a
+        f = src["cy"] - cy_t * e
+        self._train_affine = (a, 0.0, c, 0.0, e, f)
+    def _to_model_input_uint8(self, frame_uint8):
+        from PIL import Image as PILImage
+        pil = PILImage.fromarray(frame_uint8)
+        if self.use_train_intrinsics:
+            self._ensure_train_affine()
+            pil = pil.transform(
+                (IMG_W, IMG_H), PILImage.AFFINE, self._train_affine,
+                resample=PILImage.BILINEAR, fillcolor=0)
+            return np.array(pil)
+        if self.center_crop:
+            w, h = pil.size
+            if w > h:
+                crop_w = int(h * IMAGE_ASPECT_RATIO)
+                x0 = max((w - crop_w) // 2, 0)
+                pil = pil.crop((x0, 0, x0 + crop_w, h))
+            else:
+                crop_h = int(w / IMAGE_ASPECT_RATIO)
+                y0 = max((h - crop_h) // 2, 0)
+                pil = pil.crop((0, y0, w, y0 + crop_h))
+        pil = pil.resize((IMG_W, IMG_H))
+        return np.array(pil)
+    def _preprocess_images(self, obs_rgb):
+        """obs_rgb: (B, T=6, 3, H, W) float32 [0,1] → (B, 18, 64, 85) ImageNet-norm."""
+        from PIL import Image as PILImage
+        B, T = obs_rgb.shape[:2]
+        frames = []
+        for b in range(B):
+            ch_list = []
+            for t in range(T):
+                frame = (obs_rgb[b, t].transpose(1, 2, 0) * 255).astype(np.uint8)
+                model_in = self._to_model_input_uint8(frame)
+                pil = PILImage.fromarray(model_in)
+                ch_list.append(IMAGENET_TRANSFORM(pil).numpy())
+            frames.append(np.concatenate(ch_list, axis=0))
+        return np.stack(frames)  # (B, 3*T, IMG_H, IMG_W)
+    # ------------------------------------------------------------------
+    # Inference
+    # ------------------------------------------------------------------
+    def inference_trajectory(self, obs):
+        """Run ViNT → metric waypoints + uniform scores.
+        Args:
+            obs: (B, 6, 3, H, W) float32 in [0,1]. 6 frames.
+        Returns:
+            trajectory: np.ndarray (B, 1, 5, 2) — meters in robot frame
+            scores:     np.ndarray (B, 1) — uniform (1.0)
+        """
+        if isinstance(obs, torch.Tensor):
+            obs_np = obs.cpu().numpy()
+        else:
+            obs_np = np.asarray(obs, dtype=np.float32)
+        B = obs_np.shape[0]
+        obs_in = self._preprocess_images(obs_np)  # (B, 18, 64, 85)
+        # Goal-free: random fake goal (matches NavDP ViNTAgent.step_nogoal).
+        fake_goal = np.random.randn(B, 3, IMG_H, IMG_W).astype(np.float32)
+        # Suppress ORT stderr noise during this call.
+        devnull = os.open(os.devnull, os.O_WRONLY)
+        old_stderr = os.dup(2)
+        os.dup2(devnull, 2)
+        all_action_pred = []
+        try:
+            for i in range(B):
+                dist_pred, action_pred = self._sess.run(
+                    None, {"obs_img": obs_in[i:i+1], "goal_img": fake_goal[i:i+1]})
+                all_action_pred.append(action_pred)
+        finally:
+            os.dup2(old_stderr, 2)
+            os.close(devnull)
+            os.close(old_stderr)
+        action_pred = np.concatenate(all_action_pred, axis=0)  # (B, 5, 4)
+        # action_pred shape (B, 5, 4): xy already cumsum'd, last two are sin/cos.
+        xy = action_pred[:, :, :2]
+        if self._normalize_actions:
+            xy = xy * self._waypoint_spacing  # meters
+        trajectory = xy[:, None, :, :]  # (B, 1, 5, 2)
+        scores = np.ones((B, 1), dtype=np.float32)
+        return trajectory.astype(np.float32), scores
+    def inference_vw(self, obs):
+        """Run ViNT → (v, w) velocity commands.
+        Caches the chosen trajectory on ``self._last_best_traj``.
+        """
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]  # (B, 5, 2)
+        self._last_best_traj = best_traj
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._controller(waypoints, dt=self.dt)
+        return torch.stack([v, w], dim=1), best_traj
+    def inference_vw_pp(self, obs, robot):
+        """Pure-pursuit variant of :meth:`inference_vw`."""
+        if self._pp is None:
+            raise RuntimeError(
+                "PurePursuitController unavailable — install urbansim or "
+                "make sure `urbansim.custom.pp` is on the Python path.")
+        trajectory, _ = self.inference_trajectory(obs)
+        best_traj = trajectory[:, 0]
+        self._last_best_traj = best_traj
+        waypoints = torch.from_numpy(best_traj).float().to(self.device)
+        v, w = self._pp.step(waypoints, robot=robot)
+        return torch.stack([v, w], dim=1)

Vint_GL_Official/model_info.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+name: Vint_GL_Official
+goal_type: goal_less
+image_resolution: [64, 85]
+context_size: 6
+obs_normalize: true
+normalize: true
+action_stats:
+  min: [0.0, -2.0]
+  max: [5.0, 2.0]
+model_max_v: 2.5
+model_rate: 3
+metric_waypoint_spacing: 0.8
+model_fps: 3
+len_traj_pred: 5
+learn_angle: true
+inputs:
+  obs_img:
+    shape: [batch, 18, 64, 85]
+    dtype: float32
+    description: 6 RGB frames x 3 channels, ImageNet normalized; resized to image_resolution
+  goal_img:
+    shape: [batch, 3, 64, 85]
+    dtype: float32
+    description: Goal image (random noise for goal-free)
+outputs:
+  dist_pred:
+    shape: [batch, 1]
+    description: Predicted distance to goal
+  action_pred:
+    shape: [batch, 5, 4]
+    description: waypoints (cumsum'd dx,dy in dataset units; scale by metric_waypoint_spacing),
+      sin/cos heading
+onnx: vint_imagegoal.onnx

Vint_GL_Official/vint_imagegoal.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d480bf0fc28f6c1d84f568bf28733f6b418920218e3affd6f9d59fff1506ecec
+size 1415900

Vint_GL_Official/vint_imagegoal.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:399b8d50e7fcc812a69734fcc51f40dd313d964fec79d6fea3429252bc994791
+size 95748096