init

Files changed (15) hide show

README.md +109 -0
models/GNM/gnm.onnx +3 -0
models/GNM/gnm.pth +3 -0
models/NaviBridger/cvae.pth +3 -0
models/NaviBridger/cvae.yaml +151 -0
models/NaviBridger/navibridger_cvae.pth +3 -0
models/NaviBridger/navibridger_cvae.yaml +150 -0
models/NaviBridger/navibridger_dist_pred_net.onnx +3 -0
models/NaviBridger/navibridger_vision_encoder.onnx +3 -0
models/NoMaD/nomad.pth +3 -0
models/NoMaD/nomad_dist_pred_net.onnx +3 -0
models/NoMaD/nomad_noise_pred_net.onnx +3 -0
models/NoMaD/nomad_vision_encoder.onnx +3 -0
models/ViNT/vint.onnx +3 -0
models/ViNT/vint.pth +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,112 @@
 ---
 license: mit
 ---

 ---
 license: mit
+tags:
+  - zero-shot evaluation
+  - foundation models
+  - visual navigation
+  - robot learning
+  - real-world evaluation
+  - onnx
+pipeline_tag: robotics
+library_name: onnxruntime
+arxiv: 2603.25937
+base_model:
+    - rail-berkeley/crossformer
+    - robodhruv/visualnav-transformer
+    - hren20/NaiviBridger
 ---
+# Can Vision Foundation Models Navigate? Zero-Shot Real-World Evaluation and Lessons Learned — ONNX Models
+ONNX-optimized exports of visual navigation models for deployment on physical robots (e.g., Boston Dynamics Spot, AgileX Limo, AgileX Bunker). These exports are derived from the original works listed below — all credit for architectures and training goes to the respective authors.
+See https://github.com/MaevaGuerrier/vnm-zeroshot-eval for deployment instructions.
+# Acknowledgements
+We would like to thank the authors of the following works, whose open-source models made this evaluation possible.
+- [GNM](https://arxiv.org/abs/2210.03370)
+- [ViNT](https://arxiv.org/abs/2306.14846)
+- [NoMaD](https://arxiv.org/abs/2310.07896)
+- [NaviBridger](https://arxiv.org/abs/2504.10041)
+- [CrossFormer](https://arxiv.org/abs/2408.11812)
+# Citations
+If you use this work, please cite:
+```bibtex
+@article{guerrier2026vnm,
+  title   = {Can Vision Foundation Models Navigate? Zero-Shot Real-World Evaluation and Lessons Learned},
+  author  = {Guerrier, Maeva and Soma, Karthik and Pavlasek, Jana and Beltrame, Giovanni},
+  journal = {arXiv preprint arXiv:2603.25937},
+  year    = {2026}
+}
+```
+Consider citing the original models as well:
+```bibtex
+@misc{shah2023gnmgeneralnavigationmodel,
+      title={GNM: A General Navigation Model to Drive Any Robot},
+      author={Dhruv Shah and Ajay Sridhar and Arjun Bhorkar and Noriaki Hirose and Sergey Levine},
+      year={2023},
+      eprint={2210.03370},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2210.03370},
+}
+```
+```bibtex
+@misc{shah2023vintfoundationmodelvisual,
+      title={ViNT: A Foundation Model for Visual Navigation},
+      author={Dhruv Shah and Ajay Sridhar and Nitish Dashora and Kyle Stachowicz and Kevin Black and Noriaki Hirose and Sergey Levine},
+      year={2023},
+      eprint={2306.14846},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2306.14846},
+}
+```
+```bibtex
+@misc{sridhar2023nomadgoalmaskeddiffusion,
+      title={NoMaD: Goal Masked Diffusion Policies for Navigation and Exploration},
+      author={Ajay Sridhar and Dhruv Shah and Catherine Glossop and Sergey Levine},
+      year={2023},
+      eprint={2310.07896},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2310.07896},
+}
+```
+```bibtex
+@misc{ren2025priordoesmattervisual,
+      title={Prior Does Matter: Visual Navigation via Denoising Diffusion Bridge Models},
+      author={Hao Ren and Yiming Zeng and Zetong Bi and Zhaoliang Wan and Junlong Huang and Hui Cheng},
+      year={2025},
+      eprint={2504.10041},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2504.10041},
+}
+```
+```bibtex
+@misc{doshi2024scalingcrossembodiedlearningpolicy,
+      title={Scaling Cross-Embodied Learning: One Policy for Manipulation, Navigation, Locomotion and Aviation},
+      author={Ria Doshi and Homer Walke and Oier Mees and Sudeep Dasari and Sergey Levine},
+      year={2024},
+      eprint={2408.11812},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2408.11812},
+}
+```

models/GNM/gnm.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c2525cb2d42b2a7d8174d00345285b7ee5acff5232a6fc91a7531b19b145652
+size 34630394

models/GNM/gnm.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b03e0255f8a547290d4079f4e7d610ff69987122f17e019bd36684c08b3ee95
+size 104806886

models/NaviBridger/cvae.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd8414f2b37e7bb20fb61c8cd7064d112c24fdedb8ef5f2e9c066749fcc02ab5
+size 915311478

models/NaviBridger/cvae.yaml ADDED Viewed

	@@ -0,0 +1,151 @@

+project_name: cvae
+run_name: cvae
+# training setup
+use_wandb: True # set to false if you don't want to log to wandb
+train: True
+batch_size: 256
+epochs: 30
+gpu_ids: [0]
+num_workers: 12
+lr: 1e-4
+optimizer: adamw
+clipping: False
+max_norm: 1.
+scheduler: "cosine"
+warmup: True
+warmup_epochs: 4
+cyclic_period: 10
+plateau_patience: 3
+plateau_factor: 0.5
+seed: 0
+save_freq: 1
+# model params
+model_type: cvae
+vision_encoder: navibridge_encoder
+encoding_size: 256
+obs_encoder: efficientnet-b0
+attn_unet: False
+cond_predict_scale: False
+mha_num_attention_heads: 4
+mha_num_attention_layers: 4
+mha_ff_dim_factor: 4
+down_dims: [64, 128, 256]
+# diffusion model params
+num_diffusion_iters: 10
+# mask
+goal_mask_prob: 0.5
+# normalization for the action space
+normalize: True
+# context
+context_type: temporal
+context_size: 3 # 5
+alpha: 1e-4
+# distance bounds for the distance and action predictions
+distance:
+  min_dist_cat: 0
+  max_dist_cat: 20
+action:
+  min_dist_cat: 3
+  max_dist_cat: 20
+# action output params
+len_traj_pred: 8
+action_dim: 2
+learn_angle: False
+# navibridge
+sampler_name: "uniform"
+pred_mode: "ve"
+weight_schedule: "karras"
+sigma_data: 0.5
+sigma_min: 0.002
+sigma_max: 80.0
+rho: 7.0
+beta_d: 2
+beta_min: 0.1
+cov_xy: 0.
+guidance: 1.
+# sample defaults
+clip_denoised: True
+sampler: "euler"
+churn_step_ratio: 0.
+# prior settings
+prior_policy: "gaussian"  # handcraft, gaussian, cvae
+class_num: 5
+angle_ranges: [[0, 67.5],
+              [67.5, 112.5],
+              [112.5, 180],
+              [180, 270],
+              [270, 360]]
+min_std_angle: 5.0
+max_std_angle: 20.0
+min_std_length: 1.0
+max_std_length: 5.0
+# cvae
+train_params:
+  batch_size: 256
+  num_itr: 3001
+  lr: 0.5e-5
+  lr_gamma: 0.99
+  lr_step: 1000
+  l2_norm: 0.0
+  ema: 0.99
+diffuse_params:
+  latent_dim: 64
+  layer: 3
+  net_type: vae_mlp
+  ckpt_path: /workspace/src/NaiviBridger/deployment/model_weights/cvae.pth
+  pretrain: False
+# dataset specific parameters
+image_size: [96, 96] # width, height
+datasets:
+  recon:
+    data_folder: ./datasets/recon
+    train: ./datasets/data_splits/recon/train # path to train folder with traj_names.txt
+    test: ./datasets/data_splits/recon/test # path to test folder with traj_names.txt
+    end_slack: 3 # because many trajectories end in collisions
+    goals_per_obs: 1 # how many goals are sampled per observation
+    negative_mining: True # negative mining from the ViNG paper (Shah et al.)
+  go_stanford:
+    data_folder: ./datasets/go_stanford/ # datasets/stanford_go_new
+    train: ./datasets/data_splits/go_stanford/train/
+    test: ./datasets/data_splits/go_stanford/test/
+    end_slack: 0
+    goals_per_obs: 2 # increase dataset size
+    negative_mining: True
+  sacson:
+    data_folder: ./datasets/sacson/
+    train: ./datasets/data_splits/sacson/train/
+    test: ./datasets/data_splits/sacson/test/
+    end_slack: 3 # because many trajectories end in collisions
+    goals_per_obs: 1
+    negative_mining: True
+  scand:
+    data_folder: ./datasets/scand/
+    train: ./datasets/data_splits/scand/train/
+    test: ./datasets/data_splits/scand/test/
+    end_slack: 0
+    goals_per_obs: 1
+    negative_mining: True
+# logging stuff
+## a value of 0 turns the corresponding setting off
+print_log_freq: 500 # in iterations
+image_log_freq: 1000 #0 # in iterations
+num_images_log: 8 #0
+pairwise_test_freq: 0 # in epochs
+eval_fraction: 0.25
+wandb_log_freq: 10 # in iterations
+eval_freq: 1 # in epochs

models/NaviBridger/navibridger_cvae.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:511334b25ca38da88787f1ccdf6eea1f0f7eff6d762acd9223a04fd347920fe2
+size 76547213

models/NaviBridger/navibridger_cvae.yaml ADDED Viewed

	@@ -0,0 +1,150 @@

+project_name: navibridge
+run_name: navibridge
+# training setup
+use_wandb: True # set to false if you don't want to log to wandb
+train: True
+batch_size: 224
+epochs: 30
+gpu_ids: [1]
+num_workers: 12
+lr: 1e-4
+optimizer: adamw
+clipping: False
+max_norm: 1.
+scheduler: "cosine"
+warmup: True
+warmup_epochs: 4
+cyclic_period: 10
+plateau_patience: 3
+plateau_factor: 0.5
+seed: 0
+save_freq: 1
+# model params
+model_type: navibridge
+vision_encoder: navibridge_encoder
+encoding_size: 256
+obs_encoder: efficientnet-b0
+attn_unet: False
+cond_predict_scale: False
+mha_num_attention_heads: 4
+mha_num_attention_layers: 4
+mha_ff_dim_factor: 4
+down_dims: [64, 128, 256]
+# diffusion model params
+num_diffusion_iters: 10
+# mask
+goal_mask_prob: 0.5
+# normalization for the action space
+normalize: True
+# context
+context_type: temporal
+context_size: 3 # 5
+alpha: 1e-4
+# distance bounds for the distance and action predictions
+distance:
+  min_dist_cat: 0
+  max_dist_cat: 20
+action:
+  min_dist_cat: 3
+  max_dist_cat: 20
+# action output params
+len_traj_pred: 8
+action_dim: 2
+learn_angle: False
+# navibridge
+sampler_name: "uniform"
+pred_mode: "ve"
+weight_schedule: "karras"
+sigma_data: 0.5
+sigma_min: 0.002
+sigma_max: 10.0
+rho: 7.0
+beta_d: 2
+beta_min: 0.1
+cov_xy: 0.
+guidance: 1.
+clip_denoised: True
+sampler: "euler"
+churn_step_ratio: 0.
+# prior settings
+prior_policy: "cvae"  # handcraft, gaussian, cvae
+class_num: 5
+angle_ranges: [[0, 67.5],
+              [67.5, 112.5],
+              [112.5, 180],
+              [180, 270],
+              [270, 360]]
+min_std_angle: 5.0
+max_std_angle: 20.0
+min_std_length: 1.0
+max_std_length: 5.0
+# cvae
+train_params:
+  batch_size: 256
+  num_itr: 3001
+  lr: 0.5e-5
+  lr_gamma: 0.99
+  lr_step: 1000
+  l2_norm: 0.0
+  ema: 0.99
+diffuse_params:
+  latent_dim: 64
+  layer: 3
+  net_type: vae_mlp
+  ckpt_path: /workspace/src/NaiviBridger/deployment/model_weights/cvae.pth
+  pretrain: False
+# dataset specific parameters
+image_size: [96, 96] # width, height
+datasets:
+  recon:
+    data_folder: ./datasets/recon
+    train: ./datasets/data_splits/recon/train # path to train folder with traj_names.txt
+    test: ./datasets/data_splits/recon/test # path to test folder with traj_names.txt
+    end_slack: 3 # because many trajectories end in collisions
+    goals_per_obs: 1 # how many goals are sampled per observation
+    negative_mining: True # negative mining from the ViNG paper (Shah et al.)
+  go_stanford:
+    data_folder: ./datasets/go_stanford/ # datasets/stanford_go_new
+    train: ./datasets/data_splits/go_stanford/train/
+    test: ./datasets/data_splits/go_stanford/test/
+    end_slack: 0
+    goals_per_obs: 2 # increase dataset size
+    negative_mining: True
+  sacson:
+    data_folder: ./datasets/sacson/
+    train: ./datasets/data_splits/sacson/train/
+    test: ./datasets/data_splits/sacson/test/
+    end_slack: 3 # because many trajectories end in collisions
+    goals_per_obs: 1
+    negative_mining: True
+  scand:
+    data_folder: ./datasets/scand/
+    train: ./datasets/data_splits/scand/train/
+    test: ./datasets/data_splits/scand/test/
+    end_slack: 0
+    goals_per_obs: 1
+    negative_mining: True
+# logging stuff
+## a value of 0 turns the corresponding setting off
+print_log_freq: 100 # in iterations
+image_log_freq: 1000 #0 # in iterations
+num_images_log: 8 #0
+pairwise_test_freq: 0 # in epochs
+eval_fraction: 0.25
+wandb_log_freq: 10 # in iterations
+eval_freq: 1 # in epochs

models/NaviBridger/navibridger_dist_pred_net.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cba7328b993c06858db7776f7c46293b17c032dcdefcb540a1cecce181a1bc61
+size 71653

models/NaviBridger/navibridger_vision_encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89e15954b138aca90940677ca0b75855408f86caed50b7b0f497a4234bbd7721
+size 47967171

models/NoMaD/nomad.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70f79b8262527e20e56ced64a3e3d7ef91855bc9e7c3fa348d78edcb83c6a333
+size 76473631

models/NoMaD/nomad_dist_pred_net.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47a4272c8b6fea3982cc403fcbb7275461135b173b02e4a8bac545138ff641ed
+size 71653

models/NoMaD/nomad_noise_pred_net.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d22b30234f2a2db469a82e5304feec65f20c7daa0df4f4d4224c4d32063153c5
+size 15550505

models/NoMaD/nomad_vision_encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35439a338d1a12f481c0186443521b7cd9d4a330eb82309f54a348f237e9fa97
+size 47967171

models/ViNT/vint.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:092fb24e9f73c6ea1a42e07442232e73b98a920195fc1b550e4aed52c3f43304
+size 96004784

models/ViNT/vint.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:155fd72de2e98ae0e2fef9404072e1aefa79dae5f7f2411d4bcf7e384b83aa1f
+size 430167114