bazaar-research committed on Apr 1

Commit

fb11af9

verified ·

1 Parent(s): 5f59008

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +46 -0
.gitignore +222 -0
.gitmodules +6 -0
.vscode/launch.json +88 -0
LEGAL.md +7 -0
LICENSE +202 -0
Makefile +21 -0
README.md +330 -0
assets/LingBot-VLA.pdf +3 -0
assets/PaliGemmaPI.png +3 -0
assets/QwenPI.png +3 -0
assets/QwenPI_PaliGemmaPI.png +3 -0
assets/Teaser.png +3 -0
assets/exp-gm-100.png +3 -0
assets/exp-robotwin.png +3 -0
assets/norm_stats/libero.json +280 -0
assets/norm_stats/robotwin_50.json +229 -0
assets/norm_stats/robotwin_5_customized.json +201 -0
assets/norm_stats/robotwin_all_new.json +229 -0
assets/scale_ps.png +3 -0
assets/scale_sr.png +3 -0
configs/norm/robotwin_5.yaml +12 -0
configs/vla/robotwin_load20000h.yaml +42 -0
configs/vla/robotwin_load20000h_depth.yaml +68 -0
deploy/__init__.py +0 -0
deploy/image_tools.py +58 -0
deploy/lingbot_robotwin_policy.py +506 -0
deploy/lingbot_robotwin_policy_rep.py +491 -0
deploy/msgpack_numpy.py +57 -0
deploy/websocket_client_policy.py +88 -0
deploy/websocket_policy_server.py +89 -0
docker/Dockerfile +34 -0
docs/Makefile +20 -0
docs/README.md +19 -0
docs/conf.py +66 -0
docs/config/config.md +96 -0
docs/examples/qwen2vl.rst +2 -0
docs/examples/qwen3_moe.md +125 -0
docs/index.rst +2 -0
docs/requirements-docs.txt +9 -0
docs/start/start.rst +2 -0
experiment/libero/README.md +18 -0
experiment/libero/libero/libero_utils.py +112 -0
experiment/libero/libero/req.txt +6 -0
experiment/libero/libero/run_libero_eval.py +300 -0
experiment/libero/robot_utils.py +84 -0
experiment/robotwin/README.md +85 -0
lingbotvla/__init__.py +16 -0
lingbotvla/checkpoint/__init__.py +25 -0
lingbotvla/checkpoint/checkpointer.py +340 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,49 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/LingBot-VLA.pdf filter=lfs diff=lfs merge=lfs -text
+assets/PaliGemmaPI.png filter=lfs diff=lfs merge=lfs -text
+assets/QwenPI.png filter=lfs diff=lfs merge=lfs -text
+assets/QwenPI_PaliGemmaPI.png filter=lfs diff=lfs merge=lfs -text
+assets/Teaser.png filter=lfs diff=lfs merge=lfs -text
+assets/exp-gm-100.png filter=lfs diff=lfs merge=lfs -text
+assets/exp-robotwin.png filter=lfs diff=lfs merge=lfs -text
+assets/scale_ps.png filter=lfs diff=lfs merge=lfs -text
+assets/scale_sr.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/assets/normal_comaprison.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/assets/overview_simplified.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/assets/panorama_pipeline.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/01_HouseIndoor.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/02_Office.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/03_Traffic.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/05_Mountain.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/06_MaitreyaBuddha.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/07_Breads.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/08_CatGirl.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/09_Restaurant.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/10_MedievalVillage.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/MoGe/example_images/panorama/Braunschweig_Panoram.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/attention/fig-attention-vis.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/dataset/diversity_figure.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/device/device-divided.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/device/device-full.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/downstream_grasp/fig-grasp-demo.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/downstream_tracking/fig-dynamic-tracking.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/downstream_tracking/fig-scene-tracking-crop.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/assets/teaser/teaser-crop.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/0/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/0/rgb.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/1/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/1/rgb.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/2/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/2/rgb.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/3/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/3/rgb.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/4/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/4/rgb.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/5/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/5/rgb.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/6/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/7/raw_depth.png filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/examples/7/rgb.jpg filter=lfs diff=lfs merge=lfs -text
+lingbotvla/models/vla/vision_models/lingbot-depth/tech-report.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,222 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# log
+*log.txt
+ossutil_output/
+.sumi/
+env.sh
+pids_qwenpi.txt
+run.sh
+start_multi_eval.sh
+trash/
+eval.sh
+# xwc
+output/
+wandb/

.gitmodules ADDED Viewed

	@@ -0,0 +1,6 @@

+[submodule "lingbotvla/models/vla/vision_models/lingbot-depth"]
+	path = lingbotvla/models/vla/vision_models/lingbot-depth
+	url = https://github.com/Robbyant/lingbot-depth
+[submodule "lingbotvla/models/vla/vision_models/MoGe"]
+	path = lingbotvla/models/vla/vision_models/MoGe
+	url = https://github.com/microsoft/MoGe.git

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "deploy lingbotvla (模块方式)",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "deploy.lingbot_robotwin_policy",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "justMyCode": false,
+            "args": [
+                "--model_path",
+                "output/ori_4/checkpoints/global_step_12850/hf_ckpt",
+                "--use_length",
+                "50",
+                "--chunk_ret",
+                "true",
+                "--debug_infer_once"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "QWEN25_PATH": "/group/ossdphi_algo_scratch_11/weicxu/huggingface_cache/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3"
+            }
+        },
+        {
+            "name": "example_call_robotwin_server",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "deploy.example_call_robotwin_server",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "justMyCode": false,
+            "args": [
+                "--host",
+                "127.0.0.1",
+                "--port",
+                "8006"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            }
+        },
+        {
+            "name": "train lingbotvla",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": [
+                "configs/vla/robotwin_load20000h.yaml",
+                "--model.model_path",
+                "robbyant/lingbot-vla-4b",
+                "--data.train_path",
+                "mixed_robotwin_5tasks_repo_0.1.0",
+                "--train.output_dir",
+                "output/",
+                "--model.tokenizer_path",
+                "Qwen/Qwen2.5-VL-3B-Instruct",
+                "--train.micro_batch_size",
+                "1",
+                "--train.global_batch_size",
+                "1",
+                "--train.enable_full_shard",
+                "true",
+                "--train.use_compile",
+                "false",
+                "--train.enable_fp32",
+                "false",
+                "--train.freeze_vision_encoder",
+                "true",
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "2",
+                "LOCAL_RANK": "0",
+                "RANK": "0",
+                "WORLD_SIZE": "1",
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "29500",
+                "PYDEVD_USE_SYS_MONITORING": "0"
+            }
+        }
+    ]
+}

LEGAL.md ADDED Viewed

	@@ -0,0 +1,7 @@

+Legal Disclaimer
+Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
+法律免责声明
+关于代码注释部分，中文注释为官方版本，其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致，当中文注释与其它语言注释存在不一致时，请以中文注释为准。

LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [2026] [Robbyant Team]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Makefile ADDED Viewed

	@@ -0,0 +1,21 @@

+.PHONY: build commit quality style test
+check_dirs := tasks tests lingbot docs setup.py
+build:
+	python3 setup.py sdist bdist_wheel
+commit:
+	pre-commit install
+	pre-commit run --all-files
+quality:
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
+style:
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)
+test:
+	pytest tests/

README.md ADDED Viewed

	@@ -0,0 +1,330 @@

+<h1 align="center">LingBot-VLA: A Pragmatic VLA Foundation Model</h1>
+<p align="center">
+  <a href="assets/LingBot-VLA.pdf"><img src="https://img.shields.io/static/v1?label=Paper&message=PDF&color=red&logo=arxiv"></a>
+  <a href="https://technology.robbyant.com/lingbot-vla"><img src="https://img.shields.io/badge/Project-Website-blue"></a>
+  <a href="https://huggingface.co/collections/robbyant/lingbot-vla"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%97%20Model&message=HuggingFace&color=yellow"></a>
+  <a href="https://modelscope.cn/collections/Robbyant/LingBot-VLA"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%96%20Model&message=ModelScope&color=purple"></a>
+  <a href="https://huggingface.co/datasets/robbyant/gm100"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%97%20GM-100&message=HuggingFace&color=yellow"></a>
+  <a href="LICENSE"><img src="https://img.shields.io/badge/License-Apache--2.0-green"></a>
+</p>
+<p align="center">
+  <img src="assets/Teaser.png" width="100%">
+</p>
+## 🥳 We are excited to introduce **LingBot-VLA**, a pragmatic Vision-Language-Action foundation model.
+**LingBot-VLA** is designed to be **pragmatic**:
+- **Large-scale Pre-training Data**: 20,000 hours of real-world
+data from 9 popular dual-arm robot configurations.
+<p align="center">
+  <img src="assets/scale_sr.png" width="45%" style="margin: 0 10px;">
+  <img src="assets/scale_ps.png" width="45%" style="margin: 0 10px;">
+</p>
+- **Strong Performance**: Achieves clear superiority over competitors on simulation and real-world benchmarks.
+- **Training Efficiency**: Delivers a 1.5–2.8× speedup over existing VLA-oriented codebases, depending on the underlying VLM base model.
+## 🚀 News
+- **[2026-01-27]** The LingBot-VLA Technical Report is available on arXiv.
+- **[2026-01-27]** Weights and code released!
+---
+## 🛠️ Installation
+Requirements
+ - Python 3.12.3
+ - PyTorch 2.8.0
+ - CUDA 12.8
+```bash
+# Install PyTorch and LeRobot
+pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128
+GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/huggingface/lerobot.git
+cd lerobot
+git checkout 0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
+pip install -e .
+# Install flash attention
+pip install /path/to/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+# Clone the repository
+git clone https://github.com/robbyant/lingbot-vla.git
+cd lingbot-vla/
+git submodule update --init --remote --recursive
+pip install -e .
+pip install -r requirements.txt
+# Install LingBot-Depth dependency
+cd ./lingbotvla/models/vla/vision_models/lingbot-depth/
+pip install -e . --no-deps
+cd ../MoGe
+pip install -e .
+```
+---
+## 📦 Model Download
+We release LingBot-VLA pre-trained weights in two configurations: a depth-free version and a depth-distilled version.
+- **Pretrained Checkpoints for Post-Training with and without depth**
+| Model Name | Huggingface | ModelScope | Description |
+| :--- | :---: | :---: | :---: |
+| LingBot-VLA-4B &nbsp; | [🤗 lingbot-vla-4b](https://huggingface.co/robbyant/lingbot-vla-4b) | [🤖 lingbot-vla-4b](https://modelscope.cn/models/Robbyant/lingbot-vla-4b) | LingBot-VLA *w/o* Depth|
+| LingBot-VLA-4B-Depth | [🤗 lingbot-vla-4b-depth](https://huggingface.co/robbyant/lingbot-vla-4b-depth) | [🤖 lingbot-vla-4b-depth](https://modelscope.cn/models/Robbyant/lingbot-vla-4b-depth) | LingBot-VLA *w/* Depth |
+To train LingBot with our codebase, weights from [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), [MoGe-2-vitb-normal](https://huggingface.co/Ruicheng/moge-2-vitb-normal), and [LingBot-Depth](https://huggingface.co/robbyant/lingbot-depth-pretrain-vitl-14) also need to be prepared.
+- **Run Command**:
+```bash
+python3 scripts/download_hf_model.py --repo_id robbyant/lingbot-vla-4b --local_dir lingbot-vla-4b
+```
+---
+## 💻 Post-Training Example
+- **Data Preparation**:
+Please follow [RoboTwin2.0 Preparation](experiment/robotwin/README.md)
+- **Training Configuration**:
+We provide the mixed post-training configuration for five RoboTwin 2.0 tasks ("open_microwave", "click_bell", "stack_blocks_three", "place_shoe", and "put_object_cabinet").
+<details>
+<summary><b>Click to expand full YAML configuration</b></summary>
+```yaml
+model:
+  model_path: "path/to/lingbot_vla_checkpoint" # Path to pre-trained VLA foundation model (w/o or w/ depth)
+  tokenizer_path: "path/to/Qwen2.5-VL-3B-Instruct"
+  post_training: true            # Enable post-training/fine-tuning mode
+  adanorm_time: true
+  old_adanorm: true
+data:
+  datasets_type: vla
+  data_name: robotwin_5_new
+  train_path: "path/to/lerobot_merged_data" # merged data from 5 robotwin2.0 tasks
+  num_workers: 8
+  norm_type: bounds_99_woclip
+  norm_stats_file: assets/norm_stats/robotwin_50.json # file of normalization statistics
+train:
+  output_dir: "path/to/output"
+  loss_type: L1_fm               # we apply L1 flow-matching loss in robotwin2.0 finetuning
+  data_parallel_mode: fsdp2      # Use Fully Sharded Data Parallel (PyTorch FSDP2)
+  enable_full_shard: false       # Don't reshard after forward in FSDP2
+  module_fsdp_enable: true
+  use_compile: true              # Acceleration via torch.compile
+  use_wandb: false
+  rmpad: false
+  rmpad_with_pos_ids: false
+  ulysses_parallel_size: 1
+  freeze_vision_encoder: false   # ViT needs to be optimized
+  tokenizer_max_length: 24       # token numbers of task prompt
+  action_dim: 14                 # Target robot action space dimension
+  max_action_dim: 75             # action dim in LingBot-VLA
+  max_state_dim: 75              # state dim in LingBot-VLA
+  lr: 1.0e-4
+  lr_decay_style: constant
+  num_train_epochs: 69           # finetuning 20k step
+  micro_batch_size: 32
+  global_batch_size: 256
+  max_steps: 220000
+  ckpt_manager: dcp
+  save_steps: 220000
+  save_epochs: 69
+  enable_fp32: true
+  enable_resume: true            # resume training automatically
+  # ===========================================================================
+  # Depth Injection Parameters
+  # (Required only for LingBot-VLA with Depth. Ignore if not using depth)
+  # ===========================================================================
+  align_params:
+    mode: 'query'                  # Query-based distillation
+    num_task_tokens: 8             # Number of learnable task-specific tokens
+    use_image_tokens: True
+    use_task_tokens: False
+    use_text_tokens: False
+    use_contrastive: True
+    contrastive_loss_weight: 0.3
+    depth_loss_weight: 0.002
+    llm:                           # VLM Projection Settings
+      dim_out: 2048
+      image_token_size: 8
+      image_input_size: 224
+    depth:
+      model_type: MoRGBD
+      moge_path: "path/to/moGe-2-vitb-normal"
+      morgbd_path: "path/to/LingBot-Depth"
+      num_layers: 1
+      num_heads: 4
+      dim_head: 32
+      ff_mult: 1
+      num_backbone_tokens: 256
+      token_size: 16
+      dim_out: 1024
+      input_size: 224
+    visual_steps: 10000
+    visual_dir: "path/to/output/images" # visualization path of depth distillation
+```
+</details>
+- **Run Command**:
+```bash
+# without depth
+bash train.sh tasks/vla/train_lingbotvla.py ./configs/vla/robotwin_load20000h.yaml  --model.model_path /path/to/LingBot-VLA --data.train_path path/to/mixed_robotwin_5tasks --train.output_dir /path/to/lingbot_robotwin5tasks/ --model.tokenizer_path /path/to/Qwen2.5-VL-3B-Instruct --train.micro_batch_size ${your_batch_size} --train.global_batch_size ${your_batch_size * your_gpu_num}
+# with depth
+bash train.sh tasks/vla/train_lingbotvla.py ./configs/vla/robotwin_load20000h_depth.yaml  --model.model_path /path/to/LingBot-VLA-Depth  --data.train_path /path/to/mixed_robotwin_5tasks --train.output_dir /path/to/lingbot_depth_robotwin5tasks --model.tokenizer_path /path/to/Qwen2.5-VL-3B-Instruct --model.moge_path /path/to/moge2-vitb-normal.pt --model.morgbd_path /path/to/LingBot-Depth-Pretrained --train.micro_batch_size ${your_batch_size} --train.global_batch_size ${your_batch_size * your_gpu_num}
+```
+- **Evaluation**
+```bash
+# robotwin2.0
+export QWEN25_PATH=path_to_Qwen2.5-VL-3B-Instruct
+python -m deploy.lingbot_robotwin_policy \
+ --model_path path_to_your_model \
+ --use_length 50 \
+ --port port
+```
+- **Customized Post-training**:
+To run post-training on your own downstream tasks, please refer to the example in [Custom](lingbotvla/data/vla_data/README.md) for details.
+---
+## 🏗️ Efficiency
+<p align="center">
+  <img src="assets/QwenPI_PaliGemmaPI.png" width="85%">
+</p>
+We evaluate the training efficiency of our codebase against established baselines for both <b>Qwen2.5-VL-3B-π</b> and <b>PaliGemma-3B-pt-224-π</b> models. The results demonstrate that our codebase
+achieved the fastest training speeds in both model settings. The above figures detail the training throughput across configurations of 8, 16, 32, 128, and 256 GPUs, alongside the theoretical linear scaling limit.
+> **📢 Note on Throughput Metrics:**
+> All throughput values (e.g., 261 samples/sec) represent the **total aggregate throughput across all GPUs**, not per-GPU performance.
+> <br><sup>(Updated: Previously mislabeled as per-GPU in earlier versions. We apologize for the confusion.)</sup>
+---
+## 📊 Performance
+Our LingBot-VLA achieves state-of-the-art results on real-world and simulation benchmarks:
+- **GM-100 across 3 robot platforms**
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2">Platform</th>
+      <th colspan="2">WALL-OSS</th>
+      <th colspan="2">GR00T N1.6</th>
+      <th colspan="2">π<sub>0.5</sub></th>
+      <th colspan="2">Ours w/o depth</th>
+      <th colspan="2">Ours w/ depth</th>
+    </tr>
+    <tr>
+      <th>SR</th><th>PS</th>
+      <th>SR</th><th>PS</th>
+      <th>SR</th><th>PS</th>
+      <th>SR</th><th>PS</th>
+      <th>SR</th><th>PS</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Agibot G1</td>
+      <td>2.99%</td><td>8.75%</td><td>5.23%</td><td>12.63%</td><td>7.77%</td><td>21.98%</td><td><b>12.82%</b></td><td>30.04%</td><td>11.98%</td><td><b>30.47%</b></td>
+    </tr>
+    <tr>
+      <td>AgileX</td>
+      <td>2.26%</td><td>8.16%</td><td>3.26%</td><td>10.52%</td><td>17.20%</td><td>34.82%</td><td>15.50%</td><td>36.31%</td><td><b>18.93%</b></td><td><b>40.36%</b></td>
+    </tr>
+    <tr>
+      <td>Galaxea R1Pro</td>
+      <td>6.89%</td><td>14.13%</td><td>14.29%</td><td>24.83%</td><td>14.10%</td><td>26.14%</td><td>18.89%</td><td>34.71%</td><td><b>20.98%</b></td><td><b>35.40%</b></td>
+    </tr>
+    <tr>
+      <td><b>Average</b></td>
+      <td>4.05%</td><td>10.35%</td><td>7.59%</td><td>15.99%</td><td>13.02%</td><td>27.65%</td><td>15.74%</td><td>33.69%</td><td><b>17.30%</b></td><td><b>35.41%</b></td>
+    </tr>
+  </tbody>
+</table>
+- **RoboTwin 2.0 (Clean and Randomized)**
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2" ><b>Simulation Tasks</b></th>
+      <th colspan="2"><b>&pi;<sub>0.5</sub></b></th>
+      <th colspan="2"><b>Ours w/o depth</b></th>
+      <th colspan="2"><b>Ours w/ depth</b></th>
+    </tr>
+    <tr>
+      <th><b>Clean</b></th>
+      <th><b>Rand.</b></th>
+      <th><b>Clean</b></th>
+      <th><b>Rand.</b></th>
+      <th><b>Clean</b></th>
+      <th><b>Rand.</b></th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr style="border-top: 1px solid #ccc;"> <!-- \midrule -->
+      <td><b>Average SR</b></td>
+      <td>82.74%</td>
+      <td>76.76%</td>
+      <td>86.50%</td>
+      <td>85.34%</td>
+      <td>88.56%</td>
+      <td>86.68%</td>
+    </tr>
+    <!-- You can continue adding rows for other tasks here -->
+  </tbody>
+</table>
+📢 We have released our checkpoints of LingBot-VLA-Posttrain-Robotwin:
+| Model Name | Huggingface | ModelScope | Description |
+| :--- | :---: | :---: | :---: |
+| LingBot-VLA-4B-Posttrain-Robotwin &nbsp; | [🤗 lingbot-vla-4b-posttrain-robotwin](https://huggingface.co/robbyant/lingbot-vla-4b-posttrain-robotwin) | [🤖 lingbot-vla-4b-posttrain-robotwin](https://modelscope.cn/models/Robbyant/lingbot-vla-4b-posttrain-robotwin) | LingBot-VLA-Posttrain-Robotwin *w/o* Depth|
+| LingBot-VLA-4B-Depth-Posttrain-Robotwin | [🤗 lingbot-vla-4b-depth-posttrain-robotwin](https://huggingface.co/robbyant/lingbot-vla-4b-depth-posttrain-robotwin) | [🤖 lingbot-vla-4b-depth-posttrain-robotwin](https://modelscope.cn/models/Robbyant/lingbot-vla-4b-depth-posttrain-robotwin) | LingBot-VLA-Posttrain-Robotwin *w/* Depth |
+We also provide [evaluation code](deploy/lingbot_robotwin_policy_rep.py) for the community to reproduce the performance of LingBot-VLA on RoboTwin 2.0:
+```bash
+export QWEN25_PATH=path_to_Qwen2.5-VL-3B-Instruct
+python -m deploy.lingbot_robotwin_policy_rep \
+ --model_path Path_to_LingBot-VLA-Posttrain-Robotwin \
+ --use_length 50 \
+ --port port
+```
+<p align="center">
+  <img src="assets/exp-gm-100.png" width="45%" style="margin: 0 10px;">
+  <img src="assets/exp-robotwin.png" width="45%" style="margin: 0 10px;">
+</p>
+---
+## 📝 Citation
+If you find our work useful in your research, please consider citing our paper.
+```bibtex
+@article{wu2026pragmatic,
+  title={A Pragmatic VLA Foundation Model},
+  author={Wei Wu and Fan Lu and Yunnan Wang and Shuai Yang and Shi Liu and Fangjing Wang and Shuailei Ma and He Sun and Yong Wang and Zhenqi Qiu and Houlong Xiong and Ziyu Wang and Shuai Zhou and Yiyu Ren and Kejia Zhang and Hui Yu and Jingmei Zhao and Qian Zhu and Ran Cheng and Yong-Lu Li and Yongtao Huang and Xing Zhu and Yujun Shen and Kecheng Zheng},
+  journal={arXiv preprint arXiv:2601.18692v1},
+  year={2026}
+}
+```
+---
+## 📄 License Agreement
+This project is licensed under the [Apache-2.0 License](LICENSE).
+## 😊 Acknowledgement
+We would like to express our sincere gratitude to the developers of [VeOmni](https://arxiv.org/abs/2508.02317) and [LeRobot](https://github.com/huggingface/lerobot#). This project benefits significantly from their outstanding work and contributions to the open-source community.

assets/LingBot-VLA.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b0a361d6084d74afc0bc9fcdee5051375b701a8e41013460107a46902bd0426
+size 10000817

assets/PaliGemmaPI.png ADDED Viewed

Git LFS Details

SHA256: e691d3ffcabb56307bd58397b04b575e03186b6e6f98aa86cd0a00f6327659b8
Pointer size: 131 Bytes
Size of remote file: 458 kB

assets/QwenPI.png ADDED Viewed

Git LFS Details

SHA256: f327696f64edd947a3f4b6ce4d81d88420bc8ca756fc80b4db937228d571f150
Pointer size: 131 Bytes
Size of remote file: 442 kB

assets/QwenPI_PaliGemmaPI.png ADDED Viewed

Git LFS Details

SHA256: 4ce326329047abdf297f713ae303693db983de4849f3ad5f32a92c3ca310658d
Pointer size: 131 Bytes
Size of remote file: 209 kB

assets/Teaser.png ADDED Viewed

Git LFS Details

SHA256: 7081c4c6c8586c21ade32fbfe7547f0841b201c46302ab495c9537cfc982ab54
Pointer size: 132 Bytes
Size of remote file: 9.14 MB

assets/exp-gm-100.png ADDED Viewed

Git LFS Details

SHA256: 9afddc707eb74534e0c1e3903eed0ee6a2ea24df883f7eb1b2fc8d0c5862068d
Pointer size: 131 Bytes
Size of remote file: 516 kB

assets/exp-robotwin.png ADDED Viewed

Git LFS Details

SHA256: 1d61317bee06123a946302d358ff14f11cc01640cfb820f31630cbf612373ecc
Pointer size: 131 Bytes
Size of remote file: 396 kB

assets/norm_stats/libero.json ADDED Viewed

	@@ -0,0 +1,280 @@

+{
+  "norm_stats": {
+    "state": {
+      "mean": [
+        -0.04617275670170784,
+        0.034034404903650284,
+        0.7647115588188171,
+        2.971421480178833,
+        -0.2198116034269333,
+        -0.1260652393102646,
+        0.02694438025355339,
+        -0.0272101741284132,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "std": [
+        0.1049584373831749,
+        0.15187117457389832,
+        0.3785041272640228,
+        0.3451951742172241,
+        0.910057544708252,
+        0.3253032863140106,
+        0.014151589013636112,
+        0.014038060791790485,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -0.4003246918797493,
+        -0.268838057410717,
+        0.03963126605004072,
+        1.5141939243793487,
+        -2.7199491125106814,
+        -1.0708919448852539,
+        0.0017206525699933989,
+        -0.04004273633235134,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.1335429027736188,
+        0.3378903574764729,
+        1.2657122139371932,
+        3.2784227243721484,
+        2.4147262454509733,
+        0.5962245464324951,
+        0.04029089962062426,
+        -0.001789628425752747,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ]
+    },
+    "actions": {
+      "mean": [
+        0.06667574495077133,
+        0.06483978033065796,
+        -0.80384361743927,
+        -2.970874071121216,
+        0.22662578523159027,
+        0.11959122866392136,
+        -0.036161474883556366,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "std": [
+        0.32812511920928955,
+        0.4197826683521271,
+        0.6153613924980164,
+        0.35168182849884033,
+        0.9132273197174072,
+        0.3432939946651459,
+        0.9993459582328796,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -0.7088336983919143,
+        -0.8786727856397629,
+        -2.097322083187103,
+        -3.3041505486488343,
+        -2.4138620029449465,
+        -0.6111064100980759,
+        -1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        1.0219826289415357,
+        1.0526966882944104,
+        0.7265835452556608,
+        -1.491220802116394,
+        2.7264903316497806,
+        1.1191907620668413,
+        0.9996,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ]
+    }
+  }
+}

assets/norm_stats/robotwin_50.json ADDED Viewed

	@@ -0,0 +1,229 @@

+{
+  "norm_stats": {
+    "action.arm.position": {
+      "mean": [
+        -0.22649447619915009,
+        1.0910465717315674,
+        0.8046976923942566,
+        -0.3529793620109558,
+        0.056382808834314346,
+        -0.04518803581595421,
+        0.23444592952728271,
+        1.1117788553237915,
+        0.8302268385887146,
+        -0.3584558367729187,
+        -0.010058438405394554,
+        0.010835078544914722
+      ],
+      "std": [
+        0.36951732635498047,
+        0.9946224689483643,
+        0.7907869219779968,
+        0.663685142993927,
+        0.24930860102176666,
+        0.5646992921829224,
+        0.32377511262893677,
+        1.0205038785934448,
+        0.8121177554130554,
+        0.7205489277839661,
+        0.25676125288009644,
+        0.6210611462593079
+      ],
+      "q01": [
+        -0.9676963651657111,
+        -0.0003164021181873977,
+        -0.0008187678098678652,
+        -1.5952941972732544,
+        -0.4444093635320664,
+        -2.2108209049224854,
+        -0.13648582720756508,
+        -0.0025135905981064077,
+        -0.0016476722434163094,
+        -1.7023667912483216,
+        -1.0292453282356262,
+        -1.6702169750213622
+      ],
+      "q99": [
+        0.17045696868896432,
+        2.5792064671580563,
+        2.4791862522006034,
+        1.263499072647095,
+        1.2283580561399456,
+        1.4622943069458012,
+        1.096450059175491,
+        2.605947977209091,
+        2.5039097490906714,
+        1.3104696589708325,
+        1.074876550579071,
+        2.104229341125489
+      ],
+      "q02": [
+        -0.9234203773498537,
+        -0.0003164021181873977,
+        -0.0008187678098678652,
+        -1.509812859249115,
+        -0.32799621334075924,
+        -1.656348336791992,
+        -0.05942733430862468,
+        -0.0025135905981064077,
+        -0.0016476722434163094,
+        -1.6187864029407502,
+        -0.8712951603889465,
+        -1.5470734649658198
+      ],
+      "q98": [
+        0.11836757125854458,
+        2.4944407171577216,
+        2.3239549394726753,
+        1.0776700769424439,
+        1.0128444806575776,
+        1.2158620544433596,
+        0.945415413093567,
+        2.5296102081775667,
+        2.3580759009346366,
+        1.2048114322423933,
+        0.6983346325874327,
+        1.7523907409667974
+      ]
+    },
+    "action.effector.position": {
+      "mean": [
+        0.6722026467323303,
+        0.6737783551216125
+      ],
+      "std": [
+        0.45274168252944946,
+        0.45141810178756714
+      ],
+      "q01": [
+        -1e-10,
+        -1e-10
+      ],
+      "q99": [
+        0.99980000009996,
+        0.99980000009996
+      ],
+      "q02": [
+        -1e-10,
+        -1e-10
+      ],
+      "q98": [
+        0.99980000009996,
+        0.99980000009996
+      ]
+    },
+    "observation.state.arm.position": {
+      "mean": [
+        -0.22545991837978363,
+        1.0864390134811401,
+        0.8012449741363525,
+        -0.3515830338001251,
+        0.05604754388332367,
+        -0.0445503294467926,
+        0.23296862840652466,
+        1.1059207916259766,
+        0.8258985280990601,
+        -0.3568105697631836,
+        -0.00992637686431408,
+        0.010328034870326519
+      ],
+      "std": [
+        0.3688313364982605,
+        0.9950565099716187,
+        0.7906551957130432,
+        0.6622100472450256,
+        0.24865445494651794,
+        0.5626452565193176,
+        0.32314980030059814,
+        1.0208053588867188,
+        0.8119285702705383,
+        0.718558132648468,
+        0.25572913885116577,
+        0.6181830763816833
+      ],
+      "q01": [
+        -0.9676963651657111,
+        -0.0003164021181873977,
+        -0.0008187678098678652,
+        -1.5938075653076171,
+        -0.44261839199066166,
+        -2.198074409103393,
+        -0.13494465734958627,
+        -0.0025135905981064077,
+        -0.0016476722434163094,
+        -1.7015782970190048,
+        -1.0292453282356262,
+        -1.6682623161315915
+      ],
+      "q99": [
+        0.17045696868896432,
+        2.5792064671580563,
+        2.4782622562915084,
+        1.2545792808532719,
+        1.2247761130571364,
+        1.458045475006104,
+        1.0856618701696394,
+        2.6036578441381453,
+        2.502444082275033,
+        1.3057386935949324,
+        1.0699406078338622,
+        2.0983653644561766
+      ],
+      "q02": [
+        -0.9234203773498537,
+        -0.0003164021181873977,
+        -0.0008187678098678652,
+        -1.5083262272834776,
+        -0.32799621334075924,
+        -1.6499750888824458,
+        -0.05942733430862468,
+        -0.0025135905981064077,
+        -0.0016476722434163094,
+        -1.6172094144821167,
+        -0.8684746216773986,
+        -1.5470734649658198
+      ],
+      "q98": [
+        0.11836757125854458,
+        2.4944407171577216,
+        2.320258955836296,
+        1.0754401289939883,
+        1.0116504996299742,
+        1.2137376384735115,
+        0.945415413093567,
+        2.528846830487251,
+        2.3551445673033595,
+        1.2016574553251265,
+        0.6969243632316591,
+        1.746526764297485
+      ]
+    },
+    "observation.state.effector.position": {
+      "mean": [
+        0.6734354496002197,
+        0.6749846339225769
+      ],
+      "std": [
+        0.4522727429866791,
+        0.45095184445381165
+      ],
+      "q01": [
+        -1e-10,
+        -1e-10
+      ],
+      "q99": [
+        0.99980000009996,
+        0.99980000009996
+      ],
+      "q02": [
+        -1e-10,
+        -1e-10
+      ],
+      "q98": [
+        0.99980000009996,
+        0.99980000009996
+      ]
+    }
+  },
+  "count": 532992
+}

assets/norm_stats/robotwin_5_customized.json ADDED Viewed

	@@ -0,0 +1,201 @@

+{
+  "norm_stats": {
+    "action": {
+      "mean": [
+        -0.32207754254341125,
+        1.406205654144287,
+        1.1087545156478882,
+        -0.6245313882827759,
+        -0.027720848098397255,
+        -0.035565875470638275,
+        0.4717631936073303,
+        0.25276312232017517,
+        0.8104884624481201,
+        0.5522242188453674,
+        -0.1358797252178192,
+        0.13210205733776093,
+        -0.13196010887622833,
+        0.7805091738700867
+      ],
+      "std": [
+        0.2855374813079834,
+        0.9229381084442139,
+        0.8118345737457275,
+        0.49564430117607117,
+        0.16244904696941376,
+        0.5517618656158447,
+        0.4883338212966919,
+        0.40702372789382935,
+        1.036325216293335,
+        0.7480976581573486,
+        0.7034134268760681,
+        0.3450477123260498,
+        0.7341580390930176,
+        0.4033139646053314
+      ],
+      "q01": [
+        -0.8213654638230801,
+        -5.257390398583084e-7,
+        -0.00002296771708643064,
+        -1.6557389229632915,
+        -0.6564541918039322,
+        -1.1997157670021057,
+        0.0,
+        -0.0013322193384173175,
+        0.0,
+        -0.0000281171942333458,
+        -1.4858032744407654,
+        -0.013652276556193832,
+        -1.5582030366897581,
+        0.0
+      ],
+      "q99": [
+        0.01988644998967637,
+        2.618066892673189,
+        2.8887816588023267,
+        -0.00009503023102874764,
+        0.39941834962368006,
+        1.3274614672660827,
+        0.9998,
+        1.2499000839233396,
+        2.403721238327026,
+        2.223998639903084,
+        1.3482957191944123,
+        1.2036741195514797,
+        2.3008846492767336,
+        0.9998
+      ],
+      "q02": [
+        -0.8116190195694566,
+        -5.257390398583084e-7,
+        -0.00002296771708643064,
+        -1.5653808554142714,
+        -0.5909986785650253,
+        -0.9318809885978698,
+        0.0,
+        -0.0013322193384173175,
+        0.0,
+        -0.0000281171942333458,
+        -1.400590261220932,
+        -0.005905654035508634,
+        -1.5582030366897581,
+        0.0
+      ],
+      "q98": [
+        0.01988644998967637,
+        2.509362170317786,
+        2.6153081541584893,
+        -0.00009503023102874764,
+        0.34549802929162987,
+        1.2313367155075077,
+        0.9998,
+        1.2416952819347378,
+        2.374588215923309,
+        2.1395174845976728,
+        1.328065291595459,
+        1.1956508319407702,
+        2.172924092388153,
+        0.9998
+      ]
+    },
+    "observation.state": {
+      "mean": [
+        -0.320831835269928,
+        1.401549220085144,
+        1.1045918464660645,
+        -0.6217827796936035,
+        -0.0279570072889328,
+        -0.03499468415975571,
+        0.4726906716823578,
+        0.2512069344520569,
+        0.8065828680992126,
+        0.5495453476905823,
+        -0.13533149659633636,
+        0.13129419088363647,
+        -0.1315813809633255,
+        0.7816013693809509
+      ],
+      "std": [
+        0.28554511070251465,
+        0.924691379070282,
+        0.8124904036521912,
+        0.49545007944107056,
+        0.16213101148605347,
+        0.5504377484321594,
+        0.4883865714073181,
+        0.40611740946769714,
+        1.035233497619629,
+        0.7470027208328247,
+        0.7013660073280334,
+        0.3439686894416809,
+        0.7313857674598694,
+        0.4025507867336273
+      ],
+      "q01": [
+        -0.8213654638230801,
+        -5.257390398583084e-7,
+        -0.00002296771708643064,
+        -1.6557389229632915,
+        -0.6564541918039322,
+        -1.1997157670021057,
+        0.0,
+        -0.0013322193384173175,
+        0.0,
+        -0.0000281171942333458,
+        -1.483351101398468,
+        -0.013652276556193832,
+        -1.5582030366897581,
+        0.0
+      ],
+      "q99": [
+        0.01988644998967637,
+        2.6186390227908487,
+        2.889423615385998,
+        -0.00009503023102874764,
+        0.39780878782272344,
+        1.3274614672660827,
+        0.9998,
+        1.2499000839233396,
+        2.404215018367767,
+        2.2201366442319794,
+        1.347682675933838,
+        1.2036741195514797,
+        2.3008846492767336,
+        0.9998
+      ],
+      "q02": [
+        -0.8116190195694566,
+        -5.257390398583084e-7,
+        -0.00002296771708643064,
+        -1.5653808554142714,
+        -0.5909986785650253,
+        -0.9318809885978698,
+        0.0,
+        -0.0013322193384173175,
+        0.0,
+        -0.0000281171942333458,
+        -1.3981380881786347,
+        -0.005905654035508634,
+        -1.5582030366897581,
+        0.0
+      ],
+      "q98": [
+        0.01988644998967637,
+        2.509362170317786,
+        2.61595011074216,
+        -0.00009503023102874764,
+        0.3452297689914703,
+        1.2313367155075077,
+        0.9998,
+        1.2416952819347378,
+        2.374588215923309,
+        2.1380692362210083,
+        1.328065291595459,
+        1.1956508319407702,
+        2.1450514958381657,
+        0.9998
+      ]
+    }
+  },
+  "count": 74240
+}

assets/norm_stats/robotwin_all_new.json ADDED Viewed

	@@ -0,0 +1,229 @@

+{
+  "norm_stats": {
+    "action.arm.position": {
+      "mean": [
+        -0.2260681688785553,
+        1.090435266494751,
+        0.8042582273483276,
+        -0.3527189791202545,
+        0.056556474417448044,
+        -0.04530515521764755,
+        0.2346765249967575,
+        1.112542748451233,
+        0.8304542303085327,
+        -0.357768177986145,
+        -0.01014612801373005,
+        0.010991317220032215
+      ],
+      "std": [
+        0.3691432774066925,
+        0.994762122631073,
+        0.7908730506896973,
+        0.6637247800827026,
+        0.24963052570819855,
+        0.5638052821159363,
+        0.32393988966941833,
+        1.0204970836639404,
+        0.8119731545448303,
+        0.7209287285804749,
+        0.25776439905166626,
+        0.6208906769752502
+      ],
+      "q01": [
+        -0.9676963651657111,
+        -0.0003164021181873977,
+        -0.0026667596280574857,
+        -1.596037513256073,
+        -0.4467973255872727,
+        -2.20232324104309,
+        -0.13648582720756508,
+        -0.0017502129077910933,
+        -0.0023805056512355804,
+        -1.703943779706955,
+        -1.0264247895240783,
+        -1.6682623161315915
+      ],
+      "q99": [
+        0.17045696868896432,
+        2.5760957974332737,
+        2.4727182808369395,
+        1.259782492733002,
+        1.2253731035709379,
+        1.4495478111267097,
+        1.0841207003116606,
+        2.6036578441381453,
+        2.4987799152359367,
+        1.3104696589708325,
+        1.0692354731559752,
+        2.104229341125489
+      ],
+      "q02": [
+        -0.9260248472213748,
+        -0.0003164021181873977,
+        -0.0026667596280574857,
+        -1.5090695432662964,
+        -0.3291901943683624,
+        -1.6520995048522948,
+        -0.05942733430862468,
+        -0.0017502129077910933,
+        -0.0023805056512355804,
+        -1.6187864029407502,
+        -0.8741156991004944,
+        -1.5490281238555905
+      ],
+      "q98": [
+        0.1157631013870235,
+        2.4936630497265257,
+        2.3193349599272013,
+        1.0769267609596254,
+        1.0140384616851805,
+        1.2073643905639653,
+        0.9469565829515458,
+        2.528083452796936,
+        2.3551445673033595,
+        1.2071769149303435,
+        0.6969243632316591,
+        1.7504360820770266
+      ]
+    },
+    "action.effector.position": {
+      "mean": [
+        0.6723259687423706,
+        0.6735112071037292
+      ],
+      "std": [
+        0.4526418447494507,
+        0.4514695405960083
+      ],
+      "q01": [
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.9998,
+        0.9998
+      ],
+      "q02": [
+        0.0,
+        0.0
+      ],
+      "q98": [
+        0.9998,
+        0.9998
+      ]
+    },
+    "observation.state.arm.position": {
+      "mean": [
+        -0.22502799332141876,
+        1.0857956409454346,
+        0.8007810711860657,
+        -0.3513113558292389,
+        0.05622035637497902,
+        -0.044659487903118134,
+        0.23319771885871887,
+        1.106688141822815,
+        0.82613205909729,
+        -0.3561287522315979,
+        -0.010010534897446632,
+        0.010481182485818863
+      ],
+      "std": [
+        0.3684558570384979,
+        0.9951919317245483,
+        0.7907320857048035,
+        0.6622379422187805,
+        0.24897389113903046,
+        0.5617504119873047,
+        0.32331398129463196,
+        1.0208075046539307,
+        0.8117841482162476,
+        0.718940019607544,
+        0.25672635436058044,
+        0.6180205345153809
+      ],
+      "q01": [
+        -0.9676963651657111,
+        -0.0003164021181873977,
+        -0.0026667596280574857,
+        -1.5938075653076171,
+        -0.4462003350734711,
+        -2.195949993133545,
+        -0.13648582720756508,
+        -0.0017502129077910933,
+        -0.0023805056512355804,
+        -1.703943779706955,
+        -1.0257196548461915,
+        -1.6663076572418207
+      ],
+      "q99": [
+        0.16785249881744324,
+        2.5760957974332737,
+        2.47087028901875,
+        1.2516060169219974,
+        1.22238815100193,
+        1.4495478111267097,
+        1.073332511305809,
+        2.602131088757515,
+        2.494382914789021,
+        1.3104696589708325,
+        1.0657097997665406,
+        2.102274682235718
+      ],
+      "q02": [
+        -0.9234203773498537,
+        -0.0003164021181873977,
+        -0.0026667596280574857,
+        -1.5060962793350219,
+        -0.3291901943683624,
+        -1.6436018409728996,
+        -0.05788616445064587,
+        -0.0017502129077910933,
+        -0.0023805056512355804,
+        -1.6164209202528,
+        -0.8698848910331727,
+        -1.5490281238555905
+      ],
+      "q98": [
+        0.1157631013870235,
+        2.4928853822953303,
+        2.3174869681090113,
+        1.0754401289939883,
+        1.0122474901437757,
+        1.2031155586242681,
+        0.945415413093567,
+        2.527320075106621,
+        2.3522132336720825,
+        1.202445949554443,
+        0.694808959197998,
+        1.7484814231872559
+      ]
+    },
+    "observation.state.effector.position": {
+      "mean": [
+        0.6735715866088867,
+        0.6747165322303772
+      ],
+      "std": [
+        0.4521658420562744,
+        0.4510030150413513
+      ],
+      "q01": [
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.9998,
+        0.9998
+      ],
+      "q02": [
+        0.0,
+        0.0
+      ],
+      "q98": [
+        0.9998,
+        0.9998
+      ]
+    }
+  },
+  "count": 535680
+}

assets/scale_ps.png ADDED Viewed

Git LFS Details

SHA256: b23143996c78b30f658b9a81e0d46c96c2231d9dd2646775b0c057773a1fce14
Pointer size: 131 Bytes
Size of remote file: 481 kB

assets/scale_sr.png ADDED Viewed

Git LFS Details

SHA256: 3becc2bb6d5355f672dc110a4578277c3eac1cf53f3cba726e5e6277b8d9c413
Pointer size: 131 Bytes
Size of remote file: 466 kB

configs/norm/robotwin_5.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+model:
+  model_path: /path/to/LingBot-VLA-Depth
+  tokenizer_path: /path/to/Qwen2.5-VL-3B-Instruct/
+data:
+  datasets_type: vla
+  train_path: /path/to/mixed_robotwin_5tasks
+  norm_path: assets/norm_stats/robotwin_5_custom.json
+train:
+  global_batch_size: 512
+  output_dir: output/norm

configs/vla/robotwin_load20000h.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+model:
+  model_path: /path/to/LingBot-VLA
+  tokenizer_path: /path/to/Qwen2.5-VL-3B-Instruct/
+  post_training: true
+  adanorm_time: true
+  old_adanorm: true
+data:
+  datasets_type: vla
+  data_name: robotwin_5_new
+  train_path: /path/to/mixed_robotwin_5tasks
+  num_workers: 8
+  norm_type: bounds_99_woclip
+  norm_stats_file: assets/norm_stats/robotwin_50.json
+train:
+  output_dir: /path/to/lingbot_robotwin5tasks/
+  loss_type: L1_fm
+  data_parallel_mode: fsdp2
+  enable_full_shard: false
+  module_fsdp_enable: true
+  use_compile: true
+  use_wandb: false
+  rmpad: false
+  rmpad_with_pos_ids: false
+  ulysses_parallel_size: 1
+  freeze_vision_encoder: false
+  tokenizer_max_length: 24
+  action_dim: 14
+  max_action_dim: 75
+  max_state_dim: 75
+  lr: 1.0e-4
+  lr_decay_style: constant
+  num_train_epochs: 69
+  micro_batch_size: 32
+  global_batch_size: 256
+  max_steps: 220000
+  ckpt_manager: dcp
+  save_steps: 220000
+  save_epochs: 69
+  enable_fp32: true
+  enable_resume: true

configs/vla/robotwin_load20000h_depth.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+model:
+  model_path: /path/to/LingBot-VLA-Depth
+  tokenizer_path: /path/to/Qwen2.5-VL-3B-Instruct/
+  post_training: true
+  adanorm_time: true
+  old_adanorm: true
+  moge_path: /path/to/moge2-vitb-normal
+  morgbd_path: /path/to/LingBot-Depth-Pretrained
+data:
+  datasets_type: vla
+  data_name: robotwin_5_new
+  train_path: /path/to/mixed_robotwin_5tasks
+  num_workers: 8
+  norm_type: bounds_99_woclip
+  norm_stats_file: assets/norm_stats/robotwin_50.json
+train:
+  output_dir: /path/to/lingbot_depth_robotwin5tasks/
+  loss_type: L1_fm
+  data_parallel_mode: fsdp2
+  enable_full_shard: false
+  module_fsdp_enable: true
+  use_compile: true
+  use_wandb: false
+  rmpad: false
+  rmpad_with_pos_ids: false
+  ulysses_parallel_size: 1
+  freeze_vision_encoder: false
+  tokenizer_max_length: 24
+  action_dim: 14
+  max_action_dim: 75
+  max_state_dim: 75
+  lr: 1.0e-4
+  lr_decay_style: constant
+  num_train_epochs: 69
+  micro_batch_size: 32
+  global_batch_size: 256
+  max_steps: 220000
+  ckpt_manager: dcp
+  save_steps: 220000
+  save_epochs: 69
+  enable_fp32: true
+  enable_resume: true
+  align_params:
+    mode: 'query'
+    num_task_tokens: 8
+    use_image_tokens: True
+    use_task_tokens: False
+    use_text_tokens: False
+    use_contrastive: True
+    contrastive_loss_weight: 0.3
+    depth_loss_weight: 0.004
+    llm:
+      dim_out: 2048
+      image_token_size: 8
+      image_input_size: 224
+    depth:
+      model_type: MoRGBD
+      num_layers: 1
+      num_heads: 4
+      dim_head: 32
+      ff_mult: 1
+      num_backbone_tokens: 256
+      token_size: 16
+      dim_out: 1024
+      input_size: 224
+    visual_steps: 10000

deploy/__init__.py ADDED Viewed

File without changes

deploy/image_tools.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import numpy as np
+from PIL import Image
+def convert_to_uint8(img: np.ndarray) -> np.ndarray:
+    """Converts an image to uint8 if it is a float image.
+    This is important for reducing the size of the image when sending it over the network.
+    """
+    if np.issubdtype(img.dtype, np.floating):
+        img = (255 * img).astype(np.uint8)
+    return img
+def resize_with_pad(images: np.ndarray, height: int, width: int, method=Image.BILINEAR) -> np.ndarray:
+    """Replicates tf.image.resize_with_pad for multiple images using PIL. Resizes a batch of images to a target height.
+    Args:
+        images: A batch of images in [..., height, width, channel] format.
+        height: The target height of the image.
+        width: The target width of the image.
+        method: The interpolation method to use. Default is bilinear.
+    Returns:
+        The resized images in [..., height, width, channel].
+    """
+    # If the images are already the correct size, return them as is.
+    if images.shape[-3:-1] == (height, width):
+        return images
+    original_shape = images.shape
+    images = images.reshape(-1, *original_shape[-3:])
+    resized = np.stack([_resize_with_pad_pil(Image.fromarray(im), height, width, method=method) for im in images])
+    return resized.reshape(*original_shape[:-3], *resized.shape[-3:])
+def _resize_with_pad_pil(image: Image.Image, height: int, width: int, method: int) -> Image.Image:
+    """Replicates tf.image.resize_with_pad for one image using PIL. Resizes an image to a target height and
+    width without distortion by padding with zeros.
+    Unlike the jax version, note that PIL uses [width, height, channel] ordering instead of [batch, h, w, c].
+    """
+    cur_width, cur_height = image.size
+    if cur_width == width and cur_height == height:
+        return image  # No need to resize if the image is already the correct size.
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+    resized_image = image.resize((resized_width, resized_height), resample=method)
+    zero_image = Image.new(resized_image.mode, (width, height), 0)
+    pad_height = max(0, int((height - resized_height) / 2))
+    pad_width = max(0, int((width - resized_width) / 2))
+    zero_image.paste(resized_image, (pad_width, pad_height))
+    assert zero_image.size == (width, height)
+    return zero_image

deploy/lingbot_robotwin_policy.py ADDED Viewed

	@@ -0,0 +1,506 @@

+import json
+import os
+import time
+import random
+import numpy as np
+from collections import deque
+import torchvision
+import yaml
+from types import SimpleNamespace
+from packaging.version import Version
+from typing import Callable, Dict, List, Optional, Type, Union, Tuple, Any, Sequence
+from glob import glob
+from tqdm import tqdm
+from safetensors import safe_open
+from safetensors.torch import load_file
+from pathlib import Path
+from PIL import Image
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+import transformers
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers import (
+    AutoConfig,
+    PretrainedConfig,
+    PreTrainedModel,
+    AutoProcessor,
+)
+from lerobot.configs.policies import PreTrainedConfig
+from lingbotvla.models.vla.pi0.modeling_pi0 import PI0Policy
+from lingbotvla.models.vla.pi0.modeling_lingbot_vla import LingbotVlaPolicy
+from lingbotvla.data.vla_data.transform import Normalizer, prepare_images, prepare_language, prepare_state
+from lingbotvla.models import build_processor
+def set_seed_everywhere(seed: int):
+    """Sets the random seed for Python, NumPy, and PyTorch functions."""
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    os.environ["PYTHONHASHSEED"] = str(seed)
+# Seed every RNG once, as a side effect of importing this module.
+set_seed_everywhere(42)
+# Base-model directory per backbone name. Each entry is read from an environment variable
+# (PALIGEMMA_PATH / QWEN25_PATH) and falls back to a path relative to the working directory.
+BASE_MODEL_PATH = {
+    'pi0': os.environ.get('PALIGEMMA_PATH', './paligemma-3b-pt-224/'),
+    'lingbotvla': os.environ.get('QWEN25_PATH', './Qwen2.5-VL-3B-Instruct/'),
+}
+def load_model_weights(policy, path_to_pi_model, strict=True):
+    all_safetensors = glob(os.path.join(path_to_pi_model, "*.safetensors"))
+    merged_weights = {}
+    for file_path in tqdm(all_safetensors):
+        with safe_open(file_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                merged_weights[key] = f.get_tensor(key)
+    policy.load_state_dict(merged_weights, strict=strict)
+def center_crop_image(image: Union[np.ndarray, Image.Image]) -> Image.Image:
+    crop_scale = 0.9
+    side_scale = float(np.sqrt(np.clip(crop_scale, 0.0, 1.0)))  # side length scale
+    out_size = (224, 224)
+    # Convert input to PIL Image
+    if isinstance(image, np.ndarray):
+        arr = image
+        if arr.dtype.kind == "f":
+            # If floats likely in [0,1], map to [0,255]
+            if arr.max() <= 1.0 and arr.min() >= 0.0:
+                arr = (np.clip(arr, 0.0, 1.0) * 255.0).astype(np.uint8)
+            else:
+                arr = np.clip(arr, 0.0, 255.0).astype(np.uint8)
+        elif arr.dtype == np.uint16:
+            # Map 16-bit to 8-bit
+            arr = (arr / 257).astype(np.uint8)
+        elif arr.dtype != np.uint8:
+            arr = arr.astype(np.uint8)
+        pil = Image.fromarray(arr)
+    elif isinstance(image, Image.Image):
+        pil = image
+    else:
+        raise TypeError("image must be a numpy array or PIL.Image.Image")
+    # Force RGB for consistent output
+    pil = pil.convert("RGB")
+    W, H = pil.size
+    # Compute centered crop box (integer pixels)
+    crop_w = max(1, int(round(W * side_scale)))
+    crop_h = max(1, int(round(H * side_scale)))
+    left = (W - crop_w) // 2
+    top = (H - crop_h) // 2
+    right = left + crop_w
+    bottom = top + crop_h
+    cropped = pil.crop((left, top, right, bottom))
+    resized = cropped.resize(out_size, resample=Image.BILINEAR)
+    return resized
+def resize_with_pad(img, width, height, pad_value=-1):
+    # assume no-op when width height fits already
+    if img.ndim != 4:
+        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
+    # channel last to channel first if necessary
+    if img.shape[1] not in (1, 3) and img.shape[-1] in (1, 3):
+        img = img.permute(0, 3, 1, 2)
+    cur_height, cur_width = img.shape[2:]
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+    resized_img = F.interpolate(
+        img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+    )
+    pad_height = max(0, int(height - resized_height))
+    pad_width = max(0, int(width - resized_width))
+    # pad on left and top of image
+    padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
+    return padded_img
+class PolicyPreprocessMixin:
+    @torch.no_grad
+    def select_action(
+        self, observation: dict[str, Tensor], use_bf16: bool = False, vlm_causal: bool = False, noise: Tensor | None = None
+    ):
+        self.eval()
+        device = 'cuda'
+        if use_bf16:
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float32
+        s1 = time.time()
+        if len(observation['images'].shape) == 4:
+            observation['images'] = observation['images'].unsqueeze(0)
+            observation['img_masks'] = observation['img_masks'].unsqueeze(0)
+        if 'expert_imgs' in observation:
+            actions = self.model.sample_actions(
+                observation['images'].to(dtype=dtype, device=device),
+                observation['img_masks'].to(device=device),
+                observation['lang_tokens'].unsqueeze(0).to(device=device),
+                observation['lang_masks'].unsqueeze(0).to(device=device),
+                observation['state'].unsqueeze(0).to(dtype=dtype, device=device),
+                observation['expert_imgs'].to(dtype=dtype, device=device),
+                vlm_causal = vlm_causal
+            )
+        else:
+            actions = self.model.sample_actions(
+                observation['images'].to(dtype=dtype, device=device),
+                observation['img_masks'].to(device=device),
+                observation['lang_tokens'].unsqueeze(0).to(device=device),
+                observation['lang_masks'].unsqueeze(0).to(device=device),
+                observation['state'].unsqueeze(0).to(dtype=dtype, device=device),
+                vlm_causal = vlm_causal
+            )
+        delta_time = time.time() - s1
+        print(f'sample_actions cost {delta_time} s')
+        observation['action'] = actions.squeeze(0)[:, :14].to(dtype=torch.float32, device='cpu')
+        if use_bf16:
+            observation['state'] = observation['state'].to(dtype=torch.float32)
+        data = self.normalizer.unnormalize(observation)
+        return data
+class LingBotVlaInferencePolicy(PolicyPreprocessMixin, LingbotVlaPolicy):
+    # LingbotVlaPolicy combined with PolicyPreprocessMixin; the mixin is listed first, so its
+    # select_action is the one found first in the method resolution order.
+    pass # Only combine necessary functions
+class PI0InfernecePolicy(PolicyPreprocessMixin, PI0Policy):
+    # PI0Policy combined with PolicyPreprocessMixin; the mixin is listed first, so its
+    # select_action is the one found first in the method resolution order.
+    pass # Only combine necessary functions
+def merge_qwen_config(policy_config, qwen_config):
+    if hasattr(qwen_config, 'to_dict'):
+        config_dict = qwen_config.to_dict()
+    else:
+        config_dict = qwen_config
+    text_keys = {
+        "hidden_size",
+        "intermediate_size",
+        "num_hidden_layers",
+        "num_attention_heads",
+        "num_key_value_heads",
+        "rms_norm_eps",
+        "rope_theta",
+        "vocab_size",
+        "max_position_embeddings",
+        "hidden_act",
+        "tie_word_embeddings",
+        "tokenizer_path",
+    }
+    for key in text_keys:
+        if key in config_dict:
+            setattr(policy_config, key, config_dict[key])
+            print(f"✅ Merged LLM: {key} = {config_dict[key]}")
+    if "vision_config" in config_dict:
+        policy_config.vision_config = qwen_config.vision_config
+    else:
+        print("⚠️ Warning: 'vision_config' not found in qwen_config!")
+    return policy_config
+class QwenPiServer:
+    '''
+    policy wrapper to support action ensemble or chunk execution
+    '''
+    def __init__(
+        self,
+        path_to_pi_model="",
+        adaptive_ensemble_alpha=0.1,
+        action_ensemble_horizon=8,
+        use_length=1, # to control the execution length of the action chunk, -1 denotes using action ensemble
+        chunk_ret=False,
+        use_bf16=True,
+        use_fp32=False,
+    ) -> None:
+        assert not (use_bf16 and use_fp32), 'Bfloat16 or Float32!!!'
+        self.adaptive_ensemble_alpha = adaptive_ensemble_alpha
+        self.use_length = use_length
+        self.chunk_ret = chunk_ret
+        self.task_description = None
+        self.vla = self.load_vla(path_to_pi_model)
+        self.vla = self.vla.cuda().eval()
+        if use_bf16:
+            self.vla = self.vla.to(torch.bfloat16)
+        elif use_fp32:
+            self.vla.model.float()
+        self.global_step = 0
+        self.last_action_chunk = None
+        self.use_bf16 = use_bf16
+        self.use_fp32 = use_fp32
+    def load_vla(self, path_to_pi_model) -> LingbotVlaPolicy:
+        # load model
+        print(f"loading model from: {path_to_pi_model}")
+        config = PreTrainedConfig.from_pretrained(path_to_pi_model)
+        # load training config
+        training_config_path = Path(path_to_pi_model).parent.parent.parent/'lingbotvla_cli.yaml'
+        with open(training_config_path, 'r') as f:
+            training_config = yaml.safe_load(f)
+        f.close()
+        # update model config according to training config
+        training_model_config = training_config['model']
+        training_model_config.update(training_config['train'])
+        for k, v in training_model_config.items():
+            v = getattr(config, k, training_model_config[k])
+            setattr(config, k, v)
+        # Set attention_implementation to 'eager' to speed up evaluation.
+        config.attention_implementation = 'eager'
+        # set base model according to training config
+        training_base_model = training_config['model']['tokenizer_path']
+        if 'paligemma' in training_base_model:
+            model_name = 'pi0'
+            config.vocab_size = 257152 # set vocab size for paligamma
+        elif 'qwen2' in training_base_model.lower():
+            model_name = 'lingbotvla'
+        else:
+            raise ValueError(f"Unsupported base model of {path_to_pi_model}")
+        base_model_path = BASE_MODEL_PATH[model_name]
+        config.tokenizer_path = base_model_path
+        self.model_name = model_name
+        qwen_config = AutoConfig.from_pretrained(base_model_path)
+        config = merge_qwen_config(config, qwen_config)
+        if 'vocab_size' in training_config['model'] and training_config['model']['vocab_size'] != 0:
+            config.vocab_size = training_config['model']['vocab_size']
+        # load processors
+        self.processor = build_processor(base_model_path)
+        self.language_tokenizer = self.processor.tokenizer
+        self.image_processor = self.processor.image_processor
+        data_config = SimpleNamespace(**training_config['data'])
+        print('Initializing model ... ')
+        if 'paligemma' in training_base_model:
+            policy = PI0InfernecePolicy(config, tokenizer_path=base_model_path)
+        else:
+            policy = LingBotVlaInferencePolicy(config, tokenizer_path=base_model_path)
+        load_model_weights(policy, path_to_pi_model, strict=True)
+        policy.feature_transform = None
+        self.data_config = data_config
+        self.config = config
+        self.joint_max_dim = training_config['train']['max_action_dim']
+        self.action_dim = training_config['train']['action_dim']
+        self.chunk_size = training_config['train']['chunk_size']
+        policy.action_dim = self.action_dim
+        policy.chunk_size = self.chunk_size
+        self.norm_stats_file = data_config.norm_stats_file
+        if 'align_params' in training_config['train']:
+            self.use_depth_align = True
+        else: self.use_depth_align = False
+        with open(self.norm_stats_file) as f:
+            self.norm_stats = json.load(f)
+        policy.normalizer = Normalizer(
+            norm_stats=self.norm_stats['norm_stats'],
+            from_file=True,
+            data_type='robotwin',
+            norm_type={
+                "observation.images.cam_high": "identity",
+                "observation.images.cam_left_wrist": "identity",
+                "observation.images.cam_right_wrist": "identity",
+                "observation.state": self.data_config.norm_type,
+                "action": self.data_config.norm_type,
+            },
+        )
+        print('Model initialized ... ')
+        return policy
+    def reset(self, robo_name, path_to_pi_model = None) -> None:
+        if path_to_pi_model is not None:
+            self.vla = self.load_vla(path_to_pi_model)
+            self.vla = self.vla.cuda().eval()
+            if self.use_bf16:
+                self.vla = self.vla.to(torch.bfloat16)
+            elif self.use_fp32:
+                self.vla.model.float()
+        self.global_step = 0
+        self.last_action_chunk = None
+        if getattr(self.data_config, 'norm_type', None) is None:
+            self.data_config.norm_type = 'meanstd'
+        if getattr(self.config, 'vlm_causal', None) is None:
+            self.config.vlm_causal = False
+        if getattr(self.config, 'qwenvl_bos', None) is None:
+            self.config.qwenvl_bos = False
+        # if update ckpt path
+        if path_to_pi_model is not None:
+            all_safetensors = glob(os.path.join(path_to_pi_model, "*.safetensors"))
+            merged_weights = {}
+            for file_path in tqdm(all_safetensors):
+                with safe_open(file_path, framework="pt", device="cpu") as f:
+                    for key in f.keys():
+                        merged_weights[key] = f.get_tensor(key)
+            self.vla.load_state_dict(merged_weights, strict=True)
+    def resize_image(self, observation):
+        for image_feature in ['observation.images.cam_high', 'observation.images.cam_left_wrist', 'observation.images.cam_right_wrist']:
+            assert image_feature in observation
+            assert len(observation[image_feature].shape)==3 and observation[image_feature].shape[-1] == 3
+            image = observation[image_feature]
+            img_pil = Image.fromarray(image)
+            image_size = getattr(self.data_config, 'img_size', 224)
+            img_pil = img_pil.resize((image_size, image_size), Image.BILINEAR)
+            # img_resized shape: C*H*W
+            img_resized = np.transpose(np.array(img_pil), (2,0,1))  # (3,224,224)
+            observation[image_feature] = img_resized / 255.
+    def infer(self, observation, center_crop=True):
+        """Generates an action with the VLA policy."""
+        # (If trained with image augmentations) Center crop image and then resize back up to original size.
+        # IMPORTANT: Let's say crop scale == 0.9. To get the new height and width (post-crop), multiply
+        #            the original height and width by sqrt(0.9) -- not 0.9!
+        if 'reset' in observation and observation['reset']:
+            self.reset(robo_name=observation['robo_name'], path_to_pi_model=observation['path_to_pi_model'] if 'path_to_pi_model' in observation else None)
+            return dict(action = None)
+        self.resize_image(observation)
+        for k, v in observation.items():
+            if isinstance(v, np.ndarray):
+                observation[k] = torch.from_numpy(v)
+        if self.use_length == -1 or self.global_step % self.use_length == 0:
+            joint_max_dim = getattr(self, 'joint_max_dim')
+            action_dim = getattr(self, 'action_dim')
+            chunk_size = getattr(self, 'chunk_size')
+            normalized_observation = self.vla.normalizer.normalize(observation)
+            base_image = (normalized_observation["observation.images.cam_high"] * 255).to(torch.uint8)
+            left_wrist_image = (normalized_observation["observation.images.cam_left_wrist"] * 255).to(
+                torch.uint8
+            )
+            right_wrist_image = (normalized_observation["observation.images.cam_right_wrist"] * 255).to(
+                torch.uint8
+            )
+            obs_dict =  {
+                "image": {"base_0_rgb": base_image, "left_wrist_0_rgb": left_wrist_image, "right_wrist_0_rgb": right_wrist_image},
+                "state": normalized_observation["observation.state"].to(torch.float32),
+                "prompt": [observation["task"]],
+            }
+            state = prepare_state(self.config, obs_dict)
+            lang_tokens, lang_masks = prepare_language(self.config, self.language_tokenizer, obs_dict)
+            images, img_masks, _ = prepare_images(self.config, self.image_processor, obs_dict)
+            observation = {
+                'images': images,
+                'img_masks': img_masks,
+                'state': state,
+                'lang_tokens': lang_tokens,
+                'lang_masks': lang_masks,
+            }
+            if self.use_bf16:
+                observation['state'] = observation['state'].to(torch.bfloat16)
+        org_actions = ['action']
+        assert len(org_actions)==1, "Only support single action feature"
+        if self.chunk_ret:
+            action = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)[org_actions[0]].float().cpu().numpy()
+            action = action[:self.use_length, :self.action_dim]
+        else:
+            if self.use_length == -1 or self.global_step % self.use_length == 0:
+                action = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)[org_actions[0]]
+                self.last_action_chunk = action.float().cpu().numpy()
+            if self.use_length > 0:
+                action = self.last_action_chunk[self.global_step % self.use_length]
+            action = action[:, :self.action_dim]
+            print(f"on server step: {self.global_step}")
+            self.global_step+=1
+        return dict(action = action)
+import argparse
+from .websocket_policy_server import WebsocketPolicyServer
+def main():
+    parser = argparse.ArgumentParser(description="启动 QwenPi WebSocket 策略服务器")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+    )
+    parser.add_argument(
+        "--use_length",
+        type=int,
+        default=50,
+        help="used length of action chunk"
+    )
+    parser.add_argument(
+        "--chunk_ret",
+        type=bool,
+        default=True,
+        help=" True: The returned action tensor includes the horizon dimension. This allows the model to output a sequence of actions for each horizon step. False: The horizon dimension is omitted. The model selects and returns the next step autonomously based on its policy."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8006,
+        help="port of WebSocket"
+    )
+    parser.add_argument(
+        "--debug_infer_once",
+        action="store_true",
+        help="Run one infer with dummy observation then exit (for debugging infer() without WebSocket client)",
+    )
+    args = parser.parse_args()
+    model = QwenPiServer(args.model_path, use_length=args.use_length, chunk_ret=args.chunk_ret)
+    if args.debug_infer_once:
+        # 调试用：不启动 WebSocket，只跑一次 infer，可在 infer / select_action 里下断点
+        dummy_obs = {
+            "observation.images.cam_high": np.zeros((224, 224, 3), dtype=np.uint8),
+            "observation.images.cam_left_wrist": np.zeros((224, 224, 3), dtype=np.uint8),
+            "observation.images.cam_right_wrist": np.zeros((224, 224, 3), dtype=np.uint8),
+            "observation.state": np.zeros(model.action_dim, dtype=np.float32),
+            "task": "dummy task for debug",
+            "reset": False,
+        }
+        out = model.infer(dummy_obs)
+        print("debug_infer_once result keys:", out.keys())
+        return
+    model_server = WebsocketPolicyServer(model, port=args.port)
+    model_server.serve_forever()
+if __name__ == "__main__":
+    main()

deploy/lingbot_robotwin_policy_rep.py ADDED Viewed

	@@ -0,0 +1,491 @@

+import json
+import os
+import time
+import random
+import numpy as np
+from collections import deque
+import torchvision
+import yaml
+from types import SimpleNamespace
+from packaging.version import Version
+from typing import Callable, Dict, List, Optional, Type, Union, Tuple, Any, Sequence
+from glob import glob
+from tqdm import tqdm
+from safetensors import safe_open
+from safetensors.torch import load_file
+from pathlib import Path
+from PIL import Image
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+import transformers
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers import (
+    AutoConfig,
+    PretrainedConfig,
+    PreTrainedModel,
+    AutoProcessor,
+)
+from lerobot.configs.policies import PreTrainedConfig
+from lingbotvla.models.vla.pi0.modeling_pi0 import PI0Policy
+from lingbotvla.models.vla.pi0.modeling_lingbot_vla import LingbotVlaPolicy
+from lingbotvla.data.vla_data.transform import Normalizer, prepare_images, prepare_language, prepare_state
+from lingbotvla.models import build_processor
+def set_seed_everywhere(seed: int):
+    """Sets the random seed for Python, NumPy, and PyTorch functions."""
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    os.environ["PYTHONHASHSEED"] = str(seed)
+# Seed every RNG once, as a side effect of importing this module.
+set_seed_everywhere(42)
+# Base-model directory per backbone name. Each entry is read from an environment variable
+# (PALIGEMMA_PATH / QWEN25_PATH) and falls back to a path relative to the working directory.
+BASE_MODEL_PATH = {
+    'pi0': os.environ.get('PALIGEMMA_PATH', './paligemma-3b-pt-224/'),
+    'lingbotvla': os.environ.get('QWEN25_PATH', './Qwen2.5-VL-3B-Instruct/'),
+}
+def load_model_weights(policy, path_to_pi_model, strict=True):
+    all_safetensors = glob(os.path.join(path_to_pi_model, "*.safetensors"))
+    merged_weights = {}
+    for file_path in tqdm(all_safetensors):
+        with safe_open(file_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                merged_weights[key] = f.get_tensor(key)
+    policy.load_state_dict(merged_weights, strict=strict)
+def center_crop_image(image: Union[np.ndarray, Image.Image]) -> Image.Image:
+    crop_scale = 0.9
+    side_scale = float(np.sqrt(np.clip(crop_scale, 0.0, 1.0)))  # side length scale
+    out_size = (224, 224)
+    # Convert input to PIL Image
+    if isinstance(image, np.ndarray):
+        arr = image
+        if arr.dtype.kind == "f":
+            # If floats likely in [0,1], map to [0,255]
+            if arr.max() <= 1.0 and arr.min() >= 0.0:
+                arr = (np.clip(arr, 0.0, 1.0) * 255.0).astype(np.uint8)
+            else:
+                arr = np.clip(arr, 0.0, 255.0).astype(np.uint8)
+        elif arr.dtype == np.uint16:
+            # Map 16-bit to 8-bit
+            arr = (arr / 257).astype(np.uint8)
+        elif arr.dtype != np.uint8:
+            arr = arr.astype(np.uint8)
+        pil = Image.fromarray(arr)
+    elif isinstance(image, Image.Image):
+        pil = image
+    else:
+        raise TypeError("image must be a numpy array or PIL.Image.Image")
+    # Force RGB for consistent output
+    pil = pil.convert("RGB")
+    W, H = pil.size
+    # Compute centered crop box (integer pixels)
+    crop_w = max(1, int(round(W * side_scale)))
+    crop_h = max(1, int(round(H * side_scale)))
+    left = (W - crop_w) // 2
+    top = (H - crop_h) // 2
+    right = left + crop_w
+    bottom = top + crop_h
+    cropped = pil.crop((left, top, right, bottom))
+    resized = cropped.resize(out_size, resample=Image.BILINEAR)
+    return resized
+def resize_with_pad(img, width, height, pad_value=-1):
+    # assume no-op when width height fits already
+    if img.ndim != 4:
+        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
+    # channel last to channel first if necessary
+    if img.shape[1] not in (1, 3) and img.shape[-1] in (1, 3):
+        img = img.permute(0, 3, 1, 2)
+    cur_height, cur_width = img.shape[2:]
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+    resized_img = F.interpolate(
+        img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+    )
+    pad_height = max(0, int(height - resized_height))
+    pad_width = max(0, int(width - resized_width))
+    # pad on left and top of image
+    padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
+    return padded_img
class PolicyPreprocessMixin:
    """Mixin that adds a single-observation ``select_action`` entry point.

    The host class must provide ``eval()``, ``self.model.sample_actions`` and
    ``self.normalizer.unnormalize``; all three are used below.
    """

    @torch.no_grad
    def select_action(
        self, observation: dict[str, Tensor], use_bf16: bool = False, vlm_causal: bool = False, noise: Tensor | None = None
    ):
        """Sample an action chunk for one observation and un-normalize it.

        ``observation`` is modified in place: ``images``/``img_masks`` may gain
        a leading dimension, ``state`` is re-indexed, and an ``action`` entry
        is added before the dict is passed to the normalizer.

        Args:
            observation: Dict with ``images``, ``img_masks``, ``lang_tokens``,
                ``lang_masks`` and ``state``; ``expert_imgs`` is optional.
                ``state`` is indexed up to position 74, so it needs at least
                75 entries along its first dimension.
            use_bf16: Run the model inputs in bfloat16 instead of float32.
            vlm_causal: Forwarded unchanged to ``sample_actions``.
            noise: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``self.normalizer.unnormalize(observation)`` returns.
        """
        self.eval()
        device = 'cuda'
        dtype = torch.bfloat16 if use_bf16 else torch.float32
        started = time.time()

        # A 4-D image tensor gets a leading dimension, and so do its masks.
        if observation['images'].ndim == 4:
            observation['images'] = observation['images'].unsqueeze(0)
            observation['img_masks'] = observation['img_masks'].unsqueeze(0)

        # Reorder the state vector: entries 73-74 are moved in right after the first 12.
        state_indices = [*range(12), *range(73, 75), *range(12, 14), *range(14, 73)]
        observation['state'] = observation['state'][state_indices]

        model_inputs = [
            observation['images'].to(dtype=dtype, device=device),
            observation['img_masks'].to(device=device),
            observation['lang_tokens'].unsqueeze(0).to(device=device),
            observation['lang_masks'].unsqueeze(0).to(device=device),
            observation['state'].unsqueeze(0).to(dtype=dtype, device=device),
        ]
        # The expert images are an optional sixth positional input.
        if 'expert_imgs' in observation:
            model_inputs.append(observation['expert_imgs'].to(dtype=dtype, device=device))
        actions = self.model.sample_actions(*model_inputs, vlm_causal=vlm_causal)

        # Pick and reorder 14 action dimensions out of the model output.
        action_indices = [*range(6), 14, *range(6, 12), 15]
        actions = actions[:, :, action_indices]

        elapsed = time.time() - started
        print(f'sample_actions cost {elapsed} s')

        observation['action'] = actions.squeeze(0)[:, :14].to(dtype=torch.float32, device='cpu')
        if use_bf16:
            # The state goes back to float32 before un-normalization.
            observation['state'] = observation['state'].to(dtype=torch.float32)
        return self.normalizer.unnormalize(observation)
class LingBotVlaInferencePolicy(PolicyPreprocessMixin, LingbotVlaPolicy):
    """``LingbotVlaPolicy`` combined with ``PolicyPreprocessMixin``; adds nothing of its own."""
class PI0InfernecePolicy(PolicyPreprocessMixin, PI0Policy):
    """``PI0Policy`` combined with ``PolicyPreprocessMixin``; adds nothing of its own."""
def merge_qwen_config(policy_config, qwen_config):
    """Copy language-model fields and the vision config onto ``policy_config``.

    Args:
        policy_config: Object that receives the merged attributes (mutated in
            place).
        qwen_config: Either a config object exposing ``to_dict()`` or a plain
            mapping with the same keys.

    Returns:
        ``policy_config``, for call chaining.
    """
    if hasattr(qwen_config, 'to_dict'):
        config_dict = qwen_config.to_dict()
    else:
        config_dict = qwen_config

    # A tuple (not a set) keeps the order of the log lines deterministic.
    text_keys = (
        "hidden_size",
        "intermediate_size",
        "num_hidden_layers",
        "num_attention_heads",
        "num_key_value_heads",
        "rms_norm_eps",
        "rope_theta",
        "vocab_size",
        "max_position_embeddings",
        "hidden_act",
        "tie_word_embeddings",
        "tokenizer_path",
    )
    for key in text_keys:
        if key in config_dict:
            setattr(policy_config, key, config_dict[key])
            print(f"✅ Merged LLM: {key} = {config_dict[key]}")

    if "vision_config" in config_dict:
        # Prefer the live attribute of a config object (not its serialized
        # form); a plain dict has no such attribute, so fall back to the item.
        policy_config.vision_config = getattr(
            qwen_config, "vision_config", config_dict["vision_config"]
        )
    else:
        print("⚠️ Warning: 'vision_config' not found in qwen_config!")
    return policy_config
class QwenPiServer:
    '''
    Policy wrapper that loads a checkpoint and turns raw observations into actions.

    ``infer`` either returns a slice of the predicted action chunk
    (``chunk_ret=True``) or steps through a cached chunk one action per call
    (``chunk_ret=False``), querying the model again every ``use_length`` steps.
    '''
    def __init__(
        self,
        path_to_pi_model="",
        adaptive_ensemble_alpha=0.1,
        action_ensemble_horizon=8,
        use_length=1, # to control the execution length of the action chunk, -1 denotes using action ensemble
        chunk_ret=False,
        use_bf16=True,
        use_fp32=False,
    ) -> None:
        """Load the policy from ``path_to_pi_model`` and move it to the GPU.

        Args:
            path_to_pi_model: Checkpoint directory; must contain
                ``lingbotvla_cli.yaml``.
            adaptive_ensemble_alpha: Stored on the instance; not used by any
                method of this class.
            action_ensemble_horizon: Accepted for interface compatibility; not
                stored or used.
            use_length: Number of steps of each predicted chunk to execute
                before re-querying the model; -1 re-queries on every call.
            chunk_ret: Return a chunk of actions instead of a single step.
            use_bf16: Cast the whole policy to bfloat16.
            use_fp32: Cast ``policy.model`` to float32 (exclusive with
                ``use_bf16``).
        """
        assert not (use_bf16 and use_fp32), 'Bfloat16 or Float32!!!'
        self.adaptive_ensemble_alpha = adaptive_ensemble_alpha
        self.use_length = use_length
        self.chunk_ret = chunk_ret
        self.task_description = None
        self.use_bf16 = use_bf16
        self.use_fp32 = use_fp32
        self.vla = self._prepare_policy(self.load_vla(path_to_pi_model))
        self.global_step = 0
        self.last_action_chunk = None

    def _prepare_policy(self, policy):
        """Move ``policy`` to CUDA, switch it to eval mode and apply the configured dtype."""
        policy = policy.cuda().eval()
        if self.use_bf16:
            policy = policy.to(torch.bfloat16)
        elif self.use_fp32:
            policy.model.float()
        return policy

    def _apply_config_defaults(self) -> None:
        """Fill in optional config fields that older training configs may lack."""
        if getattr(self.data_config, 'norm_type', None) is None:
            self.data_config.norm_type = 'meanstd'
        if getattr(self.config, 'vlm_causal', None) is None:
            self.config.vlm_causal = False
        if getattr(self.config, 'qwenvl_bos', None) is None:
            self.config.qwenvl_bos = False

    def load_vla(self, path_to_pi_model) -> LingbotVlaPolicy:
        """Build the policy, processors and normalizer for a checkpoint directory.

        Besides returning the policy, this sets ``self.model_name``,
        ``self.processor``, ``self.language_tokenizer``, ``self.image_processor``,
        ``self.data_config``, ``self.config``, ``self.joint_max_dim``,
        ``self.action_dim``, ``self.chunk_size``, ``self.norm_stats_file``,
        ``self.use_depth_align`` and ``self.norm_stats``.

        Raises:
            ValueError: If the base model named by the ``QWEN25_PATH``
                environment variable is neither a PaliGemma nor a Qwen2 model.
        """
        # load model
        print(f"loading model from: {path_to_pi_model}")
        config = PreTrainedConfig.from_pretrained(path_to_pi_model)
        # load training config
        training_config_path = Path(path_to_pi_model)/'lingbotvla_cli.yaml'
        with open(training_config_path, 'r') as f:
            training_config = yaml.safe_load(f)
        # Fill the model config from the training config. Values already
        # present on ``config`` win; training values only fill missing fields.
        # NOTE: ``update`` mutates ``training_config['model']`` in place, so the
        # ``vocab_size`` check below also sees the ``train`` keys.
        training_model_config = training_config['model']
        training_model_config.update(training_config['train'])
        for k, v in training_model_config.items():
            setattr(config, k, getattr(config, k, v))
        # Set attention_implementation to 'eager' to speed up evaluation.
        config.attention_implementation = 'eager'
        # set base model according to the QWEN25_PATH environment variable
        training_base_model = os.environ.get('QWEN25_PATH', './Qwen2.5-VL-3B-Instruct/')
        if 'paligemma' in training_base_model:
            model_name = 'pi0'
            config.vocab_size = 257152 # set vocab size for paligemma
        elif 'qwen2' in training_base_model.lower():
            model_name = 'lingbotvla'
        else:
            raise ValueError(f"Unsupported base model of {path_to_pi_model}")
        base_model_path = BASE_MODEL_PATH[model_name]
        config.tokenizer_path = base_model_path
        self.model_name = model_name
        qwen_config = AutoConfig.from_pretrained(base_model_path)
        config = merge_qwen_config(config, qwen_config)
        # A non-zero vocab size from the training config overrides the base model's.
        if 'vocab_size' in training_config['model'] and training_config['model']['vocab_size'] != 0:
            config.vocab_size = training_config['model']['vocab_size']
        # load processors
        self.processor = build_processor(base_model_path)
        self.language_tokenizer = self.processor.tokenizer
        self.image_processor = self.processor.image_processor
        data_config = SimpleNamespace(**training_config['data'])
        print('Initializing model ... ')
        if model_name == 'pi0':
            policy = PI0InfernecePolicy(config, tokenizer_path=base_model_path)
        else:
            policy = LingBotVlaInferencePolicy(config, tokenizer_path=base_model_path, eval=True)
        load_model_weights(policy, path_to_pi_model, strict=True)
        policy.feature_transform = None
        self.data_config = data_config
        self.config = config
        # Defaults must exist before the normalizer reads ``norm_type`` below
        # and before ``infer`` reads ``config.vlm_causal``.
        self._apply_config_defaults()
        self.joint_max_dim = training_config['train']['max_action_dim']
        self.action_dim = training_config['train']['action_dim']
        self.chunk_size = training_config['train']['chunk_size']
        policy.action_dim = self.action_dim
        policy.chunk_size = self.chunk_size
        # NOTE(review): the statistics file is a hard-coded relative path, so
        # the process must be started from the repository root — confirm.
        self.norm_stats_file = 'assets/norm_stats/robotwin_all_new.json'
        self.use_depth_align = 'align_params' in training_config['train']
        with open(self.norm_stats_file) as f:
            self.norm_stats = json.load(f)
        policy.normalizer = Normalizer(
            norm_stats=self.norm_stats['norm_stats'],
            from_file=True,
            data_type='robotwin_rep',
            norm_type={
                "observation.images.cam_high": "identity",
                "observation.images.cam_left_wrist": "identity",
                "observation.images.cam_right_wrist": "identity",
                "observation.state": self.data_config.norm_type,
                "action": self.data_config.norm_type,
            },
        )
        print('Model initialized ... ')
        return policy

    def reset(self, robo_name, path_to_pi_model = None) -> None:
        """Reset the step counter and cached chunk; optionally load a new checkpoint.

        Args:
            robo_name: Accepted for interface compatibility; not used.
            path_to_pi_model: When given, the policy is rebuilt from this
                checkpoint directory.
        """
        if path_to_pi_model is not None:
            self.vla = self._prepare_policy(self.load_vla(path_to_pi_model))
        self.global_step = 0
        self.last_action_chunk = None
        self._apply_config_defaults()
        # if update ckpt path
        if path_to_pi_model is not None:
            # NOTE(review): load_vla already loaded weights via
            # load_model_weights; this second strict load is kept as-is —
            # confirm whether it is still required.
            all_safetensors = glob(os.path.join(path_to_pi_model, "*.safetensors"))
            merged_weights = {}
            for file_path in tqdm(all_safetensors):
                with safe_open(file_path, framework="pt", device="cpu") as f:
                    for key in f.keys():
                        merged_weights[key] = f.get_tensor(key)
            self.vla.load_state_dict(merged_weights, strict=True)

    def resize_image(self, observation):
        """Resize the three camera images in place to square CHW arrays scaled to [0, 1].

        Each image must be a 3-D array with 3 channels last. The side length
        is ``data_config.img_size`` (224 when unset).
        """
        image_size = getattr(self.data_config, 'img_size', 224)
        for image_feature in ['observation.images.cam_high', 'observation.images.cam_left_wrist', 'observation.images.cam_right_wrist']:
            assert image_feature in observation
            assert len(observation[image_feature].shape)==3 and observation[image_feature].shape[-1] == 3
            img_pil = Image.fromarray(observation[image_feature])
            img_pil = img_pil.resize((image_size, image_size), Image.BILINEAR)
            # img_resized shape: C*H*W
            img_resized = np.transpose(np.array(img_pil), (2,0,1))
            observation[image_feature] = img_resized / 255.

    def infer(self, observation, center_crop=True):
        """Generate an action with the VLA policy.

        Args:
            observation: Dict received from the client. A truthy ``reset`` key
                triggers ``reset`` instead of inference. Otherwise it must hold
                the three camera images, ``observation.state`` and ``task``.
            center_crop: Accepted for interface compatibility; not used.

        Returns:
            ``{"action": ...}``. The value is ``None`` after a reset, a
            ``(steps, action_dim)`` NumPy array when ``chunk_ret`` is set or
            ``use_length`` is not positive, and a single ``(action_dim,)``
            NumPy row otherwise.
        """
        if observation.get('reset'):
            self.reset(
                robo_name=observation.get('robo_name'),
                path_to_pi_model=observation.get('path_to_pi_model'),
            )
            return dict(action = None)
        self.resize_image(observation)
        for k, v in observation.items():
            if isinstance(v, np.ndarray):
                observation[k] = torch.from_numpy(v)
        # The model is queried on every call when use_length == -1, otherwise
        # only at the start of each use_length-step window.
        needs_inference = self.use_length == -1 or self.global_step % self.use_length == 0
        if needs_inference:
            # Reorder the 14-D state: entries 6 and 13 are moved to the end.
            indices = list(range(6)) + list(range(7, 13)) + [6] + [13]
            observation["observation.state"] = observation["observation.state"][indices]
            normalized_observation = self.vla.normalizer.normalize(observation)
            base_image = (normalized_observation["observation.images.cam_high"] * 255).to(torch.uint8)
            left_wrist_image = (normalized_observation["observation.images.cam_left_wrist"] * 255).to(
                torch.uint8
            )
            right_wrist_image = (normalized_observation["observation.images.cam_right_wrist"] * 255).to(
                torch.uint8
            )
            obs_dict =  {
                "image": {"base_0_rgb": base_image, "left_wrist_0_rgb": left_wrist_image, "right_wrist_0_rgb": right_wrist_image},
                "state": normalized_observation["observation.state"].to(torch.float32),
                "prompt": [observation["task"]],
            }
            state = prepare_state(self.config, obs_dict)
            lang_tokens, lang_masks = prepare_language(self.config, self.language_tokenizer, obs_dict)
            images, img_masks, _ = prepare_images(self.config, self.image_processor, obs_dict)
            observation = {
                'images': images,
                'img_masks': img_masks,
                'state': state,
                'lang_tokens': lang_tokens,
                'lang_masks': lang_masks,
            }
            if self.use_bf16:
                observation['state'] = observation['state'].to(torch.bfloat16)
        if self.chunk_ret:
            action = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)['action'].float().cpu().numpy()
            # Only truncate for a positive length: slicing with -1 would
            # silently drop the last step of the chunk.
            if self.use_length > 0:
                action = action[:self.use_length]
            action = action[:, :self.action_dim]
        else:
            if needs_inference:
                chunk = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)['action']
                self.last_action_chunk = chunk.float().cpu().numpy()
            if self.use_length > 0:
                action = self.last_action_chunk[self.global_step % self.use_length]
            else:
                # Return the NumPy copy rather than the torch tensor so the
                # result can be serialized by the msgpack-based server.
                action = self.last_action_chunk
            # Ellipsis indexing handles both a single 1-D step and a 2-D chunk.
            action = action[..., :self.action_dim]
            print(f"on server step: {self.global_step}")
            self.global_step+=1
        return dict(action = action)
+import argparse
+from .websocket_policy_server import WebsocketPolicyServer
def main():
    """Parse command-line options and serve a ``QwenPiServer`` over WebSocket."""

    def _parse_bool(value):
        # argparse's ``type=bool`` treats every non-empty string (including
        # "False") as True, so boolean text is parsed explicitly.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(description="启动 QwenPi WebSocket 策略服务器")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,  # loading cannot succeed without a checkpoint directory
        help="path to the checkpoint directory to serve"
    )
    parser.add_argument(
        "--use_length",
        type=int,
        default=50,
        help="used length of action chunk"
    )
    parser.add_argument(
        "--chunk_ret",
        type=_parse_bool,
        default=True,
        help=" True: The returned action tensor includes the horizon dimension. This allows the model to output a sequence of actions for each horizon step. False: The horizon dimension is omitted. The model selects and returns the next step autonomously based on its policy."
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8006,
        help="port of WebSocket"
    )
    args = parser.parse_args()
    model = QwenPiServer(args.model_path, use_length=args.use_length, chunk_ret = args.chunk_ret)
    model_server = WebsocketPolicyServer(model, port=args.port)
    model_server.serve_forever()


if __name__ == "__main__":
    main()

deploy/msgpack_numpy.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Adds NumPy array support to msgpack.
+msgpack is good for (de)serializing data over a network for multiple reasons:
+- msgpack is secure (as opposed to pickle/dill/etc which allow for arbitrary code execution)
+- msgpack is widely used and has good cross-language support
+- msgpack does not require a schema (as opposed to protobuf/flatbuffers/etc) which is convenient in dynamically typed
+    languages like Python and JavaScript
+- msgpack is fast and efficient (as opposed to readable formats like JSON/YAML/etc); I found that msgpack was ~4x faster
+    than pickle for serializing large arrays using the below strategy
+The code below is adapted from https://github.com/lebedov/msgpack-numpy. The reason not to use that library directly is
+that it falls back to pickle for object arrays.
+"""
+import functools
+import msgpack
+import numpy as np
def pack_array(obj):
    """msgpack ``default`` hook that encodes NumPy arrays and scalars as dicts.

    Arrays become ``{__ndarray__, data, dtype, shape}`` and NumPy scalars
    become ``{__npgeneric__, data, dtype}`` (byte-string keys). Any other
    object is returned unchanged.

    Raises:
        ValueError: For void, object or complex dtypes (kinds V, O, c).
    """
    is_array = isinstance(obj, np.ndarray)
    is_scalar = isinstance(obj, np.generic)
    if not (is_array or is_scalar):
        return obj

    if obj.dtype.kind in ("V", "O", "c"):
        raise ValueError(f"Unsupported dtype: {obj.dtype}")

    if is_array:
        return {
            b"__ndarray__": True,
            b"data": obj.tobytes(),
            b"dtype": obj.dtype.str,
            b"shape": obj.shape,
        }
    return {
        b"__npgeneric__": True,
        b"data": obj.item(),
        b"dtype": obj.dtype.str,
    }
def unpack_array(obj):
    """msgpack ``object_hook`` that rebuilds NumPy values written by ``pack_array``.

    A dict tagged ``__ndarray__`` becomes an array over the stored buffer, a
    dict tagged ``__npgeneric__`` becomes a NumPy scalar of the stored dtype,
    and any other dict is returned unchanged.
    """
    if b"__ndarray__" in obj:
        dtype = np.dtype(obj[b"dtype"])
        return np.ndarray(shape=obj[b"shape"], dtype=dtype, buffer=obj[b"data"])

    if b"__npgeneric__" in obj:
        scalar_type = np.dtype(obj[b"dtype"]).type
        return scalar_type(obj[b"data"])

    return obj
+Packer = functools.partial(msgpack.Packer, default=pack_array)  # msgpack.Packer preconfigured with the NumPy encoding hook
+packb = functools.partial(msgpack.packb, default=pack_array)  # msgpack.packb preconfigured with the NumPy encoding hook
+Unpacker = functools.partial(msgpack.Unpacker, object_hook=unpack_array)  # msgpack.Unpacker preconfigured with the NumPy decoding hook
+unpackb = functools.partial(msgpack.unpackb, object_hook=unpack_array)  # msgpack.unpackb preconfigured with the NumPy decoding hook

deploy/websocket_client_policy.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import logging
+import time
+from typing import Dict, Optional, Tuple
+from typing_extensions import override
+import websockets.sync.client
+from .msgpack_numpy import Packer, unpackb
class WebsocketClientPolicy:
    """Policy client that forwards observations to a websocket server.

    The constructor blocks until a server accepts the connection and sends
    its metadata. See WebsocketPolicyServer for the matching server.
    """

    def __init__(self, host: str = "0.0.0.0", port: Optional[int] = None, api_key: Optional[str] = None) -> None:
        """Build the server URI and connect, retrying while the connection is refused."""
        uri = f"ws://{host}"
        if port is not None:
            uri = f"{uri}:{port}"
        self._uri = uri
        self._packer = Packer()
        self._api_key = api_key
        self._ws, self._server_metadata = self._wait_for_server()

    def get_server_metadata(self) -> Dict:
        """Return the metadata dict the server sent right after connecting."""
        return self._server_metadata

    def _wait_for_server(self) -> Tuple[websockets.sync.client.ClientConnection, Dict]:
        """Connect to the server, retrying every 5 seconds while it refuses connections.

        Returns:
            The open connection and the first message received on it,
            decoded as the server metadata.
        """
        logging.info(f"Waiting for server at {self._uri}...")
        # The API key, when set, is sent as an Authorization header.
        headers = {"Authorization": f"Api-Key {self._api_key}"} if self._api_key else None
        while True:
            try:
                conn = websockets.sync.client.connect(
                    self._uri, compression=None, max_size=None, additional_headers=headers
                )
                metadata = unpackb(conn.recv())
            except ConnectionRefusedError:
                logging.info("Still waiting for server...")
                time.sleep(5)
            else:
                return conn, metadata

    @override
    def infer(self, obs: Dict) -> Dict:  # noqa: UP006
        """Send one observation and return the server's decoded reply.

        Raises:
            RuntimeError: If the server answers with a text frame, which is
                how it reports an error.
        """
        self._ws.send(self._packer.pack(obs))
        reply = self._ws.recv()
        if isinstance(reply, str):
            # we're expecting bytes; if the server sends a string, it's an error.
            raise RuntimeError(f"Error in inference server:\n{reply}")
        return unpackb(reply)

    @override
    def reset(self, robo_name: str) -> None:
        """Ask the server to reset by sending a ``reset=True`` message."""
        self.infer(dict(reset=True, robo_name=robo_name))
if __name__ == "__main__":
    # Manual smoke test: connect to a server on port 8000, send one random
    # observation, then drop into an IPython shell for inspection.
    # The relative import below only works when the module is run with
    # ``python -m`` from its package.
    # NOTE(review): the keys used here ("image", "state", "prompt") differ from
    # the "observation.images.*" keys that QwenPiServer.resize_image asserts
    # on — confirm which server this smoke test is meant to target.
    import numpy as np

    from .image_tools import convert_to_uint8

    policy_on_device = WebsocketClientPolicy(port=8000)

    base_0_rgb = np.random.randint(0, 256, size=(1, 3, 224, 224), dtype=np.uint8)
    left_wrist_0_rgb = np.random.randint(0, 256, size=(1, 3, 224, 224), dtype=np.uint8)
    state = np.random.rand(1,8).astype(np.float32)
    prompt = ["do something"]

    observation = {
        "image": {
            "base_0_rgb": convert_to_uint8(base_0_rgb),
            "left_wrist_0_rgb": convert_to_uint8(left_wrist_0_rgb),
            # The left-wrist frame is reused as the right-wrist input.
            "right_wrist_0_rgb": convert_to_uint8(left_wrist_0_rgb),
        },
        "state": state,
        "prompt": prompt,
    }
    policy_on_device.infer(observation)

    from IPython import embed
    embed()

deploy/websocket_policy_server.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import asyncio
+import http
+import logging
+import time
+import traceback
+from .msgpack_numpy import Packer, unpackb
+import websockets.asyncio.server as _server
+import websockets.frames
+logger = logging.getLogger(__name__)
class WebsocketPolicyServer:
    """Serve a policy object over the websocket protocol.

    For every message received, the handler calls ``policy.infer`` and sends
    the result back. See websocket_client_policy.py for a client.
    """

    def __init__(
        self,
        policy,
        host: str = "0.0.0.0",
        port: int | None = None,
        metadata: dict | None = None,
    ) -> None:
        """Store the policy and bind settings; ``metadata`` is sent to each new client."""
        self._policy = policy
        self._host = host
        self._port = port
        self._metadata = metadata or {}
        logging.getLogger("websockets.server").setLevel(logging.INFO)

    def serve_forever(self) -> None:
        """Run the server on a fresh asyncio event loop until it stops."""
        asyncio.run(self.run())

    async def run(self):
        """Start the websocket server (with a ``/healthz`` endpoint) and serve indefinitely."""
        server_cm = _server.serve(
            self._handler,
            self._host,
            self._port,
            compression=None,
            max_size=None,
            process_request=_health_check,
        )
        async with server_cm as server:
            await server.serve_forever()

    async def _handler(self, websocket: _server.ServerConnection):
        """Handle one client: send metadata, then answer observations until it disconnects.

        Each reply carries a ``server_timing`` dict with the inference time
        and, from the second request on, the previous round's total time.
        """
        logger.info(f"Connection from {websocket.remote_address} opened")
        packer = Packer()

        await websocket.send(packer.pack(self._metadata))

        last_total = None
        while True:
            try:
                round_started = time.monotonic()
                obs = unpackb(await websocket.recv())

                infer_started = time.monotonic()
                action = self._policy.infer(obs)
                timing = {"infer_ms": (time.monotonic() - infer_started) * 1000}
                if last_total is not None:
                    # We can only record the last total time since we also want to include the send time.
                    timing["prev_total_ms"] = last_total * 1000
                action["server_timing"] = timing

                await websocket.send(packer.pack(action))
                last_total = time.monotonic() - round_started
            except websockets.ConnectionClosed:
                logger.info(f"Connection from {websocket.remote_address} closed")
                break
            except Exception:
                # Report the traceback as a text frame, close, then re-raise.
                await websocket.send(traceback.format_exc())
                await websocket.close(
                    code=websockets.frames.CloseCode.INTERNAL_ERROR,
                    reason="Internal server error. Traceback included in previous frame.",
                )
                raise
def _health_check(connection: _server.ServerConnection, request: _server.Request) -> _server.Response | None:
    """Answer ``/healthz`` with HTTP 200; return ``None`` for every other path.

    Returning ``None`` lets the normal websocket request handling continue.
    """
    if request.path != "/healthz":
        return None
    return connection.respond(http.HTTPStatus.OK, "OK\n")

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Install systemctl and tini
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd tini && \
apt-get clean || { echo "Installation failed"; exit 1; }
# Set the container time zone to Asia/Shanghai
RUN apt-get install -y tzdata \
    && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && dpkg-reconfigure -f noninteractive tzdata
# Upgrade pip
RUN python -m pip install --upgrade pip
# Install torch-2.5.1 + vllm-0.7.3
# Version specifiers containing ">" are quoted: unquoted, the shell treats
# ">=4.49.0" as an output redirection and pip never sees the constraint.
RUN pip install --no-cache-dir vllm==0.7.3 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 tensordict torchdata \
    "transformers>=4.49.0" accelerate datasets peft hf-transfer diffusers \
    codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb ninja liger-kernel \
    pytest yapf py-spy pyext pre-commit ruff packaging
# Install flux
RUN pip install --no-cache-dir byte-flux
# Install flash-attn and triton
RUN pip install --no-cache-dir flash-attn "triton>=3.1.0"

docs/Makefile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Minimal makefile for Sphinx documentation
+#
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = LingBotVLA
+SOURCEDIR     = .
+BUILDDIR      = _build
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+.PHONY: help Makefile
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/README.md ADDED Viewed

	@@ -0,0 +1,19 @@

+# LingBotVLA Documentation
+## Build the docs
+```bash
+# Install dependencies.
+pip install -r requirements-docs.txt
+# Build the docs.
+make clean
+make html
+```
+## Open the docs with your browser
+```bash
+python -m http.server -d _build/html/
+```
+Launch your browser and open localhost:8000.

docs/conf.py ADDED Viewed

	@@ -0,0 +1,66 @@

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "LingBotVLA"
# pylint: disable=W0622
copyright = "2026 Robbyant Team, based on VeOmni by ByteDance Seed Foundation MLSys Team"
# -- General configuration ---------------------------------------------------
# The master toctree document.
master_doc = "index"
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "recommonmark",
    "sphinx.ext.autosectionlabel",
]
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings. Each entry is a
# filename suffix, so it must include the leading dot.
source_suffix = [".rst", ".rest", ".md"]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

docs/config/config.md ADDED Viewed

	@@ -0,0 +1,96 @@

+## Configuration Arguments
+### Model configuration arguments
+| Name | Type | Description | Default Value |
+| --- | --- | --- | --- |
+| model.config_path | str | Path to the model huggingface configuration, like `config.json` | model.model_path |
+| model.model_path | str | Path to the model parameter file. If empty, random initialization will be performed | None |
+| model.tokenizer_path | str | Path to the tokenizer | model.model_path |
+| model.encoders | dict | Configuration file for multi-modal encoders | {} |
+| model.decoders | dict | Configuration file for multi-modal decoders | {} |
+| model.input_encoder | str: {"encoder", "decoder"} | Whether the input image is encoded with the encoder belonging to the `encoder` or to the `decoder` | encoder |
+| model.output_encoder | str: {"encoder", "decoder"} | Whether the output image is encoded with the encoder belonging to the `encoder` or to the `decoder` | decoder |
+| model.encode_target | bool | Used to encode the training data for the diffusion model | False |
+### Data configuration arguments
+| Name | Type | Description | Default Value |
+| --- | --- | --- | --- |
+| data.train_path | str | Path of training dataset | Required |
+| data.train_size | int | Total number of tokens in the training set | 10,000,000 |
+| data.data_type | str: {"plaintext", "conversation"} | Dataset type.  | conversation |
+| data.dataloader_type | str: {"native"} | Type of dataloader to use; `native` is the PyTorch dataloader. | native |
+| data.datasets_type | str: {"mapping", "iterable"} | Dataset type: `IterativeDataset`, `MappingDataset`, or your custom datasets. | mapping |
+| data.text_keys | str: {"content_split", "messages"} | The key corresponding to the text samples in the data dictionary. Generally, it is "content_split" for pretraining and "messages" for SFT. | content_split |
+| data.image_keys | str | The key corresponding to the image samples in the data dictionary. Generally, it is "images". | images |
+| data.chat_template | str | Name of the chat template. | default |
+| data.max_seq_len | int | Maximum training length. | 2048 |
+| data.num_workers | int | Number of multi-process loaders for the dataloader. | 4 |
+| data.drop_last | bool | Whether to discard the remaining data at the end. | True |
+| data.pin_memory | bool | Whether to pin the data in the CPU memory. | True |
+| data.prefetch_factor | int | Number of samples preprocessed by the dataloader. | 2 |
+### Training configuration arguments
+| Name | Type | Description | Default Value |
+| --- | --- | --- | --- |
+| train.output_dir | str | Path to save the model. | Required |
+| train.lr | float | Maximum learning rate. | 5e-5 |
+| train.lr_min | float | Minimum learning rate. | 1e-7 |
+| train.weight_decay | float | Weight decay coefficient. | 0 |
+| train.optimizer | str: {"adamw", "anyprecision_adamw"} | Name of the optimizer. | adamw |
+| train.max_grad_norm | float | Gradient clipping norm. | 1.0 |
+| train.micro_batch_size | int | Number of samples processed simultaneously on each GPU. | 1 |
+| train.global_batch_size | int | Global batch size, which must be a multiple of the number of GPUs. | train.micro_batch_size * n_gpus |
+| train.num_train_epochs | int | Number of training epochs. | 1 |
+| train.rmpad | bool | Whether to use rmpad training based on cu_seqlens. | False |
+| train.rmpad_with_pos_ids | bool | Whether to use rmpad training based on position_ids. | False |
+| train.dyn_bsz_margin | int | Number of pad tokens in the dynamic batch. | 0 |
+| train.dyn_bsz_runtime | str: {"main", "worker"} | Running process of the dynamic batch. | worker |
+| train.bsz_warmup_ratio | float | Proportion of batch size warmup in the total number of steps. | 0 |
+| train.lr_warmup_ratio | float | Proportion of learning rate warmup in the total number of steps. | 0 |
+| train.lr_decay_style | str: {"constant", "linear", "cosine"} | Name of the learning rate scheduler. | cosine |
+| train.lr_decay_ratio | float | Proportion of learning rate decay in the total number of steps | 1.0 |
+| train.use_doptim | bool | Whether to use the distributed optimizer during Vescale training (has no effect with torch FSDP). | False |
+| train.enable_mixed_precision | bool | Whether to enable mixed precision training (higher memory usage but more stable) | True |
+| train.enable_gradient_checkpointing | bool | Whether to enable gradient checkpointing to reduce memory usage. | True |
+| train.enable_reentrant | bool | Whether to enable reentrant in gradient checkpointing. | True |
+| train.enable_full_shard | bool | Whether to use full sharding FSDP (equivalent to ZeRO3). | True |
+| train.enable_fsdp_offload | bool | Whether to enable FSDP CPU offloading (only supported for FSDP1). | False |
+| train.enable_activation_offload | bool | Whether to enable activation value CPU offloading. | False |
+| train.activation_gpu_limit | float | Size of the activation values retained on the GPU (in GB). | 0.0 |
+| train.enable_manual_eager | bool | Whether to use manual eager during Vescale training. | False |
+| train.init_device | str: {"cpu", "cuda", "meta"} | Device used for model initialization. Use "meta" or "cpu" for large models (>30B). | cuda |
+| train.enable_full_determinism | bool | Whether to enable deterministic mode (for bitwise alignment). | False |
+| train.empty_cache_steps | int | Number of steps between two cache clearings. -1 means not enabled. | 500 |
+| train.data_parallel_mode | str: {"ddp", "fsdp1", "fsdp2"} | Data parallel algorithm. | ddp |
+| train.tensor_parallel_size | int | Tensor parallel size (currently only supported for vescale training). | 1 |
+| train.pipeline_parallel_size | int | Pipeline parallel size (currently not supported). | 1 |
+| train.ulysses_parallel_size | int | Ulysses sequence parallel size (currently only supported for P6dense and Qwen2VL). | 1 |
+| train.context_parallel_size | int | Ring sequence parallel size (currently not supported) | 1 |
+| train.expert_parallel_size | int | Expert parallel size (currently only supported for DeepseekMOE). | 1 |
+| train.load_checkpoint_path | str | Path to the omnistore checkpoint for resuming training. | None |
+| train.save_steps | int | Number of steps between two checkpoint saves. 0 means invalid. | 0 |
+| train.save_epochs | int | Number of epochs between two checkpoint saves. 0 means invalid. | 1 |
+| train.save_hf_weights | bool | Whether to save the model weights in the huggingface format. It is recommended to set it to False for models > 30B to prevent NCCL timeout. You can convert it after training. | True |
+| train.seed | int | Random seed. | 42 |
+| train.use_wandb | bool | Whether to enable byted wandb experiment logging. | True |
+| train.wandb_project | str | Name of the wandb experiment project. | LingBotVLA |
+| train.wandb_name | str | Name of the wandb experiment. | None |
+| train.enable_profiling | bool | Whether to use torch profiling. | False |
+| train.profile_start_step | int | Starting step of profiling. | 1 |
+| train.profile_end_step | int | Ending step of profiling. | 2 |
+| train.profile_trace_dir | str | Path to save the profiling results. | ./trace |
+| train.profile_record_shapes | bool | Whether to record the shapes of the input tensors. | True |
+| train.profile_profile_memory | bool | Whether to record the memory usage. | True |
+| train.profile_with_stack | bool | Whether to record the stack information. | True |
+| train.max_steps | int | Number of steps per training epoch (only used for debugging). | None |
+### Inference configuration arguments
+| Name | Type | Description | Default Value |
+| --- | --- | --- | --- |
+| infer.model_path | str | Path to the model parameter file. | Required |
+| infer.tokenizer_path | str | Path to the tokenizer. | model.model_path |
+| infer.seed | int | Random seed. | 42 |
+| infer.do_sample | bool | Whether to enable sampling. | True |
+| infer.temperature | float | Sampling temperature. | 1.0 |
+| infer.top_p | float | Sampling Top P value. | 1.0 |
+| infer.max_tokens | int | Maximum number of tokens generated each time. | 1024 |

docs/examples/qwen2vl.rst ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Qwen2VL example
2	+ =========================

docs/examples/qwen3_moe.md ADDED Viewed

	@@ -0,0 +1,125 @@

+Qwen3 MoE training guide
+1. Download qwen3 moe model
+```shell
+python3 scripts/download_hf_model.py \
+  --repo_id Qwen/Qwen3-30B-A3B \
+  --local_dir .
+```
+2. Merge the Qwen3 MoE experts to enable the GroupGemm optimization
+``` shell
+python3 scripts/moe_ckpt_merge/moe_merge.py --raw_hf_path Qwen3-30B-A3B  --merge_hf_path Qwen3-30B-A3B-merge
+```
+Most MoE models in Transformers follow the open-source Mixtral MoE implementation. In that implementation, the experts are kept as separate modules instead of being combined into a single `nn.Parameter`. It also relies on CPU-blocking operations such as `torch.where()` and a Python `for` loop over experts, which makes it hard to integrate fused MoE operators.
+Original [Qwen3MoeMLP](https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L200C1-L213C25) code:
+```python
+class Qwen3MoeMLP(nn.Module):
+    def __init__(self, config, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class Qwen3MoeSparseMoeBlock(nn.Module):
+    def __init__(self, config):
+            ...
+        self.experts = nn.ModuleList(
+            [Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
+        )
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+            ...
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+        for expert_idx in expert_hitted:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+```
+- Combine the per-expert `Qwen3MoeMLP` modules into a single `Qwen3MoeExperts` module, then use the fused MoE operator:
+```python
+class Qwen3MoeExperts(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.hidden_dim = config.hidden_size
+        self.intermediate_size = config.moe_intermediate_size
+        self.gate_proj = torch.nn.Parameter(
+            torch.empty(self.num_experts, self.intermediate_size, self.hidden_dim),
+            requires_grad=True,
+        )
+        self.up_proj = torch.nn.Parameter(
+            torch.empty(self.num_experts, self.intermediate_size, self.hidden_dim),
+            requires_grad=True,
+        )
+        self.down_proj = torch.nn.Parameter(
+            torch.empty(self.num_experts, self.hidden_dim, self.intermediate_size),
+            requires_grad=True,
+        )
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, hidden_states, expert_idx=None, cumsum=None):
+        gate_proj_out = torch.matmul(hidden_states, self.gate_proj[expert_idx].transpose(0, 1))
+        up_proj_out = torch.matmul(hidden_states, self.up_proj[expert_idx].transpose(0, 1))
+        out = self.act_fn(gate_proj_out) * up_proj_out
+        out = torch.matmul(out, self.down_proj[expert_idx].transpose(0, 1))
+        return out
+class Qwen3MoeSparseFusedMoeBlock(nn.Module):
+    def __init__(self, config):
+            ...
+      self.experts = Qwen3MoeExperts(config)
+    def forward(self, hidden_states, expert_idx=None, routing_weights=None, selected_experts=None) -> torch.Tensor:
+          ...
+        out = fused_moe_forward(
+            module=self,
+            num_experts=self.num_experts,
+            routing_weights=routing_weights,
+            selected_experts=selected_experts,
+            hidden_states=hidden_states,
+            fc1_1_weight=self.gate_proj,
+            fc1_2_weight=self.up_proj,
+            fc2_weight=self.down_proj,
+        )
+      return out
+```
+3. Train qwen3 moe model
+```
+bash train.sh tasks/train_torch.py configs/pretrain/qwen3-moe.yaml
+```

docs/index.rst ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Welcome to LingBotVLA
2	+ =========================

docs/requirements-docs.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# markdown support
+recommonmark
+# markdown table support
+sphinx-markdown-tables
+# theme default rtd
+# crate-docs-theme
+sphinx-rtd-theme

docs/start/start.rst ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Getting Started
2	+ =========================

experiment/libero/README.md ADDED Viewed

	@@ -0,0 +1,18 @@

+# Install official LIBERO
+```bash
+git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git libero # (here)
+cd libero
+pip install -e .
+cd experiment/libero/libero
+pip install -r req.txt
+```
+If you cannot import `xxx` from `libero.libero`, add the path of the cloned `libero` directory (marked `(here)` above) to the `PYTHONPATH` environment variable.
+The results will be saved to /project_root/Libero
+- release_ensemble/ stores the log files (This directory can be changed by --local_log_dir variable)
+- rollouts stores the videos

experiment/libero/libero/libero_utils.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Utils for evaluating policies in LIBERO simulation environments."""
+import math
+import os
+import imageio
+import numpy as np
+import tensorflow as tf
+from libero.libero import get_libero_path
+from libero.libero.envs import OffScreenRenderEnv
+from experiment.libero.robot_utils import (
+    DATE,
+    DATE_TIME,
+)
+def get_libero_env(task, model_family, resolution=256):
+    """Initializes and returns the LIBERO environment, along with the task description."""
+    task_description = task.language
+    task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+    env_args = {"bddl_file_name": task_bddl_file, "camera_heights": resolution, "camera_widths": resolution}
+    env = OffScreenRenderEnv(**env_args)
+    env.seed(0)  # IMPORTANT: seed seems to affect object positions even when using fixed initial state
+    return env, task_description
+def get_libero_dummy_action(model_family: str):
+    """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
+    return [0, 0, 0, 0, 0, 0, -1]
+def resize_image(img, resize_size):
+    """
+    Takes numpy array corresponding to a single image and returns resized image as numpy array.
+    NOTE (Moo Jin): To make input images in distribution with respect to the inputs seen at training time, we follow
+                    the same resizing scheme used in the Octo dataloader, which OpenVLA uses for training.
+    """
+    assert isinstance(resize_size, tuple)
+    # Resize to image size expected by model
+    with tf.device('/CPU:0'):
+        img = tf.image.encode_jpeg(img)  # Encode as JPEG, as done in RLDS dataset builder
+        img = tf.io.decode_image(img, expand_animations=False, dtype=tf.uint8)  # Immediately decode back
+        img = tf.image.resize(img, resize_size, method="lanczos3", antialias=True)
+        img = tf.cast(tf.clip_by_value(tf.round(img), 0, 255), tf.uint8)
+        img = img.numpy()
+    return img
+def get_libero_image(obs, resize_size):
+    """Extracts image from observations and preprocesses it."""
+    assert isinstance(resize_size, int) or isinstance(resize_size, tuple)
+    if isinstance(resize_size, int):
+        resize_size = (resize_size, resize_size)
+    img = obs["agentview_image"]
+    img = img[::-1, ::-1]  # IMPORTANT: rotate 180 degrees to match train preprocessing
+    img = resize_image(img, resize_size)
+    return img
+def get_libero_wrist_image(obs, resize_size):
+    """Extracts wrist camera image from observations and preprocesses it."""
+    assert isinstance(resize_size, int) or isinstance(resize_size, tuple)
+    if isinstance(resize_size, int):
+        resize_size = (resize_size, resize_size)
+    img = obs["robot0_eye_in_hand_image"]
+    img = img[::-1, ::-1]  # IMPORTANT: rotate 180 degrees to match train preprocessing
+    img = resize_image(img, resize_size)
+    return img
+def save_rollout_video(rollout_images, idx, success, task_description, log_file=None, ckpt_index=None, task_suite_name=None, task_id=None):
+    """Saves an MP4 replay of an episode."""
+    rollout_dir = f"./Libero/rollouts/{ckpt_index}/{task_suite_name}-task{task_id}-{DATE_TIME}-{ckpt_index}"
+    os.makedirs(rollout_dir, exist_ok=True)
+    processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
+    mp4_path = f"{rollout_dir}/{DATE_TIME}--episode={idx}--success={success}--task={processed_task_description}.mp4"
+    video_writer = imageio.get_writer(mp4_path, fps=30)
+    for img in rollout_images:
+        video_writer.append_data(img)
+    video_writer.close()
+    print(f"Saved rollout MP4 at path {mp4_path}")
+    if log_file is not None:
+        log_file.write(f"Saved rollout MP4 at path {mp4_path}\n")
+    return mp4_path
+def quat2axisangle(quat):
+    """
+    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
+    Converts quaternion to axis-angle format.
+    Returns a unit vector direction scaled by its angle in radians.
+    Args:
+        quat (np.array): (x,y,z,w) vec4 float angles
+    Returns:
+        np.array: (ax,ay,az) axis-angle exponential coordinates
+    """
+    # clip quaternion
+    if quat[3] > 1.0:
+        quat[3] = 1.0
+    elif quat[3] < -1.0:
+        quat[3] = -1.0
+    den = np.sqrt(1.0 - quat[3] * quat[3])
+    if math.isclose(den, 0.0):
+        # This is (close to) a zero degree rotation, immediately return
+        return np.zeros(3)
+    return (quat[:3] * 2.0 * math.acos(quat[3])) / den

experiment/libero/libero/req.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+imageio[ffmpeg]
+robosuite==1.4.1
+bddl
+easydict
+cloudpickle
+gym

experiment/libero/libero/run_libero_eval.py ADDED Viewed

	@@ -0,0 +1,300 @@

+"""
+run_libero_eval.py
+Runs a model in a LIBERO simulation environment.
+Usage (run from the project root so that the `experiment` package is importable):
+    python experiment/libero/libero/run_libero_eval.py \
+        --model_family instruct_vla \
+        --model_port <PORT OF THE POLICY SERVER> \
+        --pretrained_checkpoint <CHECKPOINT_PATH> \
+        --task_suite_name [ libero_spatial | libero_object | libero_goal | libero_10 | libero_90 ] \
+        --run_id_note <OPTIONAL TAG TO INSERT INTO RUN ID FOR LOGGING> \
+        --use_wandb [ True | False ] \
+        --wandb_project <PROJECT> \
+        --wandb_entity <ENTITY>
+"""
+import tensorflow as tf
+import os, json, re, io, base64, threading
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+for g in tf.config.list_physical_devices('GPU'):
+    tf.config.experimental.set_memory_growth(g, True)
+import os
+import sys
+parent_dir = os.path.dirname(os.getcwd())
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, os.getcwd())
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Union
+import torch
+import draccus
+import numpy as np
+import tqdm
+from libero.libero import benchmark
+import wandb
+# Append current directory so that interpreter can find Libero.robot
+from experiment.libero.libero.libero_utils import (
+    get_libero_dummy_action,
+    get_libero_env,
+    get_libero_image,
+    get_libero_wrist_image,
+    quat2axisangle,
+    save_rollout_video,
+)
+from experiment.libero.robot_utils import (
+    DATE_TIME,
+    get_action,
+    get_image_resize_size,
+    get_model,
+    invert_gripper_action,
+    normalize_gripper_action,
+    set_seed_everywhere,
+)
+@dataclass
+class GenerateConfig:
+    """Configuration for `eval_libero`; populated from the command line by `draccus.wrap()`."""
+    # fmt: off
+    #################################################################################################################
+    # Model-specific parameters
+    #################################################################################################################
+    model_family: str = "instruct_vla"                    # Model family
+    # The path component before "/checkpoints/" is also used to name log and video directories.
+    pretrained_checkpoint: Union[str, Path] = ""     # Pretrained checkpoint path
+    # Action un-normalization key; eval_libero overwrites it based on `task_suite_name`.
+    unnorm_key: Optional[str] = None
+    # image_size: list[int] = [224, 224]
+    # Action dimensionality. NOTE(review): not read by eval_libero itself.
+    action_dim: int = 7
+    # Port handed to the websocket policy client created by `get_model`.
+    model_port: int = 8012
+    #################################################################################################################
+    # LIBERO environment-specific parameters
+    #################################################################################################################
+    task_suite_name: str = "libero_spatial"          # Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90
+    # Optional single task index to evaluate. NOTE(review): eval_libero only applies this
+    # filter when `task_suite_name` is "libero_10"; for other suites every task is run.
+    task_id: Optional[int] = None
+    num_steps_wait: int = 10                         # Number of steps to wait for objects to stabilize in sim
+    num_trials_per_task: int = 50                    # Number of rollouts per task
+    #################################################################################################################
+    # Utils
+    #################################################################################################################
+    run_id_note: Optional[str] = None                # Extra note to add in run ID for logging
+    local_log_dir: str = "./Libero/logs"        # Local directory for eval logs
+    use_wandb: bool = False                          # Whether to also log results in Weights & Biases
+    wandb_project: str = "YOUR_WANDB_PROJECT"        # Name of W&B project to log to (use default!)
+    wandb_entity: str = "YOUR_WANDB_ENTITY"          # Name of entity to log under
+    seed: int = 42                                    # Random Seed (for reproducibility)
+    # NOTE(review): not read by eval_libero itself; purpose cannot be determined from this file.
+    use_length: int = 8
+    # fmt: on
+@draccus.wrap()
+def eval_libero(cfg: GenerateConfig) -> None:
+    ckpt_index = cfg.pretrained_checkpoint.split('/checkpoints/')[0].split('/')[-1]
+    # Set random seed
+    set_seed_everywhere(cfg.seed)
+    # [OpenVLA] Check that the model contains the action un-normalization key
+    if cfg.model_family == "openvla":
+        # [OpenVLA] Set action un-normalization key
+        cfg.unnorm_key = cfg.task_suite_name
+        model, server = get_model(cfg)
+        server = None
+        # In some cases, the key must be manually modified (e.g. after training on a modified version of the dataset
+        # with the suffix "_no_noops" in the dataset name)
+        if cfg.unnorm_key not in model.norm_stats and f"{cfg.unnorm_key}_no_noops" in model.norm_stats:
+            cfg.unnorm_key = f"{cfg.unnorm_key}_no_noops"
+        assert cfg.unnorm_key in model.norm_stats, f"Action un-norm key {cfg.unnorm_key} not found in VLA `norm_stats`!"
+    elif cfg.model_family == "instruct_vla":
+        # [OpenVLA] Set action un-normalization key
+        cfg.unnorm_key = f"{cfg.task_suite_name}_no_noops"
+        model, server = get_model(cfg)
+    # Initialize local logging
+    run_id = f"EVAL-{cfg.task_suite_name}-task{cfg.task_id}-{cfg.model_family}-{DATE_TIME}-{ckpt_index}"
+    if cfg.run_id_note is not None:
+        run_id += f"--{cfg.run_id_note}"
+    cfg.local_log_dir = os.path.join(cfg.local_log_dir, ckpt_index)
+    os.makedirs(cfg.local_log_dir, exist_ok=True)
+    local_log_filepath = os.path.join(cfg.local_log_dir, run_id + ".txt")
+    log_file = open(local_log_filepath, "w")
+    print(f"Logging to local log file: {local_log_filepath}")
+    # Initialize Weights & Biases logging as well
+    if cfg.use_wandb:
+        wandb.init(
+            entity=cfg.wandb_entity,
+            project=cfg.wandb_project,
+            name=run_id,
+        )
+    # Initialize LIBERO task suite
+    benchmark_dict = benchmark.get_benchmark_dict()
+    task_suite = benchmark_dict[cfg.task_suite_name]()
+    num_tasks_in_suite = task_suite.n_tasks
+    print(f"Task suite: {cfg.task_suite_name}")
+    log_file.write(f"Task suite: {cfg.task_suite_name}\n")
+    # Get expected image dimensions
+    resize_size = get_image_resize_size(cfg)
+    # Start evaluation
+    total_episodes, total_successes = 0, 0
+    for task_id in tqdm.tqdm(range(num_tasks_in_suite)):
+        # Get task
+        if cfg.task_id is not None:
+            if cfg.task_suite_name == 'libero_10':
+                if task_id != cfg.task_id:
+                    continue
+        task = task_suite.get_task(task_id)
+        # Get default LIBERO initial states
+        initial_states = task_suite.get_task_init_states(task_id)
+        # Initialize LIBERO environment and task description
+        env, task_description = get_libero_env(task, cfg.model_family, resolution=256)
+        # Start episodes
+        task_episodes, task_successes = 0, 0
+        for episode_idx in tqdm.tqdm(range(cfg.num_trials_per_task)):
+            print(f"\nTask: {task_description}")
+            log_file.write(f"\nTask: {task_description}\n")
+            # Reset environment
+            env.reset()
+            server.reset(robo_name='libero')
+            # Set initial states
+            obs = env.set_init_state(initial_states[episode_idx])
+            # Setup
+            t = 0
+            replay_images = []
+            if cfg.task_suite_name == "libero_spatial":
+                max_steps = 220  # longest training demo has 193 steps
+            elif cfg.task_suite_name == "libero_object":
+                max_steps = 280  # longest training demo has 254 steps
+            elif cfg.task_suite_name == "libero_goal":
+                max_steps = 300  # longest training demo has 270 steps
+            elif cfg.task_suite_name == "libero_10":
+                max_steps = 520  # longest training demo has 505 steps
+            elif cfg.task_suite_name == "libero_90":
+                max_steps = 400  # longest training demo has 373 steps
+            print(f"Starting episode {task_episodes+1}...")
+            log_file.write(f"Starting episode {task_episodes+1}...\n")
+            while t < max_steps + cfg.num_steps_wait:
+                # try:
+                    # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects
+                    # and we need to wait for them to fall
+                if t < cfg.num_steps_wait:
+                    obs, reward, done, info = env.step(get_libero_dummy_action(cfg.model_family))
+                    t += 1
+                    continue
+                # Get preprocessed image
+                img = get_libero_image(obs, resize_size)
+                wrist_img = get_libero_wrist_image(obs, resize_size)
+                # Save preprocessed image for replay video
+                replay_images.append(img)
+                # Prepare observations dict
+                # Note: OpenVLA does not take proprio state as input
+                state = np.concatenate(
+                        (obs["robot0_eef_pos"], quat2axisangle(obs["robot0_eef_quat"]), obs["robot0_gripper_qpos"]))
+                observation = {
+                        "image": img,
+                        "wrist_image": wrist_img,
+                        "state": state,
+                        "task": task_description,
+                    }
+                # Query model to get action
+                action = get_action(
+                    server, observation
+                ).copy()
+                # Normalize gripper action [0,1] -> [-1,+1] because the environment expects the latter
+                # action = normalize_gripper_action(action, binarize=True)
+                action[..., -1] = np.sign(action[..., -1]) # binarize
+                # [OpenVLA] The dataloader flips the sign of the gripper action to align with other datasets
+                # (0 = close, 1 = open), so flip it back (-1 = open, +1 = close) before executing the action
+                # action = invert_gripper_action(action) # skip since we use raw action
+                print('==>action is',action)
+                # Execute action in environment
+                obs, reward, done, info = env.step(action.tolist())
+                if done:
+                    task_successes += 1
+                    total_successes += 1
+                    break
+                t += 1
+                # except Exception as e:
+                #     print(f"Caught exception: {e}")
+                #     log_file.write(f"Caught exception: {e}\n")
+                #     break
+            task_episodes += 1
+            total_episodes += 1
+            # Save a replay video of the episode
+            save_rollout_video(
+                replay_images, total_episodes, success=done, task_description=task_description, log_file=log_file, ckpt_index=ckpt_index, task_suite_name=cfg.task_suite_name, task_id=task_id
+            )
+            # Log current results
+            print(f"Success: {done}")
+            print(f"# episodes completed so far: {total_episodes}")
+            print(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)")
+            log_file.write(f"Success: {done}\n")
+            log_file.write(f"# episodes completed so far: {total_episodes}\n")
+            log_file.write(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)\n")
+            log_file.flush()
+        # Log final results
+        print(f"Current task success rate: {float(task_successes) / float(task_episodes)}")
+        print(f"Current total success rate: {float(total_successes) / float(total_episodes)}")
+        log_file.write(f"Current task success rate: {float(task_successes) / float(task_episodes)}\n")
+        log_file.write(f"Current total success rate: {float(total_successes) / float(total_episodes)}\n")
+        log_file.flush()
+        if cfg.use_wandb:
+            wandb.log(
+                {
+                    f"success_rate/{task_description}": float(task_successes) / float(task_episodes),
+                    f"num_episodes/{task_description}": task_episodes,
+                }
+            )
+    # Save local log file
+    log_file.close()
+    # Push total metrics and local log file to wandb
+    if cfg.use_wandb:
+        wandb.log(
+            {
+                "success_rate/total": float(total_successes) / float(total_episodes),
+                "num_episodes/total": total_episodes,
+            }
+        )
+        wandb.save(local_log_filepath)
+if __name__ == "__main__":
+    eval_libero()

experiment/libero/robot_utils.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""Utils for evaluating robot policies in various environments."""
+import os
+import random
+import time
+import numpy as np
+import torch
+# Initialize important constants and pretty-printing mode in NumPy.
+ACTION_DIM = 7
+DATE = time.strftime("%Y_%m_%d")
+DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
+np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
+def set_seed_everywhere(seed: int):
+    """Sets the random seed for Python, NumPy, and PyTorch functions."""
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    os.environ["PYTHONHASHSEED"] = str(seed)
+def get_model(cfg, wrap_diffusion_policy_for_droid=False):
+    """Load model for evaluation."""
+    from deploy.websocket_client_policy import WebsocketClientPolicy
+    cronus_server = WebsocketClientPolicy(port=cfg.model_port)
+    return None, cronus_server
+def get_image_resize_size(cfg):
+    """
+    Gets image resize size for a model class.
+    If `resize_size` is an int, then the resized image will be a square.
+    Else, the image will be a rectangle.
+    """
+    if cfg.model_family == "openvla" or "instruct_vla" in cfg.model_family:
+        resize_size = 224
+    else:
+        raise ValueError("Unexpected `model_family` found in config.")
+    return resize_size
+def get_action(server, obs):
+    """Queries the model to get an action."""
+    action = server.infer(obs)['action']
+    return action
+def normalize_gripper_action(action, binarize=True):
+    """
+    Changes gripper action (last dimension of action vector) from [0,1] to [-1,+1].
+    Necessary for some environments (not Bridge) because the dataset wrapper standardizes gripper actions to [0,1].
+    Note that unlike the other action dimensions, the gripper action is not normalized to [-1,+1] by default by
+    the dataset wrapper.
+    Normalization formula: y = 2 * (x - orig_low) / (orig_high - orig_low) - 1
+    """
+    # Just normalize the last action to [-1,+1].
+    orig_low, orig_high = 0.0, 1.0
+    action = np.array(action, copy=True)
+    action[..., -1] = 2 * (action[..., -1] - orig_low) / (orig_high - orig_low) - 1
+    if binarize:
+        # Binarize to -1 or +1.
+        action[..., -1] = np.sign(action[..., -1])
+    return action
+def invert_gripper_action(action):
+    """
+    Flips the sign of the gripper action (last dimension of action vector).
+    This is necessary for some environments where -1 = open, +1 = close, since
+    the RLDS dataloader aligns gripper actions such that 0 = close, 1 = open.
+    """
+    action[..., -1] = action[..., -1] * -1.0
+    return action

experiment/robotwin/README.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# Generate Lerobot Dataset from RoboTwin Data
+This guide explains how to process raw data from **RoboTwin** and convert it into the **LerobotDataset** format following the official RoboTwin instructions.
+## 1. Clone the Official RoboTwin Repository
+```bash
+git clone git@github.com:RoboTwin-Platform/RoboTwin.git
+```
+## 2. Create Required Directories
+Navigate to the `policy/pi0` directory inside the cloned RoboTwin repository and create the folders:
+```bash
+cd ./policy/pi0
+mkdir processed_data training_data
+```
+## 3. Convert RoboTwin Raw Data to HDF5
+Use the provided script [process_data_pi0.sh](https://github.com/RoboTwin-Platform/RoboTwin/blob/main/policy/pi0/process_data_pi0.sh):
+```bash
+bash process_data_pi0.sh ${task_name} ${task_config} ${expert_data_num}
+```
+**Example (clean demo):**
+```bash
+bash process_data_pi0.sh beat_block_hammer demo_clean 50
+```
+**Example (randomized demo):**
+```bash
+bash process_data_pi0.sh beat_block_hammer demo_randomized 50
+```
+If successful, the output is written to the folder:
+```
+processed_data/${task_name}-${task_config}-${expert_data_num}/
+```
+## 4. Prepare Training Data
+Copy the required processed datasets into `training_data/${model_name}`:
+```bash
+cp -r processed_data/${task_name}-${task_config}-${expert_data_num} \
+      training_data/${model_name}/
+```
+## 5. Ensure Sufficient Disk Space
+The generated **LerobotDataset** will be stored under:
+```
+$XDG_CACHE_HOME/huggingface/lerobot/${repo_id}
+```
+By default, `XDG_CACHE_HOME` points to `~/.cache`, which must have sufficient free space.
+If space is low, change the cache location:
+```bash
+export XDG_CACHE_HOME=/path/to/your/cache
+```
+## 6. Generate LerobotDataset Format
+Run [generate.sh](https://github.com/RoboTwin-Platform/RoboTwin/blob/main/policy/pi0/generate.sh) to convert the HDF5 datasets to the Lerobot format.
+Parameters:
+- **hdf5_path**: Path to the HDF5 training data (e.g., `./training_data/${model_name}/`)
+- **repo_id**: Name for the dataset (e.g., `my_repo`)
+```bash
+bash generate.sh ${hdf5_path} ${repo_id}
+```
+**Example:**
+```bash
+bash generate.sh ./training_data/demo_clean/ demo_clean_repo
+```
+Output:
+```
+${XDG_CACHE_HOME}/huggingface/lerobot/${repo_id}
+```

lingbotvla/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__version__ = "0.0.1"  # version string of the lingbotvla package

lingbotvla/checkpoint/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .checkpointer import build_checkpointer
+from .format_utils import bytecheckpoint_ckpt_to_state_dict, ckpt_to_state_dict, dcp_to_torch_state_dict
+__all__ = [  # public API of the checkpoint package: the names imported above from .checkpointer and .format_utils
+    "ckpt_to_state_dict",
+    "dcp_to_torch_state_dict",
+    "bytecheckpoint_ckpt_to_state_dict",
+    "build_checkpointer",
+]

lingbotvla/checkpoint/checkpointer.py ADDED Viewed

	@@ -0,0 +1,340 @@

+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+import torch
+import torch.distributed as dist
+from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
+from ..utils.import_utils import is_torch_version_greater_than
+from ..utils.logging import get_logger
+from pathlib import Path
+if is_torch_version_greater_than("2.4"):
+    import torch.distributed.checkpoint as dcp
+    from torch.distributed.checkpoint import (
+        FileSystemReader,
+        FileSystemWriter,
+    )
+    from torch.distributed.checkpoint.state_dict import (
+        get_model_state_dict,
+        get_optimizer_state_dict,
+        set_model_state_dict,
+        set_optimizer_state_dict,
+    )
+    from torch.distributed.checkpoint.stateful import Stateful
+else:
+    Stateful = ABC
+logger = get_logger(__name__)  # module-level logger; the checkpointer logs through its info_rank0 method
+_EXTRA_STATE_FORMAT = "extra_state_rank_{}.pt"  # per-rank extra-state file name; "{}" is filled with dist.get_rank()
+_MODEL_DIR = "model"  # sub-directory of a checkpoint that holds the model weights
+_EMA_DIR = "ema"  # sub-directory of a checkpoint that holds the EMA weights
+_OPTIMIZER_DIR = "optimizer"  # sub-directory of a checkpoint that holds the optimizer state
+_EXTRA_STATE_DIR = "extra_state"  # sub-directory of a checkpoint that holds the per-rank extra-state files
class ModelState(Stateful):
    """
    Stateful adapter that exposes a model's weights under the ``"model"`` key.

    Args:
        model (Model): model whose state dict is produced and restored.
    """

    def __init__(self, model):
        self.model = model

    def state_dict(self):
        """Return ``{"model": <model state dict>}`` built by ``get_model_state_dict``."""
        return {"model": get_model_state_dict(model=self.model)}

    def load_state_dict(self, state_dict):
        """Push ``state_dict["model"]`` back into the wrapped model via ``set_model_state_dict``."""
        weights = state_dict["model"]
        set_model_state_dict(model=self.model, model_state_dict=weights)
class OptimizerState(Stateful):
    """
    Stateful adapter that exposes an optimizer's state under the ``"optim"`` key.

    Args:
        model (Model): model the optimizer belongs to; passed to the torch state-dict helpers.
        optimizer (Optimizer): optimizer whose state is produced and restored.
    """

    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer

    def state_dict(self):
        """Return ``{"optim": <optimizer state dict>}`` built by ``get_optimizer_state_dict``."""
        return {"optim": get_optimizer_state_dict(model=self.model, optimizers=self.optimizer)}

    def load_state_dict(self, state_dict):
        """Push ``state_dict["optim"]`` back into the wrapped optimizer via ``set_optimizer_state_dict``."""
        optim_state = state_dict["optim"]
        set_optimizer_state_dict(model=self.model, optimizers=self.optimizer, optim_state_dict=optim_state)
def build_checkpointer(
    dist_backend: str = "fsdp1",
    ckpt_manager: str = "bytecheckpoint",
):
    """
    Select the checkpointer class for a checkpoint manager / distributed backend pair.

    Args:
        dist_backend (str, optional): distributed backend. Defaults to "fsdp1".
            With ckpt_manager="bytecheckpoint":
                ddp: bytecheckpoint DDPCheckpointer
                fsdp1: bytecheckpoint FSDPCheckpointer
                fsdp2-vescale: bytecheckpoint VeScaleCheckpointer
                fsdp2: bytecheckpoint FSDP2Checkpointer
            With ckpt_manager="dcp": one of ddp, fsdp1, fsdp2 (all map to DistributedCheckpointer).
        ckpt_manager (str, optional): checkpoint manager. Defaults to "bytecheckpoint".
            bytecheckpoint: bytecheckpoint checkpoint manager
            dcp: torch dcp checkpoint manager

    Raises:
        ValueError: if ckpt_manager is not supported, if dist_backend is not supported by the
            chosen manager, or if the torch version check for the dcp manager fails.

    Returns:
        Checkpointer: the checkpointer class (not an instance) for the given mode.
    """
    if ckpt_manager == "bytecheckpoint":
        # bytecheckpoint is imported lazily, only for the backend that is actually requested.
        if dist_backend == "ddp":
            from bytecheckpoint import DDPCheckpointer as Checkpointer
        elif dist_backend == "fsdp1":
            from bytecheckpoint import FSDPCheckpointer as Checkpointer
        elif dist_backend == "fsdp2-vescale":
            from bytecheckpoint import VeScaleCheckpointer as Checkpointer
        elif dist_backend == "fsdp2":
            from bytecheckpoint import FSDP2Checkpointer as Checkpointer
        else:
            # Without this branch an unknown backend reached `return Checkpointer`
            # and failed with an UnboundLocalError instead of a clear message.
            raise ValueError(
                f"Unsupported distributed backend: {dist_backend} for bytecheckpoint checkpoint manager, "
                "supported modes are: ddp, fsdp1, fsdp2-vescale, fsdp2"
            )
    elif ckpt_manager == "dcp":
        if not is_torch_version_greater_than("2.4"):
            raise ValueError("DCP checkpoint manager requires torch version >= 2.4")
        if dist_backend not in ["ddp", "fsdp1", "fsdp2"]:
            raise ValueError(
                f"Unsupported distributed backend: {dist_backend} for DCP checkpoint manager, supported modes are: ddp, fsdp1, fsdp2"
            )
        Checkpointer = DistributedCheckpointer
    else:
        # Only the two managers handled above exist in this function.
        raise ValueError(f"Unknown checkpoint manager: {ckpt_manager}, supported modes are: bytecheckpoint, dcp")
    return Checkpointer
class CheckpointerBase(ABC):
    """Base class for checkpointer

    Both hooks are abstract class methods: ``build_checkpointer`` hands back a
    checkpointer class rather than an instance, and ``DistributedCheckpointer``
    implements them as class methods.
    """

    @classmethod
    @abstractmethod
    def save(
        cls,
        path: str,
        state: Dict[str, Any],
    ):
        """
        Persist a training state.

        Args:
            path: location to save the checkpoint to.
            state: mapping of the objects to save.
        """
        return

    @classmethod
    @abstractmethod
    def load(
        cls,
        path: str,
        state: Dict[str, Any],
    ):
        """
        Restore a training state.

        Args:
            path: location to load the checkpoint from.
            state: mapping of the objects to restore into.
        """
        return
class DistributedCheckpointer(CheckpointerBase):
    """
    Distributed checkpointer for torch.distributed.checkpoint

    Layout written under the checkpoint directory:
        model/        model weights (always)
        ema/          EMA weights (only when state["ema"] is not None)
        optimizer/    optimizer state (only when state["optimizer"] is not None)
        extra_state/  one ``extra_state_rank_{rank}.pt`` per rank, written with torch.save
    """

    @staticmethod
    def _create_writer(output_dir, remove_stale_tmp=False):
        """Build the FileSystemWriter used for every DCP sub-directory of a checkpoint."""
        if remove_stale_tmp:
            tmp_path = Path(output_dir) / ".metadata.tmp"
            if tmp_path.exists():
                print(f"Warning: removing existing tmp file: {tmp_path}")
                # Every rank runs this check; tolerate another rank having
                # deleted the same leftover file between exists() and unlink().
                tmp_path.unlink(missing_ok=True)
        return FileSystemWriter(
            output_dir,
            thread_count=16,
            single_file_per_rank=True,
            sync_files=False,
        )

    @staticmethod
    def _dcp_targets(state):
        """Return ``(sub-directory, Stateful wrapper)`` pairs for the DCP-managed entries of ``state``."""
        targets = [(_MODEL_DIR, ModelState(state["model"]))]
        if state.get("ema") is not None:
            targets.append((_EMA_DIR, ModelState(state["ema"])))
        if state.get("optimizer") is not None:
            targets.append(
                (_OPTIMIZER_DIR, OptimizerState(model=state["model"], optimizer=state["optimizer"]))
            )
        return targets

    @classmethod
    def save(
        cls,
        path: str,
        state: Dict[str, Any],
        global_steps: int = None,
        save_async=False,
    ) -> None:
        """
        save training state to distributed checkpoint

        args:
            path: path to save checkpoint
            state: state to save; "model" is required, "ema", "optimizer" and
                "extra_state" are optional ("ema" / "optimizer" are skipped when None)
            global_steps: global steps; when given (including 0) the checkpoint is
                written to ``{path}/global_step_{global_steps}``, otherwise to ``path``
            save_async: whether to save asynchronously (dcp.async_save instead of dcp.save)
        return:
            None
        raises:
            ValueError: if "model" is missing from ``state``
        """
        if "model" not in state:
            raise ValueError("Model must be provided to save a distributed checkpoint.")

        # `is not None` so that step 0 still gets its own global_step_0 directory.
        checkpoint_dir = f"{path}/global_step_{global_steps}" if global_steps is not None else path
        os.makedirs(checkpoint_dir, exist_ok=True)

        for sub_dir, stateful in cls._dcp_targets(state):
            target_dir = os.path.join(checkpoint_dir, sub_dir)
            if save_async:
                # NOTE(review): the handle returned by dcp.async_save is discarded, so
                # completion/failure of the background write is not observed here and the
                # "Saved checkpoint" log below may be emitted before the data is on disk.
                # Stale .metadata.tmp cleanup is skipped on purpose: the file could belong
                # to an async save that is still running.
                dcp.async_save(
                    state_dict={"state": stateful},
                    storage_writer=cls._create_writer(target_dir),
                )
            else:
                dcp.save(
                    state_dict={"state": stateful},
                    storage_writer=cls._create_writer(target_dir, remove_stale_tmp=True),
                )

        if "extra_state" in state:
            extra_state_dir = os.path.join(checkpoint_dir, _EXTRA_STATE_DIR)
            os.makedirs(extra_state_dir, exist_ok=True)
            # Each rank writes its own file, keyed by its rank in the default process group.
            extra_state_path = os.path.join(extra_state_dir, _EXTRA_STATE_FORMAT.format(dist.get_rank()))
            torch.save(
                state["extra_state"],
                extra_state_path,
            )

        logger.info_rank0(f"Saved checkpoint to {checkpoint_dir}")

    @classmethod
    def load(
        cls,
        path: str,
        state: Dict[str, Any],
        process_group=None,
    ) -> Dict[str, Any]:
        """
        load training state from distributed checkpoint

        args:
            path: path to load checkpoint
            state: state to load, "model" are required,  "ema", "optimizer" and "extra_state" are optional
            process_group: forwarded to every dcp.load call
        return:
            state: state loaded (the same dict; "extra_state" is replaced by the loaded object)
        raises:
            ValueError: if ``state`` is None or has no "model" entry
        """
        checkpoint_dir = path
        if state is None:
            raise ValueError("State dict must be provided to load a distributed checkpoint.")
        if "model" not in state:
            raise ValueError("Model must be provided to load a distributed checkpoint.")

        if state.get("ema") is not None:
            ema_dir = os.path.join(checkpoint_dir, _EMA_DIR)
            dcp.load(
                state_dict={"state": ModelState(state["ema"])},
                storage_reader=FileSystemReader(ema_dir),
                process_group=process_group,
            )

        # The model is always restored, with or without an optimizer.
        model_dir = os.path.join(checkpoint_dir, _MODEL_DIR)
        dcp.load(
            state_dict={"state": ModelState(state["model"])},
            storage_reader=FileSystemReader(model_dir),
            process_group=process_group,
        )

        if state.get("optimizer") is not None:
            optimizer_dir = os.path.join(checkpoint_dir, _OPTIMIZER_DIR)
            try:
                dcp.load(
                    state_dict={"state": OptimizerState(model=state["model"], optimizer=state["optimizer"])},
                    storage_reader=FileSystemReader(optimizer_dir),
                    planner=DefaultLoadPlanner(allow_partial_load=True),
                    process_group=process_group,
                )
            except Exception as exc:
                # Optimizer restore stays best-effort, but no longer swallows
                # KeyboardInterrupt/SystemExit and now reports why it was skipped.
                logger.info_rank0(f"Skip loading Optimizer from {checkpoint_dir}: {exc!r}")

        if "extra_state" in state:
            extra_state_dir = os.path.join(checkpoint_dir, _EXTRA_STATE_DIR)
            extra_state_path = os.path.join(extra_state_dir, _EXTRA_STATE_FORMAT.format(dist.get_rank()))
            # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted
            # sources. The effective `weights_only` default depends on the installed torch version.
            state["extra_state"] = torch.load(
                extra_state_path,
            )

        logger.info_rank0(f"Loaded checkpoint from {checkpoint_dir}")
        return state