Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .DS_Store +0 -0
- .idea/.gitignore +8 -0
- .idea/workspace.xml +12 -0
- handler.py +215 -0
- openpi/.DS_Store +0 -0
- openpi/.dockerignore +3 -0
- openpi/.gitattributes +36 -0
- openpi/.github/CODEOWNERS +16 -0
- openpi/.github/workflows/pre-commit.yml +17 -0
- openpi/.github/workflows/test.yml +31 -0
- openpi/.gitignore +168 -0
- openpi/.gitmodules +6 -0
- openpi/.idea/.gitignore +8 -0
- openpi/.idea/workspace.xml +12 -0
- openpi/.pre-commit-config.yaml +16 -0
- openpi/.python-version +1 -0
- openpi/.vscode/settings.json +11 -0
- openpi/CONTRIBUTING.md +33 -0
- openpi/LICENSE +201 -0
- openpi/README.md +323 -0
- openpi/config.json +85 -0
- openpi/docs/docker.md +25 -0
- openpi/docs/norm_stats.md +69 -0
- openpi/docs/remote_inference.md +71 -0
- openpi/examples/aloha_real/Dockerfile +70 -0
- openpi/examples/aloha_real/README.md +126 -0
- openpi/examples/aloha_real/compose.yml +66 -0
- openpi/examples/aloha_real/constants.py +71 -0
- openpi/examples/aloha_real/convert_aloha_data_to_lerobot.py +272 -0
- openpi/examples/aloha_real/env.py +57 -0
- openpi/examples/aloha_real/main.py +51 -0
- openpi/examples/aloha_real/real_env.py +176 -0
- openpi/examples/aloha_real/requirements.in +18 -0
- openpi/examples/aloha_real/requirements.txt +156 -0
- openpi/examples/aloha_real/robot_utils.py +275 -0
- openpi/examples/aloha_real/video_display.py +36 -0
- openpi/examples/aloha_sim/Dockerfile +41 -0
- openpi/examples/aloha_sim/README.md +36 -0
- openpi/examples/aloha_sim/compose.yml +42 -0
- openpi/examples/aloha_sim/env.py +56 -0
- openpi/examples/aloha_sim/main.py +55 -0
- openpi/examples/aloha_sim/requirements.in +8 -0
- openpi/examples/aloha_sim/requirements.txt +132 -0
- openpi/examples/aloha_sim/saver.py +40 -0
- openpi/examples/convert_jax_model_to_pytorch.py +587 -0
- openpi/examples/droid/README.md +84 -0
- openpi/examples/droid/README_train.md +106 -0
- openpi/examples/droid/compute_droid_nonidle_ranges.py +103 -0
- openpi/examples/droid/convert_droid_data_to_lerobot.py +477 -0
- openpi/examples/droid/main.py +246 -0
.DS_Store ADDED
Binary file (10.2 kB)
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/workspace.xml ADDED
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent">{
+  "keyToString": {
+    "settings.editor.selected.configurable": "dev.sweep.assistant.settings.SweepSettingsConfigurable"
+  }
+}</component>
+</project>
handler.py ADDED
@@ -0,0 +1,215 @@
+import base64
+import json
+import os
+from io import BytesIO
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+from PIL import Image
+
+from openpi.policies import policy_config
+from openpi.training import config as train_config
+
+
+class EndpointHandler:
+    def __init__(self, path: str = ""):
+        """
+        Initialize the handler for pi0 model inference using openpi infrastructure.
+
+        Args:
+            path: Path to the model weights directory
+        """
+        # Set model path from environment variable or use provided path
+        model_path = os.environ.get("MODEL_PATH", path)
+        if not model_path:
+            model_path = "weights/pi0"
+
+        # Load the config.json to determine model type
+        config_path = os.path.join(model_path, "config.json")
+        with open(config_path, "r") as f:
+            model_config = json.load(f)
+
+        model_type = model_config.get("type", "pi0")
+
+        # Create training config based on model type
+        # This uses the openpi config system
+        if model_type == "pi0":
+            self.train_config = train_config.get_config("pi0")
+        else:
+            # Default to pi0 if type not recognized
+            self.train_config = train_config.get_config("pi0")
+
+        # Create trained policy using openpi infrastructure
+        # This handles all the model loading, preprocessing, etc.
+        self.policy = policy_config.create_trained_policy(
+            self.train_config,
+            model_path,
+            pytorch_device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu"
+        )
+
+        # Default number of inference steps
+        self.default_num_steps = 50
+
+    def _decode_base64_image(self, base64_str: str) -> np.ndarray:
+        """
+        Decode base64 image string to numpy array.
+
+        Args:
+            base64_str: Base64 encoded image string
+
+        Returns:
+            numpy array of shape (H, W, 3) with values in [0, 255]
+        """
+        # Remove data URL prefix if present
+        if base64_str.startswith("data:image"):
+            base64_str = base64_str.split(",", 1)[1]
+
+        # Decode base64
+        image_bytes = base64.b64decode(base64_str)
+
+        # Convert to PIL Image and then to numpy array
+        image = Image.open(BytesIO(image_bytes)).convert("RGB")
+        image_array = np.array(image)
+
+        return image_array
+
+    def _prepare_observation(self, images: Dict[str, str], state: List[float], prompt: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Prepare observation dictionary in the format expected by openpi.
+
+        Args:
+            images: Dictionary mapping camera names to base64 encoded images
+            state: List of robot state values
+            prompt: Optional text prompt
+
+        Returns:
+            Observation dictionary in openpi format
+        """
+        # Decode and process images
+        processed_images = {}
+
+        # Map input camera names to expected openpi format
+        # Based on the config, pi0 expects specific camera names
+        camera_mapping = {
+            "camera0": "cam_high",  # base camera
+            "camera1": "cam_left_wrist",  # left wrist camera
+            "camera2": "cam_right_wrist",  # right wrist camera
+            # Alternative mappings
+            "base_camera": "cam_high",
+            "left_wrist": "cam_left_wrist",
+            "right_wrist": "cam_right_wrist",
+            # Direct mappings
+            "cam_high": "cam_high",
+            "cam_left_wrist": "cam_left_wrist",
+            "cam_right_wrist": "cam_right_wrist"
+        }
+
+        for input_name, image_b64 in images.items():
+            # Map to openpi expected name
+            openpi_name = camera_mapping.get(input_name, input_name)
+
+            # Decode image
+            image_array = self._decode_base64_image(image_b64)
+
+            # Resize to expected resolution if needed
+            if image_array.shape[:2] != (224, 224):
+                image_pil = Image.fromarray(image_array)
+                image_resized = image_pil.resize((224, 224))
+                image_array = np.array(image_resized)
+
+            # Convert to format expected by openpi (H, W, C) with uint8
+            processed_images[openpi_name] = image_array.astype(np.uint8)
+
+        # Ensure we have the required cameras, create dummy ones if missing
+        required_cameras = ["cam_high", "cam_left_wrist", "cam_right_wrist"]
+        for cam_name in required_cameras:
+            if cam_name not in processed_images:
+                # Create a black dummy image
+                processed_images[cam_name] = np.zeros((224, 224, 3), dtype=np.uint8)
+
+        # Prepare state
+        state_array = np.array(state, dtype=np.float32)
+
+        # Create observation dict in openpi format
+        observation = {
+            "state": state_array,
+            "images": processed_images,
+        }
+
+        # Add prompt if provided
+        if prompt:
+            observation["prompt"] = prompt
+
+        return observation
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Main inference function called by HuggingFace endpoint.
+
+        Args:
+            data: Input data dictionary containing:
+                - inputs: Dictionary with:
+                    - images: Dict mapping camera names to base64 encoded images
+                    - state: List of robot state values
+                    - prompt: Optional text prompt
+                    - num_actions: Optional, number of actions to predict (default: 50)
+                    - noise: Optional, noise array for sampling
+
+        Returns:
+            List containing prediction results
+        """
+        try:
+            inputs = data.get("inputs", {})
+
+            # Extract inputs
+            images = inputs.get("images", {})
+            state = inputs.get("state", [])
+            prompt = inputs.get("prompt", "")
+            num_actions = inputs.get("num_actions", self.default_num_steps)
+            noise_input = inputs.get("noise", None)
+
+            # Validate inputs
+            if not images:
+                raise ValueError("No images provided")
+            if not state:
+                raise ValueError("No state provided")
+
+            # Prepare observation using openpi format
+            observation = self._prepare_observation(images, state, prompt)
+
+            # Prepare noise if provided
+            noise = None
+            if noise_input is not None:
+                noise = np.array(noise_input, dtype=np.float32)
+
+            # Run inference using openpi policy
+            # This handles all the preprocessing, model inference, and postprocessing
+            result = self.policy.infer(observation, noise=noise)
+
+            # Extract actions from result
+            actions = result["actions"]
+
+            # Convert to list format for JSON serialization
+            if isinstance(actions, np.ndarray):
+                actions_list = actions.tolist()
+            else:
+                actions_list = actions
+
+            # Return in expected format
+            return [{
+                "actions": actions_list,
+                "num_actions": len(actions_list),
+                "action_horizon": len(actions_list),
+                "action_dim": len(actions_list[0]) if actions_list else 0,
+                "success": True,
+                "metadata": {
+                    "model_type": self.train_config.model.model_type.value,
+                    "policy_metadata": getattr(self.policy, '_metadata', {})
+                }
+            }]
+
+        except Exception as e:
+            return [{
+                "error": str(e),
+                "success": False
+            }]
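For reference, here is a minimal sketch of how this handler could be exercised locally before deployment, e.g. as a smoke test. The checkpoint directory, camera name, and 14-dimensional state vector are illustrative assumptions rather than values mandated by handler.py; any cameras you omit are filled with black dummy images by `_prepare_observation`.

```python
import base64
from io import BytesIO

import numpy as np
from PIL import Image

from handler import EndpointHandler  # assumes handler.py is on the Python path


def encode_image(array: np.ndarray) -> str:
    """Encode an RGB uint8 array as a base64 PNG string, the format the handler decodes."""
    buffer = BytesIO()
    Image.fromarray(array).save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


# Hypothetical checkpoint directory; the handler also honors the MODEL_PATH env var.
handler = EndpointHandler(path="weights/pi0")

# Build a request matching the payload format documented in EndpointHandler.__call__.
payload = {
    "inputs": {
        "images": {"cam_high": encode_image(np.zeros((224, 224, 3), dtype=np.uint8))},
        "state": [0.0] * 14,  # assumed state dimension; substitute your robot's actual state
        "prompt": "pick up the fork",
    }
}

result = handler(payload)[0]
if result["success"]:
    print(f"{result['num_actions']} actions of dim {result['action_dim']}")
else:
    print("inference failed:", result["error"])
```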
openpi/.DS_Store ADDED
Binary file (8.2 kB)
openpi/.dockerignore ADDED
@@ -0,0 +1,3 @@
+.venv
+checkpoints
+data
openpi/.gitattributes ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
openpi/.github/CODEOWNERS ADDED
@@ -0,0 +1,16 @@
+# The CODEOWNERS file defines individuals or teams that are automatically requested for
+# review when someone opens a pull request that modifies certain code. When a draft pull
+# request is marked as ready for review, code owners are automatically notified.
+#
+# See: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
+#
+# This is a comment.
+# Each line is a file pattern followed by one or more owners.
+
+# Global owners.
+* @jimmyt857 @Michael-Equi @uzhilinsky
+
+src/openpi/models/ @kvablack @uzhilinsky
+src/openpi/training/ @kvablack @uzhilinsky
+
+scripts/ @jimmyt857 @kvablack @uzhilinsky
openpi/.github/workflows/pre-commit.yml ADDED
@@ -0,0 +1,17 @@
+name: pre-commit
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - "*"
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      GIT_LFS_SKIP_SMUDGE: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v3
+      - uses: pre-commit/action@v3.0.1
openpi/.github/workflows/test.yml ADDED
@@ -0,0 +1,31 @@
+name: Test
+on:
+  pull_request:
+    branches:
+      - "*"
+
+jobs:
+  run_tests:
+    name: Run Tests
+    runs-on: openpi-verylarge
+    env:
+      GIT_LFS_SKIP_SMUDGE: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install FFmpeg dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Set up Python
+        run: uv python install
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run tests
+        run: uv run pytest --strict-markers -m "not manual"
openpi/.gitignore ADDED
@@ -0,0 +1,168 @@
+# Data directories.
+assets/
+checkpoints/
+data/
+wandb/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
openpi/.gitmodules ADDED
@@ -0,0 +1,6 @@
+[submodule "third_party/aloha"]
+	path = third_party/aloha
+	url = https://github.com/Physical-Intelligence/aloha.git
+[submodule "third_party/libero"]
+	path = third_party/libero
+	url = https://github.com/Lifelong-Robot-Learning/LIBERO.git
openpi/.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
openpi/.idea/workspace.xml ADDED
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent">{
+  "keyToString": {
+    "settings.editor.selected.configurable": "dev.sweep.assistant.settings.SweepSettingsConfigurable"
+  }
+}</component>
+</project>
openpi/.pre-commit-config.yaml ADDED
@@ -0,0 +1,16 @@
+exclude: third_party/
+
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    # uv version.
+    rev: 0.5.14
+    hooks:
+      - id: uv-lock
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.8.6
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
openpi/.python-version ADDED
@@ -0,0 +1 @@
+3.11
openpi/.vscode/settings.json ADDED
@@ -0,0 +1,11 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "charliermarsh.ruff",
+        "editor.formatOnSave": true,
+    },
+    "python.testing.pytestArgs": [
+        "src"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
openpi/CONTRIBUTING.md ADDED
@@ -0,0 +1,33 @@
+# Contributing to openpi
+
+We welcome contributions, improvements, and modifications. Everyone is welcome to use openpi in accordance with the [license](LICENSE). Contributors are also welcome to submit bug reports, feature requests, and pull requests. We can't promise to approve every pull request, and we are a small team with limited bandwidth to review all requests, but we'll give it our best effort. Specifics are described below.
+
+## Issues and feature requests
+
+You are welcome to use the GitHub [discussion](https://github.com/Physical-Intelligence/openpi/discussions) feature if you would like to discuss something that is not directly reporting an issue or making a feature request. This is suitable for questions about how to use some aspect of openpi, or other topics.
+
+If you found a bug or other issue, please first check that the issue was not already reported (use the search bar on GitHub under Issues). If the issue has not yet been reported, please include this information when filing a GitHub issue:
+
+- Your OS type and version, and the version of Python you are using
+- Code that allows us to reproduce your bug, including all dependencies
+- Traceback of any exception
+- Any other information that would help us, such as a screenshot
+
+In order for us to address any issue, we must be able to reproduce it, so if you encountered the issue after making modifications to openpi, please reproduce the issue without any other modifications and provide a code snippet that allows us to quickly reproduce the problem on `main`.
+
+If you would like to submit a feature request, please check that the feature request does not already exist, and please provide the following information:
+
+- The motivation for the feature
+- A description of the problem you are trying to solve or your use case
+- Enough information for us to understand the nature of the request
+- Some information on how you intend to use it (this might help us understand the motivation!)
+
+We can't promise to support every feature request, but it is helpful to us to know the use cases that you are interested in!
+
+## Submitting a pull request
+
+If you implemented support for a new robot or environment, or some other new feature, we welcome pull requests (PRs) to openpi. We encourage you to create a [feature request](https://github.com/Physical-Intelligence/openpi/issues) or make a post on the [discussion](https://github.com/Physical-Intelligence/openpi/discussions) board before starting work on your PR, if you would like to get a sense for whether we are likely to approve it once submitted. Since we are a small team with limited ability to provide maintenance and support, we may not accept all PRs (e.g., if we believe a change would make the code harder to maintain, or if reviewing the PR is out of scope for us), so contacting us in advance is a good way to gauge whether your PR is likely to be merged into openpi directly. Even if it isn't, you are of course more than welcome to maintain your own fork with whatever modifications you would like. When creating a PR, please consider the following:
+
+- Make sure that your PR has a clear title and description
+- Run `pre-commit` (install it with `pre-commit install` first), and run `ruff check .` and `ruff format .`
+- Make sure your PR passes all tests
openpi/LICENSE ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
openpi/README.md
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# openpi
|
| 2 |
+
|
| 3 |
+
openpi holds open-source models and packages for robotics, published by the [Physical Intelligence team](https://www.physicalintelligence.company/).
|
| 4 |
+
|
| 5 |
+
Currently, this repo contains three types of models:
|
| 6 |
+
- the [π₀ model](https://www.physicalintelligence.company/blog/pi0), a flow-based vision-language-action model (VLA).
|
| 7 |
+
- the [π₀-FAST model](https://www.physicalintelligence.company/research/fast), an autoregressive VLA, based on the FAST action tokenizer.
|
| 8 |
+
- the [π₀.₅ model](https://www.physicalintelligence.company/blog/pi05), an upgraded version of π₀ with better open-world generalization trained with [knowledge insulation](https://www.physicalintelligence.company/research/knowledge_insulation). Note that, in this repository, we currently only support the flow matching head for both $\pi_{0.5}$ training and inference.
|
| 9 |
+
|
| 10 |
+
For all models, we provide _base model_ checkpoints, pre-trained on 10k+ hours of robot data, and examples for using them out of the box or fine-tuning them to your own datasets.
|
| 11 |
+
|
| 12 |
+
This is an experiment: $\pi_0$ was developed for our own robots, which differ from the widely used platforms such as [ALOHA](https://tonyzhaozh.github.io/aloha/) and [DROID](https://droid-dataset.github.io/), and though we are optimistic that researchers and practitioners will be able to run creative new experiments adapting $\pi_0$ to their own platforms, we do not expect every such attempt to be successful. All this is to say: $\pi_0$ may or may not work for you, but you are welcome to try it and see!
|
| 13 |
+
|
| 14 |
+
## Updates
|
| 15 |
+
|
| 16 |
+
- [Sept 2025] We released PyTorch support in openpi.
|
| 17 |
+
- [Sept 2025] We released pi05, an upgraded version of pi0 with better open-world generalization.
|
| 18 |
+
- [Sept 2025]: We have added an [improved idle filter](examples/droid/README_train.md#data-filtering) for DROID training.
|
| 19 |
+
- [Jun 2025]: We have added [instructions](examples/droid/README_train.md) for using `openpi` to train VLAs on the full [DROID dataset](https://droid-dataset.github.io/). This is an approximate open-source implementation of the training pipeline used to train pi0-FAST-DROID.
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## Requirements
|
| 23 |
+
|
| 24 |
+
To run the models in this repository, you will need an NVIDIA GPU with at least the following specifications. These estimations assume a single GPU, but you can also use multiple GPUs with model parallelism to reduce per-GPU memory requirements by configuring `fsdp_devices` in the training config. Please also note that the current training script does not yet support multi-node training.
|
| 25 |
+
|
| 26 |
+
| Mode | Memory Required | Example GPU |
|
| 27 |
+
| ------------------ | --------------- | ------------------ |
|
| 28 |
+
| Inference | > 8 GB | RTX 4090 |
|
| 29 |
+
| Fine-Tuning (LoRA) | > 22.5 GB | RTX 4090 |
|
| 30 |
+
| Fine-Tuning (Full) | > 70 GB | A100 (80GB) / H100 |
|
| 31 |
+
|
| 32 |
+
The repo has been tested with Ubuntu 22.04, we do not currently support other operating systems.
|
| 33 |
+
|
| 34 |
+
## Installation
|
| 35 |
+
|
| 36 |
+
When cloning this repo, make sure to update submodules:
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
git clone --recurse-submodules git@github.com:Physical-Intelligence/openpi.git
|
| 40 |
+
|
| 41 |
+
# Or if you already cloned the repo:
|
| 42 |
+
git submodule update --init --recursive
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
We use [uv](https://docs.astral.sh/uv/) to manage Python dependencies. See the [uv installation instructions](https://docs.astral.sh/uv/getting-started/installation/) to set it up. Once uv is installed, run the following to set up the environment:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
GIT_LFS_SKIP_SMUDGE=1 uv sync
|
| 49 |
+
GIT_LFS_SKIP_SMUDGE=1 uv pip install -e .
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
NOTE: `GIT_LFS_SKIP_SMUDGE=1` is needed to pull LeRobot as a dependency.
|
| 53 |
+
|
| 54 |
+
**Docker**: As an alternative to uv installation, we provide instructions for installing openpi using Docker. If you encounter issues with your system setup, consider using Docker to simplify installation. See [Docker Setup](docs/docker.md) for more details.
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
## Model Checkpoints
|
| 60 |
+
|
| 61 |
+
### Base Models
|
| 62 |
+
We provide multiple base VLA model checkpoints. These checkpoints have been pre-trained on 10k+ hours of robot data, and can be used for fine-tuning.
|
| 63 |
+
|
| 64 |
+
| Model | Use Case | Description | Checkpoint Path |
|
| 65 |
+
| ------------ | ----------- | ----------------------------------------------------------------------------------------------------------- | ---------------------------------------------- |
|
| 66 |
+
| $\pi_0$ | Fine-Tuning | Base [π₀ model](https://www.physicalintelligence.company/blog/pi0) for fine-tuning | `gs://openpi-assets/checkpoints/pi0_base` |
|
| 67 |
+
| $\pi_0$-FAST | Fine-Tuning | Base autoregressive [π₀-FAST model](https://www.physicalintelligence.company/research/fast) for fine-tuning | `gs://openpi-assets/checkpoints/pi0_fast_base` |
|
| 68 |
+
| $\pi_{0.5}$ | Fine-Tuning | Base [π₀.₅ model](https://www.physicalintelligence.company/blog/pi05) for fine-tuning | `gs://openpi-assets/checkpoints/pi05_base` |
|
| 69 |
+
|
| 70 |
+
### Fine-Tuned Models
|
| 71 |
+
We also provide "expert" checkpoints for various robot platforms and tasks. These models are fine-tuned from the base models above and intended to run directly on the target robot. These may or may not work on your particular robot. Since these checkpoints were fine-tuned on relatively small datasets collected with more widely available robots, such as ALOHA and the DROID Franka setup, they might not generalize to your particular setup, though we found some of these, especially the DROID checkpoint, to generalize quite broadly in practice.
|
| 72 |
+
|
| 73 |
+
| Model | Use Case | Description | Checkpoint Path |
|
| 74 |
+
| ------------------------ | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
|
| 75 |
+
| $\pi_0$-FAST-DROID | Inference | $\pi_0$-FAST model fine-tuned on the [DROID dataset](https://droid-dataset.github.io/): can perform a wide range of simple table-top manipulation tasks 0-shot in new scenes on the DROID robot platform | `gs://openpi-assets/checkpoints/pi0_fast_droid` |
|
| 76 |
+
| $\pi_0$-DROID | Fine-Tuning | $\pi_0$ model fine-tuned on the [DROID dataset](https://droid-dataset.github.io/): faster inference than $\pi_0$-FAST-DROID, but may not follow language commands as well | `gs://openpi-assets/checkpoints/pi0_droid` |
|
| 77 |
+
| $\pi_0$-ALOHA-towel | Inference | $\pi_0$ model fine-tuned on internal [ALOHA](https://tonyzhaozh.github.io/aloha/) data: can fold diverse towels 0-shot on ALOHA robot platforms | `gs://openpi-assets/checkpoints/pi0_aloha_towel` |
|
| 78 |
+
| $\pi_0$-ALOHA-tupperware | Inference | $\pi_0$ model fine-tuned on internal [ALOHA](https://tonyzhaozh.github.io/aloha/) data: can unpack food from a tupperware container | `gs://openpi-assets/checkpoints/pi0_aloha_tupperware` |
|
| 79 |
+
| $\pi_0$-ALOHA-pen-uncap | Inference | $\pi_0$ model fine-tuned on public [ALOHA](https://dit-policy.github.io/) data: can uncap a pen | `gs://openpi-assets/checkpoints/pi0_aloha_pen_uncap` |
|
| 80 |
+
| $\pi_{0.5}$-LIBERO | Inference | $\pi_{0.5}$ model fine-tuned for the [LIBERO](https://libero-project.github.io/datasets) benchmark: gets state-of-the-art performance (see [LIBERO README](examples/libero/README.md)) | `gs://openpi-assets/checkpoints/pi05_libero` |
|
| 81 |
+
| $\pi_{0.5}$-DROID | Inference / Fine-Tuning | $\pi_{0.5}$ model fine-tuned on the [DROID dataset](https://droid-dataset.github.io/) with [knowledge insulation](https://www.physicalintelligence.company/research/knowledge_insulation): fast inference and good language-following | `gs://openpi-assets/checkpoints/pi05_droid` |
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
By default, checkpoints are automatically downloaded from `gs://openpi-assets` and are cached in `~/.cache/openpi` when needed. You can overwrite the download path by setting the `OPENPI_DATA_HOME` environment variable.
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
## Running Inference for a Pre-Trained Model
|
| 90 |
+
|
| 91 |
+
Our pre-trained model checkpoints can be run with a few lines of code (here our $\pi_0$-FAST-DROID model):
|
| 92 |
+
```python
|
| 93 |
+
from openpi.training import config as _config
|
| 94 |
+
from openpi.policies import policy_config
|
| 95 |
+
from openpi.shared import download
|
| 96 |
+
|
| 97 |
+
config = _config.get_config("pi05_droid")
|
| 98 |
+
checkpoint_dir = download.maybe_download("gs://openpi-assets/checkpoints/pi05_droid")
|
| 99 |
+
|
| 100 |
+
# Create a trained policy.
|
| 101 |
+
policy = policy_config.create_trained_policy(config, checkpoint_dir)
|
| 102 |
+
|
| 103 |
+
# Run inference on a dummy example.
|
| 104 |
+
example = {
|
| 105 |
+
"observation/exterior_image_1_left": ...,
|
| 106 |
+
"observation/wrist_image_left": ...,
|
| 107 |
+
...
|
| 108 |
+
"prompt": "pick up the fork"
|
| 109 |
+
}
|
| 110 |
+
action_chunk = policy.infer(example)["actions"]
|
| 111 |
+
```
|
| 112 |
+
You can also test this out in the [example notebook](examples/inference.ipynb).

We provide detailed step-by-step examples for running inference with our pre-trained checkpoints on [DROID](examples/droid/README.md) and [ALOHA](examples/aloha_real/README.md) robots.

**Remote Inference**: We provide [examples and code](docs/remote_inference.md) for running inference of our models **remotely**: the model can run on a different server and stream actions to the robot via a websocket connection. This makes it easy to use more powerful GPUs off-robot and keeps the robot and policy environments separate.

**Test inference without a robot**: We provide a [script](examples/simple_client/README.md) that generates a random observation and runs inference with the model, so you can test inference without any robot hardware. See the [simple client README](examples/simple_client/README.md) for more details.
## Fine-Tuning Base Models on Your Own Data

We will fine-tune the $\pi_{0.5}$ model on the [LIBERO dataset](https://libero-project.github.io/datasets) as a running example for how to fine-tune a base model on your own data. We will explain three steps:
1. Converting your data to a LeRobot dataset (which we use for training)
2. Defining training configs and running training
3. Spinning up a policy server and running inference

### 1. Convert your data to a LeRobot dataset

We provide a minimal example script for converting LIBERO data to a LeRobot dataset in [`examples/libero/convert_libero_data_to_lerobot.py`](examples/libero/convert_libero_data_to_lerobot.py). You can easily modify it to convert your own data! You can download the raw LIBERO dataset from [here](https://huggingface.co/datasets/openvla/modified_libero_rlds), and run the script with:

```bash
uv run examples/libero/convert_libero_data_to_lerobot.py --data_dir /path/to/your/libero/data
```

**Note:** If you just want to fine-tune on LIBERO, you can skip this step, because our LIBERO fine-tuning configs point to a pre-converted LIBERO dataset. This step is merely an example that you can adapt to your own data; the general pattern is sketched below.
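For orientation, here is a minimal sketch of that conversion pattern against the lerobot API used by this repo's conversion scripts. The feature layout, repo id, robot type, and the synthetic zero-filled episode are placeholder assumptions for a hypothetical single-arm robot, not the LIBERO layout:

```python
import numpy as np
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Placeholder feature layout for a hypothetical 7-DoF single-arm robot.
features = {
    "observation.state": {"dtype": "float32", "shape": (7,), "names": [["j0", "j1", "j2", "j3", "j4", "j5", "gripper"]]},
    "action": {"dtype": "float32", "shape": (7,), "names": [["j0", "j1", "j2", "j3", "j4", "j5", "gripper"]]},
}

dataset = LeRobotDataset.create(
    repo_id="my_org/my_dataset",  # placeholder repo id
    fps=20,
    robot_type="my_robot",  # placeholder robot type
    features=features,
)

# One synthetic 100-step episode, standing in for your raw trajectories.
for t in range(100):
    dataset.add_frame(
        {
            "observation.state": np.zeros(7, dtype=np.float32),
            "action": np.zeros(7, dtype=np.float32),
        }
    )
dataset.save_episode(task="describe the task in natural language")
```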
### 2. Defining training configs and running training

To fine-tune a base model on your own data, you need to define configs for data processing and training. We provide example configs with detailed comments for LIBERO below, which you can modify for your own dataset:

- [`LiberoInputs` and `LiberoOutputs`](src/openpi/policies/libero_policy.py): Defines the data mapping from the LIBERO environment to the model and vice versa. Used for both training and inference.
- [`LeRobotLiberoDataConfig`](src/openpi/training/config.py): Defines how to process raw LIBERO data from the LeRobot dataset for training.
- [`TrainConfig`](src/openpi/training/config.py): Defines fine-tuning hyperparameters, data config, and weight loader.

We provide example fine-tuning configs for [π₀](src/openpi/training/config.py), [π₀-FAST](src/openpi/training/config.py), and [π₀.₅](src/openpi/training/config.py) on LIBERO data.

Before we can run training, we need to compute the normalization statistics for the training data. Run the script below with the name of your training config:

```bash
uv run scripts/compute_norm_stats.py --config-name pi05_libero
```

Now we can kick off training with the following command (the `--overwrite` flag is used to overwrite existing checkpoints if you rerun fine-tuning with the same config):

```bash
XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 uv run scripts/train.py pi05_libero --exp-name=my_experiment --overwrite
```

The command will log training progress to the console and save checkpoints to the `checkpoints` directory. You can also monitor training progress on the Weights & Biases dashboard. To make maximal use of the GPU memory, set `XLA_PYTHON_CLIENT_MEM_FRACTION=0.9` before running training -- this allows JAX to use up to 90% of the GPU memory (vs. the default of 75%).

**Note:** We provide functionality for *reloading* normalization statistics for state / action normalization from pre-training. This can be beneficial if you are fine-tuning to a new task on a robot that was part of our pre-training mixture. For more details on how to reload normalization statistics, see [norm_stats.md](docs/norm_stats.md).
### 3. Spinning up a policy server and running inference

Once training is complete, we can run inference by spinning up a policy server and then querying it from a LIBERO evaluation script. Launching a model server is easy (we use the checkpoint for iteration 20,000 for this example; modify as needed):

```bash
uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi05_libero --policy.dir=checkpoints/pi05_libero/my_experiment/20000
```

This will spin up a server that listens on port 8000 and waits for observations to be sent to it. We can then run an evaluation script (or robot runtime) that queries the server.

For running the LIBERO eval in particular, we provide (and recommend using) a Dockerized workflow that handles both the policy server and the evaluation script together. See the [LIBERO README](examples/libero/README.md) for more details.

If you want to embed a policy server call in your own robot runtime, we have a minimal example of how to do so in the [remote inference docs](docs/remote_inference.md).

### More Examples

We provide more examples of how to fine-tune and run inference with our models on the ALOHA and UR5 platforms in the following READMEs:
- [ALOHA Simulator](examples/aloha_sim)
- [ALOHA Real](examples/aloha_real)
- [UR5](examples/ur5)
## PyTorch Support

openpi now provides PyTorch implementations of the π₀ and π₀.₅ models alongside the original JAX versions! The PyTorch implementation has been validated on the LIBERO benchmark (both inference and fine-tuning). A few features are currently not supported (this may change in the future):

- The π₀-FAST model
- Mixed precision training
- FSDP (fully-sharded data parallelism) training
- LoRA (low-rank adaptation) training
- EMA (exponential moving average) weights during training

### Setup
1. Make sure that you have the latest version of all dependencies installed: `uv sync`

2. Double check that you have transformers 4.53.2 installed: `uv pip show transformers`

3. Apply the transformers library patches:
```bash
cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/
```

This overwrites several files in the transformers library with necessary model changes: 1) supporting AdaRMS, 2) correctly controlling the precision of activations, and 3) allowing the KV cache to be used without being updated.

**WARNING**: With the default uv link mode (hardlink), this will permanently affect the transformers library in your uv cache, meaning the changes will survive reinstallations of transformers and could even propagate to other projects that use transformers. To fully undo this operation, you must run `uv cache clean transformers`.
### Converting JAX Models to PyTorch

To convert a JAX model checkpoint to PyTorch format:

```bash
uv run examples/convert_jax_model_to_pytorch.py \
    --checkpoint_dir /path/to/jax/checkpoint \
    --config_name <config name> \
    --output_path /path/to/converted/pytorch/checkpoint
```

### Running Inference with PyTorch

The PyTorch implementation uses the same API as the JAX version -- you only need to change the checkpoint path to point to the converted PyTorch model:

```python
from openpi.training import config as _config
from openpi.policies import policy_config

config = _config.get_config("pi05_droid")
checkpoint_dir = "/path/to/converted/pytorch/checkpoint"

# Create a trained policy (automatically detects PyTorch format).
policy = policy_config.create_trained_policy(config, checkpoint_dir)

# Run inference (same API as JAX); `example` is an observation dict
# constructed as in the JAX snippet above.
action_chunk = policy.infer(example)["actions"]
```
### Policy Server with PyTorch

The policy server works identically with PyTorch models -- just point to the converted checkpoint directory:

```bash
uv run scripts/serve_policy.py policy:checkpoint \
    --policy.config=pi05_droid \
    --policy.dir=/path/to/converted/pytorch/checkpoint
```

### Fine-Tuning with PyTorch

To fine-tune a model in PyTorch:

1. Convert the JAX base model to PyTorch format:
```bash
uv run examples/convert_jax_model_to_pytorch.py \
    --config_name <config name> \
    --checkpoint_dir /path/to/jax/base/model \
    --output_path /path/to/pytorch/base/model
```

2. Specify the converted PyTorch model path in your config using `pytorch_weight_path`.

3. Launch training using one of these modes:

```bash
# Single-GPU training:
uv run scripts/train_pytorch.py <config_name> --exp_name <run_name> --save_interval <interval>

# Example:
uv run scripts/train_pytorch.py debug --exp_name pytorch_test
uv run scripts/train_pytorch.py debug --exp_name pytorch_test --resume  # Resume from latest checkpoint

# Multi-GPU training (single node):
uv run torchrun --standalone --nnodes=1 --nproc_per_node=<num_gpus> scripts/train_pytorch.py <config_name> --exp_name <run_name>

# Example:
uv run torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test
uv run torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test --resume

# Multi-node training:
uv run torchrun \
    --nnodes=<num_nodes> \
    --nproc_per_node=<gpus_per_node> \
    --node_rank=<rank_of_node> \
    --master_addr=<master_ip> \
    --master_port=<port> \
    scripts/train_pytorch.py <config_name> --exp_name=<run_name> --save_interval <interval>
```

### Precision Settings

The JAX and PyTorch implementations handle precision as follows:

**JAX:**
1. Inference: most weights and computations in bfloat16, with a few computations in float32 for stability.
2. Training: defaults to mixed precision: weights and gradients in float32, (most) activations and computations in bfloat16. You can switch to full float32 training by setting `dtype` to float32 in the config.

**PyTorch:**
1. Inference: matches JAX -- most weights and computations in bfloat16, with a few weights converted to float32 for stability.
2. Training: supports either full bfloat16 (default) or full float32; change it by setting `pytorch_training_precision` in the config. bfloat16 uses less memory but exhibits higher losses compared to float32. Mixed precision is not yet supported.

With `torch.compile`, inference speed is comparable between JAX and PyTorch.
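To get a feel for the bfloat16-vs-float32 trade-off mentioned above, here is a small, self-contained sketch in plain PyTorch (not openpi-specific) that compares the rounding error of the two dtypes on a single matmul:

```python
import torch

torch.manual_seed(0)
x = torch.randn(1024, 1024)
w = torch.randn(1024, 1024)

# Full float32 matmul as the reference.
ref = x @ w

# The same matmul in bfloat16: half the memory per element, but a much
# coarser mantissa, mirroring the higher training losses noted above.
approx = (x.bfloat16() @ w.bfloat16()).float()

print("max abs error:", (approx - ref).abs().max().item())
```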
## Troubleshooting

We will collect common issues and their solutions here. If you encounter an issue, please check here first. If you can't find a solution, please file an issue on the repo (see [here](CONTRIBUTING.md) for guidelines).

| Issue | Resolution |
| --- | --- |
| `uv sync` fails with dependency conflicts | Try removing the virtual environment directory (`rm -rf .venv`) and running `uv sync` again. If issues persist, check that you have the latest version of `uv` installed (`uv self update`). |
| Training runs out of GPU memory | Make sure you set `XLA_PYTHON_CLIENT_MEM_FRACTION=0.9` (or higher) before running training to allow JAX to use more GPU memory. You can also use `--fsdp-devices <n>`, where `<n>` is your number of GPUs, to enable [fully-sharded data parallelism](https://engineering.fb.com/2021/07/15/open-source/fsdp/), which reduces memory usage in exchange for slower training (the amount of slowdown depends on your particular setup). If you are still running out of memory, you may want to consider disabling EMA. |
| Policy server connection errors | Check that the server is running and listening on the expected port. Verify network connectivity and firewall settings between client and server. |
| Missing norm stats error when training | Run `scripts/compute_norm_stats.py` with your config name before starting training. |
| Dataset download fails | Check your internet connection. For HuggingFace datasets, ensure you're logged in (`huggingface-cli login`). |
| CUDA/GPU errors | Verify NVIDIA drivers are installed correctly. For Docker, ensure nvidia-container-toolkit is installed. Check GPU compatibility. You do NOT need CUDA libraries installed at a system level --- they will be installed via uv. You may even want to try *uninstalling* system CUDA libraries if you run into CUDA issues, since system libraries can sometimes cause conflicts. |
| Import errors when running examples | Make sure you've installed all dependencies with `uv sync`. Some examples may have additional requirements listed in their READMEs. |
| Action dimensions mismatch | Verify your data processing transforms match the expected input/output dimensions of your robot. Check the action space definitions in your policy classes. |
| Diverging training loss | Check the `q01`, `q99`, and `std` values in `norm_stats.json` for your dataset. Certain dimensions that are rarely used can end up with very small `q01`, `q99`, or `std` values, leading to huge states and actions after normalization. You can manually adjust the norm stats as a workaround. |
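Following the last row above, a quick way to spot problematic dimensions is to scan the stats file directly. A minimal sketch; both the file path and the exact JSON nesting (`norm_stats` / `actions` / per-key lists) are assumptions here, so inspect your own `norm_stats.json` to confirm the keys before relying on this:

```python
import json
from pathlib import Path

# Placeholder path: adjust to wherever compute_norm_stats.py wrote your stats.
stats = json.loads(Path("assets/my_config/my_dataset/norm_stats.json").read_text())

action_stats = stats["norm_stats"]["actions"]  # assumed nesting
for i, (q01, q99, std) in enumerate(
    zip(action_stats["q01"], action_stats["q99"], action_stats["std"])
):
    # Tiny q01/q99/std values blow up normalized actions; flag them.
    if abs(q99 - q01) < 1e-3 or std < 1e-3:
        print(f"dim {i}: q01={q01:.2e} q99={q99:.2e} std={std:.2e}  <-- suspicious")
```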
openpi/config.json
ADDED
@@ -0,0 +1,85 @@
{
    "type": "pi0",
    "n_obs_steps": 1,
    "input_features": {
        "observation.state": {
            "type": "STATE",
            "shape": [
                6
            ]
        },
        "observation.images.camera0": {
            "type": "VISUAL",
            "shape": [
                3,
                480,
                640
            ]
        },
        "observation.images.camera1": {
            "type": "VISUAL",
            "shape": [
                3,
                480,
                640
            ]
        },
        "observation.images.camera2": {
            "type": "VISUAL",
            "shape": [
                3,
                480,
                640
            ]
        }
    },
    "output_features": {
        "action": {
            "type": "ACTION",
            "shape": [
                6
            ]
        }
    },
    "device": "cpu",
    "use_amp": false,
    "push_to_hub": true,
    "repo_id": null,
    "private": null,
    "tags": null,
    "license": null,
    "chunk_size": 50,
    "n_action_steps": 50,
    "normalization_mapping": {
        "VISUAL": "IDENTITY",
        "STATE": "MEAN_STD",
        "ACTION": "MEAN_STD"
    },
    "max_state_dim": 32,
    "max_action_dim": 32,
    "resize_imgs_with_padding": [
        224,
        224
    ],
    "empty_cameras": 0,
    "adapt_to_pi_aloha": false,
    "use_delta_joint_actions_aloha": false,
    "tokenizer_max_length": 48,
    "proj_width": 1024,
    "num_steps": 10,
    "use_cache": true,
    "attention_implementation": "eager",
    "freeze_vision_encoder": true,
    "train_expert_only": false,
    "train_state_proj": true,
    "optimizer_lr": 2.5e-05,
    "optimizer_betas": [
        0.9,
        0.95
    ],
    "optimizer_eps": 1e-08,
    "optimizer_weight_decay": 1e-10,
    "scheduler_warmup_steps": 1000,
    "scheduler_decay_steps": 30000,
    "scheduler_decay_lr": 2.5e-06
}
openpi/docs/docker.md
ADDED
@@ -0,0 +1,25 @@
### Docker Setup

All of the examples in this repo provide instructions for running both natively and with Docker. Although not required, we recommend the Docker option: it simplifies software installation, produces a more stable environment, and, for examples that depend on ROS, lets you avoid installing ROS and cluttering your machine.

- Basic Docker installation instructions are [here](https://docs.docker.com/engine/install/).
- Docker must be installed in [rootless mode](https://docs.docker.com/engine/security/rootless/).
- To use your GPU you must also install the [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
- The version of Docker installed with `snap` is incompatible with the NVIDIA container toolkit, preventing it from accessing `libnvidia-ml.so` ([issue](https://github.com/NVIDIA/nvidia-container-toolkit/issues/154)). The snap version can be uninstalled with `sudo snap remove docker`.
- Docker Desktop is also incompatible with the NVIDIA runtime ([issue](https://github.com/NVIDIA/nvidia-container-toolkit/issues/229)). Docker Desktop can be uninstalled with `sudo apt remove docker-desktop`.

If starting from scratch on an Ubuntu 22.04 host machine, you can accomplish all of the above with the convenience scripts `scripts/docker/install_docker_ubuntu22.sh` and `scripts/docker/install_nvidia_container_toolkit.sh`.

Build the Docker image and start the container with the following command:
```bash
docker compose -f scripts/docker/compose.yml up --build
```

To build and run the Docker image for a specific example, use the following command:
```bash
docker compose -f examples/<example_name>/compose.yml up --build
```
where `<example_name>` is the name of the example you want to run.

During the first run of any example, Docker will build the images. Go grab a coffee while this happens. Subsequent runs will be faster since the images are cached.
openpi/docs/norm_stats.md
ADDED
@@ -0,0 +1,69 @@
# Normalization statistics

Following common practice, our models normalize the proprioceptive state inputs and action targets during policy training and inference. The statistics used for normalization are computed over the training data and stored alongside the model checkpoint.

## Reloading normalization statistics

When you fine-tune one of our models on a new dataset, you need to decide whether to (A) reuse existing normalization statistics or (B) compute new statistics over your new training data. Which option is better for you depends on the similarity of your robot and task to the robot and task distribution in the pre-training dataset. Below, we list all the available pre-training normalization statistics for each model.

**If your target robot matches one of these pre-training statistics, consider reloading the same normalization statistics.** By reloading the normalization statistics, the actions in your dataset will be more "familiar" to the model, which can lead to better performance. You can reload the normalization statistics by adding an `AssetsConfig` to your training config that points to the corresponding checkpoint directory and normalization statistics ID, like below for the `Trossen` (aka ALOHA) robot statistics of the `pi0_base` checkpoint:

```python
TrainConfig(
    ...
    data=LeRobotAlohaDataConfig(
        ...
        assets=AssetsConfig(
            assets_dir="gs://openpi-assets/checkpoints/pi0_base/assets",
            asset_id="trossen",
        ),
    ),
)
```

For an example of a full training config that reloads normalization statistics, see the `pi0_aloha_pen_uncap` config in the [training config file](https://github.com/physical-intelligence/openpi/blob/main/src/openpi/training/config.py).

**Note:** To successfully reload normalization statistics, it's important that your robot + dataset follow the action space definitions used in pre-training. We provide a detailed description of our action space definitions below.

**Note #2:** Whether reloading normalization statistics is beneficial depends on the similarity of your robot and task to the robot and task distribution in the pre-training dataset. We recommend always trying both options: reloading the pre-training statistics and training with a fresh set of statistics computed on your new dataset (see the [main README](../README.md) for instructions on how to compute new statistics), and picking the one that works better for your task.

## Provided Pre-training Normalization Statistics

Below is a list of all the pre-training normalization statistics we provide, for both the `pi0_base` and `pi0_fast_base` models. For `pi0_base`, set the `assets_dir` to `gs://openpi-assets/checkpoints/pi0_base/assets`, and for `pi0_fast_base`, set the `assets_dir` to `gs://openpi-assets/checkpoints/pi0_fast_base/assets`.

| Robot | Description | Asset ID |
|-------|-------------|----------|
| ALOHA | 6-DoF dual arm robot with parallel grippers | trossen |
| Mobile ALOHA | Mobile version of ALOHA mounted on a Slate base | trossen_mobile |
| Franka Emika (DROID) | 7-DoF arm with parallel gripper based on the DROID setup | droid |
| Franka Emika (non-DROID) | Franka FR3 arm with Robotiq 2F-85 gripper | franka |
| UR5e | 6-DoF UR5e arm with Robotiq 2F-85 gripper | ur5e |
| UR5e bi-manual | Bi-manual UR5e setup with Robotiq 2F-85 grippers | ur5e_dual |
| ARX | Bi-manual ARX-5 robot arm setup with parallel gripper | arx |
| ARX mobile | Mobile version of bi-manual ARX-5 robot arm setup mounted on a Slate base | arx_mobile |
| Fibocom mobile | Fibocom mobile robot with 2x ARX-5 arms | fibocom_mobile |

## Pi0 Model Action Space Definitions

Out of the box, both the `pi0_base` and `pi0_fast_base` models use the following action space definitions (left and right are defined looking from behind the robot towards the workspace):
```
"dim_0:dim_5": "left arm joint angles",
"dim_6": "left arm gripper position",
"dim_7:dim_12": "right arm joint angles (for bi-manual only)",
"dim_13": "right arm gripper position (for bi-manual only)",

# For mobile robots:
"dim_14:dim_15": "x-y base velocity (for mobile robots only)",
```

The proprioceptive state uses the same definitions as the action space, except for the base x-y position (the last two dimensions) for mobile robots, which we don't include in the proprioceptive state.

For 7-DoF robots (e.g. Franka), we use the first 7 dimensions of the action space for the joint actions, and the 8th dimension for the gripper action.

General info for Pi robots:
- Joint angles are expressed in radians, with position zero corresponding to the zero position reported by each robot's interface library, except for ALOHA, where the standard ALOHA code uses a slightly different convention (see the [ALOHA example code](../examples/aloha_real/README.md) for details).
- Gripper positions are in [0.0, 1.0], with 0.0 corresponding to fully open and 1.0 corresponding to fully closed.
- Control frequencies are 20 Hz for UR5e and Franka arms, and 50 Hz for ARX and Trossen (ALOHA) arms.

For DROID, we use the original DROID action configuration: joint velocity actions in the first 7 dimensions, gripper actions in the 8th dimension, and a control frequency of 15 Hz.
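As a concrete illustration of the bi-manual layout above, here is a minimal sketch of packing an ALOHA command into this action vector. The helper and argument names are ours, for illustration only, not part of openpi:

```python
import numpy as np

def pack_aloha_action(left_joints, left_gripper, right_joints, right_gripper):
    """Pack a bi-manual ALOHA command into the pi0 action layout.

    left_joints / right_joints: 6 joint angles each, in radians.
    left_gripper / right_gripper: gripper position in [0.0, 1.0]
    (0.0 = fully open, 1.0 = fully closed).
    """
    action = np.zeros(14, dtype=np.float32)
    action[0:6] = left_joints    # dim_0:dim_5
    action[6] = left_gripper     # dim_6
    action[7:13] = right_joints  # dim_7:dim_12
    action[13] = right_gripper   # dim_13
    return action

# Example: both arms at zero pose, grippers fully open.
print(pack_aloha_action(np.zeros(6), 0.0, np.zeros(6), 0.0))
```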
openpi/docs/remote_inference.md
ADDED
@@ -0,0 +1,71 @@

# Running openpi models remotely

We provide utilities for running openpi models remotely. This is useful for running inference on more powerful GPUs off-robot, and it also helps keep the robot and policy environments separate (e.g., to avoid dependency hell with robot software).

## Starting a remote policy server

To start a remote policy server, you can simply run the following command:

```bash
uv run scripts/serve_policy.py --env=[DROID | ALOHA | LIBERO]
```

The `env` argument specifies which $\pi_0$ checkpoint should be loaded. Under the hood, this script will execute a command like the following, which you can use to start a policy server, e.g. for checkpoints you trained yourself (here an example for the DROID environment):

```bash
uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_fast_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_fast_droid
```

This will start a policy server that serves the policy specified by the `config` and `dir` arguments. The policy will be served on the specified port (default: 8000).

## Querying the remote policy server from your robot code

We provide a client utility with minimal dependencies that you can easily embed into any robot codebase.

First, install the `openpi-client` package in your robot environment:

```bash
cd $OPENPI_ROOT/packages/openpi-client
pip install -e .
```

Then, you can use the client to query the remote policy server from your robot code. Here's an example of how to do this:

```python
from openpi_client import image_tools
from openpi_client import websocket_client_policy

# Outside of the episode loop, initialize the policy client.
# Point to the host and port of the policy server (localhost and 8000 are the defaults).
client = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)

for step in range(num_steps):
    # Inside the episode loop, construct the observation.
    # Resize images on the client side to minimize bandwidth / latency. Always return images in uint8 format.
    # We provide utilities for resizing images + uint8 conversion so you match the training routines.
    # The typical resize_size for pre-trained pi0 models is 224.
    # Note that the proprioceptive `state` can be passed unnormalized; normalization will be handled on the server side.
    observation = {
        "observation/image": image_tools.convert_to_uint8(
            image_tools.resize_with_pad(img, 224, 224)
        ),
        "observation/wrist_image": image_tools.convert_to_uint8(
            image_tools.resize_with_pad(wrist_img, 224, 224)
        ),
        "observation/state": state,
        "prompt": task_instruction,
    }

    # Call the policy server with the current observation.
    # This returns an action chunk of shape (action_horizon, action_dim).
    # Note that you typically only need to call the policy every N steps and execute steps
    # from the predicted action chunk open-loop in the remaining steps.
    action_chunk = client.infer(observation)["actions"]

    # Execute the actions in the environment.
    ...

```

Here, the `host` and `port` arguments specify the IP address and port of the remote policy server. You can also specify these as command-line arguments to your robot code, or hard-code them in your robot codebase. The `observation` is a dictionary of observations and the prompt, following the specification of the policy inputs for the policy you are serving. We have concrete examples of how to construct this dictionary for different environments in the [simple client example](examples/simple_client/main.py).
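The open-loop pattern mentioned in the comments above can look like the following minimal sketch; `get_observation`, `execute_action`, and the horizon of 8 are placeholders for your robot runtime, not part of `openpi-client`:

```python
open_loop_horizon = 8  # placeholder: re-query the policy every 8 steps

action_chunk = None
for step in range(num_steps):
    # Query the policy only at the start of each chunk...
    if step % open_loop_horizon == 0:
        action_chunk = client.infer(get_observation())["actions"]
    # ...and execute the remaining steps from the chunk open-loop.
    execute_action(action_chunk[step % open_loop_horizon])
```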
openpi/examples/aloha_real/Dockerfile
ADDED
@@ -0,0 +1,70 @@
# Dockerfile for the Aloha real environment.

# Build the container:
# docker build . -t aloha_real -f examples/aloha_real/Dockerfile

# Run the container:
# docker run --rm -it --network=host -v /dev:/dev -v .:/app --privileged aloha_real /bin/bash

FROM ros:noetic-robot@sha256:7cf0b9f6546abeba308ea42cb7ad3453f3e520e1af57cdf179fe915c939674bc
SHELL ["/bin/bash", "-c"]

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    cmake \
    curl \
    libffi-dev \
    python3-rosdep \
    python3-rosinstall \
    python3-rosinstall-generator \
    whiptail \
    git \
    wget \
    openssh-client \
    ros-noetic-cv-bridge \
    ros-noetic-usb-cam \
    ros-noetic-realsense2-camera \
    keyboard-configuration

WORKDIR /root
RUN curl 'https://raw.githubusercontent.com/Interbotix/interbotix_ros_manipulators/main/interbotix_ros_xsarms/install/amd64/xsarm_amd64_install.sh' > xsarm_amd64_install.sh
RUN chmod +x xsarm_amd64_install.sh
RUN export TZ='America/Los_Angeles' && ./xsarm_amd64_install.sh -d noetic -n

COPY ./third_party/aloha /root/interbotix_ws/src/aloha
RUN cd /root/interbotix_ws && source /opt/ros/noetic/setup.sh && source /root/interbotix_ws/devel/setup.sh && catkin_make

# Install python 3.10 because this ROS image comes with 3.8
RUN mkdir /python && \
    cd /python && \
    wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz && \
    tar -zxvf Python-3.10.14.tgz && \
    cd Python-3.10.14 && \
    ls -lhR && \
    ./configure --enable-optimizations && \
    make install && \
    echo 'alias python3="/usr/local/bin/python3.10"' >> ~/.bashrc && \
    echo 'alias python="/usr/local/bin/python3.10"' >> ~/.bashrc && \
    cd ~ && rm -rf /python && \
    rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:0.5.6 /uv /bin/uv
ENV UV_HTTP_TIMEOUT=120
ENV UV_LINK_MODE=copy
COPY ./examples/aloha_real/requirements.txt /tmp/requirements.txt
COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml
RUN uv pip sync --python 3.10 --system /tmp/requirements.txt /tmp/openpi-client/pyproject.toml

ENV PYTHONPATH=/app:/app/src:/app/packages/openpi-client/src:/root/interbotix_ws/src/aloha/aloha_scripts:/root/interbotix_ws/src/aloha
WORKDIR /app

# Create an entrypoint script to run the setup commands, followed by the command passed in.
RUN cat <<'EOF' > /usr/local/bin/entrypoint.sh
#!/bin/bash
source /opt/ros/noetic/setup.sh && source /root/interbotix_ws/devel/setup.sh && "$@"
EOF
RUN chmod +x /usr/local/bin/entrypoint.sh

ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
CMD ["python3", "/app/examples/aloha_real/main.py"]
openpi/examples/aloha_real/README.md
ADDED
@@ -0,0 +1,126 @@
# Run Aloha (Real Robot)

This example demonstrates how to run a policy on a real robot using an [ALOHA setup](https://github.com/tonyzhaozh/aloha). See [here](../../docs/remote_inference.md) for instructions on how to load checkpoints and run inference. We list the relevant checkpoint paths for each provided fine-tuned model below.

## Prerequisites

This repo uses a fork of the ALOHA repo, with very minor modifications to use Realsense cameras.

1. Follow the [hardware installation instructions](https://github.com/tonyzhaozh/aloha?tab=readme-ov-file#hardware-installation) in the ALOHA repo.
2. Modify the `third_party/aloha/aloha_scripts/realsense_publisher.py` file to use serial numbers for your cameras.

## With Docker

```bash
export SERVER_ARGS="--env ALOHA --default_prompt='take the toast out of the toaster'"
docker compose -f examples/aloha_real/compose.yml up --build
```

## Without Docker

Terminal window 1:

```bash
# Create virtual environment
uv venv --python 3.10 examples/aloha_real/.venv
source examples/aloha_real/.venv/bin/activate
uv pip sync examples/aloha_real/requirements.txt
uv pip install -e packages/openpi-client

# Run the robot
python -m examples.aloha_real.main
```

Terminal window 2:

```bash
roslaunch aloha ros_nodes.launch
```

Terminal window 3:

```bash
uv run scripts/serve_policy.py --env ALOHA --default_prompt='take the toast out of the toaster'
```

## **ALOHA Checkpoint Guide**

The `pi0_base` model can be used zero-shot for a simple task on the ALOHA platform, and we additionally provide two example fine-tuned checkpoints, "fold the towel" and "open the tupperware and put the food on the plate," which can perform more advanced tasks on the ALOHA.

While we've found the policies to work in unseen conditions across multiple ALOHA stations, we provide some pointers here on how best to set up scenes to maximize the chance of policy success. We cover the prompts to use for the policies, objects we've seen them work well on, and well-represented initial state distributions. Running these policies zero-shot is still a very experimental feature, and there is no guarantee that they will work on your robot. The recommended way to use `pi0_base` is by fine-tuning with data from the target robot.

---

### **Toast Task**

This task involves the robot taking two pieces of toast out of a toaster and placing them on a plate.

- **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_base`
- **Prompt**: "take the toast out of the toaster"
- **Objects needed**: Two pieces of toast, a plate, and a standard toaster.
- **Object Distribution**:
  - Works on both real toast and rubber fake toast
  - Compatible with standard 2-slice toasters
  - Works with plates of varying colors

### **Scene Setup Guidelines**
<img width="500" alt="Screenshot 2025-01-31 at 10 06 02 PM" src="https://github.com/user-attachments/assets/3d043d95-9d1c-4dda-9991-e63cae61e02e" />

- The toaster should be positioned in the top-left quadrant of the workspace.
- Both pieces of toast should start inside the toaster, with at least 1 cm of bread sticking out from the top.
- The plate should be placed roughly in the lower-center of the workspace.
- Works with both natural and synthetic lighting, but avoid making the scene too dark (e.g., don't place the setup inside an enclosed space or under a curtain).

### **Towel Task**

This task involves folding a small towel (e.g., roughly the size of a hand towel) into eighths.

- **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_aloha_towel`
- **Prompt**: "fold the towel"
- **Object Distribution**:
  - Works on towels of varying solid colors
  - Performance is worse on heavily textured or striped towels

### **Scene Setup Guidelines**
<img width="500" alt="Screenshot 2025-01-31 at 10 01 15 PM" src="https://github.com/user-attachments/assets/9410090c-467d-4a9c-ac76-96e5b4d00943" />

- The towel should be flattened and roughly centered on the table.
- Choose a towel that does not blend in with the table surface.

### **Tupperware Task**

This task involves opening a tupperware filled with food and pouring the contents onto a plate.

- **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_aloha_tupperware`
- **Prompt**: "open the tupperware and put the food on the plate"
- **Objects needed**: Tupperware, food (or food-like items), and a plate.
- **Object Distribution**:
  - Works on various types of fake food (e.g., fake chicken nuggets, fries, and fried chicken).
  - Compatible with tupperware of different lid colors and shapes, with best performance on square tupperware with a corner flap (see images below).
  - The policy has seen plates of varying solid colors.

### **Scene Setup Guidelines**
<img width="500" alt="Screenshot 2025-01-31 at 10 02 27 PM" src="https://github.com/user-attachments/assets/60fc1de0-2d64-4076-b903-f427e5e9d1bf" />

- Best performance observed when both the tupperware and plate are roughly centered in the workspace.
- Positioning:
  - Tupperware should be on the left.
  - Plate should be on the right or bottom.
  - The tupperware flap should point toward the plate.

## Training on your own Aloha dataset

1. Convert the dataset to the LeRobot dataset v2.0 format.

   We provide a script [convert_aloha_data_to_lerobot.py](./convert_aloha_data_to_lerobot.py) that converts the dataset to the LeRobot dataset v2.0 format. As an example, we have converted the `aloha_pen_uncap_diverse_raw` dataset from the [BiPlay repo](https://huggingface.co/datasets/oier-mees/BiPlay/tree/main/aloha_pen_uncap_diverse_raw) and uploaded it to the HuggingFace Hub as [physical-intelligence/aloha_pen_uncap_diverse](https://huggingface.co/datasets/physical-intelligence/aloha_pen_uncap_diverse).

2. Define a training config that uses the custom dataset.

   We provide the [pi0_aloha_pen_uncap config](../../src/openpi/training/config.py) as an example. You should refer to the root [README](../../README.md) for how to run training with the new config.

IMPORTANT: Our base checkpoint includes normalization stats from various common robot configurations. When fine-tuning a base checkpoint with a custom dataset from one of these configurations, we recommend using the corresponding normalization stats provided in the base checkpoint. In the example, this is done by specifying the trossen `asset_id` and a path to the pretrained checkpoint's asset directory within the `AssetsConfig`.
openpi/examples/aloha_real/compose.yml
ADDED
@@ -0,0 +1,66 @@
# Run with:
# docker compose -f examples/aloha_real/compose.yml up --build
services:
  runtime:
    image: aloha_real
    depends_on:
      - aloha_ros_nodes
      - ros_master
      - openpi_server
    build:
      context: ../..
      dockerfile: examples/aloha_real/Dockerfile
    init: true
    tty: true
    network_mode: host
    privileged: true
    volumes:
      - $PWD:/app
      - ../../data:/data

  aloha_ros_nodes:
    image: aloha_real
    depends_on:
      - ros_master
    build:
      context: ../..
      dockerfile: examples/aloha_real/Dockerfile
    init: true
    tty: true
    network_mode: host
    privileged: true
    volumes:
      - /dev:/dev
    command: roslaunch --wait aloha ros_nodes.launch

  ros_master:
    image: ros:noetic-robot
    network_mode: host
    privileged: true
    command:
      - roscore

  openpi_server:
    image: openpi_server
    build:
      context: ../..
      dockerfile: scripts/docker/serve_policy.Dockerfile
    init: true
    tty: true
    network_mode: host
    volumes:
      - $PWD:/app
      - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
    environment:
      - SERVER_ARGS
      - OPENPI_DATA_HOME=/openpi_assets
      - IS_DOCKER=true

    # Comment out this block if not running on a machine with GPUs.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
openpi/examples/aloha_real/constants.py
ADDED
@@ -0,0 +1,71 @@
# Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
# ruff: noqa

### Task parameters

### ALOHA fixed constants
DT = 0.001
JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"]
START_ARM_POSE = [0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239, 0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239]

# Left finger position limits (qpos[7]), right_finger = -1 * left_finger
MASTER_GRIPPER_POSITION_OPEN = 0.02417
MASTER_GRIPPER_POSITION_CLOSE = 0.01244
PUPPET_GRIPPER_POSITION_OPEN = 0.05800
PUPPET_GRIPPER_POSITION_CLOSE = 0.01844

# Gripper joint limits (qpos[6])
MASTER_GRIPPER_JOINT_OPEN = 0.3083
MASTER_GRIPPER_JOINT_CLOSE = -0.6842
PUPPET_GRIPPER_JOINT_OPEN = 1.4910
PUPPET_GRIPPER_JOINT_CLOSE = -0.6213

############################ Helper functions ############################
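
# Added note: the helpers below map raw gripper readings to a normalized range
# where CLOSE -> 0.0 and OPEN -> 1.0, invert that mapping, and convert between
# the master (leader) and puppet (follower) gripper conventions.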
MASTER_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_POSITION_CLOSE) / (
    MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE
)
PUPPET_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_POSITION_CLOSE) / (
    PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE
)
MASTER_GRIPPER_POSITION_UNNORMALIZE_FN = (
    lambda x: x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE
)
PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN = (
    lambda x: x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE
)
MASTER2PUPPET_POSITION_FN = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(MASTER_GRIPPER_POSITION_NORMALIZE_FN(x))

MASTER_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_JOINT_CLOSE) / (
    MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE
)
PUPPET_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_JOINT_CLOSE) / (
    PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE
)
MASTER_GRIPPER_JOINT_UNNORMALIZE_FN = (
    lambda x: x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE
)
PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN = (
    lambda x: x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE
)
MASTER2PUPPET_JOINT_FN = lambda x: PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(MASTER_GRIPPER_JOINT_NORMALIZE_FN(x))

MASTER_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE)
PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE)

MASTER_POS2JOINT = (
    lambda x: MASTER_GRIPPER_POSITION_NORMALIZE_FN(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
    + MASTER_GRIPPER_JOINT_CLOSE
)
MASTER_JOINT2POS = lambda x: MASTER_GRIPPER_POSITION_UNNORMALIZE_FN(
    (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
)
PUPPET_POS2JOINT = (
    lambda x: PUPPET_GRIPPER_POSITION_NORMALIZE_FN(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
    + PUPPET_GRIPPER_JOINT_CLOSE
)
PUPPET_JOINT2POS = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(
    (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
)

MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE) / 2
openpi/examples/aloha_real/convert_aloha_data_to_lerobot.py
ADDED
@@ -0,0 +1,272 @@
"""
Script to convert Aloha hdf5 data to the LeRobot dataset v2.0 format.

Example usage: uv run examples/aloha_real/convert_aloha_data_to_lerobot.py --raw-dir /path/to/raw/data --repo-id <org>/<dataset-name>
"""

import dataclasses
from pathlib import Path
import shutil
from typing import Literal

import h5py
from lerobot.common.datasets.lerobot_dataset import LEROBOT_HOME
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.push_dataset_to_hub._download_raw import download_raw
import numpy as np
import torch
import tqdm
import tyro


@dataclasses.dataclass(frozen=True)
class DatasetConfig:
    use_videos: bool = True
    tolerance_s: float = 0.0001
    image_writer_processes: int = 10
    image_writer_threads: int = 5
    video_backend: str | None = None


DEFAULT_DATASET_CONFIG = DatasetConfig()


def create_empty_dataset(
    repo_id: str,
    robot_type: str,
    mode: Literal["video", "image"] = "video",
    *,
    has_velocity: bool = False,
    has_effort: bool = False,
    dataset_config: DatasetConfig = DEFAULT_DATASET_CONFIG,
) -> LeRobotDataset:
    motors = [
        "right_waist",
        "right_shoulder",
        "right_elbow",
        "right_forearm_roll",
        "right_wrist_angle",
        "right_wrist_rotate",
        "right_gripper",
        "left_waist",
        "left_shoulder",
        "left_elbow",
        "left_forearm_roll",
        "left_wrist_angle",
        "left_wrist_rotate",
        "left_gripper",
    ]
    cameras = [
        "cam_high",
        "cam_low",
        "cam_left_wrist",
        "cam_right_wrist",
    ]

    features = {
        "observation.state": {
            "dtype": "float32",
            "shape": (len(motors),),
            "names": [
                motors,
            ],
        },
        "action": {
            "dtype": "float32",
            "shape": (len(motors),),
            "names": [
                motors,
            ],
        },
    }

    if has_velocity:
        features["observation.velocity"] = {
            "dtype": "float32",
            "shape": (len(motors),),
            "names": [
                motors,
            ],
        }

    if has_effort:
        features["observation.effort"] = {
            "dtype": "float32",
            "shape": (len(motors),),
            "names": [
                motors,
            ],
        }

    for cam in cameras:
        features[f"observation.images.{cam}"] = {
            "dtype": mode,
            "shape": (3, 480, 640),
            "names": [
                "channels",
                "height",
                "width",
            ],
        }

    if Path(LEROBOT_HOME / repo_id).exists():
        shutil.rmtree(LEROBOT_HOME / repo_id)

    return LeRobotDataset.create(
        repo_id=repo_id,
        fps=50,
        robot_type=robot_type,
        features=features,
        use_videos=dataset_config.use_videos,
        tolerance_s=dataset_config.tolerance_s,
        image_writer_processes=dataset_config.image_writer_processes,
        image_writer_threads=dataset_config.image_writer_threads,
        video_backend=dataset_config.video_backend,
    )


def get_cameras(hdf5_files: list[Path]) -> list[str]:
    with h5py.File(hdf5_files[0], "r") as ep:
        # ignore depth channel, not currently handled
        return [key for key in ep["/observations/images"].keys() if "depth" not in key]  # noqa: SIM118


def has_velocity(hdf5_files: list[Path]) -> bool:
    with h5py.File(hdf5_files[0], "r") as ep:
        return "/observations/qvel" in ep


def has_effort(hdf5_files: list[Path]) -> bool:
    with h5py.File(hdf5_files[0], "r") as ep:
        return "/observations/effort" in ep


def load_raw_images_per_camera(ep: h5py.File, cameras: list[str]) -> dict[str, np.ndarray]:
    imgs_per_cam = {}
    for camera in cameras:
        uncompressed = ep[f"/observations/images/{camera}"].ndim == 4

        if uncompressed:
            # load all images in RAM
            imgs_array = ep[f"/observations/images/{camera}"][:]
        else:
            import cv2

            # load one compressed image after the other in RAM and uncompress
            imgs_array = []
            for data in ep[f"/observations/images/{camera}"]:
                imgs_array.append(cv2.cvtColor(cv2.imdecode(data, 1), cv2.COLOR_BGR2RGB))
            imgs_array = np.array(imgs_array)

        imgs_per_cam[camera] = imgs_array
    return imgs_per_cam


def load_raw_episode_data(
    ep_path: Path,
) -> tuple[dict[str, np.ndarray], torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
    with h5py.File(ep_path, "r") as ep:
        state = torch.from_numpy(ep["/observations/qpos"][:])
        action = torch.from_numpy(ep["/action"][:])

        velocity = None
        if "/observations/qvel" in ep:
            velocity = torch.from_numpy(ep["/observations/qvel"][:])

        effort = None
        if "/observations/effort" in ep:
            effort = torch.from_numpy(ep["/observations/effort"][:])

        imgs_per_cam = load_raw_images_per_camera(
            ep,
            [
                "cam_high",
                "cam_low",
                "cam_left_wrist",
                "cam_right_wrist",
            ],
        )

    return imgs_per_cam, state, action, velocity, effort


def populate_dataset(
    dataset: LeRobotDataset,
    hdf5_files: list[Path],
    task: str,
    episodes: list[int] | None = None,
) -> LeRobotDataset:
    if episodes is None:
        episodes = range(len(hdf5_files))

    for ep_idx in tqdm.tqdm(episodes):
        ep_path = hdf5_files[ep_idx]

        imgs_per_cam, state, action, velocity, effort = load_raw_episode_data(ep_path)
        num_frames = state.shape[0]

        for i in range(num_frames):
            frame = {
                "observation.state": state[i],
                "action": action[i],
            }

            for camera, img_array in imgs_per_cam.items():
                frame[f"observation.images.{camera}"] = img_array[i]

            if velocity is not None:
                frame["observation.velocity"] = velocity[i]
            if effort is not None:
                frame["observation.effort"] = effort[i]

            dataset.add_frame(frame)

        dataset.save_episode(task=task)

    return dataset


def port_aloha(
    raw_dir: Path,
    repo_id: str,
    raw_repo_id: str | None = None,
    task: str = "DEBUG",
    *,
    episodes: list[int] | None = None,
    push_to_hub: bool = True,
    is_mobile: bool = False,
    mode: Literal["video", "image"] = "image",
    dataset_config: DatasetConfig = DEFAULT_DATASET_CONFIG,
):
    if (LEROBOT_HOME / repo_id).exists():
        shutil.rmtree(LEROBOT_HOME / repo_id)

    if not raw_dir.exists():
        if raw_repo_id is None:
            raise ValueError("raw_repo_id must be provided if raw_dir does not exist")
        download_raw(raw_dir, repo_id=raw_repo_id)

    hdf5_files = sorted(raw_dir.glob("episode_*.hdf5"))

    dataset = create_empty_dataset(
        repo_id,
        robot_type="mobile_aloha" if is_mobile else "aloha",
        mode=mode,
        has_effort=has_effort(hdf5_files),
|
| 256 |
+
has_velocity=has_velocity(hdf5_files),
|
| 257 |
+
dataset_config=dataset_config,
|
| 258 |
+
)
|
| 259 |
+
dataset = populate_dataset(
|
| 260 |
+
dataset,
|
| 261 |
+
hdf5_files,
|
| 262 |
+
task=task,
|
| 263 |
+
episodes=episodes,
|
| 264 |
+
)
|
| 265 |
+
dataset.consolidate()
|
| 266 |
+
|
| 267 |
+
if push_to_hub:
|
| 268 |
+
dataset.push_to_hub()
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
if __name__ == "__main__":
|
| 272 |
+
tyro.cli(port_aloha)
|
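As a quick orientation to the file above: `port_aloha` wires the helpers together (probe the first HDF5 file for optional qvel/effort keys, create an empty LeRobot dataset, populate it frame by frame, then optionally push to the Hub). Below is a minimal sketch of driving it from Python instead of the tyro CLI; the import path assumes the `examples/` layout used by these files, and the directory and repo id are hypothetical placeholders.

```python
# Sketch only: the path and repo id below are placeholders, not part of this upload.
from pathlib import Path

from examples.aloha_real.convert_aloha_data_to_lerobot import port_aloha

port_aloha(
    raw_dir=Path("/data/aloha_raw"),  # directory containing episode_*.hdf5 files
    repo_id="my-org/aloha-debug",     # LeRobot dataset id to create locally
    task="transfer_cube",             # task string saved with every episode
    episodes=[0, 1],                  # convert only the first two episodes
    push_to_hub=False,                # keep the result local
)
```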
openpi/examples/aloha_real/env.py
ADDED
@@ -0,0 +1,57 @@
from typing import List, Optional  # noqa: UP035

import einops
from openpi_client import image_tools
from openpi_client.runtime import environment as _environment
from typing_extensions import override

from examples.aloha_real import real_env as _real_env


class AlohaRealEnvironment(_environment.Environment):
    """An environment for an Aloha robot on real hardware."""

    def __init__(
        self,
        reset_position: Optional[List[float]] = None,  # noqa: UP006,UP007
        render_height: int = 224,
        render_width: int = 224,
    ) -> None:
        self._env = _real_env.make_real_env(init_node=True, reset_position=reset_position)
        self._render_height = render_height
        self._render_width = render_width

        self._ts = None

    @override
    def reset(self) -> None:
        self._ts = self._env.reset()

    @override
    def is_episode_complete(self) -> bool:
        return False

    @override
    def get_observation(self) -> dict:
        if self._ts is None:
            raise RuntimeError("Timestep is not set. Call reset() first.")

        obs = self._ts.observation
        for k in list(obs["images"].keys()):
            if "_depth" in k:
                del obs["images"][k]

        for cam_name in obs["images"]:
            img = image_tools.convert_to_uint8(
                image_tools.resize_with_pad(obs["images"][cam_name], self._render_height, self._render_width)
            )
            obs["images"][cam_name] = einops.rearrange(img, "h w c -> c h w")

        return {
            "state": obs["qpos"],
            "images": obs["images"],
        }

    @override
    def apply_action(self, action: dict) -> None:
        self._ts = self._env.step(action["actions"])
openpi/examples/aloha_real/main.py
ADDED
@@ -0,0 +1,51 @@
import dataclasses
import logging

from openpi_client import action_chunk_broker
from openpi_client import websocket_client_policy as _websocket_client_policy
from openpi_client.runtime import runtime as _runtime
from openpi_client.runtime.agents import policy_agent as _policy_agent
import tyro

from examples.aloha_real import env as _env


@dataclasses.dataclass
class Args:
    host: str = "0.0.0.0"
    port: int = 8000

    action_horizon: int = 25

    num_episodes: int = 1
    max_episode_steps: int = 1000


def main(args: Args) -> None:
    ws_client_policy = _websocket_client_policy.WebsocketClientPolicy(
        host=args.host,
        port=args.port,
    )
    logging.info(f"Server metadata: {ws_client_policy.get_server_metadata()}")

    metadata = ws_client_policy.get_server_metadata()
    runtime = _runtime.Runtime(
        environment=_env.AlohaRealEnvironment(reset_position=metadata.get("reset_pose")),
        agent=_policy_agent.PolicyAgent(
            policy=action_chunk_broker.ActionChunkBroker(
                policy=ws_client_policy,
                action_horizon=args.action_horizon,
            )
        ),
        subscribers=[],
        max_hz=50,
        num_episodes=args.num_episodes,
        max_episode_steps=args.max_episode_steps,
    )

    runtime.run()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, force=True)
    tyro.cli(main)
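`main.py` above wraps the websocket policy in an `ActionChunkBroker` with `action_horizon=25`, so the client asks the server for a new action chunk only once every 25 control steps instead of once per step. The toy class below illustrates that caching pattern only; it is not the `openpi_client` implementation, and the `infer(obs) -> {"actions": ...}` contract is an assumption made for the sketch.

```python
import numpy as np


class ToyChunkBroker:
    """Illustration of action chunking: fetch a chunk of future actions once,
    then replay one action per step until the chunk is exhausted."""

    def __init__(self, policy, action_horizon: int):
        self._policy = policy          # assumed: infer(obs) -> {"actions": [H, D] array}
        self._horizon = action_horizon
        self._chunk = None
        self._step = 0

    def infer(self, obs: dict) -> dict:
        if self._chunk is None or self._step >= self._horizon:
            # One (expensive) round trip to the policy; assumes H >= action_horizon.
            self._chunk = np.asarray(self._policy.infer(obs)["actions"])
            self._step = 0
        action = self._chunk[self._step]
        self._step += 1
        return {"actions": action}
```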
openpi/examples/aloha_real/real_env.py
ADDED
@@ -0,0 +1,176 @@
# Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
# ruff: noqa
import collections
import time
from typing import Optional, List

import dm_env
from interbotix_xs_modules.arm import InterbotixManipulatorXS
from interbotix_xs_msgs.msg import JointSingleCommand
import numpy as np

from examples.aloha_real import constants
from examples.aloha_real import robot_utils

# This is the reset position that is used by the standard Aloha runtime.
DEFAULT_RESET_POSITION = [0, -0.96, 1.16, 0, -0.3, 0]


class RealEnv:
    """
    Environment for real robot bi-manual manipulation.

    Action space: [left_arm_qpos (6),            # absolute joint position
                   left_gripper_positions (1),   # normalized gripper position (0: close, 1: open)
                   right_arm_qpos (6),           # absolute joint position
                   right_gripper_positions (1)]  # normalized gripper position (0: close, 1: open)

    Observation space: {"qpos": Concat[left_arm_qpos (6),          # absolute joint position
                                       left_gripper_position (1),  # normalized gripper position (0: close, 1: open)
                                       right_arm_qpos (6),         # absolute joint position
                                       right_gripper_qpos (1)],    # normalized gripper position (0: close, 1: open)
                        "qvel": Concat[left_arm_qvel (6),          # absolute joint velocity (rad)
                                       left_gripper_velocity (1),  # normalized gripper velocity (pos: opening, neg: closing)
                                       right_arm_qvel (6),         # absolute joint velocity (rad)
                                       right_gripper_qvel (1)],    # normalized gripper velocity (pos: opening, neg: closing)
                        "images": {"cam_high": (480x640x3),        # h, w, c, dtype='uint8'
                                   "cam_low": (480x640x3),         # h, w, c, dtype='uint8'
                                   "cam_left_wrist": (480x640x3),  # h, w, c, dtype='uint8'
                                   "cam_right_wrist": (480x640x3)} # h, w, c, dtype='uint8'
    """

    def __init__(self, init_node, *, reset_position: Optional[List[float]] = None, setup_robots: bool = True):
        # reset_position = START_ARM_POSE[:6]
        self._reset_position = reset_position[:6] if reset_position else DEFAULT_RESET_POSITION

        self.puppet_bot_left = InterbotixManipulatorXS(
            robot_model="vx300s",
            group_name="arm",
            gripper_name="gripper",
            robot_name="puppet_left",
            init_node=init_node,
        )
        self.puppet_bot_right = InterbotixManipulatorXS(
            robot_model="vx300s", group_name="arm", gripper_name="gripper", robot_name="puppet_right", init_node=False
        )
        if setup_robots:
            self.setup_robots()

        self.recorder_left = robot_utils.Recorder("left", init_node=False)
        self.recorder_right = robot_utils.Recorder("right", init_node=False)
        self.image_recorder = robot_utils.ImageRecorder(init_node=False)
        self.gripper_command = JointSingleCommand(name="gripper")

    def setup_robots(self):
        robot_utils.setup_puppet_bot(self.puppet_bot_left)
        robot_utils.setup_puppet_bot(self.puppet_bot_right)

    def get_qpos(self):
        left_qpos_raw = self.recorder_left.qpos
        right_qpos_raw = self.recorder_right.qpos
        left_arm_qpos = left_qpos_raw[:6]
        right_arm_qpos = right_qpos_raw[:6]
        left_gripper_qpos = [
            constants.PUPPET_GRIPPER_POSITION_NORMALIZE_FN(left_qpos_raw[7])
        ]  # this is position not joint
        right_gripper_qpos = [
            constants.PUPPET_GRIPPER_POSITION_NORMALIZE_FN(right_qpos_raw[7])
        ]  # this is position not joint
        return np.concatenate([left_arm_qpos, left_gripper_qpos, right_arm_qpos, right_gripper_qpos])

    def get_qvel(self):
        left_qvel_raw = self.recorder_left.qvel
        right_qvel_raw = self.recorder_right.qvel
        left_arm_qvel = left_qvel_raw[:6]
        right_arm_qvel = right_qvel_raw[:6]
        left_gripper_qvel = [constants.PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(left_qvel_raw[7])]
        right_gripper_qvel = [constants.PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(right_qvel_raw[7])]
        return np.concatenate([left_arm_qvel, left_gripper_qvel, right_arm_qvel, right_gripper_qvel])

    def get_effort(self):
        left_effort_raw = self.recorder_left.effort
        right_effort_raw = self.recorder_right.effort
        left_robot_effort = left_effort_raw[:7]
        right_robot_effort = right_effort_raw[:7]
        return np.concatenate([left_robot_effort, right_robot_effort])

    def get_images(self):
        return self.image_recorder.get_images()

    def set_gripper_pose(self, left_gripper_desired_pos_normalized, right_gripper_desired_pos_normalized):
        left_gripper_desired_joint = constants.PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(left_gripper_desired_pos_normalized)
        self.gripper_command.cmd = left_gripper_desired_joint
        self.puppet_bot_left.gripper.core.pub_single.publish(self.gripper_command)

        right_gripper_desired_joint = constants.PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(
            right_gripper_desired_pos_normalized
        )
        self.gripper_command.cmd = right_gripper_desired_joint
        self.puppet_bot_right.gripper.core.pub_single.publish(self.gripper_command)

    def _reset_joints(self):
        robot_utils.move_arms(
            [self.puppet_bot_left, self.puppet_bot_right], [self._reset_position, self._reset_position], move_time=1
        )

    def _reset_gripper(self):
        """Set to position mode and do position resets: first close then open. Then change back to PWM mode.

        NOTE: This diverges from the original Aloha code, which first opens then closes the gripper. Pi internal aloha data
        was collected with the gripper starting in the open position. Leaving the grippers fully closed was also found to
        increase the frequency of motor faults.
        """
        robot_utils.move_grippers(
            [self.puppet_bot_left, self.puppet_bot_right], [constants.PUPPET_GRIPPER_JOINT_CLOSE] * 2, move_time=1
        )
        robot_utils.move_grippers(
            [self.puppet_bot_left, self.puppet_bot_right], [constants.PUPPET_GRIPPER_JOINT_OPEN] * 2, move_time=0.5
        )

    def get_observation(self):
        obs = collections.OrderedDict()
        obs["qpos"] = self.get_qpos()
        obs["qvel"] = self.get_qvel()
        obs["effort"] = self.get_effort()
        obs["images"] = self.get_images()
        return obs

    def get_reward(self):
        return 0

    def reset(self, *, fake=False):
        if not fake:
            # Reboot puppet robot gripper motors
            self.puppet_bot_left.dxl.robot_reboot_motors("single", "gripper", True)
            self.puppet_bot_right.dxl.robot_reboot_motors("single", "gripper", True)
            self._reset_joints()
            self._reset_gripper()
        return dm_env.TimeStep(
            step_type=dm_env.StepType.FIRST, reward=self.get_reward(), discount=None, observation=self.get_observation()
        )

    def step(self, action):
        state_len = int(len(action) / 2)
        left_action = action[:state_len]
        right_action = action[state_len:]
        self.puppet_bot_left.arm.set_joint_positions(left_action[:6], blocking=False)
        self.puppet_bot_right.arm.set_joint_positions(right_action[:6], blocking=False)
        self.set_gripper_pose(left_action[-1], right_action[-1])
        time.sleep(constants.DT)
        return dm_env.TimeStep(
            step_type=dm_env.StepType.MID, reward=self.get_reward(), discount=None, observation=self.get_observation()
        )


def get_action(master_bot_left, master_bot_right):
    action = np.zeros(14)  # 6 joint + 1 gripper, for two arms
    # Arm actions
    action[:6] = master_bot_left.dxl.joint_states.position[:6]
    action[7 : 7 + 6] = master_bot_right.dxl.joint_states.position[:6]
    # Gripper actions
    action[6] = constants.MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_left.dxl.joint_states.position[6])
    action[7 + 6] = constants.MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_right.dxl.joint_states.position[6])

    return action


def make_real_env(init_node, *, reset_position: Optional[List[float]] = None, setup_robots: bool = True) -> RealEnv:
    return RealEnv(init_node, reset_position=reset_position, setup_robots=setup_robots)
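The `RealEnv` docstring above pins down the 14-dim action layout (7 values per arm: 6 joint positions plus one normalized gripper command, left arm first). A small self-contained check of the indexing that `step()` and `get_action()` rely on:

```python
import numpy as np

action = np.arange(14, dtype=np.float32)  # stand-in for a policy output
state_len = len(action) // 2              # 7 values per arm

left, right = action[:state_len], action[state_len:]
left_arm_qpos, left_gripper = left[:6], left[6]      # mirrors RealEnv.step()
right_arm_qpos, right_gripper = right[:6], right[6]

assert left_arm_qpos.shape == (6,) and right_arm_qpos.shape == (6,)
assert left_gripper == 6.0 and right_gripper == 13.0  # gripper slots are indices 6 and 13
```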
openpi/examples/aloha_real/requirements.in
ADDED
@@ -0,0 +1,18 @@
Pillow
dm_control
einops
h5py
matplotlib
modern_robotics
msgpack
numpy>=1.22.4,<2.0.0
opencv-python
packaging
pexpect
pyquaternion
pyrealsense2
pyyaml
requests
rospkg
tyro
websockets
openpi/examples/aloha_real/requirements.txt
ADDED
@@ -0,0 +1,156 @@
# This file was autogenerated by uv via the following command:
#    uv pip compile examples/aloha_real/requirements.in -o examples/aloha_real/requirements.txt --python-version 3.10
absl-py==2.1.0
    # via
    #   dm-control
    #   dm-env
    #   labmaze
    #   mujoco
catkin-pkg==1.0.0
    # via rospkg
certifi==2024.8.30
    # via requests
charset-normalizer==3.4.0
    # via requests
contourpy==1.1.1
    # via matplotlib
cycler==0.12.1
    # via matplotlib
distro==1.9.0
    # via rospkg
dm-control==1.0.23
    # via -r examples/aloha_real/requirements.in
dm-env==1.6
    # via dm-control
dm-tree==0.1.8
    # via
    #   dm-control
    #   dm-env
docstring-parser==0.16
    # via tyro
docutils==0.20.1
    # via catkin-pkg
einops==0.8.0
    # via -r examples/aloha_real/requirements.in
etils==1.3.0
    # via mujoco
fonttools==4.55.2
    # via matplotlib
glfw==2.8.0
    # via
    #   dm-control
    #   mujoco
h5py==3.11.0
    # via -r examples/aloha_real/requirements.in
idna==3.10
    # via requests
importlib-resources==6.4.5
    # via etils
kiwisolver==1.4.7
    # via matplotlib
labmaze==1.0.6
    # via dm-control
lxml==5.3.0
    # via dm-control
markdown-it-py==3.0.0
    # via rich
matplotlib==3.7.5
    # via -r examples/aloha_real/requirements.in
mdurl==0.1.2
    # via markdown-it-py
modern-robotics==1.1.1
    # via -r examples/aloha_real/requirements.in
msgpack==1.1.0
    # via -r examples/aloha_real/requirements.in
mujoco==3.2.3
    # via dm-control
numpy==1.24.4
    # via
    #   -r examples/aloha_real/requirements.in
    #   contourpy
    #   dm-control
    #   dm-env
    #   h5py
    #   labmaze
    #   matplotlib
    #   modern-robotics
    #   mujoco
    #   opencv-python
    #   pyquaternion
    #   scipy
opencv-python==4.10.0.84
    # via -r examples/aloha_real/requirements.in
packaging==24.2
    # via
    #   -r examples/aloha_real/requirements.in
    #   matplotlib
pexpect==4.9.0
    # via -r examples/aloha_real/requirements.in
pillow==10.4.0
    # via
    #   -r examples/aloha_real/requirements.in
    #   matplotlib
protobuf==5.29.1
    # via dm-control
ptyprocess==0.7.0
    # via pexpect
pygments==2.18.0
    # via rich
pyopengl==3.1.7
    # via
    #   dm-control
    #   mujoco
pyparsing==3.1.4
    # via
    #   catkin-pkg
    #   dm-control
    #   matplotlib
pyquaternion==0.9.9
    # via -r examples/aloha_real/requirements.in
pyrealsense2==2.55.1.6486
    # via -r examples/aloha_real/requirements.in
python-dateutil==2.9.0.post0
    # via
    #   catkin-pkg
    #   matplotlib
pyyaml==6.0.2
    # via
    #   -r examples/aloha_real/requirements.in
    #   rospkg
requests==2.32.3
    # via
    #   -r examples/aloha_real/requirements.in
    #   dm-control
rich==13.9.4
    # via tyro
rospkg==1.5.1
    # via -r examples/aloha_real/requirements.in
scipy==1.10.1
    # via dm-control
setuptools==75.3.0
    # via
    #   catkin-pkg
    #   dm-control
    #   labmaze
shtab==1.7.1
    # via tyro
six==1.17.0
    # via python-dateutil
tqdm==4.67.1
    # via dm-control
typeguard==4.4.0
    # via tyro
typing-extensions==4.12.2
    # via
    #   etils
    #   rich
    #   typeguard
    #   tyro
tyro==0.9.2
    # via -r examples/aloha_real/requirements.in
urllib3==2.2.3
    # via requests
websockets==14.1
    # via -r examples/aloha_real/requirements.in
zipp==3.20.2
    # via etils
openpi/examples/aloha_real/robot_utils.py
ADDED
@@ -0,0 +1,275 @@
# Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
# ruff: noqa
from collections import deque
import datetime
import json
import time

from aloha.msg import RGBGrayscaleImage
from cv_bridge import CvBridge
from interbotix_xs_msgs.msg import JointGroupCommand
from interbotix_xs_msgs.msg import JointSingleCommand
import numpy as np
import rospy
from sensor_msgs.msg import JointState

from examples.aloha_real import constants


class ImageRecorder:
    def __init__(self, init_node=True, is_debug=False):
        self.is_debug = is_debug
        self.bridge = CvBridge()
        self.camera_names = ["cam_high", "cam_low", "cam_left_wrist", "cam_right_wrist"]

        if init_node:
            rospy.init_node("image_recorder", anonymous=True)
        for cam_name in self.camera_names:
            setattr(self, f"{cam_name}_rgb_image", None)
            setattr(self, f"{cam_name}_depth_image", None)
            setattr(self, f"{cam_name}_timestamp", 0.0)
            if cam_name == "cam_high":
                callback_func = self.image_cb_cam_high
            elif cam_name == "cam_low":
                callback_func = self.image_cb_cam_low
            elif cam_name == "cam_left_wrist":
                callback_func = self.image_cb_cam_left_wrist
            elif cam_name == "cam_right_wrist":
                callback_func = self.image_cb_cam_right_wrist
            else:
                raise NotImplementedError
            rospy.Subscriber(f"/{cam_name}", RGBGrayscaleImage, callback_func)
            if self.is_debug:
                setattr(self, f"{cam_name}_timestamps", deque(maxlen=50))

        self.cam_last_timestamps = {cam_name: 0.0 for cam_name in self.camera_names}
        time.sleep(0.5)

    def image_cb(self, cam_name, data):
        setattr(
            self,
            f"{cam_name}_rgb_image",
            self.bridge.imgmsg_to_cv2(data.images[0], desired_encoding="bgr8"),
        )
        # setattr(
        #     self,
        #     f"{cam_name}_depth_image",
        #     self.bridge.imgmsg_to_cv2(data.images[1], desired_encoding="mono16"),
        # )
        setattr(
            self,
            f"{cam_name}_timestamp",
            data.header.stamp.secs + data.header.stamp.nsecs * 1e-9,
        )
        # setattr(self, f'{cam_name}_secs', data.images[0].header.stamp.secs)
        # setattr(self, f'{cam_name}_nsecs', data.images[0].header.stamp.nsecs)
        # cv2.imwrite('/home/lucyshi/Desktop/sample.jpg', cv_image)
        if self.is_debug:
            getattr(self, f"{cam_name}_timestamps").append(
                data.images[0].header.stamp.secs + data.images[0].header.stamp.nsecs * 1e-9
            )

    def image_cb_cam_high(self, data):
        cam_name = "cam_high"
        return self.image_cb(cam_name, data)

    def image_cb_cam_low(self, data):
        cam_name = "cam_low"
        return self.image_cb(cam_name, data)

    def image_cb_cam_left_wrist(self, data):
        cam_name = "cam_left_wrist"
        return self.image_cb(cam_name, data)

    def image_cb_cam_right_wrist(self, data):
        cam_name = "cam_right_wrist"
        return self.image_cb(cam_name, data)

    def get_images(self):
        image_dict = {}
        for cam_name in self.camera_names:
            while getattr(self, f"{cam_name}_timestamp") <= self.cam_last_timestamps[cam_name]:
                time.sleep(0.00001)
            rgb_image = getattr(self, f"{cam_name}_rgb_image")
            depth_image = getattr(self, f"{cam_name}_depth_image")
            self.cam_last_timestamps[cam_name] = getattr(self, f"{cam_name}_timestamp")
            image_dict[cam_name] = rgb_image
            image_dict[f"{cam_name}_depth"] = depth_image
        return image_dict

    def print_diagnostics(self):
        def dt_helper(l):
            l = np.array(l)
            diff = l[1:] - l[:-1]
            return np.mean(diff)

        for cam_name in self.camera_names:
            image_freq = 1 / dt_helper(getattr(self, f"{cam_name}_timestamps"))
            print(f"{cam_name} {image_freq=:.2f}")
        print()


class Recorder:
    def __init__(self, side, init_node=True, is_debug=False):
        self.secs = None
        self.nsecs = None
        self.qpos = None
        self.effort = None
        self.arm_command = None
        self.gripper_command = None
        self.is_debug = is_debug

        if init_node:
            rospy.init_node("recorder", anonymous=True)
        rospy.Subscriber(f"/puppet_{side}/joint_states", JointState, self.puppet_state_cb)
        rospy.Subscriber(
            f"/puppet_{side}/commands/joint_group",
            JointGroupCommand,
            self.puppet_arm_commands_cb,
        )
        rospy.Subscriber(
            f"/puppet_{side}/commands/joint_single",
            JointSingleCommand,
            self.puppet_gripper_commands_cb,
        )
        if self.is_debug:
            self.joint_timestamps = deque(maxlen=50)
            self.arm_command_timestamps = deque(maxlen=50)
            self.gripper_command_timestamps = deque(maxlen=50)
        time.sleep(0.1)

    def puppet_state_cb(self, data):
        self.qpos = data.position
        self.qvel = data.velocity
        self.effort = data.effort
        self.data = data
        if self.is_debug:
            self.joint_timestamps.append(time.time())

    def puppet_arm_commands_cb(self, data):
        self.arm_command = data.cmd
        if self.is_debug:
            self.arm_command_timestamps.append(time.time())

    def puppet_gripper_commands_cb(self, data):
        self.gripper_command = data.cmd
        if self.is_debug:
            self.gripper_command_timestamps.append(time.time())

    def print_diagnostics(self):
        def dt_helper(l):
            l = np.array(l)
            diff = l[1:] - l[:-1]
            return np.mean(diff)

        joint_freq = 1 / dt_helper(self.joint_timestamps)
        arm_command_freq = 1 / dt_helper(self.arm_command_timestamps)
        gripper_command_freq = 1 / dt_helper(self.gripper_command_timestamps)

        print(f"{joint_freq=:.2f}\n{arm_command_freq=:.2f}\n{gripper_command_freq=:.2f}\n")


def get_arm_joint_positions(bot):
    return bot.arm.core.joint_states.position[:6]


def get_arm_gripper_positions(bot):
    return bot.gripper.core.joint_states.position[6]


def move_arms(bot_list, target_pose_list, move_time=1):
    num_steps = int(move_time / constants.DT)
    curr_pose_list = [get_arm_joint_positions(bot) for bot in bot_list]
    traj_list = [
        np.linspace(curr_pose, target_pose, num_steps)
        for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
    ]
    for t in range(num_steps):
        for bot_id, bot in enumerate(bot_list):
            bot.arm.set_joint_positions(traj_list[bot_id][t], blocking=False)
        time.sleep(constants.DT)


def move_grippers(bot_list, target_pose_list, move_time):
    print(f"Moving grippers to {target_pose_list=}")
    gripper_command = JointSingleCommand(name="gripper")
    num_steps = int(move_time / constants.DT)
    curr_pose_list = [get_arm_gripper_positions(bot) for bot in bot_list]
    traj_list = [
        np.linspace(curr_pose, target_pose, num_steps)
        for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
    ]

    with open(f"/data/gripper_traj_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl", "a") as f:
        for t in range(num_steps):
            d = {}
            for bot_id, bot in enumerate(bot_list):
                gripper_command.cmd = traj_list[bot_id][t]
                bot.gripper.core.pub_single.publish(gripper_command)
                d[bot_id] = {"obs": get_arm_gripper_positions(bot), "act": traj_list[bot_id][t]}
            f.write(json.dumps(d) + "\n")
            time.sleep(constants.DT)


def setup_puppet_bot(bot):
    bot.dxl.robot_reboot_motors("single", "gripper", True)
    bot.dxl.robot_set_operating_modes("group", "arm", "position")
    bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
    torque_on(bot)


def setup_master_bot(bot):
    bot.dxl.robot_set_operating_modes("group", "arm", "pwm")
    bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
    torque_off(bot)


def set_standard_pid_gains(bot):
    bot.dxl.robot_set_motor_registers("group", "arm", "Position_P_Gain", 800)
    bot.dxl.robot_set_motor_registers("group", "arm", "Position_I_Gain", 0)


def set_low_pid_gains(bot):
    bot.dxl.robot_set_motor_registers("group", "arm", "Position_P_Gain", 100)
    bot.dxl.robot_set_motor_registers("group", "arm", "Position_I_Gain", 0)


def torque_off(bot):
    bot.dxl.robot_torque_enable("group", "arm", False)
    bot.dxl.robot_torque_enable("single", "gripper", False)


def torque_on(bot):
    bot.dxl.robot_torque_enable("group", "arm", True)
    bot.dxl.robot_torque_enable("single", "gripper", True)


# for DAgger
def sync_puppet_to_master(master_bot_left, master_bot_right, puppet_bot_left, puppet_bot_right):
    print("\nSyncing!")

    # activate master arms
    torque_on(master_bot_left)
    torque_on(master_bot_right)

    # get puppet arm positions
    puppet_left_qpos = get_arm_joint_positions(puppet_bot_left)
    puppet_right_qpos = get_arm_joint_positions(puppet_bot_right)

    # get puppet gripper positions
    puppet_left_gripper = get_arm_gripper_positions(puppet_bot_left)
    puppet_right_gripper = get_arm_gripper_positions(puppet_bot_right)

    # move master arms to puppet positions
    move_arms(
        [master_bot_left, master_bot_right],
        [puppet_left_qpos, puppet_right_qpos],
        move_time=1,
    )

    # move master grippers to puppet positions
    move_grippers(
        [master_bot_left, master_bot_right],
        [puppet_left_gripper, puppet_right_gripper],
        move_time=1,
    )
openpi/examples/aloha_real/video_display.py
ADDED
@@ -0,0 +1,36 @@
import matplotlib.pyplot as plt
import numpy as np
from openpi_client.runtime import subscriber as _subscriber
from typing_extensions import override


class VideoDisplay(_subscriber.Subscriber):
    """Displays video frames."""

    def __init__(self) -> None:
        self._ax: plt.Axes | None = None
        self._plt_img: plt.Image | None = None

    @override
    def on_episode_start(self) -> None:
        plt.ion()
        self._ax = plt.subplot()
        self._plt_img = None

    @override
    def on_step(self, observation: dict, action: dict) -> None:
        assert self._ax is not None

        im = observation["image"][0]  # [C, H, W]
        im = np.transpose(im, (1, 2, 0))  # [H, W, C]

        if self._plt_img is None:
            self._plt_img = self._ax.imshow(im)
        else:
            self._plt_img.set_data(im)
        plt.pause(0.001)

    @override
    def on_episode_end(self) -> None:
        plt.ioff()
        plt.close()
openpi/examples/aloha_sim/Dockerfile
ADDED
@@ -0,0 +1,41 @@
# Dockerfile for the Aloha simulation environment.

# Build the container:
# docker build . -t aloha_sim -f examples/aloha_sim/Dockerfile

# Run the container:
# docker run --rm -it --network=host -v .:/app aloha_sim /bin/bash

FROM python:3.11-slim@sha256:370c586a6ffc8c619e6d652f81c094b34b14b8f2fb9251f092de23f16e299b78
COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/

RUN apt-get update && \
    apt-get install -y \
    libosmesa6-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libglfw3-dev \
    libgles2-mesa-dev
ENV MUJOCO_GL=egl

WORKDIR /app

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Write the virtual environment outside of the project directory so it doesn't
# leak out of the container when we mount the application code.
ENV UV_PROJECT_ENVIRONMENT=/.venv

# Copy the requirements files so we can install dependencies.
# The rest of the project is mounted as a volume, so we don't need to rebuild on changes.
# This strategy is best for development-style usage.
COPY ./examples/aloha_sim/requirements.txt /tmp/requirements.txt
COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml

# Install python dependencies.
RUN uv venv --python 3.11.9 $UV_PROJECT_ENVIRONMENT
RUN uv pip sync /tmp/requirements.txt /tmp/openpi-client/pyproject.toml
ENV PYTHONPATH=/app:/app/src:/app/packages/openpi-client/src

CMD ["/bin/bash", "-c", "source /.venv/bin/activate && python examples/aloha_sim/main.py"]
openpi/examples/aloha_sim/README.md
ADDED
@@ -0,0 +1,36 @@
# Run Aloha Sim

## With Docker

```bash
export SERVER_ARGS="--env ALOHA_SIM"
docker compose -f examples/aloha_sim/compose.yml up --build
```

## Without Docker

Terminal window 1:

```bash
# Create virtual environment
uv venv --python 3.10 examples/aloha_sim/.venv
source examples/aloha_sim/.venv/bin/activate
uv pip sync examples/aloha_sim/requirements.txt
uv pip install -e packages/openpi-client

# Run the simulation
MUJOCO_GL=egl python examples/aloha_sim/main.py
```

Note: If you are seeing EGL errors, you may need to install the following dependencies:

```bash
sudo apt-get install -y libegl1-mesa-dev libgles2-mesa-dev
```

Terminal window 2:

```bash
# Run the server
uv run scripts/serve_policy.py --env ALOHA_SIM
```
openpi/examples/aloha_sim/compose.yml
ADDED
@@ -0,0 +1,42 @@
# Run with:
# docker compose -f examples/aloha_sim/compose.yml up --build
services:
  runtime:
    image: aloha_sim
    depends_on:
      - openpi_server
    build:
      context: ../..
      dockerfile: examples/aloha_sim/Dockerfile
    init: true
    tty: true
    network_mode: host
    privileged: true
    volumes:
      - $PWD:/app
      - ../../data:/data

  openpi_server:
    image: openpi_server
    build:
      context: ../..
      dockerfile: scripts/docker/serve_policy.Dockerfile
    init: true
    tty: true
    network_mode: host
    volumes:
      - $PWD:/app
      - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
    environment:
      - SERVER_ARGS
      - OPENPI_DATA_HOME=/openpi_assets
      - IS_DOCKER=true

    # Comment out this block if not running on a machine with GPUs.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
openpi/examples/aloha_sim/env.py
ADDED
@@ -0,0 +1,56 @@
import gym_aloha  # noqa: F401
import gymnasium
import numpy as np
from openpi_client import image_tools
from openpi_client.runtime import environment as _environment
from typing_extensions import override


class AlohaSimEnvironment(_environment.Environment):
    """An environment for an Aloha robot in simulation."""

    def __init__(self, task: str, obs_type: str = "pixels_agent_pos", seed: int = 0) -> None:
        np.random.seed(seed)
        self._rng = np.random.default_rng(seed)

        self._gym = gymnasium.make(task, obs_type=obs_type)

        self._last_obs = None
        self._done = True
        self._episode_reward = 0.0

    @override
    def reset(self) -> None:
        gym_obs, _ = self._gym.reset(seed=int(self._rng.integers(2**32 - 1)))
        self._last_obs = self._convert_observation(gym_obs)  # type: ignore
        self._done = False
        self._episode_reward = 0.0

    @override
    def is_episode_complete(self) -> bool:
        return self._done

    @override
    def get_observation(self) -> dict:
        if self._last_obs is None:
            raise RuntimeError("Observation is not set. Call reset() first.")

        return self._last_obs  # type: ignore

    @override
    def apply_action(self, action: dict) -> None:
        gym_obs, reward, terminated, truncated, info = self._gym.step(action["actions"])
        self._last_obs = self._convert_observation(gym_obs)  # type: ignore
        self._done = terminated or truncated
        self._episode_reward = max(self._episode_reward, reward)

    def _convert_observation(self, gym_obs: dict) -> dict:
        img = gym_obs["pixels"]["top"]
        img = image_tools.convert_to_uint8(image_tools.resize_with_pad(img, 224, 224))
        # Convert axis order from [H, W, C] --> [C, H, W]
        img = np.transpose(img, (2, 0, 1))

        return {
            "state": gym_obs["agent_pos"],
            "images": {"cam_high": img},
        }
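`_convert_observation` above resizes the top camera frame to 224x224 and flips it to channel-first before handing it to the policy. A numpy-only sketch of the axis flip (the padded resize itself is left to `image_tools`):

```python
import numpy as np

img_hwc = np.zeros((224, 224, 3), dtype=np.uint8)  # [H, W, C], as returned by resize_with_pad
img_chw = np.transpose(img_hwc, (2, 0, 1))         # [C, H, W], the layout the policy consumes

assert img_chw.shape == (3, 224, 224)
```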
openpi/examples/aloha_sim/main.py
ADDED
@@ -0,0 +1,55 @@
import dataclasses
import logging
import pathlib

import env as _env
from openpi_client import action_chunk_broker
from openpi_client import websocket_client_policy as _websocket_client_policy
from openpi_client.runtime import runtime as _runtime
from openpi_client.runtime.agents import policy_agent as _policy_agent
import saver as _saver
import tyro


@dataclasses.dataclass
class Args:
    out_dir: pathlib.Path = pathlib.Path("data/aloha_sim/videos")

    task: str = "gym_aloha/AlohaTransferCube-v0"
    seed: int = 0

    action_horizon: int = 10

    host: str = "0.0.0.0"
    port: int = 8000

    display: bool = False


def main(args: Args) -> None:
    runtime = _runtime.Runtime(
        environment=_env.AlohaSimEnvironment(
            task=args.task,
            seed=args.seed,
        ),
        agent=_policy_agent.PolicyAgent(
            policy=action_chunk_broker.ActionChunkBroker(
                policy=_websocket_client_policy.WebsocketClientPolicy(
                    host=args.host,
                    port=args.port,
                ),
                action_horizon=args.action_horizon,
            )
        ),
        subscribers=[
            _saver.VideoSaver(args.out_dir),
        ],
        max_hz=50,
    )

    runtime.run()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, force=True)
    tyro.cli(main)
openpi/examples/aloha_sim/requirements.in
ADDED
@@ -0,0 +1,8 @@
gym-aloha
imageio
matplotlib
msgpack
numpy>=1.22.4,<2.0.0
typing-extensions
tyro
websockets
openpi/examples/aloha_sim/requirements.txt
ADDED
@@ -0,0 +1,132 @@
# This file was autogenerated by uv via the following command:
#    uv pip compile examples/aloha_sim/requirements.in -o examples/aloha_sim/requirements.txt --python-version 3.10
absl-py==2.1.0
    # via
    #   dm-control
    #   dm-env
    #   labmaze
    #   mujoco
certifi==2024.8.30
    # via requests
charset-normalizer==3.4.0
    # via requests
cloudpickle==3.1.0
    # via gymnasium
contourpy==1.3.1
    # via matplotlib
cycler==0.12.1
    # via matplotlib
dm-control==1.0.14
    # via gym-aloha
dm-env==1.6
    # via dm-control
dm-tree==0.1.8
    # via
    #   dm-control
    #   dm-env
docstring-parser==0.16
    # via tyro
farama-notifications==0.0.4
    # via gymnasium
fonttools==4.55.2
    # via matplotlib
glfw==2.8.0
    # via
    #   dm-control
    #   mujoco
gym-aloha==0.1.1
    # via -r examples/aloha_sim/requirements.in
gymnasium==1.0.0
    # via gym-aloha
idna==3.10
    # via requests
imageio==2.36.1
    # via
    #   -r examples/aloha_sim/requirements.in
    #   gym-aloha
imageio-ffmpeg==0.5.1
    # via imageio
kiwisolver==1.4.7
    # via matplotlib
labmaze==1.0.6
    # via dm-control
lxml==5.3.0
    # via dm-control
markdown-it-py==3.0.0
    # via rich
matplotlib==3.9.3
    # via -r examples/aloha_sim/requirements.in
mdurl==0.1.2
    # via markdown-it-py
msgpack==1.1.0
    # via -r examples/aloha_sim/requirements.in
mujoco==2.3.7
    # via
    #   dm-control
    #   gym-aloha
numpy==1.26.4
    # via
    #   -r examples/aloha_sim/requirements.in
    #   contourpy
    #   dm-control
    #   dm-env
    #   gymnasium
    #   imageio
    #   labmaze
    #   matplotlib
    #   mujoco
    #   scipy
packaging==24.2
    # via matplotlib
pillow==11.0.0
    # via
    #   imageio
    #   matplotlib
protobuf==5.29.1
    # via dm-control
psutil==6.1.0
    # via imageio
pygments==2.18.0
    # via rich
pyopengl==3.1.7
    # via
    #   dm-control
    #   mujoco
pyparsing==3.2.0
    # via
    #   dm-control
    #   matplotlib
python-dateutil==2.9.0.post0
    # via matplotlib
requests==2.32.3
    # via dm-control
rich==13.9.4
    # via tyro
scipy==1.14.1
    # via dm-control
setuptools==75.6.0
    # via
    #   dm-control
    #   imageio-ffmpeg
    #   labmaze
shtab==1.7.1
    # via tyro
six==1.17.0
    # via python-dateutil
tqdm==4.67.1
    # via dm-control
typeguard==4.4.1
    # via tyro
typing-extensions==4.12.2
    # via
    #   -r examples/aloha_sim/requirements.in
    #   gymnasium
    #   rich
    #   typeguard
    #   tyro
tyro==0.9.2
    # via -r examples/aloha_sim/requirements.in
urllib3==2.2.3
    # via requests
websockets==14.1
    # via -r examples/aloha_sim/requirements.in
openpi/examples/aloha_sim/saver.py
ADDED
@@ -0,0 +1,40 @@
import logging
import pathlib

import imageio
import numpy as np
from openpi_client.runtime import subscriber as _subscriber
from typing_extensions import override


class VideoSaver(_subscriber.Subscriber):
    """Saves episode data."""

    def __init__(self, out_dir: pathlib.Path, subsample: int = 1) -> None:
        out_dir.mkdir(parents=True, exist_ok=True)
        self._out_dir = out_dir
        self._images: list[np.ndarray] = []
        self._subsample = subsample

    @override
    def on_episode_start(self) -> None:
        self._images = []

    @override
    def on_step(self, observation: dict, action: dict) -> None:
        im = observation["images"]["cam_high"]  # [C, H, W]
        im = np.transpose(im, (1, 2, 0))  # [H, W, C]
        self._images.append(im)

    @override
    def on_episode_end(self) -> None:
        existing = list(self._out_dir.glob("out_[0-9]*.mp4"))
        next_idx = max([int(p.stem.split("_")[1]) for p in existing], default=-1) + 1
        out_path = self._out_dir / f"out_{next_idx}.mp4"

        logging.info(f"Saving video to {out_path}")
        imageio.mimwrite(
            out_path,
            [np.asarray(x) for x in self._images[:: self._subsample]],
            fps=50 // max(1, self._subsample),
        )
openpi/examples/convert_jax_model_to_pytorch.py
ADDED
@@ -0,0 +1,587 @@
+#!/usr/bin/env python3
+"""
+Load a JAX model and print all parameter keys, with optional conversion to PyTorch.
+
+This script loads a JAX model checkpoint using orbax and can either:
+1. Print out all the parameter keys in a hierarchical structure for inspection
+2. Convert the JAX model to PyTorch format using our PI0Pytorch model
+
+Usage:
+    # Just inspect keys:
+    python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --config_name <config_name> --inspect_only
+
+    # Convert to PyTorch:
+    python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --config_name <config_name> --output_path /path/to/output
+
+Example:
+    # pi0_droid
+    python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid --config_name pi0_droid --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid_pytorch
+
+    # pi0_aloha_sim
+    python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_aloha_sim --config_name pi0_aloha_sim --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_aloha_sim_pytorch
+
+    # pi05_droid
+    python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi05_droid --config_name pi05_droid --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi05_droid_pytorch
+"""
+
+import json
+import os
+import pathlib
+import shutil
+from typing import Literal
+
+from flax.nnx import traversals
+import numpy as np
+import orbax.checkpoint as ocp
+import safetensors.torch
+import torch
+import tyro
+
+import openpi.models.gemma
+import openpi.models.model
+import openpi.models.pi0_config
+import openpi.models_pytorch.pi0_pytorch
+from openpi.training import utils
+import openpi.training.config as _config
+
+
+def slice_paligemma_state_dict(state_dict, config):
+    """Convert PaliGemma JAX parameters to PyTorch format."""
+    suffix = "/value" if "img/embedding/kernel/value" in state_dict else ""
+
+    # patch embeddings
+    jax_key = f"img/embedding/kernel{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key).transpose(3, 2, 0, 1)
+
+    jax_key = f"img/embedding/bias{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias"
+    state_dict[pytorch_key] = state_dict.pop(jax_key)
+
+    # positional embeddings
+    jax_key = f"img/pos_embedding{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.position_embedding.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key).reshape(-1, config.vision_config.hidden_size)
+
+    # extract vision layers to be sliced at index 0. There are 27 layers in the base model.
+    encoderblock_layernorm0_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_0/scale{suffix}")
+    encoderblock_layernorm0_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_0/bias{suffix}")
+    encoderblock_layernorm1_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/scale{suffix}")
+    encoderblock_layernorm1_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/bias{suffix}")
+
+    encoderblock_mlp_dense0_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/kernel{suffix}")
+    encoderblock_mlp_dense0_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/bias{suffix}")
+    encoderblock_mlp_dense1_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/kernel{suffix}")
+    encoderblock_mlp_dense1_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/bias{suffix}")
+
+    encoderblock_attention_0_key_kernel = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/kernel{suffix}"
+    )
+    encoderblock_attention_0_key_bias = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/bias{suffix}"
+    )
+    encoderblock_attention_0_value_kernel = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/kernel{suffix}"
+    )
+    encoderblock_attention_0_value_bias = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/bias{suffix}"
+    )
+    encoderblock_attention_0_query_kernel = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/kernel{suffix}"
+    )
+    encoderblock_attention_0_query_bias = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/bias{suffix}"
+    )
+    encoderblock_attention_0_out_kernel = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/kernel{suffix}"
+    )
+    encoderblock_attention_0_out_bias = state_dict.pop(
+        f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/bias{suffix}"
+    )
+
+    for i in range(config.vision_config.num_hidden_layers):
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.weight"
+        ] = encoderblock_layernorm0_scale[i].transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.bias"
+        ] = encoderblock_layernorm0_bias[i]
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.weight"
+        ] = encoderblock_layernorm1_scale[i].transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.bias"
+        ] = encoderblock_layernorm1_bias[i]
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.weight"
+        ] = encoderblock_mlp_dense0_kernel[i].transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.bias"
+        ] = encoderblock_mlp_dense0_bias[i]
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.weight"
+        ] = encoderblock_mlp_dense1_kernel[i].transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.bias"
+        ] = encoderblock_mlp_dense1_bias[i]
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"
+        ] = encoderblock_attention_0_key_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"
+        ] = encoderblock_attention_0_key_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"
+        ] = encoderblock_attention_0_value_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"
+        ] = encoderblock_attention_0_value_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"
+        ] = encoderblock_attention_0_query_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"
+        ] = encoderblock_attention_0_query_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.weight"
+        ] = encoderblock_attention_0_out_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.bias"
+        ] = encoderblock_attention_0_out_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
+
+    jax_key = f"img/Transformer/encoder_norm/scale{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
+
+    jax_key = f"img/Transformer/encoder_norm/bias{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.bias"
+    state_dict[pytorch_key] = state_dict.pop(jax_key)
+
+    # multimodal projector
+    jax_key = f"img/head/kernel{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
+
+    jax_key = f"img/head/bias{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.bias"
+    state_dict[pytorch_key] = state_dict.pop(jax_key)
+
+    # text decoder (gemma)
+    jax_key = f"llm/embedder/input_embedding{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key)
+
+    # pop the einsum attention + mlp representations
+    llm_attention_attn_vec_einsum = state_dict.pop(f"llm/layers/attn/attn_vec_einsum/w{suffix}")
+    llm_attention_kv_einsum = state_dict.pop(f"llm/layers/attn/kv_einsum/w{suffix}")
+    llm_attention_q_einsum = state_dict.pop(f"llm/layers/attn/q_einsum/w{suffix}")
+
+    llm_mlp_gating_einsum = state_dict.pop(f"llm/layers/mlp/gating_einsum{suffix}")
+    llm_mlp_linear = state_dict.pop(f"llm/layers/mlp/linear{suffix}")
+
+    llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm/scale{suffix}")
+    llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm/scale{suffix}")
+
+    for i in range(config.text_config.num_hidden_layers):
+        q_proj_weight_reshaped = (
+            llm_attention_q_einsum[i]
+            .transpose(0, 2, 1)
+            .reshape(
+                config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
+            )
+        )
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.q_proj.weight"] = (
+            q_proj_weight_reshaped
+        )
+
+        k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.k_proj.weight"] = (
+            k_proj_weight_reshaped
+        )
+        v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.v_proj.weight"] = (
+            v_proj_weight_reshaped
+        )
+
+        o_proj_weight_reshaped = (
+            llm_attention_attn_vec_einsum[i]
+            .transpose(2, 0, 1)
+            .reshape(
+                config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
+            )
+        )
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.o_proj.weight"] = (
+            o_proj_weight_reshaped
+        )
+
+        gate_proj_weight = llm_mlp_gating_einsum[i, 0]
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.gate_proj.weight"] = (
+            gate_proj_weight.transpose()
+        )
+        up_proj_weight = llm_mlp_gating_einsum[i, 1]
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.up_proj.weight"] = (
+            up_proj_weight.transpose()
+        )
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.down_proj.weight"] = (
+            llm_mlp_linear[i].transpose()
+        )
+        state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.input_layernorm.weight"] = (
+            llm_input_layernorm[i]
+        )
+        state_dict[
+            f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.post_attention_layernorm.weight"
+        ] = llm_post_attention_layernorm[i]
+
+    jax_key = f"llm/final_norm/scale{suffix}"
+    pytorch_key = "paligemma_with_expert.paligemma.model.language_model.norm.weight"
+    state_dict[pytorch_key] = state_dict.pop(jax_key)
+
+    expert_dict = {}
+    final_state_dict = {}
+
+    # Expert-related keys to extract (including pi05 Dense layer parameters)
+    expert_keys = [
+        f"llm/final_norm_1/scale{suffix}",
+        f"llm/final_norm_1/Dense_0/bias{suffix}",
+        f"llm/final_norm_1/Dense_0/kernel{suffix}",
+        f"llm/layers/attn/attn_vec_einsum_1/w{suffix}",
+        f"llm/layers/attn/kv_einsum_1/w{suffix}",
+        f"llm/layers/attn/q_einsum_1/w{suffix}",
+        f"llm/layers/mlp_1/gating_einsum{suffix}",
+        f"llm/layers/mlp_1/linear{suffix}",
+        f"llm/layers/pre_attention_norm_1/scale{suffix}",
+        f"llm/layers/pre_attention_norm_1/Dense_0/bias{suffix}",
+        f"llm/layers/pre_attention_norm_1/Dense_0/kernel{suffix}",
+        f"llm/layers/pre_ffw_norm_1/scale{suffix}",
+        f"llm/layers/pre_ffw_norm_1/Dense_0/bias{suffix}",
+        f"llm/layers/pre_ffw_norm_1/Dense_0/kernel{suffix}",
+    ]
+
+    for key, value in state_dict.items():
+        if key not in expert_keys:
+            final_state_dict[key] = torch.from_numpy(value)
+        else:
+            expert_dict[key] = value
+
+    return final_state_dict, expert_dict
+
+
+def slice_gemma_state_dict(state_dict, config, *, num_expert, checkpoint_dir, pi05):
+    """Convert Gemma JAX parameters to PyTorch format."""
+    # Add missing attributes to config if they don't exist
+    if not hasattr(config, "vocab_size"):
+        config.vocab_size = 257152  # PALIGEMMA_VOCAB_SIZE
+    if not hasattr(config, "hidden_size"):
+        config.hidden_size = config.width
+    if not hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = config.depth
+    if not hasattr(config, "num_attention_heads"):
+        config.num_attention_heads = config.num_heads
+
+    suffix = "/value" if f"llm/layers/attn/attn_vec_einsum_{num_expert}/w/value" in state_dict else ""
+
+    llm_attention_attn_vec_einsum = state_dict.pop(f"llm/layers/attn/attn_vec_einsum_{num_expert}/w{suffix}")
+    llm_attention_kv_einsum = state_dict.pop(f"llm/layers/attn/kv_einsum_{num_expert}/w{suffix}")
+    llm_attention_q_einsum = state_dict.pop(f"llm/layers/attn/q_einsum_{num_expert}/w{suffix}")
+
+    llm_mlp_gating_einsum = state_dict.pop(f"llm/layers/mlp_{num_expert}/gating_einsum{suffix}")
+    llm_mlp_linear = state_dict.pop(f"llm/layers/mlp_{num_expert}/linear{suffix}")
+
+    # Check if we have Dense layers (for pi05/adaptive normalization) or scale layers (for regular pi0)
+    if "pi05" in checkpoint_dir:
+        # Pi05 with adaptive normalization
+        llm_input_layernorm_bias = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/bias{suffix}")
+        llm_post_attention_layernorm_bias = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/bias{suffix}")
+        llm_input_layernorm_kernel = state_dict.pop(
+            f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/kernel{suffix}"
+        )
+        llm_post_attention_layernorm_kernel = state_dict.pop(
+            f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/kernel{suffix}"
+        )
+    else:
+        # Regular pi0 with standard RMSNorm
+        llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/scale{suffix}")
+        llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/scale{suffix}")
+
+    for i in range(config.num_hidden_layers):
+        q_proj_weight_reshaped = (
+            llm_attention_q_einsum[i]
+            .transpose(0, 2, 1)
+            .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
+        )
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.q_proj.weight"] = (
+            q_proj_weight_reshaped
+        )
+
+        k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.k_proj.weight"] = (
+            k_proj_weight_reshaped
+        )
+        v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.v_proj.weight"] = (
+            v_proj_weight_reshaped
+        )
+
+        o_proj_weight_reshaped = (
+            llm_attention_attn_vec_einsum[i]
+            .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
+            .transpose(1, 0)
+        )
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.o_proj.weight"] = (
+            o_proj_weight_reshaped
+        )
+
+        gate_proj_weight = llm_mlp_gating_einsum[i, 0]
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.gate_proj.weight"] = (
+            gate_proj_weight.transpose()
+        )
+        up_proj_weight = llm_mlp_gating_einsum[i, 1]
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.up_proj.weight"] = (
+            up_proj_weight.transpose()
+        )
+        state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.down_proj.weight"] = llm_mlp_linear[
+            i
+        ].transpose()
+
+        if "pi05" in checkpoint_dir:
+            # Pi05 with adaptive normalization - use Dense layer parameters directly
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.bias"] = (
+                llm_input_layernorm_bias[i]
+            )
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.bias"] = (
+                llm_post_attention_layernorm_bias[i]
+            )
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.weight"] = (
+                llm_input_layernorm_kernel[i].transpose()
+            )
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.weight"] = (
+                llm_post_attention_layernorm_kernel[i].transpose()
+            )
+        else:
+            # Regular pi0 with standard RMSNorm
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.weight"] = (
+                llm_input_layernorm[i]
+            )
+            state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.weight"] = (
+                llm_post_attention_layernorm[i]
+            )
+
+    # Handle final norm layer
+    if "pi05" in checkpoint_dir:
+        # Pi05 with adaptive normalization - use Dense layer parameters directly
+        final_norm_bias = state_dict.pop(f"llm/final_norm_{num_expert}/Dense_0/bias{suffix}")
+        final_norm_kernel = state_dict.pop(f"llm/final_norm_{num_expert}/Dense_0/kernel{suffix}")
+        state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.bias"] = final_norm_bias
+        state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.weight"] = final_norm_kernel.transpose()
+    else:
+        # Regular pi0 with standard RMSNorm
+        state_dict["paligemma_with_expert.gemma_expert.model.norm.weight"] = state_dict.pop(
+            f"llm/final_norm_{num_expert}/scale{suffix}"
+        )
+
+    # state_dict["paligemma_with_expert.gemma_expert.lm_head.weight"] = embedding_vector  # weights are tied.
+
+    final_state_dict = {}
+    for key, value in state_dict.items():
+        if not isinstance(value, torch.Tensor):
+            final_state_dict[key] = torch.from_numpy(value)
+        else:
+            final_state_dict[key] = value
+
+    return final_state_dict
+
+
+def slice_initial_orbax_checkpoint(checkpoint_dir: str, restore_precision: str | None = None):
+    """Load and process params by restoring via JAX model loader first.
+
+    This respects dtype conversions that occur during model restore.
+    """
+    # Use repository restore utility to load a pure dict of params (value suffix removed)
+    params = openpi.models.model.restore_params(
+        f"{checkpoint_dir}/params/", restore_type=np.ndarray, dtype=restore_precision
+    )
+
+    return {"paligemma_params": traversals.flatten_mapping(params["PaliGemma"], sep="/"), "projection_params": params}
+
+
+def load_jax_model_and_print_keys(checkpoint_dir: str):
+    """
+    Load JAX model from checkpoint and print all parameter keys.
+
+    Args:
+        checkpoint_dir: Path to the checkpoint directory
+    """
+    checkpoint_dir = os.path.abspath(checkpoint_dir) if not checkpoint_dir.startswith("gs://") else checkpoint_dir
+    # Initialize checkpointer
+    checkpointer = ocp.PyTreeCheckpointer()
+    metadata = checkpointer.metadata(f"{checkpoint_dir}/params")
+    print(utils.array_tree_to_info(metadata))
+
+
+def convert_pi0_checkpoint(
+    checkpoint_dir: str, precision: str, output_path: str, model_config: openpi.models.pi0_config.Pi0Config
+):
+    """
+    Convert PI0 JAX checkpoint to PyTorch format.
+
+    Args:
+        checkpoint_dir: Path to the JAX checkpoint
+        precision: Model precision (float32, bfloat16, float16)
+        output_path: Path to save the converted PyTorch model
+        model_config: Model config
+    """
+    print(f"Converting PI0 checkpoint from {checkpoint_dir} to {output_path}")
+    print(f"Model config: {model_config}")
+
+    # Break down orbax ckpts by restoring via JAX to respect dtype
+    initial_params = slice_initial_orbax_checkpoint(checkpoint_dir=checkpoint_dir, restore_precision="float32")
+
+    # Process projection params
+    if model_config.pi05:
+        keys = [
+            "action_in_proj",
+            "action_out_proj",
+            "time_mlp_in",
+            "time_mlp_out",
+        ]
+    else:
+        keys = [
+            "state_proj",
+            "action_in_proj",
+            "action_out_proj",
+            "action_time_mlp_in",
+            "action_time_mlp_out",
+        ]
+
+    projection_params = {}
+    for key in keys:
+        kernel_params = initial_params["projection_params"][key]["kernel"]
+        bias_params = initial_params["projection_params"][key]["bias"]
+        if isinstance(kernel_params, dict):
+            weight = kernel_params["value"]
+            bias = bias_params["value"]
+        else:
+            weight = kernel_params
+            bias = bias_params
+
+        pytorch_weight_key = f"{key}.weight"
+        pytorch_bias_key = f"{key}.bias"
+
+        projection_params[pytorch_weight_key] = torch.from_numpy(np.array(weight)).T
+        projection_params[pytorch_bias_key] = torch.from_numpy(np.array(bias))
+
+    # Create configs based on checkpoint path
+    # All models use the same PaliGemma config structure
+    class PaliGemmaConfig:
+        def __init__(self):
+            self.vision_config = type(
+                "obj",
+                (object,),
+                {
+                    "hidden_size": 1152,
+                    "num_hidden_layers": 27,
+                    "num_attention_heads": 16,
+                    "intermediate_size": 4304,
+                    "patch_size": 14,
+                    "projection_dim": 2048,
+                },
+            )()
+            self.text_config = type(
+                "obj",
+                (object,),
+                {
+                    "hidden_size": 2048,
+                    "num_hidden_layers": 18,
+                    "num_attention_heads": 8,
+                    "head_dim": 256,
+                    "intermediate_size": 16384,
+                },
+            )()
+
+    paligemma_config = PaliGemmaConfig()
+    action_expert_config = openpi.models.gemma.get_config("gemma_300m")
+
+    # Process PaliGemma weights
+    paligemma_params, expert_params = slice_paligemma_state_dict(initial_params["paligemma_params"], paligemma_config)
+
+    # Process Gemma weights from expert_params
+    gemma_params = slice_gemma_state_dict(
+        expert_params, action_expert_config, num_expert=1, checkpoint_dir=checkpoint_dir, pi05=model_config.pi05
+    )
+
+    # Instantiate model
+    pi0_model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_config)
+
+    # Combine all parameters (no prefix needed for our model structure)
+    all_params = {**paligemma_params, **gemma_params, **projection_params}
+
+    # Load state dict
+    pi0_model.load_state_dict(all_params, strict=False)
+
+    if precision == "float32":
+        pi0_model = pi0_model.to(torch.float32)
+    elif precision == "bfloat16":
+        pi0_model = pi0_model.to(torch.bfloat16)
+    else:
+        raise ValueError(f"Invalid precision: {precision}")
+
+    # Save the converted model using safetensors
+    os.makedirs(output_path, exist_ok=True)
+
+    # Save model weights as SafeTensors using save_model to handle tied weights
+    safetensors.torch.save_model(pi0_model, os.path.join(output_path, "model.safetensors"))
+
+    # Copy assets folder if it exists
+    assets_source = pathlib.Path(checkpoint_dir).parent / "assets"
+    if assets_source.exists():
+        assets_dest = pathlib.Path(output_path) / "assets"
+        if assets_dest.exists():
+            shutil.rmtree(assets_dest)
+        shutil.copytree(assets_source, assets_dest)
+
+    # Save config as JSON for reference
+    config_dict = {
+        "action_dim": model_config.action_dim,
+        "action_horizon": model_config.action_horizon,
+        "paligemma_variant": model_config.paligemma_variant,
+        "action_expert_variant": model_config.action_expert_variant,
+        "precision": precision,
+    }
+    with open(os.path.join(output_path, "config.json"), "w") as f:
+        json.dump(config_dict, f, indent=2)
+
+    print("Model conversion completed successfully!")
+    print(f"Model saved to {output_path}")
+
+
+def main(
+    checkpoint_dir: str,
+    config_name: str,
+    output_path: str | None = None,
+    precision: Literal["float32", "bfloat16", "float16"] = "bfloat16",
+    *,
+    inspect_only: bool = False,
+):
+    """Load JAX model and optionally convert to PyTorch.
+
+    Args:
+        checkpoint_dir: Path to the JAX checkpoint directory
+        config_name: Name of the training config used to build the model config
+        output_path: Path to save converted PyTorch model (required for conversion)
+        precision: Precision for model conversion
+        inspect_only: Only inspect parameter keys, don't convert
+    """
+    model_config = _config.get_config(config_name).model
+    if not isinstance(model_config, openpi.models.pi0_config.Pi0Config):
+        raise ValueError(f"Config {config_name} is not a Pi0Config")
+    if inspect_only:
+        load_jax_model_and_print_keys(checkpoint_dir)
+    else:
+        if not output_path:
+            print("Error: --output_path is required for conversion. Use --inspect_only to only view keys.")
+            return
+        convert_pi0_checkpoint(checkpoint_dir, precision, output_path, model_config)


+if __name__ == "__main__":
+    tyro.cli(main)
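After a conversion finishes, it can be worth sanity-checking the written `model.safetensors` without rebuilding the model. A minimal sketch using the safetensors `safe_open` API; the checkpoint path is the example output from the usage notes above, and the expected embedding shape follows the vocab size (257152) and text width (2048) hard-coded in the script:

```python
from safetensors import safe_open

# Example output of the pi0_droid conversion above; adjust to your --output_path.
path = "/home/user/.cache/openpi/openpi-assets/checkpoints/pi0_droid_pytorch/model.safetensors"

with safe_open(path, framework="pt") as f:
    names = list(f.keys())
    total = sum(f.get_tensor(name).numel() for name in names)
    print(f"{len(names)} tensors, {total / 1e9:.2f}B parameters")
    # Spot-check one of the keys produced by slice_paligemma_state_dict:
    t = f.get_tensor("paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight")
    print(t.shape, t.dtype)  # expect (257152, 2048) for the PaliGemma vocab/width
```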
openpi/examples/droid/README.md
ADDED
@@ -0,0 +1,84 @@
+# DROID Policies in openpi
+
+We offer instructions for:
+- [Running inference for our best $\pi_{0.5}$-DROID policy](./README.md#running-droid-inference)
+- [Running inference for other pre-trained DROID policies ($\pi_0$, $\pi_0$-FAST, ...)](./README.md#running-other-policies)
+- [Pre-training *generalist* policies on the *full* DROID dataset](./README_train.md#training-on-droid)
+- [Fine-tuning expert $\pi_{0.5}$ on your custom DROID dataset](./README_train.md#fine-tuning-on-custom-droid-datasets)
+
+## Running DROID Inference
+
+This example shows how to run the fine-tuned $\pi_{0.5}$-DROID model on the [DROID robot platform](https://github.com/droid-dataset/droid). Based on the [public RoboArena benchmark](https://robo-arena.github.io/leaderboard), this is currently our strongest generalist DROID policy.
+
+
+### Step 1: Start a policy server
+
+Since the DROID control laptop does not have a powerful GPU, we will start a remote policy server on a different machine with a more powerful GPU and then query it from the DROID control laptop during inference.
+
+1. On a machine with a powerful GPU (~NVIDIA 4090), clone and install the `openpi` repository following the instructions in the [README](https://github.com/Physical-Intelligence/openpi).
+2. Start the OpenPI server via the following command:
+
+```bash
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi05_droid --policy.dir=gs://openpi-assets/checkpoints/pi05_droid
+```
+
+You can also run the equivalent command below:
+
+```bash
+uv run scripts/serve_policy.py --env=DROID
+```
+
+### Step 2: Run the DROID robot
+
+1. Make sure you have the most recent version of the DROID package installed on both the DROID control laptop and the NUC.
+2. On the control laptop, activate your DROID conda environment.
+3. Clone the openpi repo and install the openpi client, which we will use to connect to the policy server (this has very few dependencies and should be very fast to install): with the DROID conda environment activated, run `cd $OPENPI_ROOT/packages/openpi-client && pip install -e .`.
+4. Install `tyro`, which we will use for command line parsing: `pip install tyro`.
+5. Copy the `main.py` file from this directory to the `$DROID_ROOT/scripts` directory.
+6. Replace the camera IDs in the `main.py` file with the IDs of your cameras (you can find the camera IDs by running `ZED_Explorer` in the command line, which will open a tool that shows you all connected cameras and their IDs -- you can also use it to make sure that the cameras are well-positioned to see the scene you want the robot to interact with).
+7. Run the `main.py` file. Make sure to point the remote host and port to the policy server. (To make sure the server machine is reachable from the DROID laptop, you can run `ping <server_ip>` from the DROID laptop.) Also make sure to specify the external camera to use for the policy (we only input one external camera); choose from ["left", "right"].
+
+```bash
+python3 scripts/main.py --remote_host=<server_ip> --remote_port=<server_port> --external_camera="left"
+```
+
+The script will ask you to enter a free-form language instruction for the robot to follow. Make sure to point the cameras at the scene you want the robot to interact with. You _do not_ need to carefully control camera angle, object positions, etc. The policy is fairly robust in our experience. Happy prompting!
+
+## Troubleshooting
+
+| Issue | Solution |
+|-------|----------|
+| Cannot reach policy server | Make sure the server is running and the IP and port are correct. You can check that the server machine is reachable by running `ping <server_ip>` from the DROID laptop. |
+| Cannot find cameras | Make sure the camera IDs are correct and that the cameras are connected to the DROID laptop. Sometimes replugging the cameras can help. You can check all connected cameras by running `ZED_Explorer` in the command line. |
+| Policy inference is slow / inconsistent | Try using a wired internet connection for the DROID laptop to reduce latency (0.5 - 1 sec latency per chunk is normal). |
+| Policy does not perform the task well | In our experiments, the policy could perform simple table top manipulation tasks (pick-and-place) across a wide range of environments, camera positions, and lighting conditions. If the policy does not perform the task well, you can try modifying the scene or object placement to make the task easier. Also make sure that the camera view you are passing to the policy can see all relevant objects in the scene (the policy is only conditioned on a single external camera + wrist camera; make sure you are feeding the desired camera to the policy). Use `ZED_Explorer` to verify this. Finally, the policy is far from perfect and will fail on more complex manipulation tasks, but it usually makes a decent effort. :) |
+
+
+## Running Other Policies
+
+We provide configs for running the baseline DROID policies from the [RoboArena](https://robo-arena.github.io/) paper. Simply run the commands below to start inference servers for the respective policies. Then follow the instructions above to run evaluation on the DROID robot.
+
+```
+# Trained from pi0-FAST, using FAST tokenizer
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_fast_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_fast_droid
+
+# Trained from pi0, using flow matching
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_droid
+
+# Trained from PaliGemma, using RT-2 / OpenVLA style binning tokenizer.
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_binning_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_binning_droid
+
+# Trained from PaliGemma, using FAST tokenizer (using universal FAST+ tokenizer).
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_fast_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_fast_droid
+
+# Trained from PaliGemma, using FAST tokenizer (tokenizer trained on DROID dataset).
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_fast_specialist_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_fast_specialist_droid
+
+# Trained from PaliGemma, using FSQ tokenizer.
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_vq_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_vq_droid
+
+# pi0-style diffusion / flow VLA, trained on DROID from PaliGemma.
+uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_diffusion_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_diffusion_droid
+```
+
+You can find the inference configs in [roboarena_config.py](../../src/openpi/training/misc/roboarena_config.py).
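Under the hood, `main.py` uses the openpi client to query the server over a websocket. A stripped-down sketch of that round trip follows; the observation keys and shapes here are illustrative assumptions (see `examples/droid/main.py` for the exact format the DROID policies expect):

```python
import numpy as np

from openpi_client import websocket_client_policy

# Connect to the policy server started in Step 1 (host/port are examples).
policy = websocket_client_policy.WebsocketClientPolicy(host="192.168.1.10", port=8000)

# Illustrative observation; key names and shapes are assumptions, not the exact schema.
observation = {
    "observation/exterior_image_1_left": np.zeros((224, 224, 3), dtype=np.uint8),
    "observation/wrist_image_left": np.zeros((224, 224, 3), dtype=np.uint8),
    "observation/joint_position": np.zeros(7, dtype=np.float32),
    "observation/gripper_position": np.zeros(1, dtype=np.float32),
    "prompt": "put the marker in the cup",
}
action_chunk = policy.infer(observation)["actions"]  # one chunk of future actions
```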
openpi/examples/droid/README_train.md
ADDED
@@ -0,0 +1,106 @@
+# Training on DROID
+
+Here we describe how to fine-tune the pi0.5 model on the *full* DROID dataset. This is an approximate open-source reproduction of the pi05-DROID training pipeline
+(with small differences in data loading and the action space used). For a tutorial on how to fine-tune your model with a smaller, custom dataset collected on the DROID platform, see below.
+
+In contrast to the rest of openpi, which uses LeRobot for data loading, we need to use RLDS as the data format for full DROID training (since at the moment LeRobot isn't scalable enough
+for larger datasets like DROID -- they are working on improving it though). Below, we provide instructions for updating your openpi environment for RLDS data loading and where to download the DROID dataset.
+
+## Install
+
+We need a few additional dependencies for RLDS data loading. Run:
+```bash
+uv sync --group rlds
+```
+
+## Download DROID dataset
+
+You can download the DROID dataset with the following command (after installing the `gsutil` google cloud CLI):
+```
+gsutil -m cp -r gs://gresearch/robotics/droid/1.0.1 <your_download_path>/droid/1.0.1
+```
+
+Note that downloading version 1.0.1 is important (not v1.0.0): it contains the complete set of language annotations (~75k episodes) while v1.0.0 only has annotations for 30k episodes. If for some reason you would like to use another version, modify the line `version="1.0.1"` in the `DroidRldsDataset` object [here](src/openpi/training/droid_rlds_dataset.py).
+
+You will need 1.8TB of disk storage to download the DROID RLDS dataset.
+
+## Run
+
+First, change the `rlds_data_dir` path in your `TrainConfig` to the directory that you downloaded the `droid` dataset into (see [src/openpi/training/config.py](src/openpi/training/config.py)).
+
+Then, compute normalization statistics (this will take ~10 minutes):
+```bash
+uv run --group rlds scripts/compute_norm_stats.py --config-name pi05_full_droid_finetune --max-frames 10_000_000
+```
+
+Run training:
+```bash
+XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 uv run --group rlds scripts/train.py pi05_full_droid_finetune --exp-name=my_experiment --overwrite
+```
+
+**Note**: The original pi0.5-DROID model was trained with joint velocity actions.
+Joint velocity actions are not compatible with simulated evaluation environments (they are much harder to simulate).
+Thus, we do not recommend training with joint velocity actions and instead use joint position actions here.
+
+
+## Compute Requirements
+
+Our DROID training config requires approximately 2 days on 8x H100 GPUs for convergence (100k iterations, bs256, approx. 1 epoch).
+If you start from PaliGemma instead of pi0 initialization, plan with ~5 days on 8x H100s (240k iterations, i.e. 3 epochs).
+
+We have experimented with LoRA for cheaper finetuning, but haven't found the policies to perform well so far.
+
+
+## Data Filtering
+
+Like any diverse real-robot dataset, the DROID dataset isn't perfectly "clean", and we have found data filtering to significantly improve policy performance. Concretely, the DROID dataset contains many *idle* timesteps in which the robot does not move (in part due to the VR teleoperation interface that was used during data collection; we will not go into too much detail here). Appropriate filtering of these idle transitions can improve policy performance.
+
+By default, our openpi training recipe implements the same idle filter used to train all pi-DROID models. We implement it by pre-computing which dataset indices to sample during training. You can check [compute_droid_nonidle_ranges.py](examples/droid/compute_droid_nonidle_ranges.py) for how we compute these indices. Roughly speaking, we filter any time steps for which the next chunk of actions would be largely idle. During training, our code automatically pulls our pre-computed list of indices from cloud storage and applies them. If you want to modify the idle filter / create your custom sampling logic, you can modify our script to generate a new index list and provide it via the `filter_dict_path="<path_to_filter_dict>"` argument in [src/openpi/training/config.py](src/openpi/training/config.py).
+
+**Note**: our list of filtering indices is only valid for the `droid/1.0.1` dataset mentioned in the download section above, and will not provide valid filtering for any other version of the DROID dataset, so make sure you download the dataset above! If you have a custom DROID version, you can rerun the [compute_droid_nonidle_ranges.py](examples/droid/compute_droid_nonidle_ranges.py) script to generate a new list of sampling indices.
+
+## RoboArena
+
+Consider submitting your DROID policies to the [RoboArena benchmark](https://robo-arena.github.io/), which allows you to evaluate your policies on diverse tasks & scenes, **in the real world**! :)
+
+If you have questions about RoboArena, please email [karl.pertsch@gmail.com](mailto:karl.pertsch@gmail.com).
+
+
+# Fine-Tuning on Custom DROID Datasets
+
+Here we describe how to fine-tune a model on a custom (smaller) dataset collected on the DROID platform. As for other datasets, we will first convert the custom DROID dataset to LeRobot and then fine-tune a model (pi05-droid) on it.
+
+Note: We use LeRobot here, since we assume the custom DROID fine-tuning dataset to be relatively small (<10s of hours). For larger datasets (like the full DROID dataset) we recommend using RLDS for its better efficiency (see the example above).
+
+
+## Step 1: Converting your custom DROID dataset to LeRobot
+
+We will use a small subset of the real DROID dataset for this example. This is a subset of just 30 demonstrations -- we assume that you will use your own dataset instead, but here is the command to download our subset (1.6GB):
+```
+gsutil -m cp -r gs://gresearch/robotics/droid_raw/1.0.1/IRIS/success/2023-12-04 <your_target_path>
+```
+
+We will also download the language annotations for the DROID dataset so we can pair our demonstrations with language instructions. Again, for your own data you can manually enter your language instructions and don't need to download our annotations. To download the DROID language annotations (12MB), run:
+```
+gsutil -m cp -r gs://gresearch/robotics/droid_raw/1.0.1/aggregated-annotations-030724.json <your_target_dir>
+```
+
+For your own dataset, make sure that each episode's directory contains a folder called `recordings/MP4` -- if not, you need to first run the MP4 video extraction (from SVO files) using the script [here](https://github.com/droid-dataset/droid/blob/main/scripts/convert/svo_to_mp4.py).
+
+Now, we will use the `convert_droid_data_to_lerobot.py` script to create a LeRobot version of this dataset (takes <5min for the 30 demonstrations):
+```
+uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir <your_target_path>
+```
+
+## Step 2: Run fine-tuning with your custom dataset
+
+Now we can run fine-tuning with our converted custom dataset. We provide an example config for fine-tuning `pi05_droid` on the custom dataset we created.
+You can easily modify the config to work with other base models, or to use your custom DROID dataset, in `config.py` (search for `pi05_droid_finetune`).
+
+To launch training:
+```
+uv run scripts/train.py pi05_droid_finetune --exp-name=my_experiment --overwrite
+```
+
+Once trained, you can follow the instructions in [`examples/droid/README.md`](examples/droid/README.md) to serve the policy and run it on the robot.
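For reference, the filter dict passed via `filter_dict_path` is a JSON mapping from an episode key to the step ranges that survived idle filtering. The key/value format below follows `compute_droid_nonidle_ranges.py` (shown next); the file name and episode key are made-up examples:

```python
import json

# Example path; this is the JSON written by compute_droid_nonidle_ranges.py.
with open("droid_keep_ranges.json") as f:
    keep_ranges = json.load(f)

# Keys are "<recording_folderpath>--<file_path>"; values are [start, end) step ranges.
key = "recordings/ep_0001--trajectories/ep_0001.h5"  # hypothetical episode key
for start, end in keep_ranges.get(key, []):
    print(f"training may sample steps in [{start}, {end})")
```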
openpi/examples/droid/compute_droid_nonidle_ranges.py
ADDED
@@ -0,0 +1,103 @@
| 1 |
+
"""
|
| 2 |
+
Iterates through the DROID dataset and creates a json mapping from episode unique IDs to ranges of time steps
|
| 3 |
+
that should be sampled during training (all others are filtered out).
|
| 4 |
+
|
| 5 |
+
Filtering logic:
|
| 6 |
+
We look for ranges of consecutive steps that contain at most min_idle_len consecutive idle frames
|
| 7 |
+
(default to 7 -- as most DROID action-chunking policies run the first 8 actions generated in each chunk, filtering
|
| 8 |
+
this way means the policy will not get stuck outputting stationary actions). Additionally, we also only keep non-idle
|
| 9 |
+
ranges of length at least min_non_idle_len (default to 16 frames = ~1 second), while also removing the last
|
| 10 |
+
filter_last_n_in_ranges frames from the end of each range (as those all correspond to action chunks with many idle actions).
|
| 11 |
+
|
| 12 |
+
This leaves us with trajectory segments consisting of contiguous, significant movement. Training on this filtered set
|
| 13 |
+
yields policies that output fewer stationary actions (i.e., get "stuck" in states less).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
import tensorflow as tf
|
| 22 |
+
import tensorflow_datasets as tfds
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
|
| 25 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Set to the GPU you want to use, or leave empty for CPU

builder = tfds.builder_from_directory(
    # path to the `droid` directory (not its parent)
    builder_dir="<path_to_droid_dataset_tfds_files>",
)
ds = builder.as_dataset(split="train", shuffle_files=False)
ds = ds.apply(tf.data.experimental.ignore_errors())  # skip corrupted records instead of raising

keep_ranges_path = "<path_to_where_to_save_the_json>"

min_idle_len = 7  # If more than this number of consecutive idle frames, filter all of them out
min_non_idle_len = 16  # If fewer than this number of consecutive non-idle frames, filter all of them out
filter_last_n_in_ranges = 10  # When using a filter dict, remove this many frames from the end of each range

keep_ranges_map = {}
if Path(keep_ranges_path).exists():
    with Path(keep_ranges_path).open("r") as f:
        keep_ranges_map = json.load(f)
    print(f"Resuming from {len(keep_ranges_map)} episodes already processed")

for ep_idx, ep in enumerate(tqdm(ds)):
    recording_folderpath = ep["episode_metadata"]["recording_folderpath"].numpy().decode()
    file_path = ep["episode_metadata"]["file_path"].numpy().decode()

    key = f"{recording_folderpath}--{file_path}"
    if key in keep_ranges_map:
        continue

    joint_velocities = [step["action_dict"]["joint_velocity"].numpy() for step in ep["steps"]]
    joint_velocities = np.array(joint_velocities)

    # A frame is "idle" if the joint velocities barely changed relative to the previous frame
    is_idle_array = np.hstack(
        [np.array([False]), np.all(np.abs(joint_velocities[1:] - joint_velocities[:-1]) < 1e-3, axis=1)]
    )

    # Find which steps go from idle to non-idle and vice-versa
    is_idle_padded = np.concatenate(
        [[False], is_idle_array, [False]]
    )  # Pad with False so idle segments at the very first or last step are still detected

    is_idle_diff = np.diff(is_idle_padded.astype(int))
    is_idle_true_starts = np.where(is_idle_diff == 1)[0]  # +1 transitions --> non-idle to idle (idle segment starts)
    is_idle_true_ends = np.where(is_idle_diff == -1)[0]  # -1 transitions --> idle to non-idle (idle segment ends)

    # Find which steps correspond to idle segments of length at least min_idle_len
    true_segment_masks = (is_idle_true_ends - is_idle_true_starts) >= min_idle_len
    is_idle_true_starts = is_idle_true_starts[true_segment_masks]
    is_idle_true_ends = is_idle_true_ends[true_segment_masks]

    keep_mask = np.ones(len(joint_velocities), dtype=bool)
    for start, end in zip(is_idle_true_starts, is_idle_true_ends, strict=True):
        keep_mask[start:end] = False

    # Get all non-idle ranges of length at least min_non_idle_len.
    # Same logic as above, but on keep_mask, allowing us to filter out contiguous ranges of length < min_non_idle_len
    keep_padded = np.concatenate([[False], keep_mask, [False]])

    keep_diff = np.diff(keep_padded.astype(int))
    keep_true_starts = np.where(keep_diff == 1)[0]  # +1 transitions --> going from filtered-out to keep
    keep_true_ends = np.where(keep_diff == -1)[0]  # -1 transitions --> going from keep to filtered-out

    # Find which steps correspond to non-idle segments of length at least min_non_idle_len
    true_segment_masks = (keep_true_ends - keep_true_starts) >= min_non_idle_len
    keep_true_starts = keep_true_starts[true_segment_masks]
    keep_true_ends = keep_true_ends[true_segment_masks]

    # Add mapping from episode unique ID key to list of non-idle ranges to keep
    keep_ranges_map[key] = []
    for start, end in zip(keep_true_starts, keep_true_ends, strict=True):
        keep_ranges_map[key].append((int(start), int(end) - filter_last_n_in_ranges))

    # Periodically checkpoint the results so the script can resume after a crash
    if ep_idx % 1000 == 0:
        with Path(keep_ranges_path).open("w") as f:
            json.dump(keep_ranges_map, f)

print("Done!")
with Path(keep_ranges_path).open("w") as f:
    json.dump(keep_ranges_map, f)
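For reference, here is a minimal, self-contained illustration (with made-up values) of the padded np.diff trick the script above uses to turn a boolean mask into (start, end) segment ranges:

    import numpy as np

    mask = np.array([True, True, False, True, True, True, False])
    padded = np.concatenate([[False], mask, [False]]).astype(int)
    diff = np.diff(padded)
    starts = np.where(diff == 1)[0]  # index where each True run begins
    ends = np.where(diff == -1)[0]   # index one past where each True run ends
    print([(int(s), int(e)) for s, e in zip(starts, ends)])  # [(0, 2), (3, 6)]

Each (start, end) pair can then be length-filtered and used to slice the episode, exactly as the script does for idle and non-idle segments.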
openpi/examples/droid/convert_droid_data_to_lerobot.py
ADDED
@@ -0,0 +1,477 @@
"""
Minimal example script for converting a dataset collected on the DROID platform to LeRobot format.

Usage:
uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir /path/to/your/data

If you want to push your dataset to the Hugging Face Hub, you can use the following command:
uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir /path/to/your/data --push_to_hub

The resulting dataset will be saved to the $HF_LEROBOT_HOME directory.
"""

from collections import defaultdict
import copy
import glob
import json
from pathlib import Path
import shutil

import cv2
import h5py
from lerobot.common.datasets.lerobot_dataset import HF_LEROBOT_HOME
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
import numpy as np
from PIL import Image
from tqdm import tqdm
import tyro

REPO_NAME = "your_hf_username/my_droid_dataset"  # Name of the output dataset, also used for the Hugging Face Hub


def resize_image(image, size):
    image = Image.fromarray(image)
    return np.array(image.resize(size, resample=Image.BICUBIC))


def main(data_dir: str, *, push_to_hub: bool = False):
    # Clean up any existing dataset in the output directory
    output_path = HF_LEROBOT_HOME / REPO_NAME
    if output_path.exists():
        shutil.rmtree(output_path)
    data_dir = Path(data_dir)

    # Create LeRobot dataset, define features to store.
    # We will follow the DROID data naming conventions here.
    # LeRobot assumes that the dtype of image data is `image`.
    dataset = LeRobotDataset.create(
        repo_id=REPO_NAME,
        robot_type="panda",
        fps=15,  # DROID data is typically recorded at 15 fps
        features={
            # We call this "left" since we will only use the left stereo camera (following DROID RLDS convention)
            "exterior_image_1_left": {
                "dtype": "image",
                "shape": (180, 320, 3),  # This is the resolution used in the DROID RLDS dataset
                "names": ["height", "width", "channel"],
            },
            "exterior_image_2_left": {
                "dtype": "image",
                "shape": (180, 320, 3),
                "names": ["height", "width", "channel"],
            },
            "wrist_image_left": {
                "dtype": "image",
                "shape": (180, 320, 3),
                "names": ["height", "width", "channel"],
            },
            "joint_position": {
                "dtype": "float32",
                "shape": (7,),
                "names": ["joint_position"],
            },
            "gripper_position": {
                "dtype": "float32",
                "shape": (1,),
                "names": ["gripper_position"],
            },
            "actions": {
                "dtype": "float32",
                "shape": (8,),  # We will use joint *velocity* actions here (7D) + gripper position (1D)
                "names": ["actions"],
            },
        },
        image_writer_threads=10,
        image_writer_processes=5,
    )

    # Load language annotations.
    # Note: we load the DROID language annotations for this example, but you can manually define them for your own data.
    with (data_dir / "aggregated-annotations-030724.json").open() as f:
        language_annotations = json.load(f)

    # Loop over raw DROID fine-tuning datasets and write episodes to the LeRobot dataset.
    # We assume the following directory structure:
    # RAW_DROID_PATH/
    #   - <...>/
    #     - recordings/
    #       - MP4/
    #         - <camera_id>.mp4  # single-view video of left stereo pair camera
    #     - trajectory.h5
    #   - <...>/
    episode_paths = list(data_dir.glob("**/trajectory.h5"))
    print(f"Found {len(episode_paths)} episodes for conversion")

    # Loop over each episode and write it to the LeRobot dataset
    for episode_path in tqdm(episode_paths, desc="Converting episodes"):
        # Load raw data
        recording_folderpath = episode_path.parent / "recordings" / "MP4"
        trajectory = load_trajectory(str(episode_path), recording_folderpath=str(recording_folderpath))

        # To load the language instruction, we need to parse out the episode_id from the metadata file.
        # Again, you can modify this step for your own data, to load your own language instructions.
        metadata_filepath = next(iter(episode_path.parent.glob("metadata_*.json")))
        episode_id = metadata_filepath.name.split(".")[0].split("_")[-1]
        language_instruction = language_annotations.get(episode_id, {"language_instruction1": "Do something"})[
            "language_instruction1"
        ]
        print(f"Converting episode with language instruction: {language_instruction}")

        # Write to LeRobot dataset
        for step in trajectory:
            camera_type_dict = step["observation"]["camera_type"]
            wrist_ids = [k for k, v in camera_type_dict.items() if v == 0]
            exterior_ids = [k for k, v in camera_type_dict.items() if v != 0]
            dataset.add_frame(
                {
                    # Note: need to flip BGR --> RGB for loaded images
                    "exterior_image_1_left": resize_image(
                        step["observation"]["image"][exterior_ids[0]][..., ::-1], (320, 180)
                    ),
                    "exterior_image_2_left": resize_image(
                        step["observation"]["image"][exterior_ids[1]][..., ::-1], (320, 180)
                    ),
                    "wrist_image_left": resize_image(step["observation"]["image"][wrist_ids[0]][..., ::-1], (320, 180)),
                    "joint_position": np.asarray(
                        step["observation"]["robot_state"]["joint_positions"], dtype=np.float32
                    ),
                    "gripper_position": np.asarray(
                        step["observation"]["robot_state"]["gripper_position"][None], dtype=np.float32
                    ),
                    # Important: we use joint velocity actions here since pi05-droid was pre-trained on joint velocity actions
                    "actions": np.concatenate(
                        [step["action"]["joint_velocity"], step["action"]["gripper_position"][None]], dtype=np.float32
                    ),
                    "task": language_instruction,
                }
            )
        dataset.save_episode()

    # Optionally push to the Hugging Face Hub
    if push_to_hub:
        dataset.push_to_hub(
            tags=["droid", "panda", "rlds"],
            private=False,
            push_videos=True,
            license="apache-2.0",
        )


##########################################################################################################
################ The rest of this file are functions to parse the raw DROID data ########################
################ You don't need to worry about understanding this part ##################################
################ It was copied from here: https://github.com/JonathanYang0127/r2d2_rlds_dataset_builder/blob/parallel_convert/r2_d2/r2_d2.py
##########################################################################################################


camera_type_dict = {
    "hand_camera_id": 0,
    "varied_camera_1_id": 1,
    "varied_camera_2_id": 1,
}

camera_type_to_string_dict = {
    0: "hand_camera",
    1: "varied_camera",
    2: "fixed_camera",
}


def get_camera_type(cam_id):
    if cam_id not in camera_type_dict:
        return None
    type_int = camera_type_dict[cam_id]
    return camera_type_to_string_dict[type_int]


class MP4Reader:
    def __init__(self, filepath, serial_number):
        # Save Parameters #
        self.serial_number = serial_number
        self._index = 0

        # Open Video Reader #
        self._mp4_reader = cv2.VideoCapture(filepath)
        if not self._mp4_reader.isOpened():
            raise RuntimeError("Corrupted MP4 File")

    def set_reading_parameters(
        self,
        image=True,  # noqa: FBT002
        concatenate_images=False,  # noqa: FBT002
        resolution=(0, 0),
        resize_func=None,
    ):
        # Save Parameters #
        self.image = image
        self.concatenate_images = concatenate_images
        self.resolution = resolution
        self.resize_func = cv2.resize  # note: the resize_func argument is currently ignored
        self.skip_reading = not image

    def get_frame_resolution(self):
        width = self._mp4_reader.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = self._mp4_reader.get(cv2.CAP_PROP_FRAME_HEIGHT)
        return (width, height)

    def get_frame_count(self):
        if self.skip_reading:
            return 0
        return int(self._mp4_reader.get(cv2.CAP_PROP_FRAME_COUNT))

    def set_frame_index(self, index):
        if self.skip_reading:
            return

        if index < self._index:
            self._mp4_reader.set(cv2.CAP_PROP_POS_FRAMES, index - 1)
            self._index = index

        while self._index < index:
            self.read_camera(ignore_data=True)

    def _process_frame(self, frame):
        frame = copy.deepcopy(frame)
        if self.resolution == (0, 0):
            return frame
        return self.resize_func(frame, self.resolution)

    def read_camera(self, ignore_data=False, correct_timestamp=None):  # noqa: FBT002
        # Skip If Read Unnecessary #
        if self.skip_reading:
            return {}

        # Read Camera #
        success, frame = self._mp4_reader.read()

        self._index += 1
        if not success:
            return None
        if ignore_data:
            return None

        # Return Data #
        data_dict = {}

        if self.concatenate_images or "stereo" not in self.serial_number:
            data_dict["image"] = {self.serial_number: self._process_frame(frame)}
        else:
            # Stereo videos store the left and right views side by side in a single frame
            single_width = frame.shape[1] // 2
            data_dict["image"] = {
                self.serial_number + "_left": self._process_frame(frame[:, :single_width, :]),
                self.serial_number + "_right": self._process_frame(frame[:, single_width:, :]),
            }

        return data_dict

    def disable_camera(self):
        if hasattr(self, "_mp4_reader"):
            self._mp4_reader.release()


class RecordedMultiCameraWrapper:
    def __init__(self, recording_folderpath, camera_kwargs={}):  # noqa: B006
        # Save Camera Info #
        self.camera_kwargs = camera_kwargs

        # Open Camera Readers #
        mp4_filepaths = glob.glob(recording_folderpath + "/*.mp4")
        all_filepaths = mp4_filepaths

        self.camera_dict = {}
        for f in all_filepaths:
            serial_number = f.split("/")[-1][:-4]
            cam_type = get_camera_type(serial_number)
            camera_kwargs.get(cam_type, {})  # no-op in the original; per-camera kwargs are applied later in read_cameras

            if f.endswith(".mp4"):
                Reader = MP4Reader  # noqa: N806
            else:
                raise ValueError

            self.camera_dict[serial_number] = Reader(f, serial_number)

    def read_cameras(self, index=None, camera_type_dict={}, timestamp_dict={}):  # noqa: B006
        full_obs_dict = defaultdict(dict)

        # Read Cameras In Randomized Order #
        all_cam_ids = list(self.camera_dict.keys())
        # random.shuffle(all_cam_ids)

        for cam_id in all_cam_ids:
            if "stereo" in cam_id:
                continue
            try:
                cam_type = camera_type_dict[cam_id]
            except KeyError:
                print(f"{self.camera_dict} -- {camera_type_dict}")
                raise ValueError(f"Camera type {cam_id} not found in camera_type_dict")  # noqa: B904
            curr_cam_kwargs = self.camera_kwargs.get(cam_type, {})
            self.camera_dict[cam_id].set_reading_parameters(**curr_cam_kwargs)

            timestamp = timestamp_dict.get(cam_id + "_frame_received", None)
            if index is not None:
                self.camera_dict[cam_id].set_frame_index(index)

            data_dict = self.camera_dict[cam_id].read_camera(correct_timestamp=timestamp)

            # Process Returned Data #
            if data_dict is None:
                return None
            for key in data_dict:
                full_obs_dict[key].update(data_dict[key])

        return full_obs_dict


def get_hdf5_length(hdf5_file, keys_to_ignore=[]):  # noqa: B006
    length = None

    for key in hdf5_file:
        if key in keys_to_ignore:
            continue

        curr_data = hdf5_file[key]
        if isinstance(curr_data, h5py.Group):
            curr_length = get_hdf5_length(curr_data, keys_to_ignore=keys_to_ignore)
        elif isinstance(curr_data, h5py.Dataset):
            curr_length = len(curr_data)
        else:
            raise ValueError

        if length is None:
            length = curr_length
        assert curr_length == length

    return length


def load_hdf5_to_dict(hdf5_file, index, keys_to_ignore=[]):  # noqa: B006
    data_dict = {}

    for key in hdf5_file:
        if key in keys_to_ignore:
            continue

        curr_data = hdf5_file[key]
        if isinstance(curr_data, h5py.Group):
            data_dict[key] = load_hdf5_to_dict(curr_data, index, keys_to_ignore=keys_to_ignore)
        elif isinstance(curr_data, h5py.Dataset):
            data_dict[key] = curr_data[index]
        else:
            raise ValueError

    return data_dict


class TrajectoryReader:
    def __init__(self, filepath, read_images=True):  # noqa: FBT002
        self._hdf5_file = h5py.File(filepath, "r")
        is_video_folder = "observations/videos" in self._hdf5_file
        self._read_images = read_images and is_video_folder
        self._length = get_hdf5_length(self._hdf5_file)
        self._video_readers = {}
        self._index = 0

    def length(self):
        return self._length

    def read_timestep(self, index=None, keys_to_ignore=[]):  # noqa: B006
        # Make Sure We Read Within Range #
        if index is None:
            index = self._index
        else:
            assert not self._read_images
            self._index = index
        assert index < self._length

        # Load Low Dimensional Data #
        keys_to_ignore = [*keys_to_ignore.copy(), "videos"]
        timestep = load_hdf5_to_dict(self._hdf5_file, self._index, keys_to_ignore=keys_to_ignore)

        # Increment Read Index #
        self._index += 1

        # Return Timestep #
        return timestep

    def close(self):
        self._hdf5_file.close()


def load_trajectory(
    filepath=None,
    read_cameras=True,  # noqa: FBT002
    recording_folderpath=None,
    camera_kwargs={},  # noqa: B006
    remove_skipped_steps=False,  # noqa: FBT002
    num_samples_per_traj=None,
    num_samples_per_traj_coeff=1.5,
):
    read_recording_folderpath = read_cameras and (recording_folderpath is not None)

    traj_reader = TrajectoryReader(filepath)
    if read_recording_folderpath:
        camera_reader = RecordedMultiCameraWrapper(recording_folderpath, camera_kwargs)

    horizon = traj_reader.length()
    timestep_list = []

    # Choose Timesteps To Save #
    if num_samples_per_traj:
        num_to_save = num_samples_per_traj
        if remove_skipped_steps:
            num_to_save = int(num_to_save * num_samples_per_traj_coeff)
        max_size = min(num_to_save, horizon)
        indices_to_save = np.sort(np.random.choice(horizon, size=max_size, replace=False))
    else:
        indices_to_save = np.arange(horizon)

    # Iterate Over Trajectory #
    for i in indices_to_save:
        # Get HDF5 Data #
        timestep = traj_reader.read_timestep(index=i)

        # If Applicable, Get Recorded Data #
        if read_recording_folderpath:
            timestamp_dict = timestep["observation"]["timestamp"]["cameras"]
            camera_type_dict = {
                k: camera_type_to_string_dict[v] for k, v in timestep["observation"]["camera_type"].items()
            }
            camera_obs = camera_reader.read_cameras(
                index=i, camera_type_dict=camera_type_dict, timestamp_dict=timestamp_dict
            )
            camera_failed = camera_obs is None

            # Add Data To Timestep If Successful #
            if camera_failed:
                break
            timestep["observation"].update(camera_obs)

        # Filter Steps #
        step_skipped = not timestep["observation"]["controller_info"].get("movement_enabled", True)
        delete_skipped_step = step_skipped and remove_skipped_steps

        # Save Filtered Timesteps #
        if delete_skipped_step:
            del timestep
        else:
            timestep_list.append(timestep)

    # Remove Extra Transitions #
    timestep_list = np.array(timestep_list)
    if (num_samples_per_traj is not None) and (len(timestep_list) > num_samples_per_traj):
        ind_to_keep = np.random.choice(len(timestep_list), size=num_samples_per_traj, replace=False)
        timestep_list = timestep_list[ind_to_keep]

    # Close Readers #
    traj_reader.close()

    # Return Data #
    return timestep_list


if __name__ == "__main__":
    tyro.cli(main)
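As a quick sanity check after conversion, a sketch like the following can load the dataset back from the local cache and print its size. This is not part of the script above; it assumes the num_episodes and num_frames properties exposed by recent lerobot releases, so adjust to your installed version.

    from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

    # Load the converted dataset from the local $HF_LEROBOT_HOME cache
    dataset = LeRobotDataset("your_hf_username/my_droid_dataset")
    print(f"episodes: {dataset.num_episodes}, frames: {dataset.num_frames}")
    print(dataset[0]["actions"].shape)  # expect an 8-D action vector per frame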
openpi/examples/droid/main.py
ADDED
@@ -0,0 +1,246 @@
# ruff: noqa

import contextlib
import dataclasses
import datetime
import faulthandler
import os
import signal
import time
from moviepy.editor import ImageSequenceClip
import numpy as np
from openpi_client import image_tools
from openpi_client import websocket_client_policy
import pandas as pd
from PIL import Image
from droid.robot_env import RobotEnv
import tqdm
import tyro

faulthandler.enable()

# DROID data collection frequency -- we slow down execution to match this frequency
DROID_CONTROL_FREQUENCY = 15


@dataclasses.dataclass
class Args:
    # Hardware parameters
    left_camera_id: str = "<your_camera_id>"  # e.g., "24259877"
    right_camera_id: str = "<your_camera_id>"  # e.g., "24514023"
    wrist_camera_id: str = "<your_camera_id>"  # e.g., "13062452"

    # Policy parameters
    external_camera: str | None = (
        None  # which external camera should be fed to the policy, choose from ["left", "right"]
    )

    # Rollout parameters
    max_timesteps: int = 600
    # How many actions to execute from a predicted action chunk before querying the policy server again.
    # 8 is usually a good default (equals 0.5 seconds of action execution).
    open_loop_horizon: int = 8

    # Remote server parameters
    remote_host: str = "0.0.0.0"  # point this to the IP address of the policy server, e.g., "192.168.1.100"
    remote_port: int = (
        8000  # point this to the port of the policy server; the default port for openpi servers is 8000
    )
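For reference, a hypothetical invocation of this script (assuming tyro's default kebab-case flag naming and example camera IDs and server address) would look like: `uv run examples/droid/main.py --external-camera left --remote-host 192.168.1.100 --remote-port 8000`.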

# We use Ctrl+C to optionally terminate rollouts early -- however, if we press Ctrl+C while the policy server is
# waiting for a new action chunk, it raises an exception and the server connection dies.
# This context manager temporarily intercepts Ctrl+C and re-raises it once the protected call has completed.
@contextlib.contextmanager
def prevent_keyboard_interrupt():
    """Temporarily prevent keyboard interrupts by delaying them until after the protected code."""
    interrupted = False
    original_handler = signal.getsignal(signal.SIGINT)

    def handler(signum, frame):
        nonlocal interrupted
        interrupted = True

    signal.signal(signal.SIGINT, handler)
    try:
        yield
    finally:
        signal.signal(signal.SIGINT, original_handler)
        if interrupted:
            raise KeyboardInterrupt
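For illustration only (not part of the original file), a minimal sketch of how the context manager behaves:

    with prevent_keyboard_interrupt():
        time.sleep(3)  # a Ctrl+C pressed during this sleep is recorded, not raised
    # ...the deferred KeyboardInterrupt is raised as soon as the block above exits,
    # so any code placed here never runs if Ctrl+C was pressed inside the block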


def main(args: Args):
    # Make sure an external camera is specified by the user -- we only use one external camera for the policy
    assert (
        args.external_camera is not None and args.external_camera in ["left", "right"]
    ), f"Please specify an external camera to use for the policy, choose from ['left', 'right'], but got {args.external_camera}"

    # Initialize the Panda environment. Using joint velocity action space and gripper position action space is very important.
    env = RobotEnv(action_space="joint_velocity", gripper_action_space="position")
    print("Created the droid env!")

    # Connect to the policy server
    policy_client = websocket_client_policy.WebsocketClientPolicy(args.remote_host, args.remote_port)

    df = pd.DataFrame(columns=["success", "duration", "video_filename"])

    while True:
        instruction = input("Enter instruction: ")

        # Rollout parameters
        actions_from_chunk_completed = 0
        pred_action_chunk = None

        # Prepare to save video of rollout
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H:%M:%S")
        video = []
        bar = tqdm.tqdm(range(args.max_timesteps))
        print("Running rollout... press Ctrl+C to stop early.")
        for t_step in bar:
            start_time = time.time()
            try:
                # Get the current observation
                curr_obs = _extract_observation(
                    args,
                    env.get_observation(),
                    # Save the first observation to disk
                    save_to_disk=t_step == 0,
                )

                video.append(curr_obs[f"{args.external_camera}_image"])

                # Send websocket request to policy server if it's time to predict a new chunk
                if actions_from_chunk_completed == 0 or actions_from_chunk_completed >= args.open_loop_horizon:
                    actions_from_chunk_completed = 0

                    # We resize images on the robot laptop to minimize the amount of data sent to the policy server
                    # and improve latency.
                    request_data = {
                        "observation/exterior_image_1_left": image_tools.resize_with_pad(
                            curr_obs[f"{args.external_camera}_image"], 224, 224
                        ),
                        "observation/wrist_image_left": image_tools.resize_with_pad(curr_obs["wrist_image"], 224, 224),
                        "observation/joint_position": curr_obs["joint_position"],
                        "observation/gripper_position": curr_obs["gripper_position"],
                        "prompt": instruction,
                    }

                    # Wrap the server call in a context manager to prevent Ctrl+C from interrupting it;
                    # Ctrl+C will be handled after the server call is complete
                    with prevent_keyboard_interrupt():
                        # This returns an action chunk [10, 8] of 10 joint velocity actions (7D) + gripper position (1D)
                        pred_action_chunk = policy_client.infer(request_data)["actions"]
                    assert pred_action_chunk.shape == (10, 8)

                # Select current action to execute from chunk
                action = pred_action_chunk[actions_from_chunk_completed]
                actions_from_chunk_completed += 1

                # Binarize gripper action
                if action[-1].item() > 0.5:
                    action = np.concatenate([action[:-1], np.ones((1,))])
                else:
                    action = np.concatenate([action[:-1], np.zeros((1,))])

                # Clip all dimensions of the action to [-1, 1]
                action = np.clip(action, -1, 1)

                env.step(action)

                # Sleep to match DROID data collection frequency
                elapsed_time = time.time() - start_time
                if elapsed_time < 1 / DROID_CONTROL_FREQUENCY:
                    time.sleep(1 / DROID_CONTROL_FREQUENCY - elapsed_time)
            except KeyboardInterrupt:
                break

        video = np.stack(video)
        save_filename = "video_" + timestamp
        ImageSequenceClip(list(video), fps=10).write_videofile(save_filename + ".mp4", codec="libx264")

        success: str | float | None = None
        while not isinstance(success, float):
            success = input(
                "Did the rollout succeed? (enter y for 100%, n for 0%, or a numeric value 0-100 based on the evaluation spec) "
            )
            if success == "y":
                success = 1.0
            elif success == "n":
                success = 0.0
            else:
                # Only numeric entries are rescaled; invalid or out-of-range input re-prompts
                try:
                    success = float(success) / 100
                except ValueError:
                    print(f"Input must be y, n, or a number in [0, 100] but got: {success}")
                    success = None
                    continue
                if not (0 <= success <= 1):
                    print(f"Success must be a number in [0, 100] but got: {success * 100}")
                    success = None

        # DataFrame.append was removed in pandas 2.0, so append the row with pd.concat instead
        df = pd.concat(
            [
                df,
                pd.DataFrame(
                    [
                        {
                            "success": success,
                            "duration": t_step,
                            "video_filename": save_filename,
                        }
                    ]
                ),
            ],
            ignore_index=True,
        )

        if input("Do one more eval? (enter y or n) ").lower() != "y":
            break
        env.reset()

    os.makedirs("results", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%I:%M%p_%B_%d_%Y")
    csv_filename = os.path.join("results", f"eval_{timestamp}.csv")
    df.to_csv(csv_filename)
    print(f"Results saved to {csv_filename}")


def _extract_observation(args: Args, obs_dict, *, save_to_disk=False):
    image_observations = obs_dict["image"]
    left_image, right_image, wrist_image = None, None, None
    for key in image_observations:
        # Note: the "left" below refers to the left camera in the stereo pair.
        # The model is only trained on left stereo cams, so we only feed those.
        if args.left_camera_id in key and "left" in key:
            left_image = image_observations[key]
        elif args.right_camera_id in key and "left" in key:
            right_image = image_observations[key]
        elif args.wrist_camera_id in key and "left" in key:
            wrist_image = image_observations[key]

    # Drop the alpha dimension
    left_image = left_image[..., :3]
    right_image = right_image[..., :3]
    wrist_image = wrist_image[..., :3]

    # Convert BGR to RGB
    left_image = left_image[..., ::-1]
    right_image = right_image[..., ::-1]
    wrist_image = wrist_image[..., ::-1]

    # In addition to image observations, also capture the proprioceptive state
    robot_state = obs_dict["robot_state"]
    cartesian_position = np.array(robot_state["cartesian_position"])
    joint_position = np.array(robot_state["joint_positions"])
    gripper_position = np.array([robot_state["gripper_position"]])

    # Save the images to disk so that they can be viewed live while the robot is running.
    # Create one combined image to make live viewing easy.
    if save_to_disk:
        combined_image = np.concatenate([left_image, wrist_image, right_image], axis=1)
        combined_image = Image.fromarray(combined_image)
        combined_image.save("robot_camera_views.png")

    return {
        "left_image": left_image,
        "right_image": right_image,
        "wrist_image": wrist_image,
        "cartesian_position": cartesian_position,
        "joint_position": joint_position,
        "gripper_position": gripper_position,
    }


if __name__ == "__main__":
    args: Args = tyro.cli(Args)
    main(args)
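Once a few evaluation sessions have been run, the CSVs written above can be summarized with a short pandas snippet. This sketch is not part of the original script and assumes that results/eval_*.csv files exist in the working directory.

    import glob

    import pandas as pd

    # Aggregate all eval CSVs written by the script above and report the mean success rate
    frames = [pd.read_csv(p) for p in glob.glob("results/eval_*.csv")]
    results = pd.concat(frames, ignore_index=True)
    print(f"{len(results)} rollouts, mean success: {results['success'].mean():.2%}")
    print(f"mean rollout length (steps): {results['duration'].mean():.1f}")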