yqi19 committed about 1 month ago

Commit

b60a439

verified ·

1 Parent(s): b184199

add: source files (batch 1)

Browse files

Files changed (50) hide show

.github/ISSUE_TEMPLATE/bug_report.yml +46 -0
.github/ISSUE_TEMPLATE/documentation.yml +21 -0
.github/ISSUE_TEMPLATE/feature_request.yml +26 -0
.github/actions/setup-venv/action.yml +55 -0
.github/pull_request_template.md +17 -0
.github/workflows/main.yml +68 -0
AGENTS.md +80 -0
ATTRIBUTIONS.md +0 -0
CLAUDE.md +80 -0
CONTRIBUTING.md +7 -0
FAQ.md +81 -0
README.md +555 -0
examples/DROID/README.md +110 -0
examples/DROID/main_gr00t.py +469 -0
examples/DROID/server_client.py +365 -0
examples/DROID/utils.py +81 -0
examples/LIBERO/README.md +196 -0
examples/LIBERO/modality.json +75 -0
examples/SO100/README.md +87 -0
examples/SO100/modality.json +35 -0
examples/SO100/so100_config.py +70 -0
examples/SimplerEnv/README.md +141 -0
examples/SimplerEnv/bridge_modality.json +77 -0
examples/SimplerEnv/convert_av1_to_h264.py +129 -0
examples/SimplerEnv/fractal_modality.json +77 -0
examples/finetune.sh +158 -0
examples/mask-guided-background-suppression/README.md +203 -0
examples/mask-guided-background-suppression/so101_config.py +62 -0
examples/mask-guided-background-suppression/test_extra_augmentation.py +198 -0
getting_started/data_config.md +331 -0
getting_started/data_preparation.md +164 -0
getting_started/finetune_new_embodiment.md +153 -0
getting_started/hardware_recommendation.md +95 -0
getting_started/policy.md +574 -0
getting_started/real_world_deployment.md +459 -0
gr00t/__init__.py +129 -0
gr00t/configs/__init__.py +14 -0
gr00t/configs/base_config.py +150 -0
gr00t/configs/data/__init__.py +14 -0
gr00t/configs/data/data_config.py +95 -0
gr00t/configs/data/embodiment_configs.py +208 -0
gr00t/configs/deepspeed/zero2_config.json +33 -0
gr00t/configs/deepspeed/zero3_config.json +31 -0
gr00t/configs/finetune_config.py +163 -0
gr00t/configs/model/__init__.py +52 -0
gr00t/configs/model/gr00t_n1d7.py +179 -0
gr00t/configs/training/__init__.py +14 -0
gr00t/configs/training/training_config.py +127 -0
gr00t/data/__init__.py +14 -0
gr00t/data/collator/__init__.py +16 -0

.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

	@@ -0,0 +1,46 @@

+name: 🐛 Bug Report
+description: Create a report to help us reproduce and fix the bug
+labels: 'bug'
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/NVIDIA/Isaac-GR00T/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: 🐛 Describe the bug
+    description: |
+      Please provide a clear and concise description of what the bug is.
+      If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
+      ```python
+      # All necessary imports at the beginning
+      import gr00t
+      # A succinct reproducing example trimmed down to the essential parts:
+      assert False is True, "Oh no!"
+      ```
+      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
+      Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message, including the **full** traceback of the exception. It may be helpful to wrap error messages in ```` ```triple backtick blocks``` ````.
+    placeholder: |
+      A clear and concise description of what the bug is.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Versions
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      python --version && pip freeze
+      ```
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!

.github/ISSUE_TEMPLATE/documentation.yml ADDED Viewed

	@@ -0,0 +1,21 @@

+name: 📚 Documentation
+description: Report an issue related to https://github.com/NVIDIA/Isaac-GR00T
+labels: 'documentation'
+body:
+- type: textarea
+  attributes:
+    label: 📚 The doc issue
+    description: >
+      A clear and concise description of what content in https://github.com/NVIDIA/Isaac-GR00T is an issue.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Suggest a potential alternative/fix
+    description: >
+      Tell us how we could improve the documentation in this regard.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!

.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+name: 🚀 Feature request
+description: Submit a proposal/request for a new feature
+labels: 'feature request'
+body:
+- type: textarea
+  attributes:
+    label: 🚀 The feature, motivation and pitch
+    description: >
+      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Alternatives
+    description: >
+      A description of any alternative solutions or features you've considered, if any.
+- type: textarea
+  attributes:
+    label: Additional context
+    description: >
+      Add any other context or screenshots about the feature request.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!

.github/actions/setup-venv/action.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+name: Python virtualenv
+description: Set up a Python virtual environment with caching
+inputs:
+  python-version:
+    description: The Python version to use
+    required: true
+  cache-prefix:
+    description: Update this to invalidate the cache
+    required: true
+    default: v4
+runs:
+  using: composite
+  steps:
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ inputs.python-version }}
+    - shell: bash
+      run: |
+        # Install prerequisites.
+        pip install --upgrade pip setuptools wheel virtualenv
+    - shell: bash
+      run: |
+        # Get the exact Python version to use in the cache key.
+        echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
+    - uses: actions/cache@v4
+      id: virtualenv-cache
+      with:
+        path: .venv
+        key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('pyproject.toml') }}
+    - if: steps.virtualenv-cache.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        # Set up virtual environment without cache hit.
+        test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
+        . .venv/bin/activate
+        pip install ruff
+    - if: steps.virtualenv-cache.outputs.cache-hit == 'true'
+      shell: bash
+      run: |
+        # Set up virtual environment from cache hit.
+        . .venv/bin/activate
+    - shell: bash
+      run: |
+        # Show environment info.
+        . .venv/bin/activate
+        echo "✓ Installed $(python --version) virtual environment to $(which python)"
+        echo "Packages:"
+        pip freeze

.github/pull_request_template.md ADDED Viewed

	@@ -0,0 +1,17 @@

+<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
+<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
+Fixes #
+Changes proposed in this pull request:
+<!-- Please list all changes/additions here. -->
+-
+## Before submitting
+<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
+- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/NVIDIA/Isaac-GR00T/blob/main/CONTRIBUTING.md#making-a-pull-request)
+    section of the `CONTRIBUTING` docs.
+- [ ] I've updated or added any relevant docstrings.
+- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
+- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.

.github/workflows/main.yml ADDED Viewed

	@@ -0,0 +1,68 @@

+name: Main
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+    tags:
+      - "v*.*.*"
+env:
+  # Change this to invalidate existing cache.
+  CACHE_PREFIX: v4
+  PYTHONPATH: ./
+jobs:
+  checks:
+    name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
+    runs-on: [ubuntu-latest]
+    timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - python: "3.10"
+            task:
+              name: Lint
+              run: |
+                ruff check .
+                ruff format --check .
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          lfs: true
+      - name: Pull LFS objects
+        run: git lfs pull
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-venv
+        with:
+          python-version: ${{ matrix.python }}
+          cache-prefix: ${{ env.CACHE_PREFIX }}
+      - name: ${{ matrix.task.name }}
+        run: |
+          . .venv/bin/activate
+          ${{ matrix.task.run }}
+      - name: Upload package distribution files
+        if: matrix.task.name == 'Build'
+        uses: actions/upload-artifact@v4
+        with:
+          name: package
+          path: dist
+      - name: Clean up
+        if: always()
+        run: |
+          . .venv/bin/activate
+          pip uninstall -y gr00t

AGENTS.md ADDED Viewed

	@@ -0,0 +1,80 @@

+# AGENTS.md — Isaac GR00T N1.7
+## Project overview
+Isaac GR00T N1.7 is an open vision-language-action (VLA) model for generalized humanoid robot skills.
+The repo contains the model, training pipeline, evaluation harness, and deployment tooling.
+- **Language:** Python 3.10 (dGPU, Orin); Python 3.12 (Thor, DGX Spark — see deployment dir)
+- **Package manager:** [uv](https://docs.astral.sh/uv/)
+- **Build system:** setuptools (see `pyproject.toml`)
+- **CI:** internal GitLab CI (`.gitlab-ci.yml` + includes under `ci/`, not shipped to the public GitHub EA repo); public GitHub Actions (`.github/workflows/`)
+## Quick-start commands
+```bash
+# Install (dev mode with all extras)
+uv sync --all-extras
+# Lint and format (uses ruff via pre-commit)
+pre-commit run --all-files
+# Run CPU tests
+python -m pytest tests/ -m "not gpu" -v --timeout=300
+# Run GPU tests
+python -m pytest tests/ -m gpu -v --timeout=300
+# Build package
+uv build
+# Validate lockfile
+uv lock --locked
+```
+## Code style
+- Formatter: `ruff format` (double quotes, spaces, line-length 100)
+- Linter: `ruff check` with rules E, F, I (ignores E501)
+- Config lives in `pyproject.toml` under `[tool.ruff]`
+- Run `pre-commit run --all-files` before committing
+## Directory layout
+```
+gr00t/              # Main package
+  configs/          #   Training, data, and model configs
+  data/             #   Data loading, embodiment tags, dataset processing
+  eval/             #   Evaluation (run_gr00t_server.py)
+  experiment/       #   Training pipeline (launch_finetune.py, trainer.py)
+  model/            #   Model architecture (N1.7, base, modules)
+  policy/           #   Policy inference (Gr00tPolicy, server/client)
+examples/           # Per-embodiment example configs and READMEs
+scripts/            # Deployment, conversion, and utility scripts
+  deployment/       #   Platform install scripts (dgpu, orin, thor, spark)
+tests/              # pytest suite (markers: gpu, not gpu)
+getting_started/    # User-facing guides and notebooks
+```
+## Key entry points
+- **Fine-tune:** `bash examples/finetune.sh --base-model-path <path> --dataset-path <path> --embodiment-tag <tag> --output-dir <dir>`
+- **Inference server:** `python gr00t/eval/run_gr00t_server.py --model-path <path> --embodiment-tag <tag>`
+- **ONNX export:** `python scripts/deployment/export_onnx_n1d7.py`
+- **TensorRT build:** `python scripts/deployment/build_trt_pipeline.py`
+- **Benchmark:** `python scripts/deployment/benchmark_inference.py`
+## Testing
+- Test markers: `gpu` (requires GPU), default is CPU-safe
+- Fixtures live in `tests/fixtures/` and `demo_data/`
+- CI runs CPU and GPU tests in separate jobs with 300s timeout
+## Deployment platforms
+- **dGPU (H100, A100, RTX):** CUDA 12.8 — install via `scripts/deployment/dgpu/install_deps.sh`, container via top-level `docker/Dockerfile` (supports x86_64 and aarch64)
+- **Jetson Orin:** CUDA 12.6 — install via `scripts/deployment/orin/install_deps.sh`, container via `scripts/deployment/orin/Dockerfile`
+- **Jetson Thor:** CUDA 13.0 — install via `scripts/deployment/thor/install_deps.sh`, container via `scripts/deployment/thor/Dockerfile`
+- **DGX Spark:** CUDA 13.0 — install via `scripts/deployment/spark/install_deps.sh`, container via `scripts/deployment/spark/Dockerfile`
+Each Jetson/Spark platform ships an `activate_*.sh` helper (`scripts/activate_orin.sh`, `scripts/activate_spark.sh`, `scripts/activate_thor.sh`) that exports platform-specific library paths. For dGPU, the standard `source .venv/bin/activate` is sufficient.

ATTRIBUTIONS.md ADDED Viewed

The diff for this file is too large to render. See raw diff

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,80 @@

+# CLAUDE.md — Isaac GR00T N1.7
+## Project overview
+Isaac GR00T N1.7 is an open vision-language-action (VLA) model for generalized humanoid robot skills.
+The repo contains the model, training pipeline, evaluation harness, and deployment tooling.
+- **Language:** Python 3.10 (dGPU, Orin); Python 3.12 (Thor, DGX Spark — see deployment dir)
+- **Package manager:** [uv](https://docs.astral.sh/uv/)
+- **Build system:** setuptools (see `pyproject.toml`)
+- **CI:** internal GitLab CI (`.gitlab-ci.yml` + includes under `ci/`, not shipped to the public GitHub EA repo); public GitHub Actions (`.github/workflows/`)
+## Quick-start commands
+```bash
+# Install (dev mode with all extras)
+uv sync --all-extras
+# Lint and format (uses ruff via pre-commit)
+pre-commit run --all-files
+# Run CPU tests
+python -m pytest tests/ -m "not gpu" -v --timeout=300
+# Run GPU tests
+python -m pytest tests/ -m gpu -v --timeout=300
+# Build package
+uv build
+# Validate lockfile
+uv lock --locked
+```
+## Code style
+- Formatter: `ruff format` (double quotes, spaces, line-length 100)
+- Linter: `ruff check` with rules E, F, I (ignores E501)
+- Config lives in `pyproject.toml` under `[tool.ruff]`
+- Run `pre-commit run --all-files` before committing
+## Directory layout
+```
+gr00t/              # Main package
+  configs/          #   Training, data, and model configs
+  data/             #   Data loading, embodiment tags, dataset processing
+  eval/             #   Evaluation (run_gr00t_server.py)
+  experiment/       #   Training pipeline (launch_finetune.py, trainer.py)
+  model/            #   Model architecture (N1.7, base, modules)
+  policy/           #   Policy inference (Gr00tPolicy, server/client)
+examples/           # Per-embodiment example configs and READMEs
+scripts/            # Deployment, conversion, and utility scripts
+  deployment/       #   Platform install scripts (dgpu, orin, thor, spark)
+tests/              # pytest suite (markers: gpu, not gpu)
+getting_started/    # User-facing guides and notebooks
+```
+## Key entry points
+- **Fine-tune:** `bash examples/finetune.sh --base-model-path <path> --dataset-path <path> --embodiment-tag <tag> --output-dir <dir>`
+- **Inference server:** `python gr00t/eval/run_gr00t_server.py --model-path <path> --embodiment-tag <tag>`
+- **ONNX export:** `python scripts/deployment/export_onnx_n1d7.py`
+- **TensorRT build:** `python scripts/deployment/build_trt_pipeline.py`
+- **Benchmark:** `python scripts/deployment/benchmark_inference.py`
+## Testing
+- Test markers: `gpu` (requires GPU), default is CPU-safe
+- Fixtures live in `tests/fixtures/` and `demo_data/`
+- CI runs CPU and GPU tests in separate jobs with 300s timeout
+## Deployment platforms
+- **dGPU (H100, A100, RTX):** CUDA 12.8 — install via `scripts/deployment/dgpu/install_deps.sh`, container via top-level `docker/Dockerfile` (supports x86_64 and aarch64)
+- **Jetson Orin:** CUDA 12.6 — install via `scripts/deployment/orin/install_deps.sh`, container via `scripts/deployment/orin/Dockerfile`
+- **Jetson Thor:** CUDA 13.0 — install via `scripts/deployment/thor/install_deps.sh`, container via `scripts/deployment/thor/Dockerfile`
+- **DGX Spark:** CUDA 13.0 — install via `scripts/deployment/spark/install_deps.sh`, container via `scripts/deployment/spark/Dockerfile`
+Each Jetson/Spark platform ships an `activate_*.sh` helper (`scripts/activate_orin.sh`, `scripts/activate_spark.sh`, `scripts/activate_thor.sh`) that exports platform-specific library paths. For dGPU, the standard `source .venv/bin/activate` is sufficient.

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# Contributions
+During Early Access we are not accepting pull requests while the codebase stabilizes. If you encounter issues or have suggestions, please open an [Issue](https://github.com/NVIDIA/Isaac-GR00T/issues) in this repository.
+## Support
+Support during Early Access is best-effort. We will continue iterating toward a more stable General Availability (GA) release.

FAQ.md ADDED Viewed

	@@ -0,0 +1,81 @@

+# GR00T N1.7 FAQ
+## Infrastructure & Hardware
+### Is the data loader GPU-accelerated?
+No, the current data loader is CPU-based. However, it has been heavily optimized for multimodal data to ensure it does not become a training bottleneck. We validated this on various configurations, including GB200, H100, and local desktops with RTX 4090 GPUs. We are actively exploring GPU-accelerated approaches for future releases.
+### Is the same data loader used for both pre-training and post-training?
+Yes, the data loading pipeline is unified across both training stages.
+### What is the role of the Policy Remote Server in the deployment diagram?
+The Policy Remote Server decouples inference from the physical robot. This allows users to run the policy on a high-compute cluster (e.g., H100s) for faster inference while the robot operates in a separate environment. It separates dependencies and enables scaling beyond the robot's onboard compute.
+## Workflow & Architecture
+### Why retain only specific LLM layers (e.g., 16 layers) during fine-tuning?
+This configuration was empirically tuned for the backbone (e.g., Eagle/Cosmos-Reason). Research suggests early layers capture grammatical structure, while middle-to-late layers are highly expressive. However, the very last layers are often over-optimized for next-token prediction; pruning or freezing them can sometimes yield better representations for vision-language-action alignment.
+### How do you verify if the language model is successfully aligned with the action space?
+We evaluate this end-to-end via downstream task success. We design evaluation tasks that are ambiguous without language instructions (e.g., "pick the pear" from a bowl of mixed fruit). If the robot succeeds, it confirms the model is correctly grounding language commands into physical actions.
+## Data Strategy & Volume
+### How much data is required for post-training on a new embodiment or task?
+Data requirements depend heavily on task complexity and scene variation. Typical guidelines include:
+- **Simple, fixed-location tasks (Pick & Place):** ~100 trajectories.
+- **Complex scenes or multi-step tasks:** ~500+ trajectories.
+- **High-DoF humanoid tasks:** ~2,000+ trajectories (e.g., shelf-picking with G1).
+- **Fine manipulation:** ~100–500 episodes, ideally with human motion pre-training.
+### What is the recommended strategy for improving success rates on hard tasks?
+We recommend an iterative approach: start with ~100 teleoperated demonstrations, train a policy, and then use HG-DAgger (Human-Gated Dataset Aggregation). Run the policy, intervene when it fails, and add the corrections from those trajectories to the dataset. This helps the model cover out-of-distribution states that pure behavior cloning (BC) might miss, and recover from partial failure states (e.g., a grip slipping or imprecise item placement).
+### Does including real-robot data from other embodiments help if I only care about one robot?
+Yes. Even if cross-embodiment generalization is not your goal, including diverse real-robot data adds visual diversity and robustness to the VLA's backbone, improving performance on your specific target robot.
+### Does GR00T N1.7 support synthetic data generation via Cosmos?
+While research models (like DreamGen) show promise, a robust, product-ready pipeline for generating synthetic training data via Cosmos is currently in development and not yet part of the standard release.
+## Model Capabilities
+### Can the model handle lighting changes or different object colors?
+VLMs can struggle with drastic appearance changes (e.g., hard shadows or significant hue shifts). While we haven't released specific lighting ablations, we strongly recommend using color jitter augmentation during training and collecting diverse data (20–50 episodes) under different lighting conditions to prevent overfitting.
+### Can GR00T models perform reasoning or Visual Question Answering (VQA)?
+The GR00T N1.x series is optimized specifically for action generation, not open-ended reasoning or VQA. Capabilities requiring complex semantic reasoning are targeted for future N2 releases.
+### Can the model learn "retry" behaviors?
+The current architecture is stateless and does not inherently "know" if a previous attempt failed. While some retry behavior may emerge from high-quality data, explicit recovery strategies are best achieved through DAgger (collecting data on recovery from failure) or Reinforcement Learning (RL), rather than pure Imitation Learning.
+### Does the model distinguish between left and right arms in bimanual tasks?
+Yes, provided the training data is distinct or annotated (e.g., instructions specifying "left arm" vs. "right arm"). If the dataset contains mixed, unannotated data where both arms perform identical tasks indiscriminately, the model may struggle to distinguish them.
+### Is there a zero-shot cross-embodiment VLA model?
+No. While cross-embodiment data improves generalization, a true "zero-shot" model (one that works perfectly on a new robot without *any* fine-tuning) does not currently exist in the open VLA landscape.
+### Will differences in object shape between training and deployment cause the success rate to drop?
+It depends on the degree of deviation. If the target object's shape differs drastically from the training data, performance will likely drop significantly. However, if the shape variation is minor and shares a similar grasping affordance (e.g., a slightly different bottle shape that is still grasped from the side), the model may still succeed, though with potentially lower reliability than on the original objects.
+### Has the impact of large viewpoint changes (e.g., head movement) on task difficulty been studied?
+Yes. Large viewpoint changes effectively change the observation distribution, which can complicate simple tasks. For example, a "simple" handover becomes complex if the robot's head moves significantly, altering the camera's perspective of its own hands.
+- **Current Status:** Most public GR00T demos feature a relatively fixed head position to stabilize observations.
+- **Mitigation:** To handle natural head movement, we recommend training with aggressive camera pose augmentation or collecting data that explicitly includes head motion to ensure the policy becomes robust to viewpoint shifts.

README.md ADDED Viewed

	@@ -0,0 +1,555 @@

+<div align="center">
+  <img src="media/header_compress.png" width="800" alt="NVIDIA Isaac GR00T N1.7 Header">
+  <!-- --- -->
+  <p style="font-size: 1.2em;">
+    <a href="https://developer.nvidia.com/isaac/gr00t"><strong>Website</strong></a> |
+    <a href="https://huggingface.co/collections/nvidia/gr00t-n17"><strong>Model</strong></a> |
+    <a href="https://huggingface.co/collections/nvidia/physical-ai"><strong>Dataset</strong></a> |
+    <a href="https://arxiv.org/abs/2503.14734"><strong>Paper</strong></a> |
+    <a href="https://developer.nvidia.com/isaac"><strong>NVIDIA Isaac</strong></a> |
+    <a href="FAQ.md"><strong>FAQ</strong></a>
+  </p>
+</div>
+## Table of Contents
+- [NVIDIA Isaac GR00T](#nvidia-isaac-gr00t)
+- [What's New in GR00T N1.7](#whats-new-in-gr00t-n17)
+- [Installation](#installation)
+- [Model Checkpoints & Embodiment Tags](#model-checkpoints--embodiment-tags)
+- [Data Format](#data-format)
+- [Inference](#inference)
+- [Fine-tuning](#fine-tuning)
+- [Evaluation](#evaluation)
+- [Contributions](#contributions)
+- [License](#license)
+- [Citation](#citation)
+---
+## NVIDIA Isaac GR00T
+<table style="width:100%; table-layout:fixed;">
+  <tr>
+    <td style="width:33.33%; text-align:center;">
+      <img src="media/unitree_g1.gif" style="max-width:100%; height:auto;">
+    </td>
+    <td style="width:33.33%; text-align:center;">
+      <img src="media/agibot_g1.gif" style="max-width:100%; height:auto;">
+    </td>
+    <td style="width:33.33%; text-align:center;">
+      <img src="media/yam.gif" style="max-width:100%; height:auto;">
+    </td>
+  </tr>
+</table>
+> We just released GR00T N1.7 Early Access, the latest version of GR00T N1 with a new VLM backbone (Cosmos-Reason2-2B / Qwen3-VL) and improved performance.
+> **This is an Early Access (EA) release.** You are welcome to download the model, explore the codebase, and begin building on the stack, with the understanding that support and stability guarantees are limited until the GA release.
+>
+> **What's available:**
+> - Pre-trained GR00T N1.7 model weights and reference code
+> - Fine-tuning and inference with custom robot data or demonstrations
+> - Experimentation, prototyping, and research use cases
+>
+> **Available at GA:**
+> - Production deployment with commercial support
+> - Complete benchmarks and a fully validated, stable feature set
+> - Pull request contributions
+>
+> We welcome feedback - please feel free to raise issues in this repository.
+> To use older versions: [N1.6](https://github.com/NVIDIA/Isaac-GR00T/releases/tag/n1.6-release) | [N1.5](https://github.com/NVIDIA/Isaac-GR00T/tree/n1.5-release)
+NVIDIA Isaac GR00T N1.7 is an open vision-language-action (VLA) model for generalized humanoid robot skills. This cross-embodiment model takes multimodal input, including language and images, to perform manipulation tasks in diverse environments.
+GR00T N1.7 is trained on a diverse mixture of robot data, including bimanual, semi-humanoid, and expansive humanoid datasets. It is adaptable through post-training for specific embodiments, tasks, and environments.
+GR00T N1.7 is fully commercially licensable under Apache 2.0. It delivers performance comparable to N1.6, with improved generalization and language-following capabilities driven by the inclusion of 20K hours of EgoScale human video data in pretraining.
+The neural network architecture of GR00T N1.7 is a combination of vision-language foundation model and diffusion transformer head that denoises continuous actions. Here is a schematic diagram of the architecture:
+<div align="center">
+<img src="media/model-architecture.png" width="800" alt="model-architecture">
+</div>
+### Workflow Overview
+1. **Prepare data** — Collect robot demonstrations (video, state, action) and convert them to the [GR00T LeRobot format](#data-format). Demo datasets are included for quick testing.
+2. **Run inference** — Try zero-shot inference with the base model on [pretrain embodiments](#embodiment-tags), or use a [finetuned checkpoint](#checkpoints) for benchmark tasks.
+3. **Fine-tune** — Adapt the model to your robot using [`launch_finetune.py`](#fine-tuning) with your own data and modality config.
+4. **Evaluate** — Validate with [open-loop evaluation](#open-loop-evaluation), then test in [simulation benchmarks](#benchmark-examples) or on real hardware via the [Policy API](getting_started/policy.md).
+5. **Deploy** — Connect `Gr00tPolicy` to your robot controller, optionally accelerated with [TensorRT](scripts/deployment/README.md).
+## What's New in GR00T N1.7
+GR00T N1.7 builds on N1.6 with a new VLM backbone and code-level improvements.
+1. **Relative EEF Action Space** — N1.7 adopts a relative end-effector action space shared across robot and human embodiments. Representing actions as deltas from the current pose (rather than absolute targets) improves generalization and is a key factor in the model's cross-embodiment performance. See [`getting_started/finetune_new_embodiment.md`](getting_started/finetune_new_embodiment.md) for guidance on configuring relative EEF for your own robot.
+2. **Human Video Pretraining** — N1.7 is pretrained on 20K hours of EgoScale human video data alongside diverse robot demonstrations. Because the relative EEF action representation is consistent across both human and robot data, the model can transfer manipulation priors learned from human video directly to robot control.
+### Key Changes from N1.6
+- **New VLM backbone:** Cosmos-Reason2-2B (Qwen3-VL architecture), replacing the Eagle backbone used in N1.6. Supports flexible resolution and encodes images in their native aspect ratio without padding.
+- Simplified data processing pipeline (`processing_gr00t_n1d7.py`).
+- Added full pipeline export to ONNX and TensorRT with improved inference frequency.
+---
+## Installation
+### Hardware Requirements
+**Inference:** 1 GPU with 16 GB+ VRAM (e.g., RTX 4090, L40, H100, Jetson AGX Thor/Orin, DGX Spark).
+**Fine-tuning:** 1 or more GPUs with 40 GB+ VRAM recommended. We recommend H100 or L40 nodes for optimal performance. Other hardware (e.g., A6000) works but may require longer training time. See the [Hardware Recommendation Guide](getting_started/hardware_recommendation.md) for detailed specs.
+**CUDA / Python per platform:** dGPU on CUDA 12.8 with Python 3.10; Jetson Orin on CUDA 12.6 with Python 3.10; Jetson Thor and DGX Spark on CUDA 13.0 with Python 3.12. The per-platform install scripts and Dockerfiles live under `scripts/deployment/`; see the [Deployment & Inference Guide](scripts/deployment/README.md) for the full matrix.
+### Clone the Repository
+GR00T relies on submodules for certain dependencies. Include them when cloning:
+**Note:** `git-lfs` is **required** to download parquet data files in `/demo_data`. Install it before cloning: `sudo apt install git-lfs && git lfs install`.
+```sh
+git clone --recurse-submodules https://github.com/NVIDIA/Isaac-GR00T
+cd Isaac-GR00T
+```
+If you've already cloned without submodules, initialize them separately:
+```sh
+git submodule update --init --recursive
+```
+### Set Up the Environment
+GR00T uses [uv](https://github.com/astral-sh/uv) for fast, reproducible dependency management. Install uv first:
+```sh
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+#### dGPU (x86_64) — Default
+Install FFmpeg (required by `torchcodec`, the default video backend):
+```sh
+sudo apt-get update && sudo apt-get install -y ffmpeg
+```
+Create the environment and install GR00T:
+```sh
+uv sync --python 3.10
+```
+GPU dependencies (flash-attn, TensorRT, etc.) are included in the default install.
+Verify the installation:
+```sh
+uv run python -c "import gr00t; print('GR00T installed successfully')"
+```
+> **`flash-attn` message on every `uv run`:** You may see `Installing flash-attn...` each time you run `uv run`. This is a known `uv` behavior with URL-pinned wheel sources — `uv` re-validates the cached wheel against the source URL on each invocation. It is **not** rebuilding from source; the wheel is already cached locally and the operation takes 2-3 seconds. This only affects x86_64 platforms.
+> To suppress it, remove the `flash-attn` entries under `[tool.uv.sources]` in your local `pyproject.toml` after the initial install. Be aware that doing so breaks `uv lock` and causes flash-attn to be built from source the next time the lock file is regenerated.
+<details>
+<summary><strong>Alternative: pip install (without uv)</strong></summary>
+If you prefer pip/conda over uv, create a Python 3.10 virtualenv and install:
+```sh
+python3.10 -m venv .venv && source .venv/bin/activate
+pip install -e .
+```
+Note: GPU dependencies (flash-attn, TensorRT) may require manual installation with pip. The `uv` workflow handles these automatically.
+</details>
+> **If fine-tuning fails with `CUDA_HOME is unset`:** Run `bash scripts/deployment/dgpu/install_deps.sh` once to configure CUDA paths, or manually `export CUDA_HOME=/usr/local/cuda`.
+> **CUDA 13.x Users (Thor, Spark, and other CUDA 13+ platforms):** PyTorch 2.7 pins Triton to 3.3.1, which does not recognize CUDA major version 13+. This causes a `RuntimeError` in Triton's `ptx_get_version()`. Run the patch script to fix:
+> ```sh
+> uv run bash scripts/patch_triton_cuda13.sh
+> ```
+> **GB300 (sm_103) Users:** Triton 3.3.1 (pinned by PyTorch 2.7) does not support the GB300 GPU architecture (sm_103). `torch.compile` will fail on GB300. Use PyTorch eager mode or TensorRT inference instead. Triton 3.5.1+ adds sm_103 support but is not yet compatible with the pinned PyTorch version.
+> **aarch64 Video Backend:** On aarch64 platforms (Thor, Orin, Spark), `torchcodec` is the required video backend. `install_deps.sh` prefers the prebuilt aarch64 wheel under `scripts/deployment/dgpu/wheels/` (shared by Thor/Spark against FFmpeg 6; Orin uses a matching build against FFmpeg 4) and falls back to a source build only if the wheel is missing. If you encounter `NotImplementedError` from the video backend, ensure `torchcodec` was installed successfully during setup. Other backends (decord, pyav) are not supported on aarch64.
+<details>
+<summary><strong>DGX Spark</strong> (tested with DGX Spark GB10)</summary>
+```bash
+bash scripts/deployment/spark/install_deps.sh
+source .venv/bin/activate
+source scripts/activate_spark.sh
+```
+See the [Spark setup guide](scripts/deployment/README.md#dgx-spark-setup) for Docker and bare metal details.
+</details>
+<details>
+<summary><strong>Jetson AGX Thor</strong> (tested with JetPack 7.1)</summary>
+> **flash-attn on older systems (e.g., Ubuntu 20.04 with glibc < 2.35):** The pre-built `flash-attn` wheel may fail with `ImportError: glibc_compat.so: cannot open shared object file`. To fix this, build from source:
+> ```sh
+> uv pip install flash-attn==2.7.4.post1 --no-binary flash-attn --no-cache
+> ```
+> This compiles locally (~10-30 minutes) and avoids the glibc compatibility issue.
+```bash
+bash scripts/deployment/thor/install_deps.sh
+source .venv/bin/activate
+source scripts/activate_thor.sh
+```
+See the [Thor setup guide](scripts/deployment/README.md#jetson-thor-setup) for Docker and bare metal details.
+</details>
+<details>
+<summary><strong>Jetson Orin</strong> (tested with JetPack 6.2)</summary>
+```bash
+bash scripts/deployment/orin/install_deps.sh
+source .venv/bin/activate
+source scripts/activate_orin.sh
+```
+See the [Orin setup guide](scripts/deployment/README.md#jetson-orin-setup) for Docker and bare metal details.
+</details>
+For a containerized setup that avoids system-level dependency conflicts, see our [Docker Setup Guide](docker/README.md).
+---
+## Model Checkpoints & Embodiment Tags
+### Checkpoints
+| Checkpoint | Type | Embodiment Tag | Description |
+|------------|------|---------------|-------------|
+| [`nvidia/GR00T-N1.7-3B`](https://huggingface.co/nvidia/GR00T-N1.7-3B) | Base | See [pretrain tags](getting_started/policy.md#--embodiment-tag) | Base model (3B params) — zero-shot inference on pretrain embodiments, or finetune for new tasks |
+| [`nvidia/GR00T-N1.7-LIBERO`](https://huggingface.co/nvidia/GR00T-N1.7-LIBERO) | Finetuned | `LIBERO_PANDA` | Finetuned on [LIBERO](https://libero-project.github.io/) benchmark (Franka Panda) |
+| [`nvidia/GR00T-N1.7-DROID`](https://huggingface.co/nvidia/GR00T-N1.7-DROID) | Finetuned | `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` | Finetuned on [DROID](https://droid-dataset.github.io/) dataset |
+| [`nvidia/GR00T-N1.7-SimplerEnv-Bridge`](https://huggingface.co/nvidia/GR00T-N1.7-SimplerEnv-Bridge) | Finetuned | `SIMPLER_ENV_WIDOWX` | Finetuned on SimplerEnv Bridge (WidowX) |
+| [`nvidia/GR00T-N1.7-SimplerEnv-Fractal`](https://huggingface.co/nvidia/GR00T-N1.7-SimplerEnv-Fractal) | Finetuned | `SIMPLER_ENV_GOOGLE` | Finetuned on SimplerEnv Fractal (Google Robot) |
+> Older versions: [N1.6 checkpoints](https://github.com/NVIDIA/Isaac-GR00T/tree/n1.6-release) | [N1.5 checkpoints](https://github.com/NVIDIA/Isaac-GR00T/tree/n1.5-release)
+### Embodiment Tags
+Every inference or finetuning command requires an `--embodiment-tag`. The tag determines which modality config (state/action keys, normalization) the model uses. Tags are **case-insensitive**.
+For the full list of pretrain and posttrain tags, see the [Policy API Guide — Embodiment Tags](getting_started/policy.md#--embodiment-tag).
+---
+## Data Format
+GR00T uses a flavor of the [LeRobot v2 dataset format](https://github.com/huggingface/lerobot) with an additional `meta/modality.json` file that describes state/action/video structure. A dataset looks like:
+```
+my_dataset/
+  meta/
+    info.json            # dataset metadata
+    episodes.jsonl       # episode index and lengths
+    tasks.jsonl          # language task descriptions
+    modality.json        # state/action/video key mapping (GR00T-specific)
+  data/chunk-000/        # parquet files (state, action per timestep)
+  videos/chunk-000/      # mp4 video files per episode
+```
+The `modality.json` maps how the concatenated state/action arrays split into named fields (e.g., `x`, `y`, `z`, `gripper`) and which video keys are available. This is what the embodiment tag uses to interpret the data.
+**Included demo datasets** (ready to use, no download needed):
+| Dataset | Robot | Embodiment Tag | Use Case |
+|---------|-------|---------------|----------|
+| `demo_data/droid_sample` | DROID (3 episodes) | `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` | Zero-shot or finetuned inference (DROID) |
+| `demo_data/libero_demo` | LIBERO Panda (5 episodes) | `LIBERO_PANDA` | Inference with finetuned checkpoint |
+| `demo_data/simplerenv_bridge_sample` | WidowX (SimplerEnv Bridge) | `SIMPLER_ENV_WIDOWX` | Inference with finetuned SimplerEnv Bridge checkpoint |
+| `demo_data/simplerenv_fractal_sample` | Google Robot (SimplerEnv Fractal) | `SIMPLER_ENV_GOOGLE` | Inference with finetuned SimplerEnv Fractal checkpoint |
+| `demo_data/cube_to_bowl_5` | SO100 arm (5 episodes) | `NEW_EMBODIMENT` | Fine-tuning custom embodiment example |
+| `demo_data/cube_to_bowl_5_with_mask` | SO100 arm + per-frame masks | `NEW_EMBODIMENT` | [Mask-guided background suppression](examples/mask-guided-background-suppression/README.md) example |
+> To generate more DROID episodes: `python scripts/download_droid_sample.py --num-episodes 10`
+**Using your own data:** Convert your demonstrations to the format above. If coming from LeRobot v3, use the conversion script: `python scripts/lerobot_conversion/convert_v3_to_v2.py`. See the full [Data Preparation Guide](getting_started/data_preparation.md) for schema details and examples.
+---
+## Inference
+### Zero-Shot Inference (Base Model)
+The included `demo_data/droid_sample` dataset works with the base model out of the box — no finetuning or manual checkpoint download needed:
+```bash
+uv run python scripts/deployment/standalone_inference_script.py \
+    --model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path demo_data/droid_sample \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --traj-ids 1 2 \
+    --inference-mode pytorch \
+    --action-horizon 8
+```
+This runs open-loop inference on 2 DROID episodes, comparing predicted actions against ground truth. The base model downloads automatically from HuggingFace on first run (~6 GB).
+### Finetuned Inference
+For posttrain embodiments, use a finetuned checkpoint. Most finetuned checkpoints (e.g., DROID, SimplerEnv) have a flat file structure and can be passed directly as a HuggingFace model ID — no manual download needed:
+```bash
+uv run python scripts/deployment/standalone_inference_script.py \
+    --model-path nvidia/GR00T-N1.7-DROID \
+    --dataset-path demo_data/droid_sample \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --traj-ids 1 2 \
+    --inference-mode pytorch \
+    --action-horizon 8
+```
+Some checkpoints (e.g., LIBERO) use a nested folder structure with model files under a subfolder. HuggingFace does not support nested repo paths in `--model-path`, so you must download first:
+```bash
+uv run hf download nvidia/GR00T-N1.7-LIBERO \
+    --include "libero_10/config.json" "libero_10/embodiment_id.json" \
+    "libero_10/model-*.safetensors" "libero_10/model.safetensors.index.json" \
+    "libero_10/processor_config.json" "libero_10/statistics.json" \
+    --local-dir checkpoints/GR00T-N1.7-LIBERO
+```
+```bash
+uv run python scripts/deployment/standalone_inference_script.py \
+    --model-path checkpoints/GR00T-N1.7-LIBERO/libero_10 \
+    --dataset-path demo_data/libero_demo \
+    --embodiment-tag LIBERO_PANDA \
+    --traj-ids 0 1 2 \
+    --inference-mode pytorch \
+    --action-horizon 8
+```
+### Server-Client Inference (for Deployment)
+For real-world deployment or simulation evaluation, use the server-client architecture. The policy runs on a GPU server; a lightweight client sends observations and receives actions over ZMQ.
+**Terminal 1 — Start the policy server:**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path nvidia/GR00T-N1.7-3B \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --device cuda:0
+```
+**Terminal 2 — Run open-loop evaluation as a client:**
+```bash
+uv run python gr00t/eval/open_loop_eval.py \
+    --dataset-path demo_data/droid_sample \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --host 127.0.0.1 \
+    --port 5555 \
+    --traj-ids 1 2 \
+    --action-horizon 8
+```
+> **Tip:** If you get `ZMQError: Address already in use`, the default port 5555 is occupied. Use `--port <other_port>`.
+For connecting to a real robot (e.g., DROID hardware), see [examples/DROID/README.md](examples/DROID/README.md). For faster inference with TensorRT, see the [Deployment & Inference Guide](scripts/deployment/README.md).
+See the complete [Policy API Guide](getting_started/policy.md) for documentation on observation/action formats, batched inference, and troubleshooting.
+---
+## Fine-tuning
+### Reproducing Benchmark Results
+Each benchmark has a self-contained README with dataset download, finetune, and evaluation commands:
+| Benchmark | Embodiment | Guide |
+|-----------|-----------|-------|
+| LIBERO | `LIBERO_PANDA` | [examples/LIBERO/README.md](examples/LIBERO/README.md) |
+| SimplerEnv (Fractal) | `SIMPLER_ENV_GOOGLE` | [examples/SimplerEnv/README.md](examples/SimplerEnv/README.md) |
+| SimplerEnv (Bridge) | `SIMPLER_ENV_WIDOWX` | [examples/SimplerEnv/README.md](examples/SimplerEnv/README.md) |
+| SO100 | `NEW_EMBODIMENT` | [examples/SO100/README.md](examples/SO100/README.md) |
+### Fine-tune on Your Own Robot ("NEW_EMBODIMENT")
+To finetune GR00T on your own robot data and configuration, follow the detailed tutorial at [`getting_started/finetune_new_embodiment.md`](getting_started/finetune_new_embodiment.md).
+Ensure your input data follows the [GR00T LeRobot format](#data-format), and specify your modality configuration via `--modality-config-path`.
+**Single GPU:**
+```bash
+CUDA_VISIBLE_DEVICES=0 uv run python \
+    gr00t/experiment/launch_finetune.py \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path demo_data/cube_to_bowl_5 \
+    --embodiment-tag NEW_EMBODIMENT \
+    --modality-config-path examples/SO100/so100_config.py \
+    --num-gpus 1 \
+    --output-dir /tmp/test_finetune \
+    --max-steps 2000 \
+    --global-batch-size 32 \
+    --dataloader-num-workers 4
+```
+**Multi-GPU (e.g., 8xH100):**
+```bash
+uv run torchrun --nproc_per_node=8 --master_port=29500 \
+    gr00t/experiment/launch_finetune.py \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path demo_data/cube_to_bowl_5 \
+    --embodiment-tag NEW_EMBODIMENT \
+    --modality-config-path examples/SO100/so100_config.py \
+    --num-gpus 8 \
+    --output-dir /tmp/test_finetune_8gpu \
+    --max-steps 2000 \
+    --global-batch-size 32 \
+    --dataloader-num-workers 4
+```
+Replace `demo_data/cube_to_bowl_5` and `examples/SO100/so100_config.py` with your own dataset and modality config. See [`examples/SO100`](examples/SO100/README.md) for a complete walkthrough.
+> **Note:** Use `uv run torchrun` (not bare `torchrun`) to ensure the correct virtual environment is used. Add `--use-wandb` to enable Weights & Biases logging. For more extensive configuration, use `gr00t/experiment/launch_train.py`.
+### Training Tips
+- Maximize batch size for your hardware and train for a few thousand steps.
+- Users may observe 5-6% variance between runs due to non-deterministic image augmentations. Keep this in mind when comparing to reported benchmarks.
+- **`--state_dropout_prob`** (model config default: 0.8; finetune CLI default: 0.2; see `gr00t/configs/finetune_config.py`): Randomly drops state inputs during training to improve generalization and reduce state-dependency. The shipped benchmark scripts override the CLI default per suite: LIBERO 10-Long uses 0.2 (the CLI default), SimplerEnv Bridge uses 0.8, SimplerEnv Fractal uses 0.5. If your task relies heavily on proprioceptive state, lower this value.
+---
+## Evaluation
+### Open-Loop Evaluation
+Compare predicted actions against ground truth from your dataset:
+```bash
+uv run python gr00t/eval/open_loop_eval.py \
+    --dataset-path <DATASET_PATH> \
+    --embodiment-tag NEW_EMBODIMENT \
+    --model-path <CHECKPOINT_PATH> \
+    --traj-ids 0 \
+    --action-horizon 16
+```
+This generates a visualization at `/tmp/open_loop_eval/traj_{traj_id}.jpeg` with ground truth vs. predicted actions and MSE metrics. Use `--save-plot-path <dir>` to save plots to a custom location.
+### Closed-Loop Evaluation
+Test your model in simulation or on real hardware using the server-client architecture:
+```bash
+# Start the policy server
+uv run python gr00t/eval/run_gr00t_server.py \
+    --embodiment-tag NEW_EMBODIMENT \
+    --model-path <CHECKPOINT_PATH> \
+    --device cuda:0 \
+    --host 0.0.0.0 --port 5555
+```
+```python
+from gr00t.policy.server_client import PolicyClient
+policy = PolicyClient(host="localhost", port=5555)
+env = YourEnvironment()
+obs, info = env.reset()
+action, info = policy.get_action(obs)
+obs, reward, done, truncated, info = env.step(action)
+```
+**Debugging with ReplayPolicy:** To verify your environment setup without a trained model, start the server with `--dataset-path <DATASET_PATH>` (omit `--model-path`) to replay recorded actions from the dataset.
+See the complete [Policy API Guide](getting_started/policy.md) for observation/action formats, batched inference, and troubleshooting.
+### Benchmark Examples
+We support evaluation on public benchmarks using a server-client architecture. The policy server reuses the project root's uv environment; simulation clients have individual setup scripts.
+You can use [the verification script](scripts/eval/check_sim_eval_ready.py) to verify that all dependencies are properly configured.
+**Zero-shot** (evaluate with the base model, no finetuning):
+- [DROID](examples/DROID/README.md) — real-world DROID robot (also available as the finetuned `nvidia/GR00T-N1.7-DROID` checkpoint; `examples/DROID/README.md` covers both paths)
+**Finetuned** (evaluate with finetuned checkpoints):
+- [DROID](examples/DROID/README.md) — real-world DROID robot via `nvidia/GR00T-N1.7-DROID`
+- [LIBERO](examples/LIBERO/README.md) — LIBERO benchmark (Franka Panda)
+- [SimplerEnv](examples/SimplerEnv/README.md) — Google Robot (Fractal) and WidowX (Bridge)
+- [SO100](examples/SO100/README.md) — SO100 custom embodiment workflow
+<details>
+<summary><strong>Adding a New Sim Benchmark</strong></summary>
+Each sim benchmark registers its environments under a gym env_name with the format `{prefix}/{task_name}` (e.g., `libero_sim/LIVING_ROOM_SCENE2_put_soup_in_basket`). The evaluation framework uses the prefix to look up the corresponding `EmbodimentTag` via a mapping in [`gr00t/eval/sim/env_utils.py`](gr00t/eval/sim/env_utils.py).
+> **Important:** The env_name prefix and the `EmbodimentTag` name/value are often different. For example, the prefix `libero_sim` maps to the member `EmbodimentTag.LIBERO_PANDA` (whose string value is `"libero_sim"`). Do not assume they match.
+To add a new benchmark:
+1. Add an entry to `ENV_PREFIX_TO_EMBODIMENT_TAG` in `gr00t/eval/sim/env_utils.py`:
+   ```python
+   ENV_PREFIX_TO_EMBODIMENT_TAG = {
+       ...
+       "my_new_benchmark": EmbodimentTag.MY_ROBOT,
+   }
+   ```
+2. If the benchmark has multiple env_name prefixes (e.g., `my_benchmark_v1`, `my_benchmark_v2`), all related prefixes **must** map to the same `EmbodimentTag`.
+3. Add corresponding test cases in `tests/gr00t/eval/sim/test_env_utils.py` and update the `test_all_known_prefixes_present` test.
+</details>
+---
+## Contributions
+During Early Access we are not accepting pull requests while the codebase stabilizes. If you encounter issues or have suggestions, please open an [Issue](https://github.com/NVIDIA/Isaac-GR00T/issues) in this repository.
+## Support
+Support during Early Access is best-effort. We will continue iterating toward a more stable General Availability (GA) release.
+## License
+- **Code:** Apache 2.0 — see [LICENSE](LICENSE)
+- **Model weights:** [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/)
+```
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+```
+## Citation
+[Paper Site](https://research.nvidia.com/labs/lpr/publication/gr00tn1_2025/)
+```bibtex
+@inproceedings{gr00tn1_2025,
+  archivePrefix = {arxiv},
+  eprint     = {2503.14734},
+  title      = {{GR00T} {N1}: An Open Foundation Model for Generalist Humanoid Robots},
+  author     = {NVIDIA and Johan Bjorck and Fernando Castañeda and Nikita Cherniadev and Xingye Da and Runyu Ding and Linxi "Jim" Fan and Yu Fang and Dieter Fox and Fengyuan Hu and Spencer Huang and Joel Jang and Zhenyu Jiang and Jan Kautz and Kaushil Kundalia and Lawrence Lao and Zhiqi Li and Zongyu Lin and Kevin Lin and Guilin Liu and Edith Llontop and Loic Magne and Ajay Mandlekar and Avnish Narayan and Soroush Nasiriany and Scott Reed and You Liang Tan and Guanzhi Wang and Zu Wang and Jing Wang and Qi Wang and Jiannan Xiang and Yuqi Xie and Yinzhen Xu and Zhenjia Xu and Seonghyeon Ye and Zhiding Yu and Ao Zhang and Hao Zhang and Yizhou Zhao and Ruijie Zheng and Yuke Zhu},
+  month      = {March},
+  year       = {2025},
+  booktitle  = {ArXiv Preprint},
+}
+```

examples/DROID/README.md ADDED Viewed

	@@ -0,0 +1,110 @@

+# GR00T DROID
+The N1.7 base model supports DROID inference out of the box via the `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` pretrain tag. A finetuned checkpoint is also available at [`nvidia/GR00T-N1.7-DROID`](https://huggingface.co/nvidia/GR00T-N1.7-DROID).
+> **Note:** The DROID dataset contains multiple language instruction paraphrases per episode (`language_instruction`, `language_instruction_2`, `language_instruction_3`). These are used for language augmentation during training. At inference time, only the first language key is used.
+## Data Format
+The DROID embodiment expects the following modality structure:
+| Modality | Keys | Dimensions |
+|----------|------|------------|
+| Video | `exterior_image_1_left`, `wrist_image_left` | 2 cameras |
+| State | `eef_9d`, `gripper_position`, `joint_position` | 9D + 1D + 7D = 17D |
+| Action | `eef_9d`, `gripper_position`, `joint_position` | 9D + 1D + 7D = 17D |
+| Language | `annotation.language.language_instruction` | text |
+Action representations:
+- `eef_9d`: relative end-effector (XYZ + rotation 6D)
+- `gripper_position`: absolute (1D)
+- `joint_position`: relative joint positions (7D)
+### Preparing DROID Demo Data
+The full DROID dataset ([lerobot/droid_1.0.1](https://huggingface.co/datasets/lerobot/droid_1.0.1)) is ~358 GB with 95k+ episodes in LeRobot v3.0 format. To create a small sample for testing:
+```bash
+uv pip install jsonlines   # one-time dependency
+python scripts/download_droid_sample.py
+```
+This downloads the first data/video chunk (~170 MB) and extracts 3 episodes into `demo_data/droid_sample/` in GR00T LeRobot v2.0 format.
+**Key conversion notes:**
+- Source is LeRobot v3.0 (consolidated parquet + concatenated videos) — the script converts to v2.0 (per-episode parquet + per-episode mp4).
+- Video keys in the raw dataset (`exterior_1_left`, `wrist_left`) differ from the model config keys (`exterior_image_1_left`, `wrist_image_left`). The data loader auto-maps by position — no manual renaming needed.
+- Language instructions are loaded via the `task_index` column mapped through `tasks.jsonl`.
+## 1. Standalone Inference (with demo data)
+After preparing demo data, run inference directly (no server needed):
+```bash
+uv run python scripts/deployment/standalone_inference_script.py \
+    --model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path demo_data/droid_sample \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --traj-ids 0 1 \
+    --inference-mode pytorch \
+    --action-horizon 8
+```
+> **Note:** Episode 0 may have an empty language instruction. If inference fails on episode 0, try `--traj-ids 1 2`.
+Expected zero-shot performance on the base model (not finetuned); the per-step latency of the finetuned checkpoint is listed for comparison:
+| Metric | Value |
+|--------|-------|
+| Average MSE | ~0.0149 |
+| Average MAE | ~0.0753 |
+| Inference per step (base) | ~262 ms (H100) |
+| Inference per step (finetuned) | ~253 ms (H100) |
+## 2. Inference Server (for real-world deployment)
+### Using the base model (zero-shot):
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path nvidia/GR00T-N1.7-3B \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT
+```
+### Using the finetuned model:
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path nvidia/GR00T-N1.7-DROID \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT
+```
+## 3. Fine-tuning
+Fine-tune the base model on DROID data using the shared launcher:
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=640 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path demo_data/droid_sample \
+    --embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT \
+    --output-dir /tmp/droid_finetune
+```
+> **Note:** The above uses the small `demo_data/droid_sample` (3 episodes) for quick validation. For production training, replace `--dataset-path` with the full DROID dataset.
+## 4. Robot Control Script
+1. Install the DROID package on the robot control laptop/workstation — [instructions](https://droid-dataset.github.io/droid/software-setup/host-installation.html#configuring-the-laptopworkstation)
+2. Install dependencies for the GR00T control script in the environment from step 1:
+```bash
+pip install tyro moviepy==1.0.3 pydantic numpy==1.26.4
+```
+3. Enter the camera IDs for your ZED cameras in `examples/DROID/main_gr00t.py`.
+4. Start the control script:
+```bash
+python examples/DROID/main_gr00t.py --external-camera="left" # or "right"
+```

examples/DROID/main_gr00t.py ADDED Viewed

	@@ -0,0 +1,469 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ruff: noqa
+# NOTE: this requires installation of the droid repo.
+# Adapted from https://github.com/Physical-Intelligence/openpi/blob/main/examples/droid/main.py
+from __future__ import annotations
+import contextlib
+import dataclasses
+import datetime
+import faulthandler
+import os
+import signal
+import time
+from collections import deque
+import cv2
+import imageio
+import numpy as np
+import pandas as pd
+import tqdm
+import tyro
+from moviepy.editor import ImageSequenceClip
+from PIL import Image
+from droid.robot_env import RobotEnv
+from server_client import PolicyClient
+from utils import resize_with_pad
+from scipy.spatial.transform import Rotation
+# Have Python dump tracebacks if the process dies from a fatal signal.
+faulthandler.enable()
+# DROID data collection frequency -- we slow down execution to match this frequency
+# (presumably in Hz -- TODO confirm against the DROID data collection setup).
+DROID_CONTROL_FREQUENCY = 15
+# Passed to resize_with_pad as (RESOLUTION[0], RESOLUTION[1]); presumably (height, width) -- verify against utils.resize_with_pad.
+RESOLUTION = (180, 320)  # resize images to this resolution before sending to the policy server
+# Egocentric frame correction: R_euler is post-multiplied by this matrix
+# to match the OXE DROID training pipeline (TFG convention).
+# compute_eef_9d applies it as ``rot_robot @ DROID_EEF_ROTATION_CORRECT``.
+DROID_EEF_ROTATION_CORRECT = np.array(
+    [[0, 0, -1], [-1, 0, 0], [0, 1, 0]],
+    dtype=np.float64,
+)
+def compute_eef_9d(cartesian_position: np.ndarray) -> np.ndarray:
+    """Convert cartesian_position (XYZ + euler 3D) to eef_9d (XYZ + rot6d).
+    Uses extrinsic XYZ Euler convention (scipy ``"XYZ"``, equivalent to
+    ``tfg.rotation_matrix_3d.from_euler``) and post-multiplies by
+    ``DROID_EEF_ROTATION_CORRECT`` to match the pretrained model.
+    """
+    c = np.asarray(cartesian_position, dtype=np.float64).reshape(6)
+    xyz = c[:3]
+    euler = c[3:6]
+    rot_robot = Rotation.from_euler("XYZ", euler).as_matrix()
+    rot_mat = rot_robot @ DROID_EEF_ROTATION_CORRECT
+    rot6d = rot_mat[:2, :].reshape(6)
+    return np.concatenate([xyz, rot6d]).astype(np.float32)
+@dataclasses.dataclass
+class Args:
+    # Options for the DROID rollout script; ``main`` reads every setting from an
+    # instance of this class (presumably populated from the CLI via tyro -- verify at the entry point).
+    # Hardware parameters
+    # Camera IDs: the "<SET THIS>" placeholders are meant to be replaced with the IDs of your cameras.
+    left_camera_id: str = "<SET THIS>"  # e.g., "24259877"
+    right_camera_id: str = "<SET THIS>"  # e.g., "24514023"
+    wrist_camera_id: str = "<SET THIS>"  # e.g., "13062452"
+    # Policy parameters
+    # Host, port and optional API token are forwarded to PolicyClient in ``main``.
+    policy_host: str = "localhost"
+    policy_port: int = 5555
+    # NOTE(review): annotated as ``str`` but defaults to None (same for ``results_dir``);
+    # an optional type would describe the default more accurately -- confirm the CLI
+    # parser's handling before changing the annotation.
+    policy_api_token: str = None
+    results_dir: str = None  # if None, will use the current timestamp as the results directory
+    # Rollout parameters
+    max_timesteps: int = 600  # how many steps to run each rollout
+    # How many actions to execute from a predicted action chunk before querying policy server again
+    open_loop_horizon: int = 15
+    # ``main`` asserts that this is either "left" or "right".
+    external_camera: str = (
+        "left"  # which exterior camera to use for the policy server, choose from ["left", "right"]
+    )
+    render_camera: str = "left"  # which camera to render saved video from
+    render_fps: int = 50
+    # When True, ``main`` creates a ``debug_data`` directory under the results directory
+    # and opens video writers for the wrist and exterior images.
+    debug: bool = False
+    vis_cameras: bool = False
+    # ``main`` sleeps this long (via time.sleep) after the instruction prompt, before each rollout.
+    delay_seconds: int = 5
+# We use Ctrl+C to optionally terminate rollouts early. However, if Ctrl+C is pressed while this client is
+# waiting on the policy server for a new action chunk, it raises an exception and the server connection dies.
+# This context manager temporarily blocks Ctrl+C and re-raises it once the server call has completed.
+@contextlib.contextmanager
+def prevent_keyboard_interrupt():
+    """Temporarily prevent keyboard interrupts by delaying them until after the protected code."""
+    interrupted = False
+    original_handler = signal.getsignal(signal.SIGINT)
+    def handler(signum, frame):
+        nonlocal interrupted
+        interrupted = True
+    signal.signal(signal.SIGINT, handler)
+    try:
+        yield
+    finally:
+        signal.signal(signal.SIGINT, original_handler)
+        if interrupted:
+            raise KeyboardInterrupt
+def main(args: Args):
+    assert args.external_camera in ["left", "right"], (
+        f"Invalid exterior camera: {args.exterior_camera}"
+    )
+    if args.results_dir is None:
+        results_dir = f"results_gr00t_{datetime.datetime.now().strftime('%Y_%m_%d')}"
+    else:
+        results_dir = args.results_dir
+    # Initialize the Panda environment.
+    env = RobotEnv(action_space="joint_position", gripper_action_space="position")
+    print("Created the droid env!")
+    os.makedirs(results_dir, exist_ok=True)
+    policy_client = PolicyClient(
+        host=args.policy_host, port=args.policy_port, api_token=args.policy_api_token
+    )
+    modality_config = policy_client.get_modality_config()
+    video_delta = modality_config["video"].delta_indices
+    video_T = len(video_delta)
+    video_history_len = max(-min(video_delta), 0) + 1 if video_delta else 1
+    video_keys = modality_config["video"].modality_keys
+    state_keys = modality_config["state"].modality_keys
+    state_T = len(modality_config["state"].delta_indices)
+    print(
+        f"Model config — video T={video_T} (delta={video_delta}), "
+        f"state T={state_T}, keys: video={video_keys}, state={state_keys}"
+    )
+    df = pd.DataFrame(columns=["success", "duration", "video_filename"])
+    if args.debug:
+        debug_dir = os.path.join(results_dir, "debug_data")
+        os.makedirs(debug_dir, exist_ok=True)
+        os.makedirs(os.path.join(debug_dir, "videos/wrist_image/"), exist_ok=True)
+        os.makedirs(os.path.join(debug_dir, "videos/exterior_image_1_left/"), exist_ok=True)
+    instruction = None
+    while True:
+        if instruction is None:
+            instruction = input("Enter instruction: ")
+        else:
+            if input("Change instruction? (enter y or n) ").lower() == "y":
+                instruction = input("Enter instruction: ")
+        time.sleep(args.delay_seconds)
+        # Rollout parameters
+        actions_from_chunk_completed = 0
+        pred_action_chunk = None
+        # Prepare to save video of rollout
+        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H:%M:%S")
+        video = []
+        if args.debug:
+            model_wrist_image_writer = imageio.get_writer(
+                os.path.join(
+                    debug_dir, "videos/wrist_image/", f"model_wrist_image_{timestamp}.mp4"
+                ),
+                fps=5,
+            )
+            model_exterior_image_1_left_writer = imageio.get_writer(
+                os.path.join(
+                    debug_dir,
+                    "videos/exterior_image_1_left/",
+                    f"model_exterior_image_1_left_{timestamp}.mp4",
+                ),
+                fps=5,
+            )
+        bar = tqdm.tqdm(range(args.max_timesteps))
+        print("Running rollout... press Ctrl+C to stop early.")
+        # Profiling variables (reset for each rollout)
+        rollout_start_time = time.time()
+        obs_times = deque(maxlen=50)  # Track observation collection times
+        server_times = deque(maxlen=50)  # Track server response times
+        action_count = 0
+        frame_buffer = deque(maxlen=video_history_len)
+        for t_step in bar:
+            step_start_time = time.time()
+            try:
+                # Get the current observation
+                obs_start_time = time.time()
+                curr_obs = _extract_observation(
+                    args,
+                    env.get_observation(),
+                    # Save the first observation to disk
+                    save_to_disk=t_step == 0,
+                )
+                obs_time = time.time() - obs_start_time
+                obs_times.append(obs_time)
+                video.append(curr_obs[f"{args.render_camera}_image"])
+                # Resize every step so the rolling frame buffer stays current.
+                left_image = resize_with_pad(curr_obs["left_image"], RESOLUTION[0], RESOLUTION[1])
+                right_image = resize_with_pad(curr_obs["right_image"], RESOLUTION[0], RESOLUTION[1])
+                wrist_image = resize_with_pad(curr_obs["wrist_image"], RESOLUTION[0], RESOLUTION[1])
+                if args.external_camera == "left":
+                    ext_image = left_image
+                elif args.external_camera == "right":
+                    ext_image = right_image
+                frame_buffer.append({"ext": ext_image, "wrist": wrist_image})
+                # Send websocket request to policy server if it's time to predict a new chunk
+                if (
+                    actions_from_chunk_completed == 0
+                    or actions_from_chunk_completed >= args.open_loop_horizon
+                ):
+                    actions_from_chunk_completed = 0
+                    if args.debug:
+                        model_wrist_image_writer.append_data(wrist_image)
+                        model_exterior_image_1_left_writer.append_data(ext_image)
+                    # Build video tensor with T frames derived from the model's
+                    # delta_indices (e.g. [-15, 0] -> T=2, [0] -> T=1).
+                    if video_T == 1:
+                        video_dict = {
+                            "exterior_image_1_left": ext_image[None, None, ...],
+                            "wrist_image_left": wrist_image[None, None, ...],
+                        }  # (B=1, T=1, H, W, C)
+                    else:
+                        hist_frame = frame_buffer[0]
+                        cur_frame = frame_buffer[-1]
+                        video_dict = {
+                            "exterior_image_1_left": np.stack(
+                                [hist_frame["ext"], cur_frame["ext"]]
+                            )[None, ...],
+                            "wrist_image_left": np.stack([hist_frame["wrist"], cur_frame["wrist"]])[
+                                None, ...
+                            ],
+                        }  # (B=1, T=video_T, H, W, C)
+                    # Build state dict from the model's reported state keys.
+                    state_dict = {}
+                    state_source = {
+                        "eef_9d": curr_obs["eef_9d"],
+                        "gripper_position": curr_obs["gripper_position"],
+                        "joint_position": curr_obs["joint_position"],
+                    }
+                    for key in state_keys:
+                        state_dict[key] = state_source[key][None, None, ...].astype(
+                            np.float32
+                        )  # (B=1, T=1, D)
+                    lang_key = modality_config["language"].modality_keys[0]
+                    request_data = {
+                        "video": video_dict,
+                        "state": state_dict,
+                        "language": {lang_key: [[instruction]]},
+                    }
+                    if args.vis_cameras:
+                        # viz the left image 1 and wrist image and use cv2 to display them side by side
+                        left_image_display = cv2.resize(
+                            left_image, (wrist_image.shape[1], wrist_image.shape[0])
+                        )
+                        combined_display = np.concatenate([left_image_display, wrist_image], axis=1)
+                        # convert to bgr
+                        combined_display = combined_display[..., ::-1]
+                        cv2.imshow("Camera Views", combined_display)
+                        cv2.waitKey(1)
+                    # Wrap the server call in a context manager to prevent Ctrl+C from interrupting it
+                    # Ctrl+C will be handled after the server call is complete
+                    server_start_time = time.time()
+                    with prevent_keyboard_interrupt():
+                        # this returns action chunk [N, 8] of joint position actions (7) + gripper position (1)
+                        response = policy_client.get_action(request_data)
+                    server_time = time.time() - server_start_time
+                    server_times.append(server_time)
+                    pred_action_chunk = np.concatenate(
+                        (
+                            response[0]["joint_position"][0],
+                            response[0]["gripper_position"][0],
+                        ),
+                        axis=1,
+                    )
+                # Select current action to execute from chunk
+                action = pred_action_chunk[actions_from_chunk_completed]
+                actions_from_chunk_completed += 1
+                # Binarize gripper action
+                if action[-1].item() > 0.5:
+                    action = np.concatenate([action[:-1], np.ones((1,))])
+                else:
+                    action = np.concatenate([action[:-1], np.zeros((1,))])
+                env.step(action)
+                action_count += 1
+                # Sleep to match DROID data collection frequency
+                elapsed_time = time.time() - step_start_time
+                if elapsed_time < 1 / DROID_CONTROL_FREQUENCY:
+                    time.sleep(1 / DROID_CONTROL_FREQUENCY - elapsed_time)
+                #  profiling stats
+                if obs_times:
+                    avg_obs_time = np.mean(obs_times) * 1000
+                    min_obs_time = np.min(obs_times) * 1000
+                    max_obs_time = np.max(obs_times) * 1000
+                else:
+                    avg_obs_time = min_obs_time = max_obs_time = 0
+                if server_times:
+                    avg_server_time = np.mean(server_times) * 1000
+                    min_server_time = np.min(server_times) * 1000
+                    max_server_time = np.max(server_times) * 1000
+                else:
+                    avg_server_time = min_server_time = max_server_time = 0
+                total_elapsed = time.time() - rollout_start_time
+                actions_per_sec = action_count / total_elapsed if total_elapsed > 0 else 0
+                bar.set_description(
+                    f"Obs: {avg_obs_time:.1f}ms [{min_obs_time:.1f}-{max_obs_time:.1f}] | "
+                    f"Server: {avg_server_time:.1f}ms [{min_server_time:.1f}-{max_server_time:.1f}] | "
+                    f"Actions/sec: {actions_per_sec:.2f}"
+                )
+            except KeyboardInterrupt:
+                break
+        os.makedirs(os.path.join(results_dir, "videos"), exist_ok=True)
+        video = np.stack(video)
+        # replace whitespace with underscores in instruction
+        sanitized_instruction = instruction.replace(" ", "_")
+        save_filename = os.path.join(
+            results_dir, "videos", f"{sanitized_instruction}_video_" + timestamp
+        )
+        ImageSequenceClip(list(video), fps=args.render_fps).write_videofile(
+            save_filename + ".mp4", codec="libx264"
+        )
+        if args.debug:
+            model_wrist_image_writer.close()
+            model_exterior_image_1_left_writer.close()
+        success: str | float | None = None
+        while not isinstance(success, float):
+            success = input(
+                "Did the rollout succeed? (enter y for 100%, n for 0%), or a numeric value 0-100 based on the evaluation spec"
+            )
+            if success == "y":
+                success = 1.0
+            elif success == "n":
+                success = 0.0
+            success = float(success) / 100
+            if not (0 <= success <= 1):
+                print(f"Success must be a number in [0, 100] but got: {success * 100}")
+        new_row = {
+            "success": success,
+            "duration": t_step,
+            "video_filename": save_filename,
+        }
+        new_index = len(df)
+        df.loc[new_index] = new_row
+        if input("Do one more eval? (enter y or n) ").lower() != "y":
+            break
+        env.reset(randomize=False)
+    timestamp = datetime.datetime.now().strftime("%I:%M%p_%B_%d_%Y")
+    csv_filename = os.path.join(results_dir, f"eval_{timestamp}.csv")
+    df.to_csv(csv_filename)
+    print(f"Results saved to {csv_filename}")
+def _extract_observation(args: Args, obs_dict, *, stereo_camera="left", save_to_disk=False):
+    image_observations = obs_dict["image"]
+    key_left = f"{args.left_camera_id}_{stereo_camera}"
+    key_right = f"{args.right_camera_id}_{stereo_camera}"
+    key_wrist = f"{args.wrist_camera_id}_{stereo_camera}"
+    left_image = image_observations.get(key_left)
+    right_image = image_observations.get(key_right)
+    wrist_image = image_observations.get(key_wrist)
+    available = list(image_observations.keys())
+    assert left_image is not None, (
+        f"Left camera not found for key {key_left!r}. Available keys: {available}. "
+        "Set --left-camera-id to the ZED serial used in observation keys."
+    )
+    assert right_image is not None, (
+        f"Right camera not found for key {key_right!r}. Available keys: {available}. "
+        "Set --right-camera-id to the ZED serial used in observation keys."
+    )
+    assert wrist_image is not None, (
+        f"Wrist camera not found for key {key_wrist!r}. Available keys: {available}. "
+        "Set --wrist-camera-id to the ZED serial used in observation keys."
+    )
+    # Drop the alpha dimension
+    left_image = left_image[..., :3]
+    right_image = right_image[..., :3]
+    wrist_image = wrist_image[..., :3]
+    # Convert to RGB
+    left_image = left_image[..., ::-1]
+    right_image = right_image[..., ::-1]
+    wrist_image = wrist_image[..., ::-1]
+    # In addition to image observations, also capture the proprioceptive state
+    robot_state = obs_dict["robot_state"]
+    cartesian_position = np.array(robot_state["cartesian_position"])
+    joint_position = np.array(robot_state["joint_positions"])
+    gripper_position = np.array([robot_state["gripper_position"]])
+    eef_9d = compute_eef_9d(cartesian_position)
+    # Save the images to disk so that they can be viewed live while the robot is running
+    # Create one combined image to make live viewing easy
+    if save_to_disk:
+        combined_image = np.concatenate([left_image, wrist_image, right_image], axis=1)
+        combined_image = Image.fromarray(combined_image)
+        combined_image.save("robot_camera_views.png")
+    return {
+        "left_image": left_image,
+        "right_image": right_image,
+        "wrist_image": wrist_image,
+        "cartesian_position": cartesian_position,
+        "eef_9d": eef_9d,
+        "joint_position": joint_position,
+        "gripper_position": gripper_position,
+    }
+# Script entry point: only runs when the file is executed directly, not when imported.
+if __name__ == "__main__":
+    # Build an Args instance from the command line via tyro, then hand it to main().
+    args: Args = tyro.cli(Args)
+    main(args)

examples/DROID/server_client.py ADDED Viewed

	@@ -0,0 +1,365 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, is_dataclass
+from enum import Enum
+import io
+from typing import Any
+import msgpack
+import numpy as np
+import zmq
+def to_json_serializable(obj: Any) -> Any:
+    """
+    Recursively convert dataclasses and numpy arrays to JSON-serializable format.
+    Args:
+        obj: Object to convert (can be dataclass, numpy array, dict, list, etc.)
+    Returns:
+        JSON-serializable representation of the object
+    """
+    if is_dataclass(obj) and not isinstance(obj, type):
+        # Convert dataclass to dict, then recursively process the dict
+        return to_json_serializable(asdict(obj))
+    elif isinstance(obj, np.ndarray):
+        # Convert numpy array to list
+        return obj.tolist()
+    elif isinstance(obj, np.integer):
+        # Convert numpy integers to Python int
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        # Convert numpy floats to Python float
+        return float(obj)
+    elif isinstance(obj, np.bool_):
+        # Convert numpy bool to Python bool
+        return bool(obj)
+    elif isinstance(obj, dict):
+        # Recursively process dictionary values
+        return {key: to_json_serializable(value) for key, value in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        # Recursively process list/tuple elements
+        return [to_json_serializable(item) for item in obj]
+    elif isinstance(obj, set):
+        # Convert set to list
+        return [to_json_serializable(item) for item in obj]
+    elif isinstance(obj, (str, int, float, bool, type(None))):
+        # Already JSON-serializable
+        return obj
+    elif isinstance(obj, Enum):
+        return obj.name
+    else:
+        # For other types, try to convert to string as fallback
+        # You might want to handle specific types differently
+        return str(obj)
+class MessageType(Enum):
+    """Message categories; each member's value is its lowercase string tag."""
+    START_OF_EPISODE = "start_of_episode"
+    END_OF_EPISODE = "end_of_episode"
+    EPISODE_STEP = "episode_step"
+    IMAGE = "image"
+    TEXT = "text"
+class ActionRepresentation(Enum):
+    """Action representations (relative, delta, absolute); used as the type of ``ActionConfig.rep``."""
+    RELATIVE = "relative"
+    DELTA = "delta"
+    ABSOLUTE = "absolute"
+class ActionType(Enum):
+    """Action types (EEF vs. non-EEF); used as the type of ``ActionConfig.type``."""
+    # NOTE(review): "eef" presumably stands for end-effector — confirm.
+    EEF = "eef"
+    NON_EEF = "non_eef"
+class ActionFormat(Enum):
+    """Action formats; used as the type of ``ActionConfig.format``."""
+    DEFAULT = "default"
+    # NOTE(review): presumably position plus a 6-D / rotation-vector orientation — confirm.
+    XYZ_ROT6D = "xyz+rot6d"
+    XYZ_ROTVEC = "xyz+rotvec"
+@dataclass
+class ActionConfig:
+    """Per-key action settings: representation, type and format, plus an optional state key."""
+    # Representation of the action (relative, delta or absolute).
+    rep: ActionRepresentation
+    # Action type (EEF or non-EEF).
+    type: ActionType
+    # Layout of the action vector (see ActionFormat).
+    format: ActionFormat
+    # Optional state key associated with this action — presumably the state the
+    # action is expressed relative to; TODO confirm against the consumer.
+    state_key: str | None = None
+@dataclass
+class ModalityConfig:
+    """Configuration for a modality defining how data should be sampled and loaded.
+    This class specifies which indices to sample relative to a base index and which
+    keys to load for a particular modality (e.g., video, state, action).
+    """
+    delta_indices: list[int]
+    """Delta indices to sample relative to the current index. The returned data will correspond to the original data at a sampled base index + delta indices."""
+    modality_keys: list[str]
+    """The keys to load for the modality in the dataset."""
+    sin_cos_embedding_keys: list[str] | None = None
+    """Optional list of keys to apply sin/cos encoding. If None or empty, use min/max normalization for all keys."""
+    mean_std_embedding_keys: list[str] | None = None
+    """Optional list of keys to apply mean/std normalization. If None or empty, use min/max normalization for all keys."""
+    action_configs: list[ActionConfig] | None = None
+    def __post_init__(self):
+        """Set default values for action-related fields if not specified."""
+        if self.action_configs is not None:
+            assert len(self.action_configs) == len(self.modality_keys), (
+                f"Number of action configs ({len(self.action_configs)}) must match number of modality keys ({len(self.modality_keys)})"
+            )
+            parsed_action_configs = []
+            for action_config in self.action_configs:
+                if isinstance(action_config, dict):
+                    action_config = ActionConfig(
+                        rep=ActionRepresentation[action_config["rep"]],
+                        type=ActionType[action_config["type"]],
+                        format=ActionFormat[action_config["format"]],
+                        state_key=action_config.get("state_key", None),
+                    )
+                parsed_action_configs.append(action_config)
+            self.action_configs = parsed_action_configs
+class MsgSerializer:
+    @staticmethod
+    def to_bytes(data: Any) -> bytes:
+        return msgpack.packb(data, default=MsgSerializer.encode_custom_classes)
+    @staticmethod
+    def from_bytes(data: bytes) -> Any:
+        return msgpack.unpackb(data, object_hook=MsgSerializer.decode_custom_classes)
+    @staticmethod
+    def decode_custom_classes(obj):
+        if not isinstance(obj, dict):
+            return obj
+        if "__ModalityConfig_class__" in obj:
+            return ModalityConfig(**obj["as_json"])
+        if "__ndarray_class__" in obj:
+            return np.load(io.BytesIO(obj["as_npy"]), allow_pickle=False)
+        return obj
+    @staticmethod
+    def encode_custom_classes(obj):
+        if isinstance(obj, ModalityConfig):
+            # Convert to dict and let msgpack recursively handle nested objects
+            return {"__ModalityConfig_class__": True, "as_json": to_json_serializable(obj)}
+        if isinstance(obj, np.ndarray):
+            output = io.BytesIO()
+            np.save(output, obj, allow_pickle=False)
+            return {"__ndarray_class__": True, "as_npy": output.getvalue()}
+        return obj
+class BasePolicy(ABC):
+    """Abstract base class for robotic control policies.
+    This class defines the interface that all policies must implement, including
+    methods for action computation, input/output validation, and state management.
+    Subclasses must implement:
+        - check_observation(): Validate observation format
+        - check_action(): Validate action format
+        - _get_action(): Core action computation logic
+        - reset(): Reset policy to initial state
+    """
+    def __init__(self, *, strict: bool = True):
+        self.strict = strict
+    @abstractmethod
+    def check_observation(self, observation: dict[str, Any]) -> None:
+        """Check if the observation is valid.
+        Args:
+            observation: Dictionary containing the current state/observation of the environment
+        Raises:
+            AssertionError: If the observation is invalid.
+        """
+        pass
+    @abstractmethod
+    def check_action(self, action: dict[str, Any]) -> None:
+        """Check if the action is valid.
+        Args:
+            action: Dictionary containing the action to be executed
+        Raises:
+            AssertionError: If the action is invalid.
+        """
+        pass
+    @abstractmethod
+    def _get_action(
+        self, observation: dict[str, Any], options: dict[str, Any] | None = None
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Compute and return the next action based on current observation.
+        This method should be overridden by subclasses to implement policy-specific
+        action computation. Input validation is handled by the public get_action() method.
+        Args:
+            observation: Dictionary containing the current state/observation
+            options: Optional configuration dict for action computation
+        Returns:
+            Tuple of (action, info):
+                - action: Dictionary containing the action to be executed
+                - info: Dictionary containing additional metadata (e.g., confidence scores)
+        """
+        pass
+    def get_action(
+        self, observation: dict[str, Any], options: dict[str, Any] | None = None
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Compute and return the next action based on current observation with validation.
+        This is the main public interface. It validates the observation, calls
+        the internal _get_action(), and validates the resulting action.
+        Args:
+            observation: Dictionary containing the current state/observation
+            options: Optional configuration dict for action computation
+        Returns:
+            Tuple of (action, info):
+                - action: Dictionary containing the validated action
+                - info: Dictionary containing additional metadata
+        Raises:
+            AssertionError/ValueError: If observation or action validation fails
+        """
+        if self.strict:
+            self.check_observation(observation)
+        action, info = self._get_action(observation, options)
+        if self.strict:
+            self.check_action(action)
+        return action, info
+    @abstractmethod
+    def reset(self, options: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Reset the policy to its initial state.
+        Args:
+            options: Dictionary containing the options for the reset
+        Returns:
+            Dictionary containing the info after resetting the policy
+        """
+        pass
+class PolicyClient(BasePolicy):
+    def __init__(
+        self,
+        host: str = "localhost",
+        port: int = 5555,
+        timeout_ms: int = 15000,
+        api_token: str = None,
+        strict: bool = False,
+    ):
+        super().__init__(strict=strict)
+        self.context = zmq.Context()
+        self.host = host
+        self.port = port
+        self.timeout_ms = timeout_ms
+        self.api_token = api_token
+        self._init_socket()
+    def _init_socket(self):
+        """Initialize or reinitialize the socket with current settings"""
+        self.socket = self.context.socket(zmq.REQ)
+        self.socket.connect(f"tcp://{self.host}:{self.port}")
+    def ping(self) -> bool:
+        try:
+            self.call_endpoint("ping", requires_input=False)
+            return True
+        except zmq.error.ZMQError:
+            self._init_socket()  # Recreate socket for next attempt
+            return False
+    def kill_server(self):
+        """
+        Kill the server.
+        """
+        self.call_endpoint("kill", requires_input=False)
+    def call_endpoint(
+        self, endpoint: str, data: dict | None = None, requires_input: bool = True
+    ) -> Any:
+        """
+        Call an endpoint on the server.
+        Args:
+            endpoint: The name of the endpoint.
+            data: The input data for the endpoint.
+            requires_input: Whether the endpoint requires input data.
+        """
+        request: dict = {"endpoint": endpoint}
+        if requires_input:
+            request["data"] = data
+        if self.api_token:
+            request["api_token"] = self.api_token
+        self.socket.send(MsgSerializer.to_bytes(request))
+        message = self.socket.recv()
+        if message == b"ERROR":
+            raise RuntimeError("Server error. Make sure we are running the correct policy server.")
+        response = MsgSerializer.from_bytes(message)
+        if isinstance(response, dict) and "error" in response:
+            raise RuntimeError(f"Server error: {response['error']}")
+        return response
+    def __del__(self):
+        """Cleanup resources on destruction"""
+        self.socket.close()
+        self.context.term()
+    def _get_action(
+        self, observation: dict[str, Any], options: dict[str, Any] | None = None
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        response = self.call_endpoint(
+            "get_action", {"observation": observation, "options": options}
+        )
+        return tuple(response)  # Convert list (from msgpack) to tuple of (action, info)
+    def reset(self, options: dict[str, Any] | None = None) -> dict[str, Any]:
+        return self.call_endpoint("reset", {"options": options})
+    def get_modality_config(self) -> dict[str, ModalityConfig]:
+        return self.call_endpoint("get_modality_config", requires_input=False)
+    def check_observation(self, observation: dict[str, Any]) -> None:
+        raise NotImplementedError(
+            "check_observation is not implemented. Please use `strict=False` to disable strict mode or implement this method in the subclass."
+        )
+    def check_action(self, action: dict[str, Any]) -> None:
+        raise NotImplementedError(
+            "check_action is not implemented. Please use `strict=False` to disable strict mode or implement this method in the subclass."
+        )

examples/DROID/utils.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Taken from https://github.com/Physical-Intelligence/openpi/tree/main/packages/openpi-client/src/openpi_client
+"""
+import numpy as np
+from PIL import Image
+def convert_to_uint8(img: np.ndarray) -> np.ndarray:
+    """Converts an image to uint8 if it is a float image.
+    This is important for reducing the size of the image when sending it over the network.
+    """
+    if np.issubdtype(img.dtype, np.floating):
+        img = (255 * img).astype(np.uint8)
+    return img
+def resize_with_pad(
+    images: np.ndarray, height: int, width: int, method=Image.BILINEAR
+) -> np.ndarray:
+    """Replicates tf.image.resize_with_pad for multiple images using PIL. Resizes a batch of images to a target height.
+    Args:
+        images: A batch of images in [..., height, width, channel] format.
+        height: The target height of the image.
+        width: The target width of the image.
+        method: The interpolation method to use. Default is bilinear.
+    Returns:
+        The resized images in [..., height, width, channel].
+    """
+    # If the images are already the correct size, return them as is.
+    if images.shape[-3:-1] == (height, width):
+        return images
+    original_shape = images.shape
+    images = images.reshape(-1, *original_shape[-3:])
+    resized = np.stack(
+        [_resize_with_pad_pil(Image.fromarray(im), height, width, method=method) for im in images]
+    )
+    return resized.reshape(*original_shape[:-3], *resized.shape[-3:])
+def _resize_with_pad_pil(image: Image.Image, height: int, width: int, method: int) -> Image.Image:
+    """Replicates tf.image.resize_with_pad for one image using PIL. Resizes an image to a target height and
+    width without distortion by padding with zeros.
+    Unlike the jax version, note that PIL uses [width, height, channel] ordering instead of [batch, h, w, c].
+    """
+    cur_width, cur_height = image.size
+    if cur_width == width and cur_height == height:
+        return image  # No need to resize if the image is already the correct size.
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+    resized_image = image.resize((resized_width, resized_height), resample=method)
+    zero_image = Image.new(resized_image.mode, (width, height), 0)
+    pad_height = max(0, int((height - resized_height) / 2))
+    pad_width = max(0, int((width - resized_width) / 2))
+    zero_image.paste(resized_image, (pad_width, pad_height))
+    assert zero_image.size == (width, height)
+    return zero_image

examples/LIBERO/README.md ADDED Viewed

	@@ -0,0 +1,196 @@

+# LIBERO
+Benchmark for studying knowledge transfer in lifelong robot learning. Includes multiple suites: **Spatial** (spatial reasoning), **Object** (object generalization), **Goal** (goal-conditioned learning), and **10 Long** (long-horizon multi-step tasks). Provides RGB images, proprioception data, and language task specifications.
+For more information, see the [official website](https://libero-project.github.io/main.html).
+---
+# LIBERO evaluation benchmark results
+> **Note:** The full task list is attached at the end of this document.
+All four suites were finetuned with the same hyper-parameters, including
+`--state-dropout-prob 0.2` (the finetune CLI default from
+`gr00t/configs/finetune_config.py`).
+| Task      | Success rate       | max_steps | grad_accum_steps | batch_size |
+|-----------|--------------------|-----------|------------------|------------|
+| Spatial   | 195/200 (97.65%)        | 20K       | 1                | 640        |
+| Goal      | 195/200 (97.5%)        | 20K       | 1                | 640        |
+| Object    | 197/200 (98.45%)        | 20K       | 1                | 640        |
+| 10 (Long) | 189/200 (94.35%)        | 20K       | 1                | 640        |
+# Fine-tune LIBERO 10 (long)
+To reproduce our finetune results, use the following commands to set up the dataset and launch the finetune experiments. Please remember to set `WANDB_API_KEY`, since `--use-wandb` is turned on by default. If you don't have a W&B account, please remove this argument:
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/libero_10_no_noops_1.0.0_lerobot \
+    --local-dir examples/LIBERO/libero_10_no_noops_1.0.0_lerobot/
+# Copy the patches and run the finetune script
+cp -r examples/LIBERO/modality.json examples/LIBERO/libero_10_no_noops_1.0.0_lerobot/meta/
+```
+Run the shared finetune launcher:
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=640 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/LIBERO/libero_10_no_noops_1.0.0_lerobot/ \
+    --embodiment-tag LIBERO_PANDA \
+    --output-dir /tmp/libero_10 \
+    --state-dropout-prob 0.2
+```
+# Fine-tune LIBERO goal
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/libero_goal_no_noops_1.0.0_lerobot \
+    --local-dir examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot/
+# Copy the patches and run the finetune script
+cp -r examples/LIBERO/modality.json examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot/meta/
+## This is a patch for one of the episodes, where the image seems to be corrupted.
+cp examples/LIBERO/patches/episode_000082.mp4 examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot/videos/chunk-000/observation.images.wrist_image/
+```
+Run the shared finetune launcher:
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=640 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot/ \
+    --embodiment-tag LIBERO_PANDA \
+    --output-dir /tmp/libero_goal
+```
+# Fine-tune LIBERO object
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/libero_object_no_noops_1.0.0_lerobot \
+    --local-dir examples/LIBERO/libero_object_no_noops_1.0.0_lerobot/
+# Copy the patches and run the finetune script
+cp -r examples/LIBERO/modality.json examples/LIBERO/libero_object_no_noops_1.0.0_lerobot/meta/
+```
+Run the shared finetune launcher:
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=640 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/LIBERO/libero_object_no_noops_1.0.0_lerobot/ \
+    --embodiment-tag LIBERO_PANDA \
+    --output-dir /tmp/libero_object
+```
+# Fine-tune LIBERO spatial
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/libero_spatial_no_noops_1.0.0_lerobot \
+    --local-dir examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot/
+# Copy the patches and run the finetune script
+cp -r examples/LIBERO/modality.json examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot/meta/
+```
+Run the shared finetune launcher:
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=640 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot/ \
+    --embodiment-tag LIBERO_PANDA \
+    --output-dir /tmp/libero_spatial
+```
+# Evaluate checkpoint
+First, set up the evaluation simulation environment. This only needs to be run once for each simulation benchmark. After it's done, we only need to launch the server and the client.
+```bash
+sudo apt update
+sudo apt install libegl1-mesa-dev libglu1-mesa
+bash gr00t/eval/sim/LIBERO/setup_libero.sh
+```
+Then, download the finetuned model to a local directory (HuggingFace does not support nested repo paths directly):
+```bash
+uv run hf download nvidia/GR00T-N1.7-LIBERO --include "libero_10/config.json" "libero_10/embodiment_id.json" "libero_10/model-*.safetensors" "libero_10/model.safetensors.index.json" "libero_10/processor_config.json" "libero_10/statistics.json" --local-dir checkpoints/GR00T-N1.7-LIBERO
+```
+Run client server evaluation under the project root directory in separate terminals:
+**Terminal 1 - Server:**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path checkpoints/GR00T-N1.7-LIBERO/libero_10 \
+    --embodiment-tag LIBERO_PANDA \
+    --use-sim-policy-wrapper
+```
+> **Note:** Replace `checkpoints/GR00T-N1.7-LIBERO/libero_10` with your own checkpoint path (e.g., `/tmp/libero_10/checkpoint-20000/`) if evaluating a locally finetuned model.
+**Terminal 2 - Client:**
+```bash
+gr00t/eval/sim/LIBERO/libero_uv/.venv/bin/python gr00t/eval/rollout_policy.py \
+    --n-episodes 10 \
+    --policy-client-host 127.0.0.1 \
+    --policy-client-port 5555 \
+    --max-episode-steps 720 \
+    --env-name libero_sim/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it \
+    --n-action-steps 8 \
+    --n-envs 5
+```
+# Full task list
+## Libero 10 (Long)
+- `libero_sim/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket`
+- `libero_sim/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket`
+- `libero_sim/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it`
+- `libero_sim/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it`
+- `libero_sim/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate`
+- `libero_sim/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy`
+- `libero_sim/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate`
+- `libero_sim/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket`
+- `libero_sim/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove`
+- `libero_sim/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it`
+## Libero Goal
+- `libero_sim/open_the_middle_drawer_of_the_cabinet`
+- `libero_sim/put_the_bowl_on_the_stove`
+- `libero_sim/put_the_wine_bottle_on_top_of_the_cabinet`
+- `libero_sim/open_the_top_drawer_and_put_the_bowl_inside`
+- `libero_sim/put_the_bowl_on_top_of_the_cabinet`
+- `libero_sim/push_the_plate_to_the_front_of_the_stove`
+- `libero_sim/put_the_cream_cheese_in_the_bowl`
+- `libero_sim/turn_on_the_stove`
+- `libero_sim/put_the_bowl_on_the_plate`
+- `libero_sim/put_the_wine_bottle_on_the_rack`
+## Libero Object
+- `libero_sim/pick_up_the_alphabet_soup_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_cream_cheese_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_salad_dressing_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_bbq_sauce_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_ketchup_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_tomato_sauce_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_butter_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_milk_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_chocolate_pudding_and_place_it_in_the_basket`
+- `libero_sim/pick_up_the_orange_juice_and_place_it_in_the_basket`
+## Libero Spatial
+- `libero_sim/pick_up_the_black_bowl_between_the_plate_and_the_ramekin_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_next_to_the_ramekin_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_from_table_center_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_on_the_cookie_box_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_in_the_top_drawer_of_the_wooden_cabinet_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_on_the_ramekin_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_next_to_the_cookie_box_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_on_the_stove_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_next_to_the_plate_and_place_it_on_the_plate`
+- `libero_sim/pick_up_the_black_bowl_on_the_wooden_cabinet_and_place_it_on_the_plate`

examples/LIBERO/modality.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+    "state": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "roll": {
+            "start": 3,
+            "end": 4
+        },
+        "pitch": {
+            "start": 4,
+            "end": 5
+        },
+        "yaw": {
+            "start": 5,
+            "end": 6
+        },
+        "gripper": {
+            "start": 6,
+            "end": 8
+        }
+    },
+    "action": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "roll": {
+            "start": 3,
+            "end": 4
+        },
+        "pitch": {
+            "start": 4,
+            "end": 5
+        },
+        "yaw": {
+            "start": 5,
+            "end": 6
+        },
+        "gripper": {
+            "start": 6,
+            "end": 7
+        }
+    },
+    "video": {
+        "image": {
+            "original_key": "observation.images.image"
+        },
+        "wrist_image": {
+            "original_key": "observation.images.wrist_image"
+        }
+    },
+    "annotation": {
+        "human.action.task_description": {
+            "original_key": "task_index"
+        }
+    }
+}

examples/SO100/README.md ADDED Viewed

	@@ -0,0 +1,87 @@

+# Finetuning SO100 Model
+This guide shows how to finetune on a dataset collected from an [SO100](https://huggingface.co/docs/lerobot/en/so101) robot, and how to evaluate the model on the real robot.
+## Dataset
+To collect the dataset via teleoperation, please refer to the official documentation in lerobot: https://huggingface.co/docs/lerobot/il_robots?teleoperate_so101=Command
+**Dataset Path:** [izuluaga/finish_sandwich](https://huggingface.co/datasets/izuluaga/finish_sandwich)
+Visualize it with this [link](https://huggingface.co/spaces/lerobot/visualize_dataset?path=%2Fizuluaga%2Ffinish_sandwich%2Fepisode_0)
+## Handling the dataset
+```bash
+uv run --project scripts/lerobot_conversion \
+  python scripts/lerobot_conversion/convert_v3_to_v2.py \
+  --repo-id izuluaga/finish_sandwich \
+  --root examples/SO100/finish_sandwich_lerobot
+```
+Then copy the `modality.json` file into the `meta/` directory of the dataset.
+```bash
+cp examples/SO100/modality.json examples/SO100/finish_sandwich_lerobot/izuluaga/finish_sandwich/meta/modality.json
+```
+## Finetuning
+Run the shared finetune launcher directly. The provided `so100_config.py` uses relative joint positions for the arm and absolute positions for the gripper (feel free to experiment with other action representations):
+```bash
+CUDA_VISIBLE_DEVICES=0 NUM_GPUS=1 uv run bash examples/finetune.sh \
+  --base-model-path nvidia/GR00T-N1.7-3B \
+  --dataset-path examples/SO100/finish_sandwich_lerobot/izuluaga/finish_sandwich \
+  --modality-config-path examples/SO100/so100_config.py \
+  --embodiment-tag NEW_EMBODIMENT \
+  --output-dir /tmp/so100_finetune
+```
+## Open-Loop Evaluation
+Evaluate the finetuned model with the following command:
+```bash
+uv run python gr00t/eval/open_loop_eval.py \
+  --dataset-path examples/SO100/finish_sandwich_lerobot/izuluaga/finish_sandwich/ \
+  --embodiment-tag NEW_EMBODIMENT \
+  --model-path /tmp/so100_finetune/checkpoint-10000 \
+  --traj-ids 0 \
+  --action-horizon 16 \
+  --steps 400
+```
+### Evaluation Results
+The evaluation produces visualizations comparing predicted actions against ground truth trajectories:
+<img src="../../media/open_loop_eval_so100.jpg" width="800" alt="Open-loop evaluation results showing predicted vs ground truth trajectories" />
+## Closed-Loop Evaluation
+Please refer to [eval_so100.py](../../gr00t/eval/real_robot/SO100/eval_so100.py) for how to write SO100 deployment code using Policy API.
+1. set up client side deps
+```bash
+cd gr00t/eval/real_robot/SO100
+uv venv
+source .venv/bin/activate
+uv pip install -e . --verbose
+uv pip install --no-deps -e ../../../../
+```
+2. Start policy server
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+  --model-path /tmp/so100_finetune/checkpoint-10000 \
+  --embodiment-tag NEW_EMBODIMENT
+```
+3. Run the eval script, as client.
+```bash
+uv run python gr00t/eval/real_robot/SO100/eval_so100.py \
+  --robot.type=so101_follower --robot.port=/dev/ttyACM2 \
+  --robot.id=orange_follower \
+  --robot.cameras="{ wrist: {type: opencv, index_or_path: 2, width: 640, height: 480, fps: 30}, front: {type: opencv, index_or_path: 6, width: 640, height: 480, fps: 30}}" \
+  --policy-host=localhost --policy-port=5555 --lang-instruction="finish the ham cheese olives sandwich"
+```

examples/SO100/modality.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "state": {
+        "single_arm": {
+            "start": 0,
+            "end": 5
+        },
+        "gripper": {
+            "start": 5,
+            "end": 6
+        }
+    },
+    "action": {
+        "single_arm": {
+            "start": 0,
+            "end": 5
+        },
+        "gripper": {
+            "start": 5,
+            "end": 6
+        }
+    },
+    "video": {
+        "front": {
+            "original_key": "observation.images.front"
+        },
+        "wrist": {
+            "original_key": "observation.images.wrist"
+        }
+    },
+    "annotation": {
+        "human.task_description": {
+            "original_key": "task_index"
+        }
+    }
+}

examples/SO100/so100_config.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from gr00t.configs.data.embodiment_configs import register_modality_config
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.data.types import (
+    ActionConfig,
+    ActionFormat,
+    ActionRepresentation,
+    ActionType,
+    ModalityConfig,
+)
+so100_config = {
+    # Video: current frame only; keys must match "video" entries in meta/modality.json
+    "video": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["front", "wrist"],  # front third-person view + wrist egocentric
+    ),
+    # State: current proprioceptive reading; keys must match "state" entries in meta/modality.json
+    "state": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=[
+            "single_arm",  # joint positions
+            "gripper",  # gripper state
+        ],
+    ),
+    # Action: 16-step prediction horizon; one ActionConfig per modality key
+    "action": ModalityConfig(
+        delta_indices=list(range(0, 16)),  # predict 16 future steps
+        modality_keys=[
+            "single_arm",
+            "gripper",
+        ],
+        action_configs=[
+            # single_arm: RELATIVE = delta from current state (better generalization)
+            ActionConfig(
+                rep=ActionRepresentation.RELATIVE,
+                type=ActionType.NON_EEF,  # joint-space, not end-effector
+                format=ActionFormat.DEFAULT,
+            ),
+            # gripper: ABSOLUTE = target position (binary open/close works better absolute)
+            ActionConfig(
+                rep=ActionRepresentation.ABSOLUTE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+        ],
+    ),
+    # Language: task instruction from annotation field in the dataset
+    "language": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["annotation.human.task_description"],
+    ),
+}
+register_modality_config(so100_config, embodiment_tag=EmbodimentTag.NEW_EMBODIMENT)

examples/SimplerEnv/README.md ADDED Viewed

	@@ -0,0 +1,141 @@

+# SimplerEnv
+Framework for evaluating real-world robot manipulation policies (RT-1, RT-1-X, Octo) in simulation. Replicates common setups like Google Robot and WidowX+Bridge, with GPU-accelerated simulations (10-15x speedup). Offers visual matching and variant aggregation evaluation methods for robust policy assessment.
+For more information, see the [official repository](https://github.com/simpler-env/SimplerEnv).
+---
+# Fine-tune Simpler Env bridge dataset (WidowX robot)
+To reproduce our finetune results, use the following commands to set up the dataset and launch the finetune experiments. W&B logging is enabled by default in `examples/finetune.sh` (`USE_WANDB=1`), so please remember to set `WANDB_API_KEY`. If you don't have a W&B account, prefix the launch command with `USE_WANDB=0` to disable it:
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/bridge_orig_lerobot \
+    --local-dir examples/SimplerEnv/bridge_orig_lerobot/
+# Copy the patches and run the finetune script
+cp examples/SimplerEnv/bridge_modality.json examples/SimplerEnv/bridge_orig_lerobot/meta/modality.json
+```
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=1024 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/SimplerEnv/bridge_orig_lerobot/ \
+    --embodiment-tag SIMPLER_ENV_WIDOWX \
+    --output-dir /tmp/bridge_finetune \
+    --state-dropout-prob 0.8
+```
+# Fine-tune Simpler Env fractal dataset (Google robot)
+```bash
+uv run hf download \
+    --repo-type dataset IPEC-COMMUNITY/fractal20220817_data_lerobot \
+    --local-dir examples/SimplerEnv/fractal20220817_data_lerobot/
+# Copy the patches and run the finetune script
+cp -r examples/SimplerEnv/fractal_modality.json examples/SimplerEnv/fractal20220817_data_lerobot/meta/modality.json
+uv run python examples/SimplerEnv/convert_av1_to_h264.py examples/SimplerEnv/fractal20220817_data_lerobot --jobs 16  # (Optional) if AV1 doesn't work on your machine
+```
+```bash
+NUM_GPUS=8 MAX_STEPS=20000 GLOBAL_BATCH_SIZE=1024 SAVE_STEPS=1000 uv run bash examples/finetune.sh \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path examples/SimplerEnv/fractal20220817_data_lerobot/ \
+    --embodiment-tag SIMPLER_ENV_GOOGLE \
+    --output-dir /tmp/fractal_finetune \
+    --state-dropout-prob 0.5
+```
+# Evaluate checkpoint
+First, set up the evaluation simulation environment. This only needs to be run once for each simulation benchmark. After it's done, we only need to launch the server and the client.
+```bash
+sudo apt update
+sudo apt install libegl1-mesa-dev libglu1-mesa
+bash gr00t/eval/sim/SimplerEnv/setup_SimplerEnv.sh
+```
+Then, run client server evaluation under the project root directory in separate terminals:
+## Fractal (Google Robot) Evaluation
+**Terminal 1 - Server:**
+You can use either a local finetuned checkpoint path or the remote finetuned checkpoint (provided by us):
+**Option 1: Local finetuned checkpoint**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path /tmp/fractal_finetune/checkpoint-20000 \
+    --embodiment-tag SIMPLER_ENV_GOOGLE \
+    --use-sim-policy-wrapper
+```
+**Option 2: Remote finetuned checkpoint (directly runnable)**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path nvidia/GR00T-N1.7-SimplerEnv-Fractal \
+    --embodiment-tag SIMPLER_ENV_GOOGLE \
+    --use-sim-policy-wrapper
+```
+**Terminal 2 - Client:**
+```bash
+gr00t/eval/sim/SimplerEnv/simpler_uv/.venv/bin/python gr00t/eval/rollout_policy.py \
+    --n-episodes 10 \
+    --policy-client-host 127.0.0.1 \
+    --policy-client-port 5555 \
+    --max-episode-steps 300 \
+    --env-name simpler_env_google/google_robot_pick_coke_can \
+    --n-action-steps 1 \
+    --n-envs 5
+```
+## Bridge (WidowX) Evaluation
+**Terminal 1 - Server:**
+**Option 1: Local finetuned checkpoint**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path /tmp/bridge_finetune/checkpoint-20000 \
+    --embodiment-tag SIMPLER_ENV_WIDOWX \
+    --use-sim-policy-wrapper
+```
+**Option 2: Remote finetuned checkpoint (directly runnable)**
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --model-path nvidia/GR00T-N1.7-SimplerEnv-Bridge \
+    --embodiment-tag SIMPLER_ENV_WIDOWX \
+    --use-sim-policy-wrapper
+```
+**Terminal 2 - Client:**
+```bash
+gr00t/eval/sim/SimplerEnv/simpler_uv/.venv/bin/python gr00t/eval/rollout_policy.py \
+    --n-episodes 10 \
+    --policy-client-host 127.0.0.1 \
+    --policy-client-port 5555 \
+    --max-episode-steps 300 \
+    --env-name simpler_env_widowx/widowx_spoon_on_towel \
+    --n-action-steps 4 \
+    --n-envs 5
+```
+Other supported tasks are:
+```
+simpler_env_google/google_robot_pick_object
+simpler_env_google/google_robot_move_near
+simpler_env_google/google_robot_open_drawer
+...
+simpler_env_widowx/widowx_spoon_on_towel
+simpler_env_widowx/widowx_carrot_on_plate
+simpler_env_widowx/widowx_stack_cube
+```
+You can replace the `--env-name` value with any of the corresponding tasks listed in the SimplerEnv fork that this repo pins at `external_dependencies/SimplerEnv` (see `.gitmodules`).

examples/SimplerEnv/bridge_modality.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+    "state": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "roll": {
+            "start": 3,
+            "end": 4
+        },
+        "pitch": {
+            "start": 4,
+            "end": 5
+        },
+        "yaw": {
+            "start": 5,
+            "end": 6
+        },
+        "pad": {
+            "start": 6,
+            "end": 7
+        },
+        "gripper": {
+            "start": 7,
+            "end": 8
+        }
+    },
+    "action": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "roll": {
+            "start": 3,
+            "end": 4
+        },
+        "pitch": {
+            "start": 4,
+            "end": 5
+        },
+        "yaw": {
+            "start": 5,
+            "end": 6
+        },
+        "gripper": {
+            "start": 6,
+            "end": 7
+        }
+    },
+    "video": {
+        "image_0": {
+            "original_key": "observation.images.image_0"
+        }
+    },
+    "annotation": {
+        "human.action.task_description": {
+            "original_key": "task_index"
+        },
+        "human.validity": {}
+    }
+}

examples/SimplerEnv/convert_av1_to_h264.py ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from concurrent.futures import ThreadPoolExecutor
+import os
+from pathlib import Path
+import subprocess
+VIDEO_EXTS = {".mp4", ".mov", ".mkv"}
+def run(cmd):
+    return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+def is_av1(path: Path) -> bool:
+    cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-select_streams",
+        "v:0",
+        "-show_entries",
+        "stream=codec_name",
+        "-of",
+        "default=nw=1:nk=1",
+        str(path),
+    ]
+    proc = run(cmd)
+    if proc.returncode != 0:
+        print(f"[ffprobe FAIL] {path}: {proc.stderr.strip()}")
+        return False
+    codec = proc.stdout.strip()
+    return codec in ("av01", "av1")
+def convert_file(path: Path):
+    if not is_av1(path):
+        print(f"[SKIP] {path}")
+        return
+    tmp = path.with_suffix(path.suffix + ".mp4")
+    print(f"[CONVERT] {path} -> {tmp}")
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(path),
+        "-c:v",
+        "libx264",
+        "-qp",
+        "0",
+        "-pix_fmt",
+        "yuv420p",
+        "-c:a",
+        "copy",
+        "-vsync",
+        "passthrough",
+        "-copyts",
+        "-muxdelay",
+        "0",
+        "-muxpreload",
+        "0",
+        str(tmp),
+    ]
+    proc = run(cmd)
+    if proc.returncode != 0:
+        print(f"[ffmpeg FAIL] {path}: {proc.stderr.strip()}")
+        if tmp.exists():
+            tmp.unlink()
+        return
+    tmp.replace(path)
+    print(f"[DONE] {path}")
+def find_videos(root: Path):
+    for dirpath, _, filenames in os.walk(root):
+        for name in filenames:
+            p = Path(dirpath) / name
+            if p.suffix.lower() in VIDEO_EXTS:
+                yield p
+def main():
+    ap = argparse.ArgumentParser(
+        description="Recursively convert AV1 videos to H.264 (lossless-ish) in place."
+    )
+    ap.add_argument("root", nargs="?", default=".", help="Root directory (default: .)")
+    ap.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        default=os.cpu_count() or 4,
+        help="Number of parallel workers (default: CPU count)",
+    )
+    args = ap.parse_args()
+    root = Path(args.root).resolve()
+    files = list(find_videos(root))
+    print(f"Scanning {root}, found {len(files)} candidate video files")
+    if not files:
+        return
+    with ThreadPoolExecutor(max_workers=args.jobs) as ex:
+        for p in files:
+            ex.submit(convert_file, p)
+if __name__ == "__main__":
+    main()

examples/SimplerEnv/fractal_modality.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+    "state": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "rx": {
+            "start": 3,
+            "end": 4
+        },
+        "ry": {
+            "start": 4,
+            "end": 5
+        },
+        "rz": {
+            "start": 5,
+            "end": 6
+        },
+        "rw": {
+            "start": 6,
+            "end": 7
+        },
+        "gripper": {
+            "start": 7,
+            "end": 8
+        }
+    },
+    "action": {
+        "x": {
+            "start": 0,
+            "end": 1
+        },
+        "y": {
+            "start": 1,
+            "end": 2
+        },
+        "z": {
+            "start": 2,
+            "end": 3
+        },
+        "roll": {
+            "start": 3,
+            "end": 4
+        },
+        "pitch": {
+            "start": 4,
+            "end": 5
+        },
+        "yaw": {
+            "start": 5,
+            "end": 6
+        },
+        "gripper": {
+            "start": 6,
+            "end": 7
+        }
+    },
+    "video": {
+        "image": {
+            "original_key": "observation.images.image"
+        }
+    },
+    "annotation": {
+        "human.action.task_description": {
+            "original_key": "task_index"
+        },
+        "human.validity": {}
+    }
+}

examples/finetune.sh ADDED Viewed

	@@ -0,0 +1,158 @@

+#!/usr/bin/env bash
+# Shared finetune launcher: turns a few CLI options plus environment overrides
+# into a gr00t/experiment/launch_finetune.py invocation.
+# -x traces commands, -e aborts on errors, -u rejects unset variables,
+# pipefail makes a pipeline fail when any stage fails.
+set -x -euo pipefail
+# Tunables read from the environment; each falls back to the default shown
+# when the variable is unset or empty.
+NUM_GPUS="${NUM_GPUS:-1}"
+MASTER_PORT="${MASTER_PORT:-29500}"
+SAVE_STEPS="${SAVE_STEPS:-1000}"
+MAX_STEPS="${MAX_STEPS:-10000}"
+# W&B logging is requested only when this is exactly "1".
+USE_WANDB="${USE_WANDB:-1}"
+DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-4}"
+GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-32}"
+SHARD_SIZE="${SHARD_SIZE:-1024}"
+NUM_SHARDS_PER_EPOCH="${NUM_SHARDS_PER_EPOCH:-100000}"
+EPISODE_SAMPLING_RATE="${EPISODE_SAMPLING_RATE:-0.1}"
+# Values filled in by the command-line parser below; empty means "not given".
+BASE_MODEL_PATH=""
+DATASET_PATH=""
+MODALITY_CONFIG_PATH=""
+EMBODIMENT_TAG=""
+OUTPUT_DIR=""
+EXPERIMENT_NAME=""
+WANDB_PROJECT=""
+STATE_DROPOUT_PROB=""
+# Arguments after a literal "--" are forwarded verbatim to launch_finetune.py.
+EXTRA_ARGS=()
+usage() {
+    cat <<'EOF'
+Usage: bash examples/finetune.sh \
+  --base-model-path <path> \
+  --dataset-path <path> \
+  --embodiment-tag <tag> \
+  --output-dir <path> \
+  [--modality-config-path <path>] \
+  [--state-dropout-prob <value>] \
+  [--save-only-model] \
+  [-- <extra launch_finetune.py args>...]
+EOF
+}
+# Parse command-line options. Everything after a literal "--" is collected in
+# EXTRA_ARGS and forwarded untouched; any unknown option aborts with the usage text.
+while [ "$#" -gt 0 ]; do
+    # Options that consume a value: fail with a readable message when the value is
+    # absent, instead of tripping `set -u` on "$2" ("unbound variable").
+    case "$1" in
+        --base-model-path|--dataset-path|--modality-config-path|--embodiment-tag|--output-dir|--experiment-name|--wandb-project|--state-dropout-prob)
+            if [ "$#" -lt 2 ]; then
+                echo "Missing value for argument: $1" >&2
+                usage >&2
+                exit 1
+            fi
+            ;;
+    esac
+    case "$1" in
+        --base-model-path)
+            BASE_MODEL_PATH="$2"
+            shift 2
+            ;;
+        --dataset-path)
+            DATASET_PATH="$2"
+            shift 2
+            ;;
+        --modality-config-path)
+            MODALITY_CONFIG_PATH="$2"
+            shift 2
+            ;;
+        --embodiment-tag)
+            EMBODIMENT_TAG="$2"
+            shift 2
+            ;;
+        --output-dir)
+            OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --experiment-name)
+            EXPERIMENT_NAME="$2"
+            shift 2
+            ;;
+        --wandb-project)
+            WANDB_PROJECT="$2"
+            shift 2
+            ;;
+        --state-dropout-prob)
+            STATE_DROPOUT_PROB="$2"
+            shift 2
+            ;;
+        --save-only-model)
+            # Boolean switch: takes no value.
+            SAVE_ONLY_MODEL=1
+            shift
+            ;;
+        --help|-h)
+            usage
+            exit 0
+            ;;
+        --)
+            shift
+            EXTRA_ARGS=("$@")
+            break
+            ;;
+        *)
+            echo "Unknown argument: $1" >&2
+            usage >&2
+            exit 1
+            ;;
+    esac
+done
+# Abort early when any mandatory option was not supplied.
+for required_var in BASE_MODEL_PATH DATASET_PATH EMBODIMENT_TAG OUTPUT_DIR; do
+    # ${!required_var} is indirect expansion: the value of the variable named by it.
+    if [[ -n "${!required_var}" ]]; then
+        continue
+    fi
+    echo "Missing required argument: ${required_var}" >&2
+    usage >&2
+    exit 1
+done
+# --use_wandb is passed to the launcher only when USE_WANDB is exactly "1".
+WANDB_FLAG=()
+if [ "$USE_WANDB" = "1" ]; then
+    WANDB_FLAG+=(--use_wandb)
+fi
+# Base launch_finetune.py invocation; optional flags are appended further below.
+LAUNCH_CMD=(
+    gr00t/experiment/launch_finetune.py
+    --base_model_path "$BASE_MODEL_PATH"
+    --dataset_path "$DATASET_PATH"
+    --embodiment_tag "$EMBODIMENT_TAG"
+    --num_gpus "$NUM_GPUS"
+    --output_dir "$OUTPUT_DIR"
+    --save_steps "$SAVE_STEPS"
+    --save_total_limit 5
+    --max_steps "$MAX_STEPS"
+    --warmup_ratio 0.05
+    --weight_decay 1e-5
+    --learning_rate 1e-4
+    # The ${arr[@]+...} form expands to nothing for an empty array. A bare
+    # "${WANDB_FLAG[@]}" aborts with "unbound variable" under `set -u` on bash
+    # older than 4.4 when USE_WANDB is not 1.
+    ${WANDB_FLAG[@]+"${WANDB_FLAG[@]}"}
+    --global_batch_size "$GLOBAL_BATCH_SIZE"
+    --color_jitter_params brightness 0.3 contrast 0.4 saturation 0.5 hue 0.08
+    --dataloader_num_workers "$DATALOADER_NUM_WORKERS"
+    --shard_size "$SHARD_SIZE"
+    --num_shards_per_epoch "$NUM_SHARDS_PER_EPOCH"
+    --episode_sampling_rate "$EPISODE_SAMPLING_RATE"
+)
+# Optional flags: each one is appended only when its value was provided.
+if [[ -n "$MODALITY_CONFIG_PATH" ]]; then
+    LAUNCH_CMD+=(--modality_config_path "$MODALITY_CONFIG_PATH")
+fi
+if [[ -n "$EXPERIMENT_NAME" ]]; then
+    LAUNCH_CMD+=(--experiment_name "$EXPERIMENT_NAME")
+fi
+if [[ -n "$WANDB_PROJECT" ]]; then
+    LAUNCH_CMD+=(--wandb_project "$WANDB_PROJECT")
+fi
+if [[ -n "$STATE_DROPOUT_PROB" ]]; then
+    LAUNCH_CMD+=(--state_dropout_prob "$STATE_DROPOUT_PROB")
+fi
+# SAVE_ONLY_MODEL is assigned only by --save-only-model (or inherited from the
+# environment), hence the ":-" guard against `set -u`.
+if [[ -n "${SAVE_ONLY_MODEL:-}" ]]; then
+    LAUNCH_CMD+=(--save_only_model)
+fi
+# Pass-through arguments collected after "--" go last.
+if (( ${#EXTRA_ARGS[@]} > 0 )); then
+    LAUNCH_CMD+=("${EXTRA_ARGS[@]}")
+fi
+# Multi-GPU runs go through torchrun; exec replaces this shell with the launcher.
+if [ "$NUM_GPUS" != "1" ]; then
+    exec torchrun --nproc_per_node="$NUM_GPUS" --master_port="$MASTER_PORT" "${LAUNCH_CMD[@]}"
+fi
+# Single-GPU run: restrict visibility to one device so HF Trainer doesn't wrap the
+# model in DataParallel, which crashes with a StopIteration error in the model's
+# device property. A CUDA_VISIBLE_DEVICES set by the caller is kept as is.
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+exec python "${LAUNCH_CMD[@]}"

examples/mask-guided-background-suppression/README.md ADDED Viewed

	@@ -0,0 +1,203 @@

+# Mask-Guided Background Suppression
+Mask-guided augmentations leverage per-frame segmentation masks to apply targeted image transformations during training. This enables **domain randomization** on specific regions (e.g., replacing backgrounds with noise, tinting foreground objects) without affecting the rest of the image.
+This feature is controlled via the `--extra_augmentation_config` argument, which accepts a JSON string specifying which mask regions to augment and how.
+---
+## Prerequisites
+1. **Segmentation masks** must be pre-generated and stored alongside your dataset. The dataset's `info.json` must include a `mask_path` template, and `modality.json` must define a `"mask"` section mapping camera views.
+2. **Albumentations transforms** are enabled by default in N1.7 (`use_albumentations_transforms=True` in model config). No extra flag is needed.
+---
+## Supported Augmentation Types
+### 1. Background Noise Transform
+Replaces pixels in specified mask regions with **random RGB noise**. Useful for sim-to-real transfer or preventing the model from overfitting to static backgrounds.
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `target_mask_values` | `list[int]` | Mask label values to replace with noise (e.g., `[0]` for background) |
+| `p` | `float` | Probability of applying the transform per frame (0.0 to 1.0) |
+### 2. Masked Region Color Transform
+Applies a **random color tint** to pixels in specified mask regions. Useful for augmenting the appearance of specific objects (e.g., tables, tools) to improve color generalization.
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `target_mask_values` | `list[int]` | Mask label values to apply the tint to (e.g., `[4]`, `[5]`) |
+| `p` | `float` | Probability of applying the transform per frame (0.0 to 1.0) |
+| `alpha_range` | `[min, max]` | Range for blending intensity between original and tint color (default: `[0.3, 1.0]`) |
+---
+## Configuration Format
+The `--extra_augmentation_config` argument takes a JSON string with two optional keys:
+```json
+{
+    "background_noise_transforms": [
+        {"target_mask_values": [0], "p": 0.9}
+    ],
+    "masked_region_transforms": [
+        {"target_mask_values": [4], "p": 1.0, "alpha_range": [0.0, 1.0]}
+    ]
+}
+```
+Multiple transforms of each type can be specified (e.g., different mask values with different probabilities).
+---
+## Quick Start with Demo Data
+The included demo dataset `demo_data/cube_to_bowl_5_with_mask` contains a single episode with front and wrist camera views, along with pre-generated segmentation masks. The masks were generated using [SAM 3](https://github.com/facebookresearch/sam3) with the text prompt `"background"`, then converted so that background pixels = `0` and foreground pixels = `1` (see [Generating mask files](#generating-mask-files) below).
+### 1. Background noise only
+Replace background (mask=0) with random noise:
+```bash
+uv run python test_extra_augmentation.py \
+    --dataset_path ../../demo_data/cube_to_bowl_5_with_mask \
+    --embodiment_tag NEW_EMBODIMENT \
+    --modality_config_path so101_config.py \
+    --extra_augmentation_config '{"background_noise_transforms": [{"target_mask_values": [0], "p": 1.0}]}'
+```
+### 2. Background noise + foreground color tint
+Apply both transforms together:
+```bash
+uv run python test_extra_augmentation.py \
+    --dataset_path ../../demo_data/cube_to_bowl_5_with_mask \
+    --embodiment_tag NEW_EMBODIMENT \
+    --modality_config_path so101_config.py \
+    --extra_augmentation_config '{"background_noise_transforms": [{"target_mask_values": [0], "p": 1.0}], "masked_region_transforms": [{"target_mask_values": [1], "p": 1.0, "alpha_range": [0.3, 1.0]}]}' \
+    --output_dir /tmp/augmentation_vis --num_frames 5
+```
+Both commands save side-by-side comparison images (**Original | Augmented | Mask**) under `output_dir/<view_name>/`, with frames sampled evenly across the episode.
+### 3. Fine-tune with mask-guided augmentation
+```bash
+export NUM_GPUS=8
+torchrun --nproc_per_node=$NUM_GPUS --master_port=29500 \
+    gr00t/experiment/launch_finetune.py \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path <YOUR_DATASET_WITH_MASKS> \
+    --embodiment-tag <YOUR_EMBODIMENT_TAG> \
+    --num-gpus $NUM_GPUS \
+    --output-dir /tmp/mask_augmentation_run \
+    --save-steps 1000 \
+    --save-total-limit 5 \
+    --max-steps 20000 \
+    --warmup-ratio 0.05 \
+    --weight-decay 1e-5 \
+    --learning-rate 1e-4 \
+    --use-wandb \
+    --global-batch-size 640 \
+    --dataloader-num-workers 4 \
+    --extra-augmentation-config '{"background_noise_transforms": [{"target_mask_values": [0], "p": 0.9}], "masked_region_transforms": [{"target_mask_values": [4], "p": 1.0, "alpha_range": [0, 1]}]}'
+```
+---
+## Dataset Setup
+To use mask-guided augmentation with your own dataset, ensure:
+1. **Mask files** are stored as `.npz` files under a `masks/` directory, following the same chunk/episode structure as videos. Each `.npz` contains a single `uint8` array of shape `(num_frames, H, W)` where each pixel holds an integer semantic label (e.g., `0` = background, `1` = object A, `2` = object B).
+   ```
+   masks/
+   └── chunk-000/
+       └── observation.images.front/
+           └── episode_000000_masks.npz
+   ```
+   See [Generating mask files](#generating-mask-files) below for how to produce these files.
+2. **`info.json`** includes a `mask_path` template:
+   ```json
+   {
+       "mask_path": "masks/chunk-{episode_chunk:03d}/{mask_key}/episode_{episode_index:06d}_masks.npz"
+   }
+   ```
+3. **`modality.json`** includes a `"mask"` section mapping view names to their original keys. The keys should match the actual camera view names in your dataset:
+   ```json
+   {
+       "mask": {
+           "<view_name>": {
+               "original_key": "<observation.images.xxx>"
+           }
+       }
+   }
+   ```
+   For example, if your dataset has `front` and `wrist` cameras:
+   ```json
+   {
+       "mask": {
+           "front": {
+               "original_key": "observation.images.front"
+           },
+           "wrist": {
+               "original_key": "observation.images.wrist"
+           }
+       }
+   }
+   ```
+---
+## Generating Mask Files
+You can generate mask files using any video segmentation model that produces per-pixel labels. The demo masks in this example were created with [SAM 3](https://github.com/facebookresearch/sam3) (see the [video predictor example](https://github.com/facebookresearch/sam3/blob/main/examples/sam3_video_predictor_example.ipynb) for SAM 3 usage). The workflow was:
+1. Run SAM 3 on each episode video with a text prompt such as `"background"`. SAM 3 returns per-frame binary masks via `propagate_in_video`.
+2. Convert the binary masks into the label format expected by this pipeline (`0` = background, non-zero = foreground categories) and save as `.npz`:
+```python
+import numpy as np
+# sam3_binary_masks: (num_frames, H, W) bool array from SAM 3 (True where prompt matched)
+# For a "background" prompt, invert so that background=0 and foreground=1:
+label_masks = (~sam3_binary_masks).astype(np.uint8)
+# For multiple prompts, merge into one label array instead:
+# label_masks = np.zeros((num_frames, H, W), dtype=np.uint8)
+# label_masks[prompt_0_masks] = 1
+# label_masks[prompt_1_masks] = 2
+np.savez_compressed("episode_000000_masks.npz", label_masks)
+```
+The pipeline loads the array from the `.npz` file (it expects the key `arr_0`, which is the default for `np.savez_compressed`). A single `.npy` file containing the `(num_frames, H, W)` array also works.
+---
+## How It Works
+The augmentation pipeline applies mask-based transforms **per-frame** before the standard augmentations (crop, resize, color jitter, etc.):
+1. For each frame, the corresponding segmentation mask is loaded.
+2. `BackgroundNoiseTransform` replaces all pixels where `mask == target_value` with random RGB noise.
+3. `MaskedColorTransform` blends a random color into all pixels where `mask == target_value`, controlled by `alpha_range`.
+4. Standard augmentations (shared across views via replay) are then applied on top.
+This ordering ensures that mask-guided augmentations are applied independently per frame, while standard augmentations remain consistent across camera views within the same timestep.

examples/mask-guided-background-suppression/so101_config.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from gr00t.configs.data.embodiment_configs import register_modality_config
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.data.types import (
+    ActionConfig,
+    ActionFormat,
+    ActionRepresentation,
+    ActionType,
+    ModalityConfig,
+)
+so101_config = {
+    "video": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["front", "wrist"],
+    ),
+    "mask": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["front", "wrist"],
+    ),
+    "state": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["single_arm", "gripper"],
+    ),
+    "action": ModalityConfig(
+        delta_indices=list(range(16)),
+        modality_keys=["single_arm", "gripper"],
+        action_configs=[
+            ActionConfig(
+                rep=ActionRepresentation.RELATIVE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+            ActionConfig(
+                rep=ActionRepresentation.ABSOLUTE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+        ],
+    ),
+    "language": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["annotation.human.task_description"],
+    ),
+}
+register_modality_config(so101_config, embodiment_tag=EmbodimentTag.NEW_EMBODIMENT)

examples/mask-guided-background-suppression/test_extra_augmentation.py ADDED Viewed

	@@ -0,0 +1,198 @@

+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Smoke test: apply extra_augmentation_config to raw frames and save comparison images."""
+from __future__ import annotations
+import argparse
+import importlib
+import json
+import os
+from pathlib import Path
+import sys
+from gr00t.configs.data.embodiment_configs import MODALITY_CONFIGS
+from gr00t.data.dataset.lerobot_episode_loader import LeRobotEpisodeLoader
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.model.gr00t_n1d7.image_augmentations import (
+    apply_with_replay,
+    build_image_transformations_albumentations,
+)
+import numpy as np
+from PIL import Image
+def save_comparison(original, augmented, mask, output_path):
+    orig_arr = np.array(original)
+    aug_arr = augmented.transpose(1, 2, 0) if augmented.shape[0] == 3 else augmented
+    panels = [orig_arr, aug_arr]
+    if mask is not None:
+        mask_vis = np.where(mask[..., None] > 0, 255, 0).astype(np.uint8)
+        mask_vis = np.broadcast_to(mask_vis, (*mask.shape[:2], 3)).copy()
+        if mask_vis.shape[:2] != orig_arr.shape[:2]:
+            mask_vis = np.array(
+                Image.fromarray(mask_vis).resize(
+                    (orig_arr.shape[1], orig_arr.shape[0]), Image.NEAREST
+                )
+            )
+        panels.append(mask_vis)
+    h = panels[0].shape[0]
+    resized = []
+    for p in panels:
+        if p.shape[0] != h:
+            new_w = int(p.shape[1] * h / p.shape[0])
+            p = np.array(Image.fromarray(p).resize((new_w, h), Image.BILINEAR))
+        resized.append(p)
+    Image.fromarray(np.concatenate(resized, axis=1)).save(output_path)
+def main():
+    """Run the augmentation smoke test from the command line.
+
+    Stage 1 applies the transform built from ``--extra_augmentation_config`` to
+    a few evenly spaced frames of the first episode and writes one comparison
+    image per frame and view under ``--output_dir``. Stage 2 builds the
+    processor and dataset from the default config and pulls a single sample to
+    confirm the training data path runs with the same augmentation config.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset_path", required=True)
+    parser.add_argument("--embodiment_tag", required=True)
+    parser.add_argument("--modality_config_path", default=None)
+    parser.add_argument("--extra_augmentation_config", type=str, required=True)
+    parser.add_argument("--output_dir", type=str, default="/tmp/augmentation_vis")
+    parser.add_argument("--num_frames", type=int, default=5)
+    parser.add_argument("--video_backend", type=str, default="torchcodec")
+    args = parser.parse_args()
+    # Import the user-supplied modality config module by file stem; the import
+    # is done only for its side effects (presumably it registers itself into
+    # MODALITY_CONFIGS via register_modality_config -- confirm against the module).
+    if args.modality_config_path:
+        path = Path(args.modality_config_path)
+        sys.path.append(str(path.parent))
+        importlib.import_module(path.stem)
+    # --embodiment_tag is the enum member NAME; the registry is keyed by its value.
+    embodiment_tag = EmbodimentTag[args.embodiment_tag].value
+    modality_configs = MODALITY_CONFIGS[embodiment_tag]
+    extra_aug_config = json.loads(args.extra_augmentation_config)
+    # Only the first returned transform is used; the second is discarded.
+    # Rotation and color jitter are disabled here.
+    train_transform, _ = build_image_transformations_albumentations(
+        image_target_size=[224, 224],
+        image_crop_size=[224, 224],
+        random_rotation_angle=0,
+        color_jitter_params=None,
+        shortest_image_edge=512,
+        crop_fraction=0.95,
+        extra_augmentation_config=extra_aug_config,
+    )
+    loader = LeRobotEpisodeLoader(
+        dataset_path=args.dataset_path,
+        modality_configs=modality_configs,
+        video_backend=args.video_backend,
+    )
+    # Only the first episode is inspected; it is accessed below through
+    # ``.columns`` and ``.iloc`` with "video."/"mask."-prefixed column names.
+    episode_df = loader[0]
+    video_cols = [c for c in episode_df.columns if c.startswith("video.")]
+    mask_cols = [c for c in episode_df.columns if c.startswith("mask.")]
+    print(f"Video columns: {video_cols}")
+    print(f"Mask columns:  {mask_cols}")
+    # Sample at most --num_frames indices, evenly spaced over the episode.
+    num_frames = min(args.num_frames, len(episode_df))
+    frame_indices = np.linspace(0, len(episode_df) - 1, num_frames, dtype=int)
+    os.makedirs(args.output_dir, exist_ok=True)
+    for vcol in video_cols:
+        view_name = vcol.replace("video.", "")
+        # A view is paired with the mask column that shares its name, if any.
+        mcol = f"mask.{view_name}"
+        has_mask = mcol in mask_cols
+        # Dots in the view name are replaced so it forms a single directory name.
+        view_dir = os.path.join(args.output_dir, view_name.replace(".", "_"))
+        os.makedirs(view_dir, exist_ok=True)
+        for fidx in frame_indices:
+            orig_img = episode_df[vcol].iloc[fidx]
+            mask_arr = np.array(episode_df[mcol].iloc[fidx]) if has_mask else None
+            masks_list = [mask_arr] if mask_arr is not None else None
+            # One frame per call; the second return value is not used here.
+            transformed, _ = apply_with_replay(train_transform, [orig_img], masks_list)
+            aug_arr = transformed[0].numpy()
+            out_path = os.path.join(view_dir, f"frame_{fidx:04d}.png")
+            save_comparison(orig_img, aug_arr, mask_arr, out_path)
+            print(f"  Saved: {out_path}")
+    print(f"\nDone! {num_frames} frames x {len(video_cols)} views saved to {args.output_dir}")
+    print("\n" + "=" * 60)
+    print("Testing full training pipeline (processor + dataloader) ...")
+    print("=" * 60)
+    # Stage 2 imports are deferred until the visualization stage has finished.
+    from gr00t.configs.base_config import get_default_config
+    from gr00t.data.dataset.factory import DatasetFactory
+    from gr00t.model.gr00t_n1d7.processing_gr00t_n1d7 import Gr00tN1d7Processor
+    # Start from the default config and override only the data section so it
+    # points at the single dataset given on the command line.
+    config = get_default_config()
+    config = config.load_dict(
+        {
+            "data": {
+                "download_cache": False,
+                "video_backend": args.video_backend,
+                "datasets": [
+                    {
+                        "dataset_paths": [args.dataset_path],
+                        "mix_ratio": 1.0,
+                        "embodiment_tag": embodiment_tag,
+                    }
+                ],
+            }
+        }
+    )
+    # Force the albumentations path on and attach the same augmentation config
+    # that was visualized above.
+    config.model.extra_augmentation_config = extra_aug_config
+    config.model.use_albumentations_transforms = True
+    # The processor is constructed with statistics=None; all other settings are
+    # read from the (default) model config.
+    processor = Gr00tN1d7Processor(
+        modality_configs=config.data.modality_configs,
+        statistics=None,
+        image_crop_size=config.model.image_crop_size,
+        image_target_size=config.model.image_target_size,
+        random_rotation_angle=config.model.random_rotation_angle,
+        color_jitter_params=config.model.color_jitter_params,
+        model_name=config.model.model_name,
+        model_type=config.model.backbone_model_type,
+        formalize_language=config.model.formalize_language,
+        max_state_dim=config.model.max_state_dim,
+        max_action_dim=config.model.max_action_dim,
+        apply_sincos_state_encoding=config.model.apply_sincos_state_encoding,
+        max_action_horizon=config.model.action_horizon,
+        use_albumentations=config.model.use_albumentations_transforms,
+        extra_augmentation_config=config.model.extra_augmentation_config,
+        shortest_image_edge=config.model.shortest_image_edge,
+        crop_fraction=config.model.crop_fraction,
+        use_relative_action=config.model.use_relative_action,
+    )
+    # NOTE(review): train() presumably switches the processor to its training
+    # (augmenting) mode -- confirm in Gr00tN1d7Processor.
+    processor.train()
+    dataset_factory = DatasetFactory(config=config)
+    train_dataset, _ = dataset_factory.build(processor=processor)
+    # Pull exactly one sample; reaching the prints below without an exception
+    # is the pass criterion.
+    sample = next(iter(train_dataset))
+    print(f"Sample keys: {list(sample.keys())}")
+    print(f"VLM keys:    {list(sample['vlm_content'].keys())}")
+    for k, v in sample.items():
+        if hasattr(v, "shape"):
+            print(f"  {k}: shape={v.shape}, dtype={v.dtype}")
+    print("\nPipeline test PASSED!")
+if __name__ == "__main__":
+    main()

getting_started/data_config.md ADDED Viewed

	@@ -0,0 +1,331 @@

+# How to prepare your modality configuration
+## Overview
+The modality configuration defines how your robot's data should be loaded, processed, and interpreted by the model. This configuration bridges your dataset's physical structure (defined in `meta/modality.json`) and the model's data processing pipeline.
+Each embodiment requires a Python configuration file that specifies:
+- Which observations to use (video cameras, proprioceptive states)
+- How to sample data temporally (current frame, historical frames, future action horizons)
+- How actions should be interpreted and transformed
+- Which language annotations to use
+## Configuration Structure
+A modality configuration is a Python dictionary containing four top-level keys: `"video"`, `"state"`, `"action"`, and `"language"`. Each key maps to a `ModalityConfig` object.
+Here's the [SO-100 example](../examples/SO100/so100_config.py):
+```python
+from gr00t.configs.data.embodiment_configs import register_modality_config
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.data.types import ModalityConfig, ActionConfig, ActionRepresentation, ActionType, ActionFormat
+so100_config = {
+    "video": ModalityConfig(...),
+    "state": ModalityConfig(...),
+    "action": ModalityConfig(...),
+    "language": ModalityConfig(...),
+}
+register_modality_config(so100_config, embodiment_tag=EmbodimentTag.NEW_EMBODIMENT)
+```
+## Understanding `ModalityConfig`
+Each `ModalityConfig` specifies two required fields and several optional ones:
+### Required Fields
+**1. `delta_indices` (list[int])**
+Defines which temporal offsets to sample relative to the current timestep:
+- Current observation: Use [0] for the current timestep (recommended for video and state)
+- Future actions: Use positive indices (e.g., list(range(0, 16))) for action prediction horizons
+> **Note:** Negative indices (e.g., [-2, -1, 0]) are supported by the data loader for historical context, but no current N1.7 embodiment config uses them. Stick with [0] for video and state unless you have a specific reason to stack frames.
+Examples:
+```python
+# Single current frame for video
+delta_indices=[0]
+# 16-step action prediction horizon
+delta_indices=list(range(0, 16))
+```
+> **Note:** If you modify `delta_indices` for the action modality (e.g., changing the action horizon from 16 to 8), you **must** regenerate the dataset statistics by re-running `python gr00t/data/stats.py --dataset-path <dataset_path> --embodiment-tag <embodiment_tag>`. The normalization statistics (especially `meta/relative_stats.json`) are computed based on the original `delta_indices` length, and a mismatch will cause errors during training.
+<details>
+<summary>Example: What happens if you change <code>delta_indices</code> without regenerating stats?</summary>
+Suppose your action config originally uses a 16-step horizon:
+```python
+"action": ModalityConfig(
+    delta_indices=list(range(0, 16)),  # 16 steps
+    ...
+)
+```
+Running `python gr00t/data/stats.py` generates `meta/relative_stats.json` with per-step statistics of shape `(16, D)`, where `D` is the action dimension.
+If you later change the horizon to 8 steps:
+```python
+"action": ModalityConfig(
+    delta_indices=list(range(0, 8)),  # 8 steps
+    ...
+)
+```
+The training data will now have shape `(8, D)`, but the normalization parameters from `relative_stats.json` still have shape `(16, D)`. This dimension mismatch causes an `IndexError` during normalization:
+```
+IndexError: boolean index did not match indexed array along dimension 0;
+dimension is 8 but corresponding boolean dimension is 16
+```
+**Fix:** Re-run `python gr00t/data/stats.py --dataset-path <dataset_path> --embodiment-tag <embodiment_tag>` after changing `delta_indices` to regenerate matching statistics.
+</details>
+**2. `modality_keys` (list[str])**
+Specifies which keys to load from your dataset. These keys **must match** the keys defined in your `meta/modality.json` file.
+For the SO-100 example:
+- **Video keys**: Must match keys in `meta/modality.json` under `"video"` (e.g., `"front"`, `"wrist"`)
+- **State keys**: Must match keys in `meta/modality.json` under `"state"` (e.g., `"single_arm"`, `"gripper"`)
+- **Action keys**: Must match keys in `meta/modality.json` under `"action"` (e.g., `"single_arm"`, `"gripper"`)
+- **Language keys**: Must match keys in `meta/modality.json` under `"annotation"` (e.g., `"annotation.human.task_description"` for SO-100)
+### Optional Fields
+**3. `sin_cos_embedding_keys` (list[str] | None)**
+Specifies which state keys should use sine/cosine encoding. Best for dimensions that are in radians (e.g., joint angles). If not specified, min-max normalization is used. Note that this doubles the number of dimensions of each encoded key, and is only recommended for proprioceptive states.
+```python
+"state": ModalityConfig(
+    delta_indices=[0],
+    modality_keys=["single_arm", "gripper"],
+    sin_cos_embedding_keys=["single_arm"],  # Apply sin/cos to joint angles
+)
+```
+**4. `mean_std_embedding_keys` (list[str] | None)**
+Specifies which keys should use mean/standard deviation normalization instead of min-max normalization.
+**5. `action_configs` (list[ActionConfig] | None)**
+Required for the `"action"` modality. Defines how each action modality should be interpreted and transformed. The list must have the **same length and same order** as `modality_keys` — `action_configs[0]` applies to `modality_keys[0]`, `action_configs[1]` to `modality_keys[1]`, etc. A mismatch in ordering will silently apply the wrong representation (e.g., RELATIVE to a gripper that should be ABSOLUTE). See more details in the [Action Modality](#understanding-actionconfig) section.
+## Configuring Each Modality
+### Video Modality
+Defines which camera views to use:
+```python
+"video": ModalityConfig(
+    delta_indices=[0],  # Current frame only
+    modality_keys=[
+        "front",  # Must match a key in meta/modality.json under "video"
+    ],
+)
+```
+For multiple cameras:
+```python
+"video": ModalityConfig(
+    delta_indices=[0],
+    modality_keys=["front", "wrist"],
+)
+```
+### State Modality
+Defines proprioceptive observations (joint positions, gripper states, etc.):
+```python
+"state": ModalityConfig(
+    delta_indices=[0],  # Current state
+    modality_keys=[
+        "single_arm",      # Must match keys in meta/modality.json under "state"
+        "gripper",
+    ],
+)
+```
+### Action Modality
+Defines the action space and prediction horizon:
+```python
+"action": ModalityConfig(
+    delta_indices=list(range(0, 16)),  # Predict 16 steps into the future
+    modality_keys=[
+        "single_arm",      # Must match keys in meta/modality.json under "action"
+        "gripper",
+    ],
+    action_configs=[
+        # One ActionConfig per modality_key
+        # single_arm
+        ActionConfig(
+            rep=ActionRepresentation.RELATIVE,  # relative control of the single arm
+            type=ActionType.NON_EEF,
+            format=ActionFormat.DEFAULT,
+        ),
+        # gripper
+        ActionConfig(
+            rep=ActionRepresentation.ABSOLUTE,  # absolute control of the gripper
+            type=ActionType.NON_EEF,
+            format=ActionFormat.DEFAULT,
+        ),
+    ],
+)
+```
+#### Understanding `ActionConfig`
+Each `ActionConfig` has three required fields and one optional field:
+**1. `rep` (ActionRepresentation)**
+Defines how actions should be interpreted:
+- `RELATIVE`: Actions are deltas from the current state (introduced in the UMI paper)
+- `ABSOLUTE`: Actions are target positions
+Using relative actions will lead to smoother actions, but might suffer from drifting. If you want to use relative actions, please make sure the state and action stored in the dataset are absolute, and the absolute to relative will be handled in the processor.
+**2. `type` (ActionType)**
+Specifies the control space:
+- `EEF`: End-effector/Cartesian space control (Expecting a 9-dimensional vector: x, y, z positions + rotation 6D)
+- `NON_EEF`: Joint space control and other non-EEF control spaces (joint angles, positions, gripper positions, etc.)
+**3. `format` (ActionFormat)**
+Defines the action representation format:
+- `DEFAULT`: Standard format (e.g., joint angles, gripper positions)
+- `XYZ_ROT6D`: 3D position + 6D rotation representation for end-effector control
+- `XYZ_ROTVEC`: 3D position + rotation vector for end-effector control
+**4. `state_key` (str | None)**
+Optional. Specifies the corresponding reference state key for computing relative actions when `rep=RELATIVE`. If not provided, the system will use the action key as the reference state key.
+Example with `state_key`:
+```python
+"joint_pos_action_left": ActionConfig(
+    rep=ActionRepresentation.RELATIVE,
+    type=ActionType.NON_EEF,
+    format=ActionFormat.DEFAULT,
+    state_key="joint_pos_obs_left",  # Use this state to compute relative action
+)
+```
+### Language Modality
+Defines which language annotations to use:
+```python
+"language": ModalityConfig(
+    delta_indices=[0],
+    modality_keys=["annotation.human.task_description"],  # Must match annotation keys in meta/modality.json
+)
+```
+## Complete Example: SO-100
+Here's the complete SO-100 configuration with explanations:
+```python
+so100_config = {
+    "video": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["front", "wrist"],
+    ),
+    "state": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=[
+            "single_arm",
+            "gripper",
+        ],
+    ),
+    "action": ModalityConfig(
+        delta_indices=list(range(0, 16)),
+        modality_keys=[
+            "single_arm",
+            "gripper",
+        ],
+        action_configs=[
+            ActionConfig(
+                rep=ActionRepresentation.RELATIVE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+            ActionConfig(
+                rep=ActionRepresentation.ABSOLUTE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+        ],
+    ),
+    "language": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["annotation.human.task_description"],
+    ),
+}
+```
+## Key Relationships with `meta/modality.json`
+The modality configuration's `modality_keys` must reference keys that exist in your dataset's `meta/modality.json`:
+**Example `meta/modality.json`:**
+```json
+{
+    "state": {
+        "single_arm": {"start": 0, "end": 5},
+        "gripper": {"start": 5, "end": 6}
+    },
+    "action": {
+        "single_arm": {"start": 0, "end": 5},
+        "gripper": {"start": 5, "end": 6}
+    },
+    "video": {
+        "front": {"original_key": "observation.images.front"},
+        "wrist": {"original_key": "observation.images.wrist"}
+    },
+    "annotation": {
+        "human.task_description": {
+            "original_key": "task_index"
+        }
+    }
+}
+```
+The system will:
+1. Use `modality_keys` to look up the corresponding entries in `meta/modality.json`
+2. Extract the correct slices from the concatenated state/action arrays
+3. Apply the specified transformations (normalization, action representation conversion)
+## Registering Your Configuration
+After defining your configuration, register it so it's available to the training and inference pipelines:
+```python
+from gr00t.configs.data.embodiment_configs import register_modality_config
+from gr00t.data.embodiment_tags import EmbodimentTag
+your_modality_config = {
+    ...
+}
+register_modality_config(your_modality_config, embodiment_tag=EmbodimentTag.NEW_EMBODIMENT)
+```
+Save your configuration to a Python file and pass the path to the `modality_config_path` argument when running the finetuning script.

getting_started/data_preparation.md ADDED Viewed

	@@ -0,0 +1,164 @@

+# Robot Data Preparation Guide
+## Overview
+This guide shows how to convert your robot data to work with our flavor of the [LeRobot dataset V2 format](https://github.com/huggingface/lerobot?tab=readme-ov-file#the-lerobotdataset-format) ([LeRobot docs](https://huggingface.co/docs/lerobot)) -- `GR00T LeRobot`. While we have added additional structure, our schema maintains full compatibility with the upstream LeRobot v2. The additional metadata and structure allow for more detailed specification and language annotations for your robot data.
+> The TLDR: Add a `meta/modality.json` file to your LeRobot v2 dataset and follow the schema below.
+## LeRobot v2 Requirements
+If you already have a dataset in the LeRobot v2 format, you can skip this section.
+If you have a dataset in the LeRobot v3.0 format, please use [this script](../scripts/lerobot_conversion/convert_v3_to_v2.py) to convert it to the LeRobot v2 format.
+> **Why LeRobot v2?** GR00T currently uses the LeRobot v2 data format because many upstream datasets (DROID, LIBERO, Bridge, etc.) are published in v2. We plan to support both v2 and v3 formats natively in a future release. For now, please convert v3 datasets to v2 using the script above.
+If you have a dataset in another format, please convert it to the LeRobot v2 format satisfying the following requirements.
+### Structure Requirements
+The folder should follow a similar structure as below and contain these core folders and files:
+```
+.
+├─meta
+│ ├─episodes.jsonl
+│ ├─modality.json # -> GR00T LeRobot specific
+│ ├─info.json
+│ └─tasks.jsonl
+├─videos
+│ └─chunk-000
+│   └─observation.images.ego_view
+│     ├─episode_000001.mp4
+│     └─episode_000000.mp4
+└─data
+  └─chunk-000
+    ├─episode_000001.parquet
+    └─episode_000000.parquet
+```
+### Video Observations (videos/chunk-*)
+The videos folder will contain the mp4 files associated with each episode following episode_00000X.mp4 naming format where X indicates the episode number.
+**Requirements**:
+- Must be stored as MP4 files.
+- Should be named using the format: `observation.images.<video_name>`
+### Data (data/chunk-*)
+The data folder will contain all of the parquet files associated with each episode following episode_00000X.parquet naming format where X indicates the episode number.
+Each parquet file will contain:
+- State information: stored as observation.state which is a 1D concatenated array of all state modalities.
+- Action: stored as action which is a 1D concatenated array of all action modalities.
+- Timestamp: stored as timestamp, a floating-point number giving the time of the observation in seconds (see the example below).
+- Annotations: stored as annotation.<annotation_source>.<annotation_type>(.<annotation_name>) (see the annotation field in the example configuration for example naming). No other columns should have the annotation prefix; see the [Multiple Annotation Support](#multiple-annotation-support) section if you are interested in adding multiple annotations.
+#### Example Parquet File
+Here is a sample of the `cube_to_bowl` dataset that is present in the [demo_data](../demo_data/cube_to_bowl_5/) directory.
+```
+{
+    "observation.state":[-0.01,...,0],       // 1D array: all state modalities concatenated per modality.json order
+    "action":[-0.010,...,0],                 // 1D array: all action modalities concatenated per modality.json order
+    "timestamp":0.049,                       // float: wall-clock time of this observation (seconds)
+    "annotation.human.action.task_description":0,  // int: index into meta/tasks.jsonl for the language instruction
+    "task_index":0,                          // int: task identifier (same as annotation index for single-task)
+    "annotation.human.validity":1,           // int: index into meta/tasks.jsonl for validity label
+    "episode_index":0,                       // int: which episode this frame belongs to
+    "index":0,                               // int: global frame index across all episodes in the dataset
+    "next.reward":0,                         // float: reward at the next timestep (0 if unused)
+    "next.done":false                        // bool: true if this is the last frame of the episode
+}
+```
+### Meta
+- `episodes.jsonl` contains a list of all the episodes in the entire dataset. Each episode contains a list of tasks and the length of the episode.
+- `tasks.jsonl` contains a list of all the tasks in the entire dataset.
+- `info.json` contains the dataset information.
+#### meta/tasks.jsonl
+Here is a sample of the `meta/tasks.jsonl` file that contains the task descriptions.
+```
+{"task_index": 0, "task": "pick the squash from the counter and place it in the plate"}
+{"task_index": 1, "task": "valid"}
+```
+You can use the task index stored in the parquet file to look up the task description. In this case, the `annotation.human.action.task_description` for the first observation is "pick the squash from the counter and place it in the plate" and `annotation.human.validity` is "valid".
+`tasks.jsonl` contains a list of all the tasks in the entire dataset.
+#### meta/episodes.jsonl
+Here is a sample of the `meta/episodes.jsonl` file that contains the episode information.
+```
+{"episode_index": 0, "tasks": [...], "length": 416}
+{"episode_index": 1, "tasks": [...], "length": 470}
+```
+`episodes.jsonl` contains a list of all the episodes in the entire dataset. Each episode contains a list of tasks and the length of the episode.
+## GR00T LeRobot Specific Requirements
+### The `meta/modality.json` Configuration
+We require an additional metadata file `meta/modality.json` that is not present in the standard LeRobot format. This file provides detailed metadata about state and action modalities, enabling:
+- **Separate Data Storage and Interpretation:**
+  - **State and Action:** Stored as concatenated float32 arrays. The `modality.json` file supplies the metadata necessary to interpret these arrays as distinct, fine-grained fields.
+  - **Video:** Stored as separate files, with the configuration file allowing them to be renamed to a standardized format.
+  - **Annotations:** Keeps track of all annotation fields. If there are no annotations, do not include the `annotation` field in the configuration file.
+- **Fine-Grained Splitting:** Divides the state and action arrays into more semantically meaningful fields.
+- **Clear Mapping:** Explicit mapping of data dimensions.
+- **Sophisticated Data Transformations:** Supports field-specific normalization and rotation transformations during training.
+#### Schema
+```json
+{
+    "state": {
+        "<state_key>": {
+            "start": <int>,         // Starting index in the state array
+            "end": <int>            // Ending index in the state array
+        }
+    },
+    "action": {
+        "<action_key>": {
+            "start": <int>,         // Starting index in the action array
+            "end": <int>            // Ending index in the action array
+        }
+    },
+    "video": {
+        "<new_key>": {
+            "original_key": "<original_video_key>"
+        }
+    },
+    "annotation": {
+        "<annotation_key>": {}  // Empty dictionary to maintain consistency with other modalities
+    }
+}
+```
+#### Example
+For a concrete example of `modality.json` and the full dataset structure, see the publicly available datasets on HuggingFace:
+[nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim](https://huggingface.co/datasets/nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim/tree/main).
+You can also find a working example in the included demo data at [`demo_data/cube_to_bowl_5/meta/modality.json`](../demo_data/cube_to_bowl_5/meta/modality.json).
+#### Notes
+- All indices are zero-based and follow Python's array slicing convention (`[start:end]`).
+## GR00T LeRobot Extensions to Standard LeRobot
+GR00T LeRobot is a flavor of the standard LeRobot format with more opinionated requirements:
+- We will compute `meta/stats.json` and `meta/relative_stats.json` for each dataset, and store them in the `meta` folder.
+- Proprioceptive states must always be included in the "observation.state" keys.
+- We support multi-channel annotation formats (e.g., coarse-grained, fine-grained), allowing users to add as many annotation channels as needed via the `annotation.<annotation_source>.<annotation_type>` key.
+- We require an additional metadata file `meta/modality.json` that is not present in the standard LeRobot format.
+### Multiple Annotation Support
+To support multiple annotations within a single parquet file, users may add extra columns to the parquet file. Users should treat these columns the same way as the `task_index` column in the original LeRobot v2 dataset:
+In LeRobot v2, actual language descriptions are stored in a row of the `meta/tasks.jsonl` file, while the parquet file stores only the corresponding index in the `task_index` column. We follow the same convention and store the corresponding index for each annotation in the `annotation.<annotation_source>.<annotation_type>` column. Although the `task_index` column may still be used for the default annotation, a dedicated column `annotation.<annotation_source>.<annotation_type>` is required to ensure it is loadable by our custom data loader.

getting_started/finetune_new_embodiment.md ADDED Viewed

	@@ -0,0 +1,153 @@

+# Fine-tune on Custom Embodiments ("NEW_EMBODIMENT")
+This guide demonstrates how to finetune GR00T on your own robot data and configuration. We provide a complete example for the Huggingface [SO-100](https://github.com/TheRobotStudio/SO-ARM100) robot under `examples/SO100`, which uses `demo_data/cube_to_bowl_5` as the demo dataset.
+## Step 1: Prepare Your Data
+Prepare your data in **GR00T-flavored LeRobot v2 format** by following the [data preparation guide](data_preparation.md).
+## Step 2: Prepare Your Modality Configuration
+Define your own modality configuration by following the [modality config guide](data_config.md). Below is an example configuration that corresponds to the demo data:
+```python
+from gr00t.configs.data.embodiment_configs import register_modality_config
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.data.types import (
+    ActionConfig,
+    ActionFormat,
+    ActionRepresentation,
+    ActionType,
+    ModalityConfig,
+)
+so100_config = {
+    # Video: use current frame only ([0]); list camera view names matching modality.json
+    "video": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=[
+            "front",
+            "wrist",
+        ],
+    ),
+    # State: current proprioceptive reading; keys must match modality.json "state" entries
+    "state": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=[
+            "single_arm",
+            "gripper",
+        ],
+    ),
+    # Action: 16-step prediction horizon; each key needs an ActionConfig
+    "action": ModalityConfig(
+        delta_indices=list(range(0, 16)),  # predict 16 future steps
+        modality_keys=[
+            "single_arm",
+            "gripper",
+        ],
+        action_configs=[
+            # single_arm: RELATIVE = delta from current state (better generalization)
+            ActionConfig(
+                rep=ActionRepresentation.RELATIVE,
+                type=ActionType.NON_EEF,       # joint-space, not end-effector
+                format=ActionFormat.DEFAULT,
+            ),
+            # gripper: ABSOLUTE = target position (binary open/close works better absolute)
+            ActionConfig(
+                rep=ActionRepresentation.ABSOLUTE,
+                type=ActionType.NON_EEF,
+                format=ActionFormat.DEFAULT,
+            ),
+        ],
+    ),
+    # Language: task instruction from annotation field in the dataset
+    "language": ModalityConfig(
+        delta_indices=[0],
+        modality_keys=["annotation.human.task_description"],
+    ),
+}
+# Important: always register under EmbodimentTag.NEW_EMBODIMENT for custom embodiments
+register_modality_config(so100_config, embodiment_tag=EmbodimentTag.NEW_EMBODIMENT)
+```
+## Step 3: Run Fine-tuning
+We'll use `gr00t/experiment/launch_finetune.py` as the entry point. Ensure that the uv environment is enabled before launching. You can do this by running the command `uv run bash <example_script_name>`.
+### View Available Arguments
+```bash
+# Display all available arguments
+uv run python gr00t/experiment/launch_finetune.py --help
+```
+### Execute Fine-tuning
+```bash
+# Configure for single GPU
+export NUM_GPUS=1
+CUDA_VISIBLE_DEVICES=0 uv run python \
+    gr00t/experiment/launch_finetune.py \
+    --base-model-path nvidia/GR00T-N1.7-3B \
+    --dataset-path ./demo_data/cube_to_bowl_5 \
+    --embodiment-tag NEW_EMBODIMENT \
+    --modality-config-path examples/SO100/so100_config.py \
+    --num-gpus $NUM_GPUS \
+    --output-dir /tmp/so100 \
+    --save-total-limit 5 \
+    --save-steps 2000 \
+    --max-steps 2000 \
+    --use-wandb \
+    --global-batch-size 32 \
+    --color-jitter-params brightness 0.3 contrast 0.4 saturation 0.5 hue 0.08 \
+    --dataloader-num-workers 4
+```
+### Key Parameters
+| Parameter | Description |
+|-----------|-------------|
+| `--base-model-path` | Path to the pre-trained base model checkpoint |
+| `--dataset-path` | Path to your training dataset |
+| `--embodiment-tag` | Tag to identify your robot embodiment |
+| `--modality-config-path` | Path to user-specified modality config (required only for `NEW_EMBODIMENT` tag) |
+| `--output-dir` | Directory where checkpoints will be saved |
+| `--save-steps` | Save checkpoint every N steps |
+| `--max-steps` | Total number of training steps |
+| `--use-wandb` | Enable Weights & Biases logging for experiment tracking |
+> **Note:** Validation during fine-tuning is disabled by default (`eval_strategy="no"` in the training config). To enable periodic validation, pass `--eval-strategy steps --eval-steps 500` (runs validation every 500 steps) or `--eval-strategy epoch` (runs validation every epoch). You can also adjust `--eval-batch-size` (default: 2).
+## Step 4: Open Loop Evaluation
+After finetuning, evaluate the model's performance using open loop evaluation:
+```bash
+uv run python gr00t/eval/open_loop_eval.py \
+    --dataset-path ./demo_data/cube_to_bowl_5 \
+    --embodiment-tag NEW_EMBODIMENT \
+    --model-path /tmp/so100/checkpoint-2000 \
+    --traj-ids 0 \
+    --action-horizon 16 \
+    --steps 400 \
+    --modality-keys single_arm gripper
+```
+### `open_loop_eval.py` Parameters
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `--dataset-path` | `demo_data/cube_to_bowl_5/` | Path to LeRobot-format dataset |
+| `--embodiment-tag` | `new_embodiment` | Robot embodiment tag (case-insensitive) |
+| `--model-path` | `None` | Path to checkpoint. If omitted, connects to a running server via `--host`/`--port` |
+| `--traj-ids` | `[0]` | Episode indices to evaluate (space-separated, e.g., `0 1 2`) |
+| `--action-horizon` | `16` | Action steps predicted per inference call |
+| `--steps` | `200` | Max steps per trajectory (capped by actual trajectory length) |
+| `--denoising-steps` | `4` | Diffusion denoising iterations |
+| `--save-plot-path` | `None` | Directory to save GT-vs-predicted comparison plots |
+| `--modality-keys` | `None` | Action keys to plot. If omitted, plots all action dimensions |
+| `--host` / `--port` | `127.0.0.1` / `5555` | Server address when `--model-path` is omitted |
+### Example Evaluation Result
+The evaluation generates visualizations comparing predicted actions against ground truth trajectories:
+<img src="../media/open_loop_eval_so100.jpg" width="800" alt="Open loop evaluation results showing predicted vs ground truth trajectories" />

getting_started/hardware_recommendation.md ADDED Viewed

	@@ -0,0 +1,95 @@

+# Hardware Recommendations
+GR00T N1.7 has two hardware profiles: **fine-tuning** (needs GPU VRAM and compute) and **inference/deployment** (needs low latency). This guide helps you choose the right hardware for each.
+![Workflow Diagram](../media/GR00T-reference-arch-diagram.png "Post-training and deployment workflow")
+---
+## Inference Hardware
+**Minimum:** 1 GPU with 16 GB+ VRAM, CUDA 12.6+.
+The table below summarizes end-to-end inference frequency across tested platforms (GR00T N1.7, 4 denoising steps, 1 camera):
+| Platform | VRAM | PyTorch Eager | With TensorRT | Use Case |
+|----------|------|---------------|---------------|----------|
+| H100 80GB HBM3 | 80 GB | 11.7 Hz | 35.9 Hz | High-frequency control, multi-env batch inference |
+| H20 96GB HBM3 | 96 GB | 12.0 Hz | 29.4 Hz | Cost-effective datacenter inference |
+| RTX Pro 6000 Blackwell | 96 GB | 12.8 Hz | 35.9 Hz | Workstation inference, development |
+| RTX Pro 5000 72GB | 72 GB | 7.9 Hz | 24.7 Hz | Workstation inference |
+| L40 | 48 GB | 7.8 Hz | 26.0 Hz | Cloud inference |
+| L20 | 48 GB | 7.1 Hz | 23.3 Hz | Cloud inference |
+| DGX Spark | 128 GB shared | 7.9 Hz | 10.1 Hz | Desktop edge, prototyping |
+| AGX Thor | 128 GB shared | 6.9 Hz | 10.7 Hz | Robot-mounted edge deployment |
+| Orin* | 64 GB shared | 2.9 Hz | 4.6 Hz | Legacy Jetson edge |
+> *Orin uses DiT-only TensorRT (TRT 10.3 does not support the backbone engine). All other platforms use the full TensorRT pipeline.
+### Key Insights
+- **30+ Hz** (H100, RTX Pro 6000 with TensorRT): suitable for high-frequency closed-loop control where sub-30 ms latency matters.
+- **10+ Hz** (Thor, Spark with TRT; most dGPUs with torch.compile): sufficient for typical manipulation tasks running at a 10 Hz control rate.
+- **< 5 Hz** (Orin): only suitable for slow, non-reactive tasks. Orin's TRT 10.3 cannot accelerate the backbone — gains are limited to DiT-only mode.
+- **TensorRT Full Pipeline** provides 1.3--3.3x speedup over PyTorch Eager depending on platform (see the table above). Biggest gains are on datacenter GPUs where backbone acceleration is significant.
+- **torch.compile** is a good zero-effort middle ground (no engine build step), achieving 1.1--1.9x speedup across all platforms.
+> For full per-component latency breakdown, see the [Deployment Benchmark Results](../scripts/deployment/README.md#benchmark-results).
+---
+## Fine-Tuning Hardware
+**Minimum:** 1 GPU with 40 GB+ VRAM. GR00T N1.7 is a ~3B parameter model (bfloat16).
+| Setup | GPUs | VRAM per GPU | Global Batch Size | Notes |
+|-------|------|-------------|-------------------|-------|
+| Quick start / prototyping | 1x H100, L40, or A100 | 40--80 GB | 32 | Single GPU; sufficient for demo datasets |
+| Recommended | 4--8x H100 or L40 | 40--80 GB each | 64--640 | Multi-GPU via torchrun; faster convergence |
+| Full scale | 8x RTX Pro 6000 or DGX | 96 GB each | 640 | Large datasets, production fine-tuning |
+### Key Details
+- **Default fine-tuning** tunes the projector + diffusion action head (not the full LLM backbone), keeping peak VRAM under ~35 GB per GPU.
+- **Enabling `--tune-llm` or `--tune-visual`** significantly increases VRAM — 80 GB+ per GPU recommended.
+- **`--gradient-accumulation-steps`** can compensate for fewer GPUs. For example, 4 GPUs with 8 accumulation steps and per-GPU batch of 8 gives an effective global batch size of 256.
+- **Reduce `--num-shards-per-epoch`** if host memory (not VRAM) is limited — this controls how much dataset is preloaded into RAM.
+---
+## Software Requirements
+| Requirement | Version |
+|-------------|---------|
+| Python | 3.10 |
+| CUDA | 12.6+ (dGPU, Orin) / 13.0 (Thor, Spark) |
+| PyTorch | 2.7+ |
+| OS | Ubuntu 22.04+ (dGPU), JetPack 6.2 (Orin), Ubuntu 24.04 (Thor, Spark) |
+| Package manager | [uv](https://docs.astral.sh/uv/) (recommended) |
+Platform-specific installation instructions: see the [Deployment Guide](../scripts/deployment/README.md).
+---
+## Recommended Configurations
+### Starter Kit
+For development, small-scale fine-tuning, and edge deployment:
+| Component | Recommendation |
+|-----------|---------------|
+| Training | 1--4x L40 (48 GB) or RTX Pro 5000/6000 workstation |
+| Edge Deployment | [Jetson AGX Thor](https://developer.nvidia.com/embedded/jetson) Developer Kit (128 GB shared memory, Blackwell GPU) |
+| Storage | 500 GB+ SSD (datasets + checkpoints) |
+### Center of Excellence
+For production fine-tuning and high-throughput inference:
+| Component | Recommendation |
+|-----------|---------------|
+| Training | DGX with 8x H100/B200, or RTX Pro Server with 8x RTX Pro 6000 Blackwell |
+| Inference Server | H100 or H20 node with TensorRT Full Pipeline (~29--36 Hz per GPU) |
+| Edge Deployment | [Jetson AGX Thor](https://developer.nvidia.com/embedded/jetson) or [DGX Spark](https://developer.nvidia.com/dgx-spark) |
+| Storage | Scalable networked storage (NFS/S3) for large-scale datasets |

getting_started/policy.md ADDED Viewed

	@@ -0,0 +1,574 @@

+# Understanding the GR00T Policy API
+This guide explains how to use the `Gr00tPolicy` class to load and run inference with your trained model. After training, you'll use this API to integrate your model with evaluation environments.
+## Loading the Policy
+Initialize a policy by providing the embodiment tag, model checkpoint path, and device:
+```python
+from gr00t.policy import Gr00tPolicy
+from gr00t.data.embodiment_tags import EmbodimentTag
+# Load your trained model
+policy = Gr00tPolicy(
+    model_path="/path/to/your/checkpoint",
+    embodiment_tag=EmbodimentTag.NEW_EMBODIMENT,  # or other embodiment tags
+    device="cuda:0",  # or "cpu", or device index like 0
+    strict=True  # Enable input/output validation (recommended during development)
+)
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `embodiment_tag` | `EmbodimentTag \| str` | *(required)* | Robot type; accepts enum or case-insensitive string (e.g., `"NEW_EMBODIMENT"`) |
+| `model_path` | `str` | *(required)* | Path to model checkpoint directory (local path or HuggingFace model ID) |
+| `device` | `str \| int` | *(required)* | Inference device: `"cuda:0"`, `0`, or `"cpu"` |
+| `strict` | `bool` | `True` | Validates observation shapes and dtypes at runtime. Recommended during development; disable in production for speed |
+## Inference Parameter Guide
+When running inference scripts (e.g., `standalone_inference_script.py`, `open_loop_eval.py`), the key parameters are:
+### `--embodiment-tag`
+Determines which modality config the model uses (state/action keys, normalization). **Must match the robot type of your dataset.**
+The tag is **case-insensitive** and accepts either the enum name or the string value.
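+For example, `--embodiment-tag OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`, `--embodiment-tag LIBERO_PANDA`, `--embodiment-tag libero_panda` (lower-case enum name), and `--embodiment-tag libero_sim` (the string value of `LIBERO_PANDA`) all resolve correctly. An unknown tag will produce an error listing all known options.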
+- **Pretrain tags** (e.g., `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`, `XDOF`, `REAL_G1`) — use for zero-shot inference on datasets that match the pretrained embodiment. The modality config is loaded from the base model checkpoint.
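+- **Posttrain tags** (`OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`, `LIBERO_PANDA`, `SIMPLER_ENV_GOOGLE`, `SIMPLER_ENV_WIDOWX`) — used with a finetuned checkpoint. Passing these to the base model will produce an error, except for `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT`, which is also a pretrain tag and therefore works with the base model as well.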
+- **`NEW_EMBODIMENT`** — use for custom robots. Requires a `--modality-config-path` during finetuning. After finetuning, the config is saved in the checkpoint and loaded automatically during inference.
+    - Only one `NEW_EMBODIMENT` modality config may be registered per Python process. Examples like [`examples/SO100/so100_config.py`](../examples/SO100/so100_config.py) and [`examples/mask-guided-background-suppression/so101_config.py`](../examples/mask-guided-background-suppression/so101_config.py) each register under this tag; importing both in the same process will fail. In normal CLI use the selected `--modality-config-path` is the only one imported, so this is not an issue — just don't wire both configs into the same script.
+#### Known Embodiment Tags
+**Pretrain tags** — baked into the base model (`nvidia/GR00T-N1.7-3B`), ready for zero-shot inference:
+| Tag | Robot / Data Source | Value |
+|-----|---------------------|-------|
+| `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` | DROID (relative EEF + joint) | `oxe_droid_relative_eef_relative_joint` |
+| `XDOF` | Generic X-DOF (relative EEF + joint) | `xdof_relative_eef_relative_joint` |
+| `XDOF_SUBTASK` | Generic X-DOF (subtask variant) | `xdof_relative_eef_relative_joint_subtask` |
+| `REAL_G1` | Real-world Unitree G1 (relative EEF + joint) | `real_g1_relative_eef_relative_joints` |
+| `REAL_R1_PRO_SHARPA` | Real-world R1 Pro Sharpa (relative EEF) | `real_r1_pro_sharpa_relative_eef` |
+| `REAL_R1_PRO_SHARPA_HUMAN` | R1 Pro Sharpa — human teleop data | `real_r1_pro_sharpa_relative_eef_human` |
+| `REAL_R1_PRO_SHARPA_MAXINSIGHTS` | R1 Pro Sharpa — MaxInsights (single-cam) | `real_r1_pro_sharpa_relative_eef_maxinsights` |
+| `REAL_R1_PRO_SHARPA_MECKA` | R1 Pro Sharpa — Mecka (single-cam) | `real_r1_pro_sharpa_relative_eef_mecka` |
+**Posttrain tags** — require a finetuned checkpoint (not usable with the base model directly):
+| Tag | Robot | Value | Checkpoint |
+|-----|-------|-------|------------|
+| `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` | DROID (relative EEF + joint) | `oxe_droid_relative_eef_relative_joint` | `nvidia/GR00T-N1.7-DROID` |
+| `LIBERO_PANDA` | LIBERO Panda | `libero_sim` | `nvidia/GR00T-N1.7-LIBERO` |
+| `SIMPLER_ENV_GOOGLE` | SimplerEnv Google Robot | `simpler_env_google` | `nvidia/GR00T-N1.7-SimplerEnv-Fractal` |
+| `SIMPLER_ENV_WIDOWX` | SimplerEnv WidowX | `simpler_env_widowx` | `nvidia/GR00T-N1.7-SimplerEnv-Bridge` |
+**Generic tag** for any new robot: `NEW_EMBODIMENT` (requires `--modality-config-path`)
+> **`OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` appears in both tables by design.** DROID is supported both zero-shot (via the base model) and via the finetuned `nvidia/GR00T-N1.7-DROID` checkpoint. Pass the tag with either `--model-path nvidia/GR00T-N1.7-3B` (zero-shot) or `--model-path nvidia/GR00T-N1.7-DROID` (finetuned); see `examples/DROID/README.md`.
+> **Important:** Pretrain tags work with the base model for zero-shot inference. Posttrain tags require a finetuned checkpoint — using them with the base model will fail with an error listing the supported tags. You also cannot mix embodiment tags and datasets (e.g., `--embodiment-tag LIBERO_PANDA` expects LIBERO state keys and will fail on an SO100 dataset).
+### `--traj-ids`
+Which episode indices to evaluate. Check your dataset's `meta/episodes.jsonl` to see available episodes. For example, `--traj-ids 0 1 2` runs on the first 3 episodes.
+### `--action-horizon`
+Number of future action steps predicted per inference call. The model's maximum is 16 (from model config). Common values:
+- `16` — full horizon, used for open-loop evaluation
+- `8` — shorter horizon, common for real-time deployment where actions are re-planned frequently
+This parameter is robot-agnostic — the same value works across different datasets and embodiments.
+### `--inference-mode`
+- `pytorch` — standard PyTorch inference (default, no setup required)
+- `tensorrt` — accelerated inference using TensorRT engine (requires ONNX export + engine build first, see [Deployment Guide](../scripts/deployment/README.md))
+### Expected Output (PyTorch mode)
+The inference scripts produce:
+- Per-trajectory **MSE** and **MAE** (unnormalized action prediction error vs ground truth)
+- **Timing stats**: model load time, avg/min/max/P90 inference time per step
+- **Summary**: average MSE/MAE across all trajectories
+### Example: Matching Parameters to Dataset
+| Dataset | Embodiment Tag | Notes |
+|---------|---------------|-------|
+| `demo_data/droid_sample` | `OXE_DROID_RELATIVE_EEF_RELATIVE_JOINT` | DROID — works with base model (zero-shot) or finetuned `nvidia/GR00T-N1.7-DROID` |
+| `demo_data/libero_demo` | `LIBERO_PANDA` | LIBERO Panda — uses finetuned checkpoint from `nvidia/GR00T-N1.7-LIBERO` (must be downloaded locally first, see [README](../README.md)) |
+| `demo_data/cube_to_bowl_5` | `NEW_EMBODIMENT` | SO100 arm — only works with a finetuned checkpoint, not the base model |
+## Understanding the Observation Format
+The policy expects observations as a nested dictionary with three modalities:
+```python
+observation = {
+    "video": {
+        "camera_name": np.ndarray,  # Shape: (B, T, H, W, C) with C=3, dtype: uint8
+        # ... one entry per camera
+    },
+    "state": {
+        "state_name": np.ndarray,   # Shape: (B, T, D), dtype: float32
+        # ... one entry per state stream
+    },
+    "language": {
+        "task": [[str]],            # Shape: (B, 1), list of lists of strings
+    }
+}
+```
+### Dimensions
+- **`B`**: Batch size (number of parallel environments)
+- **`T`**: Temporal horizon (number of historical observations)
+- **`H, W`**: Image height and width
+- **`D`**: State dimension
+- **`C`**: Number of channels (must be 3 for RGB)
+### Data Type Requirements
+- **Videos** must be `np.uint8` arrays with RGB pixel values in range [0, 255]
+- **States** must be `np.float32` arrays
+- **Language** instructions are lists of lists of strings
+### Important Notes
+- The temporal horizon `T` is determined by your model's training configuration
+- Different modalities may have different temporal horizons (query via `get_modality_config()`)
+- Language instructions are typically single timestep (`T=1`)
+- All arrays in a batch must have the same batch size `B`
+## Understanding the Action Format
+The policy returns actions in a similar nested structure:
+```python
+action = {
+    "action_name": np.ndarray,  # Shape: (B, T, D), dtype: float32
+    # ... one entry per action stream
+}
+```
+### Dimensions
+- **`B`**: Batch size (matches input batch size)
+- **`T`**: Action horizon (number of future action steps to predict)
+- **`D`**: Action dimension (e.g., 7 for arm joints, 1 for gripper)
+### Important Notes
+- Actions are returned in **physical units** (e.g., joint positions in radians, velocities in rad/s)
+- Actions are **not normalized** - they're ready to send to your robot controller
+- The action horizon `T` allows predicting multiple future steps (useful for action chunking)
+## Running Inference
+Use the `get_action()` method to compute actions from observations:
+```python
+# Get action from current observation
+action, info = policy.get_action(observation)
+# Access the action array
+arm_action = action["action_name"]  # Shape: (B, T, D)
+# Extract the first action to execute
+next_action = arm_action[:, 0, :]  # Shape: (B, D)
+```
+The method returns a tuple of:
+- `action`: Dictionary of action arrays
+- `info`: Dictionary of additional information (currently empty, reserved for future use)
+## Querying Modality Configurations
+To understand what observations your policy expects and what actions it produces, query the modality configuration:
+```python
+# Get modality configs for your embodiment
+modality_configs = policy.get_modality_config()
+# Check what camera keys are expected
+video_keys = modality_configs["video"].modality_keys
+print(f"Expected cameras: {video_keys}")
+# Check video temporal horizon
+video_horizon = len(modality_configs["video"].delta_indices)
+print(f"Video frames needed: {video_horizon}")
+# Check state keys and horizon
+state_keys = modality_configs["state"].modality_keys
+state_horizon = len(modality_configs["state"].delta_indices)
+print(f"Expected states: {state_keys}, horizon: {state_horizon}")
+# Check action keys and horizon
+action_keys = modality_configs["action"].modality_keys
+action_horizon = len(modality_configs["action"].delta_indices)
+print(f"Action outputs: {action_keys}, horizon: {action_horizon}")
+```
+This is especially useful when:
+- You're unsure what observations your trained model expects
+- You need to verify the temporal horizons for each modality
+- You're debugging observation/action format mismatches
+## Resetting the Policy
+Reset the policy between episodes:
+```python
+# Reset policy state (if any) between episodes
+info = policy.reset()
+```
+Currently, the policy is stateless, but calling `reset()` is good practice for future compatibility.
+## Adapting the Policy to Your Environment
+Most environments use different observation/action formats than the Policy API expects. You'll typically need to write a **policy wrapper** that:
+1. **Transforms observations**: Convert your environment's observation format to the Policy API format
+2. **Calls the policy**: Use `policy.get_action()` to compute actions
+3. **Transforms actions**: Convert the policy's actions back to your environment's format
+### Example Workflow
+```python
+# In your environment loop
+env_obs = env.reset()  # Environment-specific format
+# Transform to Policy API format
+policy_obs = transform_observation(env_obs)
+# Get action from policy
+policy_action, _ = policy.get_action(policy_obs)
+# Transform back to environment format
+env_action = transform_action(policy_action)
+# Execute in environment
+env_obs, reward, done, info = env.step(env_action)
+```
+### Using Server-Client Architecture for Remote Inference
+For many use cases, especially when working with real robots or distributed systems, you may want to run the policy on a separate machine (e.g., a GPU server) and send observations/actions over the network. GR00T provides a built-in server-client architecture using ZeroMQ for this purpose.
+#### Why Use Server-Client Architecture?
+- **Separate compute resources**: Run policy inference on a GPU server while controlling the robot from a different machine
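+- **Dependency isolation**: Keep the policy's dependencies on the server so they cannot conflict with the dependencies of your robot or simulation environment on the client side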
+```mermaid
+sequenceDiagram
+    participant Robot as Robot / Sim Client
+    participant Client as PolicyClient (ZMQ REQ)
+    participant Server as PolicyServer (ZMQ REP)
+    participant Policy as Gr00tPolicy (GPU)
+    Robot->>Client: observation dict
+    Client->>Server: msgpack(endpoint="get_action", data=obs)
+    Server->>Policy: policy.get_action(obs)
+    Policy-->>Server: (action_dict, info_dict)
+    Server-->>Client: msgpack(action, info)
+    Client-->>Robot: action dict
+```
+#### Starting the Policy Server
+Launch the server using the `run_gr00t_server.py` script:
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --embodiment-tag NEW_EMBODIMENT \
+    --model-path /path/to/your/checkpoint \
+    --device cuda:0 \
+    --host 0.0.0.0 \
+    --port 5555
+```
+**Parameters:**
+- `--embodiment-tag`: The embodiment tag for your robot (e.g., `NEW_EMBODIMENT`)
+- `--model-path`: Path to your trained model checkpoint directory
+- `--device`: Device to run inference on (`cuda:0`, `cuda:1`, `cpu`, etc.)
+- `--host`: Host address (`127.0.0.1` for local only, `0.0.0.0` to accept external connections)
+- `--port`: Port number (default: 5555)
+- `--strict` / `--no-strict`: Enable or disable input/output validation (default: True)
+- `--use-sim-policy-wrapper`: Whether to use `Gr00tSimPolicyWrapper` for GR00T simulation environments (default: False)
+Once started, the server will display:
+```
+Starting GR00T inference server...
+  Embodiment tag: NEW_EMBODIMENT
+  Model path: /path/to/your/checkpoint
+  Device: cuda:0
+  Host: 0.0.0.0
+  Port: 5555
+Server is ready and listening on tcp://0.0.0.0:5555
+```
+#### Using the Policy Client
+On the client side (your environment/robot control code), use `PolicyClient` to connect to the server:
+```python
+from gr00t.policy.server_client import PolicyClient
+# Connect to the policy server
+policy = PolicyClient(
+    host="localhost",  # or IP address of your GPU server
+    port=5555,
+    timeout_ms=15000,  # 15 second timeout for inference
+    strict=False,      # leave the validation to the server
+)
+# Verify connection
+if not policy.ping():
+    raise RuntimeError("Cannot connect to policy server!")
+# Use just like a regular policy
+observation = get_observation()  # Your observation in Policy API format
+action, info = policy.get_action(observation)
+```
+**Parameters:**
+- `host`: Hostname or IP address of the policy server
+- `port`: Port number (must match server port)
+- `timeout_ms`: Timeout in milliseconds for network requests (default: 15000)
+- `api_token`: Optional API token for authentication (default: None)
+- `strict`: Enable client-side validation (usually False since server validates)
+#### Client API
+The `PolicyClient` implements the same `BasePolicy` interface, so it's a drop-in replacement:
+```python
+# Get modality configuration from the server
+modality_configs = policy.get_modality_config()
+# Get action — returns (action_dict, info_dict)
+action, info = policy.get_action(observation, options=None)
+# Reset policy state (e.g., switch episode in ReplayPolicy)
+info = policy.reset(options=None)
+# Check server health — returns True if server responds
+is_alive = policy.ping()
+# Shutdown the server remotely (optional)
+policy.kill_server()
+```
+#### Server API Reference
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `policy` | `BasePolicy` | *(required)* | The policy instance to serve (e.g., `Gr00tPolicy`, `ReplayPolicy`) |
+| `host` | `str` | `"*"` | Bind address. `"*"` accepts connections on all interfaces |
+| `port` | `int` | `5555` | TCP port for ZMQ REP socket |
+| `api_token` | `str` | `None` | If set, clients must include a matching token in every request |
+**Built-in endpoints:** `get_action`, `reset`, `get_modality_config`, `ping`, `kill`. Custom endpoints can be added via `server.register_endpoint(name, handler)`.
+#### Error Handling
+The server-client uses ZeroMQ REQ/REP sockets over TCP with msgpack serialization.
+- **Timeout:** If the server does not respond within `timeout_ms`, the ZMQ socket will raise `zmq.error.Again`. The default 15 s timeout accommodates cold-start model loading on the first call.
+- **Connection loss:** If `ping()` returns `False`, the client automatically recreates its ZMQ socket for the next attempt. Your control loop should retry or halt.
+- **Server-side errors:** Exceptions in the policy are caught, serialized as `{"error": "..."}`, and re-raised as `RuntimeError` on the client side.
+#### Debugging with ReplayPolicy
+When developing a new environment integration or debugging your inference loop, running a full model inference can be cumbersome. `ReplayPolicy` allows you to **replay recorded actions from an existing dataset**, helping you verify that:
+- Your environment setup works correctly
+- Observations are formatted properly
+- Action execution behaves as expected
+- The server-client communication is functioning
+This eliminates the need for a trained model during the development phase.
+##### Starting the Server with ReplayPolicy
+Instead of providing `--model-path`, use `--dataset-path` to start the server in replay mode:
+```bash
+uv run python gr00t/eval/run_gr00t_server.py \
+    --dataset-path /path/to/lerobot_dataset \
+    --embodiment-tag NEW_EMBODIMENT \
+    --host 0.0.0.0 \
+    --port 5555 \
+    --execution-horizon 8 # should match the executed action horizon in the environment
+```
+**Parameters:**
+- `--dataset-path`: Path to a LeRobot-compatible dataset directory
+- `--embodiment-tag`: The embodiment tag for modality configuration
+- `--execution-horizon`: Number of steps to advance the dataset per `get_action()` call. Should match the number of executed action steps in the environment.
+- `--modality-config-path`: (Optional) Path to custom modality config JSON file. If not provided, uses the config from `embodiment-tag`
+- `--use-sim-policy-wrapper`: Apply `Gr00tSimPolicyWrapper` for GR00T simulation environments
+##### Using ReplayPolicy from the Client
+On the client side, use `PolicyClient` exactly as you would with a real model:
+```python
+from gr00t.policy.server_client import PolicyClient
+# Connect to the replay policy server
+policy = PolicyClient(host="localhost", port=5555)
+# Use exactly like a regular policy
+action, info = policy.get_action(observation)
+# info contains replay metadata
+print(f"Replaying step {info['current_step']} of episode {info['episode_index']}")
+```
+##### Switching Episodes
+ReplayPolicy starts with episode 0 by default. To switch to a different episode:
+```python
+# Reset to a specific episode
+policy.reset(options={"episode_index": 5})
+# Optionally start from a specific step within the episode
+policy.reset(options={"episode_index": 5, "step_index": 10})
+```
+The number of available episodes can be queried via the `info` dict returned from `reset()` or `get_action()`.
+##### Example: Validating a LIBERO Environment
+Here's a complete example of using ReplayPolicy to validate a simulation setup:
+```bash
+# Terminal 1: Start the replay server
+uv run python gr00t/eval/run_gr00t_server.py \
+    --dataset-path <your_dataset_path> \
+    --embodiment-tag <YOUR_EMBODIMENT_TAG> \
+    --execution-horizon 8 \
+    --use-sim-policy-wrapper
+# Terminal 2: Run evaluation with the replay policy
+uv run python gr00t/eval/rollout_policy.py \
+    --n-episodes 1 \
+    --policy-client-host 127.0.0.1 \
+    --policy-client-port 5555 \
+    --max-episode-steps 720 \
+    --env-name <env_prefix>/<task_name> \
+    --n-action-steps 8 \
+    --n-envs 1
+```
+If your environment is set up correctly, replaying ground-truth actions should achieve high (often 100%) success rates. Low success rates indicate issues with:
+- Environment reset state not matching the dataset
+- Observation preprocessing differences
+- Action space mismatches
+> **Tip:** ReplayPolicy is an excellent first step when integrating a new environment. Debug with replay first, then switch to model inference once the pipeline is validated.
+#### Integrating the GR00T N1.7 Client Into Your Deployment Pipeline
+GR00T's server–client architecture allows you to keep the **client side extremely lightweight**, making it easy to embed into any custom deployment pipeline without pulling in the full dependency stack.
+For a minimal working example, see
+[`eval_so100.py`](../gr00t/eval/real_robot/SO100/eval_so100.py).
+In most cases, your deployment environment only needs to install the local GR00T client code:
+```bash
+uv pip install -e . --verbose --no-deps
+```
+The client relies solely on a small set of interfaces:
+- `gr00t/policy/server_client.py`
+- `gr00t/policy/policy.py`
+- `gr00t/data/types.py`
+- `gr00t/data/embodiment_tags.py`
+## Common Patterns
+### Batched Inference
+The policy supports batched inference for efficiency:
+```python
+# Run 4 environments in parallel
+batch_size = 4
+observation = {
+    "video": {"wrist_cam": np.zeros((batch_size, T_video, H, W, 3), dtype=np.uint8)},
+    "state": {"joints": np.zeros((batch_size, T_state, D_state), dtype=np.float32)},
+    "language": {"task": [["pick up the cube"]] * batch_size},
+}
+action, _ = policy.get_action(observation)
+# action["action_name"] has shape (batch_size, action_horizon, action_dim)
+```
+### Single Environment Inference
+For single environments, use batch size of 1:
+```python
+# Add batch dimension (B=1)
+observation = {
+    "video": {"wrist_cam": video[np.newaxis, ...]},  # (1, T, H, W, 3)
+    "state": {"joints": state[np.newaxis, ...]},     # (1, T, D)
+    "language": {"task": [["pick up the cube"]]},    # List of length 1
+}
+action, _ = policy.get_action(observation)
+# Remove batch dimension
+single_action = action["action_name"][0]  # (action_horizon, action_dim)
+```
+### Action Chunking
+When the action horizon `T > 1`, you can use action chunking:
+```python
+action, _ = policy.get_action(observation)
+action_chunk = action["action_name"][:, :, :]  # (B, T, D)
+# Execute actions over multiple timesteps
+for t in range(action_chunk.shape[1]):
+    env.step(action_chunk[:, t, :])
+```
+### Training Dataloading Optimization
+When training a model, you can optimize the dataloading speed vs memory usage via various command line arguments.
+Example:
+```bash
+uv run python gr00t/experiment/launch_finetune.py \
+    .... \
+    --num-shards-per-epoch 100 \
+    --dataloader-num-workers 2 \
+    --shard-size 512
+```
+If VRAM is limited, you can reduce all of the values above to lower memory usage.
+To make shard sampling closer to IID, you can reduce the `episode_sampling_rate` to 0.05 or lower.
+## Troubleshooting
+1. **Enable strict mode** during development: `strict=True`
+2. **Print modality configs** to understand expected formats
+3. **Check shapes** of your observations before calling `get_action()`
+4. **Use the reference wrapper** (`Gr00tSimPolicyWrapper`) as a template
+5. **Validate incrementally**: Test with dummy observations first before connecting to real environments

getting_started/real_world_deployment.md ADDED Viewed

	@@ -0,0 +1,459 @@

+# GR00T Real-World Deployment Guide
+This guide covers building an end-to-end real-world VLA pipeline—from data collection and training to deployment—with practical engineering recommendations.
+## Overview
+A typical GR00T real-world deployment workflow includes:
+1. **[Hardware Preparation](#1-hardware-and-environment-preparation-device-requirements)**: Verify that the robot platform, sensors, and compute resources are ready.
+2. **[Data Collection](#2-data-collection)**: Choose an appropriate teleoperation setup and collect at least 100 valid episodes.
+3. **[Data Preprocessing](#3-data-preprocessing)**: Clean data, align timestamps, and convert to LeRobot format.
+4. **[Model Training](#4-vla-model-training)**: Fine-tune GR00T N1.*.
+5. **[Model Evaluation](#validation)**: Run open-loop evaluation to validate convergence and model quality.
+6. **[Deployment Setup](#5-deployment-and-closed-loop-control)**: Build a ZMQ Server-Client architecture.
+7. **[Closed-Loop Testing](#5-deployment-and-closed-loop-control)**: Run closed-loop control on real hardware and monitor jittering and stop-and-go behavior.
+8. **[Optimization](#6-common-issues-jittering-and-stop-and-go)**: Tune RTC parameters and trajectory smoothing strategies based on real-world performance.
+## 1. Hardware and Environment Preparation (Device Requirements)
+Ensure your robot hardware, sensor pipeline, and control interfaces are stable and available.
+### Robot Platform
+- **Recommended platforms**: Robotic arms with SDK-level control support (e.g., Franka, UR, Piper, SO101).
+- **Basic requirements**:
+  - Real-time joint state feedback.
+  - High-frequency action execution (30 FPS recommended).
+  - Stable control interface.
+### Multimodal Sensors
+| Sensor Type | Specification | Purpose |
+|-------------|---------------|---------|
+| **Wrist-mounted camera** | 30 FPS, RGB | Capture close-range manipulation visuals |
+| **Third-person camera (3rd view)** | 30 FPS, RGB | Capture global scene context |
+| **Robot proprioceptive state** | Real-time acquisition | Joint states and gripper state |
+### Compute Resources
+- **Training phase**: NVIDIA GPU servers (e.g., H100 or H20) are recommended for larger batch sizes.
+- **Deployment phase**: Edge hardware such as Jetson AGX Thor supports on-device inference.
+> For details, see the [hardware recommendation guide](hardware_recommendation.md).
+### Teleoperation Devices
+Teleoperation device selection is critical for data quality.
+### Teleoperation Device Comparison
+In the table below:
+- **Embodiment dependency**: how similar the teleoperation device and target robot must be in joint topology, degrees of freedom (DoF), and workspace. Higher dependency implies harder cross-embodiment transfer.
+- **Operational intuition**: how naturally operator inputs map to robot motion. Higher intuition means faster onboarding and lower demonstration error.
+| Device Type | Cost Level (Reference) | Embodiment Dependency | Operational Intuition | Notes |
+|-------------|------------------------|-----------------------|-----------------------|-------|
+| **Keyboard/Gamepad/SpaceMouse/Joylo** | Low | Low: command mapping via keys/controls | Medium: requires adaptation to key-motion mapping | Low entry cost; a good starting point and useful in mobile scenarios |
+| **Master-Slave arm systems** | Medium | High: master/slave arms usually require similar kinematics and workspace | High: near one-to-one human-robot mapping | Suitable for single-robot setups; commonly used by robot OEMs; can reduce the risk of reaching joint limits during demonstrations |
+| **UMI / Fast-UMI / Pika Sense** | Medium | Low: hardware-agnostic action representation reusable across arms | High: after calibration, end-effector (EEF) following is intuitive | Suitable for training general VLA models; low-DoF arms may still hit joint limits |
+| **VR-based teleoperation** | Medium (headset + rendering + network) | Low: mainly depends on software integration | Medium: depends on immersive visual feedback and tracking quality | A flexible solution, but with higher integration overhead |
+| **Glove / Motion Capture** | High (commercial mocap suite + data gloves) | Low: retarget through kinematic mapping to different embodiments | High: intuitive full-hand/full-body control | Suitable for full-body control and dexterous-hand tasks |
+| **Exoskeleton** | High | High: usually requires matched joint structure | High: natural action correspondence | Extendable to multi-joint humanoid control |
+## 2. Data Collection
+Key considerations for data collection:
+### Timestamp Synchronization
+- The FPS of both camera streams should be strictly matched, and capture triggers should be as synchronized as possible.
+- Joint state sampling frequency should exceed camera FPS to enable accurate downsampling.
+- Record full timestamps during collection for downstream temporal alignment.
+### Action Representation
+- If training and collection use the same embodiment (e.g., master-slave arms), log joint-space `Joint States` during collection. For task-space models, compute EEF pose via forward kinematics (FK) in post-processing.
+- If embodiments differ (e.g., collect with UMI, deploy on Piper), directly record task-space EEF pose during collection.
+### Data Distribution
+- Current imitation-learning-based models perform more reliably in previously seen scenarios. In early-stage experiments, start with data collection and validation in a limited domain.
+- After pipeline validation, gradually expand the domain by varying lighting, object placement, and initial robot poses to improve generalization.
+### Scene Consistency
+- Keep third-person camera extrinsics fixed and ensure a rigid wrist-camera mount.
+- In early experiments, prioritize scene consistency; avoid varying lighting, object placement, or initial robot poses.
+### Joint Limits
+- If collecting joint-space data, avoid operating near joint limits to reduce the number of samples in those regions.
+## 3. Data Preprocessing
+Raw data must be cleaned, synchronized, and converted before training.
+### Trajectory Filtering
+Data filtering is recommended in two stages: script-based filtering and manual review.
+#### Script Filtering
+- Check image timestamps and remove samples with:
+  1. Excessive latency in a single camera stream.
+  2. Excessive timestamp difference between the two camera streams.
+- Detect and remove abnormal jumps in robot state sequences.
+#### Manual Filtering
+Replay trajectories with synchronized visualization to catch issues missed by scripts:
+- Remove samples with poor synchronization between image and action sequences.
+- Remove blurry frames.
+- Remove failed task executions.
+- Remove low-quality trajectories (e.g., redundant paths, discontinuous actions).
+### Trajectory Preprocessing
+1. Timestamp alignment: align camera frames and robot joint states to a shared time base.
+2. Head-tail trimming: remove idle segments at the start and end of trajectories.
+3. Split long trajectories (several minutes) into multiple subtasks.
+### Format Conversion
+Convert all data to a standard format (e.g., LeRobot) for GR00T compatibility:
+- See the [data preparation guide](data_preparation.md) for format requirements.
+- Use the provided conversion scripts to convert data to GR00T LeRobot format.
+## 4. VLA Model Training
+### Training Parameter Configuration
+**Dataset size recommendations**
+For single-task `finetune`:
+- **Minimum data size**: Prepare at least **100 valid episodes**. For very narrow task domains, ~30 episodes may suffice. A capture frequency of 20–50 Hz is recommended for manipulation tasks.
+- **Episode length**: No hard limit, but each episode must contain a complete action cycle with idle frames removed. Split overly long episodes into subtasks.
+- **Recommended data size**: 200+ episodes usually provide more stable performance.
+**Core parameters**
+- **Input/output mode**: Default to `State-relative Action Prediction`. Compared with `Absolute Action`, it converges more easily and improves inter-chunk consistency.
+- **Training space**: Both joint space and task space are valid. For low-DoF arms, joint space is often preferred to reduce singularity-related risks.
+- **Action Chunk Size**: Default is 16. If combined with RTC to mitigate stop-and-go, set it to at least 32.
+- **Batch Size**: Increase the batch size as much as GPU memory allows.
+> For additional training options, see the [fine-tuning guide](finetune_new_embodiment.md).
+**Compute resources**
+- Fine-tuning requires significantly less compute than pretraining.
+- A single compute node (8 x H100 or 8 x H20) is usually sufficient.
+### Validation
+After training, run open-loop validation to confirm convergence, then proceed to closed-loop deployment validation.
+> Open-loop validation is only a preliminary check. Final performance must be verified with closed-loop testing on real robots. For details, see the [fine-tuning guide](finetune_new_embodiment.md).
+## 5. Deployment and Closed-Loop Control
+### System Architecture
+GR00T supports two inference modes:
+1. **Direct `Gr00tPolicy` usage**: Suitable when model inference and robot control run on the same machine.
+2. **ZMQ Server-Client architecture**: Suitable for real-world deployment and decouples local robot control (`Local Client`) from remote inference (`Model Server`).
+For real-world deployment, **ZMQ inference service** is recommended:
+- Move compute-intensive inference to GPU servers.
+- Keep robot-side control code lightweight.
+- Avoid installing the full inference dependency stack on the robot side.
+### On-Device Deployment Logic
+Deployment code has two phases: **initialization** and the **main control loop**.
+The pseudo-code below uses a synchronous workflow, which may cause stop-and-go. See later sections for mitigation via asynchronous execution + RTC.
+**Pseudo-code workflow:**
+```python
+# ========== Initialization ==========
+# 1. Initialize and test cameras
+hand_camera = initialize_hand_camera()  # e.g., OrbbecSDK
+env_camera = initialize_env_camera()    # e.g., RealSense
+test_cameras()  # Show preview and verify normal operation
+# 2. Connect and test robot
+robot = connect_robot()  # e.g., Piper SDK
+robot.enable()
+robot.reset_to_initial_position()
+test_robot()  # Send test command and verify robot response
+# 3. Connect and test GR00T model server
+gr00t_client = connect_to_gr00t_server(host, port)
+if not gr00t_client.ping():
+    raise ConnectionError("Failed to connect to model server")
+test_model()  # Send test observation and verify inference
+# ========== Main control loop ==========
+while True:
+    # 1. Acquire sensor data
+    hand_image = hand_camera.get_frame()
+    env_image = env_camera.get_frame()
+    joint_states = robot.get_joint_states()
+    gripper_state = robot.get_gripper_state()
+    # 2. Format observation
+    observation = format_observation(
+        hand_image,
+        env_image,
+        joint_states,
+        gripper_state,
+        task_description,
+    )
+    # 3. Model inference (via ZMQ)
+    actions, info = gr00t_client.get_action(observation)  # returns (action_dict, info_dict)
+    # 4. Trajectory post-processing
+    actions_arm = actions["joint_states"]
+    actions_arm = smooth_trajectory(actions_arm)  # smoothing
+    actions_arm = check_safety_limits(actions_arm)  # safety checks
+    # 5. Execute actions
+    for action_step in actions_arm:
+        robot.execute_action(action_step)
+        sleep(1.0 / 30.0)  # 30 FPS
+```
+### Key Implementation Notes
+**Important notes:**
+- **Image format**: Use compressed formats such as JPG to reduce transmission bandwidth.
+- **Safe operation**:
+  - **Soft Limits**: Add joint-angle and EEF pose range checks. If a predicted action exceeds workspace bounds, raise an alarm and stop immediately.
+  - **E-Stop logic**: Bind an emergency stop hotkey (e.g., Space) on the control PC, or use a physical E-Stop switch.
+- **Action smoothing**: Apply interpolation and smoothing to predicted action sequences.
+> For more deployment details, see the [policy API guide](policy.md).
+## 6. Common Issues: Jittering and Stop-and-Go
+The most common issues in real-world deployment are **jittering** and **stop-and-go**.
+### Fixing Jittering
+**Jittering** here refers to visible shaking or vibration of the end-effector or joints during task execution.
+Jittering typically originates from **inconsistent model outputs** or **insufficient robot-side control quality**. Analyze these two components separately to localize the issue. The suggestions below are general guidelines and may not apply to every robot platform or control stack — always verify against your own hardware and environment.
+```mermaid
+flowchart TD
+    A[Jittering observed] --> B[Save & visualize action chunks in 3D]
+    B --> C{Where is the jitter?}
+    C -->|Inside each chunk| D[Case A: Model undertrained or poor data quality]
+    C -->|Between consecutive chunks| E[Case B: Inconsistent chunk predictions]
+    C -->|Chunks look smooth| F[Case C: Robot hardware / low-level control issue]
+    D --> D1[Add more data, train longer, check train/eval consistency]
+    E --> E1[Use state-relative actions + RTC chunking strategy]
+    F --> F1[Check drive control, interpolation, hardware status]
+```
+**Diagnosis and mitigation**
+1. **Save and visualize Action Chunks**
+   - Save all predicted `Action Chunks`.
+   - Visualize continuous TCP (tool center point) trajectories in 3D.
+   - **Note**: Convert joint-space outputs to task space via FK before visualization.
+2. **Analyze visualization results**
+   **Case A: Significant jitter inside each chunk**
+   - **Cause**: The model is undertrained, or data quality is insufficient.
+   - **Solution**: Improve data quality, add more training data, or train longer. Keep training and validation environments consistent.
+   **Case B: Significant jitter between chunks**
+   - **Cause**: Inconsistent adjacent `Action Chunk` predictions.
+   - **Solution**:
+     - Use `State-relative Action Prediction`. Predicting actions relative to the current state produces a more uniform output distribution, making the network easier to train.
+     - Use RTC (`Real-Time Chunking`) or similar strategies.
+   **Case C: Little jitter after visualization**
+   - **Cause**: Likely a robot hardware or low-level control issue.
+   - **Solution**: Check drive control, interpolation, and hardware status.
+**Quantitative diagnostic metrics**
+Trajectory jitter can also be quantified using these three metrics:
+**Metric 1: Mean intra-chunk acceleration magnitude**
+Measures intra-chunk smoothness. Only valid under fixed sampling frequency.
+Formula: $a_t = pos_{t+1} - 2 \cdot pos_t + pos_{t-1}$
+```python
def metric_intra_accel(chunks):
    """Mean magnitude of the second-order difference inside each action chunk.

    Args:
        chunks: array-like with shape (N_chunks, Chunk_Length, Joint_Dim)
    Returns:
        float: Mean acceleration magnitude
    """
    # Differencing twice along the time axis yields pos[t+1] - 2*pos[t] + pos[t-1].
    second_diff = np.diff(chunks, n=2, axis=1)
    # One L2 norm per (chunk, step), taken across the joint dimension.
    per_step_norm = np.linalg.norm(second_diff, axis=-1)
    return np.mean(per_step_norm)
+```
+**Metric 2: Position jump at chunk boundary (L2 distance)**
+Measures position continuity between chunks by comparing the last executed step of `Chunk[i]` with step 0 of `Chunk[i+1]`.
+```python
def metric_boundary_jump(chunks, execute_steps=None):
    """Mean L2 position jump between consecutive action chunks.

    Compares the last executed step of chunk ``i`` with step 0 of chunk ``i + 1``.

    Args:
        chunks: numpy array with shape (N_chunks, Chunk_Length, Joint_Dim)
        execute_steps: number of executed steps per chunk; if None, use full chunk length
    Returns:
        float: Mean position jump
    Raises:
        ValueError: if the number of executed steps is smaller than 1.
    """
    chunks = np.asarray(chunks)
    exec_steps = chunks.shape[1] if execute_steps is None else execute_steps
    # Without this guard, exec_steps <= 0 produces a negative index that wraps
    # around and silently compares against the wrong frame.
    if exec_steps < 1:
        raise ValueError("execute_steps must be >= 1 to locate the last executed step")
    last_frame_prev = chunks[:-1, exec_steps - 1, :]  # last executed frame of previous chunk
    first_frame_curr = chunks[1:, 0, :]  # first frame of current chunk
    jumps = np.linalg.norm(first_frame_curr - last_frame_prev, axis=-1)  # Euclidean distance
    return np.mean(jumps)
+```
+**Metric 3: Cosine similarity of velocity direction at chunk boundary**
+Measures velocity-direction consistency between chunks. Values closer to 1 indicate better consistency.
+```python
def metric_momentum_shift(chunks, execute_steps=None):
    """Mean cosine similarity of velocity direction across chunk boundaries.

    Args:
        chunks: numpy array with shape (N_chunks, Chunk_Length, Joint_Dim)
        execute_steps: number of executed steps per chunk; if None, use full chunk length
    Returns:
        float: Mean cosine similarity
    Raises:
        ValueError: if fewer than two steps are executed per chunk.
    """
    arr = np.array(chunks)
    n_exec = arr.shape[1] if execute_steps is None else execute_steps
    last = n_exec - 1
    if last < 1:
        raise ValueError("execute_steps must be >= 2 to compute end velocity")

    prev_chunks, next_chunks = arr[:-1], arr[1:]
    # Finite-difference velocity over the final executed step of the previous chunk.
    v_out = prev_chunks[:, last, :] - prev_chunks[:, last - 1, :]
    # Finite-difference velocity over the first step of the following chunk.
    v_in = next_chunks[:, 1, :] - next_chunks[:, 0, :]

    numerator = np.sum(v_out * v_in, axis=-1)
    # The small constant keeps the ratio finite when either velocity is zero.
    denominator = np.linalg.norm(v_out, axis=-1) * np.linalg.norm(v_in, axis=-1) + 1e-8
    return np.mean(numerator / denominator)
+```
+### Fixing Stop-and-Go
+**Stop-and-go** here refers to the robot periodically pausing and then resuming during motion instead of moving continuously.
+#### Root Cause
+In **synchronous single-step closed-loop** control, stop-and-go occurs when the **end-to-end latency** (observation capture → VLA inference → action conversion) exceeds control-frequency requirements.
+- **Control-frequency requirement**: At 30 FPS, latency must stay below ~33 ms.
+- **Typical latency sources**: Data capture, network transfer, model inference, and post-processing often exceed 33 ms combined.
+- **Consequence**: The next prediction is not ready when the current action finishes, causing pauses.
+#### Solutions
+**Option 1: Optimize the inference pipeline (direct but difficult)**
+Reduce full workflow latency below 33 ms:
+- Optimize network bandwidth (reduce transfer time).
+- Use edge inference (reduce network latency).
+- Quantize the VLA model (speed up inference).
+- Use a smaller model (e.g., ACT).
+**Limitation**: For VLA models, meeting strict real-time requirements through optimization alone is often impractical.
+**Option 2: Use algorithmic scheduling strategies (recommended)**
+When direct optimization is insufficient, use one or more of the following:
+- **Asynchronous Inference**: A background thread runs inference while the main thread executes actions.
+- **Receding Horizon**: Execute only the first few steps of each `Action Chunk` before triggering a new inference.
+- **Temporal Ensemble**: Aggregate predictions across multiple timesteps.
+- **Real-Time Chunking (RTC)**: Overlap the start of the current prediction with unexecuted steps from the previous one.
+**Recommended strategy**: `Asynchronous Inference + RTC` is usually the most effective.
+#### Real-Time Chunking (RTC) Details
+**Principle**
+RTC treats action prediction as an inpainting problem: overlapping the start of the new prediction with unexecuted steps from the previous one ensures smooth transitions.
+**Applicability**
+- Validated for **diffusion / flow-based** VLA policies.
+- Requires `Action Chunk` length ≥ 32 steps.
+- Should be combined with asynchronous inference.
+**Implementation essentials**
+1. **Predict longer Action Chunks**:
+   - Increase from the default 16 steps to at least 32.
+   - Provide a larger soft fusion window.
+2. **Asynchronous inference architecture**:
+   - **Background thread**: Continuously capture observations, run inference, and prepare action batches.
+   - **Main thread**: Execute the current action sequence.
+   - Buffer predictions in a queue to avoid blocking.
+3. **Action fusion mechanism**:
+   - Use RTC for soft fusion in the overlap region.
+   - Ensure smooth transitions between adjacent chunks.
+**Pseudocode: Async Inference + RTC**
+In the RTC (Real-Time Chunking) framework, two key parameters control how adjacent action chunks overlap and transition:
+- **`overlap`**: The number of action steps retained from the previous prediction to constrain the current one, ensuring temporal consistency between consecutive chunks.
+- **`frozen`**: The number of steps that remain completely frozen (i.e., not updated by the new prediction), typically set to match the inference latency.
+Below is a simplified async inference + RTC loop. Note that official RTC support for GR00T is coming soon; the current implementation may require manual adaptation.
+```
+actions = policy.infer(obs)                        # blocking first call
+loop:
+    for i in range(action_horizon):
+        if i == action_horizon - overlap - 1:
+            future = async policy.infer(new_obs)   # non-blocking
+        robot.execute(actions[i])
+        if i == action_horizon - frozen - 1:
+            actions = future.get()                 # swap in next chunk
+            break                                  # discard frozen tail
+```

gr00t/__init__.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+def _patch_hf_local_first() -> None:
+    """Patch ``from_pretrained`` to prefer the local HF snapshot cache over network calls.
+    When a HF repo ID is passed we try ``snapshot_download(local_files_only=True)``
+    first; if the model is not cached we fall through to the normal download path.
+    This avoids 429 rate-limit errors when many CI jobs run concurrently.
+    Covers: PreTrainedModel, PretrainedConfig, ProcessorMixin, AutoConfig,
+    AutoProcessor (each is patched only if it defines ``from_pretrained`` itself).
+    Triggered by GROOT_HF_LOCAL_FIRST (set by conftest.py, survives uv run) or
+    PYTEST_CURRENT_TEST (set automatically by pytest).
+    The patch is idempotent: calling this function again leaves already-patched
+    classes untouched. Any failure while importing or patching transformers is
+    swallowed on purpose, because this is a best-effort CI convenience.
+    """
+    def _resolve(name_or_path: str, revision=None, cache_dir=None) -> str:
+        """Return a local directory for ``name_or_path`` when one is available.
+        ``revision`` and ``cache_dir`` are forwarded to ``snapshot_download`` so the
+        cache lookup targets the same snapshot the caller asked for. On any
+        failure (not cached, hub library missing, ...) the input is returned
+        unchanged so the regular download path runs.
+        """
+        hf_home = os.environ.get("HF_HOME")
+        hf_hub = os.environ.get("HUGGINGFACE_HUB_CACHE")
+        hf_cache_info = f"HF_HOME={hf_home} HUGGINGFACE_HUB_CACHE={hf_hub}"
+        if os.path.isdir(name_or_path):
+            print(f"[groot/hf] local path: {name_or_path} | {hf_cache_info}", flush=True)
+            return name_or_path
+        try:
+            from huggingface_hub import snapshot_download
+            resolved = snapshot_download(
+                name_or_path,
+                revision=revision,
+                cache_dir=cache_dir,
+                local_files_only=True,
+            )
+            print(
+                f"[groot/hf] cache hit: {name_or_path} -> {resolved} | {hf_cache_info}", flush=True
+            )
+            return resolved
+        except Exception:
+            print(
+                f"[groot/hf] cache miss (will download): {name_or_path} | {hf_cache_info}",
+                flush=True,
+            )
+            return name_or_path
+    def _wrap(cls: type) -> None:
+        """Replace ``cls.from_pretrained`` with a cache-first wrapper (at most once)."""
+        if "from_pretrained" not in cls.__dict__:
+            return
+        original = cls.from_pretrained
+        if getattr(original, "_groot_hf_local_patched", False):
+            return
+        def patched(klass, pretrained_model_name_or_path, *args, **kwargs):
+            resolved = pretrained_model_name_or_path
+            # Only strings / path-likes can name a repo or directory; anything else
+            # (e.g. None) is handed to the original implementation untouched.
+            if isinstance(pretrained_model_name_or_path, (str, os.PathLike)):
+                resolved = _resolve(
+                    str(pretrained_model_name_or_path),
+                    revision=kwargs.get("revision"),
+                    cache_dir=kwargs.get("cache_dir"),
+                )
+            return original.__func__(klass, resolved, *args, **kwargs)
+        # The marker must live on the plain function: ``cls.from_pretrained`` yields a
+        # bound method, and bound methods forward attribute lookups to ``__func__``,
+        # not to the ``classmethod`` wrapper object.
+        patched._groot_hf_local_patched = True  # type: ignore[attr-defined]
+        cls.from_pretrained = classmethod(patched)  # type: ignore[assignment]
+    try:
+        import transformers as _transformers
+        for _attr in (
+            "PreTrainedModel",
+            "PretrainedConfig",
+            "ProcessorMixin",
+            "AutoConfig",
+            "AutoProcessor",
+        ):
+            _cls = getattr(_transformers, _attr, None)
+            if _cls is not None:
+                _wrap(_cls)
+    except Exception:
+        # Best-effort: never let the CI-only patch break importing the package.
+        pass
+def _patch_mistral() -> None:
+    """Make the transformers mistral-regex tokenizer hook tolerant of Hub failures.
+    transformers calls model_info() inside a nested is_base_mistral() function
+    unconditionally even when loading from a fully local checkpoint. Qwen3VL /
+    Cosmos is never Mistral, so handing the tokenizer back unchanged whenever the
+    hook raises (429, connection error, ...) is correct.
+    NOTE: is_base_mistral is nested inside _patch_mistral_regex and therefore not
+    reachable as a module attribute, so the classmethod itself is wrapped.
+    Triggered by GROOT_PATCH_MISTRAL (set by conftest.py, survives uv run) or
+    PYTEST_CURRENT_TEST (set automatically by pytest, belt-and-suspenders).
+    Any error while installing the wrapper is ignored; an already wrapped hook is
+    left as is.
+    """
+    try:
+        import transformers.tokenization_utils_base as _tub
+        tokenizer_base = _tub.PreTrainedTokenizerBase
+        unpatched = tokenizer_base._patch_mistral_regex.__func__
+        if not getattr(unpatched, "_groot_patched", False):
+            def _tolerant(cls, tokenizer, pretrained_model_name_or_path, **kwargs):
+                try:
+                    outcome = unpatched(cls, tokenizer, pretrained_model_name_or_path, **kwargs)
+                except Exception:
+                    # Network trouble inside the hook: keep the tokenizer untouched.
+                    outcome = tokenizer
+                return outcome
+            # Flag the raw function so a later call recognises the wrapper via __func__.
+            _tolerant._groot_patched = True  # type: ignore[attr-defined]
+            tokenizer_base._patch_mistral_regex = classmethod(_tolerant)
+    except Exception:
+        return
+# Opt-in hooks: each patch runs under pytest or when its dedicated env var is set.
+if os.getenv("PYTEST_CURRENT_TEST") or os.getenv("GROOT_HF_LOCAL_FIRST"):
+    _patch_hf_local_first()
+if os.getenv("PYTEST_CURRENT_TEST") or os.getenv("GROOT_PATCH_MISTRAL"):
+    _patch_mistral()

gr00t/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

gr00t/configs/base_config.py ADDED Viewed

	@@ -0,0 +1,150 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, field
+import json
+from pathlib import Path
+from typing import List, Optional
+import yaml
+from gr00t.data.types import ActionConfig, ActionFormat, ActionRepresentation, ActionType
+from .data.data_config import DataConfig, SingleDatasetConfig
+from .model import create_model_union_type
+from .model.gr00t_n1d7 import Gr00tN1d7Config
+from .training.training_config import TrainingConfig
+# Union of every registered model config type (None when nothing is registered).
+ModelUnionType = create_model_union_type()
+@dataclass
+class Config:
+    """Complete configuration: model, data and training settings.
+    ``save`` dumps the dataclass instance itself with ``yaml.dump``, so reading
+    it back needs the full ``yaml.Loader``, which can construct arbitrary Python
+    objects. Only load configuration files from trusted sources.
+    """
+    load_config_path: Optional[str] = None
+    model: ModelUnionType = field(default_factory=lambda: Gr00tN1d7Config())
+    data: DataConfig = field(default_factory=DataConfig)
+    training: TrainingConfig = field(default_factory=TrainingConfig)
+    def save(self, path: Path):
+        """Save configuration to a YAML file, creating parent directories if needed."""
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "w") as f:
+            yaml.dump(self, f)
+    def load(self, path: Path):
+        """Load configuration from a YAML file.
+        A mapping is applied onto this instance via ``load_dict`` (used for
+        training); a serialised ``Config`` object is returned as-is. Callers must
+        use the return value, since it is not always ``self``.
+        Raises:
+            ValueError: if the file contains neither a mapping nor a ``Config``.
+        """
+        # NOTE(security): yaml.Loader is unsafe on untrusted input (see class docstring).
+        data = yaml.load(Path(path).read_text(), Loader=yaml.Loader)
+        if isinstance(data, dict):  # for training
+            return self.load_dict(data)
+        if isinstance(data, self.__class__):
+            return data
+        raise ValueError(f"Invalid config file: {path}")
+    def load_dict(self, data: dict):
+        """Overwrite the ``model`` / ``data`` / ``training`` sections present in ``data``.
+        Sections missing from ``data`` keep their current value. Returns ``self``.
+        """
+        if "model" in data:
+            # Rebuild with the class of the current model config.
+            self.model = self.model.__class__(**data["model"])
+        if "data" in data:
+            self.data = DataConfig(**data["data"])
+            # Ensure nested datasets are converted to dataclass instances
+            self.data.datasets = [
+                SingleDatasetConfig(**ds) if isinstance(ds, dict) else ds
+                for ds in self.data.datasets
+            ]
+        if "training" in data:
+            self.training = TrainingConfig(**data["training"])
+        return self
+    @classmethod
+    def from_pretrained(cls, path: Path) -> "Config":
+        """Load a configuration object from a YAML file and return it unchanged."""
+        # NOTE(security): yaml.Loader is unsafe on untrusted input (see class docstring).
+        data = yaml.load(Path(path).read_text(), Loader=yaml.Loader)
+        return data
+    def get_deepspeed_config(self) -> dict:
+        """Return the bundled DeepSpeed JSON config for ``training.deepspeed_stage``.
+        Raises:
+            ValueError: if the stage is neither 2 nor 3.
+        """
+        stage = self.training.deepspeed_stage
+        gr00t_dir = Path(__file__).parent.parent
+        if stage == 2:
+            config_path = gr00t_dir / "configs/deepspeed/zero2_config.json"
+        elif stage == 3:
+            config_path = gr00t_dir / "configs/deepspeed/zero3_config.json"
+        else:
+            raise ValueError(f"Invalid DeepSpeed stage: {stage}")
+        # Context manager so the file handle is closed deterministically.
+        with open(config_path) as f:
+            return json.load(f)
+    def validate(self):
+        """Validate the configuration and normalise the data section in place.
+        Besides checking, this restricts ``data.modality_configs`` to the
+        embodiment tags used by ``data.datasets`` and fills default action
+        configs for modality configs that define none.
+        Raises:
+            ValueError: on an empty embodiment tag for a physical-embodiment
+                dataset, an unregistered embodiment tag, a non-positive
+                ``mix_ratio`` sum, or fp16 and bf16 both enabled.
+        """
+        # Dataset paths are deliberately not checked for existence because
+        # caching PDX data sources is supported.
+        embodiment_tags = set()
+        for d_cfg in self.data.datasets:
+            if d_cfg.dataset_type == "physical_embodiment" and not d_cfg.embodiment_tag:
+                raise ValueError(f"Embodiment tag is empty for dataset {d_cfg.dataset_paths}")
+            if d_cfg.embodiment_tag is not None:
+                embodiment_tags.add(d_cfg.embodiment_tag)
+        stripped_modality_configs = {}
+        for embodiment_tag in embodiment_tags:
+            modality_cfg = self.data.modality_configs.get(embodiment_tag)
+            if modality_cfg is None:
+                raise ValueError(
+                    f"No modality config registered for embodiment tag '{embodiment_tag}'. "
+                    f"Available tags: {sorted(self.data.modality_configs.keys())}. "
+                    f"Provide --modality-config-path to register a custom modality config, "
+                    f"or use one of the pre-registered tags."
+                )
+            stripped_modality_configs[embodiment_tag] = modality_cfg
+        self.data.modality_configs = stripped_modality_configs
+        # ensure mix ratios are valid
+        total_ratio = sum(d.mix_ratio for d in self.data.datasets)
+        if total_ratio <= 0:
+            raise ValueError("Sum of mix_ratio must be greater than zero")
+        # Fill in default values for action representation, type and format
+        for modality_cfg in self.data.modality_configs.values():
+            action_cfg = modality_cfg["action"]
+            if action_cfg.action_configs is None:
+                # One distinct ActionConfig per key, so entries are not aliases of each other.
+                action_cfg.action_configs = [
+                    ActionConfig(
+                        rep=ActionRepresentation.ABSOLUTE,
+                        type=ActionType.NON_EEF,
+                        format=ActionFormat.DEFAULT,
+                    )
+                    for _ in action_cfg.modality_keys
+                ]
+        # Validate precision settings
+        if self.training.fp16 and self.training.bf16:
+            raise ValueError("Cannot use both fp16 and bf16")
+def get_default_config() -> Config:
+    """Build a ``Config`` in which every field takes its declared default."""
+    default_config = Config()
+    return default_config

gr00t/configs/data/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

gr00t/configs/data/data_config.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, field
+from typing import Any, List, Optional
+from gr00t.data.types import ModalityConfig
+from .embodiment_configs import MODALITY_CONFIGS
+@dataclass
+class SingleDatasetConfig:
+    """Configuration for a single dataset entry in a (possibly mixed) training setup.
+    A list of these objects is supplied in ``DataConfig.datasets``; several
+    entries can be mixed at arbitrary ratios via ``mix_ratio``.
+    """
+    # Dataset root location(s) for this entry (items can be strings or dicts for complex configs)
+    dataset_paths: List[Any]
+    # Robot embodiment identifier (e.g. "gr1", "franka")
+    embodiment_tag: Optional[str] = None
+    # Relative sampling probability (will be normalised across the list)
+    mix_ratio: float = 1.0
+    # Kind of dataset; "physical_embodiment" entries must carry an embodiment_tag
+    # (enforced by Config.validate in gr00t/configs/base_config.py)
+    dataset_type: str = "physical_embodiment"
+    # Optional validation dataset path for open-loop evaluation
+    # If not provided, falls back to dataset_paths for evaluation
+    val_dataset_path: Optional[str] = None
+@dataclass
+class DataConfig:
+    """Dataset configuration (supports single or multiple datasets)."""
+    # Leave empty by default for backwards-compatibility with the original
+    # single-dataset workflow.  Users can supply one or more configs via CLI or
+    # YAML when they need mixing.
+    datasets: List[SingleDatasetConfig] = field(default_factory=list)
+    # Modality configs
+    # There are three sources of modality configs:
+    # 1. Default modality configs in code: gr00t/configs/data/embodiment_configs.py
+    # 2. Modality configs supplied through command line: --data.modality_configs (although rare and inconvenient)
+    # 1 and 2 are unified through `config.data.modality_configs`.
+    # 3. modality configs saved in the pretrained checkpoint.
+    # NOTE: the default factory returns the module-level MODALITY_CONFIGS dict
+    # itself (not a copy), so in-place edits to this field also change the registry.
+    modality_configs: dict[str, dict[str, ModalityConfig]] = field(
+        default_factory=lambda: MODALITY_CONFIGS
+    )
+    # Sharded dataset configuration
+    download_cache: bool = False
+    shard_size: int = 2**10
+    episode_sampling_rate: float = 0.1
+    num_shards_per_epoch: int = int(1e5)
+    # Override statistics from the pretrained checkpoint
+    override_pretraining_statistics: bool = True
+    # General task / mode config (shared across datasets)
+    mode: str = "single_turn"
+    random_chop: float = 0.0
+    mock_dataset_mode: bool = False  # if True, cache the first datapoint of each dataset and always return one of them to simulate best-case dataloading
+    # Data loading
+    shuffle: bool = True
+    seed: int = 42
+    multiprocessing_context: str = "fork"  # Options: "fork", "spawn", and "forkserver"
+    allow_padding: bool = False
+    # Subsample ratio for the dataset
+    subsample_ratio: float = 1.0
+    # DP Image Config
+    # NOTE(review): crop size is 244 while target size is 224 - confirm 244 is intended.
+    image_crop_size: List[int] = field(default_factory=lambda: [244, 244])
+    image_target_size: List[int] = field(default_factory=lambda: [224, 224])
+    video_backend: str = "torchcodec"

gr00t/configs/data/embodiment_configs.py ADDED Viewed

	@@ -0,0 +1,208 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from gr00t.data.embodiment_tags import EmbodimentTag
+from gr00t.data.types import (
+    ActionConfig,
+    ActionFormat,
+    ActionRepresentation,
+    ActionType,
+    ModalityConfig,
+)
+# Registry of modality configs, keyed by embodiment tag string. Each entry maps a
+# modality name ("video", "state", "action", "language") to a ModalityConfig
+# listing the keys to read and the delta_indices to sample for that modality.
+# Entries whose "action" config omits action_configs rely on Config.validate()
+# (gr00t/configs/base_config.py), which fills in ABSOLUTE / NON_EEF / DEFAULT
+# for every action key when action_configs is None.
+MODALITY_CONFIGS = {
+    ##### Pre-registered pretrain configurations #####
+    "oxe_droid_relative_eef_relative_joint": {
+        "video": ModalityConfig(
+            delta_indices=[-15, 0],
+            modality_keys=["exterior_image_1_left", "wrist_image_left"],
+        ),
+        "state": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["eef_9d", "gripper_position", "joint_position"],
+        ),
+        "action": ModalityConfig(
+            delta_indices=list(range(40)),
+            modality_keys=["eef_9d", "gripper_position", "joint_position"],
+            # One ActionConfig per entry of modality_keys, in the same order.
+            action_configs=[
+                ActionConfig(
+                    rep=ActionRepresentation.RELATIVE,
+                    type=ActionType.EEF,
+                    format=ActionFormat.XYZ_ROT6D,
+                    state_key="eef_9d",
+                ),
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                    state_key="gripper_position",
+                ),
+                ActionConfig(
+                    rep=ActionRepresentation.RELATIVE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                    state_key="joint_position",
+                ),
+            ],
+        ),
+        "language": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["annotation.language.language_instruction"],
+        ),
+    },
+    ##### Pre-registered posttrain configurations #####
+    "unitree_g1_full_body_with_waist_height_nav_cmd": {
+        "video": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["ego_view"],
+        ),
+        "state": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=[
+                "left_leg",
+                "right_leg",
+                "waist",
+                "left_arm",
+                "right_arm",
+                "left_hand",
+                "right_hand",
+            ],
+        ),
+        "action": ModalityConfig(
+            delta_indices=list(range(50)),
+            modality_keys=[
+                "left_arm",
+                "right_arm",
+                "left_hand",
+                "right_hand",
+                "waist",
+                "base_height_command",
+                "navigate_command",
+            ],
+            # One ActionConfig per entry of modality_keys, in the same order.
+            action_configs=[
+                # left_arm
+                ActionConfig(
+                    rep=ActionRepresentation.RELATIVE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # right_arm
+                ActionConfig(
+                    rep=ActionRepresentation.RELATIVE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # left_hand
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,  # G1 hand is controlled by binary signals like a gripper
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # right_hand
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,  # G1 hand is controlled by binary signals like a gripper
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # waist
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # base_height_command
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+                # navigate_command
+                ActionConfig(
+                    rep=ActionRepresentation.ABSOLUTE,
+                    type=ActionType.NON_EEF,
+                    format=ActionFormat.DEFAULT,
+                ),
+            ],
+        ),
+        "language": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["annotation.human.task_description"],
+        ),
+    },
+    "libero_sim": {
+        "video": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["image", "wrist_image"],
+        ),
+        "state": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
+        ),
+        "action": ModalityConfig(
+            delta_indices=list(range(16)),
+            modality_keys=["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
+        ),
+        "language": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["annotation.human.action.task_description"],
+        ),
+    },
+    "simpler_env_widowx": {
+        "video": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["image_0"],
+        ),
+        "state": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["x", "y", "z", "roll", "pitch", "yaw", "pad", "gripper"],
+        ),
+        "action": ModalityConfig(
+            delta_indices=list(range(8)),
+            modality_keys=["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
+        ),
+        "language": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["annotation.human.action.task_description"],
+        ),
+    },
+    "simpler_env_google": {
+        "video": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["image"],
+        ),
+        # NOTE(review): state uses rx/ry/rz/rw while action uses roll/pitch/yaw -
+        # presumably quaternion state vs. Euler action; confirm against the dataset.
+        "state": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["x", "y", "z", "rx", "ry", "rz", "rw", "gripper"],
+        ),
+        "action": ModalityConfig(
+            delta_indices=list(range(8)),
+            modality_keys=["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
+        ),
+        "language": ModalityConfig(
+            delta_indices=[0],
+            modality_keys=["annotation.human.action.task_description"],
+        ),
+    },
+}
+def register_modality_config(
+    config: dict, embodiment_tag: EmbodimentTag = EmbodimentTag.NEW_EMBODIMENT
+):
+    """Register ``config`` in ``MODALITY_CONFIGS`` under ``embodiment_tag``.
+    Args:
+        config: Mapping from modality name to its ``ModalityConfig``.
+        embodiment_tag: An ``EmbodimentTag`` member, or the plain string used as
+            the registry key.
+    Raises:
+        AssertionError: if the tag is already registered. Raised explicitly (not
+            via ``assert``) so the duplicate check also runs under ``python -O``,
+            while callers still see the same exception type as before.
+    """
+    # Registry keys are strings: use the enum's value, or the string itself.
+    tag_key = getattr(embodiment_tag, "value", embodiment_tag)
+    if tag_key in MODALITY_CONFIGS:
+        raise AssertionError(f"Embodiment tag {embodiment_tag} already registered")
+    MODALITY_CONFIGS[tag_key] = config

gr00t/configs/deepspeed/zero2_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "checkpoint": {
+          "load_universal": false
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "communication_data_type": "bf16",
+    "zero_optimization": {
+        "stage": 2,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "allgather_partitions": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": 1e8,
+        "allgather_bucket_size": 1e8
+    }
+  }

gr00t/configs/deepspeed/zero3_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1e9,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1e9,
+      "stage3_max_reuse_distance": 1e9,
+      "stage3_gather_16bit_weights_on_model_save": true
+    }
+  }

gr00t/configs/finetune_config.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Finetune config used for single node post-training.
+from dataclasses import dataclass
+@dataclass
+class FinetuneConfig:
+    """
+    Configuration for fine-tuning a Vision-Language-Action (VLA) model.
+    This dataclass defines all parameters needed to launch a fine-tuning job
+    on a pretrained base model using a custom dataset and embodiment-specific
+    modality configuration. It controls model tuning options, data augmentation,
+    and training hyperparameters.
+    Only ``base_model_path``, ``dataset_path`` and ``embodiment_tag`` are
+    required; every other field has a default. Each field is described by the
+    string literal that follows it.
+    """
+    # --- Data and Model Paths ---
+    base_model_path: str
+    """Path to the pretrained base model checkpoint (e.g., Hugging Face model hub or local directory)."""
+    dataset_path: str
+    """Path to the dataset root directory containing trajectory data for fine-tuning."""
+    embodiment_tag: str
+    """Embodiment tag (name or value, case-insensitive). See EmbodimentTag for known tags."""
+    modality_config_path: str | None = None
+    """
+    Path to a Python file defining the modality configuration for the given embodiment.
+    If None, use the pre-registered modality config in `gr00t/configs/data/embodiment_configs.py`.
+    """
+    # --- Model Tuning Flags ---
+    tune_llm: bool = False
+    """If True, fine-tune the language model (LLM) backbone during training."""
+    tune_visual: bool = False
+    """If True, fine-tune the visual encoder (e.g., ViT or CNN backbone)."""
+    tune_projector: bool = True
+    """If True, fine-tune the multimodal projector layers that map vision/language features to a shared space."""
+    tune_diffusion_model: bool = True
+    """If True, fine-tune the diffusion-based action decoder (if present in the model)."""
+    state_dropout_prob: float = 0.2
+    """
+    Dropout probability applied to state inputs for regularization during training.
+    """
+    # --- Data Augmentation ---
+    random_rotation_angle: int | None = None
+    """Maximum rotation angle (in degrees) for random rotation augmentation of input images."""
+    color_jitter_params: dict[str, float] | None = None
+    """
+    Parameters for color jitter augmentation on images.
+    Expected keys include:
+      - "brightness": float
+      - "contrast": float
+      - "saturation": float
+      - "hue": float
+    Example: {"brightness": 0.4, "contrast": 0.4, "saturation": 0.4, "hue": 0.1}
+    If None, applying the default color jitter augmentation from the pretrained model.
+    """
+    extra_augmentation_config: str | None = None
+    """
+    JSON string for extra image augmentations (mask-based and others).
+    Expected keys include:
+      - "background_noise_transforms": list of dicts for noise on mask regions
+          - "target_mask_values": list of int (e.g., [0])
+          - "p": float (probability of applying)
+      - "masked_region_transforms": list of dicts for color tint on mask regions
+          - "target_mask_values": list of int (e.g., [4] or [5])
+          - "p": float (probability of applying)
+          - "alpha_range": [min, max] for random_tint intensity
+    Example: {"background_noise_transforms": [{"target_mask_values": [0], "p": 0.9}],
+              "masked_region_transforms": [{"target_mask_values": [4], "p": 1.0, "alpha_range": [0, 1]}]}
+    If None, no extra augmentations are applied.
+    """
+    # --- Training Configuration ---
+    global_batch_size: int = 64
+    """Total effective batch size across all GPUs and accumulation steps."""
+    dataloader_num_workers: int = 2
+    """Number of parallel worker processes used for data loading."""
+    learning_rate: float = 1e-4
+    """Initial learning rate for optimizer."""
+    # NOTE(review): presumably gradients are accumulated over this many batches
+    # before each optimizer update - confirm against the trainer.
+    gradient_accumulation_steps: int = 1
+    """Number of forward passes to accumulate before performing a backward/update step."""
+    output_dir: str = "./outputs"
+    """Directory where model checkpoints, logs, and outputs are saved."""
+    experiment_name: str | None = None
+    """Optional experiment name used as the W&B run name. Defaults to the output directory basename."""
+    wandb_project: str = "finetune-gr00t-n1d7"
+    """W&B project name to log runs to."""
+    save_steps: int = 1000
+    """Frequency (in training steps) at which to save checkpoints."""
+    save_total_limit: int = 5
+    """Maximum number of checkpoints to keep before older ones are deleted."""
+    num_gpus: int = 1
+    """Number of GPUs available for distributed or single-node training."""
+    # NOTE(review): the description below names a fixed project, but the project
+    # is configurable through `wandb_project` above (same default value).
+    use_wandb: bool = False
+    """
+    If True, log metrics and artifacts to Weights & Biases (wandb).
+    The project is `finetune-gr00t-n1d7`.
+    You need to login to wandb to view the logs.
+    """
+    max_steps: int = 10000
+    """Total number of training steps to run before stopping."""
+    weight_decay: float = 1e-5
+    """Weight decay coefficient for optimizer (L2 regularization)."""
+    warmup_ratio: float = 0.05
+    """Proportion of total training steps used for learning rate warm-up."""
+    # The three sharding defaults below match those of DataConfig in
+    # gr00t/configs/data/data_config.py.
+    shard_size: int = 2**10
+    """Size of the shard to use for the dataset during preloading."""
+    episode_sampling_rate: float = 0.1
+    """Sampling rate for the episodes."""
+    num_shards_per_epoch: int = int(1e5)
+    """Number of shards to use for the dataset. reduce this number if vram is limited."""
+    save_only_model: bool = False
+    """If True, save only model weights (skip optimizer/scheduler/RNG states). Cannot resume training from these checkpoints."""
+    skip_weight_loading: bool = False
+    """If True, skip loading model weights from base_model_path (architecture only).
+    The processor (tokenizer/config) is still loaded from base_model_path.
+    Useful for CI/testing to skip the slow checkpoint shard loading."""

gr00t/configs/model/__init__.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from pathlib import Path
+import typing
+import tyro
+# Registry of model config classes, keyed by the short name used for CLI subcommands.
+MODEL_CONFIG_TYPES: dict[str, type] = {}
+def register_model_config(shortname: str, configtype: type):
+    """Record ``configtype`` under ``shortname``, replacing any earlier entry with that name."""
+    MODEL_CONFIG_TYPES.update({shortname: configtype})
+# Import every public module in this package so that each one can register its
+# config type as an import side effect. The files are sorted because glob() order
+# depends on the filesystem; sorting keeps the registration order (and therefore
+# the order of the generated Union) the same on every machine.
+for file in sorted(Path(__file__).parent.glob("*.py")):
+    if file.stem.startswith("_"):
+        continue
+    try:
+        importlib.import_module(f".{file.stem}", __name__)
+    except KeyboardInterrupt:
+        raise
+    except Exception as e:
+        # Best-effort: one broken config module must not prevent the others from loading.
+        print(f"Error importing module gr00t.configs.model.{file.stem}: {e}")
def create_model_union_type():
    """Build a Union of all registered model config types.

    Each entry of ``MODEL_CONFIG_TYPES`` is wrapped as
    ``Annotated[config_type, tyro.conf.subcommand(shortname)]`` and the
    wrapped types are combined into a single ``typing.Union``.

    Returns:
        ``None`` when nothing has been registered (a Union of no types is
        invalid); otherwise the Union of the annotated types. With a single
        registered type, the Union collapses to that one annotated type.
    """
    if not MODEL_CONFIG_TYPES:
        # A Union of no types is invalid, so just return None
        return None
    annotated_types = tuple(
        typing.Annotated[model_type, tyro.conf.subcommand(model_shortname)]
        for model_shortname, model_type in MODEL_CONFIG_TYPES.items()
    )
    # Subscripting with a tuple is the supported way to build a Union
    # dynamically; calling ``Union.__getitem__`` as a plain function depends on
    # how typing implements its special forms.
    return typing.Union[annotated_types]

gr00t/configs/model/gr00t_n1d7.py ADDED Viewed

	@@ -0,0 +1,179 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import MISSING, asdict, dataclass, field, is_dataclass
+from enum import Enum
+import json
+from pathlib import Path
+import torch
+from transformers import PretrainedConfig
+from . import register_model_config
@dataclass
class Gr00tN1d7Config(PretrainedConfig):
    """Unified configuration for Gr00tN1d7 model with backbone and action head.

    Gr00tN1d7 uses the Cosmos-Reason2-2B (Qwen3-VL architecture) VLM backbone,
    replacing the Eagle backbone used in Gr00tN1d6.

    The class is decorated with ``@dataclass`` for its field declarations, but
    it defines its own ``__init__`` (which the decorator does not replace):
    all values are passed as keyword arguments and forwarded to
    ``PretrainedConfig``.
    """

    # Model identification
    model_type: str = "Gr00tN1d7"
    model_dtype: str = "bfloat16"  # Use bfloat16 for Flash Attention compatibility

    # Backbone configuration
    model_name: str = "nvidia/Cosmos-Reason2-2B"
    backbone_model_type: str = "qwen"
    model_revision: str | None = None
    tune_top_llm_layers: int = 0  # Number of top LLM layers to tune
    backbone_embedding_dim: int = 2048  # project_to_dim; must match Cosmos-Reason2-2B hidden size
    tune_llm: bool = False
    tune_visual: bool = False
    select_layer: int = 12
    reproject_vision: bool = False
    use_flash_attention: bool = True
    load_bf16: bool = False  # Enable BF16 loading
    backbone_trainable_params_fp32: bool = True

    # Processing parameters
    image_crop_size: tuple[int, int] | None = (230, 230)
    image_target_size: tuple[int, int] | None = (256, 256)
    shortest_image_edge: int | None = None
    crop_fraction: float | None = None
    random_rotation_angle: int | None = None
    color_jitter_params: dict[str, float] | None = None
    use_albumentations_transforms: bool = True
    # Extra augmentation config (mask-based and others).
    extra_augmentation_config: dict | None = None
    formalize_language: bool = True
    # Global flag to enable per-embodiment sin/cos encoding
    apply_sincos_state_encoding: bool = False
    use_percentiles: bool = True
    use_relative_action: bool = False

    # Action head configuration parameters
    max_state_dim: int = 132  # Default from state_shape
    max_action_dim: int = 132  # Default from action_shape
    action_horizon: int = 40
    hidden_size: int = 1024
    input_embedding_dim: int = 1536
    # State history: number of consecutive state timesteps fed to the state encoder
    state_history_length: int = 1

    # Global parameters
    add_pos_embed: bool = True
    attn_dropout: float = 0.2
    use_vlln: bool = True
    max_seq_len: int = 1024
    use_alternate_vl_dit: bool = True  # True for AlternateVLDiT, False for DiT
    attend_text_every_n_blocks: int = 2
    # Built per instance through default_factory so instances never share one dict.
    diffusion_model_cfg: dict = field(
        default_factory=lambda: {
            "positional_embeddings": None,
            "num_layers": 16,
            "num_attention_heads": 32,
            "attention_head_dim": 48,
            "norm_type": "ada_norm",
            "dropout": 0.2,
            "final_dropout": True,
            "output_dim": 1024,
            "interleave_self_attention": True,
        }
    )

    # Flow matching parameters
    num_inference_timesteps: int = 4
    noise_beta_alpha: float = 1.5
    noise_beta_beta: float = 1.0
    noise_s: float = 0.999
    num_timestep_buckets: int = 1000

    # Training parameters
    tune_projector: bool = True
    tune_diffusion_model: bool = True
    tune_vlln: bool = True

    # State augmentation parameters
    state_dropout_prob: float = 0.8  # State dropout probability
    exclude_state: bool = False  # Zero out all state inputs (ablation)
    use_mean_std: bool = False  # Use mean/std normalization instead of min/max

    # Multi-embodiment parameters
    max_num_embodiments: int = 32

    def __init__(self, **kwargs):
        """Create the config from keyword overrides.

        Args:
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``;
                every entry is additionally set as an attribute on the
                instance.
        """
        super().__init__(**kwargs)
        for key, value in kwargs.items():
            setattr(self, key, value)
        # Backfill declared defaults for any field that still cannot be read
        # from the instance. This matters for default_factory fields, which
        # have no class-level value to fall back on, and guards against
        # PretrainedConfig / dataclass interplay skipping normal default
        # injection.
        for f in self.__dataclass_fields__.values():
            if hasattr(self, f.name):
                continue
            if f.default is not MISSING:
                setattr(self, f.name, f.default)
            elif f.default_factory is not MISSING:
                setattr(self, f.name, f.default_factory())

    def to_filtered_dict(self, exclude_augment: bool = True) -> dict:
        """Return a dictionary representation of this config.

        Args:
            exclude_augment: When True, drop the image-processing and
                augmentation keys listed below from the result.

        Returns:
            A new dict; built with ``dataclasses.asdict`` when the instance is
            a dataclass, otherwise a shallow copy of ``self.__dict__``.
        """
        if is_dataclass(self):
            cfg = asdict(self)
        else:
            cfg = dict(self.__dict__)
        if exclude_augment:
            exclude_keys = {
                "random_rotation_angle",
                "color_jitter_params",
                "use_albumentations_transforms",
                "formalize_language",
                "image_crop_size",
                "image_target_size",
                "shortest_image_edge",
                "crop_fraction",
            }
            cfg = {k: v for k, v in cfg.items() if k not in exclude_keys}
        return cfg

    def to_filtered_json(self, exclude_augment: bool = True, **kwargs) -> str:
        """Return a JSON string of this config.

        Args:
            exclude_augment: Passed through to ``to_filtered_dict``.
            **kwargs: Extra options for ``json.dumps``. ``indent`` defaults to
                2 and ``default`` to the fallback encoder below; both can be
                overridden here.

        Returns:
            The JSON text of the filtered config dictionary.
        """

        def default(o):
            # Fallback encoder for values json cannot serialise natively.
            if isinstance(o, (Path, torch.dtype, torch.device)):
                return str(o)
            if isinstance(o, Enum):
                return o.value
            return str(o)

        # Merge instead of passing indent/default as explicit keywords: a
        # caller supplying either option would otherwise hit
        # "TypeError: got multiple values for keyword argument".
        dump_kwargs = {"indent": 2, "default": default, **kwargs}
        return json.dumps(self.to_filtered_dict(exclude_augment), **dump_kwargs)


register_model_config("Gr00tN1d7", Gr00tN1d7Config)

gr00t/configs/training/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

gr00t/configs/training/training_config.py ADDED Viewed

	@@ -0,0 +1,127 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, field
+from typing import Optional
@dataclass
class TrainingConfig:
    """Training configuration.

    Plain dataclass holding the settings of a training run. Every field has a
    default, so ``TrainingConfig()`` is valid as-is.
    """

    # Output
    output_dir: str = "./outputs"
    experiment_name: Optional[str] = None

    # Basic training
    max_steps: int = 30000  # this will override num_epochs
    global_batch_size: int = 1024
    batch_size: Optional[int] = None
    gradient_accumulation_steps: int = 1

    # Optimization
    learning_rate: float = 1e-4
    lr_scheduler_type: str = "cosine"
    weight_decay: float = 1e-5
    warmup_ratio: float = 0.05
    warmup_steps: int = 0  # this will override warmup_ratio
    max_grad_norm: float = 1.0

    # Optimizer choice (huggingface TrainingArguments.optim)
    # Options include: 'adamw_torch', 'adamw_torch_fused', 'paged_adamw_32bit',
    # 'paged_adamw_8bit' (requires bitsandbytes), 'adafactor', etc.
    optim: str = "adamw_torch_fused"
    start_from_checkpoint: Optional[str] = None
    skip_weight_loading: bool = False  # skip loading checkpoint weights (architecture only)

    # Mixed precision
    tf32: bool = True
    fp16: bool = False
    bf16: bool = True
    eval_bf16: bool = True

    # Logging and saving
    logging_steps: int = 10
    save_steps: int = 1000
    save_total_limit: int = 5

    # Model saving
    save_vl_model: bool = False  # Control whether to save VL model and processor in callbacks
    save_only_model: bool = False  # Skip optimizer/scheduler/RNG states — cannot resume training

    # Checkpoint uploading
    upload_checkpoints: bool = False
    upload_every: int = 1000
    upload_last_n_checkpoints: int = 5
    max_concurrent_uploads: int = 2

    # Evaluation
    eval_strategy: str = "no"  # no, steps, epoch
    eval_steps: int = 500
    eval_set_split_ratio: float = 0.1
    eval_batch_size: int = 2
    save_best_eval_metric_name: str = ""
    save_best_eval_metric_greater_is_better: bool = True

    # DeepSpeed (default)
    deepspeed_stage: int = 2  # ZeRO stage (1, 2, or 3)
    gradient_checkpointing: bool = False

    # Transformers loading parameters
    transformers_trust_remote_code: bool = True
    transformers_local_files_only: bool = False
    transformers_cache_dir: Optional[str] = None
    transformers_access_token: Optional[str] = None  # Access token for HuggingFace Hub

    # DDP
    use_ddp: bool = False
    ddp_bucket_cap_mb: int = 100

    # Hardware
    num_gpus: int = 1
    dataloader_num_workers: int = 2

    # Data handling
    remove_unused_columns: bool = False

    # Experiment tracking
    use_wandb: bool = False
    wandb_project: str = "finetune-gr00t-n1d7"

    # Profiling
    enable_profiling: bool = False

    # Max number of retries in training for fault tolerance
    max_retries: int = 3

    # For testing.
    assert_loss_less_than: Optional[float] = None

    # RL
    add_rl_callback: bool = False

    # Open-loop evaluation
    enable_open_loop_eval: bool = False
    """Enable open-loop evaluation on saved checkpoints."""
    open_loop_eval_traj_ids: list[int] = field(default_factory=lambda: [0])
    """List of trajectory IDs to evaluate."""
    open_loop_eval_steps_per_traj: int = 100
    """Number of steps to evaluate per trajectory."""
    open_loop_eval_plot_indices: Optional[list[int]] = None
    """List of action indices to plot. If None, plots all indices."""

gr00t/data/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

gr00t/data/collator/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .collators import BasicDataCollator