Upload pi05_base originals + sigma-renamed weight copies (flat root)
- README.md +79 -3
- config.json +82 -0
- model.safetensors +3 -0
- policy_postprocessor.json +24 -0
- policy_preprocessor.json +49 -0
README.md
CHANGED
@@ -1,3 +1,79 @@
- ---
- license:
-
---
license: gemma
language:
- en
---

# π₀.₅ (Pi05)

These weights come directly from openpi's PyTorch conversion script, applied to their `pi05_base` model.

π₀.₅ is a **Vision-Language-Action model with open-world generalization** from Physical Intelligence. The LeRobot implementation is adapted from their open-source [OpenPI](https://github.com/Physical-Intelligence/openpi) repository.

## Model Overview

π₀.₅ represents a significant evolution from π₀, developed by [Physical Intelligence](https://www.physicalintelligence.company/blog/pi05) to address a central challenge in robotics: **open-world generalization**. While robots can perform impressive tasks in controlled environments, π₀.₅ is designed to generalize to entirely new environments and situations that were never seen during training.
### The Generalization Challenge

As Physical Intelligence explains, the fundamental challenge is not agility or dexterity but generalization: the ability to correctly perform tasks in new settings with new objects. Consider a robot cleaning different homes: each home has different objects in different places. Generalization must occur at multiple levels:

- **Physical Level**: Understanding how to pick up a spoon (by the handle) or a plate (by the edge), even with unseen objects in cluttered environments
- **Semantic Level**: Understanding task semantics, such as where to put clothes and shoes (the laundry hamper, not the bed) and which tools are appropriate for cleaning spills
- **Environmental Level**: Adapting to "messy" real-world environments like homes, grocery stores, offices, and hospitals

### Co-Training on Heterogeneous Data

The breakthrough innovation in π₀.₅ is **co-training on heterogeneous data sources**. The model learns from:

1. **Multimodal Web Data**: Image captioning, visual question answering, object detection
2. **Verbal Instructions**: Humans coaching robots through complex tasks step-by-step
3. **Subtask Commands**: High-level semantic behavior labels (e.g., "pick up the pillow" for an unmade bed)
4. **Cross-Embodiment Robot Data**: Data from various robot platforms with different capabilities
5. **Multi-Environment Data**: Static robots deployed across many different homes
6. **Mobile Manipulation Data**: ~400 hours of mobile robot demonstrations

This diverse training mixture creates a "curriculum" that enables generalization across physical, visual, and semantic levels simultaneously.
## Training

Here is a complete training command for fine-tuning the base π₀.₅ model on your own dataset:
```bash
python src/lerobot/scripts/train.py \
  --dataset.repo_id=your_dataset \
  --policy.type=pi05 \
  --output_dir=./outputs/pi05_training \
  --job_name=pi05_training \
  --policy.repo_id=your_repo_id \
  --policy.pretrained_path=lerobot/pi05_base \
  --policy.compile_model=true \
  --policy.gradient_checkpointing=true \
  --wandb.enable=true \
  --policy.dtype=bfloat16 \
  --steps=3000 \
  --policy.scheduler_decay_steps=3000 \
  --policy.device=cuda \
  --batch_size=32
```
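The `--policy.scheduler_decay_steps` flag pairs with the `scheduler_warmup_steps` and `scheduler_decay_lr` values in `config.json`. Below is a minimal sketch of the resulting learning-rate curve, assuming a linear-warmup-then-cosine-decay shape; the exact schedule is defined by the LeRobot trainer, so treat this as illustrative:

```python
import math

def lr_at_step(step, peak_lr=2.5e-5, decay_lr=2.5e-6,
               warmup_steps=1000, decay_steps=3000):
    """Linear warmup to peak_lr, then cosine decay down to decay_lr."""
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = min((step - warmup_steps) / (decay_steps - warmup_steps), 1.0)
    return decay_lr + 0.5 * (peak_lr - decay_lr) * (1 + math.cos(math.pi * progress))

# With --steps=3000 and --policy.scheduler_decay_steps=3000, the decay
# finishes exactly at the end of training.
print(lr_at_step(0))      # 0.0
print(lr_at_step(1000))   # ~2.5e-05 (peak)
print(lr_at_step(3000))   # ~2.5e-06 (final)
```

Note that the stock `config.json` ships with `scheduler_decay_steps: 30000`; the command above overrides it to match the shorter 3000-step run.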
## Citation

If you use this model, please cite the original OpenPI work:

```bibtex
@article{openpi2024,
  title={Open-World Robotic Manipulation with Vision-Language-Action Models},
  author={Physical Intelligence},
  year={2024},
  url={https://github.com/Physical-Intelligence/openpi}
}
```

## Original Repository

[OpenPI GitHub Repository](https://github.com/Physical-Intelligence/openpi)

## License

This model follows the same license as the original OpenPI repository.
config.json
ADDED
@@ -0,0 +1,82 @@
{
  "type": "pi05",
  "n_obs_steps": 1,
  "input_features": {
    "observation.images.base_0_rgb": {
      "type": "VISUAL",
      "shape": [3, 224, 224]
    },
    "observation.images.left_wrist_0_rgb": {
      "type": "VISUAL",
      "shape": [3, 224, 224]
    },
    "observation.images.right_wrist_0_rgb": {
      "type": "VISUAL",
      "shape": [3, 224, 224]
    },
    "observation.state": {
      "type": "STATE",
      "shape": [32]
    }
  },
  "output_features": {
    "action": {
      "type": "ACTION",
      "shape": [32]
    }
  },
  "device": "mps",
  "use_amp": false,
  "push_to_hub": true,
  "repo_id": null,
  "private": null,
  "tags": null,
  "license": null,
  "paligemma_variant": "gemma_2b",
  "action_expert_variant": "gemma_300m",
  "dtype": "float32",
  "chunk_size": 50,
  "n_action_steps": 50,
  "max_action_dim": 32,
  "max_state_dim": 32,
  "num_inference_steps": 10,
  "time_sampling_beta_alpha": 1.5,
  "time_sampling_beta_beta": 1.0,
  "min_period": 0.004,
  "max_period": 4.0,
  "image_resolution": [224, 224],
  "gradient_checkpointing": false,
  "compile_model": false,
  "compile_mode": "max-autotune",
  "optimizer_lr": 2.5e-05,
  "optimizer_betas": [0.9, 0.95],
  "optimizer_eps": 1e-08,
  "optimizer_weight_decay": 0.01,
  "optimizer_grad_clip_norm": 1.0,
  "scheduler_warmup_steps": 1000,
  "scheduler_decay_steps": 30000,
  "scheduler_decay_lr": 2.5e-06,
  "tokenizer_max_length": 200
}
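A quick way to sanity-check a config like the one above is to parse it and verify the declared feature shapes against `image_resolution` and `max_state_dim`. This is only a sketch; a subset of the config is inlined for self-containment:

```python
import json

# Inlined subset of the pi05 config shown above.
config = json.loads("""
{
  "input_features": {
    "observation.images.base_0_rgb":        {"type": "VISUAL", "shape": [3, 224, 224]},
    "observation.images.left_wrist_0_rgb":  {"type": "VISUAL", "shape": [3, 224, 224]},
    "observation.images.right_wrist_0_rgb": {"type": "VISUAL", "shape": [3, 224, 224]},
    "observation.state":                    {"type": "STATE",  "shape": [32]}
  },
  "image_resolution": [224, 224],
  "max_state_dim": 32
}
""")

h, w = config["image_resolution"]
for name, feat in config["input_features"].items():
    if feat["type"] == "VISUAL":
        # Visual features are CHW; spatial dims must match image_resolution.
        assert feat["shape"][1:] == [h, w], name
    elif feat["type"] == "STATE":
        # State vectors are padded up to max_state_dim by the policy.
        assert feat["shape"][0] <= config["max_state_dim"], name

print("config OK")
```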
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0eb11ca9587678c1d2ef8cf32807c29f8ce53a2bfdfc1aa4a4c96f16fca59b0f
size 14467165872
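This file is a Git LFS pointer rather than the weights themselves; the `oid` and `size` lines identify the ~14.5 GB blob that `git lfs pull` (or the Hub's resolve endpoint) fetches. A small sketch of parsing the pointer format:

```python
def parse_lfs_pointer(text):
    """Parse a git-lfs pointer file into a dict of its key/value lines."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:0eb11ca9587678c1d2ef8cf32807c29f8ce53a2bfdfc1aa4a4c96f16fca59b0f
size 14467165872"""

fields = parse_lfs_pointer(pointer)
algo, digest = fields["oid"].split(":")
size_gb = int(fields["size"]) / 1e9
print(algo, len(digest), f"{size_gb:.1f} GB")  # sha256 64 14.5 GB
```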
policy_postprocessor.json
ADDED
@@ -0,0 +1,24 @@
{
  "name": "policy_postprocessor",
  "steps": [
    {
      "registry_name": "unnormalizer_processor",
      "config": {
        "eps": 1e-08,
        "features": {},
        "norm_map": {
          "VISUAL": "IDENTITY",
          "STATE": "QUANTILES",
          "ACTION": "QUANTILES"
        }
      }
    },
    {
      "registry_name": "device_processor",
      "config": {
        "device": "cpu",
        "float_dtype": null
      }
    }
  ]
}
policy_preprocessor.json
ADDED
@@ -0,0 +1,49 @@
{
  "name": "policy_preprocessor",
  "steps": [
    {
      "registry_name": "rename_observations_processor",
      "config": {
        "rename_map": {}
      }
    },
    {
      "registry_name": "to_batch_processor",
      "config": {}
    },
    {
      "registry_name": "normalizer_processor",
      "config": {
        "eps": 1e-08,
        "features": {},
        "norm_map": {
          "VISUAL": "IDENTITY",
          "STATE": "QUANTILES",
          "ACTION": "QUANTILES"
        }
      }
    },
    {
      "registry_name": "pi05_prepare_state_tokenizer_processor_step",
      "config": {}
    },
    {
      "registry_name": "tokenizer_processor",
      "config": {
        "max_length": 200,
        "task_key": "task",
        "padding_side": "right",
        "padding": "max_length",
        "truncation": true,
        "tokenizer_name": "google/paligemma-3b-pt-224"
      }
    },
    {
      "registry_name": "device_processor",
      "config": {
        "device": "cpu",
        "float_dtype": null
      }
    }
  ]
}
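The preprocessor applies its steps strictly in list order (rename → batch → normalize → state-token prep → tokenize → device placement), each looked up by `registry_name`; the postprocessor mirrors the normalization on the way out. A minimal sketch of that registry-dispatch pattern — the step functions here are illustrative stand-ins, not the actual LeRobot processors:

```python
import json

REGISTRY = {}

def register(name):
    """Decorator that records a step function under its registry name."""
    def deco(fn):
        REGISTRY[name] = fn
        return fn
    return deco

@register("rename_observations_processor")
def rename_step(batch, rename_map=None, **_):
    # Remap observation keys; an empty rename_map is a no-op.
    return {(rename_map or {}).get(k, k): v for k, v in batch.items()}

@register("to_batch_processor")
def to_batch_step(batch, **_):
    # Stand-in for adding a leading batch dimension.
    batch["batched"] = True
    return batch

def run_pipeline(spec, batch):
    # Apply each configured step in order, looked up by registry_name.
    for step in spec["steps"]:
        batch = REGISTRY[step["registry_name"]](batch, **step["config"])
    return batch

spec = json.loads("""
{"steps": [
  {"registry_name": "rename_observations_processor", "config": {"rename_map": {}}},
  {"registry_name": "to_batch_processor", "config": {}}
]}
""")

out = run_pipeline(spec, {"observation.state": [0.0] * 32})
print(sorted(out))  # ['batched', 'observation.state']
```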