Upload policy with preprocessor, postprocessor, and model card

Browse files

Files changed (8) hide show

README.md +19 -11
config.json +40 -40
model.safetensors +2 -2
policy_postprocessor.json +4 -4
policy_postprocessor_step_0_unnormalizer_processor.safetensors +2 -2
policy_preprocessor.json +36 -21
policy_preprocessor_step_2_normalizer_processor.safetensors +3 -0
train_config.json +119 -60

README.md CHANGED Viewed

@@ -1,26 +1,38 @@
 ---
-datasets: thomas0829/bimanual_so100_grab
 library_name: lerobot
 license: apache-2.0
-model_name: act
 pipeline_tag: robotics
 tags:
-- robotics
 - lerobot
-- act
 ---
-# Model Card for act
 <!-- Provide a quick summary of what the model is/does. -->
-[Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
 ---
 ## How to Get Started with the Model
@@ -33,7 +45,7 @@ Below is the short version on how to train and run inference/eval:
 ```bash
 lerobot-train \
   --dataset.repo_id=${HF_USER}/<dataset> \
-  --policy.type=act \
   --output_dir=outputs/train/<desired_policy_repo_id> \
   --job_name=lerobot_training \
   --policy.device=cuda \
@@ -56,7 +68,3 @@ lerobot-record \
 Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
 ---
-## Model Details
-- **License:** apache-2.0

 ---
+datasets: thomas0829/fold_the_towel
 library_name: lerobot
 license: apache-2.0
+model_name: pi05
 pipeline_tag: robotics
 tags:
 - lerobot
+- pi05
+- robotics
 ---
+# Model Card for pi05
 <!-- Provide a quick summary of what the model is/does. -->
+**π₀.₅ (Pi05) Policy**
+π₀.₅ is a Vision-Language-Action model with open-world generalization, from Physical Intelligence. The LeRobot implementation is adapted from their open source OpenPI repository.
+**Model Overview**
+π₀.₅ is a significant evolution of π₀, developed by Physical Intelligence to address a major challenge in robotics: open-world generalization. While robots can perform impressive tasks in controlled environments, π₀.₅ is designed to generalize to entirely new environments and situations that were never seen during training.
+For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
+## Model Details
+- **License:** apache-2.0
 ---
 ## How to Get Started with the Model
 ```bash
 lerobot-train \
   --dataset.repo_id=${HF_USER}/<dataset> \
+  --policy.type=pi05 \
   --output_dir=outputs/train/<desired_policy_repo_id> \
   --job_name=lerobot_training \
   --policy.device=cuda \
 Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
 ---

config.json CHANGED Viewed

@@ -1,35 +1,40 @@
 {
-    "type": "act",
     "n_obs_steps": 1,
     "input_features": {
         "observation.state": {
             "type": "STATE",
             "shape": [
-                12
             ]
         },
         "observation.images.left": {
             "type": "VISUAL",
             "shape": [
                 3,
-                480,
-                640
             ]
         },
-        "observation.images.right": {
             "type": "VISUAL",
             "shape": [
                 3,
-                480,
-                640
             ]
         },
-        "observation.images.top": {
             "type": "VISUAL",
             "shape": [
                 3,
-                720,
-                1280
             ]
         }
     },
@@ -37,42 +42,37 @@
         "action": {
             "type": "ACTION",
             "shape": [
-                12
             ]
         }
     },
     "device": "cuda",
     "use_amp": false,
     "push_to_hub": true,
-    "repo_id": "thomas0829/policy_bimanual_grab",
-    "private": null,
     "tags": null,
     "license": null,
-    "pretrained_path": null,
-    "chunk_size": 100,
-    "n_action_steps": 100,
-    "normalization_mapping": {
-        "VISUAL": "MEAN_STD",
-        "STATE": "MEAN_STD",
-        "ACTION": "MEAN_STD"
-    },
-    "vision_backbone": "resnet18",
-    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-    "replace_final_stride_with_dilation": false,
-    "pre_norm": false,
-    "dim_model": 512,
-    "n_heads": 8,
-    "dim_feedforward": 3200,
-    "feedforward_activation": "relu",
-    "n_encoder_layers": 4,
-    "n_decoder_layers": 1,
-    "use_vae": true,
-    "latent_dim": 32,
-    "n_vae_encoder_layers": 4,
-    "temporal_ensemble_coeff": null,
-    "dropout": 0.1,
-    "kl_weight": 10.0,
-    "optimizer_lr": 1e-05,
-    "optimizer_weight_decay": 0.0001,
-    "optimizer_lr_backbone": 1e-05
 }

 {
+    "type": "pi05",
     "n_obs_steps": 1,
+    "normalization_mapping": {
+        "VISUAL": "IDENTITY",
+        "STATE": "QUANTILES",
+        "ACTION": "QUANTILES"
+    },
     "input_features": {
         "observation.state": {
             "type": "STATE",
             "shape": [
+                14
             ]
         },
         "observation.images.left": {
             "type": "VISUAL",
             "shape": [
                 3,
+                224,
+                224
             ]
         },
+        "observation.images.top": {
             "type": "VISUAL",
             "shape": [
                 3,
+                224,
+                224
             ]
         },
+        "observation.images.right": {
             "type": "VISUAL",
             "shape": [
                 3,
+                224,
+                224
             ]
         }
     },
         "action": {
             "type": "ACTION",
             "shape": [
+                14
             ]
         }
     },
     "device": "cuda",
     "use_amp": false,
     "push_to_hub": true,
+    "repo_id": "thomas0829/test",
+    "private": false,
     "tags": null,
     "license": null,
+    "pretrained_path": "lerobot/pi05_base",
+    "paligemma_variant": "gemma_2b",
+    "action_expert_variant": "gemma_300m",
+    "dtype": "bfloat16",
+    "chunk_size": 50,
+    "n_action_steps": 50,
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "num_inference_steps": 10,
+    "time_sampling_beta_alpha": 1.5,
+    "time_sampling_beta_beta": 1.0,
+    "time_sampling_scale": 0.999,
+    "time_sampling_offset": 0.001,
+    "min_period": 0.004,
+    "max_period": 4.0,
+    "rtc_config": null,
+    "image_resolution": [
+        224,
+        224
+    ],
+    "empty_cameras": 0,
+    "tokenizer_max_length": 200
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc84041cff73193b00b49280406ac9cf798d2a6dcc3cfa62ade5af83621b18d8
-size 206748912

 version https://git-lfs.github.com/spec/v1
+oid sha256:4fda3408e745af5826c58b556695f784f3d7090ebfcd90a498aaaed583e1e10d
+size 7473096344

policy_postprocessor.json CHANGED Viewed

@@ -9,14 +9,14 @@
           "action": {
             "type": "ACTION",
             "shape": [
-              12
             ]
           }
         },
         "norm_map": {
-          "VISUAL": "MEAN_STD",
-          "STATE": "MEAN_STD",
-          "ACTION": "MEAN_STD"
         }
       },
       "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors"

           "action": {
             "type": "ACTION",
             "shape": [
+              14
             ]
           }
         },
         "norm_map": {
+          "VISUAL": "IDENTITY",
+          "STATE": "QUANTILES",
+          "ACTION": "QUANTILES"
         }
       },
       "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors"

policy_postprocessor_step_0_unnormalizer_processor.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a48c086134112145d831b7dd8ec6edddf16cb2ef39a97ca83c408d802c078d94
-size 9024

 version https://git-lfs.github.com/spec/v1
+oid sha256:74f49c612d42e9bf57d8cf688169046fd01c1078e58c30bd18daaa4fb42d7a22
+size 9400

policy_preprocessor.json CHANGED Viewed

@@ -11,13 +11,6 @@
       "registry_name": "to_batch_processor",
       "config": {}
     },
-    {
-      "registry_name": "device_processor",
-      "config": {
-        "device": "cuda",
-        "float_dtype": null
-      }
-    },
     {
       "registry_name": "normalizer_processor",
       "config": {
@@ -26,47 +19,69 @@
           "observation.state": {
             "type": "STATE",
             "shape": [
-              12
             ]
           },
           "observation.images.left": {
             "type": "VISUAL",
             "shape": [
               3,
-              480,
-              640
             ]
           },
-          "observation.images.right": {
             "type": "VISUAL",
             "shape": [
               3,
-              480,
-              640
             ]
           },
-          "observation.images.top": {
             "type": "VISUAL",
             "shape": [
               3,
-              720,
-              1280
             ]
           },
           "action": {
             "type": "ACTION",
             "shape": [
-              12
             ]
           }
         },
         "norm_map": {
-          "VISUAL": "MEAN_STD",
-          "STATE": "MEAN_STD",
-          "ACTION": "MEAN_STD"
         }
       },
-      "state_file": "policy_preprocessor_step_3_normalizer_processor.safetensors"
     }
   ]
 }

       "registry_name": "to_batch_processor",
       "config": {}
     },
     {
       "registry_name": "normalizer_processor",
       "config": {
           "observation.state": {
             "type": "STATE",
             "shape": [
+              14
             ]
           },
           "observation.images.left": {
             "type": "VISUAL",
             "shape": [
               3,
+              224,
+              224
             ]
           },
+          "observation.images.top": {
             "type": "VISUAL",
             "shape": [
               3,
+              224,
+              224
             ]
           },
+          "observation.images.right": {
             "type": "VISUAL",
             "shape": [
               3,
+              224,
+              224
             ]
           },
           "action": {
             "type": "ACTION",
             "shape": [
+              14
             ]
           }
         },
         "norm_map": {
+          "VISUAL": "IDENTITY",
+          "STATE": "QUANTILES",
+          "ACTION": "QUANTILES"
         }
       },
+      "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors"
+    },
+    {
+      "registry_name": "pi05_prepare_state_tokenizer_processor_step",
+      "config": {}
+    },
+    {
+      "registry_name": "tokenizer_processor",
+      "config": {
+        "max_length": 200,
+        "task_key": "task",
+        "padding_side": "right",
+        "padding": "max_length",
+        "truncation": true,
+        "tokenizer_name": "google/paligemma-3b-pt-224"
+      }
+    },
+    {
+      "registry_name": "device_processor",
+      "config": {
+        "device": "cuda",
+        "float_dtype": null
+      }
     }
   ]
 }

policy_preprocessor_step_2_normalizer_processor.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74f49c612d42e9bf57d8cf688169046fd01c1078e58c30bd18daaa4fb42d7a22
+size 9400

train_config.json CHANGED Viewed

@@ -1,10 +1,11 @@
 {
     "dataset": {
-        "repo_id": "thomas0829/bimanual_so100_grab",
         "root": null,
         "episodes": null,
         "image_transforms": {
-            "enable": false,
             "max_num_transforms": 3,
             "random_order": false,
             "tfs": {
@@ -57,47 +58,68 @@
                             1.5
                         ]
                     }
                 }
             }
         },
         "revision": null,
         "use_imagenet_stats": true,
         "video_backend": "torchcodec",
-        "streaming": false
     },
     "env": null,
     "policy": {
-        "type": "act",
         "n_obs_steps": 1,
         "input_features": {
             "observation.state": {
                 "type": "STATE",
                 "shape": [
-                    12
                 ]
             },
             "observation.images.left": {
                 "type": "VISUAL",
                 "shape": [
                     3,
-                    480,
-                    640
                 ]
             },
-            "observation.images.right": {
                 "type": "VISUAL",
                 "shape": [
                     3,
-                    480,
-                    640
                 ]
             },
-            "observation.images.top": {
                 "type": "VISUAL",
                 "shape": [
                     3,
-                    720,
-                    1280
                 ]
             }
         },
@@ -105,69 +127,98 @@
             "action": {
                 "type": "ACTION",
                 "shape": [
-                    12
                 ]
             }
         },
         "device": "cuda",
         "use_amp": false,
         "push_to_hub": true,
-        "repo_id": "thomas0829/policy_bimanual_grab",
-        "private": null,
         "tags": null,
         "license": null,
-        "pretrained_path": null,
-        "chunk_size": 100,
-        "n_action_steps": 100,
-        "normalization_mapping": {
-            "VISUAL": "MEAN_STD",
-            "STATE": "MEAN_STD",
-            "ACTION": "MEAN_STD"
-        },
-        "vision_backbone": "resnet18",
-        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-        "replace_final_stride_with_dilation": false,
-        "pre_norm": false,
-        "dim_model": 512,
-        "n_heads": 8,
-        "dim_feedforward": 3200,
-        "feedforward_activation": "relu",
-        "n_encoder_layers": 4,
-        "n_decoder_layers": 1,
-        "use_vae": true,
-        "latent_dim": 32,
-        "n_vae_encoder_layers": 4,
-        "temporal_ensemble_coeff": null,
-        "dropout": 0.1,
-        "kl_weight": 10.0,
-        "optimizer_lr": 1e-05,
-        "optimizer_weight_decay": 0.0001,
-        "optimizer_lr_backbone": 1e-05
     },
-    "output_dir": "outputs/train/bimanual_act_so100_grab",
-    "job_name": "bimanual_act_so100_grab",
     "resume": false,
-    "seed": 1000,
     "num_workers": 4,
-    "batch_size": 4,
-    "steps": 10000,
-    "eval_freq": 1000,
-    "log_freq": 200,
     "save_checkpoint": true,
-    "save_freq": 2500,
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
-        "lr": 1e-05,
-        "weight_decay": 0.0001,
-        "grad_clip_norm": 10.0,
         "betas": [
             0.9,
-            0.999
         ],
         "eps": 1e-08
     },
-    "scheduler": null,
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
@@ -175,11 +226,19 @@
     },
     "wandb": {
         "enable": true,
-        "disable_artifact": false,
-        "project": "lerobot",
         "entity": null,
-        "notes": null,
-        "run_id": "5spc6953",
         "mode": null
     }
 }

 {
     "dataset": {
+        "repo_id": "thomas0829/fold_the_towel",
+        "repo_ids": null,
         "root": null,
         "episodes": null,
         "image_transforms": {
+            "enable": true,
             "max_num_transforms": 3,
             "random_order": false,
             "tfs": {
                             1.5
                         ]
                     }
+                },
+                "affine": {
+                    "weight": 1.0,
+                    "type": "RandomAffine",
+                    "kwargs": {
+                        "degrees": [
+                            -5.0,
+                            5.0
+                        ],
+                        "translate": [
+                            0.05,
+                            0.05
+                        ]
+                    }
                 }
             }
         },
         "revision": null,
         "use_imagenet_stats": true,
         "video_backend": "torchcodec",
+        "force_cache_sync": false,
+        "use_annotated_tasks": false
     },
+    "num_datasets": 100,
     "env": null,
     "policy": {
+        "type": "pi05",
         "n_obs_steps": 1,
+        "normalization_mapping": {
+            "VISUAL": "IDENTITY",
+            "STATE": "QUANTILES",
+            "ACTION": "QUANTILES"
+        },
         "input_features": {
             "observation.state": {
                 "type": "STATE",
                 "shape": [
+                    14
                 ]
             },
             "observation.images.left": {
                 "type": "VISUAL",
                 "shape": [
                     3,
+                    224,
+                    224
                 ]
             },
+            "observation.images.top": {
                 "type": "VISUAL",
                 "shape": [
                     3,
+                    224,
+                    224
                 ]
             },
+            "observation.images.right": {
                 "type": "VISUAL",
                 "shape": [
                     3,
+                    224,
+                    224
                 ]
             }
         },
             "action": {
                 "type": "ACTION",
                 "shape": [
+                    14
                 ]
             }
         },
         "device": "cuda",
         "use_amp": false,
+        "compiled": false,
         "push_to_hub": true,
+        "repo_id": "thomas0829/test",
+        "private": false,
         "tags": null,
         "license": null,
+        "pretrained_path": "lerobot/pi05_base",
+        "paligemma_variant": "gemma_2b",
+        "action_expert_variant": "gemma_300m",
+        "dtype": "bfloat16",
+        "chunk_size": 50,
+        "n_action_steps": 50,
+        "max_state_dim": 32,
+        "max_action_dim": 32,
+        "num_inference_steps": 10,
+        "time_sampling_beta_alpha": 1.5,
+        "time_sampling_beta_beta": 1.0,
+        "time_sampling_scale": 0.999,
+        "time_sampling_offset": 0.001,
+        "min_period": 0.004,
+        "max_period": 4.0,
+        "rtc_config": null,
+        "image_resolution": [
+            224,
+            224
+        ],
+        "empty_cameras": 0,
+        "tokenizer_max_length": 200,
+        "gradient_checkpointing": true,
+        "compile_model": false,
+        "compile_mode": "max-autotune",
+        "attention_implementation": "eager",
+        "use_lora": false,
+        "lora_rank": 16,
+        "lora_alpha": 32.0,
+        "lora_dropout": 0.1,
+        "lora_target_modules": null,
+        "optimizer_lr": 2.5e-05,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 0.01,
+        "optimizer_grad_clip_norm": 1.0,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 1e-05
     },
+    "compile": false,
+    "strict": true,
+    "loss_threshold": 3.0,
+    "output_dir": "outputs/train/2026-02-25/15-59-03_pi05_training",
+    "job_name": "pi05_training",
     "resume": false,
+    "resume_scheduler": true,
+    "seed": 3407,
     "num_workers": 4,
+    "batch_size": 1,
+    "gradient_accumulation_steps": 2,
+    "steps": 10,
+    "eval_freq": 20000,
+    "log_freq": 10,
     "save_checkpoint": true,
+    "push_to_hub": false,
+    "repo_id": null,
+    "save_freq": 5000,
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
+        "lr": 2.5e-05,
+        "weight_decay": 0.01,
+        "grad_clip_norm": 1.0,
         "betas": [
             0.9,
+            0.95
         ],
         "eps": 1e-08
     },
+    "scheduler": {
+        "type": "cosine_decay_with_warmup",
+        "num_warmup_steps": 1000,
+        "num_decay_steps": 30000,
+        "peak_lr": 2.5e-05,
+        "decay_lr": 1e-05
+    },
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
     },
     "wandb": {
         "enable": true,
+        "disable_artifact": true,
+        "project": "yam-pi05-finetune",
         "entity": null,
+        "notes": "Full fine-tuning of pi05 on fold_the_towel dataset",
+        "run_id": null,
         "mode": null
+    },
+    "test_dataloader": false,
+    "num_epochs": 1,
+    "ddp_timeout_s": 6000,
+    "rename_map": {
+        "observation.images.front_camera": "observation.images.top",
+        "observation.images.left_camera": "observation.images.left",
+        "observation.images.right_camera": "observation.images.right"
     }
 }