|
|
--- |
|
|
library_name: lerobot |
|
|
pipeline_tag: robotics |
|
|
license: gemma |
|
|
language: |
|
|
- en |
|
|
--- |
|
|
# π₀-FAST |
|
|
|
|
|
π₀-FAST is a Vision-Language-Action model for general robot control that uses autoregressive next-token prediction to model continuous robot actions. |
|
|
|
|
|
It was proposed in [FAST: Efficient Action Tokenization for Vision-Language-Action Models](https://huggingface.co/papers/2501.09747). |
|
|
|
|
|
## How to Get Started |
|
|
|
|
|
```bash |
|
|
pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git" |
|
|
``` |
|
|
|
|
|
```python |
|
|
import torch |
|
|
from lerobot.policies.factory import make_pre_post_processors |
|
|
import numpy as np |
|
|
from lerobot.policies.pi0_fast.modeling_pi0_fast import PI0FastPolicy |
|
|
|
|
|
# select your device here
device = torch.device("cuda")

# Load the pretrained policy identified by `model_id`.
model_id = "lerobot/pi0fast-base"
model = PI0FastPolicy.from_pretrained(model_id)

# Build the input/output processors from the policy config and checkpoint id.
# The override hands the chosen device to the preprocessor's
# "device_processor" step.
device_override = {"device_processor": {"device": str(device)}}
preprocess, postprocess = make_pre_post_processors(
    model.config,
    model_id,
    preprocessor_overrides=device_override,
)

# Settings for the dummy observation built below.
IMAGE_HEIGHT, IMAGE_WIDTH = 224, 224
batch_size = 1
prompt = "Pick up the red block and place it in the bin"
|
|
|
|
|
def fake_rgb(h, w):
    """Return a random RGB image as a channel-first float32 tensor in [0, 1].

    Pixels are first sampled as uint8 values in [0, 255] (as a PIL image
    would hold them) and then converted to the [0, 1] float32 range for
    LeRobot.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        A ``torch.float32`` tensor of shape ``(3, h, w)`` with values in
        [0, 1].
    """
    # randint's upper bound is exclusive, so 256 is required to include 255.
    arr = np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)
    chw = torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW
    # Rescale [0, 255] uint8 to [0, 1] float32.
    return chw.to(torch.float32) / 255.0
|
|
|
|
|
# Size of the placeholder state vector used for this dummy example.
DUMMY_STATE_DIM = 7

# Dummy observation batch: one random image stack per camera key, a random
# state vector, and the same task prompt repeated for every sample.
batch = {
    "observation.images.base_0_rgb": torch.stack(
        [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
    ).to(device),
    "observation.images.left_wrist_0_rgb": torch.stack(
        [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
    ).to(device),
    "observation.images.right_wrist_0_rgb": torch.stack(
        [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
    ).to(device),
    "observation.state": torch.randn(
        batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device
    ),
    "task": [prompt] * batch_size,
}
|
|
|
|
|
# Run the raw batch through the input processor, then query the policy.
batch = preprocess(batch)
action = model.select_action(batch)
# or if you're training, do:
# loss, output_dict = model.forward(batch)
# loss.backward()

# Pass the policy output through the output processor before using it.
action = postprocess(action)
print(action)
|
|
``` |
|
|
|
|
|
## How to Train the Model |
|
|
|
|
|
```bash |
|
|
python src/lerobot/scripts/lerobot_train.py \ |
|
|
--dataset.repo_id=your_dataset \ |
|
|
--policy.type=pi0_fast \ |
|
|
--output_dir=./outputs/pi0fast_training \ |
|
|
--job_name=pi0fast_training \ |
|
|
--policy.pretrained_path=lerobot/pi0fast-base \ |
|
|
--policy.dtype=bfloat16 \ |
|
|
--policy.gradient_checkpointing=true \ |
|
|
--policy.chunk_size=10 \ |
|
|
--policy.n_action_steps=10 \ |
|
|
--policy.max_action_tokens=256 \ |
|
|
--steps=100000 \ |
|
|
--batch_size=4 \ |
|
|
--policy.device=cuda |
|
|
``` |