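# NOTE(review): the next four lines look like hosting-page header text that
# was captured with the file, not manifest data; kept as comments so the
# document parses as YAML.
# rskill-diffusion-pusht / rskill.yaml
# AdrianLlopart's picture
# chore: publish rSkill OpenRAL/rskill-diffusion-pusht v0.1.0
# 59e3f25 verified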
# rSkill manifest — OpenRAL packaging format V1 (CLAUDE.md §6.4)
# Wraps: lerobot/diffusion_pusht (Apache-2.0)
# Paper: Chi et al., 2023 — Diffusion Policy.
# ── Identity ───────────────────────────────────────────────────────────────
# Manifest schema version plus this skill's name, version, license, role
# and kind.
# Quoted so the value stays a string (an unquoted 0.1 would parse as a float).
schema_version: "0.1"
name: "OpenRAL/rskill-diffusion-pusht"
version: "0.1.0"
license: "apache-2.0"
role: "s1"
# rSkill kind discriminator (ADR-00XX). "vla" = learnable
# Vision-Language-Action policy.
# TODO: replace the "ADR-00XX" placeholder with the real ADR number.
kind: "vla"
# ── Policy identity ────────────────────────────────────────────────────────
model_family: "diffusion"
# ── Compatibility contract ─────────────────────────────────────────────────
# 2-D PushT pseudo-robot (single end-effector pushing a T block). Used by
# tests/sim/test_pusht_2d_diffusion_pusht.py against gym_pusht/PushT-v0.
embodiment_tags:
- "pusht"
# PushT exposes a single 96×96 RGB top-down stream (named
# observation.image, not images.cameraN — PushT predates the multi-cam
# convention used by SmolVLA/ACT).
sensors_required:
- modality: "rgb"
  # These fields describe the "rgb" sensor entry, so they are indented under
  # the list item instead of sitting at the top level of the manifest.
  vla_feature_key: "observation.image"
  min_width: 96
  min_height: 96
# Output side (ADR-0013). The pusht_2d scene-pseudo-robot exposes a 2-D
# (x, y) absolute position; robots/pusht_2d/robot.yaml advertises
# `cartesian_pose` as its supported control mode (the codebase
# convention for the PushT 2-D action regardless of dimensionality).
# The loader auto-fills n_dof (2) + vla_action_key from the robot YAML.
# PushT actions are absolute (x, y) targets in the world/scene frame.
actuators_required:
- kind: "cartesian_pose"
  # NOTE(review): nesting rebuilt from context — control_mode_semantics is
  # assumed to be a field of this actuator entry; confirm against the
  # manifest schema.
  control_mode_semantics:
    mode: "absolute"
    reference_frame: "world"
# ── Runtime / weights ──────────────────────────────────────────────────────
runtime: "pytorch"
# Without indentation `quantization:` parses as null and its fields leak to
# the top level; they are nested here.
quantization:
  dtype: "fp32"
  # NOTE(review): assumed to be the quantization backend (a sibling of
  # dtype) rather than a top-level key — confirm against the schema.
  backend: "pytorch"
weights_uri: "hf://lerobot/diffusion_pusht"
# ── Preprocessing (all knobs needed to interpret IO) ───────────────────────
processors:
  preprocessor_uri: "hf://lerobot/diffusion_pusht/policy_preprocessor.json"
  postprocessor_uri: "hf://lerobot/diffusion_pusht/policy_postprocessor.json"
# PushT is a 2-DoF planar pushing benchmark; proprio state is 2-D
# (x, y) of the end effector.
state_contract:
  # Nested so this `dim` belongs to state_contract and does not become a
  # top-level key that could clash with another contract's `dim`.
  dim: 2
# ── Execution semantics ────────────────────────────────────────────────────
chunk_size: 8
# n_action_steps omitted — equals chunk_size (Diffusion Policy default).
latency_budget:
  # Reference-host measurement (RTX 4070 Laptop, CUDA 12.8, PyTorch 2.10)
  # of the warm full-chunk inference is 1756 ms — Diffusion Policy runs
  # 100 DDPM denoising steps per chunk, the dominant cost in the suite.
  # Pinning per_chunk_ms to 1250 ms with tolerance_pct=100 yields the
  # previous 2.5 s ceiling (_WARM_CHUNK_CEILING_S in the sim test).
  # NOTE(review): tolerance_pct is not set in this manifest — confirm that
  # 100 is the loader default, otherwise the effective ceiling is not 2.5 s.
  per_chunk_ms: 1250.0
# ── Provenance ─────────────────────────────────────────────────────────────
# Headline success rate from skills/diffusion-pusht/eval/pusht.json.
benchmarks:
  pusht: 0.60
paper_url: "https://arxiv.org/abs/2303.04137"
source_repo: "hf://lerobot/diffusion_pusht"
# Folded scalar: the body must be indented deeper than the key, otherwise
# the description is empty and the text below it fails to parse.
description: >
  Diffusion Policy (~263M-param U-Net with 100-step DDPM denoiser) for
  the PushT 2-DoF pushing benchmark. Action chunks of length 8 within a
  horizon of 16. The chunk inference cost is dominated by the denoising
  loop, so cached pops are essentially free — this is the extreme test
  of the queue-drain contract.
# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool
# palette so it can pick this skill by what it does (action verb +
# object + scene), not just by its slug.
actions:
- "push"
objects:
- "t_shape"
scenes:
- "tabletop_2d"
# ADR-0019 — per-checkpoint action contract (consumed by the dataset bridge
# to bind the LeRobot v3 `action` feature shape).
action_contract:
  # Nested so this `dim` belongs to action_contract and does not become a
  # top-level key that could clash with another contract's `dim`.
  dim: 2