# rSkill manifest — openral packaging format V1 (CLAUDE.md §6.4)
# Wraps: katefgroup/3d_diffuser_actor (diffuser_actor_peract.pth)
# Paper: Ke et al., 2024 — "3D Diffuser Actor: Policy Diffusion with 3D Scene
# Representations" (arXiv:2402.10885). RLBench PerAct 18-task setup.
#
# LICENSE: MIT (code + released checkpoints) — commercially permissive. No
# license guard needed (unlike RVT/RVT-2, which are NVIDIA non-commercial).
#
# RUNTIME: auto-managed out-of-process sidecar (ZMQ + msgpack), ADR-0062. The
# policy AND the CoppeliaSim/PyRep RLBench scene run in their own externally-
# provisioned py3.10 venv (CoppeliaSim is proprietary, free-EDU, NEVER vendored).
# The openral adapter (openral_sim.policies.rlbench_3dda) forks
# tools/rlbench_3dda_sidecar.py on first use; user workflow is one command:
#
#   openral benchmark scene --config scenes/benchmark/rlbench_open_drawer.yaml \
#     --rskill rskills/3d-diffuser-actor-rlbench
#
# Verified live on an 8 GB Ada GPU host (2026-06-19): open_drawer 4/4,
# meat_off_grill 3/3, close_jar solved. Inference VRAM peak ~0.43 GB.
# ── Identity ───────────────────────────────────────────────────────────────
schema_version: "0.1"
name: "OpenRAL/rskill-3d-diffuser-actor-rlbench"
# ADR-0060: the benchmark tasks this checkpoint is validated for (gate). The
# released PerAct checkpoint covers all 18 PerAct tasks; we ship + declare the
# three live-verified starter tasks here (the rest are a follow-up).
evaluated_tasks:
- "rlbench/open_drawer"
- "rlbench/meat_off_grill"
- "rlbench/close_jar"
# NOTE(review): presumably the rSkill package version, as opposed to the
# manifest schema_version above — confirm against the packaging spec.
version: "0.1.0"
# MIT applies to both the upstream code and the released checkpoints.
license: "mit"
# NOTE(review): "s1" and "vla" are schema-defined enum values whose meaning is
# not shown in this file — see the manifest schema.
role: "s1"
kind: "vla"
# ── Policy identity ────────────────────────────────────────────────────────
model_family: "diffuser_actor"
# ── Compatibility contract ─────────────────────────────────────────────────
# Embodiments this skill declares compatibility with; RLBench runs the policy
# on a single Franka Panda arm.
embodiment_tags:
- "franka_panda"
# RLBench renders four fixed cameras (the PerAct set: left_shoulder /
# right_shoulder / wrist / front) and the policy fuses their RGB-D point clouds
# into a 3D scene representation. Those four are supplied by the SCENE backend
# (openral_sim.backends.rlbench), NOT by the robot's real sensor list — so the
# robot-capability gate uses a coarse modality-count requirement ("an RGB-vision
# embodiment") rather than keyed camera1..4 the franka_panda manifest doesn't
# declare. The per-camera 3D fusion happens inside the policy sidecar.
sensors_required:
# One requirement entry: at least `count` sensors of `modality`, each no
# smaller than min_width x min_height.
- modality: "rgb"
  count: 1
  min_width: 128
  min_height: 128
# The policy emits next-keyframe end-effector poses; RLBench executes each via
# its sampling-based motion planner (EndEffectorPoseViaPlanning). Absolute EE
# pose targets, not deltas.
actuators_required:
- kind: "cartesian_pose"
# NOTE(review): control_mode_semantics is kept as a top-level key with `mode`
# and `reference_frame` nested under it. If the schema instead expects it as a
# field of the actuator entry above, indent this stanza under that entry.
control_mode_semantics:
  mode: "absolute"
  reference_frame: "panda_link0"
# ── Runtime / weights ──────────────────────────────────────────────────────
runtime: "pytorch"
# Minimum VRAM per numeric precision, in GB. The measured inference peak was
# ~0.43 GB, so 2.0 leaves headroom.
min_vram_gb:
  bf16: 2.0
  fp32: 2.0
weights_uri: "hf://katefgroup/3d_diffuser_actor"
# ── Execution semantics ────────────────────────────────────────────────────
# One macro-keypose per step (the scene's mover plans + executes it). 100 DDIM
# denoising steps per keypose; ~1.2 s/keypose on an 8 GB Ada GPU.
chunk_size: 1
# Per-chunk budget in milliseconds; 3000 ms sits above the ~1.2 s measured
# per keypose.
latency_budget:
  per_chunk_ms: 3000.0
# ── IO contract ────────────────────────────────────────────────────────────
# 8-D keyframe action: [x y z qx qy qz qw gripper_open] (world frame). The scene
# sidecar appends the peract fork's ignore_collisions channel + plans the motion.
action_contract:
  dim: 8
  # Slot ranges are inclusive index pairs into the action vector: indices 0-6
  # carry the pose (position + quaternion), index 7 the gripper channel.
  slots:
  - range: [0, 6]
    control_mode: "cartesian_pose"
    ee: "panda_hand"
    frame: "panda_link0"
  - range: [7, 7]
    control_mode: "gripper_position"
    ee: "panda_gripper"
# ── Provenance ─────────────────────────────────────────────────────────────
paper_url: "https://arxiv.org/abs/2402.10885"
source_repo: "hf://katefgroup/3d_diffuser_actor"
# Folded scalar: the indented lines below are joined with spaces into a single
# paragraph, followed by one trailing newline.
description: >
  3D Diffuser Actor (Ke et al., 2024) — a diffusion policy over end-effector
  keyposes fusing multi-view RGB-D into a 3D scene representation, on the RLBench
  PerAct 18-task benchmark. Shares the out-of-process CoppeliaSim/PyRep sidecar
  with the rlbench scene backend (ADR-0062). MIT code + checkpoints. The PerAct
  checkpoint is loaded verbatim; ships three live-verified starter tasks.
# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool palette.
actions:
- "generalist"
- "open"
- "close"
- "pick"
- "place"
# Explicit empty list (a bare `objects:` would parse as null instead).
objects: []
scenes:
- "tabletop"