File size: 5,310 Bytes
b0873f1
 
 
 
 
 
 
 
d592f2b
b0873f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d592f2b
b0873f1
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# rSkill manifest — openral packaging format V1 (CLAUDE.md §6.4)
# Wraps: katefgroup/3d_diffuser_actor  (diffuser_actor_peract.pth)
# Paper: Ke et al., 2024 — "3D Diffuser Actor: Policy Diffusion with 3D Scene
#        Representations" (arXiv:2402.10885). RLBench PerAct 18-task setup.
#
# LICENSE: MIT (code + released checkpoints) — commercially permissive. No
# license guard needed (unlike RVT/RVT-2, which are NVIDIA non-commercial).
#
# RUNTIME: auto-managed out-of-process sidecar (ZMQ + msgpack), ADR-0062. The
# policy AND the CoppeliaSim/PyRep RLBench scene run in their own externally-
# provisioned py3.10 venv (CoppeliaSim is proprietary, free-EDU, NEVER vendored).
# The openral adapter (openral_sim.policies.rlbench_3dda) forks
# tools/rlbench_3dda_sidecar.py on first use; user workflow is one command:
#
#   openral benchmark scene --config scenes/benchmark/rlbench_open_drawer.yaml \
#               --rskill rskills/3d-diffuser-actor-rlbench
#
# Verified live on an 8 GB Ada GPU host (2026-06-19): open_drawer 4/4,
# meat_off_grill 3/3, close_jar solved. Inference VRAM peak ~0.43 GB.

# ── Identity ────────────────────────────────────────────────────────────────
# Manifest schema revision. Quoted so it stays a string, not the float 0.1.
schema_version: "0.1"
name: "OpenRAL/rskill-3d-diffuser-actor-rlbench"
# ADR-0060: the benchmark tasks this checkpoint is validated for (gate). The
# released PerAct checkpoint covers all 18 PerAct tasks; we ship + declare the
# three live-verified starter tasks here (the rest are a follow-up).
evaluated_tasks:
  - "rlbench/open_drawer"
  - "rlbench/meat_off_grill"
  - "rlbench/close_jar"
# Release version of this skill package (separate from schema_version).
version: "0.1.0"
# Matches the MIT notice in the file header (code + released checkpoints).
license: "mit"
# NOTE(review): the allowed values for role/kind come from the openral manifest
# schema, which is not visible here — confirm "s1" and "vla" against it.
role: "s1"
kind: "vla"

# ── Policy identity ─────────────────────────────────────────────────────────
model_family: "diffuser_actor"

# ── Compatibility contract ──────────────────────────────────────────────────
embodiment_tags:
  - "franka_panda"

# RLBench renders four fixed cameras (the PerAct set: left_shoulder /
# right_shoulder / wrist / front) and the policy fuses their RGB-D point clouds
# into a 3D scene representation. Those four are supplied by the SCENE backend
# (openral_sim.backends.rlbench), NOT by the robot's real sensor list — so the
# robot-capability gate uses a coarse modality-count requirement ("an RGB-vision
# embodiment") rather than keyed camera1..4 the franka_panda manifest doesn't
# declare. The per-camera 3D fusion happens inside the policy sidecar.
sensors_required:
  - modality: "rgb"
    count: 1
    min_width: 128
    min_height: 128

# The policy emits next-keyframe end-effector poses; RLBench executes each via
# its sampling-based motion planner (EndEffectorPoseViaPlanning). Absolute EE
# pose targets, not deltas.
# NOTE(review): this block declares reference_frame "panda_link0", while the
# action_contract comment in this file describes the action as "world frame" —
# confirm the two frames coincide for the RLBench scene.
actuators_required:
  - kind: "cartesian_pose"
    control_mode_semantics:
      mode: "absolute"
      reference_frame: "panda_link0"

# ── Runtime / weights ───────────────────────────────────────────────────────
runtime: "pytorch"
# Same floor declared for both precisions. The file header reports a measured
# inference VRAM peak of ~0.43 GB, so 2.0 leaves headroom.
min_vram_gb:
  bf16: 2.0
  fp32: 2.0
weights_uri: "hf://katefgroup/3d_diffuser_actor"

# ── Execution semantics ─────────────────────────────────────────────────────
# One macro-keypose per step (the scene's mover plans + executes it). 100 DDIM
# denoising steps per keypose; ~1.2 s/keypose on an 8 GB Ada GPU.
chunk_size: 1
# 3000 ms per chunk, i.e. roughly 2.5x the ~1.2 s/keypose figure quoted above.
latency_budget:
  per_chunk_ms: 3000.0

# ── IO contract ─────────────────────────────────────────────────────────────
# Keyframe action vector of 8 values: [x y z qx qy qz qw gripper_open] (world
# frame). The peract fork's ignore_collisions channel is appended by the scene
# sidecar, which also plans the motion.
action_contract:
  dim: 8
  slots:
    # Indices 0-6 (inclusive): end-effector pose, xyz + xyzw quaternion.
    - range: [0, 6]
      control_mode: "cartesian_pose"
      ee: "panda_hand"
      frame: "panda_link0"
    # Index 7: gripper_open channel.
    - range: [7, 7]
      control_mode: "gripper_position"
      ee: "panda_gripper"

# ── Provenance ──────────────────────────────────────────────────────────────
paper_url: "https://arxiv.org/abs/2402.10885"
source_repo: "hf://katefgroup/3d_diffuser_actor"

# Folded scalar (`>`): line breaks fold to spaces and one trailing newline is
# kept. The em dash below was previously mis-encoded in the stored text.
description: >
  3D Diffuser Actor (Ke et al., 2024) — a diffusion policy over end-effector
  keyposes fusing multi-view RGB-D into a 3D scene representation, on the RLBench
  PerAct 18-task benchmark. Shares the out-of-process CoppeliaSim/PyRep sidecar
  with the rlbench scene backend (ADR-0062). MIT code + checkpoints. The PerAct
  checkpoint is loaded verbatim; ships three live-verified starter tasks.

# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool palette.
actions:
  - "generalist"
  - "open"
  - "close"
  - "pick"
  - "place"
# Explicit empty list (a bare `objects:` would parse as null instead).
objects: []
scenes:
  - "tabletop"