chore: publish rSkill OpenRAL/rskill-3d-diffuser-actor-rlbench v0.1.0
Browse files- README.md +20 -11
- eval/rlbench.json +46 -51
README.md
CHANGED
|
@@ -120,22 +120,31 @@ externally-provisioned dependency (CLAUDE.md §1.9 / ADR-0061).
|
|
| 120 |
|
| 121 |
## Evaluation
|
| 122 |
|
| 123 |
-
[`eval/rlbench.json`](eval/rlbench.json)
|
| 124 |
-
|
| 125 |
-
`
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
```bash
|
| 133 |
openral benchmark run --suite rlbench --rskill rskills/3d-diffuser-actor-rlbench
|
| 134 |
```
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
## License
|
| 141 |
|
|
|
|
| 120 |
|
| 121 |
## Evaluation
|
| 122 |
|
| 123 |
+
[`eval/rlbench.json`](eval/rlbench.json) is the **full official protocol**
|
| 124 |
+
result (`reproduced_locally: true`), produced by the canonical
|
| 125 |
+
`openral benchmark run` (ADR-0009 PR D) on an 8 GB Ada GPU host (2026-06-20) —
|
| 126 |
+
**25 episodes per task**, seeds 0–24, max 25 macro-keyposes:
|
| 127 |
+
|
| 128 |
+
| Task | Success rate |
|
| 129 |
+
|---|---|
|
| 130 |
+
| `open_drawer` | 22/25 = **0.88** |
|
| 131 |
+
| `meat_off_grill` | 24/25 = **0.96** |
|
| 132 |
+
| `close_jar` | 19/25 = **0.76** |
|
| 133 |
+
| **Average** | **0.867** |
|
| 134 |
+
|
| 135 |
+
(~946 ms mean step latency; this 3-task average is in line with the 3D Diffuser
|
| 136 |
+
Actor paper's ~0.81 average over the 18 RLBench PerAct tasks.) Reproduce with:
|
| 137 |
|
| 138 |
```bash
|
| 139 |
openral benchmark run --suite rlbench --rskill rskills/3d-diffuser-actor-rlbench
|
| 140 |
```
|
| 141 |
|
| 142 |
+
> **Note on variance.** RLBench's sampling-based `EndEffectorPoseViaPlanning`
|
| 143 |
+
> mover is non-deterministic, so per-task rates vary run-to-run; 3 of the 75
|
| 144 |
+
> episodes hit a planner path-failure and are counted as failed episodes (the
|
| 145 |
+
> sidecar handles them gracefully rather than aborting the run — ADR-0061).
|
| 146 |
+
> Per-task paper baselines (Ke et al., 2402.10885, Table 1) are intentionally
|
| 147 |
+
> not transcribed into the artifact to avoid mis-citation.
|
| 148 |
|
| 149 |
## License
|
| 150 |
|
eval/rlbench.json
CHANGED
|
@@ -1,72 +1,67 @@
|
|
| 1 |
{
|
| 2 |
-
"_comment": "Live single-episode verification of 3D Diffuser Actor (katefgroup/3d_diffuser_actor, MIT) on three RLBench PerAct tasks, reproduced locally on an 8 GB Ada GPU host (2026-06-19) via the CoppeliaSim/PyRep + 3DDA py3.10 sidecars (ADR-0061). This is the starter-PR proof, NOT the full official protocol: the canonical RLBench/PerAct/3DDA protocol is 25 evaluation episodes per task (seed 0, max 25 macro-keyposes) — run the full suite to overwrite these blocks (see source.reproduction_planned). Per-task paper baselines are reported in Ke et al. 2402.10885 Table 1 and are intentionally NOT transcribed here to avoid mis-citation.",
|
| 3 |
"schema_version": "0.1",
|
| 4 |
"source": {
|
| 5 |
-
"paper": "
|
| 6 |
-
"arxiv": "https://arxiv.org/abs/
|
| 7 |
-
"model_variant": "
|
| 8 |
-
"evaluated_by": "OpenRAL:
|
| 9 |
"reproduced_locally": true,
|
| 10 |
-
"reproduction_planned":
|
| 11 |
-
"reproduction_cli":
|
| 12 |
-
"description": "ADR-0009 PR D: `openral benchmark run` / `openral benchmark scene` is the canonical producer of RSkillEvalResult JSONs. Requires the externally-provisioned CoppeliaSim 4.1.0 + PyRep + RLBench@peract + 3D Diffuser Actor py3.10 sidecar venv (ADR-0061).",
|
| 13 |
-
"single_scene_example": "openral benchmark scene --config scenes/benchmark/rlbench_open_drawer.yaml --rskill rskills/3d-diffuser-actor-rlbench --n-episodes 1",
|
| 14 |
-
"all_suites": "openral benchmark run --suite rlbench --rskill rskills/3d-diffuser-actor-rlbench",
|
| 15 |
-
"suite_max_steps": 25,
|
| 16 |
-
"notes": [
|
| 17 |
-
"CoppeliaSim is proprietary / free-EDU and is NEVER vendored; provision it yourself per ADR-0061.",
|
| 18 |
-
"The 3D Diffuser Actor checkpoint and code are MIT-licensed — no install-time license guard.",
|
| 19 |
-
"Inference VRAM peak ~0.43 GB; the policy + RLBench scene share one py3.10 ZMQ sidecar.",
|
| 20 |
-
"results below are reproduced_locally=true at n_episodes=1 per task (live verification); flip to the full 25-episode protocol via the all_suites command above."
|
| 21 |
-
]
|
| 22 |
-
},
|
| 23 |
"table": null,
|
| 24 |
"status": "reproduced"
|
| 25 |
},
|
| 26 |
"benchmark": {
|
| 27 |
-
"name": "RLBench",
|
| 28 |
"dataset": null,
|
| 29 |
-
"protocol": "
|
| 30 |
"robot": "franka_panda",
|
| 31 |
-
"simulator": "CoppeliaSim 4.1.0 / PyRep (RLBench@peract
|
| 32 |
},
|
| 33 |
"eval_config": {
|
| 34 |
-
"
|
| 35 |
-
"seeds": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
"success_key": "is_success",
|
| 37 |
"max_steps": 25,
|
| 38 |
"vla_id": "diffuser_actor",
|
| 39 |
-
"weights_uri": "
|
| 40 |
-
"denoising_steps": 100,
|
| 41 |
-
"cameras": ["left_shoulder", "right_shoulder", "wrist", "front"],
|
| 42 |
-
"observation_size": [256, 256],
|
| 43 |
-
"action_dim": 8,
|
| 44 |
-
"inference_vram_gb_peak": 0.43
|
| 45 |
},
|
| 46 |
"results": {
|
| 47 |
-
"rlbench/
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
"mean_keypose_latency_ms": 1006.0
|
| 52 |
-
},
|
| 53 |
-
"rlbench/meat_off_grill": {
|
| 54 |
-
"success_rate": 1.0,
|
| 55 |
-
"n_episodes": 1,
|
| 56 |
-
"keyposes": 5,
|
| 57 |
-
"mean_keypose_latency_ms": 974.0
|
| 58 |
-
},
|
| 59 |
-
"rlbench/close_jar": {
|
| 60 |
-
"success_rate": 1.0,
|
| 61 |
-
"n_episodes": 1,
|
| 62 |
-
"keyposes": 6,
|
| 63 |
-
"mean_keypose_latency_ms": 964.0
|
| 64 |
-
},
|
| 65 |
-
"avg_success_rate": 1.0,
|
| 66 |
"n_tasks": 3,
|
| 67 |
-
"n_episodes_per_task":
|
| 68 |
-
"n_episodes_total":
|
|
|
|
| 69 |
},
|
| 70 |
"baselines": {},
|
| 71 |
"trace_id": null
|
| 72 |
-
}
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"schema_version": "0.1",
|
| 3 |
"source": {
|
| 4 |
+
"paper": "https://arxiv.org/abs/1909.12271",
|
| 5 |
+
"arxiv": "https://arxiv.org/abs/1909.12271",
|
| 6 |
+
"model_variant": "diffuser_actor",
|
| 7 |
+
"evaluated_by": "OpenRAL:openral benchmark run",
|
| 8 |
"reproduced_locally": true,
|
| 9 |
+
"reproduction_planned": null,
|
| 10 |
+
"reproduction_cli": "openral benchmark run --suite rlbench --rskill rskills/3d-diffuser-actor-rlbench",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"table": null,
|
| 12 |
"status": "reproduced"
|
| 13 |
},
|
| 14 |
"benchmark": {
|
| 15 |
+
"name": "RLBench (PerAct 18-task subset)",
|
| 16 |
"dataset": null,
|
| 17 |
+
"protocol": "25 episodes per task, success_key=is_success, max_steps=25",
|
| 18 |
"robot": "franka_panda",
|
| 19 |
+
"simulator": "CoppeliaSim 4.1.0 / PyRep (RLBench@peract)"
|
| 20 |
},
|
| 21 |
"eval_config": {
|
| 22 |
+
"n_episodes": 25,
|
| 23 |
+
"seeds": [
|
| 24 |
+
0,
|
| 25 |
+
1,
|
| 26 |
+
2,
|
| 27 |
+
3,
|
| 28 |
+
4,
|
| 29 |
+
5,
|
| 30 |
+
6,
|
| 31 |
+
7,
|
| 32 |
+
8,
|
| 33 |
+
9,
|
| 34 |
+
10,
|
| 35 |
+
11,
|
| 36 |
+
12,
|
| 37 |
+
13,
|
| 38 |
+
14,
|
| 39 |
+
15,
|
| 40 |
+
16,
|
| 41 |
+
17,
|
| 42 |
+
18,
|
| 43 |
+
19,
|
| 44 |
+
20,
|
| 45 |
+
21,
|
| 46 |
+
22,
|
| 47 |
+
23,
|
| 48 |
+
24
|
| 49 |
+
],
|
| 50 |
"success_key": "is_success",
|
| 51 |
"max_steps": 25,
|
| 52 |
"vla_id": "diffuser_actor",
|
| 53 |
+
"weights_uri": "rskills/3d-diffuser-actor-rlbench"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
},
|
| 55 |
"results": {
|
| 56 |
+
"rlbench/open_drawer_success_rate": 0.88,
|
| 57 |
+
"rlbench/meat_off_grill_success_rate": 0.96,
|
| 58 |
+
"rlbench/close_jar_success_rate": 0.76,
|
| 59 |
+
"avg_success_rate": 0.8666666666666667,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
"n_tasks": 3,
|
| 61 |
+
"n_episodes_per_task": 25,
|
| 62 |
+
"n_episodes_total": 75,
|
| 63 |
+
"mean_step_latency_ms_avg": 945.6086301968047
|
| 64 |
},
|
| 65 |
"baselines": {},
|
| 66 |
"trace_id": null
|
| 67 |
+
}
|