lsnu committed on
Commit
e7d8e79
·
verified ·
1 Parent(s): 63a70c7

2026-03-25 runpod handoff update

Browse files

Upload updated code, tests, environment recreation files, generated proxy datasets, new checkpoints, and raw result artifacts from the 2026-03-25 /workspace runpod session.

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +62 -158
  2. artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt +3 -0
  3. artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt +3 -0
  4. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt +3 -0
  5. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/config_resolved.yaml +153 -0
  6. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/metrics.json +179 -0
  7. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/summary.json +103 -0
  8. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt +3 -0
  9. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/config_resolved.yaml +153 -0
  10. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/metrics.json +297 -0
  11. artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/summary.json +103 -0
  12. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt +3 -0
  13. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/config_resolved.yaml +153 -0
  14. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/metrics.json +179 -0
  15. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/summary.json +103 -0
  16. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt +3 -0
  17. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/config_resolved.yaml +153 -0
  18. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/metrics.json +238 -0
  19. artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/summary.json +103 -0
  20. code/reveal_vla_bimanual/eval/ablations.py +6 -4
  21. code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py +143 -0
  22. code/reveal_vla_bimanual/eval/run_ablations.py +17 -1
  23. code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py +28 -0
  24. code/reveal_vla_bimanual/eval/run_reveal_benchmark.py +141 -22
  25. code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py +60 -4
  26. code/reveal_vla_bimanual/eval/run_teacher_audit.py +115 -0
  27. code/reveal_vla_bimanual/models/action_decoder.py +106 -4
  28. code/reveal_vla_bimanual/models/backbones.py +73 -43
  29. code/reveal_vla_bimanual/models/multiview_fusion.py +9 -1
  30. code/reveal_vla_bimanual/models/observation_memory.py +56 -18
  31. code/reveal_vla_bimanual/models/planner.py +64 -6
  32. code/reveal_vla_bimanual/models/policy.py +84 -7
  33. code/reveal_vla_bimanual/models/reveal_head.py +161 -7
  34. code/reveal_vla_bimanual/models/world_model.py +207 -4
  35. code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh +107 -0
  36. code/reveal_vla_bimanual/sim_reveal/dataset.py +58 -4
  37. code/reveal_vla_bimanual/sim_reveal/procedural_envs.py +296 -9
  38. code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact.yaml +150 -0
  39. code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase.yaml +73 -0
  40. code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial.yaml +150 -0
  41. code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase.yaml +73 -0
  42. code/reveal_vla_bimanual/train/losses.py +46 -6
  43. results/2026-03-25-runpod/README.md +124 -0
  44. results/2026-03-25-runpod/instructions.md +717 -0
  45. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/command.txt +1 -0
  46. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.json +333 -0
  47. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.md +14 -0
  48. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stderr.txt +4 -0
  49. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stdout.txt +334 -0
  50. results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item/command.txt +1 -0
README.md CHANGED
@@ -9,195 +9,99 @@ tags:
9
 
10
  # VLAarchtests
11
 
12
- Bundle uploaded from the `/workspace` runpod session dated `2026-03-25 UTC`.
13
 
14
- ## Top-Level Contents
15
 
16
  - `code/reveal_vla_bimanual/`
17
- - current project code
18
- - `artifacts/outputs/r3d/`
19
- - R3D-VLA proxy checkpoints, benchmarks, diagnostics, RLBench outputs, and PerAct2 smoke artifacts
20
- - `regression/baselines.md`
21
- - locked historical baselines from the downloaded snapshot
22
- - `results/phase_tracking.md`
23
- - phase-by-phase gate accounting and acceptance status
24
  - `tests/`
25
- - unit tests for RGB, RGB-D, planner, memory, world-model, and dataset contracts
26
  - `environment/`
27
- - same-machine setup helpers, env exports, runtime env vars, hardware snapshot, and upstream revision notes
28
- - `MODEL_INDEX.md`
29
- - checkpoint/result index for the current R3D bundle
 
30
 
31
- ## Work Completed In This Bundle
32
 
33
- ### Architecture and Training Changes
34
-
35
- - Added the repo-preserving R3D-VLA refactor with updates in:
36
  - `code/reveal_vla_bimanual/models/backbones.py`
37
  - `code/reveal_vla_bimanual/models/multiview_fusion.py`
 
 
38
  - `code/reveal_vla_bimanual/models/observation_memory.py`
39
  - `code/reveal_vla_bimanual/models/reveal_head.py`
40
  - `code/reveal_vla_bimanual/models/world_model.py`
 
41
  - `code/reveal_vla_bimanual/models/action_decoder.py`
42
  - `code/reveal_vla_bimanual/models/planner.py`
43
- - `code/reveal_vla_bimanual/models/policy.py`
44
  - `code/reveal_vla_bimanual/train/losses.py`
45
- - `code/reveal_vla_bimanual/train/smoke_checks.py`
46
  - `code/reveal_vla_bimanual/sim_reveal/dataset.py`
47
  - `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
48
- - Added new configs:
49
- - `proxy_interaction_r3d_stage1_dummy.yaml`
50
- - `proxy_interaction_r3d_stage1_clip.yaml`
51
- - `proxy_interaction_r3d_stage2_dummy.yaml`
52
- - `proxy_interaction_r3d_stage2_clip.yaml`
53
- - `proxy_interaction_r3d_stage3_clip_rgbd.yaml`
54
- - `proxy_interaction_r3d_ablation_nodepth.yaml`
55
- - `proxy_interaction_r3d_ablation_noplanner.yaml`
56
- - `proxy_interaction_r3d_ablation_nowm.yaml`
57
- - `proxy_interaction_r3d_ablation_norolesym.yaml`
58
-
59
- ### Evaluation and Integration Changes
60
-
61
- - Completed the requested proxy matrix:
62
- - stage 1 dummy, 3 seeds
63
- - stage 1 dummy `no_planner`
64
- - stage 1 dummy `no_role_symmetry`
65
- - stage 2 dummy, 3 seeds
66
- - stage 2 dummy `no_world_model`
67
- - stage 2 dummy `short_history`
68
- - stage 1 clip, 3 seeds
69
- - stage 2 clip, 3 seeds
70
- - stage 3 clip RGB-D, 3 seeds
71
- - stage 3 clip RGB-D `no_depth`
72
- - Completed RLBench integration artifacts:
73
- - import/config smoke
74
- - `open_drawer` launch smoke
75
- - `open_drawer` rollout with JSON output
76
- - Added a dedicated PerAct2 13-task launch smoke harness:
77
- - `code/reveal_vla_bimanual/eval/run_peract2_launch_smoke.py`
78
- - `code/reveal_vla_bimanual/sim_rlbench/launch_smoke.py` now records finite-action checks
79
-
80
- ### Tests
81
-
82
- - Full local test suite result:
83
- - `10 passed`
84
-
85
- ## Same-Machine Setup Files
86
-
87
- - `environment/setup_same_machine.sh`
88
- - `environment/validate_same_machine.sh`
89
- - `environment/run_peract2_13_rollouts.sh`
90
- - `environment/runtime_env_vars.sh`
91
- - `environment/hardware_snapshot.txt`
92
- - `environment/glxinfo_B.txt`
93
- - `environment/upstream_revisions.txt`
94
- - `environment/system_packages_same_machine.txt`
95
- - `environment/rlbench_env_export.yaml`
96
- - `environment/rlbench_env_explicit.txt`
97
- - `environment/rlbench_pip_freeze.txt`
98
- - `environment/reveal_env_export.yaml`
99
- - `environment/reveal_env_explicit.txt`
100
- - `environment/reveal_pip_freeze.txt`
101
-
102
- ## Raw Proxy Benchmark Matrix
103
-
104
- | Run | Seeds | Mean success | foliage | bag | cloth | Reocclusion | Persistence MAE | Disturbance | Planner top-1 | Proposal diversity | Swap error |
105
- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
106
- | stage1 dummy full | `13,14,15` | 0.5787 | 0.4444 | 0.6111 | 0.6806 | 0.0000 | 1.9553 | 0.3649 | 0.2832 | 0.0245 | 0.007680 |
107
- | stage1 dummy `no_planner` | `13,14,15` | 0.5648 | 0.4306 | 0.5972 | 0.6667 | 0.0000 | 1.9599 | 0.3765 | n/a | n/a | n/a |
108
- | stage1 dummy `no_role_symmetry` | `13,14,15` | 0.5833 | 0.4583 | 0.6111 | 0.6806 | 0.0000 | 1.9475 | 0.3547 | n/a | n/a | n/a |
109
- | stage2 dummy full | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0121 | 2.2358 | 0.3148 | 0.3442 | 0.0245 | 0.005036 |
110
- | stage2 dummy `no_world_model` | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0027 | 2.3600 | 0.3287 | n/a | n/a | n/a |
111
- | stage2 dummy `short_history` | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0121 | 2.2349 | 0.3148 | n/a | n/a | n/a |
112
- | stage1 clip full | `7,8,9` | 0.5324 | 0.4306 | 0.5278 | 0.6389 | 0.0244 | 1.3636 | 0.2808 | 0.2676 | 0.0217 | 0.000155 |
113
- | stage2 clip full | `11,12,13` | 0.4954 | 0.3889 | 0.4583 | 0.6389 | 0.0117 | 2.3198 | 0.2722 | 0.2693 | 0.0216 | 0.000186 |
114
- | stage3 clip RGB-D full | `17,18,19` | 0.5741 | 0.4861 | 0.5417 | 0.6944 | 0.0151 | 1.7883 | 0.2258 | 0.3265 | 0.0270 | 0.000094 |
115
- | stage3 clip RGB-D `no_depth` | `17,18,19` | 0.5231 | 0.4167 | 0.4722 | 0.6806 | 0.0198 | 2.0491 | 0.2548 | n/a | n/a | n/a |
116
-
117
- Full artifact roots are indexed in `MODEL_INDEX.md`.
118
 
119
- Note: the `stage2 dummy no_world_model` row above reflects the `2026-03-25` post-fix null-rollout rerun. Pre-fix copies are retained as `reveal_benchmark_pre_null_rollout_fix.json` and `reveal_benchmark_pre_null_rollout_fix.md` under each `benchmark_no_world_model` seed directory.
120
 
121
- ## Raw Training Summaries
 
 
 
122
 
123
- | Run | Mean train time (s) | Mean peak GPU memory (MB) |
124
- | --- | ---: | ---: |
125
- | stage1 dummy full | 20.45 | 629.62 |
126
- | stage2 dummy full | 20.76 | 639.39 |
127
- | stage1 clip full | 156.16 | 1908.92 |
128
- | stage2 clip full | 141.55 | 1902.54 |
129
- | stage3 clip RGB-D full | 145.93 | 1952.12 |
130
-
131
- ## Raw RLBench Outputs
132
-
133
- ### Import And Launch Smokes
134
-
135
- - import/config smoke file:
136
- - `artifacts/outputs/r3d/rlbench_smokes/smoke_test_output.txt`
137
- - `open_drawer` launch smoke files:
138
- - `artifacts/outputs/r3d/rlbench_smokes/launch_smoke_open_drawer.txt`
139
- - `artifacts/outputs/r3d/rlbench_smokes/launch_smoke_open_drawer.stderr`
140
 
141
- Raw values from the current `open_drawer` launch smoke:
 
142
 
143
- | Field | Value |
144
- | --- | --- |
145
- | task | `RightOpenDrawer` |
146
- | headless | `true` |
147
- | front_rgb_shape | `[224, 224, 3]` |
148
- | wrist_left_rgb_shape | `[224, 224, 3]` |
149
- | wrist_right_rgb_shape | `[224, 224, 3]` |
150
- | action_finite | `true` |
151
- | action_dim | `18` |
152
- | reward | `0.0` |
153
- | done | `false` |
154
 
155
- ### Open-Drawer Rollout
 
 
 
156
 
157
- File:
158
 
159
- - `artifacts/outputs/r3d/rlbench_open_drawer_r3d_rollout/rollout_eval.json`
160
 
161
- Raw values:
162
-
163
- | Field | Value |
164
- | --- | --- |
165
- | plan_requested | `true` |
166
- | plan_applied | `true` |
167
- | task_class | `RightOpenDrawer` |
168
- | episodes_per_task | `1` |
169
- | episode_length | `5` |
170
- | mean_success | `0.0` |
171
- | mean_return | `0.0` |
172
- | reset_retries | `[0]` |
173
-
174
- ## Raw PerAct2 Integration Output
175
 
176
- Files:
177
 
178
- - `artifacts/outputs/r3d/peract2_13_launch_smoke/launch_smoke_summary.json`
179
- - `artifacts/outputs/r3d/peract2_13_launch_smoke/launch_smoke_summary.md`
 
 
180
 
181
- Raw values:
182
 
183
- | Field | Value |
184
- | --- | --- |
185
- | task_count | `13` |
186
- | launch_successes | `13` |
187
- | finite_action_tasks | `13` |
188
- | error_tasks | `[]` |
189
- | resolution | `224` |
190
- | headless | `true` |
191
 
192
- Per-task stdout/stderr/command logs are stored under:
193
 
194
- - `artifacts/outputs/r3d/peract2_13_launch_smoke/`
195
 
196
- ## Additional Indexes
197
 
198
- - Historical baseline note:
199
- - `regression/baselines.md`
200
- - Phase-by-phase gate tracking:
201
- - `results/phase_tracking.md`
202
- - Checkpoint and artifact index:
203
- - `MODEL_INDEX.md`
 
 
 
9
 
10
  # VLAarchtests
11
 
12
+ Update uploaded from the `/workspace` runpod session dated `2026-03-25 UTC`.
13
 
14
+ ## Updated Paths
15
 
16
  - `code/reveal_vla_bimanual/`
 
 
 
 
 
 
 
17
  - `tests/`
 
18
  - `environment/`
19
+ - `artifacts/data/reveal_proxy/`
20
+ - `artifacts/outputs/r3d_handoff/`
21
+ - `artifacts/outputs/r3d_handoff_phase/`
22
+ - `results/2026-03-25-runpod/`
23
 
24
+ ## Primary Source Changes
25
 
26
+ - Geometry path and camera-pose propagation updates:
 
 
27
  - `code/reveal_vla_bimanual/models/backbones.py`
28
  - `code/reveal_vla_bimanual/models/multiview_fusion.py`
29
+ - `code/reveal_vla_bimanual/models/policy.py`
30
+ - Spatial memory and world-model updates:
31
  - `code/reveal_vla_bimanual/models/observation_memory.py`
32
  - `code/reveal_vla_bimanual/models/reveal_head.py`
33
  - `code/reveal_vla_bimanual/models/world_model.py`
34
+ - Semantic candidate and planner updates:
35
  - `code/reveal_vla_bimanual/models/action_decoder.py`
36
  - `code/reveal_vla_bimanual/models/planner.py`
37
+ - Loss, dataset, and simulator updates:
38
  - `code/reveal_vla_bimanual/train/losses.py`
 
39
  - `code/reveal_vla_bimanual/sim_reveal/dataset.py`
40
  - `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
41
+ - Evaluation and RLBench tooling updates:
42
+ - `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
43
+ - `code/reveal_vla_bimanual/eval/run_teacher_audit.py`
44
+ - `code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py`
45
+ - `code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py`
46
+ - `code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py`
47
+ - `code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ ## Validation
50
 
51
+ - Test command:
52
+ - `PYTHONPATH=/workspace/VLAarchtests_work/code/reveal_vla_bimanual python -m pytest -q /workspace/VLAarchtests_work/tests`
53
+ - Result:
54
+ - `33 passed`
55
 
56
+ ## Generated Datasets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ - `artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt`
59
+ - `artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt`
60
 
61
+ ## Generated Checkpoints
 
 
 
 
 
 
 
 
 
 
62
 
63
+ - `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/`
64
+ - `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/`
65
+ - `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/`
66
+ - `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/`
67
 
68
+ ## Raw Result Summary
69
 
70
+ ### Proxy Serious Comparisons
71
 
72
+ | File | Reference mean success | Compared mean success |
73
+ | --- | ---: | ---: |
74
+ | `results/2026-03-25-runpod/reports/reveal_handoff_compare_serious/reveal_benchmark.json` | 0.583333 | 0.216667 |
75
+ | `results/2026-03-25-runpod/reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.520000 |
76
+ | `results/2026-03-25-runpod/reports/reveal_phase_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.513333 |
77
+ | `results/2026-03-25-runpod/reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json` | 0.583333 | 0.493333 |
 
 
 
 
 
 
 
 
78
 
79
+ ### Proxy Ablations
80
 
81
+ - Full ablation matrix:
82
+ - `results/2026-03-25-runpod/reports/reveal_phase_ablations_compact/ablations.json`
83
+ - Teacher audit:
84
+ - `results/2026-03-25-runpod/reports/reveal_teacher_audit_serious/teacher_audit.json`
85
 
86
+ ### RLBench
87
 
88
+ | File | Mean success |
89
+ | --- | ---: |
90
+ | `results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/rollout_eval.json` | 0.000000 |
91
+ | `results/2026-03-25-runpod/reports/peract2_spatial_full_ep1/spatial_phase_seed17_noplan_split/rollout_eval.json` | 0.000000 |
92
+ | `results/2026-03-25-runpod/reports/peract2_spatial_full_ep1/spatial_phase_seed17_plan_split/rollout_eval.json` | 0.000000 |
 
 
 
93
 
94
+ ## Detailed Raw Index
95
 
96
+ - `results/2026-03-25-runpod/README.md`
97
 
98
+ ## Environment Recreation
99
 
100
+ - `environment/README.md`
101
+ - `environment/setup_same_machine.sh`
102
+ - `environment/validate_same_machine.sh`
103
+ - `environment/runtime_env_vars.sh`
104
+ - `environment/upstream_revisions.txt`
105
+ - `environment/rlbench_env_export.yaml`
106
+ - `environment/rlbench_env_explicit.txt`
107
+ - `environment/rlbench_pip_freeze.txt`
artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:918679191157abb31b3523be4b69ff7b95da4c373d130dd24a0db1314b57ec19
3
+ size 583377508
artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1b1b90c882067eba271f59430b783c7fda2edbe6f221360a245ef32eef602d1
3
+ size 200844508
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:360037583572613590e99f5f06766729edbecfc4ee2ea0950fe12bb08e562d83
3
+ size 940662478
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/config_resolved.yaml ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies:
9
+ - foliage_proxy
10
+ - bag_proxy
11
+ - cloth_proxy
12
+ resolution: 224
13
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state
14
+ train_episodes_per_proxy: 48
15
+ val_episodes_per_proxy: 16
16
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
17
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
18
+ rebuild_dataset: false
19
+ chunk_horizon: 8
20
+ rollout_horizon: 5
21
+ history_steps: 6
22
+ planner_candidates: 8
23
+ seed: 17
24
+ optim:
25
+ epochs: 3
26
+ batch_size: 4
27
+ num_workers: 24
28
+ lr: 0.0001
29
+ weight_decay: 0.0001
30
+ trainer:
31
+ policy_type: elastic_reveal
32
+ use_bf16: true
33
+ grad_clip_norm: 1.0
34
+ freeze_backbone: true
35
+ gradient_checkpointing: false
36
+ plan_during_train: true
37
+ plan_during_eval: true
38
+ support_mode_conditioning: true
39
+ planner_mode: trainable
40
+ use_depth: true
41
+ use_world_model: true
42
+ use_role_tokens: true
43
+ compute_equivariance_probe: false
44
+ policy:
45
+ backbone:
46
+ model_name: openai/clip-vit-base-patch32
47
+ hidden_dim: 512
48
+ max_text_tokens: 32
49
+ freeze_backbone: true
50
+ gradient_checkpointing: false
51
+ use_dummy_backbone: false
52
+ fusion:
53
+ hidden_dim: 512
54
+ num_cameras: 3
55
+ num_layers: 4
56
+ num_heads: 8
57
+ ff_dim: 2048
58
+ dropout: 0.1
59
+ proprio_dim: 32
60
+ proprio_tokens: 1
61
+ memory:
62
+ hidden_dim: 512
63
+ action_dim: 14
64
+ history_steps: 6
65
+ scene_history_steps: 3
66
+ belief_history_steps: 8
67
+ num_layers: 2
68
+ dropout: 0.1
69
+ memory_bank_size: 4
70
+ scene_bank_size: 2
71
+ belief_bank_size: 2
72
+ num_heads: 8
73
+ max_history_steps: 8
74
+ decoder:
75
+ hidden_dim: 512
76
+ num_heads: 8
77
+ num_layers: 4
78
+ ff_dim: 2048
79
+ dropout: 0.1
80
+ chunk_size: 8
81
+ action_dim: 14
82
+ arm_action_dim: 7
83
+ num_candidates: 8
84
+ num_phases: 5
85
+ num_arm_roles: 4
86
+ num_proposal_modes: 7
87
+ planner_top_k: 4
88
+ reveal_head:
89
+ hidden_dim: 512
90
+ num_support_modes: 3
91
+ num_approach_templates: 32
92
+ rollout_horizon: 5
93
+ belief_map_size: 32
94
+ field_size: 16
95
+ num_heads: 8
96
+ predict_belief_map: true
97
+ num_phases: 5
98
+ num_arm_roles: 4
99
+ num_interaction_tokens: 8
100
+ num_tasks: 4
101
+ world_model:
102
+ hidden_dim: 512
103
+ action_dim: 14
104
+ num_support_modes: 3
105
+ num_approach_templates: 32
106
+ rollout_horizon: 5
107
+ field_size: 16
108
+ num_heads: 8
109
+ num_phases: 5
110
+ num_arm_roles: 4
111
+ num_interaction_tokens: 8
112
+ belief_map_size: 32
113
+ predict_belief_map: true
114
+ scene_bank_size: 2
115
+ belief_bank_size: 2
116
+ rollout_mode: compact_rollout
117
+ num_tasks: 4
118
+ planner:
119
+ hidden_dim: 512
120
+ num_candidates: 8
121
+ action_dim: 14
122
+ num_support_modes: 3
123
+ utility_margin: 0.1
124
+ num_heads: 8
125
+ num_layers: 2
126
+ num_phases: 5
127
+ num_arm_roles: 4
128
+ top_k: 4
129
+ loss_weights:
130
+ action: 1.0
131
+ phase: 0.05
132
+ arm_role: 0.1
133
+ support_mode: 0.1
134
+ corridor: 0.12
135
+ persistence: 0.06
136
+ disturbance: 0.06
137
+ world_model: 0.2
138
+ belief: 0.05
139
+ visibility: 0.05
140
+ clearance: 0.06
141
+ support_stability: 0.06
142
+ reocclusion: 0.06
143
+ occluder_contact: 0.05
144
+ grasp_affordance: 0.05
145
+ planner_success: 0.2
146
+ planner_risk: 0.08
147
+ planner_ranking: 0.2
148
+ proposal_reconstruction: 0.08
149
+ proposal_success: 0.12
150
+ proposal_ranking: 0.15
151
+ proposal_diversity: 0.05
152
+ role_swap_consistency: 0.02
153
+ task_metrics: 0.05
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/metrics.json ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "epoch": 0,
4
+ "train": {
5
+ "action": 0.01788575194874092,
6
+ "arm_role": 0.001019889743704545,
7
+ "belief": 0.12027505389169643,
8
+ "clearance": 0.08003731069988326,
9
+ "corridor": 0.2066852554580883,
10
+ "disturbance": 0.002048209719176061,
11
+ "grasp_affordance": 0.009688427269850907,
12
+ "occluder_contact": 0.21002381004785237,
13
+ "persistence": 0.6925194860485039,
14
+ "phase": 0.2873851528293208,
15
+ "planner_ranking": 0.007970526718954237,
16
+ "planner_risk": 0.022958766870004567,
17
+ "planner_success": 0.11003280751603214,
18
+ "proposal_diversity": 0.0,
19
+ "proposal_ranking": 0.3698378854676297,
20
+ "proposal_reconstruction": 0.10649478184549432,
21
+ "proposal_success": 0.3088213802952515,
22
+ "reocclusion": 0.23717812233065305,
23
+ "role_swap_consistency": 0.0,
24
+ "support_mode": 0.024943622789884868,
25
+ "support_stability": 0.12860428792865652,
26
+ "task_metrics": 0.16847629100084305,
27
+ "total": 0.7690052716355574,
28
+ "uncertainty": 8.382000443433706e-05,
29
+ "visibility": 0.11110214183205053,
30
+ "world_model": 2.4172904924342506
31
+ },
32
+ "val": {
33
+ "action": 0.01380122694271532,
34
+ "arm_role": 0.0007760834101425258,
35
+ "belief": 0.10585840746308818,
36
+ "clearance": 0.07610336713718646,
37
+ "corridor": 0.20833940104101645,
38
+ "disturbance": 0.001970026997503627,
39
+ "grasp_affordance": 0.009207394397394224,
40
+ "occluder_contact": 0.20593324529402185,
41
+ "persistence": 0.9972314130176197,
42
+ "phase": 0.32413111886743345,
43
+ "planner_ranking": 0.00022911480162733687,
44
+ "planner_risk": 0.01705723936020425,
45
+ "planner_success": 0.01417768012845155,
46
+ "proposal_diversity": 0.0,
47
+ "proposal_ranking": 0.148933302498225,
48
+ "proposal_reconstruction": 0.08428755696072723,
49
+ "proposal_success": 0.1355827044356953,
50
+ "reocclusion": 0.3386235964117628,
51
+ "role_swap_consistency": 0.0,
52
+ "support_mode": 0.00921160001025507,
53
+ "support_stability": 0.13572556762532753,
54
+ "task_metrics": 0.14731343361464413,
55
+ "total": 0.5933836915276267,
56
+ "uncertainty": 6.096601287991331e-05,
57
+ "visibility": 0.0983928834850138,
58
+ "world_model": 1.8323120962489734
59
+ }
60
+ },
61
+ {
62
+ "epoch": 1,
63
+ "train": {
64
+ "action": 0.011145903746058282,
65
+ "arm_role": 0.004716875678614566,
66
+ "belief": 0.10514058776591953,
67
+ "clearance": 0.07779615305756268,
68
+ "corridor": 0.20047297412529588,
69
+ "disturbance": 0.0022256996764458323,
70
+ "grasp_affordance": 0.009927417171236717,
71
+ "occluder_contact": 0.20330693313949985,
72
+ "persistence": 0.6114087605882608,
73
+ "phase": 0.21543656336633782,
74
+ "planner_ranking": 0.00018106158740920363,
75
+ "planner_risk": 0.015639749421787107,
76
+ "planner_success": 0.007014186491601561,
77
+ "proposal_diversity": 0.0,
78
+ "proposal_ranking": 0.08341658792801593,
79
+ "proposal_reconstruction": 0.07808616882876346,
80
+ "proposal_success": 0.08362671854464632,
81
+ "reocclusion": 0.21602793348659027,
82
+ "role_swap_consistency": 0.0,
83
+ "support_mode": 0.002888570647490652,
84
+ "support_stability": 0.1253421003685186,
85
+ "task_metrics": 0.14696427873874965,
86
+ "total": 0.4685811519622803,
87
+ "uncertainty": 3.833678546901578e-05,
88
+ "visibility": 0.09538611636350029,
89
+ "world_model": 1.5017830422050074
90
+ },
91
+ "val": {
92
+ "action": 0.011654359860007058,
93
+ "arm_role": 0.0034928608396457453,
94
+ "belief": 0.09692509336905046,
95
+ "clearance": 0.07511017166755417,
96
+ "corridor": 0.19370697032321582,
97
+ "disturbance": 0.0026899561648447575,
98
+ "grasp_affordance": 0.0108991796574132,
99
+ "occluder_contact": 0.20079099861058322,
100
+ "persistence": 0.7645651453146429,
101
+ "phase": 0.3479848448751551,
102
+ "planner_ranking": 7.394229859611104e-05,
103
+ "planner_risk": 0.015624796357852492,
104
+ "planner_success": 0.004826839748685333,
105
+ "proposal_diversity": 0.0,
106
+ "proposal_ranking": 0.10358103387283557,
107
+ "proposal_reconstruction": 0.0756011808460409,
108
+ "proposal_success": 0.07432993885242578,
109
+ "reocclusion": 0.2191494649106806,
110
+ "role_swap_consistency": 0.0,
111
+ "support_mode": 0.0027684949190271172,
112
+ "support_stability": 0.1332334725919998,
113
+ "task_metrics": 0.1422290007273356,
114
+ "total": 0.4283526196624293,
115
+ "uncertainty": 6.312012101950654e-05,
116
+ "visibility": 0.09116258178696487,
117
+ "world_model": 1.2195359631018206
118
+ }
119
+ },
120
+ {
121
+ "epoch": 2,
122
+ "train": {
123
+ "action": 0.009001491786176829,
124
+ "arm_role": 0.0031435395541944003,
125
+ "belief": 0.09910694141136972,
126
+ "clearance": 0.0763780973459545,
127
+ "corridor": 0.1971655124798417,
128
+ "disturbance": 0.0022866554429607565,
129
+ "grasp_affordance": 0.010611724677054506,
130
+ "occluder_contact": 0.19787568214692566,
131
+ "persistence": 0.6391040890734171,
132
+ "phase": 0.2374798740211286,
133
+ "planner_ranking": 0.00014213861397461427,
134
+ "planner_risk": 0.011339436628316579,
135
+ "planner_success": 0.002072299244249926,
136
+ "proposal_diversity": 0.0,
137
+ "proposal_ranking": 0.06418653756478115,
138
+ "proposal_reconstruction": 0.07331816969733489,
139
+ "proposal_success": 0.04838304229472813,
140
+ "reocclusion": 0.21607661038931264,
141
+ "role_swap_consistency": 0.0,
142
+ "support_mode": 0.0016624049136513158,
143
+ "support_stability": 0.12395783413789774,
144
+ "task_metrics": 0.14624216885943161,
145
+ "total": 0.38278010518927325,
146
+ "uncertainty": 2.0790473172083742e-05,
147
+ "visibility": 0.09234962655525458,
148
+ "world_model": 1.1216850475261086
149
+ },
150
+ "val": {
151
+ "action": 0.010441844330423257,
152
+ "arm_role": 0.0016049532427904055,
153
+ "belief": 0.09470506825230339,
154
+ "clearance": 0.07375384743014972,
155
+ "corridor": 0.19222540205175226,
156
+ "disturbance": 0.0019139318584083494,
157
+ "grasp_affordance": 0.010620580760366989,
158
+ "occluder_contact": 0.1952310868284919,
159
+ "persistence": 0.6828906978621627,
160
+ "phase": 0.2299347263901974,
161
+ "planner_ranking": 0.00015113159439682602,
162
+ "planner_risk": 0.008906871584864954,
163
+ "planner_success": 0.0014728186387484047,
164
+ "proposal_diversity": 0.0,
165
+ "proposal_ranking": 0.08296609080086152,
166
+ "proposal_reconstruction": 0.07454461446314146,
167
+ "proposal_success": 0.05527880562074257,
168
+ "reocclusion": 0.21840402348475021,
169
+ "role_swap_consistency": 0.0,
170
+ "support_mode": 0.0009884886958664565,
171
+ "support_stability": 0.13373579051006923,
172
+ "task_metrics": 0.14126789863362457,
173
+ "total": 0.3722763684662906,
174
+ "uncertainty": 3.5526475199510585e-05,
175
+ "visibility": 0.08942856323538405,
176
+ "world_model": 1.038636032379035
177
+ }
178
+ }
179
+ ]
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/summary.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17",
3
+ "device": "cuda",
4
+ "best_checkpoint": "/workspace/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt",
5
+ "final_train_total": 0.38278010518927325,
6
+ "final_val_total": 0.3722763684662906,
7
+ "train_time_sec": 108.92269134521484,
8
+ "peak_gpu_memory_mb": 2451.3857421875,
9
+ "num_train_samples": 380,
10
+ "num_val_samples": 131,
11
+ "planner_mode": "trainable",
12
+ "frozen_modules": [],
13
+ "init_info": {
14
+ "path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
15
+ "loaded_keys": 828,
16
+ "skipped_shape_mismatch_keys": [
17
+ "decoder.proposal_mode_head.3.weight",
18
+ "decoder.proposal_mode_head.3.bias",
19
+ "decoder.proposal_mode_embeddings.weight"
20
+ ],
21
+ "missing_keys": [
22
+ "decoder.task_embedding.weight",
23
+ "decoder.proposal_mode_head.3.weight",
24
+ "decoder.proposal_mode_head.3.bias",
25
+ "decoder.proposal_mode_embeddings.weight",
26
+ "decoder.mode_residual_heads.6.0.weight",
27
+ "decoder.mode_residual_heads.6.0.bias",
28
+ "decoder.mode_residual_heads.6.1.weight",
29
+ "decoder.mode_residual_heads.6.1.bias",
30
+ "decoder.mode_residual_heads.6.3.weight",
31
+ "decoder.mode_residual_heads.6.3.bias",
32
+ "elastic_state_head.decoder.task_embedding.weight",
33
+ "elastic_state_head.decoder.task_field_affine.weight",
34
+ "elastic_state_head.decoder.task_field_affine.bias",
35
+ "elastic_state_head.decoder.task_summary_adapter.0.weight",
36
+ "elastic_state_head.decoder.task_summary_adapter.0.bias",
37
+ "elastic_state_head.decoder.task_summary_adapter.1.weight",
38
+ "elastic_state_head.decoder.task_summary_adapter.1.bias",
39
+ "elastic_state_head.decoder.task_phase_head.weight",
40
+ "elastic_state_head.decoder.task_phase_head.bias",
41
+ "elastic_state_head.decoder.task_support_head.weight",
42
+ "elastic_state_head.decoder.task_support_head.bias",
43
+ "elastic_state_head.decoder.task_reocclusion_head.weight",
44
+ "elastic_state_head.decoder.task_reocclusion_head.bias",
45
+ "elastic_state_head.decoder.task_metric_head.0.weight",
46
+ "elastic_state_head.decoder.task_metric_head.0.bias",
47
+ "elastic_state_head.decoder.task_metric_head.1.weight",
48
+ "elastic_state_head.decoder.task_metric_head.1.bias",
49
+ "elastic_state_head.decoder.task_metric_head.3.weight",
50
+ "elastic_state_head.decoder.task_metric_head.3.bias",
51
+ "world_model.task_embedding.weight",
52
+ "world_model.spatial_field_encoder.0.weight",
53
+ "world_model.spatial_field_encoder.0.bias",
54
+ "world_model.spatial_field_encoder.2.weight",
55
+ "world_model.spatial_field_encoder.2.bias",
56
+ "world_model.spatial_context_proj.0.weight",
57
+ "world_model.spatial_context_proj.0.bias",
58
+ "world_model.spatial_context_proj.1.weight",
59
+ "world_model.spatial_context_proj.1.bias",
60
+ "world_model.spatial_gate_z.weight",
61
+ "world_model.spatial_gate_z.bias",
62
+ "world_model.spatial_gate_r.weight",
63
+ "world_model.spatial_gate_r.bias",
64
+ "world_model.spatial_candidate.weight",
65
+ "world_model.spatial_candidate.bias",
66
+ "world_model.spatial_summary_proj.0.weight",
67
+ "world_model.spatial_summary_proj.0.bias",
68
+ "world_model.spatial_summary_proj.1.weight",
69
+ "world_model.spatial_summary_proj.1.bias",
70
+ "world_model.spatial_phase_head.weight",
71
+ "world_model.spatial_phase_head.bias",
72
+ "world_model.spatial_support_mode_head.weight",
73
+ "world_model.spatial_support_mode_head.bias",
74
+ "world_model.spatial_arm_role_head.weight",
75
+ "world_model.spatial_arm_role_head.bias",
76
+ "world_model.spatial_reocclusion_head.weight",
77
+ "world_model.spatial_reocclusion_head.bias",
78
+ "world_model.spatial_target_belief_head.weight",
79
+ "world_model.spatial_target_belief_head.bias",
80
+ "world_model.spatial_visibility_head.weight",
81
+ "world_model.spatial_visibility_head.bias",
82
+ "world_model.spatial_clearance_head.weight",
83
+ "world_model.spatial_clearance_head.bias",
84
+ "world_model.spatial_occluder_contact_head.weight",
85
+ "world_model.spatial_occluder_contact_head.bias",
86
+ "world_model.spatial_grasp_affordance_head.weight",
87
+ "world_model.spatial_grasp_affordance_head.bias",
88
+ "world_model.spatial_support_stability_head.weight",
89
+ "world_model.spatial_support_stability_head.bias",
90
+ "world_model.spatial_persistence_head.weight",
91
+ "world_model.spatial_persistence_head.bias",
92
+ "world_model.spatial_reocclusion_field_head.weight",
93
+ "world_model.spatial_reocclusion_field_head.bias",
94
+ "world_model.spatial_disturbance_head.weight",
95
+ "world_model.spatial_disturbance_head.bias",
96
+ "world_model.spatial_uncertainty_head.weight",
97
+ "world_model.spatial_uncertainty_head.bias",
98
+ "world_model.spatial_access_head.weight",
99
+ "world_model.spatial_access_head.bias"
100
+ ],
101
+ "unexpected_keys": []
102
+ }
103
+ }
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1bf52506b13c794b6d8f6f4738294947703c16c6a2c3b46dc8ea68fd14e0c12
3
+ size 940663118
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/config_resolved.yaml ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies:
9
+ - foliage_proxy
10
+ - bag_proxy
11
+ - cloth_proxy
12
+ resolution: 224
13
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state
14
+ train_episodes_per_proxy: 48
15
+ val_episodes_per_proxy: 16
16
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
17
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
18
+ rebuild_dataset: false
19
+ chunk_horizon: 8
20
+ rollout_horizon: 5
21
+ history_steps: 6
22
+ planner_candidates: 8
23
+ seed: 17
24
+ optim:
25
+ epochs: 5
26
+ batch_size: 4
27
+ num_workers: 24
28
+ lr: 0.00015
29
+ weight_decay: 0.0001
30
+ trainer:
31
+ policy_type: elastic_reveal
32
+ use_bf16: true
33
+ grad_clip_norm: 1.0
34
+ freeze_backbone: true
35
+ gradient_checkpointing: false
36
+ plan_during_train: true
37
+ plan_during_eval: true
38
+ support_mode_conditioning: true
39
+ planner_mode: trainable
40
+ use_depth: true
41
+ use_world_model: true
42
+ use_role_tokens: true
43
+ compute_equivariance_probe: false
44
+ policy:
45
+ backbone:
46
+ model_name: openai/clip-vit-base-patch32
47
+ hidden_dim: 512
48
+ max_text_tokens: 32
49
+ freeze_backbone: true
50
+ gradient_checkpointing: false
51
+ use_dummy_backbone: false
52
+ fusion:
53
+ hidden_dim: 512
54
+ num_cameras: 3
55
+ num_layers: 4
56
+ num_heads: 8
57
+ ff_dim: 2048
58
+ dropout: 0.1
59
+ proprio_dim: 32
60
+ proprio_tokens: 1
61
+ memory:
62
+ hidden_dim: 512
63
+ action_dim: 14
64
+ history_steps: 6
65
+ scene_history_steps: 3
66
+ belief_history_steps: 8
67
+ num_layers: 2
68
+ dropout: 0.1
69
+ memory_bank_size: 4
70
+ scene_bank_size: 2
71
+ belief_bank_size: 2
72
+ num_heads: 8
73
+ max_history_steps: 8
74
+ decoder:
75
+ hidden_dim: 512
76
+ num_heads: 8
77
+ num_layers: 4
78
+ ff_dim: 2048
79
+ dropout: 0.1
80
+ chunk_size: 8
81
+ action_dim: 14
82
+ arm_action_dim: 7
83
+ num_candidates: 8
84
+ num_phases: 5
85
+ num_arm_roles: 4
86
+ num_proposal_modes: 7
87
+ planner_top_k: 4
88
+ reveal_head:
89
+ hidden_dim: 512
90
+ num_support_modes: 3
91
+ num_approach_templates: 32
92
+ rollout_horizon: 5
93
+ belief_map_size: 32
94
+ field_size: 16
95
+ num_heads: 8
96
+ predict_belief_map: true
97
+ num_phases: 5
98
+ num_arm_roles: 4
99
+ num_interaction_tokens: 8
100
+ num_tasks: 4
101
+ world_model:
102
+ hidden_dim: 512
103
+ action_dim: 14
104
+ num_support_modes: 3
105
+ num_approach_templates: 32
106
+ rollout_horizon: 5
107
+ field_size: 16
108
+ num_heads: 8
109
+ num_phases: 5
110
+ num_arm_roles: 4
111
+ num_interaction_tokens: 8
112
+ belief_map_size: 32
113
+ predict_belief_map: true
114
+ scene_bank_size: 2
115
+ belief_bank_size: 2
116
+ rollout_mode: spatial_rollout
117
+ num_tasks: 4
118
+ planner:
119
+ hidden_dim: 512
120
+ num_candidates: 8
121
+ action_dim: 14
122
+ num_support_modes: 3
123
+ utility_margin: 0.1
124
+ num_heads: 8
125
+ num_layers: 2
126
+ num_phases: 5
127
+ num_arm_roles: 4
128
+ top_k: 4
129
+ loss_weights:
130
+ action: 0.6
131
+ phase: 0.05
132
+ arm_role: 0.1
133
+ support_mode: 0.1
134
+ corridor: 0.15
135
+ persistence: 0.08
136
+ disturbance: 0.08
137
+ world_model: 0.35
138
+ belief: 0.05
139
+ visibility: 0.05
140
+ clearance: 0.08
141
+ support_stability: 0.08
142
+ reocclusion: 0.08
143
+ occluder_contact: 0.05
144
+ grasp_affordance: 0.05
145
+ planner_success: 0.25
146
+ planner_risk: 0.1
147
+ planner_ranking: 0.25
148
+ proposal_reconstruction: 0.05
149
+ proposal_success: 0.2
150
+ proposal_ranking: 0.25
151
+ proposal_diversity: 0.05
152
+ role_swap_consistency: 0.02
153
+ task_metrics: 0.1
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/metrics.json ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "epoch": 0,
4
+ "train": {
5
+ "action": 0.01881810994328637,
6
+ "arm_role": 2.660249408922697e-07,
7
+ "belief": 0.11447437676159959,
8
+ "clearance": 0.08659498787632114,
9
+ "corridor": 0.2168508012632006,
10
+ "disturbance": 0.003011604138699017,
11
+ "grasp_affordance": 0.026647591468338904,
12
+ "occluder_contact": 0.23407171917589087,
13
+ "persistence": 0.715272118206332,
14
+ "phase": 0.3055733912869504,
15
+ "planner_ranking": 0.022415388478637062,
16
+ "planner_risk": 0.023148000005044436,
17
+ "planner_success": 0.10891202024527286,
18
+ "proposal_diversity": 0.0,
19
+ "proposal_ranking": 0.29254391558076204,
20
+ "proposal_reconstruction": 0.10000501573085785,
21
+ "proposal_success": 0.24436878922738528,
22
+ "reocclusion": 0.23760041894489212,
23
+ "role_swap_consistency": 0.0,
24
+ "support_mode": 0.029171819122214067,
25
+ "support_stability": 0.13612447682964174,
26
+ "task_metrics": 0.15792004442528673,
27
+ "total": 1.1042629627805007,
28
+ "uncertainty": 0.0002569015418885101,
29
+ "visibility": 0.11963542550802231,
30
+ "world_model": 2.1293361720285917
31
+ },
32
+ "val": {
33
+ "action": 0.01775987928902561,
34
+ "arm_role": 1.9868213740892315e-08,
35
+ "belief": 0.1041581260435509,
36
+ "clearance": 0.07728264966245854,
37
+ "corridor": 0.2031804034204194,
38
+ "disturbance": 0.0017973093819102469,
39
+ "grasp_affordance": 0.029909261431770796,
40
+ "occluder_contact": 0.23528439167774085,
41
+ "persistence": 0.775493811025764,
42
+ "phase": 0.3234691350636157,
43
+ "planner_ranking": 0.0003446909185647724,
44
+ "planner_risk": 0.01719488257147146,
45
+ "planner_success": 0.00382949538867582,
46
+ "proposal_diversity": 0.0,
47
+ "proposal_ranking": 0.12049920004651402,
48
+ "proposal_reconstruction": 0.07953478553981493,
49
+ "proposal_success": 0.08874417897878271,
50
+ "reocclusion": 0.2280347410476569,
51
+ "role_swap_consistency": 0.0,
52
+ "support_mode": 0.0012850317253844078,
53
+ "support_stability": 0.14258646694096652,
54
+ "task_metrics": 0.14477597267338724,
55
+ "total": 0.5612193951101014,
56
+ "uncertainty": 5.167457964654948e-05,
57
+ "visibility": 0.09895570508458397,
58
+ "world_model": 0.8950341885740106
59
+ }
60
+ },
61
+ {
62
+ "epoch": 1,
63
+ "train": {
64
+ "action": 0.011799810049859317,
65
+ "arm_role": 2.227331462659334e-08,
66
+ "belief": 0.10664581923108352,
67
+ "clearance": 0.08073694133444836,
68
+ "corridor": 0.20818061490886305,
69
+ "disturbance": 0.0034403698920198763,
70
+ "grasp_affordance": 0.02138785579682965,
71
+ "occluder_contact": 0.22385836949473933,
72
+ "persistence": 0.6393873089823358,
73
+ "phase": 0.23239254700510126,
74
+ "planner_ranking": 7.025458191932151e-05,
75
+ "planner_risk": 0.014862261340022087,
76
+ "planner_success": 0.0018582946357415303,
77
+ "proposal_diversity": 0.0,
78
+ "proposal_ranking": 0.07140745759304416,
79
+ "proposal_reconstruction": 0.07342678826106222,
80
+ "proposal_success": 0.052891033456513754,
81
+ "reocclusion": 0.22496669194415997,
82
+ "role_swap_consistency": 0.0,
83
+ "support_mode": 0.0026235160074735944,
84
+ "support_stability": 0.129776736858644,
85
+ "task_metrics": 0.14390431940555573,
86
+ "total": 0.5859858243088973,
87
+ "uncertainty": 4.795166801757564e-05,
88
+ "visibility": 0.0989507636741588,
89
+ "world_model": 1.0815125898311013
90
+ },
91
+ "val": {
92
+ "action": 0.012531062933813893,
93
+ "arm_role": 0.0,
94
+ "belief": 0.1047917457692551,
95
+ "clearance": 0.08090435149091663,
96
+ "corridor": 0.20609694809624643,
97
+ "disturbance": 0.004269244044487344,
98
+ "grasp_affordance": 0.03268951613625342,
99
+ "occluder_contact": 0.2295533585729021,
100
+ "persistence": 1.1918026357889175,
101
+ "phase": 0.28549350922306377,
102
+ "planner_ranking": 6.612739407914474e-05,
103
+ "planner_risk": 0.008759455501355908,
104
+ "planner_success": 0.00080455597895762,
105
+ "proposal_diversity": 0.0,
106
+ "proposal_ranking": 0.1058380516866843,
107
+ "proposal_reconstruction": 0.07328326593745839,
108
+ "proposal_success": 0.058020667765628205,
109
+ "reocclusion": 0.33030271168911096,
110
+ "role_swap_consistency": 0.0,
111
+ "support_mode": 0.007980566662312909,
112
+ "support_stability": 0.13863918806115785,
113
+ "task_metrics": 0.14007962495088577,
114
+ "total": 0.5981471421140613,
115
+ "uncertainty": 4.717415575242106e-05,
116
+ "visibility": 0.09760378188256061,
117
+ "world_model": 0.9283028335282297
118
+ }
119
+ },
120
+ {
121
+ "epoch": 2,
122
+ "train": {
123
+ "action": 0.01041558725563319,
124
+ "arm_role": 8.783842387952302e-09,
125
+ "belief": 0.1059942942700888,
126
+ "clearance": 0.08004858184017632,
127
+ "corridor": 0.20489231364703492,
128
+ "disturbance": 0.0035354765677383464,
129
+ "grasp_affordance": 0.0182398099795376,
130
+ "occluder_contact": 0.2177388886087819,
131
+ "persistence": 0.7130741648352629,
132
+ "phase": 0.2232393227125469,
133
+ "planner_ranking": 9.544059870988445e-05,
134
+ "planner_risk": 0.007491795662300367,
135
+ "planner_success": 0.0006995215439115111,
136
+ "proposal_diversity": 0.0,
137
+ "proposal_ranking": 0.06466802976731408,
138
+ "proposal_reconstruction": 0.07109034053589168,
139
+ "proposal_success": 0.03924152674643617,
140
+ "reocclusion": 0.24476394773840807,
141
+ "role_swap_consistency": 0.0,
142
+ "support_mode": 0.0035586046545129073,
143
+ "support_stability": 0.1275901448373732,
144
+ "task_metrics": 0.14374801657701794,
145
+ "total": 0.5572637532886706,
146
+ "uncertainty": 3.1927780274169266e-05,
147
+ "visibility": 0.09604649849628147,
148
+ "world_model": 1.0012797054491545
149
+ },
150
+ "val": {
151
+ "action": 0.01250085191603637,
152
+ "arm_role": 1.9868213740892315e-08,
153
+ "belief": 0.10097009620883247,
154
+ "clearance": 0.07564825914574391,
155
+ "corridor": 0.19414961970213687,
156
+ "disturbance": 0.0028900962097946886,
157
+ "grasp_affordance": 0.014792599320185907,
158
+ "occluder_contact": 0.21100006365414822,
159
+ "persistence": 0.7430237780014673,
160
+ "phase": 0.21708492669418003,
161
+ "planner_ranking": 4.6006352427026066e-05,
162
+ "planner_risk": 0.004453302675039705,
163
+ "planner_success": 0.0002918489108472413,
164
+ "proposal_diversity": 0.0,
165
+ "proposal_ranking": 0.0847416734464015,
166
+ "proposal_reconstruction": 0.07325861490134036,
167
+ "proposal_success": 0.04800050914513342,
168
+ "reocclusion": 0.21185640358563626,
169
+ "role_swap_consistency": 0.0,
170
+ "support_mode": 0.0001279175175472064,
171
+ "support_stability": 0.1355795610808965,
172
+ "task_metrics": 0.13995845525553732,
173
+ "total": 0.5864624706181613,
174
+ "uncertainty": 3.328838372660605e-05,
175
+ "visibility": 0.0915211413168546,
176
+ "world_model": 1.0730400555061572
177
+ }
178
+ },
179
+ {
180
+ "epoch": 3,
181
+ "train": {
182
+ "action": 0.009735075987287258,
183
+ "arm_role": 6.901590447676809e-09,
184
+ "belief": 0.10281775146722794,
185
+ "clearance": 0.07848918745784383,
186
+ "corridor": 0.20201588633813355,
187
+ "disturbance": 0.0031908015084091425,
188
+ "grasp_affordance": 0.015698057898369276,
189
+ "occluder_contact": 0.2077378544368242,
190
+ "persistence": 0.6618055252505368,
191
+ "phase": 0.2315950472103922,
192
+ "planner_ranking": 4.5244067302076614e-05,
193
+ "planner_risk": 0.004139781545381993,
194
+ "planner_success": 0.00024409074271955575,
195
+ "proposal_diversity": 0.0,
196
+ "proposal_ranking": 0.054642489266361255,
197
+ "proposal_reconstruction": 0.0706897854020721,
198
+ "proposal_success": 0.02984069204448085,
199
+ "reocclusion": 0.21985463225527813,
200
+ "role_swap_consistency": 0.0,
201
+ "support_mode": 0.00018626294637981214,
202
+ "support_stability": 0.12553868676094632,
203
+ "task_metrics": 0.14241597934773093,
204
+ "total": 0.5318458660652763,
205
+ "uncertainty": 2.4771544117715837e-05,
206
+ "visibility": 0.09392199994702088,
207
+ "world_model": 0.9659816164719431
208
+ },
209
+ "val": {
210
+ "action": 0.010274204266765579,
211
+ "arm_role": 0.0,
212
+ "belief": 0.09872564831466386,
213
+ "clearance": 0.07471577507076842,
214
+ "corridor": 0.19408509180401312,
215
+ "disturbance": 0.00259435343862909,
216
+ "grasp_affordance": 0.012839911272749305,
217
+ "occluder_contact": 0.20571506023406982,
218
+ "persistence": 0.7026729972073527,
219
+ "phase": 0.26675827372254746,
220
+ "planner_ranking": 4.497824862124672e-05,
221
+ "planner_risk": 0.002781865681754425,
222
+ "planner_success": 0.00015440414228413084,
223
+ "proposal_diversity": 0.0,
224
+ "proposal_ranking": 0.07663222694783613,
225
+ "proposal_reconstruction": 0.07091993872414935,
226
+ "proposal_success": 0.03695757263763384,
227
+ "reocclusion": 0.21823022356539062,
228
+ "role_swap_consistency": 0.0,
229
+ "support_mode": 0.00015865708998965354,
230
+ "support_stability": 0.13456520460771793,
231
+ "task_metrics": 0.1404704556546428,
232
+ "total": 0.5019133334810083,
233
+ "uncertainty": 1.7504166497460435e-05,
234
+ "visibility": 0.09078080579638481,
235
+ "world_model": 0.8508174094286832
236
+ }
237
+ },
238
+ {
239
+ "epoch": 4,
240
+ "train": {
241
+ "action": 0.00924236060757386,
242
+ "arm_role": 3.1370865671258223e-10,
243
+ "belief": 0.10063675104787476,
244
+ "clearance": 0.07763459076614757,
245
+ "corridor": 0.1999763826496507,
246
+ "disturbance": 0.0032564817505006337,
247
+ "grasp_affordance": 0.015768864574401003,
248
+ "occluder_contact": 0.20453226503572966,
249
+ "persistence": 0.6381541584980656,
250
+ "phase": 0.23467233871158802,
251
+ "planner_ranking": 0.002148842357724178,
252
+ "planner_risk": 0.005933802830986679,
253
+ "planner_success": 0.0012002266089487085,
254
+ "proposal_diversity": 0.0,
255
+ "proposal_ranking": 0.04519814905391908,
256
+ "proposal_reconstruction": 0.07035028267847865,
257
+ "proposal_success": 0.02132791725330447,
258
+ "reocclusion": 0.21220772236181226,
259
+ "role_swap_consistency": 0.0,
260
+ "support_mode": 0.00017794872585095856,
261
+ "support_stability": 0.12488894654732001,
262
+ "task_metrics": 0.14187381208727234,
263
+ "total": 0.5180026358679721,
264
+ "uncertainty": 1.678193236126677e-05,
265
+ "visibility": 0.09251636654922837,
266
+ "world_model": 0.9452698754636865
267
+ },
268
+ "val": {
269
+ "action": 0.009614509682058158,
270
+ "arm_role": 9.934106870446158e-09,
271
+ "belief": 0.0954468369935498,
272
+ "clearance": 0.07359647931474628,
273
+ "corridor": 0.19544327168753653,
274
+ "disturbance": 0.004919912796388168,
275
+ "grasp_affordance": 0.01787316725786888,
276
+ "occluder_contact": 0.2034355541973403,
277
+ "persistence": 0.8611267923631452,
278
+ "phase": 0.2359058087635221,
279
+ "planner_ranking": 7.647072767337125e-06,
280
+ "planner_risk": 0.0028961390207493396,
281
+ "planner_success": 0.00021185575601658925,
282
+ "proposal_diversity": 0.0,
283
+ "proposal_ranking": 0.06654375551398985,
284
+ "proposal_reconstruction": 0.07095728814601898,
285
+ "proposal_success": 0.02652511727347067,
286
+ "reocclusion": 0.28424677580143465,
287
+ "role_swap_consistency": 0.0,
288
+ "support_mode": 5.328719337491996e-05,
289
+ "support_stability": 0.13284552616603446,
290
+ "task_metrics": 0.1393586558600267,
291
+ "total": 0.5038685834769047,
292
+ "uncertainty": 1.3334046157483085e-05,
293
+ "visibility": 0.08824686581889789,
294
+ "world_model": 0.8240700237678759
295
+ }
296
+ }
297
+ ]
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/summary.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17",
3
+ "device": "cuda",
4
+ "best_checkpoint": "/workspace/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt",
5
+ "final_train_total": 0.5180026358679721,
6
+ "final_val_total": 0.5038685834769047,
7
+ "train_time_sec": 163.31340551376343,
8
+ "peak_gpu_memory_mb": 2924.82177734375,
9
+ "num_train_samples": 380,
10
+ "num_val_samples": 131,
11
+ "planner_mode": "trainable",
12
+ "frozen_modules": [],
13
+ "init_info": {
14
+ "path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
15
+ "loaded_keys": 828,
16
+ "skipped_shape_mismatch_keys": [
17
+ "decoder.proposal_mode_head.3.weight",
18
+ "decoder.proposal_mode_head.3.bias",
19
+ "decoder.proposal_mode_embeddings.weight"
20
+ ],
21
+ "missing_keys": [
22
+ "decoder.task_embedding.weight",
23
+ "decoder.proposal_mode_head.3.weight",
24
+ "decoder.proposal_mode_head.3.bias",
25
+ "decoder.proposal_mode_embeddings.weight",
26
+ "decoder.mode_residual_heads.6.0.weight",
27
+ "decoder.mode_residual_heads.6.0.bias",
28
+ "decoder.mode_residual_heads.6.1.weight",
29
+ "decoder.mode_residual_heads.6.1.bias",
30
+ "decoder.mode_residual_heads.6.3.weight",
31
+ "decoder.mode_residual_heads.6.3.bias",
32
+ "elastic_state_head.decoder.task_embedding.weight",
33
+ "elastic_state_head.decoder.task_field_affine.weight",
34
+ "elastic_state_head.decoder.task_field_affine.bias",
35
+ "elastic_state_head.decoder.task_summary_adapter.0.weight",
36
+ "elastic_state_head.decoder.task_summary_adapter.0.bias",
37
+ "elastic_state_head.decoder.task_summary_adapter.1.weight",
38
+ "elastic_state_head.decoder.task_summary_adapter.1.bias",
39
+ "elastic_state_head.decoder.task_phase_head.weight",
40
+ "elastic_state_head.decoder.task_phase_head.bias",
41
+ "elastic_state_head.decoder.task_support_head.weight",
42
+ "elastic_state_head.decoder.task_support_head.bias",
43
+ "elastic_state_head.decoder.task_reocclusion_head.weight",
44
+ "elastic_state_head.decoder.task_reocclusion_head.bias",
45
+ "elastic_state_head.decoder.task_metric_head.0.weight",
46
+ "elastic_state_head.decoder.task_metric_head.0.bias",
47
+ "elastic_state_head.decoder.task_metric_head.1.weight",
48
+ "elastic_state_head.decoder.task_metric_head.1.bias",
49
+ "elastic_state_head.decoder.task_metric_head.3.weight",
50
+ "elastic_state_head.decoder.task_metric_head.3.bias",
51
+ "world_model.task_embedding.weight",
52
+ "world_model.spatial_field_encoder.0.weight",
53
+ "world_model.spatial_field_encoder.0.bias",
54
+ "world_model.spatial_field_encoder.2.weight",
55
+ "world_model.spatial_field_encoder.2.bias",
56
+ "world_model.spatial_context_proj.0.weight",
57
+ "world_model.spatial_context_proj.0.bias",
58
+ "world_model.spatial_context_proj.1.weight",
59
+ "world_model.spatial_context_proj.1.bias",
60
+ "world_model.spatial_gate_z.weight",
61
+ "world_model.spatial_gate_z.bias",
62
+ "world_model.spatial_gate_r.weight",
63
+ "world_model.spatial_gate_r.bias",
64
+ "world_model.spatial_candidate.weight",
65
+ "world_model.spatial_candidate.bias",
66
+ "world_model.spatial_summary_proj.0.weight",
67
+ "world_model.spatial_summary_proj.0.bias",
68
+ "world_model.spatial_summary_proj.1.weight",
69
+ "world_model.spatial_summary_proj.1.bias",
70
+ "world_model.spatial_phase_head.weight",
71
+ "world_model.spatial_phase_head.bias",
72
+ "world_model.spatial_support_mode_head.weight",
73
+ "world_model.spatial_support_mode_head.bias",
74
+ "world_model.spatial_arm_role_head.weight",
75
+ "world_model.spatial_arm_role_head.bias",
76
+ "world_model.spatial_reocclusion_head.weight",
77
+ "world_model.spatial_reocclusion_head.bias",
78
+ "world_model.spatial_target_belief_head.weight",
79
+ "world_model.spatial_target_belief_head.bias",
80
+ "world_model.spatial_visibility_head.weight",
81
+ "world_model.spatial_visibility_head.bias",
82
+ "world_model.spatial_clearance_head.weight",
83
+ "world_model.spatial_clearance_head.bias",
84
+ "world_model.spatial_occluder_contact_head.weight",
85
+ "world_model.spatial_occluder_contact_head.bias",
86
+ "world_model.spatial_grasp_affordance_head.weight",
87
+ "world_model.spatial_grasp_affordance_head.bias",
88
+ "world_model.spatial_support_stability_head.weight",
89
+ "world_model.spatial_support_stability_head.bias",
90
+ "world_model.spatial_persistence_head.weight",
91
+ "world_model.spatial_persistence_head.bias",
92
+ "world_model.spatial_reocclusion_field_head.weight",
93
+ "world_model.spatial_reocclusion_field_head.bias",
94
+ "world_model.spatial_disturbance_head.weight",
95
+ "world_model.spatial_disturbance_head.bias",
96
+ "world_model.spatial_uncertainty_head.weight",
97
+ "world_model.spatial_uncertainty_head.bias",
98
+ "world_model.spatial_access_head.weight",
99
+ "world_model.spatial_access_head.bias"
100
+ ],
101
+ "unexpected_keys": []
102
+ }
103
+ }
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3968c7aaeace3aeeb9ba2a6343ab2b35b792acbd00911d5eb76d90cd3db80a1c
3
+ size 940662478
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/config_resolved.yaml ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff_phase
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies:
9
+ - foliage_proxy
10
+ - bag_proxy
11
+ - cloth_proxy
12
+ resolution: 224
13
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
14
+ train_episodes_per_proxy: 48
15
+ val_episodes_per_proxy: 16
16
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
17
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
18
+ rebuild_dataset: false
19
+ chunk_horizon: 8
20
+ rollout_horizon: 5
21
+ history_steps: 6
22
+ planner_candidates: 8
23
+ seed: 17
24
+ optim:
25
+ epochs: 3
26
+ batch_size: 4
27
+ num_workers: 24
28
+ lr: 0.0001
29
+ weight_decay: 0.0001
30
+ trainer:
31
+ policy_type: elastic_reveal
32
+ use_bf16: true
33
+ grad_clip_norm: 1.0
34
+ freeze_backbone: true
35
+ gradient_checkpointing: false
36
+ plan_during_train: true
37
+ plan_during_eval: true
38
+ support_mode_conditioning: true
39
+ planner_mode: trainable
40
+ use_depth: true
41
+ use_world_model: true
42
+ use_role_tokens: true
43
+ compute_equivariance_probe: false
44
+ policy:
45
+ backbone:
46
+ model_name: openai/clip-vit-base-patch32
47
+ hidden_dim: 512
48
+ max_text_tokens: 32
49
+ freeze_backbone: true
50
+ gradient_checkpointing: false
51
+ use_dummy_backbone: false
52
+ fusion:
53
+ hidden_dim: 512
54
+ num_cameras: 3
55
+ num_layers: 4
56
+ num_heads: 8
57
+ ff_dim: 2048
58
+ dropout: 0.1
59
+ proprio_dim: 32
60
+ proprio_tokens: 1
61
+ memory:
62
+ hidden_dim: 512
63
+ action_dim: 14
64
+ history_steps: 6
65
+ scene_history_steps: 3
66
+ belief_history_steps: 8
67
+ num_layers: 2
68
+ dropout: 0.1
69
+ memory_bank_size: 4
70
+ scene_bank_size: 2
71
+ belief_bank_size: 2
72
+ num_heads: 8
73
+ max_history_steps: 8
74
+ decoder:
75
+ hidden_dim: 512
76
+ num_heads: 8
77
+ num_layers: 4
78
+ ff_dim: 2048
79
+ dropout: 0.1
80
+ chunk_size: 8
81
+ action_dim: 14
82
+ arm_action_dim: 7
83
+ num_candidates: 8
84
+ num_phases: 5
85
+ num_arm_roles: 4
86
+ num_proposal_modes: 7
87
+ planner_top_k: 4
88
+ reveal_head:
89
+ hidden_dim: 512
90
+ num_support_modes: 3
91
+ num_approach_templates: 32
92
+ rollout_horizon: 5
93
+ belief_map_size: 32
94
+ field_size: 16
95
+ num_heads: 8
96
+ predict_belief_map: true
97
+ num_phases: 5
98
+ num_arm_roles: 4
99
+ num_interaction_tokens: 8
100
+ num_tasks: 4
101
+ world_model:
102
+ hidden_dim: 512
103
+ action_dim: 14
104
+ num_support_modes: 3
105
+ num_approach_templates: 32
106
+ rollout_horizon: 5
107
+ field_size: 16
108
+ num_heads: 8
109
+ num_phases: 5
110
+ num_arm_roles: 4
111
+ num_interaction_tokens: 8
112
+ belief_map_size: 32
113
+ predict_belief_map: true
114
+ scene_bank_size: 2
115
+ belief_bank_size: 2
116
+ rollout_mode: compact_rollout
117
+ num_tasks: 4
118
+ planner:
119
+ hidden_dim: 512
120
+ num_candidates: 8
121
+ action_dim: 14
122
+ num_support_modes: 3
123
+ utility_margin: 0.1
124
+ num_heads: 8
125
+ num_layers: 2
126
+ num_phases: 5
127
+ num_arm_roles: 4
128
+ top_k: 4
129
+ loss_weights:
130
+ action: 1.0
131
+ phase: 0.08
132
+ arm_role: 0.1
133
+ support_mode: 0.1
134
+ corridor: 0.12
135
+ persistence: 0.06
136
+ disturbance: 0.06
137
+ world_model: 0.2
138
+ belief: 0.05
139
+ visibility: 0.05
140
+ clearance: 0.06
141
+ support_stability: 0.06
142
+ reocclusion: 0.06
143
+ occluder_contact: 0.05
144
+ grasp_affordance: 0.05
145
+ planner_success: 0.2
146
+ planner_risk: 0.08
147
+ planner_ranking: 0.2
148
+ proposal_reconstruction: 0.08
149
+ proposal_success: 0.12
150
+ proposal_ranking: 0.15
151
+ proposal_diversity: 0.05
152
+ role_swap_consistency: 0.02
153
+ task_metrics: 0.05
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/metrics.json ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "epoch": 0,
4
+ "train": {
5
+ "action": 0.01903669923838032,
6
+ "arm_role": 0.0011954489507173237,
7
+ "belief": 0.11834000094156516,
8
+ "clearance": 0.07986288266746622,
9
+ "corridor": 0.2066003169864416,
10
+ "disturbance": 0.00205025365499559,
11
+ "grasp_affordance": 0.00966440967382177,
12
+ "occluder_contact": 0.2094391022857867,
13
+ "persistence": 0.6896675200305732,
14
+ "phase": 0.3597658646734137,
15
+ "planner_ranking": 0.010135862800770586,
16
+ "planner_risk": 0.02269953653020294,
17
+ "planner_success": 0.10877308378878393,
18
+ "proposal_diversity": 0.0,
19
+ "proposal_ranking": 0.3664713734858914,
20
+ "proposal_reconstruction": 0.10571438976024326,
21
+ "proposal_success": 0.3060418816773515,
22
+ "reocclusion": 0.23476032413738337,
23
+ "role_swap_consistency": 0.0,
24
+ "support_mode": 0.02608335896542198,
25
+ "support_stability": 0.1283740213434947,
26
+ "task_metrics": 0.16838834607287456,
27
+ "total": 0.7838476789625067,
28
+ "uncertainty": 9.599007572480907e-05,
29
+ "visibility": 0.11086608359688206,
30
+ "world_model": 2.4192019870406702
31
+ },
32
+ "val": {
33
+ "action": 0.016467795622619717,
34
+ "arm_role": 0.00044356281741232976,
35
+ "belief": 0.10317085593035727,
36
+ "clearance": 0.07557447807806911,
37
+ "corridor": 0.20408162474632263,
38
+ "disturbance": 0.0014927912058986046,
39
+ "grasp_affordance": 0.00985990883782506,
40
+ "occluder_contact": 0.20460259372537787,
41
+ "persistence": 0.9340643006179369,
42
+ "phase": 0.4205810320422505,
43
+ "planner_ranking": 0.0002936208506016004,
44
+ "planner_risk": 0.0162254377748027,
45
+ "planner_success": 0.013315262752726223,
46
+ "proposal_diversity": 0.0,
47
+ "proposal_ranking": 0.14499353143301877,
48
+ "proposal_reconstruction": 0.08181179179386659,
49
+ "proposal_success": 0.13043119690634988,
50
+ "reocclusion": 0.2904955812475898,
51
+ "role_swap_consistency": 0.0,
52
+ "support_mode": 0.0027611398766043058,
53
+ "support_stability": 0.13547231697223402,
54
+ "task_metrics": 0.1477968403787324,
55
+ "total": 0.6003618971867994,
56
+ "uncertainty": 0.00011726474721482109,
57
+ "visibility": 0.09890205435680621,
58
+ "world_model": 1.8151403015310115
59
+ }
60
+ },
61
+ {
62
+ "epoch": 1,
63
+ "train": {
64
+ "action": 0.011172730509976024,
65
+ "arm_role": 0.005047608049292314,
66
+ "belief": 0.10358183007491262,
67
+ "clearance": 0.07736518386947482,
68
+ "corridor": 0.19873161581589988,
69
+ "disturbance": 0.002192211224188979,
70
+ "grasp_affordance": 0.010328439559395376,
71
+ "occluder_contact": 0.20153354829863498,
72
+ "persistence": 0.6124827290352091,
73
+ "phase": 0.2595963217710194,
74
+ "planner_ranking": 0.00016108446396796545,
75
+ "planner_risk": 0.01525886312960402,
76
+ "planner_success": 0.006821451556721801,
77
+ "proposal_diversity": 0.0,
78
+ "proposal_ranking": 0.08119581977983839,
79
+ "proposal_reconstruction": 0.07775982775186238,
80
+ "proposal_success": 0.08087819657827679,
81
+ "reocclusion": 0.21806120462715625,
82
+ "role_swap_consistency": 0.0,
83
+ "support_mode": 0.004165554046630859,
84
+ "support_stability": 0.12518173395410964,
85
+ "task_metrics": 0.1470285534074432,
86
+ "total": 0.4728757431632594,
87
+ "uncertainty": 4.4979283201859e-05,
88
+ "visibility": 0.09486380823348697,
89
+ "world_model": 1.477295964642575
90
+ },
91
+ "val": {
92
+ "action": 0.014699659296170328,
93
+ "arm_role": 0.0032501910410990768,
94
+ "belief": 0.09604058076034892,
95
+ "clearance": 0.0737346127403505,
96
+ "corridor": 0.19246741181070154,
97
+ "disturbance": 0.002424581844631272,
98
+ "grasp_affordance": 0.011332590499836388,
99
+ "occluder_contact": 0.1972112443410989,
100
+ "persistence": 0.754733283637148,
101
+ "phase": 0.27422264163606364,
102
+ "planner_ranking": 6.61312957218439e-05,
103
+ "planner_risk": 0.014809876634513565,
104
+ "planner_success": 0.005034577334299684,
105
+ "proposal_diversity": 0.0,
106
+ "proposal_ranking": 0.10355714928697456,
107
+ "proposal_reconstruction": 0.07605304298075763,
108
+ "proposal_success": 0.07239359052795352,
109
+ "reocclusion": 0.2211838181723248,
110
+ "role_swap_consistency": 0.0,
111
+ "support_mode": 0.0030512072068328657,
112
+ "support_stability": 0.13321079302466277,
113
+ "task_metrics": 0.14215884109338126,
114
+ "total": 0.42770718534787494,
115
+ "uncertainty": 4.0278254969604006e-05,
116
+ "visibility": 0.09008839387785304,
117
+ "world_model": 1.1843715039166538
118
+ }
119
+ },
120
+ {
121
+ "epoch": 2,
122
+ "train": {
123
+ "action": 0.009650313334637567,
124
+ "arm_role": 0.0029296059357492546,
125
+ "belief": 0.09819177420515764,
126
+ "clearance": 0.07597495360594046,
127
+ "corridor": 0.19736162976018692,
128
+ "disturbance": 0.002284994474719401,
129
+ "grasp_affordance": 0.011029037923895214,
130
+ "occluder_contact": 0.19601713622871197,
131
+ "persistence": 0.6342933682810613,
132
+ "phase": 0.2582773412528791,
133
+ "planner_ranking": 0.00024582118595916605,
134
+ "planner_risk": 0.011063184471506822,
135
+ "planner_success": 0.0019527532209299113,
136
+ "proposal_diversity": 0.0,
137
+ "proposal_ranking": 0.06367454680761224,
138
+ "proposal_reconstruction": 0.07267745521507765,
139
+ "proposal_success": 0.047251041174719205,
140
+ "reocclusion": 0.2153781190392022,
141
+ "role_swap_consistency": 0.0,
142
+ "support_mode": 0.0022934590515337494,
143
+ "support_stability": 0.12411213658357921,
144
+ "task_metrics": 0.14521450063115673,
145
+ "total": 0.3853033130106173,
146
+ "uncertainty": 2.1849837026509607e-05,
147
+ "visibility": 0.09172474040012611,
148
+ "world_model": 1.0909657170898035
149
+ },
150
+ "val": {
151
+ "action": 0.010615826838395813,
152
+ "arm_role": 0.002171783652474088,
153
+ "belief": 0.09422377767887982,
154
+ "clearance": 0.0732199704330979,
155
+ "corridor": 0.1925947627786434,
156
+ "disturbance": 0.001796035210700762,
157
+ "grasp_affordance": 0.010509083230951519,
158
+ "occluder_contact": 0.19385047753651938,
159
+ "persistence": 0.6774057000875473,
160
+ "phase": 0.35872898339188064,
161
+ "planner_ranking": 0.00021498550532586788,
162
+ "planner_risk": 0.009475005589510229,
163
+ "planner_success": 0.0013016004857931737,
164
+ "proposal_diversity": 0.0,
165
+ "proposal_ranking": 0.08124377544630658,
166
+ "proposal_reconstruction": 0.07418028581323045,
167
+ "proposal_success": 0.053315439402605545,
168
+ "reocclusion": 0.21817159472089825,
169
+ "role_swap_consistency": 0.0,
170
+ "support_mode": 0.0010789622478610413,
171
+ "support_stability": 0.13355377282608638,
172
+ "task_metrics": 0.14122987561153644,
173
+ "total": 0.3789277257341327,
174
+ "uncertainty": 3.114792006079259e-05,
175
+ "visibility": 0.08864631684440555,
176
+ "world_model": 0.9896242853366968
177
+ }
178
+ }
179
+ ]
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/summary.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17",
3
+ "device": "cuda",
4
+ "best_checkpoint": "/workspace/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt",
5
+ "final_train_total": 0.3853033130106173,
6
+ "final_val_total": 0.3789277257341327,
7
+ "train_time_sec": 128.96558284759521,
8
+ "peak_gpu_memory_mb": 2450.287109375,
9
+ "num_train_samples": 380,
10
+ "num_val_samples": 131,
11
+ "planner_mode": "trainable",
12
+ "frozen_modules": [],
13
+ "init_info": {
14
+ "path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
15
+ "loaded_keys": 828,
16
+ "skipped_shape_mismatch_keys": [
17
+ "decoder.proposal_mode_head.3.weight",
18
+ "decoder.proposal_mode_head.3.bias",
19
+ "decoder.proposal_mode_embeddings.weight"
20
+ ],
21
+ "missing_keys": [
22
+ "decoder.task_embedding.weight",
23
+ "decoder.proposal_mode_head.3.weight",
24
+ "decoder.proposal_mode_head.3.bias",
25
+ "decoder.proposal_mode_embeddings.weight",
26
+ "decoder.mode_residual_heads.6.0.weight",
27
+ "decoder.mode_residual_heads.6.0.bias",
28
+ "decoder.mode_residual_heads.6.1.weight",
29
+ "decoder.mode_residual_heads.6.1.bias",
30
+ "decoder.mode_residual_heads.6.3.weight",
31
+ "decoder.mode_residual_heads.6.3.bias",
32
+ "elastic_state_head.decoder.task_embedding.weight",
33
+ "elastic_state_head.decoder.task_field_affine.weight",
34
+ "elastic_state_head.decoder.task_field_affine.bias",
35
+ "elastic_state_head.decoder.task_summary_adapter.0.weight",
36
+ "elastic_state_head.decoder.task_summary_adapter.0.bias",
37
+ "elastic_state_head.decoder.task_summary_adapter.1.weight",
38
+ "elastic_state_head.decoder.task_summary_adapter.1.bias",
39
+ "elastic_state_head.decoder.task_phase_head.weight",
40
+ "elastic_state_head.decoder.task_phase_head.bias",
41
+ "elastic_state_head.decoder.task_support_head.weight",
42
+ "elastic_state_head.decoder.task_support_head.bias",
43
+ "elastic_state_head.decoder.task_reocclusion_head.weight",
44
+ "elastic_state_head.decoder.task_reocclusion_head.bias",
45
+ "elastic_state_head.decoder.task_metric_head.0.weight",
46
+ "elastic_state_head.decoder.task_metric_head.0.bias",
47
+ "elastic_state_head.decoder.task_metric_head.1.weight",
48
+ "elastic_state_head.decoder.task_metric_head.1.bias",
49
+ "elastic_state_head.decoder.task_metric_head.3.weight",
50
+ "elastic_state_head.decoder.task_metric_head.3.bias",
51
+ "world_model.task_embedding.weight",
52
+ "world_model.spatial_field_encoder.0.weight",
53
+ "world_model.spatial_field_encoder.0.bias",
54
+ "world_model.spatial_field_encoder.2.weight",
55
+ "world_model.spatial_field_encoder.2.bias",
56
+ "world_model.spatial_context_proj.0.weight",
57
+ "world_model.spatial_context_proj.0.bias",
58
+ "world_model.spatial_context_proj.1.weight",
59
+ "world_model.spatial_context_proj.1.bias",
60
+ "world_model.spatial_gate_z.weight",
61
+ "world_model.spatial_gate_z.bias",
62
+ "world_model.spatial_gate_r.weight",
63
+ "world_model.spatial_gate_r.bias",
64
+ "world_model.spatial_candidate.weight",
65
+ "world_model.spatial_candidate.bias",
66
+ "world_model.spatial_summary_proj.0.weight",
67
+ "world_model.spatial_summary_proj.0.bias",
68
+ "world_model.spatial_summary_proj.1.weight",
69
+ "world_model.spatial_summary_proj.1.bias",
70
+ "world_model.spatial_phase_head.weight",
71
+ "world_model.spatial_phase_head.bias",
72
+ "world_model.spatial_support_mode_head.weight",
73
+ "world_model.spatial_support_mode_head.bias",
74
+ "world_model.spatial_arm_role_head.weight",
75
+ "world_model.spatial_arm_role_head.bias",
76
+ "world_model.spatial_reocclusion_head.weight",
77
+ "world_model.spatial_reocclusion_head.bias",
78
+ "world_model.spatial_target_belief_head.weight",
79
+ "world_model.spatial_target_belief_head.bias",
80
+ "world_model.spatial_visibility_head.weight",
81
+ "world_model.spatial_visibility_head.bias",
82
+ "world_model.spatial_clearance_head.weight",
83
+ "world_model.spatial_clearance_head.bias",
84
+ "world_model.spatial_occluder_contact_head.weight",
85
+ "world_model.spatial_occluder_contact_head.bias",
86
+ "world_model.spatial_grasp_affordance_head.weight",
87
+ "world_model.spatial_grasp_affordance_head.bias",
88
+ "world_model.spatial_support_stability_head.weight",
89
+ "world_model.spatial_support_stability_head.bias",
90
+ "world_model.spatial_persistence_head.weight",
91
+ "world_model.spatial_persistence_head.bias",
92
+ "world_model.spatial_reocclusion_field_head.weight",
93
+ "world_model.spatial_reocclusion_field_head.bias",
94
+ "world_model.spatial_disturbance_head.weight",
95
+ "world_model.spatial_disturbance_head.bias",
96
+ "world_model.spatial_uncertainty_head.weight",
97
+ "world_model.spatial_uncertainty_head.bias",
98
+ "world_model.spatial_access_head.weight",
99
+ "world_model.spatial_access_head.bias"
100
+ ],
101
+ "unexpected_keys": []
102
+ }
103
+ }
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc40df2f2c241001bf2e8c17a177cfbcda82acef7ae90997d8e145357d901349
3
+ size 940663118
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/config_resolved.yaml ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff_phase
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies:
9
+ - foliage_proxy
10
+ - bag_proxy
11
+ - cloth_proxy
12
+ resolution: 224
13
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
14
+ train_episodes_per_proxy: 48
15
+ val_episodes_per_proxy: 16
16
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
17
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
18
+ rebuild_dataset: false
19
+ chunk_horizon: 8
20
+ rollout_horizon: 5
21
+ history_steps: 6
22
+ planner_candidates: 8
23
+ seed: 17
24
+ optim:
25
+ epochs: 4
26
+ batch_size: 4
27
+ num_workers: 24
28
+ lr: 0.00015
29
+ weight_decay: 0.0001
30
+ trainer:
31
+ policy_type: elastic_reveal
32
+ use_bf16: true
33
+ grad_clip_norm: 1.0
34
+ freeze_backbone: true
35
+ gradient_checkpointing: false
36
+ plan_during_train: true
37
+ plan_during_eval: true
38
+ support_mode_conditioning: true
39
+ planner_mode: trainable
40
+ use_depth: true
41
+ use_world_model: true
42
+ use_role_tokens: true
43
+ compute_equivariance_probe: false
44
+ policy:
45
+ backbone:
46
+ model_name: openai/clip-vit-base-patch32
47
+ hidden_dim: 512
48
+ max_text_tokens: 32
49
+ freeze_backbone: true
50
+ gradient_checkpointing: false
51
+ use_dummy_backbone: false
52
+ fusion:
53
+ hidden_dim: 512
54
+ num_cameras: 3
55
+ num_layers: 4
56
+ num_heads: 8
57
+ ff_dim: 2048
58
+ dropout: 0.1
59
+ proprio_dim: 32
60
+ proprio_tokens: 1
61
+ memory:
62
+ hidden_dim: 512
63
+ action_dim: 14
64
+ history_steps: 6
65
+ scene_history_steps: 3
66
+ belief_history_steps: 8
67
+ num_layers: 2
68
+ dropout: 0.1
69
+ memory_bank_size: 4
70
+ scene_bank_size: 2
71
+ belief_bank_size: 2
72
+ num_heads: 8
73
+ max_history_steps: 8
74
+ decoder:
75
+ hidden_dim: 512
76
+ num_heads: 8
77
+ num_layers: 4
78
+ ff_dim: 2048
79
+ dropout: 0.1
80
+ chunk_size: 8
81
+ action_dim: 14
82
+ arm_action_dim: 7
83
+ num_candidates: 8
84
+ num_phases: 5
85
+ num_arm_roles: 4
86
+ num_proposal_modes: 7
87
+ planner_top_k: 4
88
+ reveal_head:
89
+ hidden_dim: 512
90
+ num_support_modes: 3
91
+ num_approach_templates: 32
92
+ rollout_horizon: 5
93
+ belief_map_size: 32
94
+ field_size: 16
95
+ num_heads: 8
96
+ predict_belief_map: true
97
+ num_phases: 5
98
+ num_arm_roles: 4
99
+ num_interaction_tokens: 8
100
+ num_tasks: 4
101
+ world_model:
102
+ hidden_dim: 512
103
+ action_dim: 14
104
+ num_support_modes: 3
105
+ num_approach_templates: 32
106
+ rollout_horizon: 5
107
+ field_size: 16
108
+ num_heads: 8
109
+ num_phases: 5
110
+ num_arm_roles: 4
111
+ num_interaction_tokens: 8
112
+ belief_map_size: 32
113
+ predict_belief_map: true
114
+ scene_bank_size: 2
115
+ belief_bank_size: 2
116
+ rollout_mode: spatial_rollout
117
+ num_tasks: 4
118
+ planner:
119
+ hidden_dim: 512
120
+ num_candidates: 8
121
+ action_dim: 14
122
+ num_support_modes: 3
123
+ utility_margin: 0.1
124
+ num_heads: 8
125
+ num_layers: 2
126
+ num_phases: 5
127
+ num_arm_roles: 4
128
+ top_k: 4
129
+ loss_weights:
130
+ action: 0.6
131
+ phase: 0.08
132
+ arm_role: 0.1
133
+ support_mode: 0.1
134
+ corridor: 0.15
135
+ persistence: 0.08
136
+ disturbance: 0.08
137
+ world_model: 0.35
138
+ belief: 0.05
139
+ visibility: 0.05
140
+ clearance: 0.08
141
+ support_stability: 0.08
142
+ reocclusion: 0.08
143
+ occluder_contact: 0.05
144
+ grasp_affordance: 0.05
145
+ planner_success: 0.25
146
+ planner_risk: 0.1
147
+ planner_ranking: 0.25
148
+ proposal_reconstruction: 0.05
149
+ proposal_success: 0.2
150
+ proposal_ranking: 0.25
151
+ proposal_diversity: 0.05
152
+ role_swap_consistency: 0.02
153
+ task_metrics: 0.1
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/metrics.json ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "epoch": 0,
4
+ "train": {
5
+ "action": 0.019181225144941555,
6
+ "arm_role": 3.1182640477230673e-07,
7
+ "belief": 0.11448187259467024,
8
+ "clearance": 0.0864047217133798,
9
+ "corridor": 0.21893725262856797,
10
+ "disturbance": 0.003079198535813607,
11
+ "grasp_affordance": 0.02621712978978298,
12
+ "occluder_contact": 0.2337783462122867,
13
+ "persistence": 0.719700569705425,
14
+ "phase": 0.3856928740677081,
15
+ "planner_ranking": 0.022916802166031102,
16
+ "planner_risk": 0.023334504154167678,
17
+ "planner_success": 0.10860285238411865,
18
+ "proposal_diversity": 0.0,
19
+ "proposal_ranking": 0.2922646708394352,
20
+ "proposal_reconstruction": 0.09971292505138799,
21
+ "proposal_success": 0.24467597556741613,
22
+ "reocclusion": 0.24054438805109576,
23
+ "role_swap_consistency": 0.0,
24
+ "support_mode": 0.03278662405515972,
25
+ "support_stability": 0.13613393926306774,
26
+ "task_metrics": 0.15838541047353494,
27
+ "total": 1.1187869517426743,
28
+ "uncertainty": 0.00028297689169975407,
29
+ "visibility": 0.11868352764531186,
30
+ "world_model": 2.122099159265819
31
+ },
32
+ "val": {
33
+ "action": 0.016976301219653,
34
+ "arm_role": 5.6895338003194025e-08,
35
+ "belief": 0.10437920215454968,
36
+ "clearance": 0.07773172505425685,
37
+ "corridor": 0.20573271156260461,
38
+ "disturbance": 0.002334372425979602,
39
+ "grasp_affordance": 0.04297696501298836,
40
+ "occluder_contact": 0.23979515650055624,
41
+ "persistence": 0.7753512069131389,
42
+ "phase": 0.44806069852502056,
43
+ "planner_ranking": 0.0006287269447208624,
44
+ "planner_risk": 0.018244338094849478,
45
+ "planner_success": 0.004400868512069185,
46
+ "proposal_diversity": 0.0,
47
+ "proposal_ranking": 0.12023697172602017,
48
+ "proposal_reconstruction": 0.07816453142599626,
49
+ "proposal_success": 0.08989288398262227,
50
+ "reocclusion": 0.23809522954803525,
51
+ "role_swap_consistency": 0.0,
52
+ "support_mode": 0.002169632747995131,
53
+ "support_stability": 0.1437560645016757,
54
+ "task_metrics": 0.14553586303284674,
55
+ "total": 0.6175428755355604,
56
+ "uncertainty": 7.00113979071444e-05,
57
+ "visibility": 0.09822999302184943,
58
+ "world_model": 0.9931504021991383
59
+ }
60
+ },
61
+ {
62
+ "epoch": 1,
63
+ "train": {
64
+ "action": 0.011922537655520597,
65
+ "arm_role": 3.544907820852179e-08,
66
+ "belief": 0.10706032749853636,
67
+ "clearance": 0.08102132846650324,
68
+ "corridor": 0.20774916157518564,
69
+ "disturbance": 0.003389558960426305,
70
+ "grasp_affordance": 0.021082493273149195,
71
+ "occluder_contact": 0.22566462457180023,
72
+ "persistence": 0.6525513453439712,
73
+ "phase": 0.2908584324937118,
74
+ "planner_ranking": 8.167305853045323e-05,
75
+ "planner_risk": 0.01491448436741178,
76
+ "planner_success": 0.0019715585316972513,
77
+ "proposal_diversity": 0.0,
78
+ "proposal_ranking": 0.06992289091607458,
79
+ "proposal_reconstruction": 0.07328974894787135,
80
+ "proposal_success": 0.05622971397089331,
81
+ "reocclusion": 0.2235310155016027,
82
+ "role_swap_consistency": 0.0,
83
+ "support_mode": 0.0016054740077570866,
84
+ "support_stability": 0.13015181959459657,
85
+ "task_metrics": 0.1444544498857699,
86
+ "total": 0.6000718728492135,
87
+ "uncertainty": 5.28337070471408e-05,
88
+ "visibility": 0.09962119711072821,
89
+ "world_model": 1.0844623948398389
90
+ },
91
+ "val": {
92
+ "action": 0.012654555594605026,
93
+ "arm_role": 0.0,
94
+ "belief": 0.10426627686529448,
95
+ "clearance": 0.08140931176868352,
96
+ "corridor": 0.21032546257430856,
97
+ "disturbance": 0.004191758795445132,
98
+ "grasp_affordance": 0.053641817603034506,
99
+ "occluder_contact": 0.23278299877137848,
100
+ "persistence": 1.2810496254400774,
101
+ "phase": 0.3091735607295325,
102
+ "planner_ranking": 0.00015768451346571743,
103
+ "planner_risk": 0.00882755185364548,
104
+ "planner_success": 0.0008127102125647732,
105
+ "proposal_diversity": 0.0,
106
+ "proposal_ranking": 0.1040246604490235,
107
+ "proposal_reconstruction": 0.074961857362227,
108
+ "proposal_success": 0.05817745603395231,
109
+ "reocclusion": 0.32169581723935675,
110
+ "role_swap_consistency": 0.0,
111
+ "support_mode": 0.0020671126813712444,
112
+ "support_stability": 0.13979825738704565,
113
+ "task_metrics": 0.14051661414630484,
114
+ "total": 0.6931917875102072,
115
+ "uncertainty": 7.29499135531748e-05,
116
+ "visibility": 0.09667146341367201,
117
+ "world_model": 1.1483498287923408
118
+ }
119
+ },
120
+ {
121
+ "epoch": 2,
122
+ "train": {
123
+ "action": 0.009906618949025869,
124
+ "arm_role": 1.2548346268503289e-08,
125
+ "belief": 0.10543899598874544,
126
+ "clearance": 0.08008571463195902,
127
+ "corridor": 0.20356425615517718,
128
+ "disturbance": 0.003584925674122611,
129
+ "grasp_affordance": 0.02068159209662362,
130
+ "occluder_contact": 0.21795249863674765,
131
+ "persistence": 0.7022376383202515,
132
+ "phase": 0.2805413355952815,
133
+ "planner_ranking": 5.863887835115939e-05,
134
+ "planner_risk": 0.007802685193325344,
135
+ "planner_success": 0.0006161007331684232,
136
+ "proposal_diversity": 0.0,
137
+ "proposal_ranking": 0.059995323853371176,
138
+ "proposal_reconstruction": 0.0712106538446326,
139
+ "proposal_success": 0.03740891177011164,
140
+ "reocclusion": 0.23808821498992314,
141
+ "role_swap_consistency": 0.0,
142
+ "support_mode": 0.0014756444253419575,
143
+ "support_stability": 0.1280404823783197,
144
+ "task_metrics": 0.14326360319790088,
145
+ "total": 0.5650381593327773,
146
+ "uncertainty": 4.478374925335964e-05,
147
+ "visibility": 0.09581161878610912,
148
+ "world_model": 1.0014131157021773
149
+ },
150
+ "val": {
151
+ "action": 0.011506305104403786,
152
+ "arm_role": 0.0,
153
+ "belief": 0.10174731707031076,
154
+ "clearance": 0.07599064165895636,
155
+ "corridor": 0.19603406366976825,
156
+ "disturbance": 0.0034094253584925987,
157
+ "grasp_affordance": 0.01412029577080499,
158
+ "occluder_contact": 0.21309178299976117,
159
+ "persistence": 0.7315149356921514,
160
+ "phase": 0.2930864508433098,
161
+ "planner_ranking": 0.0013956731370778168,
162
+ "planner_risk": 0.0041357744308753,
163
+ "planner_success": 0.0003894640280067864,
164
+ "proposal_diversity": 0.0,
165
+ "proposal_ranking": 0.08693639387263719,
166
+ "proposal_reconstruction": 0.07113255470088034,
167
+ "proposal_success": 0.05217952732786988,
168
+ "reocclusion": 0.2135303650390018,
169
+ "role_swap_consistency": 0.0,
170
+ "support_mode": 0.0002804717289770699,
171
+ "support_stability": 0.13648639472596574,
172
+ "task_metrics": 0.13825015418908812,
173
+ "total": 0.621522765267979,
174
+ "uncertainty": 5.1016551851441335e-05,
175
+ "visibility": 0.0916972327412981,
176
+ "world_model": 1.1354843739307288
177
+ }
178
+ },
179
+ {
180
+ "epoch": 3,
181
+ "train": {
182
+ "action": 0.009076371156659565,
183
+ "arm_role": 3.764503880550987e-09,
184
+ "belief": 0.10279968522096936,
185
+ "clearance": 0.07841498155735041,
186
+ "corridor": 0.2006541040775023,
187
+ "disturbance": 0.00325027588436282,
188
+ "grasp_affordance": 0.014321190530532284,
189
+ "occluder_contact": 0.2105231849770797,
190
+ "persistence": 0.6405418016016483,
191
+ "phase": 0.24933996639753642,
192
+ "planner_ranking": 8.54384282053831e-05,
193
+ "planner_risk": 0.004359905376393152,
194
+ "planner_success": 0.00026648731834843365,
195
+ "proposal_diversity": 0.0,
196
+ "proposal_ranking": 0.048603357446968164,
197
+ "proposal_reconstruction": 0.07084132981927771,
198
+ "proposal_success": 0.02799873605958725,
199
+ "reocclusion": 0.2113562585295815,
200
+ "role_swap_consistency": 0.0,
201
+ "support_mode": 0.0005042697253980135,
202
+ "support_stability": 0.1260564218226232,
203
+ "task_metrics": 0.14196890086719866,
204
+ "total": 0.5253663646547417,
205
+ "uncertainty": 2.7578330574023744e-05,
206
+ "visibility": 0.09414561410483561,
207
+ "world_model": 0.9370055976666902
208
+ },
209
+ "val": {
210
+ "action": 0.01043125390159813,
211
+ "arm_role": 0.0,
212
+ "belief": 0.09857180962959926,
213
+ "clearance": 0.07481752595666682,
214
+ "corridor": 0.19474056340528256,
215
+ "disturbance": 0.00304531856664019,
216
+ "grasp_affordance": 0.01476043919242467,
217
+ "occluder_contact": 0.20771399185513006,
218
+ "persistence": 0.6766496115561688,
219
+ "phase": 0.3030792711546052,
220
+ "planner_ranking": 6.782512608514011e-05,
221
+ "planner_risk": 0.002957202364500104,
222
+ "planner_success": 0.00018708011846297956,
223
+ "proposal_diversity": 0.0,
224
+ "proposal_ranking": 0.06918713425561278,
225
+ "proposal_reconstruction": 0.07121785075375528,
226
+ "proposal_success": 0.028793677348982204,
227
+ "reocclusion": 0.21873644242684046,
228
+ "role_swap_consistency": 0.0,
229
+ "support_mode": 0.0007097468169928161,
230
+ "support_stability": 0.13461551465319865,
231
+ "task_metrics": 0.13820900722886575,
232
+ "total": 0.5076249196673884,
233
+ "uncertainty": 2.9063129700675333e-05,
234
+ "visibility": 0.09054145137920525,
235
+ "world_model": 0.850949106794415
236
+ }
237
+ }
238
+ ]
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/summary.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17",
3
+ "device": "cuda",
4
+ "best_checkpoint": "/workspace/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt",
5
+ "final_train_total": 0.5253663646547417,
6
+ "final_val_total": 0.5076249196673884,
7
+ "train_time_sec": 154.84144067764282,
8
+ "peak_gpu_memory_mb": 2926.07470703125,
9
+ "num_train_samples": 380,
10
+ "num_val_samples": 131,
11
+ "planner_mode": "trainable",
12
+ "frozen_modules": [],
13
+ "init_info": {
14
+ "path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
15
+ "loaded_keys": 828,
16
+ "skipped_shape_mismatch_keys": [
17
+ "decoder.proposal_mode_head.3.weight",
18
+ "decoder.proposal_mode_head.3.bias",
19
+ "decoder.proposal_mode_embeddings.weight"
20
+ ],
21
+ "missing_keys": [
22
+ "decoder.task_embedding.weight",
23
+ "decoder.proposal_mode_head.3.weight",
24
+ "decoder.proposal_mode_head.3.bias",
25
+ "decoder.proposal_mode_embeddings.weight",
26
+ "decoder.mode_residual_heads.6.0.weight",
27
+ "decoder.mode_residual_heads.6.0.bias",
28
+ "decoder.mode_residual_heads.6.1.weight",
29
+ "decoder.mode_residual_heads.6.1.bias",
30
+ "decoder.mode_residual_heads.6.3.weight",
31
+ "decoder.mode_residual_heads.6.3.bias",
32
+ "elastic_state_head.decoder.task_embedding.weight",
33
+ "elastic_state_head.decoder.task_field_affine.weight",
34
+ "elastic_state_head.decoder.task_field_affine.bias",
35
+ "elastic_state_head.decoder.task_summary_adapter.0.weight",
36
+ "elastic_state_head.decoder.task_summary_adapter.0.bias",
37
+ "elastic_state_head.decoder.task_summary_adapter.1.weight",
38
+ "elastic_state_head.decoder.task_summary_adapter.1.bias",
39
+ "elastic_state_head.decoder.task_phase_head.weight",
40
+ "elastic_state_head.decoder.task_phase_head.bias",
41
+ "elastic_state_head.decoder.task_support_head.weight",
42
+ "elastic_state_head.decoder.task_support_head.bias",
43
+ "elastic_state_head.decoder.task_reocclusion_head.weight",
44
+ "elastic_state_head.decoder.task_reocclusion_head.bias",
45
+ "elastic_state_head.decoder.task_metric_head.0.weight",
46
+ "elastic_state_head.decoder.task_metric_head.0.bias",
47
+ "elastic_state_head.decoder.task_metric_head.1.weight",
48
+ "elastic_state_head.decoder.task_metric_head.1.bias",
49
+ "elastic_state_head.decoder.task_metric_head.3.weight",
50
+ "elastic_state_head.decoder.task_metric_head.3.bias",
51
+ "world_model.task_embedding.weight",
52
+ "world_model.spatial_field_encoder.0.weight",
53
+ "world_model.spatial_field_encoder.0.bias",
54
+ "world_model.spatial_field_encoder.2.weight",
55
+ "world_model.spatial_field_encoder.2.bias",
56
+ "world_model.spatial_context_proj.0.weight",
57
+ "world_model.spatial_context_proj.0.bias",
58
+ "world_model.spatial_context_proj.1.weight",
59
+ "world_model.spatial_context_proj.1.bias",
60
+ "world_model.spatial_gate_z.weight",
61
+ "world_model.spatial_gate_z.bias",
62
+ "world_model.spatial_gate_r.weight",
63
+ "world_model.spatial_gate_r.bias",
64
+ "world_model.spatial_candidate.weight",
65
+ "world_model.spatial_candidate.bias",
66
+ "world_model.spatial_summary_proj.0.weight",
67
+ "world_model.spatial_summary_proj.0.bias",
68
+ "world_model.spatial_summary_proj.1.weight",
69
+ "world_model.spatial_summary_proj.1.bias",
70
+ "world_model.spatial_phase_head.weight",
71
+ "world_model.spatial_phase_head.bias",
72
+ "world_model.spatial_support_mode_head.weight",
73
+ "world_model.spatial_support_mode_head.bias",
74
+ "world_model.spatial_arm_role_head.weight",
75
+ "world_model.spatial_arm_role_head.bias",
76
+ "world_model.spatial_reocclusion_head.weight",
77
+ "world_model.spatial_reocclusion_head.bias",
78
+ "world_model.spatial_target_belief_head.weight",
79
+ "world_model.spatial_target_belief_head.bias",
80
+ "world_model.spatial_visibility_head.weight",
81
+ "world_model.spatial_visibility_head.bias",
82
+ "world_model.spatial_clearance_head.weight",
83
+ "world_model.spatial_clearance_head.bias",
84
+ "world_model.spatial_occluder_contact_head.weight",
85
+ "world_model.spatial_occluder_contact_head.bias",
86
+ "world_model.spatial_grasp_affordance_head.weight",
87
+ "world_model.spatial_grasp_affordance_head.bias",
88
+ "world_model.spatial_support_stability_head.weight",
89
+ "world_model.spatial_support_stability_head.bias",
90
+ "world_model.spatial_persistence_head.weight",
91
+ "world_model.spatial_persistence_head.bias",
92
+ "world_model.spatial_reocclusion_field_head.weight",
93
+ "world_model.spatial_reocclusion_field_head.bias",
94
+ "world_model.spatial_disturbance_head.weight",
95
+ "world_model.spatial_disturbance_head.bias",
96
+ "world_model.spatial_uncertainty_head.weight",
97
+ "world_model.spatial_uncertainty_head.bias",
98
+ "world_model.spatial_access_head.weight",
99
+ "world_model.spatial_access_head.bias"
100
+ ],
101
+ "unexpected_keys": []
102
+ }
103
+ }
code/reveal_vla_bimanual/eval/ablations.py CHANGED
@@ -1,7 +1,9 @@
1
  MANDATORY_ABLATIONS: tuple[str, ...] = (
2
- "no_interaction_head",
3
- "no_world_model",
 
4
  "no_planner",
5
- "no_role_tokens",
6
- "short_history",
 
7
  )
 
1
# Ablation labels that every evaluation sweep must run in addition to the full
# model; consumed by eval/run_ablations.py when building the ablation grid.
MANDATORY_ABLATIONS: tuple[str, ...] = (
    "no_geometry",
    "no_spatial_memory",
    "compact_world_model",
    "no_planner",
    "gaussian_candidates_only",
    "no_task_head",
    "no_support_mode_conditioning",
)
code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+
9
+ def _load_summary(path: Path) -> dict[str, Any]:
10
+ payload = json.loads(path.read_text(encoding="utf-8"))
11
+ task_scores = {
12
+ task_name: float(task_payload.get("mean_success", 0.0))
13
+ for task_name, task_payload in payload.get("tasks", {}).items()
14
+ }
15
+ task_returns = {
16
+ task_name: float(task_payload.get("mean_return", 0.0))
17
+ for task_name, task_payload in payload.get("tasks", {}).items()
18
+ }
19
+ task_path_recoveries = {
20
+ task_name: float(sum(task_payload.get("path_recoveries", [])) / max(1, len(task_payload.get("path_recoveries", []))))
21
+ for task_name, task_payload in payload.get("tasks", {}).items()
22
+ }
23
+ task_noop_fallbacks = {
24
+ task_name: float(sum(task_payload.get("noop_fallbacks", [])) / max(1, len(task_payload.get("noop_fallbacks", []))))
25
+ for task_name, task_payload in payload.get("tasks", {}).items()
26
+ }
27
+ return {
28
+ "path": str(path),
29
+ "checkpoint": payload.get("checkpoint"),
30
+ "mean_success": float(payload.get("mean_success", 0.0)),
31
+ "mean_return": float(sum(task_returns.values()) / max(1, len(task_returns))),
32
+ "mean_path_recoveries": float(sum(task_path_recoveries.values()) / max(1, len(task_path_recoveries))),
33
+ "mean_noop_fallbacks": float(sum(task_noop_fallbacks.values()) / max(1, len(task_noop_fallbacks))),
34
+ "plan_requested": bool(payload.get("plan_requested", False)),
35
+ "plan_applied": bool(payload.get("plan_applied", False)),
36
+ "no_planner": bool(payload.get("no_planner", False)),
37
+ "no_geometry": bool(payload.get("no_geometry", False)),
38
+ "disable_task_conditioning": bool(payload.get("disable_task_conditioning", False)),
39
+ "compact_world_model": bool(payload.get("compact_world_model", False)),
40
+ "task_scores": task_scores,
41
+ "task_returns": task_returns,
42
+ "task_path_recoveries": task_path_recoveries,
43
+ "task_noop_fallbacks": task_noop_fallbacks,
44
+ "error_tasks": list(payload.get("error_tasks", [])),
45
+ }
46
+
47
+
48
+ def _pairwise_delta(reference: dict[str, Any], candidate: dict[str, Any]) -> dict[str, Any]:
49
+ shared_tasks = sorted(set(reference["task_scores"]) & set(candidate["task_scores"]))
50
+ if not shared_tasks:
51
+ return {
52
+ "shared_task_count": 0,
53
+ "mean_success_delta": 0.0,
54
+ "mean_return_delta": 0.0,
55
+ "mean_path_recoveries_delta": 0.0,
56
+ "mean_noop_fallbacks_delta": 0.0,
57
+ "per_task_delta": {},
58
+ }
59
+ per_task_delta = {
60
+ task_name: float(candidate["task_scores"][task_name] - reference["task_scores"][task_name])
61
+ for task_name in shared_tasks
62
+ }
63
+ return {
64
+ "shared_task_count": len(shared_tasks),
65
+ "mean_success_delta": float(candidate["mean_success"] - reference["mean_success"]),
66
+ "mean_return_delta": float(candidate["mean_return"] - reference["mean_return"]),
67
+ "mean_path_recoveries_delta": float(candidate["mean_path_recoveries"] - reference["mean_path_recoveries"]),
68
+ "mean_noop_fallbacks_delta": float(candidate["mean_noop_fallbacks"] - reference["mean_noop_fallbacks"]),
69
+ "per_task_delta": per_task_delta,
70
+ }
71
+
72
+
73
+ def _markdown_lines(reference_label: str, comparison: dict[str, Any]) -> list[str]:
74
+ lines = [
75
+ "# RLBench Sweep Comparison",
76
+ "",
77
+ f"- Reference: `{reference_label}`",
78
+ "",
79
+ "## Runs",
80
+ "",
81
+ ]
82
+ for label, payload in comparison["runs"].items():
83
+ lines.append(
84
+ f"- `{label}`: mean_success={payload['mean_success']:.3f}, "
85
+ f"mean_return={payload['mean_return']:.3f}, "
86
+ f"mean_path_recoveries={payload['mean_path_recoveries']:.3f}, "
87
+ f"mean_noop_fallbacks={payload['mean_noop_fallbacks']:.3f}, "
88
+ f"plan_applied={payload['plan_applied']}, "
89
+ f"errors={len(payload['error_tasks'])}, "
90
+ f"path=`{payload['path']}`"
91
+ )
92
+ lines.extend(["", "## Pairwise Deltas", ""])
93
+ for label, payload in comparison["pairwise_against_reference"].items():
94
+ lines.append(
95
+ f"- `{label}`: mean_success_delta={payload['mean_success_delta']:.3f}, "
96
+ f"mean_return_delta={payload['mean_return_delta']:.3f}, "
97
+ f"mean_path_recoveries_delta={payload['mean_path_recoveries_delta']:.3f}, "
98
+ f"mean_noop_fallbacks_delta={payload['mean_noop_fallbacks_delta']:.3f}, "
99
+ f"shared_tasks={payload['shared_task_count']}"
100
+ )
101
+ return lines
102
+
103
+
104
def main() -> None:
    """CLI entry point: compare RLBench sweep summaries against a reference run.

    Writes ``rlbench_comparison.json`` and ``rlbench_comparison.md`` into the
    requested output directory.

    Raises:
        ValueError: if a ``--run`` entry is not ``label=path`` shaped, or the
            reference label is absent from the provided runs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run", action="append", required=True, help="label=/abs/path/to/rollout_eval.json")
    parser.add_argument("--reference-label", required=True)
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    runs: dict[str, dict[str, Any]] = {}
    for item in args.run:
        # Fail with an actionable message instead of the opaque unpacking
        # ValueError that `label, path = item.split("=", 1)` raises when the
        # separator is missing.
        label, separator, raw_path = item.partition("=")
        if not separator:
            raise ValueError(f"--run entry {item!r} must look like label=/abs/path/to/rollout_eval.json")
        runs[label] = _load_summary(Path(raw_path).resolve())

    if args.reference_label not in runs:
        raise ValueError(f"Missing reference label {args.reference_label!r} in provided runs.")

    reference = runs[args.reference_label]
    comparison = {
        "reference_label": args.reference_label,
        "runs": runs,
        "pairwise_against_reference": {
            label: _pairwise_delta(reference, payload)
            for label, payload in runs.items()
            if label != args.reference_label
        },
    }

    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "rlbench_comparison.json").write_text(
        json.dumps(comparison, indent=2),
        encoding="utf-8",
    )
    (output_dir / "rlbench_comparison.md").write_text(
        "\n".join(_markdown_lines(args.reference_label, comparison)) + "\n",
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()
code/reveal_vla_bimanual/eval/run_ablations.py CHANGED
@@ -7,7 +7,7 @@ import time
7
 
8
  from eval.ablations import MANDATORY_ABLATIONS
9
  from eval.report import write_comparison_report
10
- from eval.run_reveal_benchmark import evaluate_model, load_model
11
  from sim_reveal import available_proxy_names
12
 
13
  import torch
@@ -20,6 +20,7 @@ def main() -> None:
20
  parser.add_argument("--resolution", type=int, default=None)
21
  parser.add_argument("--output-root", default="/workspace/reports/reveal_ablation")
22
  parser.add_argument("--proxies", nargs="*", default=None)
 
23
  parser.add_argument("--resume", action="store_true")
24
  args = parser.parse_args()
25
 
@@ -29,17 +30,20 @@ def main() -> None:
29
  proxies = list(args.proxies or available_proxy_names())
30
  output_root = Path(args.output_root)
31
  output_root.mkdir(parents=True, exist_ok=True)
 
32
 
33
  json_path = output_root / "ablations.json"
34
  partial_path = output_root / "ablations.partial.json"
35
  sections = {}
36
  raw = {}
 
37
  completed_labels: set[str] = set()
38
  if args.resume and partial_path.exists():
39
  partial = json.loads(partial_path.read_text(encoding="utf-8"))
40
  raw = partial.get("raw", {})
41
  sections = partial.get("sections", {})
42
  completed_labels = set(raw)
 
43
  print(json.dumps({"resume_from": str(partial_path), "completed": sorted(completed_labels)}, indent=2))
44
 
45
  ablations = (None, *MANDATORY_ABLATIONS)
@@ -56,7 +60,9 @@ def main() -> None:
56
  episodes=args.episodes,
57
  resolution=resolution,
58
  ablation=ablation,
 
59
  )
 
60
  raw[label] = {
61
  "per_task_success": metrics.per_task_success,
62
  "mean_success": metrics.mean_success,
@@ -65,6 +71,7 @@ def main() -> None:
65
  "reocclusion_rate": metrics.reocclusion_rate,
66
  "persistence_horizon_mae": metrics.persistence_horizon_mae,
67
  "disturbance_cost": metrics.disturbance_cost,
 
68
  }
69
  sections[label] = {
70
  "mean_success": metrics.mean_success,
@@ -73,12 +80,21 @@ def main() -> None:
73
  "reocclusion_rate": metrics.reocclusion_rate or 0.0,
74
  "persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
75
  "disturbance_cost": metrics.disturbance_cost or 0.0,
 
76
  }
 
 
 
 
 
 
 
77
  partial_path.write_text(
78
  json.dumps(
79
  {
80
  "checkpoint": args.checkpoint,
81
  "episodes": args.episodes,
 
82
  "sections": sections,
83
  "raw": raw,
84
  "elapsed_seconds": time.monotonic() - start_time,
 
7
 
8
  from eval.ablations import MANDATORY_ABLATIONS
9
  from eval.report import write_comparison_report
10
+ from eval.run_reveal_benchmark import _paired_seed_summary, evaluate_model, load_model
11
  from sim_reveal import available_proxy_names
12
 
13
  import torch
 
20
  parser.add_argument("--resolution", type=int, default=None)
21
  parser.add_argument("--output-root", default="/workspace/reports/reveal_ablation")
22
  parser.add_argument("--proxies", nargs="*", default=None)
23
+ parser.add_argument("--chunk-commit-steps", type=int, default=0)
24
  parser.add_argument("--resume", action="store_true")
25
  args = parser.parse_args()
26
 
 
30
  proxies = list(args.proxies or available_proxy_names())
31
  output_root = Path(args.output_root)
32
  output_root.mkdir(parents=True, exist_ok=True)
33
+ chunk_commit_steps = None if args.chunk_commit_steps <= 0 else args.chunk_commit_steps
34
 
35
  json_path = output_root / "ablations.json"
36
  partial_path = output_root / "ablations.partial.json"
37
  sections = {}
38
  raw = {}
39
+ full_episode_records: list[dict[str, float | int | str]] | None = None
40
  completed_labels: set[str] = set()
41
  if args.resume and partial_path.exists():
42
  partial = json.loads(partial_path.read_text(encoding="utf-8"))
43
  raw = partial.get("raw", {})
44
  sections = partial.get("sections", {})
45
  completed_labels = set(raw)
46
+ full_episode_records = raw.get("full_model", {}).get("episode_records")
47
  print(json.dumps({"resume_from": str(partial_path), "completed": sorted(completed_labels)}, indent=2))
48
 
49
  ablations = (None, *MANDATORY_ABLATIONS)
 
60
  episodes=args.episodes,
61
  resolution=resolution,
62
  ablation=ablation,
63
+ chunk_commit_steps=chunk_commit_steps,
64
  )
65
+ metrics, episode_records = metrics
66
  raw[label] = {
67
  "per_task_success": metrics.per_task_success,
68
  "mean_success": metrics.mean_success,
 
71
  "reocclusion_rate": metrics.reocclusion_rate,
72
  "persistence_horizon_mae": metrics.persistence_horizon_mae,
73
  "disturbance_cost": metrics.disturbance_cost,
74
+ "episode_records": episode_records,
75
  }
76
  sections[label] = {
77
  "mean_success": metrics.mean_success,
 
80
  "reocclusion_rate": metrics.reocclusion_rate or 0.0,
81
  "persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
82
  "disturbance_cost": metrics.disturbance_cost or 0.0,
83
+ "chunk_commit_steps": float(0 if chunk_commit_steps is None else chunk_commit_steps),
84
  }
85
+ if label == "full_model":
86
+ full_episode_records = episode_records
87
+ elif full_episode_records is not None:
88
+ paired = _paired_seed_summary(full_episode_records, episode_records)
89
+ raw[label]["paired_seed_summary_vs_full_model"] = paired
90
+ for key, value in paired.items():
91
+ sections[label][f"paired_{key}_vs_full_model"] = value
92
  partial_path.write_text(
93
  json.dumps(
94
  {
95
  "checkpoint": args.checkpoint,
96
  "episodes": args.episodes,
97
+ "chunk_commit_steps": 0 if chunk_commit_steps is None else chunk_commit_steps,
98
  "sections": sections,
99
  "raw": raw,
100
  "elapsed_seconds": time.monotonic() - start_time,
code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py CHANGED
@@ -25,6 +25,10 @@ def _run_task(
25
  chunk_commit_steps: int,
26
  allow_unsupervised_planning: bool,
27
  disable_support_mode_conditioning: bool,
 
 
 
 
28
  ) -> dict[str, Any]:
29
  task_dir = output_dir / task_name
30
  task_dir.mkdir(parents=True, exist_ok=True)
@@ -53,10 +57,18 @@ def _run_task(
53
  command.append("--headless")
54
  if plan:
55
  command.append("--plan")
 
 
56
  if allow_unsupervised_planning:
57
  command.append("--allow-unsupervised-planning")
58
  if disable_support_mode_conditioning:
59
  command.append("--disable-support-mode-conditioning")
 
 
 
 
 
 
60
 
61
  completed = subprocess.run(
62
  command,
@@ -105,6 +117,10 @@ def _write_summary_markdown(path: Path, payload: dict[str, Any]) -> None:
105
  f"- Episodes per task: `{payload['episodes_per_task']}`",
106
  f"- Episode length: `{payload['episode_length']}`",
107
  f"- Resolution: `{payload['resolution']}`",
 
 
 
 
108
  f"- Task count: `{payload['task_count']}`",
109
  f"- Error tasks: `{payload['error_tasks']}`",
110
  f"- Mean success: `{payload['mean_success']:.3f}`",
@@ -144,6 +160,10 @@ def _run_mode(args: argparse.Namespace, plan: bool) -> Path:
144
  "episode_length": args.episode_length,
145
  "resolution": args.resolution,
146
  "device": args.device,
 
 
 
 
147
  "tasks": {},
148
  "subprocess_mode": "isolated_per_task",
149
  }
@@ -165,6 +185,10 @@ def _run_mode(args: argparse.Namespace, plan: bool) -> Path:
165
  chunk_commit_steps=args.chunk_commit_steps,
166
  allow_unsupervised_planning=args.allow_unsupervised_planning,
167
  disable_support_mode_conditioning=args.disable_support_mode_conditioning,
 
 
 
 
168
  )
169
 
170
  task_scores = [float(task_payload["mean_success"]) for task_payload in summary["tasks"].values()]
@@ -194,6 +218,10 @@ def main() -> None:
194
  parser.add_argument("--chunk-commit-steps", type=int, default=4)
195
  parser.add_argument("--allow-unsupervised-planning", action="store_true")
196
  parser.add_argument("--disable-support-mode-conditioning", action="store_true")
 
 
 
 
197
  parser.add_argument("--skip-noplan", action="store_true")
198
  parser.add_argument("--skip-plan", action="store_true")
199
  args = parser.parse_args()
 
25
  chunk_commit_steps: int,
26
  allow_unsupervised_planning: bool,
27
  disable_support_mode_conditioning: bool,
28
+ disable_task_conditioning: bool,
29
+ no_geometry: bool,
30
+ compact_world_model: bool,
31
+ no_planner: bool,
32
  ) -> dict[str, Any]:
33
  task_dir = output_dir / task_name
34
  task_dir.mkdir(parents=True, exist_ok=True)
 
57
  command.append("--headless")
58
  if plan:
59
  command.append("--plan")
60
+ if no_planner:
61
+ command.append("--no-planner")
62
  if allow_unsupervised_planning:
63
  command.append("--allow-unsupervised-planning")
64
  if disable_support_mode_conditioning:
65
  command.append("--disable-support-mode-conditioning")
66
+ if disable_task_conditioning:
67
+ command.append("--disable-task-conditioning")
68
+ if no_geometry:
69
+ command.append("--no-geometry")
70
+ if compact_world_model:
71
+ command.append("--compact-world-model")
72
 
73
  completed = subprocess.run(
74
  command,
 
117
  f"- Episodes per task: `{payload['episodes_per_task']}`",
118
  f"- Episode length: `{payload['episode_length']}`",
119
  f"- Resolution: `{payload['resolution']}`",
120
+ f"- No planner: `{payload['no_planner']}`",
121
+ f"- Disable task conditioning: `{payload['disable_task_conditioning']}`",
122
+ f"- No geometry: `{payload['no_geometry']}`",
123
+ f"- Compact world model: `{payload['compact_world_model']}`",
124
  f"- Task count: `{payload['task_count']}`",
125
  f"- Error tasks: `{payload['error_tasks']}`",
126
  f"- Mean success: `{payload['mean_success']:.3f}`",
 
160
  "episode_length": args.episode_length,
161
  "resolution": args.resolution,
162
  "device": args.device,
163
+ "no_planner": args.no_planner,
164
+ "disable_task_conditioning": args.disable_task_conditioning,
165
+ "no_geometry": args.no_geometry,
166
+ "compact_world_model": args.compact_world_model,
167
  "tasks": {},
168
  "subprocess_mode": "isolated_per_task",
169
  }
 
185
  chunk_commit_steps=args.chunk_commit_steps,
186
  allow_unsupervised_planning=args.allow_unsupervised_planning,
187
  disable_support_mode_conditioning=args.disable_support_mode_conditioning,
188
+ disable_task_conditioning=args.disable_task_conditioning,
189
+ no_geometry=args.no_geometry,
190
+ compact_world_model=args.compact_world_model,
191
+ no_planner=args.no_planner,
192
  )
193
 
194
  task_scores = [float(task_payload["mean_success"]) for task_payload in summary["tasks"].values()]
 
218
  parser.add_argument("--chunk-commit-steps", type=int, default=4)
219
  parser.add_argument("--allow-unsupervised-planning", action="store_true")
220
  parser.add_argument("--disable-support-mode-conditioning", action="store_true")
221
+ parser.add_argument("--disable-task-conditioning", action="store_true")
222
+ parser.add_argument("--no-geometry", action="store_true")
223
+ parser.add_argument("--compact-world-model", action="store_true")
224
+ parser.add_argument("--no-planner", action="store_true")
225
  parser.add_argument("--skip-noplan", action="store_true")
226
  parser.add_argument("--skip-plan", action="store_true")
227
  args = parser.parse_args()
code/reveal_vla_bimanual/eval/run_reveal_benchmark.py CHANGED
@@ -57,7 +57,12 @@ def load_model(checkpoint_path: str | Path, device: torch.device) -> tuple[torch
57
  allowed_missing = {
58
  key
59
  for key in incompatible.missing_keys
60
- if key.startswith("memory.action_proj.") or key.endswith("arm_identity.weight")
 
 
 
 
 
61
  }
62
  missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
63
  if missing_other or incompatible.unexpected_keys:
@@ -175,11 +180,18 @@ def select_chunk(
175
  history_depths=batch.get("history_depths"),
176
  history_depth_valid=batch.get("history_depth_valid"),
177
  plan=True,
 
178
  use_world_model=(ablation not in {"no_world_model", "no_planner"}),
179
  use_planner=(ablation != "no_planner"),
180
  use_depth=(ablation != "no_depth"),
 
 
181
  use_role_tokens=(ablation not in {"no_role_tokens", "no_role_symmetry"}),
182
  history_steps_override=(2 if ablation == "short_history" else None),
 
 
 
 
183
  )
184
  if "planned_chunk" in outputs and ablation != "no_planner":
185
  return outputs["planned_chunk"], outputs
@@ -204,6 +216,52 @@ def select_chunk(
204
  return outputs["action_mean"], outputs
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def evaluate_model(
208
  model: torch.nn.Module,
209
  device: torch.device,
@@ -212,24 +270,26 @@ def evaluate_model(
212
  resolution: int,
213
  ablation: str | None = None,
214
  chunk_commit_steps: int | None = None,
215
- ) -> BenchmarkMetrics:
216
  per_task_success: dict[str, float] = {}
217
  visibility_scores = []
218
  corridor_scores = []
219
  reocclusion_scores = []
220
  persistence_errors = []
221
  disturbance_scores = []
 
222
  history_steps = int(getattr(model.config.memory, "history_steps", 0)) if hasattr(model, "config") else 0
223
 
224
  for proxy_offset, proxy_name in enumerate(proxies):
225
  successes = []
226
  for episode_idx in range(episodes):
 
227
  env = make_proxy_env(
228
  proxy_name=proxy_name,
229
  resolution=resolution,
230
- seed=proxy_offset * 10_000 + episode_idx,
231
  )
232
- observation, privileged_state = env.reset(seed=proxy_offset * 10_000 + episode_idx)
233
  episode_visibility = [float(privileged_state["visibility"])]
234
  episode_corridor = [float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any())]
235
  episode_disturbance = [float(privileged_state["disturbance_cost"])]
@@ -287,20 +347,39 @@ def evaluate_model(
287
  if done:
288
  break
289
  successes.append(float(privileged_state["retrieval_success"]))
290
- visibility_scores.append(visibility_integral(np.asarray(episode_visibility)))
291
- corridor_scores.append(corridor_availability(np.asarray(episode_corridor)))
292
- reocclusion_scores.append(reocclusion_rate(np.asarray(episode_corridor)))
293
- disturbance_scores.append(mean_disturbance_cost(np.asarray(episode_disturbance)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  per_task_success[proxy_name] = float(np.mean(successes))
295
 
296
- return BenchmarkMetrics(
297
- per_task_success=per_task_success,
298
- mean_success=mean_success(per_task_success),
299
- visibility_integral=float(np.mean(visibility_scores)) if visibility_scores else None,
300
- corridor_availability=float(np.mean(corridor_scores)) if corridor_scores else None,
301
- reocclusion_rate=float(np.mean(reocclusion_scores)) if reocclusion_scores else None,
302
- persistence_horizon_mae=float(np.mean(persistence_errors)) if persistence_errors else None,
303
- disturbance_cost=float(np.mean(disturbance_scores)) if disturbance_scores else None,
 
 
 
304
  )
305
 
306
 
@@ -319,7 +398,8 @@ def _metrics_to_dict(metrics: BenchmarkMetrics) -> dict[str, float | dict[str, f
319
  def main() -> None:
320
  parser = argparse.ArgumentParser()
321
  parser.add_argument("--model", action="append", required=True, help="label=/abs/path/checkpoint.pt")
322
- parser.add_argument("--episodes", type=int, default=24)
 
323
  parser.add_argument("--resolution", type=int, default=None)
324
  parser.add_argument("--ablation", default=None)
325
  parser.add_argument("--output-root", default="/workspace/reports/reveal_eval")
@@ -329,37 +409,76 @@ def main() -> None:
329
 
330
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
331
  proxies = list(args.proxies or available_proxy_names())
 
 
332
  output_root = Path(args.output_root)
333
  output_root.mkdir(parents=True, exist_ok=True)
334
 
335
  sections: dict[str, dict[str, float | str]] = {}
336
- raw_metrics: dict[str, dict[str, float | dict[str, float]]] = {}
 
 
 
 
 
 
 
 
 
337
  for item in args.model:
338
  label, checkpoint_path = item.split("=", maxsplit=1)
339
  model, checkpoint = load_model(checkpoint_path, device=device)
340
  resolution = int(args.resolution or checkpoint.get("data_resolution", 96))
341
- metrics = evaluate_model(
342
  model=model,
343
  device=device,
344
  proxies=proxies,
345
- episodes=args.episodes,
346
  resolution=resolution,
347
  ablation=args.ablation,
348
- chunk_commit_steps=(None if args.chunk_commit_steps <= 0 else args.chunk_commit_steps),
349
  )
350
- raw_metrics[label] = _metrics_to_dict(metrics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  sections[label] = {
352
  "checkpoint": checkpoint_path,
 
 
 
353
  "mean_success": metrics.mean_success,
354
  "visibility_integral": metrics.visibility_integral or 0.0,
355
  "corridor_availability": metrics.corridor_availability or 0.0,
356
  "reocclusion_rate": metrics.reocclusion_rate or 0.0,
357
  "persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
358
  "disturbance_cost": metrics.disturbance_cost or 0.0,
 
 
359
  }
360
  for task_name, score in metrics.per_task_success.items():
361
  sections[label][f"{task_name}_success"] = score
362
 
 
 
 
 
 
 
 
 
 
363
  json_path = output_root / "reveal_benchmark.json"
364
  json_path.write_text(json.dumps(raw_metrics, indent=2), encoding="utf-8")
365
  write_comparison_report(output_root / "reveal_benchmark.md", "Reveal Proxy Benchmark", sections)
 
57
  allowed_missing = {
58
  key
59
  for key in incompatible.missing_keys
60
+ if key.startswith("memory.action_proj.")
61
+ or key.endswith("arm_identity.weight")
62
+ or key.endswith("task_embedding.weight")
63
+ or key.startswith("elastic_state_head.decoder.task_")
64
+ or key.startswith("world_model.task_")
65
+ or key.startswith("world_model.spatial_")
66
  }
67
  missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
68
  if missing_other or incompatible.unexpected_keys:
 
180
  history_depths=batch.get("history_depths"),
181
  history_depth_valid=batch.get("history_depth_valid"),
182
  plan=True,
183
+ support_mode_conditioning=(ablation != "no_support_mode_conditioning"),
184
  use_world_model=(ablation not in {"no_world_model", "no_planner"}),
185
  use_planner=(ablation != "no_planner"),
186
  use_depth=(ablation != "no_depth"),
187
+ use_geometry_tokens=(ablation != "no_geometry"),
188
+ use_camera_pose_tokens=(ablation != "no_camera_pose"),
189
  use_role_tokens=(ablation not in {"no_role_tokens", "no_role_symmetry"}),
190
  history_steps_override=(2 if ablation == "short_history" else None),
191
+ use_memory=(ablation != "no_spatial_memory"),
192
+ use_task_conditioning=(ablation != "no_task_head"),
193
+ rollout_mode_override=("compact_rollout" if ablation == "compact_world_model" else None),
194
+ use_proposal_candidates=(ablation != "gaussian_candidates_only"),
195
  )
196
  if "planned_chunk" in outputs and ablation != "no_planner":
197
  return outputs["planned_chunk"], outputs
 
216
  return outputs["action_mean"], outputs
217
 
218
 
219
+ def _bootstrap_interval(values: list[float], bootstrap_samples: int = 1000) -> dict[str, float]:
220
+ if not values:
221
+ return {"mean": 0.0, "low": 0.0, "high": 0.0}
222
+ array = np.asarray(values, dtype=np.float32)
223
+ mean = float(array.mean())
224
+ if array.size == 1:
225
+ return {"mean": mean, "low": mean, "high": mean}
226
+ rng = np.random.default_rng(0)
227
+ sample_indices = rng.integers(0, array.size, size=(bootstrap_samples, array.size))
228
+ sampled_means = array[sample_indices].mean(axis=1)
229
+ return {
230
+ "mean": mean,
231
+ "low": float(np.percentile(sampled_means, 2.5)),
232
+ "high": float(np.percentile(sampled_means, 97.5)),
233
+ }
234
+
235
+
236
+ def _paired_seed_summary(
237
+ reference_records: list[dict[str, float | int | str]],
238
+ candidate_records: list[dict[str, float | int | str]],
239
+ ) -> dict[str, float]:
240
+ reference_by_key = {
241
+ (str(record["proxy_name"]), int(record["seed"])): record for record in reference_records
242
+ }
243
+ success_deltas = []
244
+ visibility_deltas = []
245
+ reocclusion_deltas = []
246
+ disturbance_deltas = []
247
+ for record in candidate_records:
248
+ key = (str(record["proxy_name"]), int(record["seed"]))
249
+ baseline = reference_by_key.get(key)
250
+ if baseline is None:
251
+ continue
252
+ success_deltas.append(float(record["success"]) - float(baseline["success"]))
253
+ visibility_deltas.append(float(record["visibility_integral"]) - float(baseline["visibility_integral"]))
254
+ reocclusion_deltas.append(float(record["reocclusion_rate"]) - float(baseline["reocclusion_rate"]))
255
+ disturbance_deltas.append(float(record["disturbance_cost"]) - float(baseline["disturbance_cost"]))
256
+ return {
257
+ "paired_episodes": float(len(success_deltas)),
258
+ "success_delta": float(np.mean(success_deltas)) if success_deltas else 0.0,
259
+ "visibility_delta": float(np.mean(visibility_deltas)) if visibility_deltas else 0.0,
260
+ "reocclusion_delta": float(np.mean(reocclusion_deltas)) if reocclusion_deltas else 0.0,
261
+ "disturbance_delta": float(np.mean(disturbance_deltas)) if disturbance_deltas else 0.0,
262
+ }
263
+
264
+
265
  def evaluate_model(
266
  model: torch.nn.Module,
267
  device: torch.device,
 
270
  resolution: int,
271
  ablation: str | None = None,
272
  chunk_commit_steps: int | None = None,
273
+ ) -> tuple[BenchmarkMetrics, list[dict[str, float | int | str]]]:
274
  per_task_success: dict[str, float] = {}
275
  visibility_scores = []
276
  corridor_scores = []
277
  reocclusion_scores = []
278
  persistence_errors = []
279
  disturbance_scores = []
280
+ episode_records: list[dict[str, float | int | str]] = []
281
  history_steps = int(getattr(model.config.memory, "history_steps", 0)) if hasattr(model, "config") else 0
282
 
283
  for proxy_offset, proxy_name in enumerate(proxies):
284
  successes = []
285
  for episode_idx in range(episodes):
286
+ seed = proxy_offset * 10_000 + episode_idx
287
  env = make_proxy_env(
288
  proxy_name=proxy_name,
289
  resolution=resolution,
290
+ seed=seed,
291
  )
292
+ observation, privileged_state = env.reset(seed=seed)
293
  episode_visibility = [float(privileged_state["visibility"])]
294
  episode_corridor = [float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any())]
295
  episode_disturbance = [float(privileged_state["disturbance_cost"])]
 
347
  if done:
348
  break
349
  successes.append(float(privileged_state["retrieval_success"]))
350
+ episode_visibility_integral = visibility_integral(np.asarray(episode_visibility))
351
+ episode_corridor_availability = corridor_availability(np.asarray(episode_corridor))
352
+ episode_reocclusion = reocclusion_rate(np.asarray(episode_corridor))
353
+ episode_disturbance_cost = mean_disturbance_cost(np.asarray(episode_disturbance))
354
+ visibility_scores.append(episode_visibility_integral)
355
+ corridor_scores.append(episode_corridor_availability)
356
+ reocclusion_scores.append(episode_reocclusion)
357
+ disturbance_scores.append(episode_disturbance_cost)
358
+ episode_records.append(
359
+ {
360
+ "proxy_name": proxy_name,
361
+ "seed": seed,
362
+ "episode_index": episode_idx,
363
+ "success": float(privileged_state["retrieval_success"]),
364
+ "visibility_integral": episode_visibility_integral,
365
+ "corridor_availability": episode_corridor_availability,
366
+ "reocclusion_rate": episode_reocclusion,
367
+ "disturbance_cost": episode_disturbance_cost,
368
+ }
369
+ )
370
  per_task_success[proxy_name] = float(np.mean(successes))
371
 
372
+ return (
373
+ BenchmarkMetrics(
374
+ per_task_success=per_task_success,
375
+ mean_success=mean_success(per_task_success),
376
+ visibility_integral=float(np.mean(visibility_scores)) if visibility_scores else None,
377
+ corridor_availability=float(np.mean(corridor_scores)) if corridor_scores else None,
378
+ reocclusion_rate=float(np.mean(reocclusion_scores)) if reocclusion_scores else None,
379
+ persistence_horizon_mae=float(np.mean(persistence_errors)) if persistence_errors else None,
380
+ disturbance_cost=float(np.mean(disturbance_scores)) if disturbance_scores else None,
381
+ ),
382
+ episode_records,
383
  )
384
 
385
 
 
398
  def main() -> None:
399
  parser = argparse.ArgumentParser()
400
  parser.add_argument("--model", action="append", required=True, help="label=/abs/path/checkpoint.pt")
401
+ parser.add_argument("--episodes", type=int, default=None)
402
+ parser.add_argument("--benchmark-mode", choices=("smoke", "serious"), default="smoke")
403
  parser.add_argument("--resolution", type=int, default=None)
404
  parser.add_argument("--ablation", default=None)
405
  parser.add_argument("--output-root", default="/workspace/reports/reveal_eval")
 
409
 
410
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
411
  proxies = list(args.proxies or available_proxy_names())
412
+ episodes = int(args.episodes or (100 if args.benchmark_mode == "serious" else 24))
413
+ chunk_commit_steps = None if args.chunk_commit_steps <= 0 else args.chunk_commit_steps
414
  output_root = Path(args.output_root)
415
  output_root.mkdir(parents=True, exist_ok=True)
416
 
417
  sections: dict[str, dict[str, float | str]] = {}
418
+ raw_metrics: dict[str, dict[str, Any]] = {
419
+ "benchmark_config": {
420
+ "episodes": episodes,
421
+ "benchmark_mode": args.benchmark_mode,
422
+ "ablation": args.ablation,
423
+ "proxies": proxies,
424
+ "chunk_commit_steps": 0 if chunk_commit_steps is None else chunk_commit_steps,
425
+ }
426
+ }
427
+ episode_records_by_label: dict[str, list[dict[str, float | int | str]]] = {}
428
  for item in args.model:
429
  label, checkpoint_path = item.split("=", maxsplit=1)
430
  model, checkpoint = load_model(checkpoint_path, device=device)
431
  resolution = int(args.resolution or checkpoint.get("data_resolution", 96))
432
+ metrics, episode_records = evaluate_model(
433
  model=model,
434
  device=device,
435
  proxies=proxies,
436
+ episodes=episodes,
437
  resolution=resolution,
438
  ablation=args.ablation,
439
+ chunk_commit_steps=chunk_commit_steps,
440
  )
441
+ episode_records_by_label[label] = episode_records
442
+ success_values = [float(record["success"]) for record in episode_records]
443
+ visibility_values = [float(record["visibility_integral"]) for record in episode_records]
444
+ reocclusion_values = [float(record["reocclusion_rate"]) for record in episode_records]
445
+ disturbance_values = [float(record["disturbance_cost"]) for record in episode_records]
446
+ raw_metrics[label] = {
447
+ **_metrics_to_dict(metrics),
448
+ "bootstrap_ci": {
449
+ "success": _bootstrap_interval(success_values),
450
+ "visibility_integral": _bootstrap_interval(visibility_values),
451
+ "reocclusion_rate": _bootstrap_interval(reocclusion_values),
452
+ "disturbance_cost": _bootstrap_interval(disturbance_values),
453
+ },
454
+ "episode_records": episode_records,
455
+ }
456
  sections[label] = {
457
  "checkpoint": checkpoint_path,
458
+ "benchmark_mode": args.benchmark_mode,
459
+ "episodes": float(episodes),
460
+ "chunk_commit_steps": float(0 if chunk_commit_steps is None else chunk_commit_steps),
461
  "mean_success": metrics.mean_success,
462
  "visibility_integral": metrics.visibility_integral or 0.0,
463
  "corridor_availability": metrics.corridor_availability or 0.0,
464
  "reocclusion_rate": metrics.reocclusion_rate or 0.0,
465
  "persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
466
  "disturbance_cost": metrics.disturbance_cost or 0.0,
467
+ "success_ci_low": raw_metrics[label]["bootstrap_ci"]["success"]["low"],
468
+ "success_ci_high": raw_metrics[label]["bootstrap_ci"]["success"]["high"],
469
  }
470
  for task_name, score in metrics.per_task_success.items():
471
  sections[label][f"{task_name}_success"] = score
472
 
473
+ labels = [item.split("=", maxsplit=1)[0] for item in args.model]
474
+ if labels:
475
+ reference_label = labels[0]
476
+ for label in labels[1:]:
477
+ summary = _paired_seed_summary(episode_records_by_label[reference_label], episode_records_by_label[label])
478
+ raw_metrics[label]["paired_seed_summary_vs_" + reference_label] = summary
479
+ for key, value in summary.items():
480
+ sections[label][f"paired_{key}_vs_{reference_label}"] = value
481
+
482
  json_path = output_root / "reveal_benchmark.json"
483
  json_path.write_text(json.dumps(raw_metrics, indent=2), encoding="utf-8")
484
  write_comparison_report(output_root / "reveal_benchmark.md", "Reveal Proxy Benchmark", sections)
code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py CHANGED
@@ -122,11 +122,15 @@ def main() -> None:
122
  parser.add_argument("--resolution", type=int, default=224)
123
  parser.add_argument("--device", default="cuda")
124
  parser.add_argument("--plan", action="store_true")
 
125
  parser.add_argument("--allow-unsupervised-planning", action="store_true")
126
  parser.add_argument("--disable-support-mode-conditioning", action="store_true")
 
127
  parser.add_argument("--headless", action="store_true", default=True)
128
  parser.add_argument("--chunk-commit-steps", type=int, default=0)
129
  parser.add_argument("--reset-retries", type=int, default=20)
 
 
130
  args = parser.parse_args()
131
 
132
  checkpoint = torch.load(Path(args.checkpoint), map_location="cpu", weights_only=False)
@@ -138,7 +142,12 @@ def main() -> None:
138
  allowed_missing = {
139
  key
140
  for key in incompatible.missing_keys
141
- if key.startswith("memory.action_proj.") or key.endswith("arm_identity.weight")
 
 
 
 
 
142
  }
143
  missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
144
  if missing_other or incompatible.unexpected_keys:
@@ -147,7 +156,7 @@ def main() -> None:
147
  f"Missing keys: {missing_other}. Unexpected keys: {list(incompatible.unexpected_keys)}"
148
  )
149
  model.eval()
150
- plan_requested = bool(args.plan)
151
  plan_applied = plan_requested and planner_enabled(trainer_config, during_eval=True)
152
  planning_note = None
153
  if plan_requested and not policy_supports_planning(trainer_config.policy_type):
@@ -166,6 +175,9 @@ def main() -> None:
166
  "plan_applied": plan_applied,
167
  "planner_mode": trainer_config.planner_mode,
168
  "support_mode_conditioning": not args.disable_support_mode_conditioning,
 
 
 
169
  "episodes_per_task": args.episodes_per_task,
170
  "episode_length": args.episode_length,
171
  "resolution": args.resolution,
@@ -196,6 +208,9 @@ def main() -> None:
196
  env.launch()
197
  task = env.get_task(task_class)
198
  task_reset_retries: list[int] = []
 
 
 
199
  for _ in range(args.episodes_per_task):
200
  descriptions, obs, reset_retries = _reset_task_with_retries(task, max_attempts=max(1, args.reset_retries))
201
  task_reset_retries.append(int(reset_retries))
@@ -204,6 +219,7 @@ def main() -> None:
204
  success = 0.0
205
  episode_recoveries = 0
206
  episode_noop_fallbacks = 0
 
207
  history_images: list[np.ndarray] = []
208
  history_proprio: list[np.ndarray] = []
209
  history_actions: list[np.ndarray] = []
@@ -256,6 +272,10 @@ def main() -> None:
256
  history_actions=history_actions_tensor,
257
  plan=plan_applied,
258
  support_mode_conditioning=not args.disable_support_mode_conditioning,
 
 
 
 
259
  )
260
  else:
261
  outputs = model(
@@ -269,6 +289,20 @@ def main() -> None:
269
  chosen_chunk = outputs["action_mean"]
270
  if plan_applied and "planned_chunk" in outputs:
271
  chosen_chunk = outputs["planned_chunk"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  chunk_np = chosen_chunk[0].detach().float().cpu().numpy()
273
  commit_steps = chunk_np.shape[0] if args.chunk_commit_steps <= 0 else min(args.chunk_commit_steps, chunk_np.shape[0])
274
  done = False
@@ -292,6 +326,17 @@ def main() -> None:
292
  obs, reward, done, recovered_steps, noop_fallbacks = _step_bimanual_chunk(task, obs, step_action)
293
  episode_recoveries += recovered_steps
294
  episode_noop_fallbacks += noop_fallbacks
 
 
 
 
 
 
 
 
 
 
 
295
  total_reward += float(reward)
296
  timestep += 1
297
  if reward >= 1.0:
@@ -302,13 +347,21 @@ def main() -> None:
302
  break
303
  task_successes.append(success)
304
  task_returns.append(total_reward)
 
 
 
 
 
 
 
305
  results["tasks"][task_name] = {
306
  "task_class": task_class.__name__,
307
  "successes": task_successes,
308
  "returns": task_returns,
309
- "path_recoveries": episode_recoveries if args.episodes_per_task == 1 else None,
310
- "noop_fallbacks": episode_noop_fallbacks if args.episodes_per_task == 1 else None,
311
  "reset_retries": task_reset_retries,
 
312
  "mean_success": float(np.mean(task_successes)) if task_successes else 0.0,
313
  "mean_return": float(np.mean(task_returns)) if task_returns else 0.0,
314
  }
@@ -331,6 +384,9 @@ def main() -> None:
331
  f"- Plan requested: `{results['plan_requested']}`",
332
  f"- Plan applied: `{results['plan_applied']}`",
333
  f"- Support-mode conditioning: `{results['support_mode_conditioning']}`",
 
 
 
334
  f"- Mean success: `{results['mean_success']:.3f}`",
335
  "",
336
  "## Per-task",
 
122
  parser.add_argument("--resolution", type=int, default=224)
123
  parser.add_argument("--device", default="cuda")
124
  parser.add_argument("--plan", action="store_true")
125
+ parser.add_argument("--no-planner", action="store_true")
126
  parser.add_argument("--allow-unsupervised-planning", action="store_true")
127
  parser.add_argument("--disable-support-mode-conditioning", action="store_true")
128
+ parser.add_argument("--disable-task-conditioning", action="store_true")
129
  parser.add_argument("--headless", action="store_true", default=True)
130
  parser.add_argument("--chunk-commit-steps", type=int, default=0)
131
  parser.add_argument("--reset-retries", type=int, default=20)
132
+ parser.add_argument("--no-geometry", action="store_true")
133
+ parser.add_argument("--compact-world-model", action="store_true")
134
  args = parser.parse_args()
135
 
136
  checkpoint = torch.load(Path(args.checkpoint), map_location="cpu", weights_only=False)
 
142
  allowed_missing = {
143
  key
144
  for key in incompatible.missing_keys
145
+ if key.startswith("memory.action_proj.")
146
+ or key.endswith("arm_identity.weight")
147
+ or key.endswith("task_embedding.weight")
148
+ or key.startswith("elastic_state_head.decoder.task_")
149
+ or key.startswith("world_model.task_")
150
+ or key.startswith("world_model.spatial_")
151
  }
152
  missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
153
  if missing_other or incompatible.unexpected_keys:
 
156
  f"Missing keys: {missing_other}. Unexpected keys: {list(incompatible.unexpected_keys)}"
157
  )
158
  model.eval()
159
+ plan_requested = bool(args.plan) and not bool(args.no_planner)
160
  plan_applied = plan_requested and planner_enabled(trainer_config, during_eval=True)
161
  planning_note = None
162
  if plan_requested and not policy_supports_planning(trainer_config.policy_type):
 
175
  "plan_applied": plan_applied,
176
  "planner_mode": trainer_config.planner_mode,
177
  "support_mode_conditioning": not args.disable_support_mode_conditioning,
178
+ "task_conditioning": not args.disable_task_conditioning,
179
+ "geometry_enabled": not args.no_geometry,
180
+ "world_model_mode": "compact_rollout" if args.compact_world_model else "checkpoint_default",
181
  "episodes_per_task": args.episodes_per_task,
182
  "episode_length": args.episode_length,
183
  "resolution": args.resolution,
 
208
  env.launch()
209
  task = env.get_task(task_class)
210
  task_reset_retries: list[int] = []
211
+ task_path_recoveries: list[int] = []
212
+ task_noop_fallbacks: list[int] = []
213
+ task_episode_traces: list[dict[str, Any]] = []
214
  for _ in range(args.episodes_per_task):
215
  descriptions, obs, reset_retries = _reset_task_with_retries(task, max_attempts=max(1, args.reset_retries))
216
  task_reset_retries.append(int(reset_retries))
 
219
  success = 0.0
220
  episode_recoveries = 0
221
  episode_noop_fallbacks = 0
222
+ episode_trace: dict[str, Any] = {"language_goal": language_goal, "steps": []}
223
  history_images: list[np.ndarray] = []
224
  history_proprio: list[np.ndarray] = []
225
  history_actions: list[np.ndarray] = []
 
272
  history_actions=history_actions_tensor,
273
  plan=plan_applied,
274
  support_mode_conditioning=not args.disable_support_mode_conditioning,
275
+ use_planner=not args.no_planner,
276
+ use_geometry_tokens=not args.no_geometry,
277
+ use_task_conditioning=not args.disable_task_conditioning,
278
+ rollout_mode_override=("compact_rollout" if args.compact_world_model else None),
279
  )
280
  else:
281
  outputs = model(
 
289
  chosen_chunk = outputs["action_mean"]
290
  if plan_applied and "planned_chunk" in outputs:
291
  chosen_chunk = outputs["planned_chunk"]
292
+ best_local = 0
293
+ if isinstance(outputs.get("ranking_diagnostics"), dict) and "best_local_indices" in outputs["ranking_diagnostics"]:
294
+ best_local = int(outputs["ranking_diagnostics"]["best_local_indices"][0].detach().cpu().item())
295
+ chosen_macro_mode = None
296
+ if "planner_topk_mode_names" in outputs and outputs["planner_topk_mode_names"]:
297
+ chosen_macro_mode = outputs["planner_topk_mode_names"][0][best_local]
298
+ predicted_reocclusion = None
299
+ if "planned_rollout" in outputs and "reocclusion_field" in outputs["planned_rollout"]:
300
+ predicted_reocclusion = float(
301
+ outputs["planned_rollout"]["reocclusion_field"][0, best_local].mean().detach().cpu().item()
302
+ )
303
+ planner_scores = None
304
+ if "planner_scores" in outputs:
305
+ planner_scores = outputs["planner_scores"][0].detach().cpu().tolist()
306
  chunk_np = chosen_chunk[0].detach().float().cpu().numpy()
307
  commit_steps = chunk_np.shape[0] if args.chunk_commit_steps <= 0 else min(args.chunk_commit_steps, chunk_np.shape[0])
308
  done = False
 
326
  obs, reward, done, recovered_steps, noop_fallbacks = _step_bimanual_chunk(task, obs, step_action)
327
  episode_recoveries += recovered_steps
328
  episode_noop_fallbacks += noop_fallbacks
329
+ episode_trace["steps"].append(
330
+ {
331
+ "timestep": int(timestep),
332
+ "chosen_macro_mode": chosen_macro_mode,
333
+ "planner_scores": planner_scores,
334
+ "predicted_reocclusion": predicted_reocclusion,
335
+ "support_mode_conditioning": not args.disable_support_mode_conditioning,
336
+ "path_recoveries": int(recovered_steps),
337
+ "noop_fallbacks": int(noop_fallbacks),
338
+ }
339
+ )
340
  total_reward += float(reward)
341
  timestep += 1
342
  if reward >= 1.0:
 
347
  break
348
  task_successes.append(success)
349
  task_returns.append(total_reward)
350
+ task_path_recoveries.append(int(episode_recoveries))
351
+ task_noop_fallbacks.append(int(episode_noop_fallbacks))
352
+ episode_trace["success"] = float(success)
353
+ episode_trace["return"] = float(total_reward)
354
+ episode_trace["path_recoveries"] = int(episode_recoveries)
355
+ episode_trace["noop_fallbacks"] = int(episode_noop_fallbacks)
356
+ task_episode_traces.append(episode_trace)
357
  results["tasks"][task_name] = {
358
  "task_class": task_class.__name__,
359
  "successes": task_successes,
360
  "returns": task_returns,
361
+ "path_recoveries": task_path_recoveries,
362
+ "noop_fallbacks": task_noop_fallbacks,
363
  "reset_retries": task_reset_retries,
364
+ "episode_traces": task_episode_traces,
365
  "mean_success": float(np.mean(task_successes)) if task_successes else 0.0,
366
  "mean_return": float(np.mean(task_returns)) if task_returns else 0.0,
367
  }
 
384
  f"- Plan requested: `{results['plan_requested']}`",
385
  f"- Plan applied: `{results['plan_applied']}`",
386
  f"- Support-mode conditioning: `{results['support_mode_conditioning']}`",
387
+ f"- Task conditioning: `{results['task_conditioning']}`",
388
+ f"- Geometry enabled: `{results['geometry_enabled']}`",
389
+ f"- World-model mode: `{results['world_model_mode']}`",
390
  f"- Mean success: `{results['mean_success']:.3f}`",
391
  "",
392
  "## Per-task",
code/reveal_vla_bimanual/eval/run_teacher_audit.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+
9
+ from eval.report import write_comparison_report
10
+ from sim_reveal.procedural_envs import available_proxy_names, make_proxy_env
11
+
12
+
13
# Baseline policies audited on each proxy env. Ordering is load-bearing:
# main() treats BASELINES[0] ("teacher") as the reference and reports every
# remaining entry as a delta against it.
BASELINES: tuple[str, ...] = ("teacher", "reveal_only", "retrieve_only", "no_hold", "random")
14
+
15
+
16
def _evaluate_baseline(
    proxy_name: str,
    baseline_name: str,
    episodes: int,
    resolution: int,
    chunk_horizon: int,
    rollout_horizon: int,
) -> dict[str, float]:
    """Roll out one scripted baseline on a proxy env and return mean per-episode metrics.

    One environment is built per episode with ``seed`` equal to the episode
    index, so repeated calls are deterministic for a fixed configuration.
    Returns means of: success, reveal_achieved, visibility_integral,
    hold_persistence, reocclusion_rate, disturbance_cost, utility.
    """
    per_episode: list[dict[str, float]] = []
    for seed in range(episodes):
        env = make_proxy_env(
            proxy_name=proxy_name,
            resolution=resolution,
            seed=seed,
            rollout_horizon=rollout_horizon,
        )
        _, _ = env.reset(seed=seed)
        chunk = env.baseline_action_chunk(baseline_name, chunk_horizon=chunk_horizon)
        outcome = env.evaluate_action_chunk(chunk, rollout_horizon=rollout_horizon)
        record = {
            "success": float(outcome["retrieval_success"]),
            "reveal_achieved": float(outcome["reveal_achieved"]),
            "visibility_integral": float(outcome["visibility_integral"]),
            "hold_persistence": float(outcome["hold_persistence"]),
            "reocclusion_rate": float(outcome["reocclusion_rate"]),
            "disturbance_cost": float(outcome["final_disturbance_cost"]),
        }
        # Scalar utility: success dominates; smaller bonuses for revealing,
        # holding and visibility; reocclusion and disturbance are penalties.
        record["utility"] = (
            record["success"]
            + 0.25 * record["reveal_achieved"]
            + 0.1 * record["hold_persistence"]
            + 0.05 * record["visibility_integral"]
            - record["reocclusion_rate"]
            - record["disturbance_cost"]
        )
        per_episode.append(record)
    metric_keys = (
        "success",
        "reveal_achieved",
        "visibility_integral",
        "hold_persistence",
        "reocclusion_rate",
        "disturbance_cost",
        "utility",
    )
    return {key: float(np.mean([record[key] for record in per_episode])) for key in metric_keys}
60
+
61
+
62
def main() -> None:
    """CLI entry point: audit scripted baselines against the teacher on each proxy env.

    Writes ``teacher_audit.json`` (raw per-baseline metrics) and
    ``teacher_audit.md`` (teacher-vs-baseline deltas) under ``--output-root``,
    then prints a JSON summary to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--episodes", type=int, default=100)
    parser.add_argument("--resolution", type=int, default=96)
    parser.add_argument("--chunk-horizon", type=int, default=8)
    parser.add_argument("--rollout-horizon", type=int, default=5)
    parser.add_argument("--output-root", default="/workspace/reports/reveal_teacher_audit")
    parser.add_argument("--proxies", nargs="*", default=None)
    args = parser.parse_args()

    # Fall back to every registered proxy when --proxies is omitted or empty.
    proxies = list(args.proxies or available_proxy_names())
    output_root = Path(args.output_root)
    output_root.mkdir(parents=True, exist_ok=True)

    raw: dict[str, dict[str, dict[str, float]]] = {}
    sections: dict[str, dict[str, float | str]] = {}
    for proxy_name in proxies:
        results_by_baseline: dict[str, dict[str, float]] = {}
        for baseline_name in BASELINES:
            results_by_baseline[baseline_name] = _evaluate_baseline(
                proxy_name=proxy_name,
                baseline_name=baseline_name,
                episodes=args.episodes,
                resolution=args.resolution,
                chunk_horizon=args.chunk_horizon,
                rollout_horizon=args.rollout_horizon,
            )
        raw[proxy_name] = results_by_baseline
        teacher = results_by_baseline["teacher"]
        # Every non-teacher baseline is reported as a delta against the teacher.
        for baseline_name in BASELINES[1:]:
            candidate = results_by_baseline[baseline_name]
            sections[f"{proxy_name}:{baseline_name}"] = {
                "episodes": float(args.episodes),
                "teacher_success": teacher["success"],
                "baseline_success": candidate["success"],
                "teacher_utility": teacher["utility"],
                "baseline_utility": candidate["utility"],
                "success_delta": teacher["success"] - candidate["success"],
                "utility_delta": teacher["utility"] - candidate["utility"],
                "hold_delta": teacher["hold_persistence"] - candidate["hold_persistence"],
                "reocclusion_delta": teacher["reocclusion_rate"] - candidate["reocclusion_rate"],
                "disturbance_delta": teacher["disturbance_cost"] - candidate["disturbance_cost"],
            }

    json_path = output_root / "teacher_audit.json"
    json_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    write_comparison_report(output_root / "teacher_audit.md", "Reveal Teacher Audit", sections)
    print(json.dumps({"output_json": str(json_path), "sections": sections}, indent=2))


if __name__ == "__main__":
    main()
code/reveal_vla_bimanual/models/action_decoder.py CHANGED
@@ -394,6 +394,79 @@ DEFAULT_PROPOSAL_MODES = (
394
  "retrieve",
395
  )
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
  def swap_arm_action_order(action_chunk: Tensor) -> Tensor:
399
  midpoint = action_chunk.shape[-1] // 2
@@ -416,6 +489,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
416
  self.arm_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_layers)
417
  self.query_embed = nn.Embedding(config.chunk_size, config.hidden_dim)
418
  self.arm_identity = nn.Embedding(2, config.hidden_dim)
 
419
  self.phase_adapter = nn.Linear(config.num_phases, config.hidden_dim)
420
  self.role_adapter = nn.Linear(config.num_arm_roles, config.hidden_dim)
421
  self.context_proj = nn.Sequential(
@@ -535,7 +609,8 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
535
  self,
536
  base_action: Tensor,
537
  pooled_context: Tensor,
538
- ) -> tuple[Tensor, Tensor, Tensor]:
 
539
  batch_size = pooled_context.shape[0]
540
  mode_logits = self.proposal_mode_head(pooled_context)
541
  mode_residuals = []
@@ -553,6 +628,13 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
553
  )
554
  proposal_candidates = []
555
  proposal_logits = []
 
 
 
 
 
 
 
556
  for slot_idx in range(self.config.num_candidates):
557
  mode_idx = int(mode_assignments[slot_idx])
558
  candidate = base_action + 0.35 * torch.tanh(mode_residuals[:, mode_idx]) + 0.05 * torch.tanh(slot_deltas[slot_idx]).unsqueeze(0)
@@ -571,7 +653,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
571
  stacked_candidates = torch.stack(proposal_candidates, dim=1)
572
  stacked_logits = torch.stack(proposal_logits, dim=1)
573
  stacked_candidates[:, 0] = base_action
574
- return stacked_candidates, stacked_logits, mode_logits
575
 
576
  def forward(
577
  self,
@@ -581,6 +663,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
581
  reveal_tokens: Tensor | None = None,
582
  memory_token: Tensor | None = None,
583
  compute_equivariance_probe: bool = False,
 
584
  ) -> dict[str, Tensor]:
585
  if memory_tokens is None:
586
  memory_tokens = memory_token
@@ -602,6 +685,20 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
602
  if memory_tokens is not None:
603
  decoder_memory = torch.cat([decoder_memory, memory_tokens], dim=1)
604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  base_queries = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
606
  arm_mean, arm_log_std, coordination = self._decode_arm_tokens(
607
  queries=base_queries,
@@ -621,7 +718,11 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
621
  ],
622
  dim=-1,
623
  )
624
- proposal_candidates, proposal_logits, proposal_mode_logits = self._proposal_outputs(action_mean, pooled_context)
 
 
 
 
625
 
626
  outputs = {
627
  "decoded_tokens": torch.cat([arm_mean[:, 0], arm_mean[:, 1]], dim=-1),
@@ -637,7 +738,8 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
637
  self.config.num_candidates,
638
  device=scene_tokens.device,
639
  ) % self.config.num_proposal_modes,
640
- "proposal_mode_names": list(DEFAULT_PROPOSAL_MODES[: self.config.num_proposal_modes]),
 
641
  }
642
  if compute_equivariance_probe:
643
  swapped_phase, swapped_roles, swapped_context = self._conditioning(
 
394
  "retrieve",
395
  )
396
 
397
# Per-task macro-mode vocabularies used to label planner proposal slots.
# Ordering is load-bearing: proposal_mode_vocab() keeps the leading entries
# plus the last two ("insert_actor", "retrieve") when shrinking a vocabulary,
# and pads by repeating the final entry when growing one.
TASK_PROPOSAL_MODES = {
    "foliage": (
        "sweep_left",
        "sweep_right",
        "pin_canopy",
        "widen_gap",
        "maintain_gap",
        "insert_actor",
        "retrieve",
    ),
    "bag": (
        "pin_left_rim",
        "pin_right_rim",
        "widen_mouth",
        "maintain_mouth",
        "probe_inside",
        "insert_actor",
        "retrieve",
    ),
    "cloth": (
        "lift_edge",
        "separate_layer",
        "stabilize_fold",
        "maintain_lift",
        "insert_actor",
        "retrieve",
    ),
}

# Integer id per task family; sizes the decoder's task_embedding table.
# Lookups elsewhere use TASK_INDEX.get(name, 0), so "generic" falls back to 0.
TASK_INDEX = {"foliage": 0, "bag": 1, "cloth": 2}
427
+
428
+
429
+ def infer_task_name_from_text(text: str | None) -> str:
430
+ if not text:
431
+ return "generic"
432
+ lowered = text.lower()
433
+ if any(token in lowered for token in ("foliage", "canopy", "leaf", "leaves", "snail")):
434
+ return "foliage"
435
+ if any(token in lowered for token in ("bag", "mouth", "rim", "aperture")):
436
+ return "bag"
437
+ if any(token in lowered for token in ("cloth", "fold", "layer", "suitcase", "garment")):
438
+ return "cloth"
439
+ return "generic"
440
+
441
+
442
def proposal_mode_vocab(task_name: str, num_modes: int) -> tuple[str, ...]:
    """Return exactly ``num_modes`` macro-mode names for a task family.

    "generic" uses DEFAULT_PROPOSAL_MODES; any other name indexes
    TASK_PROPOSAL_MODES (a KeyError propagates for unknown names).
    Oversized task vocabularies are truncated — keeping the first four plus
    the last two entries when at least six modes are requested, otherwise a
    plain prefix — and undersized ones are padded by repeating the final
    entry ("retrieve" fills everything if the base vocabulary is empty).
    """
    if task_name == "generic":
        base_vocab: tuple[str, ...] = tuple(DEFAULT_PROPOSAL_MODES)
    else:
        vocab = TASK_PROPOSAL_MODES[task_name]
        if len(vocab) <= num_modes:
            base_vocab = vocab
        elif num_modes >= 6:
            # Keep the head of the vocabulary plus the trailing
            # insert/retrieve pair, then trim to the requested width.
            base_vocab = (*vocab[:4], vocab[-2], vocab[-1])[:num_modes]
        else:
            base_vocab = vocab[:num_modes]
    if len(base_vocab) >= num_modes:
        return tuple(base_vocab[:num_modes])
    if not base_vocab:
        return tuple("retrieve" for _ in range(num_modes))
    padding = num_modes - len(base_vocab)
    return tuple(base_vocab) + (base_vocab[-1],) * padding
469
+
470
 
471
  def swap_arm_action_order(action_chunk: Tensor) -> Tensor:
472
  midpoint = action_chunk.shape[-1] // 2
 
489
  self.arm_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_layers)
490
  self.query_embed = nn.Embedding(config.chunk_size, config.hidden_dim)
491
  self.arm_identity = nn.Embedding(2, config.hidden_dim)
492
+ self.task_embedding = nn.Embedding(len(TASK_INDEX), config.hidden_dim)
493
  self.phase_adapter = nn.Linear(config.num_phases, config.hidden_dim)
494
  self.role_adapter = nn.Linear(config.num_arm_roles, config.hidden_dim)
495
  self.context_proj = nn.Sequential(
 
609
  self,
610
  base_action: Tensor,
611
  pooled_context: Tensor,
612
+ task_names: list[str],
613
+ ) -> tuple[Tensor, Tensor, Tensor, list[list[str]]]:
614
  batch_size = pooled_context.shape[0]
615
  mode_logits = self.proposal_mode_head(pooled_context)
616
  mode_residuals = []
 
628
  )
629
  proposal_candidates = []
630
  proposal_logits = []
631
+ proposal_mode_names = [
632
+ [
633
+ proposal_mode_vocab(task_name, self.config.num_proposal_modes)[int(mode_assignments[slot_idx])]
634
+ for slot_idx in range(self.config.num_candidates)
635
+ ]
636
+ for task_name in task_names
637
+ ]
638
  for slot_idx in range(self.config.num_candidates):
639
  mode_idx = int(mode_assignments[slot_idx])
640
  candidate = base_action + 0.35 * torch.tanh(mode_residuals[:, mode_idx]) + 0.05 * torch.tanh(slot_deltas[slot_idx]).unsqueeze(0)
 
653
  stacked_candidates = torch.stack(proposal_candidates, dim=1)
654
  stacked_logits = torch.stack(proposal_logits, dim=1)
655
  stacked_candidates[:, 0] = base_action
656
+ return stacked_candidates, stacked_logits, mode_logits, proposal_mode_names
657
 
658
  def forward(
659
  self,
 
663
  reveal_tokens: Tensor | None = None,
664
  memory_token: Tensor | None = None,
665
  compute_equivariance_probe: bool = False,
666
+ task_names: list[str] | None = None,
667
  ) -> dict[str, Tensor]:
668
  if memory_tokens is None:
669
  memory_tokens = memory_token
 
685
  if memory_tokens is not None:
686
  decoder_memory = torch.cat([decoder_memory, memory_tokens], dim=1)
687
 
688
+ canonical_task_names = [infer_task_name_from_text(name) for name in (task_names or ["generic"] * batch_size)]
689
+ task_ids = torch.as_tensor(
690
+ [TASK_INDEX[name] for name in canonical_task_names if name in TASK_INDEX],
691
+ device=scene_tokens.device,
692
+ dtype=torch.long,
693
+ )
694
+ if task_ids.numel() != batch_size:
695
+ task_ids = torch.as_tensor(
696
+ [TASK_INDEX.get(name, 0) for name in canonical_task_names],
697
+ device=scene_tokens.device,
698
+ dtype=torch.long,
699
+ )
700
+ interaction_context = interaction_context + self.task_embedding(task_ids)
701
+
702
  base_queries = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
703
  arm_mean, arm_log_std, coordination = self._decode_arm_tokens(
704
  queries=base_queries,
 
718
  ],
719
  dim=-1,
720
  )
721
+ proposal_candidates, proposal_logits, proposal_mode_logits, proposal_mode_names = self._proposal_outputs(
722
+ action_mean,
723
+ pooled_context,
724
+ canonical_task_names,
725
+ )
726
 
727
  outputs = {
728
  "decoded_tokens": torch.cat([arm_mean[:, 0], arm_mean[:, 1]], dim=-1),
 
738
  self.config.num_candidates,
739
  device=scene_tokens.device,
740
  ) % self.config.num_proposal_modes,
741
+ "proposal_mode_names": proposal_mode_names,
742
+ "proposal_task_names": canonical_task_names,
743
  }
744
  if compute_equivariance_probe:
745
  swapped_phase, swapped_roles, swapped_context = self._conditioning(
code/reveal_vla_bimanual/models/backbones.py CHANGED
@@ -10,6 +10,8 @@ import torch
10
  import torch.nn.functional as F
11
  from torch import Tensor, nn
12
 
 
 
13
 
14
  @dataclass
15
  class FrozenVLBackboneConfig:
@@ -22,6 +24,9 @@ class FrozenVLBackboneConfig:
22
  depth_patch_size: int = 16
23
  geometry_feature_dim: int = 8
24
  use_camera_geometry: bool = True
 
 
 
25
 
26
 
27
  class DepthPatchAdapter(nn.Module):
@@ -65,63 +70,64 @@ class DepthPatchAdapter(nn.Module):
65
  batch_views, _, height, width = depths.shape
66
  grid_h = max(1, height // self.patch_size)
67
  grid_w = max(1, width // self.patch_size)
68
- y_coords = torch.linspace(-1.0, 1.0, steps=grid_h, device=depths.device, dtype=depths.dtype)
69
- x_coords = torch.linspace(-1.0, 1.0, steps=grid_w, device=depths.device, dtype=depths.dtype)
70
- grid_y, grid_x = torch.meshgrid(y_coords, x_coords, indexing="ij")
71
- coords = torch.stack([grid_x, grid_y], dim=-1).reshape(1, grid_h * grid_w, 2).expand(batch_views, -1, -1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- geometry_terms: list[Tensor] = [coords]
74
  if camera_intrinsics is not None:
75
- fx = camera_intrinsics[:, 0, 0].unsqueeze(-1).unsqueeze(-1)
76
- fy = camera_intrinsics[:, 1, 1].unsqueeze(-1).unsqueeze(-1)
77
- cx = camera_intrinsics[:, 0, 2].unsqueeze(-1).unsqueeze(-1)
78
- cy = camera_intrinsics[:, 1, 2].unsqueeze(-1).unsqueeze(-1)
79
- intrinsic_features = torch.cat(
80
- [
81
- fx.expand(-1, grid_h * grid_w, -1),
82
- fy.expand(-1, grid_h * grid_w, -1),
83
- cx.expand(-1, grid_h * grid_w, -1),
84
- cy.expand(-1, grid_h * grid_w, -1),
85
- ],
86
- dim=-1,
87
- )
88
- geometry_terms.append(intrinsic_features)
89
  else:
90
- geometry_terms.append(torch.zeros(batch_views, grid_h * grid_w, 4, device=depths.device, dtype=depths.dtype))
 
 
 
91
 
92
  if camera_extrinsics is not None:
93
- translation = camera_extrinsics[:, :3, 3]
94
- translation = translation.unsqueeze(1).expand(-1, grid_h * grid_w, -1)
95
- geometry_terms.append(translation)
 
96
  else:
97
- geometry_terms.append(torch.zeros(batch_views, grid_h * grid_w, 3, device=depths.device, dtype=depths.dtype))
 
 
 
 
98
 
99
- geometry = torch.cat(geometry_terms, dim=-1)
100
  if geometry.shape[-1] < self.geometry_feature_dim:
101
  pad = self.geometry_feature_dim - geometry.shape[-1]
102
  geometry = F.pad(geometry, (0, pad))
103
  elif geometry.shape[-1] > self.geometry_feature_dim:
104
  geometry = geometry[..., : self.geometry_feature_dim]
105
 
106
- if camera_intrinsics is not None:
107
- camera_summary = torch.cat(
108
- [
109
- camera_intrinsics[:, 0, 0:1],
110
- camera_intrinsics[:, 1, 1:2],
111
- camera_intrinsics[:, 0, 2:3],
112
- camera_intrinsics[:, 1, 2:3],
113
- ],
114
- dim=-1,
115
- )
116
- else:
117
- camera_summary = torch.zeros(batch_views, 4, device=depths.device, dtype=depths.dtype)
118
  if camera_extrinsics is not None:
119
- camera_summary = torch.cat([camera_summary, camera_extrinsics[:, :3, 3]], dim=-1)
120
  else:
121
- camera_summary = torch.cat(
122
- [camera_summary, torch.zeros(batch_views, 3, device=depths.device, dtype=depths.dtype)],
123
- dim=-1,
124
- )
125
  return geometry, camera_summary
126
 
127
  def forward(
@@ -130,6 +136,8 @@ class DepthPatchAdapter(nn.Module):
130
  depth_valid: Tensor | None = None,
131
  camera_intrinsics: Tensor | None = None,
132
  camera_extrinsics: Tensor | None = None,
 
 
133
  ) -> dict[str, Tensor]:
134
  if depths.ndim == 4:
135
  depths = depths.unsqueeze(2)
@@ -161,7 +169,12 @@ class DepthPatchAdapter(nn.Module):
161
  camera_intrinsics=flat_intrinsics,
162
  camera_extrinsics=flat_extrinsics,
163
  )
164
- token_inputs = torch.cat([depth_patch, valid_patch, geometry_features], dim=-1)
 
 
 
 
 
165
  depth_tokens = self.depth_proj(token_inputs)
166
  geometry_tokens = self.geometry_proj(geometry_features)
167
  camera_tokens = self.camera_proj(camera_summary).unsqueeze(1)
@@ -324,12 +337,21 @@ class FrozenVLBackbone(nn.Module):
324
  camera_intrinsics: Tensor | None = None,
325
  camera_extrinsics: Tensor | None = None,
326
  return_aux: bool = False,
 
 
 
327
  ) -> Tensor | dict[str, Tensor | None]:
328
  rgb_tokens = self._encode_rgb_tokens(images)
329
  wants_aux = return_aux or depths is not None or depth_valid is not None or camera_intrinsics is not None or camera_extrinsics is not None
330
  if not wants_aux:
331
  return rgb_tokens
332
 
 
 
 
 
 
 
333
  depth_outputs: dict[str, Tensor | None] = {
334
  "depth_tokens": None,
335
  "geometry_tokens": None,
@@ -341,7 +363,15 @@ class FrozenVLBackbone(nn.Module):
341
  depth_valid=depth_valid,
342
  camera_intrinsics=camera_intrinsics,
343
  camera_extrinsics=camera_extrinsics,
 
 
344
  )
 
 
 
 
 
 
345
 
346
  return {
347
  "rgb_tokens": rgb_tokens,
 
10
  import torch.nn.functional as F
11
  from torch import Tensor, nn
12
 
13
+ from pytorch3d.transforms import matrix_to_quaternion
14
+
15
 
16
  @dataclass
17
  class FrozenVLBackboneConfig:
 
24
  depth_patch_size: int = 16
25
  geometry_feature_dim: int = 8
26
  use_camera_geometry: bool = True
27
+ use_depth_tokens: bool = True
28
+ use_geometry_tokens: bool = True
29
+ use_camera_pose_tokens: bool = True
30
 
31
 
32
  class DepthPatchAdapter(nn.Module):
 
70
  batch_views, _, height, width = depths.shape
71
  grid_h = max(1, height // self.patch_size)
72
  grid_w = max(1, width // self.patch_size)
73
+ patch_center_y = torch.linspace(
74
+ self.patch_size * 0.5,
75
+ max(self.patch_size * 0.5, height - (self.patch_size * 0.5)),
76
+ steps=grid_h,
77
+ device=depths.device,
78
+ dtype=depths.dtype,
79
+ )
80
+ patch_center_x = torch.linspace(
81
+ self.patch_size * 0.5,
82
+ max(self.patch_size * 0.5, width - (self.patch_size * 0.5)),
83
+ steps=grid_w,
84
+ device=depths.device,
85
+ dtype=depths.dtype,
86
+ )
87
+ pixel_y, pixel_x = torch.meshgrid(patch_center_y, patch_center_x, indexing="ij")
88
+ norm_x = ((pixel_x / max(width - 1, 1)) * 2.0 - 1.0).reshape(1, grid_h * grid_w, 1)
89
+ norm_y = ((pixel_y / max(height - 1, 1)) * 2.0 - 1.0).reshape(1, grid_h * grid_w, 1)
90
+ coords = torch.cat([norm_x, norm_y], dim=-1).expand(batch_views, -1, -1)
91
 
 
92
  if camera_intrinsics is not None:
93
+ fx = camera_intrinsics[:, 0, 0].unsqueeze(-1)
94
+ fy = camera_intrinsics[:, 1, 1].unsqueeze(-1)
95
+ cx = camera_intrinsics[:, 0, 2].unsqueeze(-1)
96
+ cy = camera_intrinsics[:, 1, 2].unsqueeze(-1)
97
+ patch_x = pixel_x.reshape(1, grid_h * grid_w).expand(batch_views, -1)
98
+ patch_y = pixel_y.reshape(1, grid_h * grid_w).expand(batch_views, -1)
99
+ ray_x = (patch_x - cx) / fx.clamp_min(1e-6)
100
+ ray_y = (patch_y - cy) / fy.clamp_min(1e-6)
 
 
 
 
 
 
101
  else:
102
+ ray_x = coords[..., 0]
103
+ ray_y = coords[..., 1]
104
+ ray_camera = torch.stack([ray_x, ray_y, torch.ones_like(ray_x)], dim=-1)
105
+ ray_camera = F.normalize(ray_camera, dim=-1)
106
 
107
  if camera_extrinsics is not None:
108
+ rotation = camera_extrinsics[:, :3, :3]
109
+ translation = camera_extrinsics[:, :3, 3].unsqueeze(1).expand(-1, grid_h * grid_w, -1)
110
+ ray_world = torch.matmul(rotation, ray_camera.transpose(1, 2)).transpose(1, 2)
111
+ quaternion = matrix_to_quaternion(rotation)
112
  else:
113
+ rotation = None
114
+ translation = torch.zeros(batch_views, grid_h * grid_w, 3, device=depths.device, dtype=depths.dtype)
115
+ ray_world = ray_camera
116
+ quaternion = torch.zeros(batch_views, 4, device=depths.device, dtype=depths.dtype)
117
+ quaternion[:, 0] = 1.0
118
 
119
+ geometry = torch.cat([coords, ray_world, translation], dim=-1)
120
  if geometry.shape[-1] < self.geometry_feature_dim:
121
  pad = self.geometry_feature_dim - geometry.shape[-1]
122
  geometry = F.pad(geometry, (0, pad))
123
  elif geometry.shape[-1] > self.geometry_feature_dim:
124
  geometry = geometry[..., : self.geometry_feature_dim]
125
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  if camera_extrinsics is not None:
127
+ translation_summary = camera_extrinsics[:, :3, 3]
128
  else:
129
+ translation_summary = torch.zeros(batch_views, 3, device=depths.device, dtype=depths.dtype)
130
+ camera_summary = torch.cat([quaternion, translation_summary], dim=-1)
 
 
131
  return geometry, camera_summary
132
 
133
  def forward(
 
136
  depth_valid: Tensor | None = None,
137
  camera_intrinsics: Tensor | None = None,
138
  camera_extrinsics: Tensor | None = None,
139
+ include_geometry_features: bool = True,
140
+ include_camera_pose: bool = True,
141
  ) -> dict[str, Tensor]:
142
  if depths.ndim == 4:
143
  depths = depths.unsqueeze(2)
 
169
  camera_intrinsics=flat_intrinsics,
170
  camera_extrinsics=flat_extrinsics,
171
  )
172
+ if not include_geometry_features:
173
+ geometry_features = torch.zeros_like(geometry_features)
174
+ if not include_camera_pose:
175
+ camera_summary = torch.zeros_like(camera_summary)
176
+ # Keep depth tokens depth-only so depth, geometry, and pose ablations are separable.
177
+ token_inputs = torch.cat([depth_patch, valid_patch, torch.zeros_like(geometry_features)], dim=-1)
178
  depth_tokens = self.depth_proj(token_inputs)
179
  geometry_tokens = self.geometry_proj(geometry_features)
180
  camera_tokens = self.camera_proj(camera_summary).unsqueeze(1)
 
337
  camera_intrinsics: Tensor | None = None,
338
  camera_extrinsics: Tensor | None = None,
339
  return_aux: bool = False,
340
+ use_depth_tokens: bool | None = None,
341
+ use_geometry_tokens: bool | None = None,
342
+ use_camera_pose_tokens: bool | None = None,
343
  ) -> Tensor | dict[str, Tensor | None]:
344
  rgb_tokens = self._encode_rgb_tokens(images)
345
  wants_aux = return_aux or depths is not None or depth_valid is not None or camera_intrinsics is not None or camera_extrinsics is not None
346
  if not wants_aux:
347
  return rgb_tokens
348
 
349
+ depth_enabled = self.config.use_depth_tokens if use_depth_tokens is None else use_depth_tokens
350
+ geometry_enabled = self.config.use_geometry_tokens if use_geometry_tokens is None else use_geometry_tokens
351
+ camera_pose_enabled = self.config.use_camera_pose_tokens if use_camera_pose_tokens is None else use_camera_pose_tokens
352
+ geometry_enabled = bool(self.config.use_camera_geometry and geometry_enabled)
353
+ camera_pose_enabled = bool(self.config.use_camera_geometry and camera_pose_enabled)
354
+
355
  depth_outputs: dict[str, Tensor | None] = {
356
  "depth_tokens": None,
357
  "geometry_tokens": None,
 
363
  depth_valid=depth_valid,
364
  camera_intrinsics=camera_intrinsics,
365
  camera_extrinsics=camera_extrinsics,
366
+ include_geometry_features=geometry_enabled,
367
+ include_camera_pose=camera_pose_enabled,
368
  )
369
+ if not depth_enabled:
370
+ depth_outputs["depth_tokens"] = None
371
+ if not geometry_enabled:
372
+ depth_outputs["geometry_tokens"] = None
373
+ if not camera_pose_enabled:
374
+ depth_outputs["camera_tokens"] = None
375
 
376
  return {
377
  "rgb_tokens": rgb_tokens,
code/reveal_vla_bimanual/models/multiview_fusion.py CHANGED
@@ -83,6 +83,7 @@ class MultiViewFusion(nn.Module):
83
  proprio: Tensor,
84
  language_tokens: Tensor,
85
  depth_tokens: Tensor | None = None,
 
86
  camera_tokens: Tensor | None = None,
87
  return_aux: bool = False,
88
  ) -> Tensor | dict[str, Tensor]:
@@ -102,6 +103,8 @@ class MultiViewFusion(nn.Module):
102
  geometry_sources = []
103
  if depth_tokens is not None:
104
  geometry_sources.append(depth_tokens[:, view_idx])
 
 
105
  if camera_tokens is not None:
106
  geometry_sources.append(camera_tokens[:, view_idx])
107
  if geometry_sources:
@@ -119,7 +122,12 @@ class MultiViewFusion(nn.Module):
119
  batch_size, self.config.proprio_tokens, hidden_dim
120
  )
121
  scene_tokens = torch.cat([fused, proprio_tokens, language_tokens], dim=1)
122
- if not (return_aux or depth_tokens is not None or camera_tokens is not None):
 
 
 
 
 
123
  return scene_tokens
124
  return {
125
  "scene_tokens": scene_tokens,
 
83
  proprio: Tensor,
84
  language_tokens: Tensor,
85
  depth_tokens: Tensor | None = None,
86
+ geometry_tokens: Tensor | None = None,
87
  camera_tokens: Tensor | None = None,
88
  return_aux: bool = False,
89
  ) -> Tensor | dict[str, Tensor]:
 
103
  geometry_sources = []
104
  if depth_tokens is not None:
105
  geometry_sources.append(depth_tokens[:, view_idx])
106
+ if geometry_tokens is not None:
107
+ geometry_sources.append(geometry_tokens[:, view_idx])
108
  if camera_tokens is not None:
109
  geometry_sources.append(camera_tokens[:, view_idx])
110
  if geometry_sources:
 
122
  batch_size, self.config.proprio_tokens, hidden_dim
123
  )
124
  scene_tokens = torch.cat([fused, proprio_tokens, language_tokens], dim=1)
125
+ if not (
126
+ return_aux
127
+ or depth_tokens is not None
128
+ or geometry_tokens is not None
129
+ or camera_tokens is not None
130
+ ):
131
  return scene_tokens
132
  return {
133
  "scene_tokens": scene_tokens,
code/reveal_vla_bimanual/models/observation_memory.py CHANGED
@@ -234,6 +234,14 @@ class _SelectiveMemoryBank(nn.Module):
234
  nn.GELU(),
235
  )
236
 
 
 
 
 
 
 
 
 
237
  def _truncate(self, history: Tensor | None) -> Tensor | None:
238
  if history is None or history.numel() == 0:
239
  return history
@@ -241,21 +249,48 @@ class _SelectiveMemoryBank(nn.Module):
241
  return history
242
  return history[:, -self.history_steps :]
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  def forward(
245
  self,
246
- pooled_current: Tensor,
247
  history_scene_tokens: Tensor | None = None,
248
  history_actions: Tensor | None = None,
249
  ) -> dict[str, Tensor]:
250
  history_scene_tokens = self._truncate(history_scene_tokens)
251
- pooled_current = pooled_current.unsqueeze(1)
 
252
  if history_scene_tokens is not None and history_scene_tokens.numel() > 0:
253
- history_pooled = history_scene_tokens.mean(dim=2)
 
 
 
254
  if history_actions is not None and history_actions.numel() > 0:
255
  history_actions = history_actions[:, -history_pooled.shape[1] :]
256
- history_pooled = history_pooled + self.action_proj(history_actions)
 
 
257
  sequence = torch.cat([history_pooled, pooled_current], dim=1)
258
  else:
 
259
  history_pooled = pooled_current[:, :0]
260
  sequence = pooled_current
261
  if sequence.shape[1] > self.position_embedding.shape[1]:
@@ -264,17 +299,21 @@ class _SelectiveMemoryBank(nn.Module):
264
  )
265
  encoded = self.sequence_encoder(sequence + self.position_embedding[:, : sequence.shape[1]])
266
  current_token = encoded[:, -1]
267
- prior_token = encoded[:, :-1].mean(dim=1) if encoded.shape[1] > 1 else torch.zeros_like(current_token)
268
- novelty = torch.abs(current_token - prior_token)
269
- informative = novelty.mean(dim=-1, keepdim=True)
270
- gate_logit = self.write_gate(torch.cat([current_token, prior_token, novelty], dim=-1))
271
- gate = torch.sigmoid(gate_logit)
272
- gate = gate * (informative > (self.write_threshold - self.suppression_margin)).to(gate.dtype)
273
- recent_summary = encoded[:, -min(max(1, self.bank_queries.shape[0]), encoded.shape[1]) :].mean(dim=1, keepdim=True)
274
- queries = self.bank_queries.unsqueeze(0).expand(encoded.shape[0], -1, -1) + recent_summary
275
- bank_tokens, _ = self.bank_attention(queries, encoded, encoded)
276
- bank_tokens = bank_tokens + recent_summary
277
- bank_tokens = prior_token.unsqueeze(1) * (1.0 - gate.unsqueeze(1)) + bank_tokens * gate.unsqueeze(1)
 
 
 
 
278
  bank_tokens = self.token_proj(bank_tokens)
279
  return {
280
  "memory_tokens": bank_tokens,
@@ -332,14 +371,13 @@ class DualObservationMemory(nn.Module):
332
  history_scene_tokens: Tensor | None = None,
333
  history_actions: Tensor | None = None,
334
  ) -> dict[str, Tensor]:
335
- pooled_current = scene_tokens.mean(dim=1)
336
  scene_output = self.scene_memory(
337
- pooled_current=pooled_current,
338
  history_scene_tokens=history_scene_tokens,
339
  history_actions=history_actions,
340
  )
341
  belief_output = self.belief_memory(
342
- pooled_current=pooled_current,
343
  history_scene_tokens=history_scene_tokens,
344
  history_actions=history_actions,
345
  )
 
234
  nn.GELU(),
235
  )
236
 
237
+ def _recency_weights(self, length: int, device: torch.device, dtype: torch.dtype) -> Tensor:
238
+ if length <= 0:
239
+ return torch.zeros((0,), device=device, dtype=dtype)
240
+ positions = torch.arange(length, device=device, dtype=dtype)
241
+ distances = (length - 1) - positions
242
+ weights = torch.exp(-0.5 * distances)
243
+ return weights / weights.sum().clamp_min(1e-6)
244
+
245
  def _truncate(self, history: Tensor | None) -> Tensor | None:
246
  if history is None or history.numel() == 0:
247
  return history
 
249
  return history
250
  return history[:, -self.history_steps :]
251
 
252
+ def _chunk_pool(self, tokens: Tensor) -> Tensor:
253
+ batch_size, seq_len, hidden_dim = tokens.shape
254
+ chunk_size = max(1, (seq_len + self.bank_queries.shape[0] - 1) // self.bank_queries.shape[0])
255
+ slots = []
256
+ for slot_idx in range(self.bank_queries.shape[0]):
257
+ start = slot_idx * chunk_size
258
+ end = min(seq_len, start + chunk_size)
259
+ if start >= seq_len:
260
+ pooled = tokens[:, -1]
261
+ else:
262
+ pooled = tokens[:, start:end].mean(dim=1)
263
+ slots.append(pooled)
264
+ return torch.stack(slots, dim=1)
265
+
266
+ def _compress_tokens(self, tokens: Tensor) -> Tensor:
267
+ base_slots = self._chunk_pool(tokens)
268
+ queries = self.bank_queries.unsqueeze(0).expand(tokens.shape[0], -1, -1) + base_slots
269
+ attended, _ = self.bank_attention(queries, tokens, tokens)
270
+ return base_slots + 0.1 * attended
271
+
272
  def forward(
273
  self,
274
+ current_tokens: Tensor,
275
  history_scene_tokens: Tensor | None = None,
276
  history_actions: Tensor | None = None,
277
  ) -> dict[str, Tensor]:
278
  history_scene_tokens = self._truncate(history_scene_tokens)
279
+ current_bank = self._compress_tokens(current_tokens)
280
+ pooled_current = current_bank.mean(dim=1, keepdim=True)
281
  if history_scene_tokens is not None and history_scene_tokens.numel() > 0:
282
+ batch_size, history_steps = history_scene_tokens.shape[:2]
283
+ flat_history = history_scene_tokens.reshape(batch_size * history_steps, history_scene_tokens.shape[2], history_scene_tokens.shape[3])
284
+ history_bank = self._compress_tokens(flat_history).view(batch_size, history_steps, self.bank_queries.shape[0], self.hidden_dim)
285
+ history_pooled = history_bank.mean(dim=2)
286
  if history_actions is not None and history_actions.numel() > 0:
287
  history_actions = history_actions[:, -history_pooled.shape[1] :]
288
+ history_action_tokens = self.action_proj(history_actions).unsqueeze(2)
289
+ history_bank = history_bank + history_action_tokens
290
+ history_pooled = history_bank.mean(dim=2)
291
  sequence = torch.cat([history_pooled, pooled_current], dim=1)
292
  else:
293
+ history_bank = current_bank.unsqueeze(1)[:, :0]
294
  history_pooled = pooled_current[:, :0]
295
  sequence = pooled_current
296
  if sequence.shape[1] > self.position_embedding.shape[1]:
 
299
  )
300
  encoded = self.sequence_encoder(sequence + self.position_embedding[:, : sequence.shape[1]])
301
  current_token = encoded[:, -1]
302
+ if history_bank.shape[1] > 0:
303
+ recency = self._recency_weights(
304
+ history_bank.shape[1],
305
+ device=history_bank.device,
306
+ dtype=history_bank.dtype,
307
+ ).view(1, -1, 1, 1)
308
+ prior_bank = (history_bank * recency).sum(dim=1)
309
+ else:
310
+ prior_bank = torch.zeros_like(current_bank)
311
+ novelty = torch.abs(current_bank - prior_bank)
312
+ gate_logit = self.write_gate(torch.cat([current_bank, prior_bank, novelty], dim=-1))
313
+ novelty_score = novelty.mean(dim=-1, keepdim=True)
314
+ novelty_gate = torch.sigmoid(12.0 * (novelty_score - self.write_threshold))
315
+ gate = (0.25 + 0.75 * torch.sigmoid(gate_logit)) * novelty_gate
316
+ bank_tokens = prior_bank * (1.0 - gate) + current_bank * gate
317
  bank_tokens = self.token_proj(bank_tokens)
318
  return {
319
  "memory_tokens": bank_tokens,
 
371
  history_scene_tokens: Tensor | None = None,
372
  history_actions: Tensor | None = None,
373
  ) -> dict[str, Tensor]:
 
374
  scene_output = self.scene_memory(
375
+ current_tokens=scene_tokens,
376
  history_scene_tokens=history_scene_tokens,
377
  history_actions=history_actions,
378
  )
379
  belief_output = self.belief_memory(
380
+ current_tokens=scene_tokens,
381
  history_scene_tokens=history_scene_tokens,
382
  history_actions=history_actions,
383
  )
code/reveal_vla_bimanual/models/planner.py CHANGED
@@ -250,10 +250,18 @@ class StructuredElasticUtility(nn.Module):
250
  occluder_contact = self._field_mean(rollout_state["occluder_contact_field"]).mean(dim=-1)
251
  grasp_affordance = self._field_mean(rollout_state["grasp_affordance_field"]).mean(dim=-1)
252
  support_stability = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).mean(dim=-1)
253
- persistence = self._field_mean(rollout_state["persistence_field"]).mean(dim=-1)
254
- reocclusion = self._field_mean(rollout_state["reocclusion_field"]).mean(dim=-1)
255
- disturbance = self._field_mean(rollout_state["disturbance_field"]).mean(dim=-1)
256
- access_quality = torch.sigmoid(self._field_mean(rollout_state["access_field"])).mean(dim=-1)
 
 
 
 
 
 
 
 
257
  retrieve_progress = torch.sigmoid(candidate_chunks[:, :, :, -1]).mean(dim=-1)
258
  utility = (
259
  self.config.belief_gain_weight * (belief_future - initial_belief)
@@ -278,8 +286,12 @@ class StructuredElasticUtility(nn.Module):
278
  "persistence": persistence,
279
  "support_stability": support_stability,
280
  "reocclusion_penalty": reocclusion,
 
281
  "disturbance_penalty": disturbance,
282
  "access_quality": access_quality,
 
 
 
283
  "task_progress": retrieve_progress,
284
  "utility_structured": utility,
285
  }
@@ -346,14 +358,42 @@ class CascadePlanner(nn.Module):
346
  self.structured = StructuredElasticUtility(config)
347
  self.residual = ResidualPlannerScorer(config)
348
 
349
- def shortlist(self, proposal_logits: Tensor | None, candidate_chunks: Tensor) -> Tensor:
 
 
 
 
 
350
  batch_size, num_candidates = candidate_chunks.shape[:2]
351
  top_k = min(max(1, self.config.top_k), num_candidates)
352
  if proposal_logits is None:
353
  cheap_scores = -candidate_chunks.square().mean(dim=(-1, -2))
354
  else:
355
  cheap_scores = proposal_logits
356
- return cheap_scores.topk(top_k, dim=-1).indices
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
  def select_best(
359
  self,
@@ -362,6 +402,7 @@ class CascadePlanner(nn.Module):
362
  rollout_state: dict[str, Tensor],
363
  proposal_logits: Tensor | None = None,
364
  candidate_indices: Tensor | None = None,
 
365
  ) -> dict[str, Tensor]:
366
  structured = self.structured(
367
  initial_state=initial_state,
@@ -375,6 +416,22 @@ class CascadePlanner(nn.Module):
375
  )
376
  utility_total = structured["utility_structured"] + self.config.residual_weight * residual["utility_residual"]
377
  utility_total = utility_total + residual["success_logits"].sigmoid() - residual["risk_values"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  best_local = utility_total.argmax(dim=-1)
379
  batch_indices = torch.arange(candidate_chunks.shape[0], device=candidate_chunks.device)
380
  if candidate_indices is None:
@@ -386,6 +443,7 @@ class CascadePlanner(nn.Module):
386
  **residual,
387
  "utility_total": utility_total,
388
  "utility_scores": utility_total,
 
389
  "best_indices": best_indices,
390
  "best_chunk": candidate_chunks[batch_indices, best_local],
391
  "ranking_diagnostics": {
 
250
  occluder_contact = self._field_mean(rollout_state["occluder_contact_field"]).mean(dim=-1)
251
  grasp_affordance = self._field_mean(rollout_state["grasp_affordance_field"]).mean(dim=-1)
252
  support_stability = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).mean(dim=-1)
253
+ persistence_traj = self._field_mean(rollout_state["persistence_field"])
254
+ reocclusion_traj = self._field_mean(rollout_state["reocclusion_field"])
255
+ disturbance_traj = self._field_mean(rollout_state["disturbance_field"])
256
+ access_traj = torch.sigmoid(self._field_mean(rollout_state["access_field"]))
257
+ persistence = persistence_traj.mean(dim=-1)
258
+ reocclusion = reocclusion_traj.mean(dim=-1)
259
+ disturbance = disturbance_traj.mean(dim=-1)
260
+ access_quality = access_traj.mean(dim=-1)
261
+ access_floor = access_traj.amin(dim=-1)
262
+ persistence_floor = persistence_traj.amin(dim=-1)
263
+ support_floor = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).amin(dim=-1)
264
+ reocclusion_worst = reocclusion_traj.amax(dim=-1)
265
  retrieve_progress = torch.sigmoid(candidate_chunks[:, :, :, -1]).mean(dim=-1)
266
  utility = (
267
  self.config.belief_gain_weight * (belief_future - initial_belief)
 
286
  "persistence": persistence,
287
  "support_stability": support_stability,
288
  "reocclusion_penalty": reocclusion,
289
+ "reocclusion_worst": reocclusion_worst,
290
  "disturbance_penalty": disturbance,
291
  "access_quality": access_quality,
292
+ "access_floor": access_floor,
293
+ "persistence_floor": persistence_floor,
294
+ "support_floor": support_floor,
295
  "task_progress": retrieve_progress,
296
  "utility_structured": utility,
297
  }
 
358
  self.structured = StructuredElasticUtility(config)
359
  self.residual = ResidualPlannerScorer(config)
360
 
361
+ def shortlist(
362
+ self,
363
+ proposal_logits: Tensor | None,
364
+ candidate_chunks: Tensor,
365
+ proposal_mode_assignments: Tensor | None = None,
366
+ ) -> Tensor:
367
  batch_size, num_candidates = candidate_chunks.shape[:2]
368
  top_k = min(max(1, self.config.top_k), num_candidates)
369
  if proposal_logits is None:
370
  cheap_scores = -candidate_chunks.square().mean(dim=(-1, -2))
371
  else:
372
  cheap_scores = proposal_logits
373
+ if proposal_mode_assignments is None:
374
+ return cheap_scores.topk(top_k, dim=-1).indices
375
+ if proposal_mode_assignments.ndim == 1:
376
+ proposal_mode_assignments = proposal_mode_assignments.unsqueeze(0).expand(batch_size, -1)
377
+
378
+ shortlisted = []
379
+ for batch_idx in range(batch_size):
380
+ scores = cheap_scores[batch_idx]
381
+ mode_ids = proposal_mode_assignments[batch_idx]
382
+ mode_best: list[tuple[float, int]] = []
383
+ for mode_id in torch.unique(mode_ids):
384
+ mode_indices = torch.nonzero(mode_ids == mode_id, as_tuple=False).squeeze(-1)
385
+ best_local = mode_indices[scores[mode_indices].argmax()]
386
+ mode_best.append((float(scores[best_local]), int(best_local)))
387
+ mode_best.sort(key=lambda item: item[0], reverse=True)
388
+ chosen = [index for _, index in mode_best[:top_k]]
389
+ if len(chosen) < top_k:
390
+ for candidate_idx in scores.argsort(descending=True).tolist():
391
+ if candidate_idx not in chosen:
392
+ chosen.append(candidate_idx)
393
+ if len(chosen) >= top_k:
394
+ break
395
+ shortlisted.append(torch.as_tensor(chosen[:top_k], device=candidate_chunks.device, dtype=torch.long))
396
+ return torch.stack(shortlisted, dim=0)
397
 
398
  def select_best(
399
  self,
 
402
  rollout_state: dict[str, Tensor],
403
  proposal_logits: Tensor | None = None,
404
  candidate_indices: Tensor | None = None,
405
+ proposal_mode_names: list[list[str]] | None = None,
406
  ) -> dict[str, Tensor]:
407
  structured = self.structured(
408
  initial_state=initial_state,
 
416
  )
417
  utility_total = structured["utility_structured"] + self.config.residual_weight * residual["utility_residual"]
418
  utility_total = utility_total + residual["success_logits"].sigmoid() - residual["risk_values"]
419
+ feasibility_penalty = torch.zeros_like(utility_total)
420
+ if proposal_mode_names is not None:
421
+ retrieve_like = torch.zeros_like(utility_total, dtype=torch.bool)
422
+ for batch_idx, names in enumerate(proposal_mode_names):
423
+ for candidate_idx, name in enumerate(names[: utility_total.shape[1]]):
424
+ retrieve_like[batch_idx, candidate_idx] = any(
425
+ token in name for token in ("retrieve", "insert_actor", "probe_inside")
426
+ )
427
+ blocked = (
428
+ (structured["access_floor"] < 0.15)
429
+ | (structured["persistence_floor"] < 0.15)
430
+ | (structured["support_floor"] < 0.25)
431
+ | (structured["reocclusion_worst"] > 0.6)
432
+ )
433
+ feasibility_penalty = retrieve_like.to(dtype=utility_total.dtype) * blocked.to(dtype=utility_total.dtype) * 2.0
434
+ utility_total = utility_total - feasibility_penalty
435
  best_local = utility_total.argmax(dim=-1)
436
  batch_indices = torch.arange(candidate_chunks.shape[0], device=candidate_chunks.device)
437
  if candidate_indices is None:
 
443
  **residual,
444
  "utility_total": utility_total,
445
  "utility_scores": utility_total,
446
+ "feasibility_penalty": feasibility_penalty,
447
  "best_indices": best_indices,
448
  "best_chunk": candidate_chunks[batch_indices, best_local],
449
  "ranking_diagnostics": {
code/reveal_vla_bimanual/models/policy.py CHANGED
@@ -11,6 +11,7 @@ from models.action_decoder import (
11
  ChunkDecoderConfig,
12
  InteractionChunkDecoder,
13
  SymmetricCoordinatedChunkDecoder,
 
14
  )
15
  from models.backbones import FrozenVLBackbone, FrozenVLBackboneConfig
16
  from models.multiview_fusion import MultiViewFusion, MultiViewFusionConfig
@@ -65,6 +66,11 @@ class BackboneOnlyPolicy(nn.Module):
65
  attention_mask=language_tokens["attention_mask"],
66
  )
67
 
 
 
 
 
 
68
  def encode_scene(
69
  self,
70
  images: Tensor,
@@ -388,6 +394,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
388
  camera_intrinsics: Tensor | None = None,
389
  camera_extrinsics: Tensor | None = None,
390
  use_depth: bool = True,
 
 
391
  ) -> dict[str, Tensor]:
392
  encoded = self.backbone.encode_images(
393
  images,
@@ -396,6 +404,9 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
396
  camera_intrinsics=camera_intrinsics if use_depth else None,
397
  camera_extrinsics=camera_extrinsics if use_depth else None,
398
  return_aux=True,
 
 
 
399
  )
400
  assert isinstance(encoded, dict)
401
  text_tokens = self._encode_language(images, texts=texts, language_tokens=language_tokens)
@@ -404,6 +415,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
404
  proprio=proprio,
405
  language_tokens=text_tokens,
406
  depth_tokens=encoded.get("depth_tokens"),
 
407
  camera_tokens=encoded.get("camera_tokens"),
408
  return_aux=True,
409
  )
@@ -413,6 +425,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
413
  "view_summaries": fused["view_summaries"],
414
  "geometry_summaries": fused["geometry_summaries"],
415
  "depth_tokens": encoded.get("depth_tokens"),
 
416
  "camera_tokens": encoded.get("camera_tokens"),
417
  }
418
 
@@ -441,6 +454,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
441
  camera_intrinsics: Tensor | None = None,
442
  camera_extrinsics: Tensor | None = None,
443
  use_depth: bool = True,
 
 
444
  ) -> Tensor | None:
445
  if history_images is None or history_proprio is None or history_images.numel() == 0:
446
  return None
@@ -469,6 +484,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
469
  camera_intrinsics=None,
470
  camera_extrinsics=None,
471
  use_depth=use_depth,
 
 
472
  )["scene_tokens"]
473
  return history_scene.view(batch_size, history_steps, history_scene.shape[1], history_scene.shape[2])
474
 
@@ -495,6 +512,27 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
495
  value = value.detach()
496
  return value.unsqueeze(1).unsqueeze(2).expand(-1, num_candidates, horizon, *value.shape[1:])
497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  def _identity_rollout(
499
  self,
500
  interaction_state: dict[str, Tensor],
@@ -531,7 +569,14 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
531
  history_depths: Tensor | None = None,
532
  history_depth_valid: Tensor | None = None,
533
  compute_equivariance_probe: bool = False,
 
 
 
 
 
 
534
  ) -> dict[str, Tensor]:
 
535
  scene_output = self._encode_scene_with_optional_depth(
536
  images=images,
537
  proprio=proprio,
@@ -542,6 +587,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
542
  camera_intrinsics=camera_intrinsics,
543
  camera_extrinsics=camera_extrinsics,
544
  use_depth=use_depth,
 
 
545
  )
546
  scene_tokens = scene_output["scene_tokens"]
547
  history_scene_tokens = self.encode_history_with_optional_depth(
@@ -554,19 +601,26 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
554
  camera_intrinsics=camera_intrinsics,
555
  camera_extrinsics=camera_extrinsics,
556
  use_depth=use_depth,
 
 
557
  )
558
  if history_steps_override is not None and history_scene_tokens is not None and history_scene_tokens.numel() > 0:
559
  history_scene_tokens = history_scene_tokens[:, -history_steps_override:]
560
  if history_actions is not None and history_actions.numel() > 0:
561
  history_actions = history_actions[:, -history_steps_override:]
562
- memory_output = self.memory(
563
- scene_tokens,
564
- history_scene_tokens=history_scene_tokens,
565
- history_actions=history_actions,
566
- )
 
 
 
567
  elastic_state = self.elastic_state_head(
568
  scene_tokens,
569
  memory_tokens=memory_output["memory_tokens"],
 
 
570
  )
571
  elastic_state["memory_tokens"] = memory_output["memory_tokens"]
572
  elastic_state["memory_token"] = memory_output["memory_token"]
@@ -581,6 +635,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
581
  interaction_state=elastic_state,
582
  memory_tokens=memory_output["memory_tokens"],
583
  compute_equivariance_probe=compute_equivariance_probe,
 
584
  )
585
  outputs = {
586
  **decoded,
@@ -592,7 +647,11 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
592
  "reveal_state": elastic_state,
593
  "view_summaries": scene_output["view_summaries"],
594
  "geometry_summaries": scene_output["geometry_summaries"],
 
 
 
595
  "rollout_source": "none",
 
596
  }
597
 
598
  candidate_chunks = candidate_chunks_override
@@ -602,8 +661,10 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
602
  outputs["action_mean"],
603
  outputs["action_log_std"],
604
  num_candidates=self.config.decoder.num_candidates,
605
- proposal_candidates=outputs.get("proposal_candidates"),
606
  )
 
 
607
  else:
608
  proposal_logits = None
609
  outputs["candidate_chunks"] = candidate_chunks
@@ -625,13 +686,25 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
625
  )
626
  return outputs
627
 
628
- shortlist_indices = self.planner.shortlist(proposal_logits=proposal_logits, candidate_chunks=candidate_chunks)
 
 
 
 
629
  outputs["planner_topk_indices"] = shortlist_indices
630
  batch_size = candidate_chunks.shape[0]
631
  batch_indices = torch.arange(batch_size, device=candidate_chunks.device).unsqueeze(-1)
632
  topk_candidates = candidate_chunks[batch_indices, shortlist_indices]
633
  num_topk = topk_candidates.shape[1]
634
  outputs["planner_topk_candidates"] = topk_candidates
 
 
 
 
 
 
 
 
635
  if proposal_logits is not None:
636
  topk_proposal_logits = proposal_logits.gather(1, shortlist_indices)
637
  else:
@@ -653,6 +726,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
653
  rollout_state=identity_rollout,
654
  proposal_logits=topk_proposal_logits,
655
  candidate_indices=shortlist_indices,
 
656
  )
657
  outputs["planned_rollout"] = identity_rollout
658
  outputs["planned_chunk"] = selected["best_chunk"]
@@ -677,6 +751,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
677
  memory_tokens=self._tile_tensor(memory_output["memory_tokens"], num_topk),
678
  scene_memory_tokens=self._tile_tensor(memory_output["scene_memory_tokens"], num_topk),
679
  belief_memory_tokens=self._tile_tensor(memory_output["belief_memory_tokens"], num_topk),
 
 
680
  )
681
  reshaped_rollout = {
682
  key: value.view(batch_size, num_topk, *value.shape[1:]) for key, value in rollout.items()
@@ -687,6 +763,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
687
  rollout_state=reshaped_rollout,
688
  proposal_logits=topk_proposal_logits,
689
  candidate_indices=shortlist_indices,
 
690
  )
691
  outputs["planned_rollout"] = reshaped_rollout
692
  outputs["planned_chunk"] = selected["best_chunk"]
 
11
  ChunkDecoderConfig,
12
  InteractionChunkDecoder,
13
  SymmetricCoordinatedChunkDecoder,
14
+ infer_task_name_from_text,
15
  )
16
  from models.backbones import FrozenVLBackbone, FrozenVLBackboneConfig
17
  from models.multiview_fusion import MultiViewFusion, MultiViewFusionConfig
 
66
  attention_mask=language_tokens["attention_mask"],
67
  )
68
 
69
+ def _task_names(self, batch_size: int, texts: Sequence[str] | None = None) -> list[str]:
70
+ if texts is None:
71
+ return ["generic"] * batch_size
72
+ return [infer_task_name_from_text(text) for text in texts]
73
+
74
  def encode_scene(
75
  self,
76
  images: Tensor,
 
394
  camera_intrinsics: Tensor | None = None,
395
  camera_extrinsics: Tensor | None = None,
396
  use_depth: bool = True,
397
+ use_geometry_tokens: bool | None = None,
398
+ use_camera_pose_tokens: bool | None = None,
399
  ) -> dict[str, Tensor]:
400
  encoded = self.backbone.encode_images(
401
  images,
 
404
  camera_intrinsics=camera_intrinsics if use_depth else None,
405
  camera_extrinsics=camera_extrinsics if use_depth else None,
406
  return_aux=True,
407
+ use_depth_tokens=use_depth,
408
+ use_geometry_tokens=use_geometry_tokens,
409
+ use_camera_pose_tokens=use_camera_pose_tokens,
410
  )
411
  assert isinstance(encoded, dict)
412
  text_tokens = self._encode_language(images, texts=texts, language_tokens=language_tokens)
 
415
  proprio=proprio,
416
  language_tokens=text_tokens,
417
  depth_tokens=encoded.get("depth_tokens"),
418
+ geometry_tokens=encoded.get("geometry_tokens"),
419
  camera_tokens=encoded.get("camera_tokens"),
420
  return_aux=True,
421
  )
 
425
  "view_summaries": fused["view_summaries"],
426
  "geometry_summaries": fused["geometry_summaries"],
427
  "depth_tokens": encoded.get("depth_tokens"),
428
+ "geometry_tokens": encoded.get("geometry_tokens"),
429
  "camera_tokens": encoded.get("camera_tokens"),
430
  }
431
 
 
454
  camera_intrinsics: Tensor | None = None,
455
  camera_extrinsics: Tensor | None = None,
456
  use_depth: bool = True,
457
+ use_geometry_tokens: bool | None = None,
458
+ use_camera_pose_tokens: bool | None = None,
459
  ) -> Tensor | None:
460
  if history_images is None or history_proprio is None or history_images.numel() == 0:
461
  return None
 
484
  camera_intrinsics=None,
485
  camera_extrinsics=None,
486
  use_depth=use_depth,
487
+ use_geometry_tokens=use_geometry_tokens,
488
+ use_camera_pose_tokens=use_camera_pose_tokens,
489
  )["scene_tokens"]
490
  return history_scene.view(batch_size, history_steps, history_scene.shape[1], history_scene.shape[2])
491
 
 
512
  value = value.detach()
513
  return value.unsqueeze(1).unsqueeze(2).expand(-1, num_candidates, horizon, *value.shape[1:])
514
 
515
+ def _zero_memory_output(self, scene_tokens: Tensor) -> dict[str, Tensor]:
516
+ batch_size, _, hidden_dim = scene_tokens.shape
517
+ scene_memory_tokens = scene_tokens.new_zeros((batch_size, self.config.memory.scene_bank_size, hidden_dim))
518
+ belief_memory_tokens = scene_tokens.new_zeros((batch_size, self.config.memory.belief_bank_size, hidden_dim))
519
+ memory_tokens = torch.cat([scene_memory_tokens, belief_memory_tokens], dim=1)
520
+ return {
521
+ "scene_memory_tokens": scene_memory_tokens,
522
+ "belief_memory_tokens": belief_memory_tokens,
523
+ "memory_tokens": memory_tokens,
524
+ "memory_token": memory_tokens.mean(dim=1, keepdim=True),
525
+ "memory_sequence": scene_tokens.new_zeros((batch_size, 0, hidden_dim)),
526
+ "memory_state": scene_tokens.new_zeros((batch_size, hidden_dim * 2)),
527
+ "memory_uncertainty": scene_tokens.new_zeros((batch_size,)),
528
+ "memory_write_rate": scene_tokens.new_zeros((batch_size,)),
529
+ "memory_saturation": scene_tokens.new_zeros((batch_size,)),
530
+ "scene_write_gate": scene_tokens.new_zeros((batch_size, self.config.memory.scene_bank_size)),
531
+ "belief_write_gate": scene_tokens.new_zeros((batch_size, self.config.memory.belief_bank_size)),
532
+ "memory_scene_state": scene_tokens.new_zeros((batch_size, hidden_dim)),
533
+ "memory_belief_state": scene_tokens.new_zeros((batch_size, hidden_dim)),
534
+ }
535
+
536
  def _identity_rollout(
537
  self,
538
  interaction_state: dict[str, Tensor],
 
569
  history_depths: Tensor | None = None,
570
  history_depth_valid: Tensor | None = None,
571
  compute_equivariance_probe: bool = False,
572
+ use_geometry_tokens: bool | None = None,
573
+ use_camera_pose_tokens: bool | None = None,
574
+ use_memory: bool = True,
575
+ use_task_conditioning: bool = True,
576
+ rollout_mode_override: str | None = None,
577
+ use_proposal_candidates: bool = True,
578
  ) -> dict[str, Tensor]:
579
+ task_names = self._task_names(images.shape[0], texts=texts)
580
  scene_output = self._encode_scene_with_optional_depth(
581
  images=images,
582
  proprio=proprio,
 
587
  camera_intrinsics=camera_intrinsics,
588
  camera_extrinsics=camera_extrinsics,
589
  use_depth=use_depth,
590
+ use_geometry_tokens=use_geometry_tokens,
591
+ use_camera_pose_tokens=use_camera_pose_tokens,
592
  )
593
  scene_tokens = scene_output["scene_tokens"]
594
  history_scene_tokens = self.encode_history_with_optional_depth(
 
601
  camera_intrinsics=camera_intrinsics,
602
  camera_extrinsics=camera_extrinsics,
603
  use_depth=use_depth,
604
+ use_geometry_tokens=use_geometry_tokens,
605
+ use_camera_pose_tokens=use_camera_pose_tokens,
606
  )
607
  if history_steps_override is not None and history_scene_tokens is not None and history_scene_tokens.numel() > 0:
608
  history_scene_tokens = history_scene_tokens[:, -history_steps_override:]
609
  if history_actions is not None and history_actions.numel() > 0:
610
  history_actions = history_actions[:, -history_steps_override:]
611
+ if use_memory:
612
+ memory_output = self.memory(
613
+ scene_tokens,
614
+ history_scene_tokens=history_scene_tokens,
615
+ history_actions=history_actions,
616
+ )
617
+ else:
618
+ memory_output = self._zero_memory_output(scene_tokens)
619
  elastic_state = self.elastic_state_head(
620
  scene_tokens,
621
  memory_tokens=memory_output["memory_tokens"],
622
+ task_names=task_names,
623
+ use_task_conditioning=use_task_conditioning,
624
  )
625
  elastic_state["memory_tokens"] = memory_output["memory_tokens"]
626
  elastic_state["memory_token"] = memory_output["memory_token"]
 
635
  interaction_state=elastic_state,
636
  memory_tokens=memory_output["memory_tokens"],
637
  compute_equivariance_probe=compute_equivariance_probe,
638
+ task_names=task_names,
639
  )
640
  outputs = {
641
  **decoded,
 
647
  "reveal_state": elastic_state,
648
  "view_summaries": scene_output["view_summaries"],
649
  "geometry_summaries": scene_output["geometry_summaries"],
650
+ "depth_tokens": scene_output["depth_tokens"],
651
+ "geometry_tokens": scene_output["geometry_tokens"],
652
+ "camera_tokens": scene_output["camera_tokens"],
653
  "rollout_source": "none",
654
+ "task_names": task_names,
655
  }
656
 
657
  candidate_chunks = candidate_chunks_override
 
661
  outputs["action_mean"],
662
  outputs["action_log_std"],
663
  num_candidates=self.config.decoder.num_candidates,
664
+ proposal_candidates=outputs.get("proposal_candidates") if use_proposal_candidates else None,
665
  )
666
+ if not use_proposal_candidates:
667
+ proposal_logits = None
668
  else:
669
  proposal_logits = None
670
  outputs["candidate_chunks"] = candidate_chunks
 
686
  )
687
  return outputs
688
 
689
+ shortlist_indices = self.planner.shortlist(
690
+ proposal_logits=proposal_logits,
691
+ candidate_chunks=candidate_chunks,
692
+ proposal_mode_assignments=outputs.get("proposal_mode_assignments") if use_proposal_candidates else None,
693
+ )
694
  outputs["planner_topk_indices"] = shortlist_indices
695
  batch_size = candidate_chunks.shape[0]
696
  batch_indices = torch.arange(batch_size, device=candidate_chunks.device).unsqueeze(-1)
697
  topk_candidates = candidate_chunks[batch_indices, shortlist_indices]
698
  num_topk = topk_candidates.shape[1]
699
  outputs["planner_topk_candidates"] = topk_candidates
700
+ proposal_mode_names = outputs.get("proposal_mode_names")
701
+ topk_proposal_mode_names = None
702
+ if proposal_mode_names is not None and use_proposal_candidates:
703
+ topk_proposal_mode_names = [
704
+ [proposal_mode_names[batch_idx][int(candidate_idx.item())] for candidate_idx in shortlist_indices[batch_idx]]
705
+ for batch_idx in range(batch_size)
706
+ ]
707
+ outputs["planner_topk_mode_names"] = topk_proposal_mode_names
708
  if proposal_logits is not None:
709
  topk_proposal_logits = proposal_logits.gather(1, shortlist_indices)
710
  else:
 
726
  rollout_state=identity_rollout,
727
  proposal_logits=topk_proposal_logits,
728
  candidate_indices=shortlist_indices,
729
+ proposal_mode_names=topk_proposal_mode_names,
730
  )
731
  outputs["planned_rollout"] = identity_rollout
732
  outputs["planned_chunk"] = selected["best_chunk"]
 
751
  memory_tokens=self._tile_tensor(memory_output["memory_tokens"], num_topk),
752
  scene_memory_tokens=self._tile_tensor(memory_output["scene_memory_tokens"], num_topk),
753
  belief_memory_tokens=self._tile_tensor(memory_output["belief_memory_tokens"], num_topk),
754
+ task_names=[name for name in task_names for _ in range(num_topk)],
755
+ rollout_mode_override=rollout_mode_override,
756
  )
757
  reshaped_rollout = {
758
  key: value.view(batch_size, num_topk, *value.shape[1:]) for key, value in rollout.items()
 
763
  rollout_state=reshaped_rollout,
764
  proposal_logits=topk_proposal_logits,
765
  candidate_indices=shortlist_indices,
766
+ proposal_mode_names=topk_proposal_mode_names,
767
  )
768
  outputs["planned_rollout"] = reshaped_rollout
769
  outputs["planned_chunk"] = selected["best_chunk"]
code/reveal_vla_bimanual/models/reveal_head.py CHANGED
@@ -1,12 +1,103 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
 
4
 
5
  import torch
6
  import torch.nn.functional as F
7
  from torch import Tensor, nn
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  @dataclass
11
  class RevealHeadConfig:
12
  hidden_dim: int = 512
@@ -20,6 +111,7 @@ class RevealHeadConfig:
20
  num_phases: int = 5
21
  num_arm_roles: int = 4
22
  num_interaction_tokens: int = 8
 
23
 
24
 
25
  class RevealStateHead(nn.Module):
@@ -379,6 +471,22 @@ class ElasticOcclusionFieldDecoder(nn.Module):
379
  nn.GELU(),
380
  nn.Linear(config.hidden_dim, config.num_support_modes),
381
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  def _pool_source(self, source_tokens: Tensor | None, fallback: Tensor) -> Tensor:
384
  if source_tokens is None or source_tokens.numel() == 0:
@@ -403,6 +511,8 @@ class ElasticOcclusionFieldDecoder(nn.Module):
403
  interaction_tokens: Tensor,
404
  scene_tokens: Tensor | None = None,
405
  memory_tokens: Tensor | None = None,
 
 
406
  ) -> dict[str, Tensor]:
407
  batch_size = interaction_tokens.shape[0]
408
  pooled_interaction = interaction_tokens.mean(dim=1)
@@ -423,6 +533,15 @@ class ElasticOcclusionFieldDecoder(nn.Module):
423
  pooled_field = field_tokens.mean(dim=1)
424
  summary_input = torch.cat([pooled_interaction, pooled_field, pooled_scene, pooled_memory], dim=-1)
425
  latent_summary = self.summary_proj(summary_input)
 
 
 
 
 
 
 
 
 
426
 
427
  access_field = self.access_field(grid)
428
  target_belief_field = self.target_belief_field(grid)
@@ -435,6 +554,23 @@ class ElasticOcclusionFieldDecoder(nn.Module):
435
  reocclusion_field = torch.sigmoid(self.reocclusion_field(grid))
436
  disturbance_field = torch.sigmoid(self.disturbance_field(grid))
437
  uncertainty_field = F.softplus(self.uncertainty_field(grid))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
  support_stability_prob = torch.sigmoid(support_stability_field)
440
  risk_field = torch.sigmoid(
@@ -459,7 +595,7 @@ class ElasticOcclusionFieldDecoder(nn.Module):
459
  arm_identity = self.arm_identity.weight.unsqueeze(0).expand(batch_size, -1, -1)
460
  arm_tokens = pooled_interaction.unsqueeze(1).expand(-1, 2, -1) + arm_identity
461
  arm_role_input = torch.cat(
462
- [arm_tokens, latent_summary.unsqueeze(1).expand(-1, 2, -1)],
463
  dim=-1,
464
  )
465
  arm_role_logits = self.arm_role_head(arm_role_input)
@@ -477,8 +613,8 @@ class ElasticOcclusionFieldDecoder(nn.Module):
477
  risk_field.mean(dim=(-1, -2)).squeeze(1),
478
  uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
479
  access_prob.mean(dim=(-1, -2)).transpose(0, 1).transpose(0, 1),
480
- self.support_mode(summary_input),
481
- self.phase_head(summary_input),
482
  arm_role_logits.reshape(batch_size, -1),
483
  ]
484
  compact_state = torch.cat(
@@ -487,7 +623,7 @@ class ElasticOcclusionFieldDecoder(nn.Module):
487
  )
488
 
489
  output = {
490
- "phase_logits": self.phase_head(summary_input),
491
  "arm_role_logits": arm_role_logits,
492
  "target_belief_field": target_belief_field,
493
  "visibility_field": visibility_field,
@@ -502,20 +638,34 @@ class ElasticOcclusionFieldDecoder(nn.Module):
502
  "uncertainty_field": uncertainty_field,
503
  "interaction_tokens": interaction_tokens,
504
  "field_tokens": field_tokens,
505
- "latent_summary": latent_summary,
506
- "support_mode_logits": self.support_mode(summary_input),
507
  "corridor_logits": corridor_logits,
508
  "persistence_horizon": persistence_horizon,
509
  "disturbance_cost": disturbance_cost,
510
  "belief_map": target_belief_map,
511
- "reocclusion_logit": self.reocclusion_head(summary_input),
512
  "persistence_uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
513
  "access_field": access_field,
514
  "uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
515
  "compact_state": compact_state,
 
516
  }
517
  output["target_field"] = output["target_belief_field"]
518
  output["actor_feasibility_field"] = output["clearance_field"]
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  return output
520
 
521
 
@@ -544,6 +694,8 @@ class ElasticOcclusionStateHead(nn.Module):
544
  scene_tokens: Tensor,
545
  memory_token: Tensor | None = None,
546
  memory_tokens: Tensor | None = None,
 
 
547
  ) -> dict[str, Tensor]:
548
  if memory_tokens is None:
549
  memory_tokens = memory_token
@@ -558,4 +710,6 @@ class ElasticOcclusionStateHead(nn.Module):
558
  interaction_tokens=interaction_tokens,
559
  scene_tokens=scene_tokens,
560
  memory_tokens=memory_tokens,
 
 
561
  )
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
+ from typing import Sequence
5
 
6
  import torch
7
  import torch.nn.functional as F
8
  from torch import Tensor, nn
9
 
10
 
11
+ HEAD_TASKS = ("generic", "foliage", "bag", "cloth")
12
+ TASK_METRIC_NAMES = (
13
+ "opening_quality",
14
+ "actor_feasibility_score",
15
+ "gap_width",
16
+ "damage_proxy",
17
+ "release_collapse_rate",
18
+ "target_visibility_confidence",
19
+ "mouth_aperture",
20
+ "hold_quality",
21
+ "rim_slip_risk",
22
+ "insertable_actor_corridor",
23
+ "layer_separation_quality",
24
+ "fold_preservation",
25
+ "insertion_corridor",
26
+ "top_layer_stability",
27
+ "lift_too_much_risk",
28
+ )
29
+ TASK_INDEX = {name: idx for idx, name in enumerate(HEAD_TASKS)}
30
+
31
+
32
+ def task_ids_from_names(task_names: Sequence[str] | None, device: torch.device, batch_size: int) -> Tensor:
33
+ if task_names is None:
34
+ return torch.zeros(batch_size, device=device, dtype=torch.long)
35
+ return torch.as_tensor(
36
+ [TASK_INDEX.get(str(name), 0) for name in task_names],
37
+ device=device,
38
+ dtype=torch.long,
39
+ )
40
+
41
+
42
+ def _mean_map(value: Tensor) -> Tensor:
43
+ return value.mean(dim=(-1, -2)).squeeze(1)
44
+
45
+
46
+ def compute_task_metrics_from_fields(
47
+ *,
48
+ access_field: Tensor,
49
+ persistence_field: Tensor,
50
+ disturbance_field: Tensor,
51
+ reocclusion_field: Tensor,
52
+ visibility_field: Tensor,
53
+ clearance_field: Tensor,
54
+ support_stability_field: Tensor,
55
+ uncertainty_field: Tensor,
56
+ ) -> dict[str, Tensor]:
57
+ access_prob = torch.sigmoid(access_field)
58
+ opening_mask = access_prob.amax(dim=1, keepdim=True)
59
+ support_stability = torch.sigmoid(support_stability_field)
60
+ visibility_prob = torch.sigmoid(visibility_field)
61
+ clearance_prob = torch.sigmoid(clearance_field).mean(dim=1, keepdim=True)
62
+ normalized_uncertainty = uncertainty_field / (1.0 + uncertainty_field)
63
+
64
+ opening_quality_field = opening_mask * persistence_field * support_stability
65
+ newly_revealed_field = torch.relu(visibility_prob - reocclusion_field)
66
+ still_visible_field = visibility_prob * persistence_field
67
+ reoccluded_field = reocclusion_field
68
+
69
+ opening_quality = _mean_map(opening_quality_field)
70
+ actor_feasibility_score = 0.6 * _mean_map(clearance_prob) + 0.4 * _mean_map(opening_mask)
71
+ base_gap = _mean_map(opening_mask)
72
+ disturbance_cost = _mean_map(disturbance_field)
73
+ support_quality = _mean_map(support_stability)
74
+ visibility_confidence = _mean_map(visibility_prob * (1.0 - normalized_uncertainty))
75
+ reocclusion_rate = _mean_map(reocclusion_field)
76
+ persistence_score = _mean_map(persistence_field)
77
+
78
+ return {
79
+ "newly_revealed_field": newly_revealed_field,
80
+ "still_visible_field": still_visible_field,
81
+ "reoccluded_field": reoccluded_field,
82
+ "opening_quality_field": opening_quality_field,
83
+ "opening_quality": torch.clamp(opening_quality, 0.0, 1.0),
84
+ "actor_feasibility_score": torch.clamp(actor_feasibility_score, 0.0, 1.0),
85
+ "gap_width": 0.03 + 0.21 * torch.clamp(base_gap, 0.0, 1.0),
86
+ "damage_proxy": torch.clamp(disturbance_cost + 0.5 * (1.0 - support_quality), 0.0, 1.0),
87
+ "release_collapse_rate": torch.clamp(reocclusion_rate, 0.0, 1.0),
88
+ "target_visibility_confidence": torch.clamp(visibility_confidence, 0.0, 1.0),
89
+ "mouth_aperture": torch.clamp(base_gap, 0.0, 1.0),
90
+ "hold_quality": torch.clamp(0.5 * (persistence_score + support_quality), 0.0, 1.0),
91
+ "rim_slip_risk": torch.clamp(reocclusion_rate + 0.5 * (1.0 - support_quality), 0.0, 1.0),
92
+ "insertable_actor_corridor": torch.clamp(0.6 * actor_feasibility_score + 0.4 * base_gap, 0.0, 1.0),
93
+ "layer_separation_quality": torch.clamp(0.7 * base_gap + 0.3 * actor_feasibility_score, 0.0, 1.0),
94
+ "fold_preservation": torch.clamp(1.0 - disturbance_cost, 0.0, 1.0),
95
+ "insertion_corridor": torch.clamp(0.5 * actor_feasibility_score + 0.5 * base_gap, 0.0, 1.0),
96
+ "top_layer_stability": torch.clamp(support_quality, 0.0, 1.0),
97
+ "lift_too_much_risk": torch.clamp(disturbance_cost + 0.5 * torch.relu(base_gap - 0.5), 0.0, 1.0),
98
+ }
99
+
100
+
101
  @dataclass
102
  class RevealHeadConfig:
103
  hidden_dim: int = 512
 
111
  num_phases: int = 5
112
  num_arm_roles: int = 4
113
  num_interaction_tokens: int = 8
114
+ num_tasks: int = len(HEAD_TASKS)
115
 
116
 
117
  class RevealStateHead(nn.Module):
 
471
  nn.GELU(),
472
  nn.Linear(config.hidden_dim, config.num_support_modes),
473
  )
474
+ self.task_embedding = nn.Embedding(config.num_tasks, config.hidden_dim)
475
+ self.task_field_affine = nn.Linear(config.hidden_dim, config.hidden_dim * 2)
476
+ self.task_summary_adapter = nn.Sequential(
477
+ nn.LayerNorm(config.hidden_dim * 2),
478
+ nn.Linear(config.hidden_dim * 2, config.hidden_dim),
479
+ nn.GELU(),
480
+ )
481
+ self.task_phase_head = nn.Linear(config.hidden_dim, config.num_phases)
482
+ self.task_support_head = nn.Linear(config.hidden_dim, config.num_support_modes)
483
+ self.task_reocclusion_head = nn.Linear(config.hidden_dim, config.num_support_modes)
484
+ self.task_metric_head = nn.Sequential(
485
+ nn.LayerNorm(config.hidden_dim * 2),
486
+ nn.Linear(config.hidden_dim * 2, config.hidden_dim),
487
+ nn.GELU(),
488
+ nn.Linear(config.hidden_dim, len(TASK_METRIC_NAMES)),
489
+ )
490
 
491
  def _pool_source(self, source_tokens: Tensor | None, fallback: Tensor) -> Tensor:
492
  if source_tokens is None or source_tokens.numel() == 0:
 
511
  interaction_tokens: Tensor,
512
  scene_tokens: Tensor | None = None,
513
  memory_tokens: Tensor | None = None,
514
+ task_names: Sequence[str] | None = None,
515
+ use_task_conditioning: bool = True,
516
  ) -> dict[str, Tensor]:
517
  batch_size = interaction_tokens.shape[0]
518
  pooled_interaction = interaction_tokens.mean(dim=1)
 
533
  pooled_field = field_tokens.mean(dim=1)
534
  summary_input = torch.cat([pooled_interaction, pooled_field, pooled_scene, pooled_memory], dim=-1)
535
  latent_summary = self.summary_proj(summary_input)
536
+ task_ids = task_ids_from_names(task_names, interaction_tokens.device, batch_size)
537
+ task_embed = self.task_embedding(task_ids)
538
+ if use_task_conditioning:
539
+ scale, bias = self.task_field_affine(task_embed).chunk(2, dim=-1)
540
+ grid = grid * (1.0 + 0.1 * scale.view(batch_size, self.config.hidden_dim, 1, 1))
541
+ grid = grid + 0.1 * bias.view(batch_size, self.config.hidden_dim, 1, 1)
542
+ task_summary = latent_summary + 0.1 * self.task_summary_adapter(torch.cat([latent_summary, task_embed], dim=-1))
543
+ else:
544
+ task_summary = latent_summary
545
 
546
  access_field = self.access_field(grid)
547
  target_belief_field = self.target_belief_field(grid)
 
554
  reocclusion_field = torch.sigmoid(self.reocclusion_field(grid))
555
  disturbance_field = torch.sigmoid(self.disturbance_field(grid))
556
  uncertainty_field = F.softplus(self.uncertainty_field(grid))
557
+ task_metrics = compute_task_metrics_from_fields(
558
+ access_field=access_field,
559
+ persistence_field=persistence_field,
560
+ disturbance_field=disturbance_field,
561
+ reocclusion_field=reocclusion_field,
562
+ visibility_field=visibility_field,
563
+ clearance_field=clearance_field,
564
+ support_stability_field=support_stability_field,
565
+ uncertainty_field=uncertainty_field,
566
+ )
567
+ metric_residuals = 0.05 * torch.tanh(
568
+ self.task_metric_head(torch.cat([task_summary, task_embed], dim=-1))
569
+ )
570
+ metric_residual_map = {
571
+ name: metric_residuals[:, idx]
572
+ for idx, name in enumerate(TASK_METRIC_NAMES)
573
+ }
574
 
575
  support_stability_prob = torch.sigmoid(support_stability_field)
576
  risk_field = torch.sigmoid(
 
595
  arm_identity = self.arm_identity.weight.unsqueeze(0).expand(batch_size, -1, -1)
596
  arm_tokens = pooled_interaction.unsqueeze(1).expand(-1, 2, -1) + arm_identity
597
  arm_role_input = torch.cat(
598
+ [arm_tokens, task_summary.unsqueeze(1).expand(-1, 2, -1)],
599
  dim=-1,
600
  )
601
  arm_role_logits = self.arm_role_head(arm_role_input)
 
613
  risk_field.mean(dim=(-1, -2)).squeeze(1),
614
  uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
615
  access_prob.mean(dim=(-1, -2)).transpose(0, 1).transpose(0, 1),
616
+ self.support_mode(summary_input) + (self.task_support_head(task_summary) if use_task_conditioning else 0.0),
617
+ self.phase_head(summary_input) + (self.task_phase_head(task_summary) if use_task_conditioning else 0.0),
618
  arm_role_logits.reshape(batch_size, -1),
619
  ]
620
  compact_state = torch.cat(
 
623
  )
624
 
625
  output = {
626
+ "phase_logits": self.phase_head(summary_input) + (self.task_phase_head(task_summary) if use_task_conditioning else 0.0),
627
  "arm_role_logits": arm_role_logits,
628
  "target_belief_field": target_belief_field,
629
  "visibility_field": visibility_field,
 
638
  "uncertainty_field": uncertainty_field,
639
  "interaction_tokens": interaction_tokens,
640
  "field_tokens": field_tokens,
641
+ "latent_summary": task_summary,
642
+ "support_mode_logits": self.support_mode(summary_input) + (self.task_support_head(task_summary) if use_task_conditioning else 0.0),
643
  "corridor_logits": corridor_logits,
644
  "persistence_horizon": persistence_horizon,
645
  "disturbance_cost": disturbance_cost,
646
  "belief_map": target_belief_map,
647
+ "reocclusion_logit": self.reocclusion_head(summary_input) + (self.task_reocclusion_head(task_summary) if use_task_conditioning else 0.0),
648
  "persistence_uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
649
  "access_field": access_field,
650
  "uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
651
  "compact_state": compact_state,
652
+ "task_ids": task_ids,
653
  }
654
  output["target_field"] = output["target_belief_field"]
655
  output["actor_feasibility_field"] = output["clearance_field"]
656
+ output.update(
657
+ {
658
+ "newly_revealed_field": task_metrics["newly_revealed_field"],
659
+ "still_visible_field": task_metrics["still_visible_field"],
660
+ "reoccluded_field": task_metrics["reoccluded_field"],
661
+ "opening_quality_field": task_metrics["opening_quality_field"],
662
+ }
663
+ )
664
+ for name in TASK_METRIC_NAMES:
665
+ if name == "gap_width":
666
+ output[name] = torch.clamp(task_metrics[name] + 0.01 * metric_residual_map[name], 0.0, 1.0)
667
+ else:
668
+ output[name] = torch.clamp(task_metrics[name] + metric_residual_map[name], 0.0, 1.0)
669
  return output
670
 
671
 
 
694
  scene_tokens: Tensor,
695
  memory_token: Tensor | None = None,
696
  memory_tokens: Tensor | None = None,
697
+ task_names: Sequence[str] | None = None,
698
+ use_task_conditioning: bool = True,
699
  ) -> dict[str, Tensor]:
700
  if memory_tokens is None:
701
  memory_tokens = memory_token
 
710
  interaction_tokens=interaction_tokens,
711
  scene_tokens=scene_tokens,
712
  memory_tokens=memory_tokens,
713
+ task_names=task_names,
714
+ use_task_conditioning=use_task_conditioning,
715
  )
code/reveal_vla_bimanual/models/world_model.py CHANGED
@@ -1,11 +1,13 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
 
4
 
5
  import torch
 
6
  from torch import Tensor, nn
7
 
8
- from models.reveal_head import InteractionFieldDecoder
9
 
10
 
11
  @dataclass
@@ -24,6 +26,8 @@ class RevealWMConfig:
24
  predict_belief_map: bool = True
25
  scene_bank_size: int = 2
26
  belief_bank_size: int = 2
 
 
27
 
28
 
29
  class RevealWM(nn.Module):
@@ -167,6 +171,7 @@ class ElasticOcclusionWorldModel(nn.Module):
167
  + config.num_phases
168
  + (2 * config.num_arm_roles)
169
  )
 
170
  self.state_encoder = nn.Sequential(
171
  nn.LayerNorm(compact_state_dim),
172
  nn.Linear(compact_state_dim, config.hidden_dim),
@@ -203,6 +208,43 @@ class ElasticOcclusionWorldModel(nn.Module):
203
  self.disturbance_head = nn.Linear(config.hidden_dim, field_elements)
204
  self.uncertainty_head = nn.Linear(config.hidden_dim, field_elements)
205
  self.access_head = nn.Linear(config.hidden_dim, config.num_support_modes * field_elements)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def _compact_from_state(self, interaction_state: dict[str, Tensor]) -> Tensor:
208
  if "compact_state" in interaction_state:
@@ -226,6 +268,32 @@ class ElasticOcclusionWorldModel(nn.Module):
226
  ]
227
  return torch.cat([component if component.ndim > 1 else component.unsqueeze(-1) for component in components], dim=-1)
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  def _decode_fields(self, latent: Tensor) -> dict[str, Tensor]:
230
  batch_size = latent.shape[0]
231
  side = self.config.field_size
@@ -257,7 +325,7 @@ class ElasticOcclusionWorldModel(nn.Module):
257
  weighted_persistence = (persistence_field.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
258
  access_mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
259
  persistence_horizon = self.config.rollout_horizon * weighted_persistence / access_mass
260
- return {
261
  "target_belief_field": target_belief_field,
262
  "visibility_field": visibility_field,
263
  "clearance_field": clearance_field,
@@ -282,6 +350,96 @@ class ElasticOcclusionWorldModel(nn.Module):
282
  "target_field": target_belief_field,
283
  "actor_feasibility_field": clearance_field,
284
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  def forward(
287
  self,
@@ -291,6 +449,8 @@ class ElasticOcclusionWorldModel(nn.Module):
291
  memory_tokens: Tensor | None = None,
292
  scene_memory_tokens: Tensor | None = None,
293
  belief_memory_tokens: Tensor | None = None,
 
 
294
  ) -> dict[str, Tensor]:
295
  if scene_memory_tokens is None:
296
  scene_memory_tokens = interaction_state.get("scene_memory_tokens")
@@ -305,14 +465,57 @@ class ElasticOcclusionWorldModel(nn.Module):
305
  if belief_memory_tokens is None:
306
  belief_memory_tokens = scene_tokens[:, :1]
307
 
308
- latent = self.state_encoder(self._compact_from_state(interaction_state))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  scene_memory = self.scene_memory_proj(scene_memory_tokens.mean(dim=1))
310
  belief_memory = self.belief_memory_proj(belief_memory_tokens.mean(dim=1))
311
  outputs: dict[str, list[Tensor]] = {}
312
  scene_bias = scene_tokens.mean(dim=1)
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  for step in range(action_chunk.shape[1]):
315
- action_latent = self.action_encoder(action_chunk[:, step])
316
  transition_input = torch.cat([latent, action_latent, scene_memory, belief_memory], dim=-1)
317
  latent = self.transition(transition_input, latent + 0.1 * scene_bias)
318
  scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(latent))
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
+ from typing import Sequence
5
 
6
  import torch
7
+ import torch.nn.functional as F
8
  from torch import Tensor, nn
9
 
10
+ from models.reveal_head import InteractionFieldDecoder, compute_task_metrics_from_fields, task_ids_from_names
11
 
12
 
13
  @dataclass
 
26
  predict_belief_map: bool = True
27
  scene_bank_size: int = 2
28
  belief_bank_size: int = 2
29
+ rollout_mode: str = "spatial_rollout"
30
+ num_tasks: int = 4
31
 
32
 
33
  class RevealWM(nn.Module):
 
171
  + config.num_phases
172
  + (2 * config.num_arm_roles)
173
  )
174
+ self.compact_state_dim = compact_state_dim
175
  self.state_encoder = nn.Sequential(
176
  nn.LayerNorm(compact_state_dim),
177
  nn.Linear(compact_state_dim, config.hidden_dim),
 
208
  self.disturbance_head = nn.Linear(config.hidden_dim, field_elements)
209
  self.uncertainty_head = nn.Linear(config.hidden_dim, field_elements)
210
  self.access_head = nn.Linear(config.hidden_dim, config.num_support_modes * field_elements)
211
+ field_channels = 12 + config.num_support_modes
212
+ spatial_hidden = max(32, config.hidden_dim // 2)
213
+ self.task_embedding = nn.Embedding(config.num_tasks, config.hidden_dim)
214
+ self.spatial_field_encoder = nn.Sequential(
215
+ nn.Conv2d(field_channels, spatial_hidden, kernel_size=3, padding=1),
216
+ nn.GELU(),
217
+ nn.Conv2d(spatial_hidden, config.hidden_dim, kernel_size=3, padding=1),
218
+ nn.GELU(),
219
+ )
220
+ self.spatial_context_proj = nn.Sequential(
221
+ nn.LayerNorm(config.hidden_dim * 4),
222
+ nn.Linear(config.hidden_dim * 4, config.hidden_dim),
223
+ nn.GELU(),
224
+ )
225
+ self.spatial_gate_z = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
226
+ self.spatial_gate_r = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
227
+ self.spatial_candidate = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
228
+ self.spatial_summary_proj = nn.Sequential(
229
+ nn.LayerNorm(config.hidden_dim * 3),
230
+ nn.Linear(config.hidden_dim * 3, config.hidden_dim),
231
+ nn.GELU(),
232
+ )
233
+ self.spatial_phase_head = nn.Linear(config.hidden_dim, config.num_phases)
234
+ self.spatial_support_mode_head = nn.Linear(config.hidden_dim, config.num_support_modes)
235
+ self.spatial_arm_role_head = nn.Linear(config.hidden_dim, 2 * config.num_arm_roles)
236
+ self.spatial_reocclusion_head = nn.Linear(config.hidden_dim, config.num_support_modes)
237
+ self.spatial_target_belief_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
238
+ self.spatial_visibility_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
239
+ self.spatial_clearance_head = nn.Conv2d(config.hidden_dim, 2, kernel_size=1)
240
+ self.spatial_occluder_contact_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
241
+ self.spatial_grasp_affordance_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
242
+ self.spatial_support_stability_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
243
+ self.spatial_persistence_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
244
+ self.spatial_reocclusion_field_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
245
+ self.spatial_disturbance_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
246
+ self.spatial_uncertainty_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
247
+ self.spatial_access_head = nn.Conv2d(config.hidden_dim, config.num_support_modes, kernel_size=1)
248
 
249
  def _compact_from_state(self, interaction_state: dict[str, Tensor]) -> Tensor:
250
  if "compact_state" in interaction_state:
 
268
  ]
269
  return torch.cat([component if component.ndim > 1 else component.unsqueeze(-1) for component in components], dim=-1)
270
 
271
+ def _repeat_state_rollout(self, interaction_state: dict[str, Tensor], horizon: int) -> dict[str, Tensor]:
272
+ rollout: dict[str, Tensor] = {}
273
+ for key, value in interaction_state.items():
274
+ if isinstance(value, Tensor):
275
+ rollout[key] = value.unsqueeze(1).expand(-1, horizon, *value.shape[1:])
276
+ return rollout
277
+
278
+ def _stack_state_fields(self, interaction_state: dict[str, Tensor]) -> Tensor:
279
+ return torch.cat(
280
+ [
281
+ interaction_state["target_belief_field"],
282
+ interaction_state["visibility_field"],
283
+ interaction_state["clearance_field"],
284
+ interaction_state["occluder_contact_field"],
285
+ interaction_state["grasp_affordance_field"],
286
+ interaction_state["support_stability_field"],
287
+ interaction_state["persistence_field"],
288
+ interaction_state["reocclusion_field"],
289
+ interaction_state["disturbance_field"],
290
+ interaction_state["risk_field"],
291
+ interaction_state["uncertainty_field"],
292
+ interaction_state["access_field"],
293
+ ],
294
+ dim=1,
295
+ )
296
+
297
  def _decode_fields(self, latent: Tensor) -> dict[str, Tensor]:
298
  batch_size = latent.shape[0]
299
  side = self.config.field_size
 
325
  weighted_persistence = (persistence_field.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
326
  access_mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
327
  persistence_horizon = self.config.rollout_horizon * weighted_persistence / access_mass
328
+ outputs = {
329
  "target_belief_field": target_belief_field,
330
  "visibility_field": visibility_field,
331
  "clearance_field": clearance_field,
 
350
  "target_field": target_belief_field,
351
  "actor_feasibility_field": clearance_field,
352
  }
353
+ outputs.update(
354
+ compute_task_metrics_from_fields(
355
+ access_field=access_field,
356
+ persistence_field=persistence_field,
357
+ disturbance_field=disturbance_field,
358
+ reocclusion_field=reocclusion_field,
359
+ visibility_field=visibility_field,
360
+ clearance_field=clearance_field,
361
+ support_stability_field=support_stability_field,
362
+ uncertainty_field=uncertainty_field,
363
+ )
364
+ )
365
+ return outputs
366
+
367
+ def _decode_spatial_fields(self, hidden: Tensor, summary: Tensor) -> dict[str, Tensor]:
368
+ target_belief_field = self.spatial_target_belief_head(hidden)
369
+ visibility_field = self.spatial_visibility_head(hidden)
370
+ clearance_field = self.spatial_clearance_head(hidden)
371
+ occluder_contact_field = self.spatial_occluder_contact_head(hidden)
372
+ grasp_affordance_field = self.spatial_grasp_affordance_head(hidden)
373
+ support_stability_field = self.spatial_support_stability_head(hidden)
374
+ persistence_field = torch.sigmoid(self.spatial_persistence_head(hidden))
375
+ reocclusion_field = torch.sigmoid(self.spatial_reocclusion_field_head(hidden))
376
+ disturbance_field = torch.sigmoid(self.spatial_disturbance_head(hidden))
377
+ uncertainty_field = F.softplus(self.spatial_uncertainty_head(hidden))
378
+ access_field = self.spatial_access_head(hidden)
379
+ support_stability_prob = torch.sigmoid(support_stability_field)
380
+ risk_field = torch.sigmoid(
381
+ disturbance_field
382
+ + 0.75 * reocclusion_field
383
+ + 0.5 * (1.0 - support_stability_prob)
384
+ + 0.25 * uncertainty_field
385
+ )
386
+ corridor_source = access_field.amax(dim=-2)
387
+ corridor_logits = F.interpolate(
388
+ corridor_source,
389
+ size=self.config.num_approach_templates,
390
+ mode="linear",
391
+ align_corners=False,
392
+ )
393
+ access_prob = torch.sigmoid(access_field)
394
+ weighted_persistence = (persistence_field.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
395
+ access_mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
396
+ persistence_horizon = self.config.rollout_horizon * weighted_persistence / access_mass
397
+ compact_state = self.compact_decoder(summary)
398
+ role_slice = self.spatial_arm_role_head(summary).view(summary.shape[0], 2, self.config.num_arm_roles)
399
+ outputs = {
400
+ "target_belief_field": target_belief_field,
401
+ "visibility_field": visibility_field,
402
+ "clearance_field": clearance_field,
403
+ "occluder_contact_field": occluder_contact_field,
404
+ "grasp_affordance_field": grasp_affordance_field,
405
+ "support_stability_field": support_stability_field,
406
+ "persistence_field": persistence_field,
407
+ "reocclusion_field": reocclusion_field,
408
+ "disturbance_field": disturbance_field,
409
+ "risk_field": risk_field,
410
+ "uncertainty_field": uncertainty_field,
411
+ "access_field": access_field,
412
+ "corridor_logits": corridor_logits,
413
+ "persistence_horizon": persistence_horizon,
414
+ "disturbance_cost": disturbance_field.mean(dim=(-1, -2)).squeeze(1),
415
+ "belief_map": F.interpolate(
416
+ target_belief_field,
417
+ size=(self.config.belief_map_size, self.config.belief_map_size),
418
+ mode="bilinear",
419
+ align_corners=False,
420
+ ),
421
+ "target_field": target_belief_field,
422
+ "actor_feasibility_field": clearance_field,
423
+ "compact_state": compact_state,
424
+ "phase_logits": self.spatial_phase_head(summary),
425
+ "arm_role_logits": role_slice,
426
+ "support_mode_logits": self.spatial_support_mode_head(summary),
427
+ "reocclusion_logit": self.spatial_reocclusion_head(summary),
428
+ "uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
429
+ }
430
+ outputs.update(
431
+ compute_task_metrics_from_fields(
432
+ access_field=access_field,
433
+ persistence_field=persistence_field,
434
+ disturbance_field=disturbance_field,
435
+ reocclusion_field=reocclusion_field,
436
+ visibility_field=visibility_field,
437
+ clearance_field=clearance_field,
438
+ support_stability_field=support_stability_field,
439
+ uncertainty_field=uncertainty_field,
440
+ )
441
+ )
442
+ return outputs
443
 
444
  def forward(
445
  self,
 
449
  memory_tokens: Tensor | None = None,
450
  scene_memory_tokens: Tensor | None = None,
451
  belief_memory_tokens: Tensor | None = None,
452
+ task_names: Sequence[str] | None = None,
453
+ rollout_mode_override: str | None = None,
454
  ) -> dict[str, Tensor]:
455
  if scene_memory_tokens is None:
456
  scene_memory_tokens = interaction_state.get("scene_memory_tokens")
 
465
  if belief_memory_tokens is None:
466
  belief_memory_tokens = scene_tokens[:, :1]
467
 
468
+ rollout_mode = rollout_mode_override or self.config.rollout_mode
469
+ horizon = action_chunk.shape[1]
470
+ if rollout_mode in {"null_rollout", "identity_rollout"}:
471
+ repeated = self._repeat_state_rollout(interaction_state, horizon)
472
+ if "scene_memory_tokens" not in repeated:
473
+ repeated["scene_memory_tokens"] = scene_memory_tokens.unsqueeze(1).expand(-1, horizon, *scene_memory_tokens.shape[1:])
474
+ if "belief_memory_tokens" not in repeated:
475
+ repeated["belief_memory_tokens"] = belief_memory_tokens.unsqueeze(1).expand(-1, horizon, *belief_memory_tokens.shape[1:])
476
+ if "memory_tokens" not in repeated:
477
+ repeated["memory_tokens"] = torch.cat(
478
+ [repeated["scene_memory_tokens"], repeated["belief_memory_tokens"]],
479
+ dim=2,
480
+ )
481
+ if "memory_token" not in repeated:
482
+ repeated["memory_token"] = repeated["memory_tokens"].mean(dim=2, keepdim=True)
483
+ return repeated
484
+
485
+ task_ids = task_ids_from_names(task_names, scene_tokens.device, scene_tokens.shape[0])
486
+ task_embed = self.task_embedding(task_ids)
487
+ latent = self.state_encoder(self._compact_from_state(interaction_state)) + 0.1 * task_embed
488
  scene_memory = self.scene_memory_proj(scene_memory_tokens.mean(dim=1))
489
  belief_memory = self.belief_memory_proj(belief_memory_tokens.mean(dim=1))
490
  outputs: dict[str, list[Tensor]] = {}
491
  scene_bias = scene_tokens.mean(dim=1)
492
 
493
+ if rollout_mode == "spatial_rollout":
494
+ hidden = self.spatial_field_encoder(self._stack_state_fields(interaction_state))
495
+ spatial_context = self.spatial_context_proj(torch.cat([scene_bias, scene_memory, belief_memory, task_embed], dim=-1))
496
+ hidden = hidden + spatial_context.unsqueeze(-1).unsqueeze(-1)
497
+ for step in range(horizon):
498
+ action_latent = self.action_encoder(action_chunk[:, step]) + 0.1 * task_embed
499
+ input_map = (action_latent + spatial_context).unsqueeze(-1).unsqueeze(-1).expand_as(hidden)
500
+ z = torch.sigmoid(self.spatial_gate_z(torch.cat([hidden, input_map], dim=1)))
501
+ r = torch.sigmoid(self.spatial_gate_r(torch.cat([hidden, input_map], dim=1)))
502
+ candidate = torch.tanh(self.spatial_candidate(torch.cat([r * hidden, input_map], dim=1)))
503
+ hidden = (1.0 - z) * hidden + z * candidate
504
+ pooled_hidden = hidden.mean(dim=(-1, -2))
505
+ scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(pooled_hidden))
506
+ belief_memory = 0.65 * belief_memory + 0.35 * torch.tanh(self.belief_memory_update(pooled_hidden))
507
+ summary = self.spatial_summary_proj(torch.cat([pooled_hidden, scene_bias, task_embed], dim=-1))
508
+ decoded = self._decode_spatial_fields(hidden, summary)
509
+ decoded["scene_memory_tokens"] = scene_memory.unsqueeze(1).expand(-1, self.config.scene_bank_size, -1)
510
+ decoded["belief_memory_tokens"] = belief_memory.unsqueeze(1).expand(-1, self.config.belief_bank_size, -1)
511
+ decoded["memory_tokens"] = torch.cat([decoded["scene_memory_tokens"], decoded["belief_memory_tokens"]], dim=1)
512
+ decoded["memory_token"] = decoded["memory_tokens"].mean(dim=1, keepdim=True)
513
+ for key, value in decoded.items():
514
+ outputs.setdefault(key, []).append(value)
515
+ return {key: torch.stack(values, dim=1) for key, values in outputs.items()}
516
+
517
  for step in range(action_chunk.shape[1]):
518
+ action_latent = self.action_encoder(action_chunk[:, step]) + 0.1 * task_embed
519
  transition_input = torch.cat([latent, action_latent, scene_memory, belief_memory], dim=-1)
520
  latent = self.transition(transition_input, latent + 0.1 * scene_bias)
521
  scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(latent))
code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="${ROOT_DIR:-/workspace}"
5
+ PROJECT_DIR="${ROOT_DIR}/reveal_vla_bimanual"
6
+ PYTHON_BIN="${PYTHON_BIN:-${ROOT_DIR}/envs/rlbench/bin/python}"
7
+ OUTPUT_ROOT="${OUTPUT_ROOT:-${ROOT_DIR}/reports/rlbench_handoff_matrix}"
8
+ EPISODES_PER_TASK="${EPISODES_PER_TASK:-1}"
9
+ EPISODE_LENGTH="${EPISODE_LENGTH:-20}"
10
+ RESOLUTION="${RESOLUTION:-224}"
11
+ CHUNK_COMMIT_STEPS="${CHUNK_COMMIT_STEPS:-4}"
12
+ BASELINE_CHECKPOINT="${BASELINE_CHECKPOINT:-${ROOT_DIR}/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt}"
13
+ SPATIAL_CHECKPOINT="${SPATIAL_CHECKPOINT:-${ROOT_DIR}/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt}"
14
+
15
+ source "${ROOT_DIR}/VLAarchtests_work/environment/runtime_env_vars.sh"
16
+
17
+ run_sweep() {
18
+ local output_dir="$1"
19
+ shift
20
+ mkdir -p "${output_dir}"
21
+ (
22
+ cd "${PROJECT_DIR}"
23
+ PYTHONPATH="${PROJECT_DIR}" "${PYTHON_BIN}" -m eval.run_peract2_task_sweep "$@"
24
+ )
25
+ }
26
+
27
+ mkdir -p "${OUTPUT_ROOT}"
28
+
29
+ run_sweep \
30
+ "${OUTPUT_ROOT}/baseline" \
31
+ --checkpoint "${BASELINE_CHECKPOINT}" \
32
+ --output-root "${OUTPUT_ROOT}/baseline" \
33
+ --run-name-prefix baseline_rgbd_seed17 \
34
+ --episodes-per-task "${EPISODES_PER_TASK}" \
35
+ --episode-length "${EPISODE_LENGTH}" \
36
+ --resolution "${RESOLUTION}" \
37
+ --chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
38
+ --allow-unsupervised-planning \
39
+ --headless \
40
+ --skip-noplan
41
+
42
+ run_sweep \
43
+ "${OUTPUT_ROOT}/spatial_full" \
44
+ --checkpoint "${SPATIAL_CHECKPOINT}" \
45
+ --output-root "${OUTPUT_ROOT}/spatial_full" \
46
+ --run-name-prefix spatial_phase_seed17 \
47
+ --episodes-per-task "${EPISODES_PER_TASK}" \
48
+ --episode-length "${EPISODE_LENGTH}" \
49
+ --resolution "${RESOLUTION}" \
50
+ --chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
51
+ --allow-unsupervised-planning \
52
+ --headless
53
+
54
+ run_sweep \
55
+ "${OUTPUT_ROOT}/spatial_nogeom" \
56
+ --checkpoint "${SPATIAL_CHECKPOINT}" \
57
+ --output-root "${OUTPUT_ROOT}/spatial_nogeom" \
58
+ --run-name-prefix spatial_phase_nogeom_seed17 \
59
+ --episodes-per-task "${EPISODES_PER_TASK}" \
60
+ --episode-length "${EPISODE_LENGTH}" \
61
+ --resolution "${RESOLUTION}" \
62
+ --chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
63
+ --allow-unsupervised-planning \
64
+ --headless \
65
+ --no-geometry \
66
+ --skip-noplan
67
+
68
+ run_sweep \
69
+ "${OUTPUT_ROOT}/spatial_compactwm" \
70
+ --checkpoint "${SPATIAL_CHECKPOINT}" \
71
+ --output-root "${OUTPUT_ROOT}/spatial_compactwm" \
72
+ --run-name-prefix spatial_phase_compactwm_seed17 \
73
+ --episodes-per-task "${EPISODES_PER_TASK}" \
74
+ --episode-length "${EPISODE_LENGTH}" \
75
+ --resolution "${RESOLUTION}" \
76
+ --chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
77
+ --allow-unsupervised-planning \
78
+ --headless \
79
+ --compact-world-model \
80
+ --skip-noplan
81
+
82
+ run_sweep \
83
+ "${OUTPUT_ROOT}/spatial_notask" \
84
+ --checkpoint "${SPATIAL_CHECKPOINT}" \
85
+ --output-root "${OUTPUT_ROOT}/spatial_notask" \
86
+ --run-name-prefix spatial_phase_notask_seed17 \
87
+ --episodes-per-task "${EPISODES_PER_TASK}" \
88
+ --episode-length "${EPISODE_LENGTH}" \
89
+ --resolution "${RESOLUTION}" \
90
+ --chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
91
+ --allow-unsupervised-planning \
92
+ --headless \
93
+ --disable-task-conditioning \
94
+ --skip-noplan
95
+
96
+ (
97
+ cd "${PROJECT_DIR}"
98
+ PYTHONPATH="${PROJECT_DIR}" "${PYTHON_BIN}" -m eval.compare_rlbench_sweeps \
99
+ --reference-label baseline_plan \
100
+ --output-dir "${OUTPUT_ROOT}/comparison" \
101
+ --run "baseline_plan=${OUTPUT_ROOT}/baseline/baseline_rgbd_seed17_plan_split/rollout_eval.json" \
102
+ --run "spatial_noplan=${OUTPUT_ROOT}/spatial_full/spatial_phase_seed17_noplan_split/rollout_eval.json" \
103
+ --run "spatial_plan=${OUTPUT_ROOT}/spatial_full/spatial_phase_seed17_plan_split/rollout_eval.json" \
104
+ --run "spatial_nogeom=${OUTPUT_ROOT}/spatial_nogeom/spatial_phase_nogeom_seed17_plan_split/rollout_eval.json" \
105
+ --run "spatial_compactwm=${OUTPUT_ROOT}/spatial_compactwm/spatial_phase_compactwm_seed17_plan_split/rollout_eval.json" \
106
+ --run "spatial_notask=${OUTPUT_ROOT}/spatial_notask/spatial_phase_notask_seed17_plan_split/rollout_eval.json"
107
+ )
code/reveal_vla_bimanual/sim_reveal/dataset.py CHANGED
@@ -26,6 +26,11 @@ LEGACY_PRIVILEGED_RENDER_KEYS = frozenset(
26
  )
27
 
28
 
 
 
 
 
 
29
  def _assert_noleak_sample(sample: dict[str, Any]) -> None:
30
  render_state = sample.get("render_state", {})
31
  leaked_keys = sorted(LEGACY_PRIVILEGED_RENDER_KEYS.intersection(render_state))
@@ -102,6 +107,8 @@ def collect_teacher_dataset(
102
  "language_goal": observation["text"],
103
  "action_chunk": action_chunk.astype("float32"),
104
  "support_mode": int(privileged_state["support_mode"]),
 
 
105
  "corridor_feasible": privileged_state["corridor_feasible"].astype("float32"),
106
  "persistence_horizon": privileged_state["persistence_horizon"].astype("float32"),
107
  "disturbance_cost": float(privileged_state["disturbance_cost"]),
@@ -114,7 +121,21 @@ def collect_teacher_dataset(
114
  "support_stability_map": privileged_state["support_stability_map"].astype("float32"),
115
  "reocclusion_target": float(privileged_state["reocclusion_target"]),
116
  "reocclusion_map": privileged_state["reocclusion_map"].astype("float32"),
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  "rollout_support_mode": rollout["rollout_support_mode"].astype("int64"),
 
118
  "rollout_corridor_feasible": rollout["rollout_corridor_feasible"].astype("float32"),
119
  "rollout_persistence_horizon": rollout["rollout_persistence_horizon"].astype("float32"),
120
  "rollout_disturbance_cost": rollout["rollout_disturbance_cost"].astype("float32"),
@@ -189,7 +210,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
189
  return len(self.samples)
190
 
191
  def _render_cache_key(self, sample: dict[str, Any], render_state: dict[str, Any]) -> bytes:
192
- include_depth = sample.get("dataset_version") == RGBD_PROXY_DATASET_VERSION
193
  return pickle.dumps(
194
  (sample["proxy_name"], self.resolution, include_depth, render_state),
195
  protocol=4,
@@ -200,7 +221,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
200
  cached = self._render_cache.get(cache_key)
201
  if cached is not None:
202
  return cached
203
- include_depth = sample.get("dataset_version") == RGBD_PROXY_DATASET_VERSION
204
  rendered = render_views_from_state(
205
  proxy_name=sample["proxy_name"],
206
  render_state=render_state,
@@ -216,6 +237,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
216
  return cached_item
217
  sample = self.samples[index]
218
  _assert_noleak_sample(sample)
 
219
  images = self._render_sample(sample, sample["render_state"])
220
  history_images = []
221
  history_depths = []
@@ -232,7 +254,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
232
  dim=0,
233
  )
234
  )
235
- if sample.get("dataset_version") == RGBD_PROXY_DATASET_VERSION:
236
  history_depths.append(
237
  torch.stack(
238
  [
@@ -267,7 +289,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
267
  history_stacked = torch.stack(history_images, dim=0).permute(0, 1, 4, 2, 3).float() / 255.0
268
  else:
269
  history_stacked = torch.zeros((0, 3, 3, self.resolution, self.resolution), dtype=torch.float32)
270
- if sample.get("dataset_version") == RGBD_PROXY_DATASET_VERSION:
271
  depths = torch.stack(
272
  [
273
  torch.from_numpy(images["front_depth"]),
@@ -317,6 +339,8 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
317
  "texts": sample["language_goal"],
318
  "action_chunk": torch.as_tensor(sample["action_chunk"], dtype=torch.float32),
319
  "support_mode": torch.as_tensor(sample["support_mode"], dtype=torch.long),
 
 
320
  "corridor_feasible": torch.as_tensor(sample["corridor_feasible"], dtype=torch.float32),
321
  "persistence_horizon": torch.as_tensor(sample["persistence_horizon"], dtype=torch.float32),
322
  "disturbance_cost": torch.as_tensor(sample["disturbance_cost"], dtype=torch.float32),
@@ -329,7 +353,21 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
329
  "support_stability_map": torch.as_tensor(sample.get("support_stability_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
330
  "reocclusion_target": torch.as_tensor(sample.get("reocclusion_target", 0.0), dtype=torch.float32),
331
  "reocclusion_map": torch.as_tensor(sample.get("reocclusion_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  "rollout_support_mode": torch.as_tensor(sample["rollout_support_mode"], dtype=torch.long),
 
333
  "rollout_corridor_feasible": torch.as_tensor(sample["rollout_corridor_feasible"], dtype=torch.float32),
334
  "rollout_persistence_horizon": torch.as_tensor(sample["rollout_persistence_horizon"], dtype=torch.float32),
335
  "rollout_disturbance_cost": torch.as_tensor(sample["rollout_disturbance_cost"], dtype=torch.float32),
@@ -342,6 +380,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
342
  "rollout_grasp_affordance_map": torch.as_tensor(sample.get("rollout_grasp_affordance_map", np.zeros((0, 32, 32), dtype=np.float32)), dtype=torch.float32),
343
  "candidate_action_chunks": torch.as_tensor(sample["candidate_action_chunks"], dtype=torch.float32),
344
  "candidate_rollout_support_mode": torch.as_tensor(sample["candidate_rollout_support_mode"], dtype=torch.long),
 
345
  "candidate_rollout_corridor_feasible": torch.as_tensor(sample["candidate_rollout_corridor_feasible"], dtype=torch.float32),
346
  "candidate_rollout_persistence_horizon": torch.as_tensor(sample["candidate_rollout_persistence_horizon"], dtype=torch.float32),
347
  "candidate_rollout_disturbance_cost": torch.as_tensor(sample["candidate_rollout_disturbance_cost"], dtype=torch.float32),
@@ -356,8 +395,23 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
356
  "candidate_final_disturbance_cost": torch.as_tensor(sample["candidate_final_disturbance_cost"], dtype=torch.float32),
357
  "candidate_reocclusion_rate": torch.as_tensor(sample["candidate_reocclusion_rate"], dtype=torch.float32),
358
  "candidate_visibility_integral": torch.as_tensor(sample["candidate_visibility_integral"], dtype=torch.float32),
 
 
 
 
 
 
 
359
  "candidate_risk": torch.as_tensor(sample["candidate_risk"], dtype=torch.float32),
360
  "candidate_utility": torch.as_tensor(sample["candidate_utility"], dtype=torch.float32),
 
 
 
 
 
 
 
 
361
  "proxy_name": sample["proxy_name"],
362
  "episode_id": sample["episode_id"],
363
  }
 
26
  )
27
 
28
 
29
+ def dataset_uses_rgbd(dataset_version: Any) -> bool:
30
+ version = str(dataset_version or "")
31
+ return version.startswith(RGBD_PROXY_DATASET_VERSION)
32
+
33
+
34
  def _assert_noleak_sample(sample: dict[str, Any]) -> None:
35
  render_state = sample.get("render_state", {})
36
  leaked_keys = sorted(LEGACY_PRIVILEGED_RENDER_KEYS.intersection(render_state))
 
107
  "language_goal": observation["text"],
108
  "action_chunk": action_chunk.astype("float32"),
109
  "support_mode": int(privileged_state["support_mode"]),
110
+ "phase": int(privileged_state.get("phase_label", 0)),
111
+ "subgoal_progress": float(privileged_state.get("subgoal_progress", 0.0)),
112
  "corridor_feasible": privileged_state["corridor_feasible"].astype("float32"),
113
  "persistence_horizon": privileged_state["persistence_horizon"].astype("float32"),
114
  "disturbance_cost": float(privileged_state["disturbance_cost"]),
 
121
  "support_stability_map": privileged_state["support_stability_map"].astype("float32"),
122
  "reocclusion_target": float(privileged_state["reocclusion_target"]),
123
  "reocclusion_map": privileged_state["reocclusion_map"].astype("float32"),
124
+ "gap_width": float(privileged_state.get("gap_width", 0.0)),
125
+ "damage_proxy": float(privileged_state.get("damage_proxy", 0.0)),
126
+ "release_collapse_rate": float(privileged_state.get("release_collapse_rate", 0.0)),
127
+ "target_visibility_confidence": float(privileged_state.get("target_visibility_confidence", 0.0)),
128
+ "mouth_aperture": float(privileged_state.get("mouth_aperture", 0.0)),
129
+ "hold_quality": float(privileged_state.get("hold_quality", 0.0)),
130
+ "rim_slip_risk": float(privileged_state.get("rim_slip_risk", 0.0)),
131
+ "insertable_actor_corridor": float(privileged_state.get("insertable_actor_corridor", 0.0)),
132
+ "layer_separation_quality": float(privileged_state.get("layer_separation_quality", 0.0)),
133
+ "fold_preservation": float(privileged_state.get("fold_preservation", 0.0)),
134
+ "insertion_corridor": float(privileged_state.get("insertion_corridor", 0.0)),
135
+ "top_layer_stability": float(privileged_state.get("top_layer_stability", 0.0)),
136
+ "lift_too_much_risk": float(privileged_state.get("lift_too_much_risk", 0.0)),
137
  "rollout_support_mode": rollout["rollout_support_mode"].astype("int64"),
138
+ "rollout_phase": rollout.get("rollout_phase", np.zeros((rollout["rollout_support_mode"].shape[0],), dtype=np.int64)).astype("int64"),
139
  "rollout_corridor_feasible": rollout["rollout_corridor_feasible"].astype("float32"),
140
  "rollout_persistence_horizon": rollout["rollout_persistence_horizon"].astype("float32"),
141
  "rollout_disturbance_cost": rollout["rollout_disturbance_cost"].astype("float32"),
 
210
  return len(self.samples)
211
 
212
  def _render_cache_key(self, sample: dict[str, Any], render_state: dict[str, Any]) -> bytes:
213
+ include_depth = dataset_uses_rgbd(sample.get("dataset_version"))
214
  return pickle.dumps(
215
  (sample["proxy_name"], self.resolution, include_depth, render_state),
216
  protocol=4,
 
221
  cached = self._render_cache.get(cache_key)
222
  if cached is not None:
223
  return cached
224
+ include_depth = dataset_uses_rgbd(sample.get("dataset_version"))
225
  rendered = render_views_from_state(
226
  proxy_name=sample["proxy_name"],
227
  render_state=render_state,
 
237
  return cached_item
238
  sample = self.samples[index]
239
  _assert_noleak_sample(sample)
240
+ candidate_count = int(sample.get("candidate_action_chunks", np.zeros((0, 0, 0), dtype=np.float32)).shape[0])
241
  images = self._render_sample(sample, sample["render_state"])
242
  history_images = []
243
  history_depths = []
 
254
  dim=0,
255
  )
256
  )
257
+ if dataset_uses_rgbd(sample.get("dataset_version")):
258
  history_depths.append(
259
  torch.stack(
260
  [
 
289
  history_stacked = torch.stack(history_images, dim=0).permute(0, 1, 4, 2, 3).float() / 255.0
290
  else:
291
  history_stacked = torch.zeros((0, 3, 3, self.resolution, self.resolution), dtype=torch.float32)
292
+ if dataset_uses_rgbd(sample.get("dataset_version")):
293
  depths = torch.stack(
294
  [
295
  torch.from_numpy(images["front_depth"]),
 
339
  "texts": sample["language_goal"],
340
  "action_chunk": torch.as_tensor(sample["action_chunk"], dtype=torch.float32),
341
  "support_mode": torch.as_tensor(sample["support_mode"], dtype=torch.long),
342
+ "phase": torch.as_tensor(sample.get("phase", 0), dtype=torch.long),
343
+ "subgoal_progress": torch.as_tensor(sample.get("subgoal_progress", 0.0), dtype=torch.float32),
344
  "corridor_feasible": torch.as_tensor(sample["corridor_feasible"], dtype=torch.float32),
345
  "persistence_horizon": torch.as_tensor(sample["persistence_horizon"], dtype=torch.float32),
346
  "disturbance_cost": torch.as_tensor(sample["disturbance_cost"], dtype=torch.float32),
 
353
  "support_stability_map": torch.as_tensor(sample.get("support_stability_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
354
  "reocclusion_target": torch.as_tensor(sample.get("reocclusion_target", 0.0), dtype=torch.float32),
355
  "reocclusion_map": torch.as_tensor(sample.get("reocclusion_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
356
+ "gap_width": torch.as_tensor(sample.get("gap_width", 0.0), dtype=torch.float32),
357
+ "damage_proxy": torch.as_tensor(sample.get("damage_proxy", 0.0), dtype=torch.float32),
358
+ "release_collapse_rate": torch.as_tensor(sample.get("release_collapse_rate", 0.0), dtype=torch.float32),
359
+ "target_visibility_confidence": torch.as_tensor(sample.get("target_visibility_confidence", 0.0), dtype=torch.float32),
360
+ "mouth_aperture": torch.as_tensor(sample.get("mouth_aperture", 0.0), dtype=torch.float32),
361
+ "hold_quality": torch.as_tensor(sample.get("hold_quality", 0.0), dtype=torch.float32),
362
+ "rim_slip_risk": torch.as_tensor(sample.get("rim_slip_risk", 0.0), dtype=torch.float32),
363
+ "insertable_actor_corridor": torch.as_tensor(sample.get("insertable_actor_corridor", 0.0), dtype=torch.float32),
364
+ "layer_separation_quality": torch.as_tensor(sample.get("layer_separation_quality", 0.0), dtype=torch.float32),
365
+ "fold_preservation": torch.as_tensor(sample.get("fold_preservation", 0.0), dtype=torch.float32),
366
+ "insertion_corridor": torch.as_tensor(sample.get("insertion_corridor", 0.0), dtype=torch.float32),
367
+ "top_layer_stability": torch.as_tensor(sample.get("top_layer_stability", 0.0), dtype=torch.float32),
368
+ "lift_too_much_risk": torch.as_tensor(sample.get("lift_too_much_risk", 0.0), dtype=torch.float32),
369
  "rollout_support_mode": torch.as_tensor(sample["rollout_support_mode"], dtype=torch.long),
370
+ "rollout_phase": torch.as_tensor(sample.get("rollout_phase", np.zeros((0,), dtype=np.int64)), dtype=torch.long),
371
  "rollout_corridor_feasible": torch.as_tensor(sample["rollout_corridor_feasible"], dtype=torch.float32),
372
  "rollout_persistence_horizon": torch.as_tensor(sample["rollout_persistence_horizon"], dtype=torch.float32),
373
  "rollout_disturbance_cost": torch.as_tensor(sample["rollout_disturbance_cost"], dtype=torch.float32),
 
380
  "rollout_grasp_affordance_map": torch.as_tensor(sample.get("rollout_grasp_affordance_map", np.zeros((0, 32, 32), dtype=np.float32)), dtype=torch.float32),
381
  "candidate_action_chunks": torch.as_tensor(sample["candidate_action_chunks"], dtype=torch.float32),
382
  "candidate_rollout_support_mode": torch.as_tensor(sample["candidate_rollout_support_mode"], dtype=torch.long),
383
+ "candidate_rollout_phase": torch.as_tensor(sample.get("candidate_rollout_phase", np.zeros((0, 0), dtype=np.int64)), dtype=torch.long),
384
  "candidate_rollout_corridor_feasible": torch.as_tensor(sample["candidate_rollout_corridor_feasible"], dtype=torch.float32),
385
  "candidate_rollout_persistence_horizon": torch.as_tensor(sample["candidate_rollout_persistence_horizon"], dtype=torch.float32),
386
  "candidate_rollout_disturbance_cost": torch.as_tensor(sample["candidate_rollout_disturbance_cost"], dtype=torch.float32),
 
395
  "candidate_final_disturbance_cost": torch.as_tensor(sample["candidate_final_disturbance_cost"], dtype=torch.float32),
396
  "candidate_reocclusion_rate": torch.as_tensor(sample["candidate_reocclusion_rate"], dtype=torch.float32),
397
  "candidate_visibility_integral": torch.as_tensor(sample["candidate_visibility_integral"], dtype=torch.float32),
398
+ "candidate_actor_feasibility_auc": torch.as_tensor(sample.get("candidate_actor_feasibility_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
399
+ "candidate_reveal_achieved": torch.as_tensor(sample.get("candidate_reveal_achieved", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
400
+ "candidate_hold_persistence": torch.as_tensor(sample.get("candidate_hold_persistence", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
401
+ "candidate_support_stability_auc": torch.as_tensor(sample.get("candidate_support_stability_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
402
+ "candidate_disturbance_auc": torch.as_tensor(sample.get("candidate_disturbance_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
403
+ "candidate_macro_ids": torch.as_tensor(sample.get("candidate_macro_ids", np.zeros((candidate_count,), dtype=np.int64)), dtype=torch.long),
404
+ "candidate_is_hard_negative": torch.as_tensor(sample.get("candidate_is_hard_negative", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
405
  "candidate_risk": torch.as_tensor(sample["candidate_risk"], dtype=torch.float32),
406
  "candidate_utility": torch.as_tensor(sample["candidate_utility"], dtype=torch.float32),
407
+ "candidate_gap_width": torch.as_tensor(sample.get("candidate_gap_width", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
408
+ "candidate_damage_proxy": torch.as_tensor(sample.get("candidate_damage_proxy", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
409
+ "candidate_mouth_aperture": torch.as_tensor(sample.get("candidate_mouth_aperture", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
410
+ "candidate_hold_quality": torch.as_tensor(sample.get("candidate_hold_quality", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
411
+ "candidate_rim_slip_risk": torch.as_tensor(sample.get("candidate_rim_slip_risk", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
412
+ "candidate_fold_preservation": torch.as_tensor(sample.get("candidate_fold_preservation", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
413
+ "candidate_layer_separation_quality": torch.as_tensor(sample.get("candidate_layer_separation_quality", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
414
+ "candidate_lift_too_much_risk": torch.as_tensor(sample.get("candidate_lift_too_much_risk", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
415
  "proxy_name": sample["proxy_name"],
416
  "episode_id": sample["episode_id"],
417
  }
code/reveal_vla_bimanual/sim_reveal/procedural_envs.py CHANGED
@@ -347,6 +347,53 @@ class ProceduralRevealEnv:
347
  horizon_ratio = persistence[current_mode] / float(max(1, self.rollout_horizon))
348
  return float(np.clip(1.0 - horizon_ratio + 0.35 * self.disturbance, 0.0, 1.0))
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  def _grasp_affordance_map(
351
  self,
352
  belief_map: np.ndarray,
@@ -374,6 +421,40 @@ class ProceduralRevealEnv:
374
  reocclusion_target = self._reocclusion_target(persistence)
375
  reocclusion_map = np.full((32, 32), reocclusion_target, dtype=np.float32)
376
  grasp_affordance_map = self._grasp_affordance_map(belief_map, visibility_map, clearance_map)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  return {
378
  "support_mode": support_mode,
379
  "corridor_feasible": corridor,
@@ -391,6 +472,9 @@ class ProceduralRevealEnv:
391
  "visibility": visibility,
392
  "retrieval_success": bool(self.retrieved),
393
  "target_template": self.target_template,
 
 
 
394
  }
395
 
396
  def render_state(self, privileged_state: dict[str, Any] | None = None) -> dict[str, Any]:
@@ -467,6 +551,99 @@ class ProceduralRevealEnv:
467
  action[13] = np.float32(1.0 if retrieve else -1.0)
468
  return action
469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  def teacher_chunk_and_rollout(
471
  self,
472
  chunk_horizon: int = 8,
@@ -486,6 +663,7 @@ class ProceduralRevealEnv:
486
  rollout_reocclusion = []
487
  rollout_occluder_contact = []
488
  rollout_grasp_affordance = []
 
489
  for step in range(chunk_horizon):
490
  action = self.teacher_action()
491
  action_chunk.append(action)
@@ -502,6 +680,7 @@ class ProceduralRevealEnv:
502
  rollout_reocclusion.append(privileged_state["reocclusion_target"])
503
  rollout_occluder_contact.append(privileged_state["occluder_contact_map"])
504
  rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"])
 
505
  if terminated or truncated:
506
  break
507
  while len(action_chunk) < chunk_horizon:
@@ -519,6 +698,7 @@ class ProceduralRevealEnv:
519
  rollout_reocclusion.append(current["reocclusion_target"])
520
  rollout_occluder_contact.append(current["occluder_contact_map"])
521
  rollout_grasp_affordance.append(current["grasp_affordance_map"])
 
522
  self.restore_state(snapshot)
523
  return np.stack(action_chunk, axis=0).astype(np.float32), {
524
  "rollout_support_mode": np.asarray(rollout_support_mode, dtype=np.int64),
@@ -532,6 +712,7 @@ class ProceduralRevealEnv:
532
  "rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
533
  "rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
534
  "rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
 
535
  }
536
 
537
  def evaluate_action_chunk(
@@ -552,8 +733,12 @@ class ProceduralRevealEnv:
552
  rollout_reocclusion: list[float] = []
553
  rollout_occluder_contact: list[np.ndarray] = []
554
  rollout_grasp_affordance: list[np.ndarray] = []
 
555
  corridor_open_trace = [float(self.get_privileged_state()["corridor_feasible"][self._current_support_mode()].any())]
556
  visibility_trace = [float(self.get_privileged_state()["visibility"])]
 
 
 
557
  terminated = False
558
  truncated = False
559
  privileged_state = self.get_privileged_state()
@@ -571,8 +756,12 @@ class ProceduralRevealEnv:
571
  rollout_reocclusion.append(float(privileged_state["reocclusion_target"]))
572
  rollout_occluder_contact.append(privileged_state["occluder_contact_map"].astype(np.float32))
573
  rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"].astype(np.float32))
 
574
  corridor_open_trace.append(float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any()))
575
  visibility_trace.append(float(privileged_state["visibility"]))
 
 
 
576
  if terminated or truncated:
577
  break
578
  while len(rollout_support_mode) < rollout_horizon:
@@ -588,11 +777,17 @@ class ProceduralRevealEnv:
588
  rollout_reocclusion.append(float(current["reocclusion_target"]))
589
  rollout_occluder_contact.append(current["occluder_contact_map"].astype(np.float32))
590
  rollout_grasp_affordance.append(current["grasp_affordance_map"].astype(np.float32))
 
591
  final_state = self.get_privileged_state()
 
 
 
 
 
592
  reocclusion = float(
593
  np.logical_and(
594
- np.asarray(corridor_open_trace[:-1]) > 0.5,
595
- np.asarray(corridor_open_trace[1:]) <= 0.5,
596
  ).mean()
597
  ) if len(corridor_open_trace) > 1 else 0.0
598
  result: dict[str, np.ndarray | float] = {
@@ -607,11 +802,29 @@ class ProceduralRevealEnv:
607
  "rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
608
  "rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
609
  "rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
 
610
  "retrieval_success": float(final_state["retrieval_success"]),
611
  "final_disturbance_cost": float(final_state["disturbance_cost"]),
612
  "reocclusion_rate": reocclusion,
613
- "visibility_integral": float(np.sum(np.asarray(visibility_trace, dtype=np.float32))),
 
 
 
 
 
 
614
  }
 
 
 
 
 
 
 
 
 
 
 
615
  self.restore_state(snapshot)
616
  return result
617
 
@@ -625,16 +838,70 @@ class ProceduralRevealEnv:
625
  teacher_chunk = np.asarray(teacher_chunk, dtype=np.float32)
626
  candidates = [teacher_chunk.astype(np.float32)]
627
  outcomes = [self.evaluate_action_chunk(teacher_chunk, rollout_horizon=rollout_horizon)]
628
- for candidate_idx in range(1, num_candidates):
629
- candidate = teacher_chunk.copy()
630
- revealer_noise = self.rng.normal(loc=0.0, scale=0.20 + 0.03 * candidate_idx, size=candidate[:, :7].shape)
631
- actor_noise = self.rng.normal(loc=0.0, scale=0.18 + 0.04 * candidate_idx, size=candidate[:, 7:].shape)
632
- candidate[:, :7] = np.clip(candidate[:, :7] + revealer_noise.astype(np.float32), -1.0, 1.0)
633
- candidate[:, 7:] = np.clip(candidate[:, 7:] + actor_noise.astype(np.float32), -1.0, 1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  candidates.append(candidate.astype(np.float32))
635
  outcomes.append(self.evaluate_action_chunk(candidate, rollout_horizon=rollout_horizon))
 
 
 
 
 
 
 
 
 
 
 
 
 
636
  stacked_outcomes = {
637
  "candidate_rollout_support_mode": np.stack([item["rollout_support_mode"] for item in outcomes], axis=0).astype(np.int64),
 
638
  "candidate_rollout_corridor_feasible": np.stack(
639
  [item["rollout_corridor_feasible"] for item in outcomes], axis=0
640
  ).astype(np.float32),
@@ -671,6 +938,13 @@ class ProceduralRevealEnv:
671
  ),
672
  "candidate_reocclusion_rate": np.asarray([item["reocclusion_rate"] for item in outcomes], dtype=np.float32),
673
  "candidate_visibility_integral": np.asarray([item["visibility_integral"] for item in outcomes], dtype=np.float32),
 
 
 
 
 
 
 
674
  }
675
  stacked_outcomes["candidate_risk"] = np.clip(
676
  stacked_outcomes["candidate_final_disturbance_cost"] + stacked_outcomes["candidate_reocclusion_rate"],
@@ -680,6 +954,19 @@ class ProceduralRevealEnv:
680
  stacked_outcomes["candidate_utility"] = (
681
  stacked_outcomes["candidate_retrieval_success"] - stacked_outcomes["candidate_risk"]
682
  ).astype(np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  return np.stack(candidates, axis=0).astype(np.float32), stacked_outcomes
684
 
685
  def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
 
347
  horizon_ratio = persistence[current_mode] / float(max(1, self.rollout_horizon))
348
  return float(np.clip(1.0 - horizon_ratio + 0.35 * self.disturbance, 0.0, 1.0))
349
 
350
+ def _phase_label(
351
+ self,
352
+ visibility: float,
353
+ corridor: np.ndarray,
354
+ persistence: np.ndarray,
355
+ disturbance_cost: float,
356
+ ) -> int:
357
+ support_mode = int(self._current_support_mode())
358
+ corridor_ready = bool(corridor[support_mode, self.target_template] > 0.5)
359
+ persistence_ratio = persistence[support_mode] / float(max(1, self.rollout_horizon))
360
+ opening_ready = self.opening >= (0.75 * self.dynamics.desired_opening)
361
+ retrieve_ready = (
362
+ corridor_ready
363
+ and visibility >= self.dynamics.retrieve_visibility_threshold
364
+ and self.actor_progress >= 0.55
365
+ )
366
+ recovering = disturbance_cost >= 0.55 or (opening_ready and persistence_ratio < 0.35)
367
+ if retrieve_ready:
368
+ return 3
369
+ if recovering:
370
+ return 4
371
+ if opening_ready and persistence_ratio >= 0.6:
372
+ return 2
373
+ if self.opening < self.dynamics.desired_opening or visibility < self.dynamics.retrieve_visibility_threshold:
374
+ return 1
375
+ return 0
376
+
377
+ def _subgoal_progress(
378
+ self,
379
+ visibility: float,
380
+ corridor: np.ndarray,
381
+ persistence: np.ndarray,
382
+ ) -> float:
383
+ support_mode = int(self._current_support_mode())
384
+ corridor_mass = float(corridor[support_mode].mean())
385
+ persistence_ratio = float(persistence[support_mode] / float(max(1, self.rollout_horizon)))
386
+ return float(
387
+ np.clip(
388
+ 0.35 * self.opening
389
+ + 0.25 * visibility
390
+ + 0.20 * corridor_mass
391
+ + 0.20 * persistence_ratio,
392
+ 0.0,
393
+ 1.0,
394
+ )
395
+ )
396
+
397
  def _grasp_affordance_map(
398
  self,
399
  belief_map: np.ndarray,
 
421
  reocclusion_target = self._reocclusion_target(persistence)
422
  reocclusion_map = np.full((32, 32), reocclusion_target, dtype=np.float32)
423
  grasp_affordance_map = self._grasp_affordance_map(belief_map, visibility_map, clearance_map)
424
+ task_metrics: dict[str, float] = {}
425
+ if self.proxy_name == FOLIAGE_PROXY.name:
426
+ task_metrics = {
427
+ "gap_width": float(np.clip(0.03 + 0.16 * self.opening, 0.03, 0.24)),
428
+ "damage_proxy": disturbance_cost,
429
+ "release_collapse_rate": reocclusion_target,
430
+ "target_visibility_confidence": visibility,
431
+ }
432
+ elif self.proxy_name == BAG_PROXY.name:
433
+ task_metrics = {
434
+ "mouth_aperture": float(self.opening),
435
+ "hold_quality": support_stability,
436
+ "rim_slip_risk": reocclusion_target,
437
+ "insertable_actor_corridor": float(corridor[support_mode, self.target_template]),
438
+ }
439
+ elif self.proxy_name == CLOTH_PROXY.name:
440
+ task_metrics = {
441
+ "layer_separation_quality": float(np.clip(self.opening * (1.0 - 0.20 * self.disturbance), 0.0, 1.0)),
442
+ "fold_preservation": float(np.clip(1.0 - disturbance_cost, 0.0, 1.0)),
443
+ "insertion_corridor": float(corridor[support_mode, self.target_template]),
444
+ "top_layer_stability": support_stability,
445
+ "lift_too_much_risk": float(np.clip(max(0.0, self.opening - self.dynamics.desired_opening), 0.0, 1.0)),
446
+ }
447
+ phase_label = self._phase_label(
448
+ visibility=visibility,
449
+ corridor=corridor,
450
+ persistence=persistence,
451
+ disturbance_cost=disturbance_cost,
452
+ )
453
+ subgoal_progress = self._subgoal_progress(
454
+ visibility=visibility,
455
+ corridor=corridor,
456
+ persistence=persistence,
457
+ )
458
  return {
459
  "support_mode": support_mode,
460
  "corridor_feasible": corridor,
 
472
  "visibility": visibility,
473
  "retrieval_success": bool(self.retrieved),
474
  "target_template": self.target_template,
475
+ "phase_label": int(phase_label),
476
+ "subgoal_progress": float(subgoal_progress),
477
+ **task_metrics,
478
  }
479
 
480
  def render_state(self, privileged_state: dict[str, Any] | None = None) -> dict[str, Any]:
 
551
  action[13] = np.float32(1.0 if retrieve else -1.0)
552
  return action
553
 
554
+ def _set_mode_bits(self, action: np.ndarray, mode: SupportMode) -> None:
555
+ action[1] = np.float32(1.0 if mode == SupportMode.TRANSFER else -1.0)
556
+ action[2] = np.float32(1.0 if mode == SupportMode.PASSIVE else -1.0)
557
+ action[6] = np.float32(1.0 if mode == SupportMode.HOLD else -1.0)
558
+
559
+ def macro_action_chunk(self, macro_name: str, chunk_horizon: int = 8) -> np.ndarray:
560
+ preferred_mode = self.dynamics.preferred_mode
561
+ hold_mode = SupportMode.HOLD
562
+ passive_mode = SupportMode.PASSIVE
563
+ target_index = self.target_template
564
+ left_index = max(0, target_index - 4)
565
+ right_index = min(self.num_templates - 1, target_index + 4)
566
+ wrong_index = 0 if target_index > (self.num_templates // 2) else self.num_templates - 1
567
+
568
+ chunk = np.zeros((chunk_horizon, 14), dtype=np.float32)
569
+ for step_idx in range(chunk_horizon):
570
+ action = self.teacher_action()
571
+ action[13] = np.float32(-1.0)
572
+ action[8] = np.float32(0.2)
573
+ self._set_mode_bits(action, preferred_mode)
574
+
575
+ if macro_name in {"widen_gap", "widen_mouth", "lift_edge", "separate_layer"}:
576
+ self._set_mode_bits(action, hold_mode)
577
+ action[0] = np.float32(0.95)
578
+ elif macro_name in {"maintain_gap", "maintain_mouth", "maintain_lift", "stabilize_fold", "pin_canopy"}:
579
+ self._set_mode_bits(action, preferred_mode)
580
+ action[0] = np.float32(0.12)
581
+ elif macro_name in {"sweep_left", "pin_left_rim"}:
582
+ self._set_mode_bits(action, hold_mode)
583
+ action[0] = np.float32(0.75)
584
+ action[7] = np.float32(self._normalized_template(left_index))
585
+ elif macro_name in {"sweep_right", "pin_right_rim"}:
586
+ self._set_mode_bits(action, hold_mode)
587
+ action[0] = np.float32(0.75)
588
+ action[7] = np.float32(self._normalized_template(right_index))
589
+ elif macro_name == "probe_inside":
590
+ self._set_mode_bits(action, preferred_mode)
591
+ action[0] = np.float32(0.10)
592
+ action[8] = np.float32(0.75)
593
+ elif macro_name == "insert_actor":
594
+ self._set_mode_bits(action, preferred_mode)
595
+ action[0] = np.float32(0.10)
596
+ action[8] = np.float32(1.0)
597
+ elif macro_name == "retrieve":
598
+ self._set_mode_bits(action, preferred_mode)
599
+ action[0] = np.float32(0.05)
600
+ action[8] = np.float32(1.0)
601
+ action[13] = np.float32(1.0)
602
+ elif macro_name == "premature_retrieve":
603
+ self._set_mode_bits(action, passive_mode)
604
+ action[0] = np.float32(-0.20)
605
+ action[8] = np.float32(1.0)
606
+ action[13] = np.float32(1.0)
607
+ elif macro_name in {"reveal_with_release", "foliage_immediate_reocclusion"}:
608
+ reveal_phase = step_idx < max(1, chunk_horizon // 2)
609
+ self._set_mode_bits(action, hold_mode if reveal_phase else passive_mode)
610
+ action[0] = np.float32(0.95 if reveal_phase else -0.35)
611
+ action[8] = np.float32(0.2)
612
+ elif macro_name in {"wrong_side_reveal", "wrong_edge_reveal", "wrong_layer_reveal"}:
613
+ self._set_mode_bits(action, hold_mode)
614
+ action[0] = np.float32(0.65)
615
+ action[7] = np.float32(self._normalized_template(wrong_index))
616
+ elif macro_name in {"over_disturbance", "cloth_lift_high"}:
617
+ self._set_mode_bits(action, passive_mode)
618
+ action[0] = np.float32(1.0)
619
+ action[8] = np.float32(1.0 if macro_name == "over_disturbance" else 0.2)
620
+ elif macro_name == "delayed_actor_entry":
621
+ self._set_mode_bits(action, preferred_mode)
622
+ action[0] = np.float32(0.10)
623
+ action[8] = np.float32(0.2 if step_idx < (chunk_horizon - 1) else 1.0)
624
+ elif macro_name in {"weak_corridor_insert", "bag_fabric_probe"}:
625
+ self._set_mode_bits(action, passive_mode)
626
+ action[0] = np.float32(0.02)
627
+ action[8] = np.float32(1.0)
628
+ else:
629
+ action = self.teacher_action()
630
+ chunk[step_idx] = np.clip(action, -1.0, 1.0)
631
+ return chunk
632
+
633
+ def baseline_action_chunk(self, baseline_name: str, chunk_horizon: int = 8) -> np.ndarray:
634
+ if baseline_name == "teacher":
635
+ chunk, _ = self.teacher_chunk_and_rollout(chunk_horizon=chunk_horizon, rollout_horizon=self.rollout_horizon)
636
+ return chunk
637
+ if baseline_name == "reveal_only":
638
+ return self.macro_action_chunk("widen_gap" if self.proxy_name == FOLIAGE_PROXY.name else ("widen_mouth" if self.proxy_name == BAG_PROXY.name else "lift_edge"), chunk_horizon=chunk_horizon)
639
+ if baseline_name == "retrieve_only":
640
+ return self.macro_action_chunk("premature_retrieve", chunk_horizon=chunk_horizon)
641
+ if baseline_name == "no_hold":
642
+ return self.macro_action_chunk("reveal_with_release", chunk_horizon=chunk_horizon)
643
+ if baseline_name == "random":
644
+ return self.rng.uniform(-1.0, 1.0, size=(chunk_horizon, 14)).astype(np.float32)
645
+ raise KeyError(f"Unknown baseline chunk: {baseline_name}")
646
+
647
  def teacher_chunk_and_rollout(
648
  self,
649
  chunk_horizon: int = 8,
 
663
  rollout_reocclusion = []
664
  rollout_occluder_contact = []
665
  rollout_grasp_affordance = []
666
+ rollout_phase = []
667
  for step in range(chunk_horizon):
668
  action = self.teacher_action()
669
  action_chunk.append(action)
 
680
  rollout_reocclusion.append(privileged_state["reocclusion_target"])
681
  rollout_occluder_contact.append(privileged_state["occluder_contact_map"])
682
  rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"])
683
+ rollout_phase.append(int(privileged_state["phase_label"]))
684
  if terminated or truncated:
685
  break
686
  while len(action_chunk) < chunk_horizon:
 
698
  rollout_reocclusion.append(current["reocclusion_target"])
699
  rollout_occluder_contact.append(current["occluder_contact_map"])
700
  rollout_grasp_affordance.append(current["grasp_affordance_map"])
701
+ rollout_phase.append(int(current["phase_label"]))
702
  self.restore_state(snapshot)
703
  return np.stack(action_chunk, axis=0).astype(np.float32), {
704
  "rollout_support_mode": np.asarray(rollout_support_mode, dtype=np.int64),
 
712
  "rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
713
  "rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
714
  "rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
715
+ "rollout_phase": np.asarray(rollout_phase, dtype=np.int64),
716
  }
717
 
718
  def evaluate_action_chunk(
 
733
  rollout_reocclusion: list[float] = []
734
  rollout_occluder_contact: list[np.ndarray] = []
735
  rollout_grasp_affordance: list[np.ndarray] = []
736
+ rollout_phase: list[int] = []
737
  corridor_open_trace = [float(self.get_privileged_state()["corridor_feasible"][self._current_support_mode()].any())]
738
  visibility_trace = [float(self.get_privileged_state()["visibility"])]
739
+ disturbance_trace = [float(self.get_privileged_state()["disturbance_cost"])]
740
+ support_trace = [float(self.get_privileged_state()["support_stability"])]
741
+ opening_trace = [float(self.opening)]
742
  terminated = False
743
  truncated = False
744
  privileged_state = self.get_privileged_state()
 
756
  rollout_reocclusion.append(float(privileged_state["reocclusion_target"]))
757
  rollout_occluder_contact.append(privileged_state["occluder_contact_map"].astype(np.float32))
758
  rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"].astype(np.float32))
759
+ rollout_phase.append(int(privileged_state["phase_label"]))
760
  corridor_open_trace.append(float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any()))
761
  visibility_trace.append(float(privileged_state["visibility"]))
762
+ disturbance_trace.append(float(privileged_state["disturbance_cost"]))
763
+ support_trace.append(float(privileged_state["support_stability"]))
764
+ opening_trace.append(float(self.opening))
765
  if terminated or truncated:
766
  break
767
  while len(rollout_support_mode) < rollout_horizon:
 
777
  rollout_reocclusion.append(float(current["reocclusion_target"]))
778
  rollout_occluder_contact.append(current["occluder_contact_map"].astype(np.float32))
779
  rollout_grasp_affordance.append(current["grasp_affordance_map"].astype(np.float32))
780
+ rollout_phase.append(int(current["phase_label"]))
781
  final_state = self.get_privileged_state()
782
+ corridor_curve = np.asarray(corridor_open_trace, dtype=np.float32)
783
+ visibility_curve = np.asarray(visibility_trace, dtype=np.float32)
784
+ disturbance_curve = np.asarray(disturbance_trace, dtype=np.float32)
785
+ support_curve = np.asarray(support_trace, dtype=np.float32)
786
+ opening_curve = np.asarray(opening_trace, dtype=np.float32)
787
  reocclusion = float(
788
  np.logical_and(
789
+ corridor_curve[:-1] > 0.5,
790
+ corridor_curve[1:] <= 0.5,
791
  ).mean()
792
  ) if len(corridor_open_trace) > 1 else 0.0
793
  result: dict[str, np.ndarray | float] = {
 
802
  "rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
803
  "rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
804
  "rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
805
+ "rollout_phase": np.asarray(rollout_phase, dtype=np.int64),
806
  "retrieval_success": float(final_state["retrieval_success"]),
807
  "final_disturbance_cost": float(final_state["disturbance_cost"]),
808
  "reocclusion_rate": reocclusion,
809
+ "visibility_integral": float(np.sum(visibility_curve)),
810
+ "actor_feasibility_auc": float(corridor_curve.mean()),
811
+ "reveal_achieved": float(visibility_curve.max() >= self.dynamics.retrieve_visibility_threshold),
812
+ "hold_persistence": float(corridor_curve.mean()),
813
+ "support_stability_auc": float(support_curve.mean()),
814
+ "disturbance_auc": float(disturbance_curve.mean()),
815
+ "opening_peak": float(opening_curve.max()),
816
  }
817
+ if self.proxy_name == FOLIAGE_PROXY.name:
818
+ result["candidate_gap_width"] = float(final_state.get("gap_width", opening_curve.max()))
819
+ result["candidate_damage_proxy"] = float(final_state.get("damage_proxy", final_state["disturbance_cost"]))
820
+ elif self.proxy_name == BAG_PROXY.name:
821
+ result["candidate_mouth_aperture"] = float(final_state.get("mouth_aperture", opening_curve.max()))
822
+ result["candidate_hold_quality"] = float(final_state.get("hold_quality", support_curve.mean()))
823
+ result["candidate_rim_slip_risk"] = float(final_state.get("rim_slip_risk", reocclusion))
824
+ elif self.proxy_name == CLOTH_PROXY.name:
825
+ result["candidate_fold_preservation"] = float(final_state.get("fold_preservation", 1.0 - final_state["disturbance_cost"]))
826
+ result["candidate_layer_separation_quality"] = float(final_state.get("layer_separation_quality", opening_curve.max()))
827
+ result["candidate_lift_too_much_risk"] = float(final_state.get("lift_too_much_risk", max(0.0, opening_curve.max() - self.dynamics.desired_opening)))
828
  self.restore_state(snapshot)
829
  return result
830
 
 
838
  teacher_chunk = np.asarray(teacher_chunk, dtype=np.float32)
839
  candidates = [teacher_chunk.astype(np.float32)]
840
  outcomes = [self.evaluate_action_chunk(teacher_chunk, rollout_horizon=rollout_horizon)]
841
+ candidate_macro_ids = [0]
842
+ candidate_is_hard_negative = [0.0]
843
+ candidate_macro_names = ["teacher"]
844
+ candidate_negative_families = ["teacher"]
845
+ if self.proxy_name == FOLIAGE_PROXY.name:
846
+ semantic_specs = [
847
+ ("pin_canopy", "positive"),
848
+ ("maintain_gap", "positive"),
849
+ ("premature_retrieve", "premature_retrieve"),
850
+ ("reveal_with_release", "reveal_with_release"),
851
+ ("wrong_side_reveal", "wrong_side_reveal"),
852
+ ("foliage_immediate_reocclusion", "immediate_reocclusion"),
853
+ ("over_disturbance", "over_disturbance"),
854
+ ("weak_corridor_insert", "weak_corridor_insert"),
855
+ ("insert_actor", "positive"),
856
+ ("retrieve", "positive"),
857
+ ]
858
+ elif self.proxy_name == BAG_PROXY.name:
859
+ semantic_specs = [
860
+ ("widen_mouth", "positive"),
861
+ ("maintain_mouth", "positive"),
862
+ ("premature_retrieve", "premature_retrieve"),
863
+ ("reveal_with_release", "reveal_with_release"),
864
+ ("wrong_edge_reveal", "wrong_side_reveal"),
865
+ ("pin_left_rim", "one_rim_slip"),
866
+ ("bag_fabric_probe", "fabric_probe"),
867
+ ("weak_corridor_insert", "weak_corridor_insert"),
868
+ ("insert_actor", "positive"),
869
+ ("retrieve", "positive"),
870
+ ]
871
+ else:
872
+ semantic_specs = [
873
+ ("lift_edge", "positive"),
874
+ ("stabilize_fold", "positive"),
875
+ ("premature_retrieve", "premature_retrieve"),
876
+ ("reveal_with_release", "reveal_with_release"),
877
+ ("cloth_lift_high", "lift_too_high"),
878
+ ("wrong_layer_reveal", "wrong_layer_reveal"),
879
+ ("delayed_actor_entry", "delayed_actor_entry"),
880
+ ("weak_corridor_insert", "weak_corridor_insert"),
881
+ ("insert_actor", "positive"),
882
+ ("retrieve", "positive"),
883
+ ]
884
+
885
+ for spec_idx, (macro_name, family_name) in enumerate(semantic_specs[: max(0, num_candidates - 1)], start=1):
886
+ candidate = self.macro_action_chunk(macro_name, chunk_horizon=teacher_chunk.shape[0])
887
  candidates.append(candidate.astype(np.float32))
888
  outcomes.append(self.evaluate_action_chunk(candidate, rollout_horizon=rollout_horizon))
889
+ candidate_macro_ids.append(spec_idx)
890
+ candidate_macro_names.append(macro_name)
891
+ candidate_negative_families.append(family_name)
892
+ candidate_is_hard_negative.append(0.0 if family_name == "positive" else 1.0)
893
+
894
+ while len(candidates) < num_candidates:
895
+ random_chunk = self.rng.uniform(-1.0, 1.0, size=teacher_chunk.shape).astype(np.float32)
896
+ candidates.append(random_chunk)
897
+ outcomes.append(self.evaluate_action_chunk(random_chunk, rollout_horizon=rollout_horizon))
898
+ candidate_macro_ids.append(len(candidate_macro_ids))
899
+ candidate_macro_names.append("random")
900
+ candidate_negative_families.append("random")
901
+ candidate_is_hard_negative.append(1.0)
902
  stacked_outcomes = {
903
  "candidate_rollout_support_mode": np.stack([item["rollout_support_mode"] for item in outcomes], axis=0).astype(np.int64),
904
+ "candidate_rollout_phase": np.stack([item["rollout_phase"] for item in outcomes], axis=0).astype(np.int64),
905
  "candidate_rollout_corridor_feasible": np.stack(
906
  [item["rollout_corridor_feasible"] for item in outcomes], axis=0
907
  ).astype(np.float32),
 
938
  ),
939
  "candidate_reocclusion_rate": np.asarray([item["reocclusion_rate"] for item in outcomes], dtype=np.float32),
940
  "candidate_visibility_integral": np.asarray([item["visibility_integral"] for item in outcomes], dtype=np.float32),
941
+ "candidate_actor_feasibility_auc": np.asarray([item["actor_feasibility_auc"] for item in outcomes], dtype=np.float32),
942
+ "candidate_reveal_achieved": np.asarray([item["reveal_achieved"] for item in outcomes], dtype=np.float32),
943
+ "candidate_hold_persistence": np.asarray([item["hold_persistence"] for item in outcomes], dtype=np.float32),
944
+ "candidate_support_stability_auc": np.asarray([item["support_stability_auc"] for item in outcomes], dtype=np.float32),
945
+ "candidate_disturbance_auc": np.asarray([item["disturbance_auc"] for item in outcomes], dtype=np.float32),
946
+ "candidate_macro_ids": np.asarray(candidate_macro_ids, dtype=np.int64),
947
+ "candidate_is_hard_negative": np.asarray(candidate_is_hard_negative, dtype=np.float32),
948
  }
949
  stacked_outcomes["candidate_risk"] = np.clip(
950
  stacked_outcomes["candidate_final_disturbance_cost"] + stacked_outcomes["candidate_reocclusion_rate"],
 
954
  stacked_outcomes["candidate_utility"] = (
955
  stacked_outcomes["candidate_retrieval_success"] - stacked_outcomes["candidate_risk"]
956
  ).astype(np.float32)
957
+ stacked_outcomes["candidate_macro_names"] = candidate_macro_names
958
+ stacked_outcomes["candidate_negative_families"] = candidate_negative_families
959
+ if self.proxy_name == FOLIAGE_PROXY.name:
960
+ stacked_outcomes["candidate_gap_width"] = np.asarray([item["candidate_gap_width"] for item in outcomes], dtype=np.float32)
961
+ stacked_outcomes["candidate_damage_proxy"] = np.asarray([item["candidate_damage_proxy"] for item in outcomes], dtype=np.float32)
962
+ elif self.proxy_name == BAG_PROXY.name:
963
+ stacked_outcomes["candidate_mouth_aperture"] = np.asarray([item["candidate_mouth_aperture"] for item in outcomes], dtype=np.float32)
964
+ stacked_outcomes["candidate_hold_quality"] = np.asarray([item["candidate_hold_quality"] for item in outcomes], dtype=np.float32)
965
+ stacked_outcomes["candidate_rim_slip_risk"] = np.asarray([item["candidate_rim_slip_risk"] for item in outcomes], dtype=np.float32)
966
+ elif self.proxy_name == CLOTH_PROXY.name:
967
+ stacked_outcomes["candidate_fold_preservation"] = np.asarray([item["candidate_fold_preservation"] for item in outcomes], dtype=np.float32)
968
+ stacked_outcomes["candidate_layer_separation_quality"] = np.asarray([item["candidate_layer_separation_quality"] for item in outcomes], dtype=np.float32)
969
+ stacked_outcomes["candidate_lift_too_much_risk"] = np.asarray([item["candidate_lift_too_much_risk"] for item in outcomes], dtype=np.float32)
970
  return np.stack(candidates, axis=0).astype(np.float32), stacked_outcomes
971
 
972
  def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies: [foliage_proxy, bag_proxy, cloth_proxy]
9
+ resolution: 224
10
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state
11
+ train_episodes_per_proxy: 48
12
+ val_episodes_per_proxy: 16
13
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
14
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
15
+ rebuild_dataset: false
16
+ chunk_horizon: 8
17
+ rollout_horizon: 5
18
+ history_steps: 6
19
+ planner_candidates: 8
20
+ seed: 17
21
+ optim:
22
+ epochs: 3
23
+ batch_size: 4
24
+ num_workers: 24
25
+ lr: 0.0001
26
+ weight_decay: 0.0001
27
+ trainer:
28
+ policy_type: elastic_reveal
29
+ use_bf16: true
30
+ grad_clip_norm: 1.0
31
+ freeze_backbone: true
32
+ gradient_checkpointing: false
33
+ plan_during_train: true
34
+ plan_during_eval: true
35
+ support_mode_conditioning: true
36
+ planner_mode: trainable
37
+ use_depth: true
38
+ use_world_model: true
39
+ use_role_tokens: true
40
+ compute_equivariance_probe: false
41
+ policy:
42
+ backbone:
43
+ model_name: openai/clip-vit-base-patch32
44
+ hidden_dim: 512
45
+ max_text_tokens: 32
46
+ freeze_backbone: true
47
+ gradient_checkpointing: false
48
+ use_dummy_backbone: false
49
+ fusion:
50
+ hidden_dim: 512
51
+ num_cameras: 3
52
+ num_layers: 4
53
+ num_heads: 8
54
+ ff_dim: 2048
55
+ dropout: 0.1
56
+ proprio_dim: 32
57
+ proprio_tokens: 1
58
+ memory:
59
+ hidden_dim: 512
60
+ action_dim: 14
61
+ history_steps: 6
62
+ scene_history_steps: 3
63
+ belief_history_steps: 8
64
+ num_layers: 2
65
+ dropout: 0.1
66
+ memory_bank_size: 4
67
+ scene_bank_size: 2
68
+ belief_bank_size: 2
69
+ num_heads: 8
70
+ max_history_steps: 8
71
+ decoder:
72
+ hidden_dim: 512
73
+ num_heads: 8
74
+ num_layers: 4
75
+ ff_dim: 2048
76
+ dropout: 0.1
77
+ chunk_size: 8
78
+ action_dim: 14
79
+ arm_action_dim: 7
80
+ num_candidates: 8
81
+ num_phases: 5
82
+ num_arm_roles: 4
83
+ num_proposal_modes: 7
84
+ planner_top_k: 4
85
+ reveal_head:
86
+ hidden_dim: 512
87
+ num_support_modes: 3
88
+ num_approach_templates: 32
89
+ rollout_horizon: 5
90
+ belief_map_size: 32
91
+ field_size: 16
92
+ num_heads: 8
93
+ predict_belief_map: true
94
+ num_phases: 5
95
+ num_arm_roles: 4
96
+ num_interaction_tokens: 8
97
+ num_tasks: 4
98
+ world_model:
99
+ hidden_dim: 512
100
+ action_dim: 14
101
+ num_support_modes: 3
102
+ num_approach_templates: 32
103
+ rollout_horizon: 5
104
+ field_size: 16
105
+ num_heads: 8
106
+ num_phases: 5
107
+ num_arm_roles: 4
108
+ num_interaction_tokens: 8
109
+ belief_map_size: 32
110
+ predict_belief_map: true
111
+ scene_bank_size: 2
112
+ belief_bank_size: 2
113
+ rollout_mode: compact_rollout
114
+ num_tasks: 4
115
+ planner:
116
+ hidden_dim: 512
117
+ num_candidates: 8
118
+ action_dim: 14
119
+ num_support_modes: 3
120
+ utility_margin: 0.1
121
+ num_heads: 8
122
+ num_layers: 2
123
+ num_phases: 5
124
+ num_arm_roles: 4
125
+ top_k: 4
126
+ loss_weights:
127
+ action: 1.0
128
+ phase: 0.05
129
+ arm_role: 0.1
130
+ support_mode: 0.1
131
+ corridor: 0.12
132
+ persistence: 0.06
133
+ disturbance: 0.06
134
+ world_model: 0.2
135
+ belief: 0.05
136
+ visibility: 0.05
137
+ clearance: 0.06
138
+ support_stability: 0.06
139
+ reocclusion: 0.06
140
+ occluder_contact: 0.05
141
+ grasp_affordance: 0.05
142
+ planner_success: 0.2
143
+ planner_risk: 0.08
144
+ planner_ranking: 0.2
145
+ proposal_reconstruction: 0.08
146
+ proposal_success: 0.12
147
+ proposal_ranking: 0.15
148
+ proposal_diversity: 0.05
149
+ role_swap_consistency: 0.02
150
+ task_metrics: 0.05
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff_phase
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies: [foliage_proxy, bag_proxy, cloth_proxy]
9
+ resolution: 224
10
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
11
+ train_episodes_per_proxy: 48
12
+ val_episodes_per_proxy: 16
13
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
14
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
15
+ rebuild_dataset: false
16
+ chunk_horizon: 8
17
+ rollout_horizon: 5
18
+ history_steps: 6
19
+ planner_candidates: 8
20
+ seed: 17
21
+ optim:
22
+ epochs: 3
23
+ batch_size: 4
24
+ num_workers: 24
25
+ lr: 0.0001
26
+ weight_decay: 0.0001
27
+ trainer:
28
+ policy_type: elastic_reveal
29
+ use_bf16: true
30
+ grad_clip_norm: 1.0
31
+ freeze_backbone: true
32
+ gradient_checkpointing: false
33
+ plan_during_train: true
34
+ plan_during_eval: true
35
+ support_mode_conditioning: true
36
+ planner_mode: trainable
37
+ use_depth: true
38
+ use_world_model: true
39
+ use_role_tokens: true
40
+ compute_equivariance_probe: false
41
+ policy:
42
+ backbone: {model_name: openai/clip-vit-base-patch32, hidden_dim: 512, max_text_tokens: 32, freeze_backbone: true, gradient_checkpointing: false, use_dummy_backbone: false}
43
+ fusion: {hidden_dim: 512, num_cameras: 3, num_layers: 4, num_heads: 8, ff_dim: 2048, dropout: 0.1, proprio_dim: 32, proprio_tokens: 1}
44
+ memory: {hidden_dim: 512, action_dim: 14, history_steps: 6, scene_history_steps: 3, belief_history_steps: 8, num_layers: 2, dropout: 0.1, memory_bank_size: 4, scene_bank_size: 2, belief_bank_size: 2, num_heads: 8, max_history_steps: 8}
45
+ decoder: {hidden_dim: 512, num_heads: 8, num_layers: 4, ff_dim: 2048, dropout: 0.1, chunk_size: 8, action_dim: 14, arm_action_dim: 7, num_candidates: 8, num_phases: 5, num_arm_roles: 4, num_proposal_modes: 7, planner_top_k: 4}
46
+ reveal_head: {hidden_dim: 512, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, belief_map_size: 32, field_size: 16, num_heads: 8, predict_belief_map: true, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, num_tasks: 4}
47
+ world_model: {hidden_dim: 512, action_dim: 14, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, field_size: 16, num_heads: 8, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, belief_map_size: 32, predict_belief_map: true, scene_bank_size: 2, belief_bank_size: 2, rollout_mode: compact_rollout, num_tasks: 4}
48
+ planner: {hidden_dim: 512, num_candidates: 8, action_dim: 14, num_support_modes: 3, utility_margin: 0.1, num_heads: 8, num_layers: 2, num_phases: 5, num_arm_roles: 4, top_k: 4}
49
+ loss_weights:
50
+ action: 1.0
51
+ phase: 0.08
52
+ arm_role: 0.1
53
+ support_mode: 0.1
54
+ corridor: 0.12
55
+ persistence: 0.06
56
+ disturbance: 0.06
57
+ world_model: 0.2
58
+ belief: 0.05
59
+ visibility: 0.05
60
+ clearance: 0.06
61
+ support_stability: 0.06
62
+ reocclusion: 0.06
63
+ occluder_contact: 0.05
64
+ grasp_affordance: 0.05
65
+ planner_success: 0.2
66
+ planner_risk: 0.08
67
+ planner_ranking: 0.2
68
+ proposal_reconstruction: 0.08
69
+ proposal_success: 0.12
70
+ proposal_ranking: 0.15
71
+ proposal_diversity: 0.05
72
+ role_swap_consistency: 0.02
73
+ task_metrics: 0.05
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies: [foliage_proxy, bag_proxy, cloth_proxy]
9
+ resolution: 224
10
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state
11
+ train_episodes_per_proxy: 48
12
+ val_episodes_per_proxy: 16
13
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
14
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
15
+ rebuild_dataset: false
16
+ chunk_horizon: 8
17
+ rollout_horizon: 5
18
+ history_steps: 6
19
+ planner_candidates: 8
20
+ seed: 17
21
+ optim:
22
+ epochs: 5
23
+ batch_size: 4
24
+ num_workers: 24
25
+ lr: 0.00015
26
+ weight_decay: 0.0001
27
+ trainer:
28
+ policy_type: elastic_reveal
29
+ use_bf16: true
30
+ grad_clip_norm: 1.0
31
+ freeze_backbone: true
32
+ gradient_checkpointing: false
33
+ plan_during_train: true
34
+ plan_during_eval: true
35
+ support_mode_conditioning: true
36
+ planner_mode: trainable
37
+ use_depth: true
38
+ use_world_model: true
39
+ use_role_tokens: true
40
+ compute_equivariance_probe: false
41
+ policy:
42
+ backbone:
43
+ model_name: openai/clip-vit-base-patch32
44
+ hidden_dim: 512
45
+ max_text_tokens: 32
46
+ freeze_backbone: true
47
+ gradient_checkpointing: false
48
+ use_dummy_backbone: false
49
+ fusion:
50
+ hidden_dim: 512
51
+ num_cameras: 3
52
+ num_layers: 4
53
+ num_heads: 8
54
+ ff_dim: 2048
55
+ dropout: 0.1
56
+ proprio_dim: 32
57
+ proprio_tokens: 1
58
+ memory:
59
+ hidden_dim: 512
60
+ action_dim: 14
61
+ history_steps: 6
62
+ scene_history_steps: 3
63
+ belief_history_steps: 8
64
+ num_layers: 2
65
+ dropout: 0.1
66
+ memory_bank_size: 4
67
+ scene_bank_size: 2
68
+ belief_bank_size: 2
69
+ num_heads: 8
70
+ max_history_steps: 8
71
+ decoder:
72
+ hidden_dim: 512
73
+ num_heads: 8
74
+ num_layers: 4
75
+ ff_dim: 2048
76
+ dropout: 0.1
77
+ chunk_size: 8
78
+ action_dim: 14
79
+ arm_action_dim: 7
80
+ num_candidates: 8
81
+ num_phases: 5
82
+ num_arm_roles: 4
83
+ num_proposal_modes: 7
84
+ planner_top_k: 4
85
+ reveal_head:
86
+ hidden_dim: 512
87
+ num_support_modes: 3
88
+ num_approach_templates: 32
89
+ rollout_horizon: 5
90
+ belief_map_size: 32
91
+ field_size: 16
92
+ num_heads: 8
93
+ predict_belief_map: true
94
+ num_phases: 5
95
+ num_arm_roles: 4
96
+ num_interaction_tokens: 8
97
+ num_tasks: 4
98
+ world_model:
99
+ hidden_dim: 512
100
+ action_dim: 14
101
+ num_support_modes: 3
102
+ num_approach_templates: 32
103
+ rollout_horizon: 5
104
+ field_size: 16
105
+ num_heads: 8
106
+ num_phases: 5
107
+ num_arm_roles: 4
108
+ num_interaction_tokens: 8
109
+ belief_map_size: 32
110
+ predict_belief_map: true
111
+ scene_bank_size: 2
112
+ belief_bank_size: 2
113
+ rollout_mode: spatial_rollout
114
+ num_tasks: 4
115
+ planner:
116
+ hidden_dim: 512
117
+ num_candidates: 8
118
+ action_dim: 14
119
+ num_support_modes: 3
120
+ utility_margin: 0.1
121
+ num_heads: 8
122
+ num_layers: 2
123
+ num_phases: 5
124
+ num_arm_roles: 4
125
+ top_k: 4
126
+ loss_weights:
127
+ action: 0.6
128
+ phase: 0.05
129
+ arm_role: 0.1
130
+ support_mode: 0.1
131
+ corridor: 0.15
132
+ persistence: 0.08
133
+ disturbance: 0.08
134
+ world_model: 0.35
135
+ belief: 0.05
136
+ visibility: 0.05
137
+ clearance: 0.08
138
+ support_stability: 0.08
139
+ reocclusion: 0.08
140
+ occluder_contact: 0.05
141
+ grasp_affordance: 0.05
142
+ planner_success: 0.25
143
+ planner_risk: 0.1
144
+ planner_ranking: 0.25
145
+ proposal_reconstruction: 0.05
146
+ proposal_success: 0.2
147
+ proposal_ranking: 0.25
148
+ proposal_diversity: 0.05
149
+ role_swap_consistency: 0.02
150
+ task_metrics: 0.1
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17
2
+ output_dir: /workspace/outputs/r3d_handoff_phase
3
+ device: cuda
4
+ seed: 17
5
+ init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
6
+ init_strict: false
7
+ data:
8
+ proxies: [foliage_proxy, bag_proxy, cloth_proxy]
9
+ resolution: 224
10
+ dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
11
+ train_episodes_per_proxy: 48
12
+ val_episodes_per_proxy: 16
13
+ train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
14
+ val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
15
+ rebuild_dataset: false
16
+ chunk_horizon: 8
17
+ rollout_horizon: 5
18
+ history_steps: 6
19
+ planner_candidates: 8
20
+ seed: 17
21
+ optim:
22
+ epochs: 4
23
+ batch_size: 4
24
+ num_workers: 24
25
+ lr: 0.00015
26
+ weight_decay: 0.0001
27
+ trainer:
28
+ policy_type: elastic_reveal
29
+ use_bf16: true
30
+ grad_clip_norm: 1.0
31
+ freeze_backbone: true
32
+ gradient_checkpointing: false
33
+ plan_during_train: true
34
+ plan_during_eval: true
35
+ support_mode_conditioning: true
36
+ planner_mode: trainable
37
+ use_depth: true
38
+ use_world_model: true
39
+ use_role_tokens: true
40
+ compute_equivariance_probe: false
41
+ policy:
42
+ backbone: {model_name: openai/clip-vit-base-patch32, hidden_dim: 512, max_text_tokens: 32, freeze_backbone: true, gradient_checkpointing: false, use_dummy_backbone: false}
43
+ fusion: {hidden_dim: 512, num_cameras: 3, num_layers: 4, num_heads: 8, ff_dim: 2048, dropout: 0.1, proprio_dim: 32, proprio_tokens: 1}
44
+ memory: {hidden_dim: 512, action_dim: 14, history_steps: 6, scene_history_steps: 3, belief_history_steps: 8, num_layers: 2, dropout: 0.1, memory_bank_size: 4, scene_bank_size: 2, belief_bank_size: 2, num_heads: 8, max_history_steps: 8}
45
+ decoder: {hidden_dim: 512, num_heads: 8, num_layers: 4, ff_dim: 2048, dropout: 0.1, chunk_size: 8, action_dim: 14, arm_action_dim: 7, num_candidates: 8, num_phases: 5, num_arm_roles: 4, num_proposal_modes: 7, planner_top_k: 4}
46
+ reveal_head: {hidden_dim: 512, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, belief_map_size: 32, field_size: 16, num_heads: 8, predict_belief_map: true, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, num_tasks: 4}
47
+ world_model: {hidden_dim: 512, action_dim: 14, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, field_size: 16, num_heads: 8, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, belief_map_size: 32, predict_belief_map: true, scene_bank_size: 2, belief_bank_size: 2, rollout_mode: spatial_rollout, num_tasks: 4}
48
+ planner: {hidden_dim: 512, num_candidates: 8, action_dim: 14, num_support_modes: 3, utility_margin: 0.1, num_heads: 8, num_layers: 2, num_phases: 5, num_arm_roles: 4, top_k: 4}
49
+ loss_weights:
50
+ action: 0.6
51
+ phase: 0.08
52
+ arm_role: 0.1
53
+ support_mode: 0.1
54
+ corridor: 0.15
55
+ persistence: 0.08
56
+ disturbance: 0.08
57
+ world_model: 0.35
58
+ belief: 0.05
59
+ visibility: 0.05
60
+ clearance: 0.08
61
+ support_stability: 0.08
62
+ reocclusion: 0.08
63
+ occluder_contact: 0.05
64
+ grasp_affordance: 0.05
65
+ planner_success: 0.25
66
+ planner_risk: 0.1
67
+ planner_ranking: 0.25
68
+ proposal_reconstruction: 0.05
69
+ proposal_success: 0.2
70
+ proposal_ranking: 0.25
71
+ proposal_diversity: 0.05
72
+ role_swap_consistency: 0.02
73
+ task_metrics: 0.1
code/reveal_vla_bimanual/train/losses.py CHANGED
@@ -32,6 +32,7 @@ class LossWeights:
32
  proposal_ranking: float = 0.05
33
  proposal_diversity: float = 0.05
34
  role_swap_consistency: float = 0.05
 
35
 
36
 
37
  def chunk_bc_loss(pred_actions: Tensor, target_actions: Tensor, mask: Tensor | None = None) -> Tensor:
@@ -113,12 +114,15 @@ def _resize_like(target: Tensor, prediction: Tensor) -> Tensor:
113
  def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weights: LossWeights) -> dict[str, Tensor]:
114
  losses = {}
115
  if "phase_logits" in pred:
116
- action_chunk = target.get("action_chunk")
117
- if action_chunk is not None:
118
- phase_target = infer_phase_targets_from_actions(action_chunk[:, 0])
119
  else:
120
- phase_map = torch.as_tensor([2, 3, 0], device=target["support_mode"].device, dtype=torch.long)
121
- phase_target = phase_map[target["support_mode"].long()]
 
 
 
 
122
  losses["phase"] = F.cross_entropy(pred["phase_logits"], phase_target)
123
  else:
124
  losses["phase"] = pred["support_mode_logits"].new_tensor(0.0)
@@ -190,6 +194,33 @@ def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weight
190
  losses["uncertainty"] = pred["persistence_uncertainty"].mean()
191
  else:
192
  losses["uncertainty"] = pred["support_mode_logits"].new_tensor(0.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  return losses
194
 
195
 
@@ -221,6 +252,8 @@ def world_model_rollout_consistency_loss(pred_rollout: dict[str, Tensor], target
221
  "disturbance_cost": _expand_target(target_rollout["disturbance_cost"][..., :horizon]),
222
  "action_chunk": _expand_target(target_rollout["action_chunk"][..., :horizon, :]),
223
  }
 
 
224
  loss = (
225
  F.cross_entropy(
226
  pred_rollout["support_mode_logits"].reshape(-1, pred_rollout["support_mode_logits"].shape[-1]),
@@ -234,7 +267,9 @@ def world_model_rollout_consistency_loss(pred_rollout: dict[str, Tensor], target
234
  + F.mse_loss(pred_rollout["disturbance_cost"], target_rollout["disturbance_cost"].float())
235
  )
236
  if "phase_logits" in pred_rollout:
237
- phase_target = infer_phase_targets_from_actions(target_rollout["action_chunk"])
 
 
238
  loss = loss + 0.5 * F.cross_entropy(
239
  pred_rollout["phase_logits"].reshape(-1, pred_rollout["phase_logits"].shape[-1]),
240
  phase_target.reshape(-1),
@@ -300,6 +335,7 @@ def compute_total_loss(
300
  + weights.occluder_contact * reveal_losses["occluder_contact"]
301
  + weights.grasp_affordance * reveal_losses["grasp_affordance"]
302
  + weights.reocclusion * reveal_losses["reocclusion"]
 
303
  + 0.01 * reveal_losses["uncertainty"]
304
  )
305
 
@@ -314,6 +350,8 @@ def compute_total_loss(
314
  "disturbance_cost": batch["candidate_rollout_disturbance_cost"],
315
  "action_chunk": batch["candidate_action_chunks"],
316
  }
 
 
317
  for optional_key in (
318
  "candidate_rollout_belief_map",
319
  "candidate_rollout_visibility_map",
@@ -344,6 +382,8 @@ def compute_total_loss(
344
  "disturbance_cost": batch["rollout_disturbance_cost"],
345
  "action_chunk": batch["action_chunk"],
346
  }
 
 
347
  for optional_key in (
348
  "rollout_belief_map",
349
  "rollout_visibility_map",
 
32
  proposal_ranking: float = 0.05
33
  proposal_diversity: float = 0.05
34
  role_swap_consistency: float = 0.05
35
+ task_metrics: float = 0.05
36
 
37
 
38
  def chunk_bc_loss(pred_actions: Tensor, target_actions: Tensor, mask: Tensor | None = None) -> Tensor:
 
114
  def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weights: LossWeights) -> dict[str, Tensor]:
115
  losses = {}
116
  if "phase_logits" in pred:
117
+ if "phase" in target:
118
+ phase_target = target["phase"].long()
 
119
  else:
120
+ action_chunk = target.get("action_chunk")
121
+ if action_chunk is not None:
122
+ phase_target = infer_phase_targets_from_actions(action_chunk[:, 0])
123
+ else:
124
+ phase_map = torch.as_tensor([2, 3, 0], device=target["support_mode"].device, dtype=torch.long)
125
+ phase_target = phase_map[target["support_mode"].long()]
126
  losses["phase"] = F.cross_entropy(pred["phase_logits"], phase_target)
127
  else:
128
  losses["phase"] = pred["support_mode_logits"].new_tensor(0.0)
 
194
  losses["uncertainty"] = pred["persistence_uncertainty"].mean()
195
  else:
196
  losses["uncertainty"] = pred["support_mode_logits"].new_tensor(0.0)
197
+ task_metric_pairs = (
198
+ "opening_quality",
199
+ "actor_feasibility_score",
200
+ "gap_width",
201
+ "damage_proxy",
202
+ "release_collapse_rate",
203
+ "target_visibility_confidence",
204
+ "mouth_aperture",
205
+ "hold_quality",
206
+ "rim_slip_risk",
207
+ "insertable_actor_corridor",
208
+ "layer_separation_quality",
209
+ "fold_preservation",
210
+ "insertion_corridor",
211
+ "top_layer_stability",
212
+ "lift_too_much_risk",
213
+ )
214
+ task_losses = [
215
+ F.mse_loss(pred[key].float(), target[key].float())
216
+ for key in task_metric_pairs
217
+ if key in pred and key in target
218
+ ]
219
+ losses["task_metrics"] = (
220
+ torch.stack(task_losses).mean()
221
+ if task_losses
222
+ else pred["support_mode_logits"].new_tensor(0.0)
223
+ )
224
  return losses
225
 
226
 
 
252
  "disturbance_cost": _expand_target(target_rollout["disturbance_cost"][..., :horizon]),
253
  "action_chunk": _expand_target(target_rollout["action_chunk"][..., :horizon, :]),
254
  }
255
+ if "phase" in target_rollout:
256
+ target_rollout["phase"] = _expand_target(target_rollout["phase"][..., :horizon])
257
  loss = (
258
  F.cross_entropy(
259
  pred_rollout["support_mode_logits"].reshape(-1, pred_rollout["support_mode_logits"].shape[-1]),
 
267
  + F.mse_loss(pred_rollout["disturbance_cost"], target_rollout["disturbance_cost"].float())
268
  )
269
  if "phase_logits" in pred_rollout:
270
+ phase_target = target_rollout.get("phase")
271
+ if phase_target is None:
272
+ phase_target = infer_phase_targets_from_actions(target_rollout["action_chunk"])
273
  loss = loss + 0.5 * F.cross_entropy(
274
  pred_rollout["phase_logits"].reshape(-1, pred_rollout["phase_logits"].shape[-1]),
275
  phase_target.reshape(-1),
 
335
  + weights.occluder_contact * reveal_losses["occluder_contact"]
336
  + weights.grasp_affordance * reveal_losses["grasp_affordance"]
337
  + weights.reocclusion * reveal_losses["reocclusion"]
338
+ + weights.task_metrics * reveal_losses["task_metrics"]
339
  + 0.01 * reveal_losses["uncertainty"]
340
  )
341
 
 
350
  "disturbance_cost": batch["candidate_rollout_disturbance_cost"],
351
  "action_chunk": batch["candidate_action_chunks"],
352
  }
353
+ if "candidate_rollout_phase" in batch:
354
+ rollout_target["phase"] = batch["candidate_rollout_phase"]
355
  for optional_key in (
356
  "candidate_rollout_belief_map",
357
  "candidate_rollout_visibility_map",
 
382
  "disturbance_cost": batch["rollout_disturbance_cost"],
383
  "action_chunk": batch["action_chunk"],
384
  }
385
+ if "rollout_phase" in batch:
386
+ rollout_target["phase"] = batch["rollout_phase"]
387
  for optional_key in (
388
  "rollout_belief_map",
389
  "rollout_visibility_map",
results/2026-03-25-runpod/README.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 2026-03-25 Runpod Raw Index
2
+
3
+ This directory contains the source handoff instructions, raw report files copied from `/workspace/reports`, and an index of the raw artifacts produced during the `2026-03-25 UTC` session.
4
+
5
+ ## Source Handoff
6
+
7
+ - `instructions.md`
8
+
9
+ ## Test Suite
10
+
11
+ - Command:
12
+ - `PYTHONPATH=/workspace/VLAarchtests_work/code/reveal_vla_bimanual python -m pytest -q /workspace/VLAarchtests_work/tests`
13
+ - Result:
14
+ - `33 passed`
15
+
16
+ ## Generated Datasets
17
+
18
+ | Path | Size (bytes) |
19
+ | --- | ---: |
20
+ | `artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt` | 583377508 |
21
+ | `artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt` | 200844508 |
22
+
23
+ ## Generated Checkpoints And Training Summaries
24
+
25
+ | Directory | final_train_total | final_val_total | train_time_sec | peak_gpu_memory_mb |
26
+ | --- | ---: | ---: | ---: | ---: |
27
+ | `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/` | 0.382780 | 0.372276 | 108.922691 | 2451.385742 |
28
+ | `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/` | 0.518003 | 0.503869 | 163.313406 | 2924.821777 |
29
+ | `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/` | 0.385303 | 0.378928 | 128.965583 | 2450.287109 |
30
+ | `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/` | 0.525366 | 0.507625 | 154.841441 | 2926.074707 |
31
+
32
+ ## Proxy Result Files
33
+
34
+ ### Serious Comparisons
35
+
36
+ | File | Reference mean success | Compared mean success | Compared foliage | Compared bag | Compared cloth |
37
+ | --- | ---: | ---: | ---: | ---: | ---: |
38
+ | `reports/reveal_handoff_compare_serious/reveal_benchmark.json` | 0.583333 | 0.216667 | 0.330000 | 0.150000 | 0.170000 |
39
+ | `reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.520000 | 0.660000 | 0.320000 | 0.580000 |
40
+ | `reports/reveal_phase_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.513333 | 0.570000 | 0.420000 | 0.550000 |
41
+ | `reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json` | 0.583333 | 0.493333 | 0.640000 | 0.330000 | 0.510000 |
42
+
43
+ ### Compact-Phase Ablation Matrix
44
+
45
+ | Ablation | mean_success | visibility_integral | reocclusion_rate | disturbance_cost |
46
+ | --- | ---: | ---: | ---: | ---: |
47
+ | `full_model` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
48
+ | `no_geometry` | 0.513333 | 39.983892 | 0.000000 | 0.343637 |
49
+ | `no_spatial_memory` | 0.496667 | 37.758093 | 0.002417 | 0.417673 |
50
+ | `compact_world_model` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
51
+ | `no_planner` | 0.433333 | 20.634101 | 0.019708 | 0.185775 |
52
+ | `gaussian_candidates_only` | 0.466667 | 16.719086 | 0.029561 | 0.477573 |
53
+ | `no_task_head` | 0.513333 | 38.128876 | 0.000000 | 0.344494 |
54
+ | `no_support_mode_conditioning` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
55
+
56
+ Files:
57
+
58
+ - `reports/reveal_phase_ablations_compact/ablations.json`
59
+ - `reports/reveal_phase_ablations_compact/ablations.md`
60
+ - `reports/reveal_phase_ablations_compact/ablations.partial.json`
61
+ - `reports/reveal_phase_ablations_spatial/ablations.partial.json`
62
+
63
+ ### Teacher Audit
64
+
65
+ | Proxy | Baseline | teacher_success | baseline_success | success_delta | teacher_utility | baseline_utility | utility_delta |
66
+ | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
67
+ | `foliage_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | 0.140528 | 1.057904 |
68
+ | `foliage_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | -0.099185 | 1.297617 |
69
+ | `foliage_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | 0.084153 | 1.114280 |
70
+ | `foliage_proxy` | `random` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | -0.138381 | 1.336814 |
71
+ | `bag_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | 0.167617 | 1.028974 |
72
+ | `bag_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | -0.161481 | 1.358072 |
73
+ | `bag_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | 0.078455 | 1.118136 |
74
+ | `bag_proxy` | `random` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | -0.181732 | 1.378323 |
75
+ | `cloth_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.347192 | 0.929249 |
76
+ | `cloth_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.001142 | 1.275299 |
77
+ | `cloth_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.507900 | 0.768540 |
78
+ | `cloth_proxy` | `random` | 1.000000 | 0.010000 | 0.990000 | 1.276440 | 0.166110 | 1.110330 |
79
+
80
+ Files:
81
+
82
+ - `reports/reveal_teacher_audit_serious/teacher_audit.json`
83
+ - `reports/reveal_teacher_audit_serious/teacher_audit.md`
84
+
85
+ ### Additional Proxy Report Files
86
+
87
+ - `reports/reveal_smoke_mod/reveal_benchmark.json`
88
+ - `reports/reveal_smoke_nogeom/reveal_benchmark.json`
89
+ - `reports/reveal_smoke_noplanner/reveal_benchmark.json`
90
+ - `reports/reveal_handoff_compact_probe/reveal_benchmark.json`
91
+ - `reports/reveal_handoff_compact_train_probe/reveal_benchmark.json`
92
+ - `reports/reveal_phase_probe_compact/reveal_benchmark.json`
93
+ - `reports/reveal_phase_probe_spatial/reveal_benchmark.json`
94
+ - `reports/reveal_phase_probe_spatial_compactwm/reveal_benchmark.json`
95
+
96
+ ## RLBench Result Files
97
+
98
+ ### Full-Split PerAct2 Rollout Outputs
99
+
100
+ | File | plan_requested | plan_applied | mean_success |
101
+ | --- | --- | --- | ---: |
102
+ | `reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/rollout_eval.json` | `true` | `true` | 0.000000 |
103
+ | `reports/peract2_spatial_full_ep1/spatial_phase_seed17_noplan_split/rollout_eval.json` | `false` | `false` | 0.000000 |
104
+ | `reports/peract2_spatial_full_ep1/spatial_phase_seed17_plan_split/rollout_eval.json` | `true` | `true` | 0.000000 |
105
+
106
+ ### Single-Task RLBench Debug Outputs
107
+
108
+ - `reports/rlbench_debug_baseline_pushbox/rollout_eval.json`
109
+ - `reports/rlbench_debug_compact_pushbox/rollout_eval.json`
110
+ - `reports/rlbench_debug_spatial_pushbox_nogeom/rollout_eval.json`
111
+
112
+ ## Environment Recreation Files
113
+
114
+ - `environment/README.md`
115
+ - `environment/setup_same_machine.sh`
116
+ - `environment/validate_same_machine.sh`
117
+ - `environment/runtime_env_vars.sh`
118
+ - `environment/hardware_snapshot.txt`
119
+ - `environment/glxinfo_B.txt`
120
+ - `environment/upstream_revisions.txt`
121
+ - `environment/system_packages_same_machine.txt`
122
+ - `environment/rlbench_env_export.yaml`
123
+ - `environment/rlbench_env_explicit.txt`
124
+ - `environment/rlbench_pip_freeze.txt`
results/2026-03-25-runpod/instructions.md ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Developer handoff: structured bimanual reveal-and-retrieve under elastic occlusion
2
+
3
+ Repo target: `lsnu/VLAarchtests` (current `main`, latest post-fix state). This handoff is written against the current `elastic_reveal` stack, not the older intermediate variants.
4
+
5
+ ## 1. Project introduction
6
+
7
+ This project is a structured bimanual policy stack for reveal-and-retrieve tasks under partial observability and deformable or elastic occlusion. The eventual real-world targets are three Dobot X-trainer environments. The first is dense live foliage with hidden fake snails, where one arm must create and maintain a canopy gap while the other arm retrieves the target safely. The second is bag opening and retrieval, where one arm must open and hold the bag mouth while the other arm retrieves the target item. The third is suitcase or folded-cloth retrieval, where one arm must slightly lift and stabilize clothing layers while the other arm retrieves a hidden item without destroying the fold structure.
8
+
9
+ The current repo already contains the right broad decomposition for this task family. It has a multi-view visual backbone, RGB-D support, an explicit reveal state head, observation memory, a compact world model, a coordinated bimanual action decoder, and a planner. The problem is not the structural idea. The problem is that several important pieces are only partially wired, too compact, or only validated on teacher-shaped proxy data. The current code is a good scaffold. It is not yet strong enough to justify “beats SOTA” claims on either public benchmarks or the three target task families.
10
+
11
+ The current public evidence should be read narrowly. The most credible positive result in the repo is that RGB-D helps on the proxy benchmark. The planner, world model, and role-symmetry components are not yet validated strongly enough to claim they are the source of the gains. The RLBench / PerAct2 integration is also still mostly a launch and plumbing layer, not a mature benchmark suite.
12
+
13
+ This handoff therefore has one purpose. Keep the structured reveal-and-retrieve idea, but harden the architecture and evaluation until there is a realistic chance of beating strong bimanual baselines on the three target environments.
14
+
15
+ ## 2. Current repo status (what exists, what is missing)
16
+
17
+ The current core files are:
18
+
19
+ `code/reveal_vla_bimanual/models/backbones.py`
20
+ `code/reveal_vla_bimanual/models/multiview_fusion.py`
21
+ `code/reveal_vla_bimanual/models/observation_memory.py`
22
+ `code/reveal_vla_bimanual/models/reveal_head.py`
23
+ `code/reveal_vla_bimanual/models/world_model.py`
24
+ `code/reveal_vla_bimanual/models/action_decoder.py`
25
+ `code/reveal_vla_bimanual/models/planner.py`
26
+ `code/reveal_vla_bimanual/models/policy.py`
27
+ `code/reveal_vla_bimanual/train/losses.py`
28
+ `code/reveal_vla_bimanual/sim_reveal/dataset.py`
29
+ `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
30
+ `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
31
+ `code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py`
32
+ `code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py`
33
+
34
+ The current proxy benchmark already uses the correct three abstract task types (`foliage`, `bag`, `cloth`), and the current dataset code has explicit no-leak assertions. Both are solid foundations and should be kept.
35
+
36
+ The current weaknesses are specific and fixable.
37
+
38
+ First, the geometry path is only partially wired. The backbone produces `depth_tokens`, `geometry_tokens`, and `camera_tokens`, but the policy only forwards RGB, depth, and camera tokens into fusion. The explicit `geometry_tokens` are dropped before fusion. In addition, camera geometry is incomplete. The current depth adapter encodes intrinsics and camera translation, but not an equally explicit camera rotation representation. For three-camera reveal tasks this is a real omission.
39
+
40
+ Second, memory is too pooled and too global. The current memory path reduces scene history to pooled tokens before write decisions and bank updates. That is a novelty-gated summary memory. It is not a spatial occlusion memory. That is not enough for “hold the opening”, “the target is still probably behind this flap”, or “reveal progress will collapse if the revealer arm releases now”.
41
+
42
+ Third, the world model is too compact. It is useful as a scaffold, but not as the state-transition core for elastic foliage, bag apertures, or layered cloth. It currently rolls a compact hidden state rather than a spatial field state. That makes it too weak for counterfactual planning over opening persistence, reocclusion, and safe actor insertion.
43
+
44
+ Fourth, the planner is not trained on hard enough candidates. The current proxy data generation uses the teacher chunk and mostly Gaussian perturbations around it. That is enough to test ranking near a teacher, but not enough to teach the planner the actual failure modes that matter in these tasks (premature retrieval, releasing the opening, over-disturbing the scene, lifting the wrong cloth edge, etc.).
45
+
46
+ Fifth, the state head is still too generic. It predicts a useful set of reveal-related fields, but it does not yet expose the right task-specific latent variables for foliage, bag, and folded cloth. Those tasks are not the same. They share the same reveal-and-retrieve pattern, but they do not share the same dominant failure modes.
47
+
48
+ Sixth, the test suite is mostly contract-level. Those tests are useful, but they do not yet prove that the structured components work behaviorally. The RLBench side is similar. The launch smoke is only a plumbing check. The actual rollout evaluator exists, but it needs to become the main public benchmark path.
49
+
50
+ ## 3. The main design decision
51
+
52
+ Do not collapse this into a generic monolithic VLA. That is not the likely win condition for these tasks.
53
+
54
+ The highest-probability path is a stronger visual backbone plus an explicit structured reveal-and-retrieve stack. The reason is simple. Your target tasks are asymmetric, partially observable, persistence-sensitive, and reocclusion-sensitive. One arm often has to create and maintain a temporary affordance that only exists because of that arm’s continued state. Generic end-to-end BC can sometimes imitate the behavior, but these tasks strongly reward explicit representations of opening quality, hold persistence, target belief, reocclusion risk, and actor feasibility.
55
+
56
+ The structured architecture should stay. It should just become spatial, task-aware, and evaluated honestly.
57
+
58
+ ## 4. Mandatory code changes
59
+
60
+ ### 4.1 Fix and strengthen the geometry path
61
+
62
+ Files to change:
63
+
64
+ `models/backbones.py`
65
+ `models/multiview_fusion.py`
66
+ `models/policy.py`
67
+ `tests/test_rgbd_forward_contract.py` (extend)
68
+ Add new tests: `tests/test_geometry_tokens_propagate.py`, `tests/test_camera_rotation_geometry.py`
69
+
70
+ Exact changes:
71
+
72
+ In `models/policy.py`, update the image encoding path so that `geometry_tokens` are passed from `backbone.encode_images(..., return_aux=True)` into the fusion module. Right now the policy forwards `rgb_tokens`, `depth_tokens`, and `camera_tokens`, but not `geometry_tokens`. This should be corrected first because it is an actual information-drop bug.
73
+
74
+ In `models/multiview_fusion.py`, update the fusion interface to accept explicit `geometry_tokens`. The geometry attention path should fuse from a real concatenation or gated combination of `[depth_tokens, geometry_tokens, camera_tokens]`, rather than synthesizing “geometry” only from the surviving depth and camera paths. Keep the existing gated cross-attention pattern, but make the geometry path explicit and inspectable.
75
+
76
+ In `models/backbones.py`, upgrade `DepthPatchAdapter` so that geometry features include camera orientation. Use a 6D rotation representation or a normalized quaternion plus translation. Also add per-patch viewing ray directions derived from intrinsics and camera pose. The three target environments all rely on view geometry and persistent multi-view correspondence. The current translation-only pose treatment is too weak.
77
+
78
+ Add config flags that actually do something. The current `use_camera_geometry` style config needs to gate a real path, not just exist as a dormant option. Add separate switches for `use_depth_tokens`, `use_geometry_tokens`, and `use_camera_pose_tokens` so ablations are clean.
79
+
80
+ Why this matters: the foliage and bag tasks are especially sensitive to camera geometry because small apparent gaps can be fake from one viewpoint and usable from another. The actor feasibility estimate should depend on geometry, not just appearance.
81
+
82
+ ### 4.2 Replace pooled novelty memory with spatial reveal memory
83
+
84
+ Files to change:
85
+
86
+ `models/observation_memory.py`
87
+ `models/policy.py`
88
+ `models/reveal_head.py`
89
+ Add new tests: `tests/test_spatial_memory_occlusion_persistence.py`, `tests/test_memory_slot_write_gating.py`, `tests/test_reocclusion_memory_regression.py`
90
+
91
+ Exact changes:
92
+
93
+ Keep the current memory modules as a fallback baseline, but add a new default path that stores low-resolution spatial memory instead of only pooled history summaries. The simplest realistic version is a two-branch memory:
94
+
95
+ 1. scene memory: a small bank of view-conditioned or canonicalized spatial tokens for persistent geometry and support structure;
96
+ 2. belief memory: a spatial target-belief / reveal-state memory that carries uncertainty explicitly.
97
+
98
+ The memory does not need to be large. An 8×8 or 12×12 field token grid per view (or a shared canonical field) is enough. The key requirement is that the write gate becomes spatial or slot-wise, not global only. The model must be able to update “the mouth is open here” without overwriting “the target is probably still here”.
99
+
100
+ Add explicit channels or latent heads for:
101
+ - newly revealed regions
102
+ - still-visible regions
103
+ - reoccluded regions
104
+ - persistent hold or opening quality
105
+ - target belief uncertainty
106
+
107
+ The world model and planner should consume this spatial memory directly. Do not average it away before planning.
108
+
109
+ Why this matters: a reveal-and-retrieve policy that forgets where the useful opening was, or where the hidden object probably still is, will look competent in one-step imitation and fail in multi-step retrieval.
110
+
111
+ ### 4.3 Replace the compact world model with a spatial rollout model
112
+
113
+ Files to change:
114
+
115
+ `models/world_model.py`
116
+ `models/policy.py`
117
+ `train/losses.py`
118
+ Add new tests: `tests/test_world_model_null_rollout.py`, `tests/test_world_model_identity_rollout.py`, `tests/test_world_model_field_consistency.py`, `tests/test_world_model_task_adapter.py`
119
+
120
+ Exact changes:
121
+
122
+ Keep the current compact GRU world model only as an ablation. The default model should become a spatial latent rollout over field tokens or low-resolution maps. A realistic implementation is a ConvGRU or a token-wise recurrent transformer over a low-resolution field state. The world-model state should contain at least:
123
+
124
+ - target belief field
125
+ - visibility or reveal field
126
+ - actor feasibility / corridor field
127
+ - opening quality or hold quality field
128
+ - persistence field
129
+ - disturbance / damage risk field
130
+ - reocclusion risk field
131
+ - support stability field
132
+
133
+ Add task conditioning directly into the world model. A learned task embedding (`foliage`, `bag`, `cloth`) should modulate the transition. The dynamics are not the same and should not be forced into one unstructured transition model.
134
+
135
+ Retain explicit ablation modes inside `models/world_model.py`:
136
+ - `identity_rollout`
137
+ - `null_rollout`
138
+ - `compact_rollout` (the current baseline)
139
+ - `spatial_rollout` (new default)
140
+
141
+ These ablations must be real and deterministic. The world-model ablation confusion in the current repo shows why this needs to be explicit and unit-tested.
142
+
143
+ Why this matters: the planner can only beat a simple decoder if its counterfactual rollouts capture persistence and collapse. Without a spatial world model, the “maintain opening while actor advances” pattern will be under-modeled.
144
+
145
+ ### 4.4 Make the reveal head task-aware
146
+
147
+ Files to change:
148
+
149
+ `models/reveal_head.py`
150
+ `train/losses.py`
151
+ `sim_reveal/dataset.py`
152
+ `sim_reveal/procedural_envs.py`
153
+ Add new tests: `tests/test_task_conditioned_head_shapes.py`, `tests/test_task_metric_monotonicity.py`
154
+
155
+ Exact changes:
156
+
157
+ Add a task embedding to the reveal head. Keep the shared trunk, but use task-specific adapters or low-rank heads for the final outputs. The head should still produce common fields, but each task must also expose the state variables that actually matter.
158
+
159
+ For foliage, add:
160
+ - gap width or reveal corridor width
161
+ - canopy strain / damage risk
162
+ - occluder return tendency (reocclusion after release)
163
+ - target visibility confidence under flexible occluders
164
+
165
+ For bag, add:
166
+ - mouth aperture width or area
167
+ - rim endpoint or rim grasp quality
168
+ - hold quality
169
+ - rim slip risk
170
+ - insertable actor corridor
171
+
172
+ For cloth or suitcase, add:
173
+ - layer separation quality
174
+ - fold-preservation score
175
+ - insertion corridor
176
+ - top-layer stability
177
+ - “lift too much” risk
178
+
179
+ The current generic fields (`actor_feasibility_field`, `persistence_field`, `risk_field`, `uncertainty_field`, `reocclusion`) are useful, but they are not enough. The planner needs the task-specific variables because the right action for bag opening is not the right action for layered cloth.
180
+
181
+ ### 4.5 Replace Gaussian candidate noise with semantic macro candidates plus continuous refinement
182
+
183
+ Files to change:
184
+
185
+ `models/action_decoder.py`
186
+ `models/planner.py`
187
+ `sim_reveal/dataset.py`
188
+ `sim_reveal/procedural_envs.py`
189
+ Add new tests: `tests/test_candidate_macro_coverage.py`, `tests/test_planner_reocclusion_gating.py`, `tests/test_proposal_semantic_diversity.py`
190
+
191
+ Exact changes:
192
+
193
+ Keep the current proposal mechanism as a fallback. The default candidate set should become a set of semantic macro modes, each refined by continuous deltas.
194
+
195
+ The candidate vocabulary should be task-aware.
196
+
197
+ For foliage:
198
+ - `sweep_left`
199
+ - `sweep_right`
200
+ - `pin_canopy`
201
+ - `widen_gap`
202
+ - `maintain_gap`
203
+ - `insert_actor`
204
+ - `retrieve`
205
+
206
+ For bag:
207
+ - `pin_left_rim`
208
+ - `pin_right_rim`
209
+ - `widen_mouth`
210
+ - `maintain_mouth`
211
+ - `probe_inside`
212
+ - `insert_actor`
213
+ - `retrieve`
214
+
215
+ For cloth:
216
+ - `lift_edge`
217
+ - `separate_layer`
218
+ - `stabilize_fold`
219
+ - `maintain_lift`
220
+ - `insert_actor`
221
+ - `retrieve`
222
+
223
+ Represent these as discrete proposal tokens or a macro head in `action_decoder.py`, then produce continuous chunk deltas conditioned on the chosen macro. The planner should shortlist across macro families first and refine within each family second. That prevents “all candidates are tiny perturbations around the same wrong idea”.
224
+
225
+ In `models/planner.py`, add hard feasibility gates before utility aggregation. Do not let the planner prefer “retrieve now” if actor feasibility, hold quality, or support stability are below threshold. Use worst-step or CVaR-style (conditional value-at-risk) penalties for reocclusion and collapse, rather than only mean penalties. These tasks fail on bad tails, not just on averages.
226
+
227
+ Why this matters: the current planner is too dependent on easy local ranking. Real reveal-and-retrieve requires semantically different plans, not just slightly different noise vectors.
228
+
229
+ ### 4.6 Change the loss stack to supervise what actually matters
230
+
231
+ Files to change:
232
+
233
+ `train/losses.py`
234
+ `train/trainer.py` (if needed for logging)
235
+ Add new tests: `tests/test_candidate_ranking_loss.py`, `tests/test_phase_labels_not_action_only.py`, `tests/test_planner_gradient_flow.py`
236
+
237
+ Exact changes:
238
+
239
+ Reduce dependence on heuristic phase labels inferred from the current action chunk. That heuristic is acceptable for early bootstrapping, but it should not remain the main source of phase supervision. Prefer simulator-side phase or subgoal labels where available. If those are not reliable, phase should be a weak auxiliary, not a strong driver.
240
+
241
+ Add pairwise or listwise ranking loss over candidate action chunks using actual rollout utility labels. These labels should come from simulated outcomes, not just from “teacher is first, noise is worse”.
242
+
243
+ Add consistency losses:
244
+ - predicted opening quality should correlate with rollout persistence
245
+ - predicted reocclusion should correlate with actual collapse after release
246
+ - predicted uncertainty should be calibrated against outcome uncertainty or visibility error
247
+
248
+ Lower the relative weight of pure behavior cloning once ranking and rollout supervision are reliable. This project should not stay as BC-with-many-auxiliaries.
249
+
250
+ ## 5. Mandatory data-generation changes
251
+
252
+ Files to change:
253
+
254
+ `sim_reveal/dataset.py`
255
+ `sim_reveal/procedural_envs.py`
256
+ Add new tests: `tests/test_dataset_hard_negative_presence.py`, `tests/test_no_leak_with_new_labels.py`, `tests/test_teacher_audit.py`
257
+
258
+ Exact changes:
259
+
260
+ The dataset generation path must stop relying on teacher-plus-Gaussian-noise as the dominant source of planner candidates. Keep the teacher as one source, but add hard negative families that reflect actual task failures.
261
+
262
+ Required negative families for all three tasks:
263
+
264
+ 1. premature retrieve: actor attempts retrieval before corridor and hold quality are sufficient;
265
+ 2. reveal-with-release: revealer creates an opening but fails to maintain it;
266
+ 3. over-disturbance: revealer opens aggressively but causes collapse or damage risk;
267
+ 4. wrong-side or wrong-edge reveal: the opening is created in a useless place;
268
+ 5. delayed actor entry: revealer holds too long and wastes time or destabilizes the scene;
269
+ 6. actor path through weak corridor: actor enters where access exists visually but not safely.
270
+
271
+ Required task-specific negative families:
272
+
273
+ For foliage:
274
+ - swipe that increases visibility briefly but induces immediate reocclusion;
275
+ - push direction that hides the target from the actor side;
276
+ - gap on the wrong side of the target.
277
+
278
+ For bag:
279
+ - one-rim lift that slips instead of widening the mouth;
280
+ - opening wide enough visually but not stable enough for actor insertion;
281
+ - actor reaches through the fabric instead of through the aperture.
282
+
283
+ For cloth:
284
+ - lift too high and destroy fold structure;
285
+ - lift the wrong layer;
286
+ - retrieve path that drags clothing and unfolds the stack.
287
+
288
+ The dataset should record candidate-level rollout outcomes for every candidate chunk:
289
+ - success
290
+ - reveal achieved
291
+ - visibility AUC
292
+ - hold persistence
293
+ - reocclusion rate
294
+ - disturbance cost
295
+ - fold-preservation (cloth)
296
+ - mouth aperture / hold quality (bag)
297
+ - damage proxy / gap width (foliage)
298
+
299
+ This candidate-level outcome table should be the source of planner labels.
300
+
301
+ Also add a teacher audit report. The current teacher is a useful bootstrap, but it is not enough to assume it is good. The audit should compare the teacher against reveal-only, retrieve-only, no-hold, and random policy baselines on the current proxy suite.
302
+
303
+ ## 6. Small but mandatory engineering cleanups
304
+
305
+ These changes do not change model quality directly, but they reduce evaluation ambiguity and future regressions.
306
+
307
+ In `tests/conftest.py`, remove the hardcoded `/workspace/VLAarchtests/code/reveal_vla_bimanual` path. Replace it with a path derived from `Path(__file__).resolve()` so tests run anywhere.
308
+
309
+ In `eval/run_rlbench_rollout_eval.py`, preserve richer episode traces. Save chosen macro mode, planner scores, confidence, predicted reocclusion, path recoveries, noop fallbacks, and whether support-mode conditioning was enabled.
310
+
311
+ In `eval/run_reveal_benchmark.py`, stop using only the default 24 episodes for serious comparisons. Keep 24 as a smoke benchmark, but add a “serious” mode at 100 or 200 episodes per proxy.
312
+
313
+ In `eval/run_reveal_benchmark.py`, explicitly report `chunk_commit_steps` and do not leave the main reveal benchmark at a commit horizon of zero by default. These tasks are not purely one-step reactive.
314
+
315
+ In the eval reporting utilities, add bootstrap confidence intervals and paired-seed comparisons. The differences you care about are often a few percentage points. Unpaired noisy comparisons are not enough.
316
+
317
+ ## 7. Exact new tests to verify the implementation
318
+
319
+ The current repo has contract tests. Keep them. Add the following behavioral tests.
320
+
321
+ ### 7.1 Geometry and fusion tests
322
+
323
+ `tests/test_geometry_tokens_propagate.py`
324
+
325
+ Construct a tiny batch with fixed RGB and depth. Modify only camera rotation. Verify that:
326
+ 1. `geometry_tokens` change,
327
+ 2. the fused scene representation changes when geometry is enabled,
328
+ 3. the fused scene representation does not change when geometry is disabled.
329
+
330
+ `tests/test_camera_rotation_geometry.py`
331
+
332
+ Use two cameras with identical translation and different rotation. Verify that the policy representation is rotation-sensitive after the geometry fix. This should fail on the current code and pass after the change.
333
+
334
+ ### 7.2 Spatial memory tests
335
+
336
+ `tests/test_spatial_memory_occlusion_persistence.py`
337
+
338
+ Use a scripted proxy sequence where the target is briefly visible, then fully occluded, then visible again. Verify that belief memory retains a localized target belief during occlusion and sharpens it after reappearance. This should test both persistence and uncertainty.
339
+
340
+ `tests/test_memory_slot_write_gating.py`
341
+
342
+ Feed a scene where only the opening region changes. Verify that only a minority of memory slots or cells update. This prevents global overwriting.
343
+
344
+ `tests/test_reocclusion_memory_regression.py`
345
+
346
+ Create a scripted “open then release” sequence. Verify that memory tracks reocclusion and that predicted hold quality declines.
347
+
348
+ ### 7.3 World-model tests
349
+
350
+ `tests/test_world_model_null_rollout.py`
351
+
352
+ Assert that `null_rollout` returns an exact or near-exact identity state and does not apply unintended updates.
353
+
354
+ `tests/test_world_model_identity_rollout.py`
355
+
356
+ Assert that `identity_rollout` preserves state across steps while leaving logging fields consistent.
357
+
358
+ `tests/test_world_model_field_consistency.py`
359
+
360
+ Roll out one deterministic proxy step and compare predicted next-step fields against simulator privileged fields. Enforce MAE thresholds per field, not only a single scalar.
361
+
362
+ `tests/test_world_model_task_adapter.py`
363
+
364
+ Use the same initial field state with different task embeddings. Verify that the transitions are deterministic per embedding (the same embedding applied twice yields identical transitions) and measurably different across embeddings. This catches dead task-conditioning code paths.
365
+
366
+ ### 7.4 Candidate and planner tests
367
+
368
+ `tests/test_candidate_macro_coverage.py`
369
+
370
+ Verify that the proposal generator returns at least one candidate from each required macro family when requested.
371
+
372
+ `tests/test_planner_reocclusion_gating.py`
373
+
374
+ Create a scripted case where one candidate retrieves immediately but causes opening collapse, and another candidate maintains the opening first. Verify that the planner picks the maintain-first plan.
375
+
376
+ `tests/test_proposal_semantic_diversity.py`
377
+
378
+ Do not measure diversity only by vector distance. Also verify macro-family diversity and rollout outcome diversity.
379
+
380
+ ### 7.5 Task-head tests
381
+
382
+ `tests/test_task_conditioned_head_shapes.py`
383
+
384
+ Verify output presence and shapes for all common fields and all task-specific fields.
385
+
386
+ `tests/test_task_metric_monotonicity.py`
387
+
388
+ Use small synthetic perturbations:
389
+ - increase aperture in bag: `opening_quality` should increase;
390
+ - increase canopy gap in foliage: `actor_feasibility` should increase;
391
+ - over-lift cloth: `fold_preservation` should decrease.
392
+
393
+ These are not full scientific tests, but they catch dead or miswired heads quickly.
394
+
395
+ ### 7.6 Dataset and leakage tests
396
+
397
+ `tests/test_dataset_hard_negative_presence.py`
398
+
399
+ Sample dataset items and verify that candidate sets contain hard negative families, not just teacher-centered noise.
400
+
401
+ `tests/test_no_leak_with_new_labels.py`
402
+
403
+ Extend the no-leak assertions to cover all new task-specific labels and maps. The proxy dataset must keep using rendered observations only on the input side.
404
+
405
+ `tests/test_teacher_audit.py`
406
+
407
+ Require the teacher to beat random, retrieve-only, and reveal-only on the proxy metrics. If the teacher itself is weak, the whole planner training signal is questionable.
408
+
409
+ ### 7.7 Scripted proxy behavior suite
410
+
411
+ Add a new deterministic behavioral test suite, for example under `tests/test_proxy_scripted_bench.py`.
412
+
413
+ This suite should include 10 to 20 deterministic seeds per task with hand-designed initial states. The expected winner should be obvious.
414
+
415
+ Required scripted cases:
416
+ - bag: `maintain_mouth` should beat `retrieve` immediately on hold persistence and success;
417
+ - foliage: `pin_canopy` should beat `random_swipe` on reocclusion and visibility AUC;
418
+ - cloth: `stabilize_fold` should beat `lift_high` on fold-preservation and success.
419
+
420
+ The full model does not need to be perfect on these, but the planner should select the intended candidate at least 80 percent of the time.
421
+
422
+ ## 8. Exact benchmark plan to estimate performance
423
+
424
+ Separate the benchmarks into two layers. The first layer verifies that the implementation behaves correctly. The second estimates real performance against baselines.
425
+
426
+ ### 8.1 Layer A: implementation-verification benchmarks
427
+
428
+ These are not publication benchmarks. They are gates.
429
+
430
+ Run the full unit and integration suite after every architecture milestone:
431
+
432
+ ```bash
433
+ PYTHONPATH=code/reveal_vla_bimanual pytest tests -q
434
+ ```
435
+
436
+ After the new behavioral tests are added, require all of the following before moving on:
437
+ - all geometry propagation tests pass;
438
+ - the scripted proxy suite passes;
439
+ - world-model null and identity ablations pass exactly;
440
+ - candidate macro coverage passes;
441
+ - no-leak assertions pass with new task fields.
442
+
443
+ Then run a deterministic proxy smoke benchmark on fixed seeds (for example 10 per task) to catch obvious regressions:
444
+
445
+ ```bash
446
+ cd code/reveal_vla_bimanual
447
+ python -m eval.run_reveal_benchmark \
448
+ --model full=/abs/path/checkpoint.pt \
449
+ --episodes 10 \
450
+ --proxies foliage bag cloth \
451
+ --chunk-commit-steps 4 \
452
+ --output-root /abs/path/reports/reveal_smoke
453
+ ```
454
+
455
+ This benchmark is only for regression detection. It is not a performance claim.
456
+
457
+ ### 8.2 Layer B: strengthened proxy benchmark (main task-aligned benchmark now)
458
+
459
+ This should become the main internal benchmark until real teleop data exists.
460
+
461
+ Use the existing `foliage`, `bag`, and `cloth` proxies, but strengthen them and evaluate seriously:
462
+ - at least 100 deterministic seeds per proxy for final comparisons;
463
+ - paired-seed evaluation across all ablations;
464
+ - chunk commit horizons of at least 4, and also report a 0/2/4 sweep once;
465
+ - no teacher involvement during evaluation.
466
+
467
+ Run the base benchmark:
468
+
469
+ ```bash
470
+ cd code/reveal_vla_bimanual
471
+ python -m eval.run_reveal_benchmark \
472
+ --model full=/abs/path/checkpoint.pt \
473
+ --episodes 100 \
474
+ --proxies foliage bag cloth \
475
+ --chunk-commit-steps 4 \
476
+ --output-root /abs/path/reports/reveal_full
477
+ ```
478
+
479
+ Run required paired ablations from the same checkpoint family or retrained checkpoints:
480
+ - no geometry tokens
481
+ - no spatial memory
482
+ - compact world model instead of spatial
483
+ - no planner
484
+ - planner with Gaussian candidates only
485
+ - no task-conditioned head
486
+ - no support-mode conditioning
487
+
488
+ The proxy benchmark must report at least these metrics:
489
+ - retrieve success
490
+ - reveal success
491
+ - target visibility AUC
492
+ - actor-feasibility AUC
493
+ - hold persistence
494
+ - reocclusion rate
495
+ - disturbance cost
496
+ - planner top-1 on candidate rollouts
497
+ - world-model next-step MAE
498
+ - uncertainty calibration
499
+ - candidate ranking NDCG (normalized discounted cumulative gain)
500
+
501
+ Add task-specific metrics:
502
+ - foliage: gap width, damage proxy, release-collapse rate
503
+ - bag: aperture width or area, rim slip rate, insertion success
504
+ - cloth: fold-preservation score, layer separation quality, drag-induced disturbance
505
+
506
+ Acceptance gate for continuing toward public baseline comparison:
507
+ - the full model should beat the current repo’s RGB-D baseline on mean proxy success and on at least two of the three proxies;
508
+ - planner-on should beat planner-off on at least two of the three proxies and on hard-negative candidate ranking;
509
+ - spatial world model should beat compact and null rollouts on persistence and reocclusion prediction;
510
+ - task-conditioned head should beat generic head on at least one task-specific metric per target task.
511
+
512
+ ### 8.3 Layer C: RLBench / PerAct2 bimanual rollout benchmark
513
+
514
+ The repo already has the right hook for this. Use `run_rlbench_rollout_eval.py` and `run_peract2_task_sweep.py` as the main public benchmark entry points. Do not treat `run_peract2_launch_smoke.py` as evaluation. It is only a launch check.
515
+
516
+ Run the full existing PerAct2 13-task split from `sim_rlbench/task_splits.py::PERACT2_BIMANUAL_TASKS`:
517
+
518
+ ```bash
519
+ cd code/reveal_vla_bimanual
520
+ python -m eval.run_peract2_task_sweep \
521
+ --checkpoint /abs/path/checkpoint.pt \
522
+ --output-root /abs/path/reports/peract2_13 \
523
+ --episodes-per-task 25 \
524
+ --episode-length 20 \
525
+ --resolution 224 \
526
+ --chunk-commit-steps 4 \
527
+ --allow-unsupervised-planning \
528
+ --headless
529
+ ```
530
+
531
+ Also run direct single-task evaluations when debugging:
532
+
533
+ ```bash
534
+ cd code/reveal_vla_bimanual
535
+ python -m eval.run_rlbench_rollout_eval \
536
+ --checkpoint /abs/path/checkpoint.pt \
537
+ --output-dir /abs/path/reports/rlbench_debug \
538
+ --tasks RightOpenDrawer \
539
+ --episodes-per-task 25 \
540
+ --episode-length 20 \
541
+ --resolution 224 \
542
+ --plan \
543
+ --chunk-commit-steps 4 \
544
+ --allow-unsupervised-planning \
545
+ --headless
546
+ ```
547
+
548
+ This benchmark is not a direct match to the three target tasks, but it is the main public bimanual sanity check. It measures whether the structured modifications hurt or help general bimanual competence.
549
+
550
+ Required comparisons on this benchmark:
551
+ - current repo best checkpoint
552
+ - full improved model
553
+ - no-planner ablation
554
+ - compact world model ablation
555
+ - no geometry ablation
556
+ - no task-conditioning ablation
557
+
558
+ If external baseline code is available, evaluate against:
559
+ - PerAct2
560
+ - InterACT
561
+ - VoxAct-B
562
+ - AnyBimanual
563
+
564
+ If compute allows, also compare against foundation-scale baselines as a separate category:
565
+ - TwinVLA
566
+ - RDT-1B
567
+
568
+ Fairness requirements:
569
+ - same camera setup if possible (front plus both wrists);
570
+ - same resolution;
571
+ - same episode length and reset policy;
572
+ - same task list;
573
+ - same number of evaluation episodes;
574
+ - report whether baselines use extra large-scale pretraining.
575
+
576
+ This benchmark should report:
577
+ - per-task success
578
+ - mean success
579
+ - mean return
580
+ - path recoveries
581
+ - noop fallbacks
582
+ - plan-on vs plan-off
583
+ - per-episode planner traces for error analysis
584
+
585
+ ### 8.4 Layer D: deformable-manipulation public benchmarks
586
+
587
+ You do not yet have custom teleop data, so the closest public matches for bag and cloth should be used now.
588
+
589
+ Recommended benchmarks:
590
+ - DeformableRavens
591
+ - SoftGym cloth tasks
592
+ - DaXBench cloth tasks
593
+
594
+ The exact subset should be chosen based on available tasks, but the mapping is straightforward. Bag-like opening and insertion tasks are the closest public proxy for the bag environment. Cloth lifting, separation, and manipulation tasks are the closest public proxy for the suitcase environment. There is no equally good public foliage benchmark, so the strengthened foliage proxy will remain the main stand-in until custom data exists.
595
+
596
+ Required evaluation protocol:
597
+ - same observation modalities across methods;
598
+ - same action horizon where possible;
599
+ - same random seeds;
600
+ - same episode budgets;
601
+ - report both success and task-specific deformation metrics.
602
+
603
+ Add at least these extra metrics on the deformable benchmarks:
604
+ - opening quality or aperture quality
605
+ - hold persistence under actor motion
606
+ - reocclusion or collapse rate
607
+ - disturbance cost
608
+ - fold-preservation or structural-preservation score
609
+
610
+ ### 8.5 Layer E: optional exploratory / active-perception benchmark
611
+
612
+ If EFM-10 or BAP code and data are actually available when implementation starts, add them. That benchmark is conceptually close to your task family because it measures exploratory plus focused manipulation under occlusion. Do not block the project on it if code is not readily usable.
613
+
614
+ ### 8.6 Layer F: optional broad generalization benchmark
615
+
616
+ If time allows, add RoboTwin 2.0 as a general bimanual breadth check. It is not a direct target-task match, but it is useful for checking whether the structured reveal-and-retrieve bias damages general bimanual transfer.
617
+
618
+ ## 9. Baseline strategy
619
+
620
+ There are two baseline groups and they should not be mixed carelessly.
621
+
622
+ The first group is matched-data or matched-setting baselines. These are the most useful for fair engineering comparison. Use PerAct2, InterACT, VoxAct-B, and AnyBimanual if code is available in a compatible evaluation setting.
623
+
624
+ The second group is foundation-scale baselines. These are useful, but they are not apples-to-apples unless you disclose the pretraining and model scale difference clearly. Use TwinVLA and RDT-1B in this category if compute allows.
625
+
626
+ Do not declare victory merely because the improved model beats the current repo checkpoint. That is a necessary condition, not the target claim.
627
+
628
+ ## 10. Acceptance criteria for “ready to collect real data”
629
+
630
+ Do not move into expensive teleop collection until all of the following are true.
631
+
632
+ First, the geometry and spatial memory tests pass and stay green for multiple checkpoints.
633
+
634
+ Second, the strengthened proxy benchmark shows that the full model beats the current repo baseline convincingly. The minimum bar should be improvement in overall proxy success plus improvement on at least two of the three task types.
635
+
636
+ Third, planner-on must beat planner-off on hard-negative ranking and on task success. If the planner does not beat the decoder baseline, then the explicit planning stack is not yet earning its complexity.
637
+
638
+ Fourth, the spatial world model must beat compact and null baselines on persistence and reocclusion prediction. If it does not, the planning story is still too weak.
639
+
640
+ Fifth, the improved model should at least match strong public baselines on the RLBench / PerAct2 suite, and ideally exceed them on the tasks most related to opening, holding, uncovering, and coordinated retrieval. If it is significantly behind there, the architecture is still too immature.
641
+
642
+ ## 11. Recommended implementation order
643
+
644
+ Phase 1 should fix information flow and evaluation trustworthiness. Implement geometry propagation, camera orientation encoding, and path cleanup in `tests/conftest.py`. Then add the new geometry tests and rerun the current proxy benchmark.
645
+
646
+ Phase 2 should add task-aware semantic candidates and hard-negative data generation. This is the fastest path to making the planner meaningful without yet rewriting the full memory and world model stack.
647
+
648
+ Phase 3 should add task-conditioned reveal outputs and the strengthened proxy metrics. At this stage the proxy benchmark should start reflecting the real task failure modes.
649
+
650
+ Phase 4 should replace pooled memory and compact rollout with the new spatial memory and spatial world model. This is the biggest change and should only happen after the eval harness can tell whether it helped.
651
+
652
+ Phase 5 should run the full internal ablation suite, then RLBench / PerAct2, then deformable public benchmarks, and only then decide whether the architecture is strong enough to justify real-data collection.
653
+
654
+ ## 12. What to avoid
655
+
656
+ Do not treat launch smoke as performance evaluation.
657
+
658
+ Do not keep teacher-centered Gaussian candidates as the main planner supervision source.
659
+
660
+ Do not remove task structure in favor of a generic monolithic BC model unless the structured architecture clearly fails. Nothing in the current repo proves that.
661
+
662
+ Do not use only mean success. These tasks need persistence, reocclusion, and structural-preservation metrics.
663
+
664
+ Do not claim that the current planner or the current world model is validated. Neither is, yet.
665
+
666
+ ## 13. Minimal first patch set (the first pull request)
667
+
668
+ If only one implementation sprint is possible before deeper refactors, the first pull request should contain exactly this:
669
+
670
+ 1. fix `geometry_tokens` propagation from backbone to fusion to policy output;
671
+ 2. add camera rotation encoding in `DepthPatchAdapter`;
672
+ 3. add `tests/test_geometry_tokens_propagate.py` and `tests/test_camera_rotation_geometry.py`;
673
+ 4. replace hardcoded path logic in `tests/conftest.py`;
674
+ 5. extend `run_reveal_benchmark.py` reporting to save `chunk_commit_steps`, bootstrap confidence intervals, and paired-seed summaries;
675
+ 6. add semantic macro candidates in `action_decoder.py` without yet deleting the Gaussian fallback;
676
+ 7. add hard negative candidate generation in `sim_reveal/procedural_envs.py`;
677
+ 8. add the deterministic scripted proxy benchmark suite.
678
+
679
+ This first patch set will not make the model SOTA. It will make the repo trustworthy enough to support the larger refactor.
680
+
681
+ ## 14. Reference links
682
+
683
+ Repo root:
684
+ https://huggingface.co/lsnu/VLAarchtests/tree/main
685
+
686
+ Core files:
687
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/backbones.py
688
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/multiview_fusion.py
689
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/observation_memory.py
690
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/reveal_head.py
691
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/world_model.py
692
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/action_decoder.py
693
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/planner.py
694
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/policy.py
695
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/train/losses.py
696
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/sim_reveal/dataset.py
697
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/sim_reveal/procedural_envs.py
698
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_reveal_benchmark.py
699
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py
700
+ https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py
701
+
702
+ Public benchmark / baseline references to align against:
703
+ PerAct2 / RLBench2 bimanual benchmark: https://bimanual.github.io/
704
+ InterACT: https://dannyran123.github.io/interact/
705
+ VoxAct-B: https://voxact-b.github.io/
706
+ AnyBimanual: https://anybimanual.github.io/
707
+ TwinVLA: https://twinvla.github.io/
708
+ RDT-1B: https://rdt-robotics.github.io/rdt-robotics/
709
+ DeformableRavens: https://deformableravens.github.io/
710
+ SoftGym: https://sites.google.com/view/softgym/home
711
+ DaXBench: https://daxbench.github.io/
712
+ EFM / BAP: https://efmanipulation.github.io/
713
+ RoboTwin 2.0: https://robotwin-platform.github.io/
714
+
715
+ ## 15. Final recommendation
716
+
717
+ The architecture should be pursued, but only in a narrower and more explicit form: task-structured bimanual reveal-and-retrieve under elastic occlusion. The current repo is close enough to that idea to be worth continuing. The most important next step is not collecting real data yet. It is making the geometry path real, making the planner learn from hard failure cases, and making the world model spatial enough that “maintain the opening while the other arm retrieves” is something the system can actually predict rather than merely imitate.
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ /workspace/envs/rlbench/bin/python -m eval.run_rlbench_rollout_eval --checkpoint /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt --output-dir /workspace/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons --tasks bimanual_dual_push_buttons --episodes-per-task 1 --episode-length 20 --resolution 224 --device cuda --chunk-commit-steps 4 --headless --plan --allow-unsupervised-planning
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.json ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
3
+ "plan_requested": true,
4
+ "plan_applied": true,
5
+ "planner_mode": "trainable",
6
+ "support_mode_conditioning": true,
7
+ "task_conditioning": true,
8
+ "geometry_enabled": true,
9
+ "world_model_mode": "checkpoint_default",
10
+ "episodes_per_task": 1,
11
+ "episode_length": 20,
12
+ "resolution": 224,
13
+ "reset_retries": 20,
14
+ "cameras": [
15
+ "front",
16
+ "wrist_left",
17
+ "wrist_right"
18
+ ],
19
+ "tasks": {
20
+ "bimanual_dual_push_buttons": {
21
+ "task_class": "BimanualDualPushButtons",
22
+ "successes": [
23
+ 0.0
24
+ ],
25
+ "returns": [
26
+ 0.0
27
+ ],
28
+ "path_recoveries": [
29
+ 0
30
+ ],
31
+ "noop_fallbacks": [
32
+ 0
33
+ ],
34
+ "reset_retries": [
35
+ 0
36
+ ],
37
+ "episode_traces": [
38
+ {
39
+ "language_goal": "push the olive and the orange buttons",
40
+ "steps": [
41
+ {
42
+ "timestep": 0,
43
+ "chosen_macro_mode": "lift_support_layer",
44
+ "planner_scores": [
45
+ 11.197153091430664,
46
+ 11.241825103759766,
47
+ 11.236907005310059,
48
+ 11.205011367797852
49
+ ],
50
+ "predicted_reocclusion": 0.5305227041244507,
51
+ "support_mode_conditioning": true,
52
+ "path_recoveries": 0,
53
+ "noop_fallbacks": 0
54
+ },
55
+ {
56
+ "timestep": 1,
57
+ "chosen_macro_mode": "lift_support_layer",
58
+ "planner_scores": [
59
+ 11.197153091430664,
60
+ 11.241825103759766,
61
+ 11.236907005310059,
62
+ 11.205011367797852
63
+ ],
64
+ "predicted_reocclusion": 0.5305227041244507,
65
+ "support_mode_conditioning": true,
66
+ "path_recoveries": 0,
67
+ "noop_fallbacks": 0
68
+ },
69
+ {
70
+ "timestep": 2,
71
+ "chosen_macro_mode": "lift_support_layer",
72
+ "planner_scores": [
73
+ 11.197153091430664,
74
+ 11.241825103759766,
75
+ 11.236907005310059,
76
+ 11.205011367797852
77
+ ],
78
+ "predicted_reocclusion": 0.5305227041244507,
79
+ "support_mode_conditioning": true,
80
+ "path_recoveries": 0,
81
+ "noop_fallbacks": 0
82
+ },
83
+ {
84
+ "timestep": 3,
85
+ "chosen_macro_mode": "lift_support_layer",
86
+ "planner_scores": [
87
+ 11.197153091430664,
88
+ 11.241825103759766,
89
+ 11.236907005310059,
90
+ 11.205011367797852
91
+ ],
92
+ "predicted_reocclusion": 0.5305227041244507,
93
+ "support_mode_conditioning": true,
94
+ "path_recoveries": 0,
95
+ "noop_fallbacks": 0
96
+ },
97
+ {
98
+ "timestep": 4,
99
+ "chosen_macro_mode": "lift_support_layer",
100
+ "planner_scores": [
101
+ 11.1263427734375,
102
+ 11.163692474365234,
103
+ 11.160633087158203,
104
+ 11.130797386169434
105
+ ],
106
+ "predicted_reocclusion": 0.5315501689910889,
107
+ "support_mode_conditioning": true,
108
+ "path_recoveries": 0,
109
+ "noop_fallbacks": 0
110
+ },
111
+ {
112
+ "timestep": 5,
113
+ "chosen_macro_mode": "lift_support_layer",
114
+ "planner_scores": [
115
+ 11.1263427734375,
116
+ 11.163692474365234,
117
+ 11.160633087158203,
118
+ 11.130797386169434
119
+ ],
120
+ "predicted_reocclusion": 0.5315501689910889,
121
+ "support_mode_conditioning": true,
122
+ "path_recoveries": 0,
123
+ "noop_fallbacks": 0
124
+ },
125
+ {
126
+ "timestep": 6,
127
+ "chosen_macro_mode": "lift_support_layer",
128
+ "planner_scores": [
129
+ 11.1263427734375,
130
+ 11.163692474365234,
131
+ 11.160633087158203,
132
+ 11.130797386169434
133
+ ],
134
+ "predicted_reocclusion": 0.5315501689910889,
135
+ "support_mode_conditioning": true,
136
+ "path_recoveries": 0,
137
+ "noop_fallbacks": 0
138
+ },
139
+ {
140
+ "timestep": 7,
141
+ "chosen_macro_mode": "lift_support_layer",
142
+ "planner_scores": [
143
+ 11.1263427734375,
144
+ 11.163692474365234,
145
+ 11.160633087158203,
146
+ 11.130797386169434
147
+ ],
148
+ "predicted_reocclusion": 0.5315501689910889,
149
+ "support_mode_conditioning": true,
150
+ "path_recoveries": 0,
151
+ "noop_fallbacks": 0
152
+ },
153
+ {
154
+ "timestep": 8,
155
+ "chosen_macro_mode": "lift_support_layer",
156
+ "planner_scores": [
157
+ 11.077136039733887,
158
+ 11.114724159240723,
159
+ 11.111690521240234,
160
+ 11.081738471984863
161
+ ],
162
+ "predicted_reocclusion": 0.5313586592674255,
163
+ "support_mode_conditioning": true,
164
+ "path_recoveries": 0,
165
+ "noop_fallbacks": 0
166
+ },
167
+ {
168
+ "timestep": 9,
169
+ "chosen_macro_mode": "lift_support_layer",
170
+ "planner_scores": [
171
+ 11.077136039733887,
172
+ 11.114724159240723,
173
+ 11.111690521240234,
174
+ 11.081738471984863
175
+ ],
176
+ "predicted_reocclusion": 0.5313586592674255,
177
+ "support_mode_conditioning": true,
178
+ "path_recoveries": 0,
179
+ "noop_fallbacks": 0
180
+ },
181
+ {
182
+ "timestep": 10,
183
+ "chosen_macro_mode": "lift_support_layer",
184
+ "planner_scores": [
185
+ 11.077136039733887,
186
+ 11.114724159240723,
187
+ 11.111690521240234,
188
+ 11.081738471984863
189
+ ],
190
+ "predicted_reocclusion": 0.5313586592674255,
191
+ "support_mode_conditioning": true,
192
+ "path_recoveries": 0,
193
+ "noop_fallbacks": 0
194
+ },
195
+ {
196
+ "timestep": 11,
197
+ "chosen_macro_mode": "lift_support_layer",
198
+ "planner_scores": [
199
+ 11.077136039733887,
200
+ 11.114724159240723,
201
+ 11.111690521240234,
202
+ 11.081738471984863
203
+ ],
204
+ "predicted_reocclusion": 0.5313586592674255,
205
+ "support_mode_conditioning": true,
206
+ "path_recoveries": 0,
207
+ "noop_fallbacks": 0
208
+ },
209
+ {
210
+ "timestep": 12,
211
+ "chosen_macro_mode": "lift_support_layer",
212
+ "planner_scores": [
213
+ 11.042268753051758,
214
+ 11.08004379272461,
215
+ 11.07697868347168,
216
+ 11.046956062316895
217
+ ],
218
+ "predicted_reocclusion": 0.5312807559967041,
219
+ "support_mode_conditioning": true,
220
+ "path_recoveries": 0,
221
+ "noop_fallbacks": 0
222
+ },
223
+ {
224
+ "timestep": 13,
225
+ "chosen_macro_mode": "lift_support_layer",
226
+ "planner_scores": [
227
+ 11.042268753051758,
228
+ 11.08004379272461,
229
+ 11.07697868347168,
230
+ 11.046956062316895
231
+ ],
232
+ "predicted_reocclusion": 0.5312807559967041,
233
+ "support_mode_conditioning": true,
234
+ "path_recoveries": 0,
235
+ "noop_fallbacks": 0
236
+ },
237
+ {
238
+ "timestep": 14,
239
+ "chosen_macro_mode": "lift_support_layer",
240
+ "planner_scores": [
241
+ 11.042268753051758,
242
+ 11.08004379272461,
243
+ 11.07697868347168,
244
+ 11.046956062316895
245
+ ],
246
+ "predicted_reocclusion": 0.5312807559967041,
247
+ "support_mode_conditioning": true,
248
+ "path_recoveries": 0,
249
+ "noop_fallbacks": 0
250
+ },
251
+ {
252
+ "timestep": 15,
253
+ "chosen_macro_mode": "lift_support_layer",
254
+ "planner_scores": [
255
+ 11.042268753051758,
256
+ 11.08004379272461,
257
+ 11.07697868347168,
258
+ 11.046956062316895
259
+ ],
260
+ "predicted_reocclusion": 0.5312807559967041,
261
+ "support_mode_conditioning": true,
262
+ "path_recoveries": 0,
263
+ "noop_fallbacks": 0
264
+ },
265
+ {
266
+ "timestep": 16,
267
+ "chosen_macro_mode": "lift_support_layer",
268
+ "planner_scores": [
269
+ 11.03925895690918,
270
+ 11.076944351196289,
271
+ 11.073898315429688,
272
+ 11.043900489807129
273
+ ],
274
+ "predicted_reocclusion": 0.5312473773956299,
275
+ "support_mode_conditioning": true,
276
+ "path_recoveries": 0,
277
+ "noop_fallbacks": 0
278
+ },
279
+ {
280
+ "timestep": 17,
281
+ "chosen_macro_mode": "lift_support_layer",
282
+ "planner_scores": [
283
+ 11.03925895690918,
284
+ 11.076944351196289,
285
+ 11.073898315429688,
286
+ 11.043900489807129
287
+ ],
288
+ "predicted_reocclusion": 0.5312473773956299,
289
+ "support_mode_conditioning": true,
290
+ "path_recoveries": 0,
291
+ "noop_fallbacks": 0
292
+ },
293
+ {
294
+ "timestep": 18,
295
+ "chosen_macro_mode": "lift_support_layer",
296
+ "planner_scores": [
297
+ 11.03925895690918,
298
+ 11.076944351196289,
299
+ 11.073898315429688,
300
+ 11.043900489807129
301
+ ],
302
+ "predicted_reocclusion": 0.5312473773956299,
303
+ "support_mode_conditioning": true,
304
+ "path_recoveries": 0,
305
+ "noop_fallbacks": 0
306
+ },
307
+ {
308
+ "timestep": 19,
309
+ "chosen_macro_mode": "lift_support_layer",
310
+ "planner_scores": [
311
+ 11.03925895690918,
312
+ 11.076944351196289,
313
+ 11.073898315429688,
314
+ 11.043900489807129
315
+ ],
316
+ "predicted_reocclusion": 0.5312473773956299,
317
+ "support_mode_conditioning": true,
318
+ "path_recoveries": 0,
319
+ "noop_fallbacks": 0
320
+ }
321
+ ],
322
+ "success": 0.0,
323
+ "return": 0.0,
324
+ "path_recoveries": 0,
325
+ "noop_fallbacks": 0
326
+ }
327
+ ],
328
+ "mean_success": 0.0,
329
+ "mean_return": 0.0
330
+ }
331
+ },
332
+ "mean_success": 0.0
333
+ }
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RLBench Rollout Eval
2
+
3
+ - Checkpoint: `/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt`
4
+ - Plan requested: `True`
5
+ - Plan applied: `True`
6
+ - Support-mode conditioning: `True`
7
+ - Task conditioning: `True`
8
+ - Geometry enabled: `True`
9
+ - World-model mode: `checkpoint_default`
10
+ - Mean success: `0.000`
11
+
12
+ ## Per-task
13
+
14
+ - `bimanual_dual_push_buttons`: mean_success=0.000, returns=[0.0]
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stderr.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /workspace/envs/rlbench/lib/python3.10/site-packages/torch/nn/modules/transformer.py:306: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
2
+ warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
3
+ qt.qpa.xcb: QXcbConnection: XCB error: 148 (Unknown), sequence: 181, resource id: 0, major code: 140 (Unknown), minor code: 20
4
+ WARNING:root:not sure how _robot_shapes are used is used.
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stdout.txt ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
3
+ "plan_requested": true,
4
+ "plan_applied": true,
5
+ "planner_mode": "trainable",
6
+ "support_mode_conditioning": true,
7
+ "task_conditioning": true,
8
+ "geometry_enabled": true,
9
+ "world_model_mode": "checkpoint_default",
10
+ "episodes_per_task": 1,
11
+ "episode_length": 20,
12
+ "resolution": 224,
13
+ "reset_retries": 20,
14
+ "cameras": [
15
+ "front",
16
+ "wrist_left",
17
+ "wrist_right"
18
+ ],
19
+ "tasks": {
20
+ "bimanual_dual_push_buttons": {
21
+ "task_class": "BimanualDualPushButtons",
22
+ "successes": [
23
+ 0.0
24
+ ],
25
+ "returns": [
26
+ 0.0
27
+ ],
28
+ "path_recoveries": [
29
+ 0
30
+ ],
31
+ "noop_fallbacks": [
32
+ 0
33
+ ],
34
+ "reset_retries": [
35
+ 0
36
+ ],
37
+ "episode_traces": [
38
+ {
39
+ "language_goal": "push the olive and the orange buttons",
40
+ "steps": [
41
+ {
42
+ "timestep": 0,
43
+ "chosen_macro_mode": "lift_support_layer",
44
+ "planner_scores": [
45
+ 11.197153091430664,
46
+ 11.241825103759766,
47
+ 11.236907005310059,
48
+ 11.205011367797852
49
+ ],
50
+ "predicted_reocclusion": 0.5305227041244507,
51
+ "support_mode_conditioning": true,
52
+ "path_recoveries": 0,
53
+ "noop_fallbacks": 0
54
+ },
55
+ {
56
+ "timestep": 1,
57
+ "chosen_macro_mode": "lift_support_layer",
58
+ "planner_scores": [
59
+ 11.197153091430664,
60
+ 11.241825103759766,
61
+ 11.236907005310059,
62
+ 11.205011367797852
63
+ ],
64
+ "predicted_reocclusion": 0.5305227041244507,
65
+ "support_mode_conditioning": true,
66
+ "path_recoveries": 0,
67
+ "noop_fallbacks": 0
68
+ },
69
+ {
70
+ "timestep": 2,
71
+ "chosen_macro_mode": "lift_support_layer",
72
+ "planner_scores": [
73
+ 11.197153091430664,
74
+ 11.241825103759766,
75
+ 11.236907005310059,
76
+ 11.205011367797852
77
+ ],
78
+ "predicted_reocclusion": 0.5305227041244507,
79
+ "support_mode_conditioning": true,
80
+ "path_recoveries": 0,
81
+ "noop_fallbacks": 0
82
+ },
83
+ {
84
+ "timestep": 3,
85
+ "chosen_macro_mode": "lift_support_layer",
86
+ "planner_scores": [
87
+ 11.197153091430664,
88
+ 11.241825103759766,
89
+ 11.236907005310059,
90
+ 11.205011367797852
91
+ ],
92
+ "predicted_reocclusion": 0.5305227041244507,
93
+ "support_mode_conditioning": true,
94
+ "path_recoveries": 0,
95
+ "noop_fallbacks": 0
96
+ },
97
+ {
98
+ "timestep": 4,
99
+ "chosen_macro_mode": "lift_support_layer",
100
+ "planner_scores": [
101
+ 11.1263427734375,
102
+ 11.163692474365234,
103
+ 11.160633087158203,
104
+ 11.130797386169434
105
+ ],
106
+ "predicted_reocclusion": 0.5315501689910889,
107
+ "support_mode_conditioning": true,
108
+ "path_recoveries": 0,
109
+ "noop_fallbacks": 0
110
+ },
111
+ {
112
+ "timestep": 5,
113
+ "chosen_macro_mode": "lift_support_layer",
114
+ "planner_scores": [
115
+ 11.1263427734375,
116
+ 11.163692474365234,
117
+ 11.160633087158203,
118
+ 11.130797386169434
119
+ ],
120
+ "predicted_reocclusion": 0.5315501689910889,
121
+ "support_mode_conditioning": true,
122
+ "path_recoveries": 0,
123
+ "noop_fallbacks": 0
124
+ },
125
+ {
126
+ "timestep": 6,
127
+ "chosen_macro_mode": "lift_support_layer",
128
+ "planner_scores": [
129
+ 11.1263427734375,
130
+ 11.163692474365234,
131
+ 11.160633087158203,
132
+ 11.130797386169434
133
+ ],
134
+ "predicted_reocclusion": 0.5315501689910889,
135
+ "support_mode_conditioning": true,
136
+ "path_recoveries": 0,
137
+ "noop_fallbacks": 0
138
+ },
139
+ {
140
+ "timestep": 7,
141
+ "chosen_macro_mode": "lift_support_layer",
142
+ "planner_scores": [
143
+ 11.1263427734375,
144
+ 11.163692474365234,
145
+ 11.160633087158203,
146
+ 11.130797386169434
147
+ ],
148
+ "predicted_reocclusion": 0.5315501689910889,
149
+ "support_mode_conditioning": true,
150
+ "path_recoveries": 0,
151
+ "noop_fallbacks": 0
152
+ },
153
+ {
154
+ "timestep": 8,
155
+ "chosen_macro_mode": "lift_support_layer",
156
+ "planner_scores": [
157
+ 11.077136039733887,
158
+ 11.114724159240723,
159
+ 11.111690521240234,
160
+ 11.081738471984863
161
+ ],
162
+ "predicted_reocclusion": 0.5313586592674255,
163
+ "support_mode_conditioning": true,
164
+ "path_recoveries": 0,
165
+ "noop_fallbacks": 0
166
+ },
167
+ {
168
+ "timestep": 9,
169
+ "chosen_macro_mode": "lift_support_layer",
170
+ "planner_scores": [
171
+ 11.077136039733887,
172
+ 11.114724159240723,
173
+ 11.111690521240234,
174
+ 11.081738471984863
175
+ ],
176
+ "predicted_reocclusion": 0.5313586592674255,
177
+ "support_mode_conditioning": true,
178
+ "path_recoveries": 0,
179
+ "noop_fallbacks": 0
180
+ },
181
+ {
182
+ "timestep": 10,
183
+ "chosen_macro_mode": "lift_support_layer",
184
+ "planner_scores": [
185
+ 11.077136039733887,
186
+ 11.114724159240723,
187
+ 11.111690521240234,
188
+ 11.081738471984863
189
+ ],
190
+ "predicted_reocclusion": 0.5313586592674255,
191
+ "support_mode_conditioning": true,
192
+ "path_recoveries": 0,
193
+ "noop_fallbacks": 0
194
+ },
195
+ {
196
+ "timestep": 11,
197
+ "chosen_macro_mode": "lift_support_layer",
198
+ "planner_scores": [
199
+ 11.077136039733887,
200
+ 11.114724159240723,
201
+ 11.111690521240234,
202
+ 11.081738471984863
203
+ ],
204
+ "predicted_reocclusion": 0.5313586592674255,
205
+ "support_mode_conditioning": true,
206
+ "path_recoveries": 0,
207
+ "noop_fallbacks": 0
208
+ },
209
+ {
210
+ "timestep": 12,
211
+ "chosen_macro_mode": "lift_support_layer",
212
+ "planner_scores": [
213
+ 11.042268753051758,
214
+ 11.08004379272461,
215
+ 11.07697868347168,
216
+ 11.046956062316895
217
+ ],
218
+ "predicted_reocclusion": 0.5312807559967041,
219
+ "support_mode_conditioning": true,
220
+ "path_recoveries": 0,
221
+ "noop_fallbacks": 0
222
+ },
223
+ {
224
+ "timestep": 13,
225
+ "chosen_macro_mode": "lift_support_layer",
226
+ "planner_scores": [
227
+ 11.042268753051758,
228
+ 11.08004379272461,
229
+ 11.07697868347168,
230
+ 11.046956062316895
231
+ ],
232
+ "predicted_reocclusion": 0.5312807559967041,
233
+ "support_mode_conditioning": true,
234
+ "path_recoveries": 0,
235
+ "noop_fallbacks": 0
236
+ },
237
+ {
238
+ "timestep": 14,
239
+ "chosen_macro_mode": "lift_support_layer",
240
+ "planner_scores": [
241
+ 11.042268753051758,
242
+ 11.08004379272461,
243
+ 11.07697868347168,
244
+ 11.046956062316895
245
+ ],
246
+ "predicted_reocclusion": 0.5312807559967041,
247
+ "support_mode_conditioning": true,
248
+ "path_recoveries": 0,
249
+ "noop_fallbacks": 0
250
+ },
251
+ {
252
+ "timestep": 15,
253
+ "chosen_macro_mode": "lift_support_layer",
254
+ "planner_scores": [
255
+ 11.042268753051758,
256
+ 11.08004379272461,
257
+ 11.07697868347168,
258
+ 11.046956062316895
259
+ ],
260
+ "predicted_reocclusion": 0.5312807559967041,
261
+ "support_mode_conditioning": true,
262
+ "path_recoveries": 0,
263
+ "noop_fallbacks": 0
264
+ },
265
+ {
266
+ "timestep": 16,
267
+ "chosen_macro_mode": "lift_support_layer",
268
+ "planner_scores": [
269
+ 11.03925895690918,
270
+ 11.076944351196289,
271
+ 11.073898315429688,
272
+ 11.043900489807129
273
+ ],
274
+ "predicted_reocclusion": 0.5312473773956299,
275
+ "support_mode_conditioning": true,
276
+ "path_recoveries": 0,
277
+ "noop_fallbacks": 0
278
+ },
279
+ {
280
+ "timestep": 17,
281
+ "chosen_macro_mode": "lift_support_layer",
282
+ "planner_scores": [
283
+ 11.03925895690918,
284
+ 11.076944351196289,
285
+ 11.073898315429688,
286
+ 11.043900489807129
287
+ ],
288
+ "predicted_reocclusion": 0.5312473773956299,
289
+ "support_mode_conditioning": true,
290
+ "path_recoveries": 0,
291
+ "noop_fallbacks": 0
292
+ },
293
+ {
294
+ "timestep": 18,
295
+ "chosen_macro_mode": "lift_support_layer",
296
+ "planner_scores": [
297
+ 11.03925895690918,
298
+ 11.076944351196289,
299
+ 11.073898315429688,
300
+ 11.043900489807129
301
+ ],
302
+ "predicted_reocclusion": 0.5312473773956299,
303
+ "support_mode_conditioning": true,
304
+ "path_recoveries": 0,
305
+ "noop_fallbacks": 0
306
+ },
307
+ {
308
+ "timestep": 19,
309
+ "chosen_macro_mode": "lift_support_layer",
310
+ "planner_scores": [
311
+ 11.03925895690918,
312
+ 11.076944351196289,
313
+ 11.073898315429688,
314
+ 11.043900489807129
315
+ ],
316
+ "predicted_reocclusion": 0.5312473773956299,
317
+ "support_mode_conditioning": true,
318
+ "path_recoveries": 0,
319
+ "noop_fallbacks": 0
320
+ }
321
+ ],
322
+ "success": 0.0,
323
+ "return": 0.0,
324
+ "path_recoveries": 0,
325
+ "noop_fallbacks": 0
326
+ }
327
+ ],
328
+ "mean_success": 0.0,
329
+ "mean_return": 0.0
330
+ }
331
+ },
332
+ "mean_success": 0.0
333
+ }
334
+ [CoppeliaSim:loadinfo] done.
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ /workspace/envs/rlbench/bin/python -m eval.run_rlbench_rollout_eval --checkpoint /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt --output-dir /workspace/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item --tasks bimanual_handover_item --episodes-per-task 1 --episode-length 20 --resolution 224 --device cuda --chunk-commit-steps 4 --headless --plan --allow-unsupervised-planning