2026-03-25 runpod handoff update
Browse filesUpload updated code, tests, environment recreation files, generated proxy datasets, new checkpoints, and raw result artifacts from the 2026-03-25 /workspace runpod session.
This view is limited to 50 files because it contains too many changes. See raw diff
- README.md +62 -158
- artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt +3 -0
- artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt +3 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt +3 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/config_resolved.yaml +153 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/metrics.json +179 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/summary.json +103 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt +3 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/config_resolved.yaml +153 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/metrics.json +297 -0
- artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/summary.json +103 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt +3 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/config_resolved.yaml +153 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/metrics.json +179 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/summary.json +103 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt +3 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/config_resolved.yaml +153 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/metrics.json +238 -0
- artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/summary.json +103 -0
- code/reveal_vla_bimanual/eval/ablations.py +6 -4
- code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py +143 -0
- code/reveal_vla_bimanual/eval/run_ablations.py +17 -1
- code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py +28 -0
- code/reveal_vla_bimanual/eval/run_reveal_benchmark.py +141 -22
- code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py +60 -4
- code/reveal_vla_bimanual/eval/run_teacher_audit.py +115 -0
- code/reveal_vla_bimanual/models/action_decoder.py +106 -4
- code/reveal_vla_bimanual/models/backbones.py +73 -43
- code/reveal_vla_bimanual/models/multiview_fusion.py +9 -1
- code/reveal_vla_bimanual/models/observation_memory.py +56 -18
- code/reveal_vla_bimanual/models/planner.py +64 -6
- code/reveal_vla_bimanual/models/policy.py +84 -7
- code/reveal_vla_bimanual/models/reveal_head.py +161 -7
- code/reveal_vla_bimanual/models/world_model.py +207 -4
- code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh +107 -0
- code/reveal_vla_bimanual/sim_reveal/dataset.py +58 -4
- code/reveal_vla_bimanual/sim_reveal/procedural_envs.py +296 -9
- code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact.yaml +150 -0
- code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase.yaml +73 -0
- code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial.yaml +150 -0
- code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase.yaml +73 -0
- code/reveal_vla_bimanual/train/losses.py +46 -6
- results/2026-03-25-runpod/README.md +124 -0
- results/2026-03-25-runpod/instructions.md +717 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/command.txt +1 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.json +333 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.md +14 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stderr.txt +4 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stdout.txt +334 -0
- results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item/command.txt +1 -0
README.md
CHANGED
|
@@ -9,195 +9,99 @@ tags:
|
|
| 9 |
|
| 10 |
# VLAarchtests
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
##
|
| 15 |
|
| 16 |
- `code/reveal_vla_bimanual/`
|
| 17 |
-
- current project code
|
| 18 |
-
- `artifacts/outputs/r3d/`
|
| 19 |
-
- R3D-VLA proxy checkpoints, benchmarks, diagnostics, RLBench outputs, and PerAct2 smoke artifacts
|
| 20 |
-
- `regression/baselines.md`
|
| 21 |
-
- locked historical baselines from the downloaded snapshot
|
| 22 |
-
- `results/phase_tracking.md`
|
| 23 |
-
- phase-by-phase gate accounting and acceptance status
|
| 24 |
- `tests/`
|
| 25 |
-
- unit tests for RGB, RGB-D, planner, memory, world-model, and dataset contracts
|
| 26 |
- `environment/`
|
| 27 |
-
|
| 28 |
-
- `
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
-
##
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
- Added the repo-preserving R3D-VLA refactor with updates in:
|
| 36 |
- `code/reveal_vla_bimanual/models/backbones.py`
|
| 37 |
- `code/reveal_vla_bimanual/models/multiview_fusion.py`
|
|
|
|
|
|
|
| 38 |
- `code/reveal_vla_bimanual/models/observation_memory.py`
|
| 39 |
- `code/reveal_vla_bimanual/models/reveal_head.py`
|
| 40 |
- `code/reveal_vla_bimanual/models/world_model.py`
|
|
|
|
| 41 |
- `code/reveal_vla_bimanual/models/action_decoder.py`
|
| 42 |
- `code/reveal_vla_bimanual/models/planner.py`
|
| 43 |
-
|
| 44 |
- `code/reveal_vla_bimanual/train/losses.py`
|
| 45 |
-
- `code/reveal_vla_bimanual/train/smoke_checks.py`
|
| 46 |
- `code/reveal_vla_bimanual/sim_reveal/dataset.py`
|
| 47 |
- `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
|
| 48 |
-
-
|
| 49 |
-
- `
|
| 50 |
-
- `
|
| 51 |
-
- `
|
| 52 |
-
- `
|
| 53 |
-
- `
|
| 54 |
-
- `
|
| 55 |
-
- `proxy_interaction_r3d_ablation_noplanner.yaml`
|
| 56 |
-
- `proxy_interaction_r3d_ablation_nowm.yaml`
|
| 57 |
-
- `proxy_interaction_r3d_ablation_norolesym.yaml`
|
| 58 |
-
|
| 59 |
-
### Evaluation and Integration Changes
|
| 60 |
-
|
| 61 |
-
- Completed the requested proxy matrix:
|
| 62 |
-
- stage 1 dummy, 3 seeds
|
| 63 |
-
- stage 1 dummy `no_planner`
|
| 64 |
-
- stage 1 dummy `no_role_symmetry`
|
| 65 |
-
- stage 2 dummy, 3 seeds
|
| 66 |
-
- stage 2 dummy `no_world_model`
|
| 67 |
-
- stage 2 dummy `short_history`
|
| 68 |
-
- stage 1 clip, 3 seeds
|
| 69 |
-
- stage 2 clip, 3 seeds
|
| 70 |
-
- stage 3 clip RGB-D, 3 seeds
|
| 71 |
-
- stage 3 clip RGB-D `no_depth`
|
| 72 |
-
- Completed RLBench integration artifacts:
|
| 73 |
-
- import/config smoke
|
| 74 |
-
- `open_drawer` launch smoke
|
| 75 |
-
- `open_drawer` rollout with JSON output
|
| 76 |
-
- Added a dedicated PerAct2 13-task launch smoke harness:
|
| 77 |
-
- `code/reveal_vla_bimanual/eval/run_peract2_launch_smoke.py`
|
| 78 |
-
- `code/reveal_vla_bimanual/sim_rlbench/launch_smoke.py` now records finite-action checks
|
| 79 |
-
|
| 80 |
-
### Tests
|
| 81 |
-
|
| 82 |
-
- Full local test suite result:
|
| 83 |
-
- `10 passed`
|
| 84 |
-
|
| 85 |
-
## Same-Machine Setup Files
|
| 86 |
-
|
| 87 |
-
- `environment/setup_same_machine.sh`
|
| 88 |
-
- `environment/validate_same_machine.sh`
|
| 89 |
-
- `environment/run_peract2_13_rollouts.sh`
|
| 90 |
-
- `environment/runtime_env_vars.sh`
|
| 91 |
-
- `environment/hardware_snapshot.txt`
|
| 92 |
-
- `environment/glxinfo_B.txt`
|
| 93 |
-
- `environment/upstream_revisions.txt`
|
| 94 |
-
- `environment/system_packages_same_machine.txt`
|
| 95 |
-
- `environment/rlbench_env_export.yaml`
|
| 96 |
-
- `environment/rlbench_env_explicit.txt`
|
| 97 |
-
- `environment/rlbench_pip_freeze.txt`
|
| 98 |
-
- `environment/reveal_env_export.yaml`
|
| 99 |
-
- `environment/reveal_env_explicit.txt`
|
| 100 |
-
- `environment/reveal_pip_freeze.txt`
|
| 101 |
-
|
| 102 |
-
## Raw Proxy Benchmark Matrix
|
| 103 |
-
|
| 104 |
-
| Run | Seeds | Mean success | foliage | bag | cloth | Reocclusion | Persistence MAE | Disturbance | Planner top-1 | Proposal diversity | Swap error |
|
| 105 |
-
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
| 106 |
-
| stage1 dummy full | `13,14,15` | 0.5787 | 0.4444 | 0.6111 | 0.6806 | 0.0000 | 1.9553 | 0.3649 | 0.2832 | 0.0245 | 0.007680 |
|
| 107 |
-
| stage1 dummy `no_planner` | `13,14,15` | 0.5648 | 0.4306 | 0.5972 | 0.6667 | 0.0000 | 1.9599 | 0.3765 | n/a | n/a | n/a |
|
| 108 |
-
| stage1 dummy `no_role_symmetry` | `13,14,15` | 0.5833 | 0.4583 | 0.6111 | 0.6806 | 0.0000 | 1.9475 | 0.3547 | n/a | n/a | n/a |
|
| 109 |
-
| stage2 dummy full | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0121 | 2.2358 | 0.3148 | 0.3442 | 0.0245 | 0.005036 |
|
| 110 |
-
| stage2 dummy `no_world_model` | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0027 | 2.3600 | 0.3287 | n/a | n/a | n/a |
|
| 111 |
-
| stage2 dummy `short_history` | `21,22,23` | 0.5463 | 0.4444 | 0.5417 | 0.6528 | 0.0121 | 2.2349 | 0.3148 | n/a | n/a | n/a |
|
| 112 |
-
| stage1 clip full | `7,8,9` | 0.5324 | 0.4306 | 0.5278 | 0.6389 | 0.0244 | 1.3636 | 0.2808 | 0.2676 | 0.0217 | 0.000155 |
|
| 113 |
-
| stage2 clip full | `11,12,13` | 0.4954 | 0.3889 | 0.4583 | 0.6389 | 0.0117 | 2.3198 | 0.2722 | 0.2693 | 0.0216 | 0.000186 |
|
| 114 |
-
| stage3 clip RGB-D full | `17,18,19` | 0.5741 | 0.4861 | 0.5417 | 0.6944 | 0.0151 | 1.7883 | 0.2258 | 0.3265 | 0.0270 | 0.000094 |
|
| 115 |
-
| stage3 clip RGB-D `no_depth` | `17,18,19` | 0.5231 | 0.4167 | 0.4722 | 0.6806 | 0.0198 | 2.0491 | 0.2548 | n/a | n/a | n/a |
|
| 116 |
-
|
| 117 |
-
Full artifact roots are indexed in `MODEL_INDEX.md`.
|
| 118 |
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
|
| 124 |
-
| --- | ---: | ---: |
|
| 125 |
-
| stage1 dummy full | 20.45 | 629.62 |
|
| 126 |
-
| stage2 dummy full | 20.76 | 639.39 |
|
| 127 |
-
| stage1 clip full | 156.16 | 1908.92 |
|
| 128 |
-
| stage2 clip full | 141.55 | 1902.54 |
|
| 129 |
-
| stage3 clip RGB-D full | 145.93 | 1952.12 |
|
| 130 |
-
|
| 131 |
-
## Raw RLBench Outputs
|
| 132 |
-
|
| 133 |
-
### Import And Launch Smokes
|
| 134 |
-
|
| 135 |
-
- import/config smoke file:
|
| 136 |
-
- `artifacts/outputs/r3d/rlbench_smokes/smoke_test_output.txt`
|
| 137 |
-
- `open_drawer` launch smoke files:
|
| 138 |
-
- `artifacts/outputs/r3d/rlbench_smokes/launch_smoke_open_drawer.txt`
|
| 139 |
-
- `artifacts/outputs/r3d/rlbench_smokes/launch_smoke_open_drawer.stderr`
|
| 140 |
|
| 141 |
-
|
|
|
|
| 142 |
|
| 143 |
-
|
| 144 |
-
| --- | --- |
|
| 145 |
-
| task | `RightOpenDrawer` |
|
| 146 |
-
| headless | `true` |
|
| 147 |
-
| front_rgb_shape | `[224, 224, 3]` |
|
| 148 |
-
| wrist_left_rgb_shape | `[224, 224, 3]` |
|
| 149 |
-
| wrist_right_rgb_shape | `[224, 224, 3]` |
|
| 150 |
-
| action_finite | `true` |
|
| 151 |
-
| action_dim | `18` |
|
| 152 |
-
| reward | `0.0` |
|
| 153 |
-
| done | `false` |
|
| 154 |
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
| 164 |
-
| --- |
|
| 165 |
-
|
|
| 166 |
-
|
|
| 167 |
-
| task_class | `RightOpenDrawer` |
|
| 168 |
-
| episodes_per_task | `1` |
|
| 169 |
-
| episode_length | `5` |
|
| 170 |
-
| mean_success | `0.0` |
|
| 171 |
-
| mean_return | `0.0` |
|
| 172 |
-
| reset_retries | `[0]` |
|
| 173 |
-
|
| 174 |
-
## Raw PerAct2 Integration Output
|
| 175 |
|
| 176 |
-
|
| 177 |
|
| 178 |
-
-
|
| 179 |
-
- `
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
|
| 182 |
|
| 183 |
-
|
|
| 184 |
-
| --- | --- |
|
| 185 |
-
|
|
| 186 |
-
|
|
| 187 |
-
|
|
| 188 |
-
| error_tasks | `[]` |
|
| 189 |
-
| resolution | `224` |
|
| 190 |
-
| headless | `true` |
|
| 191 |
|
| 192 |
-
|
| 193 |
|
| 194 |
-
- `
|
| 195 |
|
| 196 |
-
##
|
| 197 |
|
| 198 |
-
-
|
| 199 |
-
|
| 200 |
-
-
|
| 201 |
-
|
| 202 |
-
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# VLAarchtests
|
| 11 |
|
| 12 |
+
Update uploaded from the `/workspace` runpod session dated `2026-03-25 UTC`.
|
| 13 |
|
| 14 |
+
## Updated Paths
|
| 15 |
|
| 16 |
- `code/reveal_vla_bimanual/`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
- `tests/`
|
|
|
|
| 18 |
- `environment/`
|
| 19 |
+
- `artifacts/data/reveal_proxy/`
|
| 20 |
+
- `artifacts/outputs/r3d_handoff/`
|
| 21 |
+
- `artifacts/outputs/r3d_handoff_phase/`
|
| 22 |
+
- `results/2026-03-25-runpod/`
|
| 23 |
|
| 24 |
+
## Primary Source Changes
|
| 25 |
|
| 26 |
+
- Geometry path and camera-pose propagation updates:
|
|
|
|
|
|
|
| 27 |
- `code/reveal_vla_bimanual/models/backbones.py`
|
| 28 |
- `code/reveal_vla_bimanual/models/multiview_fusion.py`
|
| 29 |
+
- `code/reveal_vla_bimanual/models/policy.py`
|
| 30 |
+
- Spatial memory and world-model updates:
|
| 31 |
- `code/reveal_vla_bimanual/models/observation_memory.py`
|
| 32 |
- `code/reveal_vla_bimanual/models/reveal_head.py`
|
| 33 |
- `code/reveal_vla_bimanual/models/world_model.py`
|
| 34 |
+
- Semantic candidate and planner updates:
|
| 35 |
- `code/reveal_vla_bimanual/models/action_decoder.py`
|
| 36 |
- `code/reveal_vla_bimanual/models/planner.py`
|
| 37 |
+
- Loss, dataset, and simulator updates:
|
| 38 |
- `code/reveal_vla_bimanual/train/losses.py`
|
|
|
|
| 39 |
- `code/reveal_vla_bimanual/sim_reveal/dataset.py`
|
| 40 |
- `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
|
| 41 |
+
- Evaluation and RLBench tooling updates:
|
| 42 |
+
- `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
|
| 43 |
+
- `code/reveal_vla_bimanual/eval/run_teacher_audit.py`
|
| 44 |
+
- `code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py`
|
| 45 |
+
- `code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py`
|
| 46 |
+
- `code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py`
|
| 47 |
+
- `code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
## Validation
|
| 50 |
|
| 51 |
+
- Test command:
|
| 52 |
+
- `PYTHONPATH=/workspace/VLAarchtests_work/code/reveal_vla_bimanual python -m pytest -q /workspace/VLAarchtests_work/tests`
|
| 53 |
+
- Result:
|
| 54 |
+
- `33 passed`
|
| 55 |
|
| 56 |
+
## Generated Datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
- `artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt`
|
| 59 |
+
- `artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt`
|
| 60 |
|
| 61 |
+
## Generated Checkpoints
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
- `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/`
|
| 64 |
+
- `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/`
|
| 65 |
+
- `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/`
|
| 66 |
+
- `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/`
|
| 67 |
|
| 68 |
+
## Raw Result Summary
|
| 69 |
|
| 70 |
+
### Proxy Serious Comparisons
|
| 71 |
|
| 72 |
+
| File | Reference mean success | Compared mean success |
|
| 73 |
+
| --- | ---: | ---: |
|
| 74 |
+
| `results/2026-03-25-runpod/reports/reveal_handoff_compare_serious/reveal_benchmark.json` | 0.583333 | 0.216667 |
|
| 75 |
+
| `results/2026-03-25-runpod/reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.520000 |
|
| 76 |
+
| `results/2026-03-25-runpod/reports/reveal_phase_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.513333 |
|
| 77 |
+
| `results/2026-03-25-runpod/reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json` | 0.583333 | 0.493333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
+
### Proxy Ablations
|
| 80 |
|
| 81 |
+
- Full ablation matrix:
|
| 82 |
+
- `results/2026-03-25-runpod/reports/reveal_phase_ablations_compact/ablations.json`
|
| 83 |
+
- Teacher audit:
|
| 84 |
+
- `results/2026-03-25-runpod/reports/reveal_teacher_audit_serious/teacher_audit.json`
|
| 85 |
|
| 86 |
+
### RLBench
|
| 87 |
|
| 88 |
+
| File | Mean success |
|
| 89 |
+
| --- | ---: |
|
| 90 |
+
| `results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/rollout_eval.json` | 0.000000 |
|
| 91 |
+
| `results/2026-03-25-runpod/reports/peract2_spatial_full_ep1/spatial_phase_seed17_noplan_split/rollout_eval.json` | 0.000000 |
|
| 92 |
+
| `results/2026-03-25-runpod/reports/peract2_spatial_full_ep1/spatial_phase_seed17_plan_split/rollout_eval.json` | 0.000000 |
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
## Detailed Raw Index
|
| 95 |
|
| 96 |
+
- `results/2026-03-25-runpod/README.md`
|
| 97 |
|
| 98 |
+
## Environment Recreation
|
| 99 |
|
| 100 |
+
- `environment/README.md`
|
| 101 |
+
- `environment/setup_same_machine.sh`
|
| 102 |
+
- `environment/validate_same_machine.sh`
|
| 103 |
+
- `environment/runtime_env_vars.sh`
|
| 104 |
+
- `environment/upstream_revisions.txt`
|
| 105 |
+
- `environment/rlbench_env_export.yaml`
|
| 106 |
+
- `environment/rlbench_env_explicit.txt`
|
| 107 |
+
- `environment/rlbench_pip_freeze.txt`
|
artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:918679191157abb31b3523be4b69ff7b95da4c373d130dd24a0db1314b57ec19
|
| 3 |
+
size 583377508
|
artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1b1b90c882067eba271f59430b783c7fda2edbe6f221360a245ef32eef602d1
|
| 3 |
+
size 200844508
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:360037583572613590e99f5f06766729edbecfc4ee2ea0950fe12bb08e562d83
|
| 3 |
+
size 940662478
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/config_resolved.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies:
|
| 9 |
+
- foliage_proxy
|
| 10 |
+
- bag_proxy
|
| 11 |
+
- cloth_proxy
|
| 12 |
+
resolution: 224
|
| 13 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state
|
| 14 |
+
train_episodes_per_proxy: 48
|
| 15 |
+
val_episodes_per_proxy: 16
|
| 16 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
|
| 17 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
|
| 18 |
+
rebuild_dataset: false
|
| 19 |
+
chunk_horizon: 8
|
| 20 |
+
rollout_horizon: 5
|
| 21 |
+
history_steps: 6
|
| 22 |
+
planner_candidates: 8
|
| 23 |
+
seed: 17
|
| 24 |
+
optim:
|
| 25 |
+
epochs: 3
|
| 26 |
+
batch_size: 4
|
| 27 |
+
num_workers: 24
|
| 28 |
+
lr: 0.0001
|
| 29 |
+
weight_decay: 0.0001
|
| 30 |
+
trainer:
|
| 31 |
+
policy_type: elastic_reveal
|
| 32 |
+
use_bf16: true
|
| 33 |
+
grad_clip_norm: 1.0
|
| 34 |
+
freeze_backbone: true
|
| 35 |
+
gradient_checkpointing: false
|
| 36 |
+
plan_during_train: true
|
| 37 |
+
plan_during_eval: true
|
| 38 |
+
support_mode_conditioning: true
|
| 39 |
+
planner_mode: trainable
|
| 40 |
+
use_depth: true
|
| 41 |
+
use_world_model: true
|
| 42 |
+
use_role_tokens: true
|
| 43 |
+
compute_equivariance_probe: false
|
| 44 |
+
policy:
|
| 45 |
+
backbone:
|
| 46 |
+
model_name: openai/clip-vit-base-patch32
|
| 47 |
+
hidden_dim: 512
|
| 48 |
+
max_text_tokens: 32
|
| 49 |
+
freeze_backbone: true
|
| 50 |
+
gradient_checkpointing: false
|
| 51 |
+
use_dummy_backbone: false
|
| 52 |
+
fusion:
|
| 53 |
+
hidden_dim: 512
|
| 54 |
+
num_cameras: 3
|
| 55 |
+
num_layers: 4
|
| 56 |
+
num_heads: 8
|
| 57 |
+
ff_dim: 2048
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
proprio_dim: 32
|
| 60 |
+
proprio_tokens: 1
|
| 61 |
+
memory:
|
| 62 |
+
hidden_dim: 512
|
| 63 |
+
action_dim: 14
|
| 64 |
+
history_steps: 6
|
| 65 |
+
scene_history_steps: 3
|
| 66 |
+
belief_history_steps: 8
|
| 67 |
+
num_layers: 2
|
| 68 |
+
dropout: 0.1
|
| 69 |
+
memory_bank_size: 4
|
| 70 |
+
scene_bank_size: 2
|
| 71 |
+
belief_bank_size: 2
|
| 72 |
+
num_heads: 8
|
| 73 |
+
max_history_steps: 8
|
| 74 |
+
decoder:
|
| 75 |
+
hidden_dim: 512
|
| 76 |
+
num_heads: 8
|
| 77 |
+
num_layers: 4
|
| 78 |
+
ff_dim: 2048
|
| 79 |
+
dropout: 0.1
|
| 80 |
+
chunk_size: 8
|
| 81 |
+
action_dim: 14
|
| 82 |
+
arm_action_dim: 7
|
| 83 |
+
num_candidates: 8
|
| 84 |
+
num_phases: 5
|
| 85 |
+
num_arm_roles: 4
|
| 86 |
+
num_proposal_modes: 7
|
| 87 |
+
planner_top_k: 4
|
| 88 |
+
reveal_head:
|
| 89 |
+
hidden_dim: 512
|
| 90 |
+
num_support_modes: 3
|
| 91 |
+
num_approach_templates: 32
|
| 92 |
+
rollout_horizon: 5
|
| 93 |
+
belief_map_size: 32
|
| 94 |
+
field_size: 16
|
| 95 |
+
num_heads: 8
|
| 96 |
+
predict_belief_map: true
|
| 97 |
+
num_phases: 5
|
| 98 |
+
num_arm_roles: 4
|
| 99 |
+
num_interaction_tokens: 8
|
| 100 |
+
num_tasks: 4
|
| 101 |
+
world_model:
|
| 102 |
+
hidden_dim: 512
|
| 103 |
+
action_dim: 14
|
| 104 |
+
num_support_modes: 3
|
| 105 |
+
num_approach_templates: 32
|
| 106 |
+
rollout_horizon: 5
|
| 107 |
+
field_size: 16
|
| 108 |
+
num_heads: 8
|
| 109 |
+
num_phases: 5
|
| 110 |
+
num_arm_roles: 4
|
| 111 |
+
num_interaction_tokens: 8
|
| 112 |
+
belief_map_size: 32
|
| 113 |
+
predict_belief_map: true
|
| 114 |
+
scene_bank_size: 2
|
| 115 |
+
belief_bank_size: 2
|
| 116 |
+
rollout_mode: compact_rollout
|
| 117 |
+
num_tasks: 4
|
| 118 |
+
planner:
|
| 119 |
+
hidden_dim: 512
|
| 120 |
+
num_candidates: 8
|
| 121 |
+
action_dim: 14
|
| 122 |
+
num_support_modes: 3
|
| 123 |
+
utility_margin: 0.1
|
| 124 |
+
num_heads: 8
|
| 125 |
+
num_layers: 2
|
| 126 |
+
num_phases: 5
|
| 127 |
+
num_arm_roles: 4
|
| 128 |
+
top_k: 4
|
| 129 |
+
loss_weights:
|
| 130 |
+
action: 1.0
|
| 131 |
+
phase: 0.05
|
| 132 |
+
arm_role: 0.1
|
| 133 |
+
support_mode: 0.1
|
| 134 |
+
corridor: 0.12
|
| 135 |
+
persistence: 0.06
|
| 136 |
+
disturbance: 0.06
|
| 137 |
+
world_model: 0.2
|
| 138 |
+
belief: 0.05
|
| 139 |
+
visibility: 0.05
|
| 140 |
+
clearance: 0.06
|
| 141 |
+
support_stability: 0.06
|
| 142 |
+
reocclusion: 0.06
|
| 143 |
+
occluder_contact: 0.05
|
| 144 |
+
grasp_affordance: 0.05
|
| 145 |
+
planner_success: 0.2
|
| 146 |
+
planner_risk: 0.08
|
| 147 |
+
planner_ranking: 0.2
|
| 148 |
+
proposal_reconstruction: 0.08
|
| 149 |
+
proposal_success: 0.12
|
| 150 |
+
proposal_ranking: 0.15
|
| 151 |
+
proposal_diversity: 0.05
|
| 152 |
+
role_swap_consistency: 0.02
|
| 153 |
+
task_metrics: 0.05
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/metrics.json
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 0,
|
| 4 |
+
"train": {
|
| 5 |
+
"action": 0.01788575194874092,
|
| 6 |
+
"arm_role": 0.001019889743704545,
|
| 7 |
+
"belief": 0.12027505389169643,
|
| 8 |
+
"clearance": 0.08003731069988326,
|
| 9 |
+
"corridor": 0.2066852554580883,
|
| 10 |
+
"disturbance": 0.002048209719176061,
|
| 11 |
+
"grasp_affordance": 0.009688427269850907,
|
| 12 |
+
"occluder_contact": 0.21002381004785237,
|
| 13 |
+
"persistence": 0.6925194860485039,
|
| 14 |
+
"phase": 0.2873851528293208,
|
| 15 |
+
"planner_ranking": 0.007970526718954237,
|
| 16 |
+
"planner_risk": 0.022958766870004567,
|
| 17 |
+
"planner_success": 0.11003280751603214,
|
| 18 |
+
"proposal_diversity": 0.0,
|
| 19 |
+
"proposal_ranking": 0.3698378854676297,
|
| 20 |
+
"proposal_reconstruction": 0.10649478184549432,
|
| 21 |
+
"proposal_success": 0.3088213802952515,
|
| 22 |
+
"reocclusion": 0.23717812233065305,
|
| 23 |
+
"role_swap_consistency": 0.0,
|
| 24 |
+
"support_mode": 0.024943622789884868,
|
| 25 |
+
"support_stability": 0.12860428792865652,
|
| 26 |
+
"task_metrics": 0.16847629100084305,
|
| 27 |
+
"total": 0.7690052716355574,
|
| 28 |
+
"uncertainty": 8.382000443433706e-05,
|
| 29 |
+
"visibility": 0.11110214183205053,
|
| 30 |
+
"world_model": 2.4172904924342506
|
| 31 |
+
},
|
| 32 |
+
"val": {
|
| 33 |
+
"action": 0.01380122694271532,
|
| 34 |
+
"arm_role": 0.0007760834101425258,
|
| 35 |
+
"belief": 0.10585840746308818,
|
| 36 |
+
"clearance": 0.07610336713718646,
|
| 37 |
+
"corridor": 0.20833940104101645,
|
| 38 |
+
"disturbance": 0.001970026997503627,
|
| 39 |
+
"grasp_affordance": 0.009207394397394224,
|
| 40 |
+
"occluder_contact": 0.20593324529402185,
|
| 41 |
+
"persistence": 0.9972314130176197,
|
| 42 |
+
"phase": 0.32413111886743345,
|
| 43 |
+
"planner_ranking": 0.00022911480162733687,
|
| 44 |
+
"planner_risk": 0.01705723936020425,
|
| 45 |
+
"planner_success": 0.01417768012845155,
|
| 46 |
+
"proposal_diversity": 0.0,
|
| 47 |
+
"proposal_ranking": 0.148933302498225,
|
| 48 |
+
"proposal_reconstruction": 0.08428755696072723,
|
| 49 |
+
"proposal_success": 0.1355827044356953,
|
| 50 |
+
"reocclusion": 0.3386235964117628,
|
| 51 |
+
"role_swap_consistency": 0.0,
|
| 52 |
+
"support_mode": 0.00921160001025507,
|
| 53 |
+
"support_stability": 0.13572556762532753,
|
| 54 |
+
"task_metrics": 0.14731343361464413,
|
| 55 |
+
"total": 0.5933836915276267,
|
| 56 |
+
"uncertainty": 6.096601287991331e-05,
|
| 57 |
+
"visibility": 0.0983928834850138,
|
| 58 |
+
"world_model": 1.8323120962489734
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 1,
|
| 63 |
+
"train": {
|
| 64 |
+
"action": 0.011145903746058282,
|
| 65 |
+
"arm_role": 0.004716875678614566,
|
| 66 |
+
"belief": 0.10514058776591953,
|
| 67 |
+
"clearance": 0.07779615305756268,
|
| 68 |
+
"corridor": 0.20047297412529588,
|
| 69 |
+
"disturbance": 0.0022256996764458323,
|
| 70 |
+
"grasp_affordance": 0.009927417171236717,
|
| 71 |
+
"occluder_contact": 0.20330693313949985,
|
| 72 |
+
"persistence": 0.6114087605882608,
|
| 73 |
+
"phase": 0.21543656336633782,
|
| 74 |
+
"planner_ranking": 0.00018106158740920363,
|
| 75 |
+
"planner_risk": 0.015639749421787107,
|
| 76 |
+
"planner_success": 0.007014186491601561,
|
| 77 |
+
"proposal_diversity": 0.0,
|
| 78 |
+
"proposal_ranking": 0.08341658792801593,
|
| 79 |
+
"proposal_reconstruction": 0.07808616882876346,
|
| 80 |
+
"proposal_success": 0.08362671854464632,
|
| 81 |
+
"reocclusion": 0.21602793348659027,
|
| 82 |
+
"role_swap_consistency": 0.0,
|
| 83 |
+
"support_mode": 0.002888570647490652,
|
| 84 |
+
"support_stability": 0.1253421003685186,
|
| 85 |
+
"task_metrics": 0.14696427873874965,
|
| 86 |
+
"total": 0.4685811519622803,
|
| 87 |
+
"uncertainty": 3.833678546901578e-05,
|
| 88 |
+
"visibility": 0.09538611636350029,
|
| 89 |
+
"world_model": 1.5017830422050074
|
| 90 |
+
},
|
| 91 |
+
"val": {
|
| 92 |
+
"action": 0.011654359860007058,
|
| 93 |
+
"arm_role": 0.0034928608396457453,
|
| 94 |
+
"belief": 0.09692509336905046,
|
| 95 |
+
"clearance": 0.07511017166755417,
|
| 96 |
+
"corridor": 0.19370697032321582,
|
| 97 |
+
"disturbance": 0.0026899561648447575,
|
| 98 |
+
"grasp_affordance": 0.0108991796574132,
|
| 99 |
+
"occluder_contact": 0.20079099861058322,
|
| 100 |
+
"persistence": 0.7645651453146429,
|
| 101 |
+
"phase": 0.3479848448751551,
|
| 102 |
+
"planner_ranking": 7.394229859611104e-05,
|
| 103 |
+
"planner_risk": 0.015624796357852492,
|
| 104 |
+
"planner_success": 0.004826839748685333,
|
| 105 |
+
"proposal_diversity": 0.0,
|
| 106 |
+
"proposal_ranking": 0.10358103387283557,
|
| 107 |
+
"proposal_reconstruction": 0.0756011808460409,
|
| 108 |
+
"proposal_success": 0.07432993885242578,
|
| 109 |
+
"reocclusion": 0.2191494649106806,
|
| 110 |
+
"role_swap_consistency": 0.0,
|
| 111 |
+
"support_mode": 0.0027684949190271172,
|
| 112 |
+
"support_stability": 0.1332334725919998,
|
| 113 |
+
"task_metrics": 0.1422290007273356,
|
| 114 |
+
"total": 0.4283526196624293,
|
| 115 |
+
"uncertainty": 6.312012101950654e-05,
|
| 116 |
+
"visibility": 0.09116258178696487,
|
| 117 |
+
"world_model": 1.2195359631018206
|
| 118 |
+
}
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"epoch": 2,
|
| 122 |
+
"train": {
|
| 123 |
+
"action": 0.009001491786176829,
|
| 124 |
+
"arm_role": 0.0031435395541944003,
|
| 125 |
+
"belief": 0.09910694141136972,
|
| 126 |
+
"clearance": 0.0763780973459545,
|
| 127 |
+
"corridor": 0.1971655124798417,
|
| 128 |
+
"disturbance": 0.0022866554429607565,
|
| 129 |
+
"grasp_affordance": 0.010611724677054506,
|
| 130 |
+
"occluder_contact": 0.19787568214692566,
|
| 131 |
+
"persistence": 0.6391040890734171,
|
| 132 |
+
"phase": 0.2374798740211286,
|
| 133 |
+
"planner_ranking": 0.00014213861397461427,
|
| 134 |
+
"planner_risk": 0.011339436628316579,
|
| 135 |
+
"planner_success": 0.002072299244249926,
|
| 136 |
+
"proposal_diversity": 0.0,
|
| 137 |
+
"proposal_ranking": 0.06418653756478115,
|
| 138 |
+
"proposal_reconstruction": 0.07331816969733489,
|
| 139 |
+
"proposal_success": 0.04838304229472813,
|
| 140 |
+
"reocclusion": 0.21607661038931264,
|
| 141 |
+
"role_swap_consistency": 0.0,
|
| 142 |
+
"support_mode": 0.0016624049136513158,
|
| 143 |
+
"support_stability": 0.12395783413789774,
|
| 144 |
+
"task_metrics": 0.14624216885943161,
|
| 145 |
+
"total": 0.38278010518927325,
|
| 146 |
+
"uncertainty": 2.0790473172083742e-05,
|
| 147 |
+
"visibility": 0.09234962655525458,
|
| 148 |
+
"world_model": 1.1216850475261086
|
| 149 |
+
},
|
| 150 |
+
"val": {
|
| 151 |
+
"action": 0.010441844330423257,
|
| 152 |
+
"arm_role": 0.0016049532427904055,
|
| 153 |
+
"belief": 0.09470506825230339,
|
| 154 |
+
"clearance": 0.07375384743014972,
|
| 155 |
+
"corridor": 0.19222540205175226,
|
| 156 |
+
"disturbance": 0.0019139318584083494,
|
| 157 |
+
"grasp_affordance": 0.010620580760366989,
|
| 158 |
+
"occluder_contact": 0.1952310868284919,
|
| 159 |
+
"persistence": 0.6828906978621627,
|
| 160 |
+
"phase": 0.2299347263901974,
|
| 161 |
+
"planner_ranking": 0.00015113159439682602,
|
| 162 |
+
"planner_risk": 0.008906871584864954,
|
| 163 |
+
"planner_success": 0.0014728186387484047,
|
| 164 |
+
"proposal_diversity": 0.0,
|
| 165 |
+
"proposal_ranking": 0.08296609080086152,
|
| 166 |
+
"proposal_reconstruction": 0.07454461446314146,
|
| 167 |
+
"proposal_success": 0.05527880562074257,
|
| 168 |
+
"reocclusion": 0.21840402348475021,
|
| 169 |
+
"role_swap_consistency": 0.0,
|
| 170 |
+
"support_mode": 0.0009884886958664565,
|
| 171 |
+
"support_stability": 0.13373579051006923,
|
| 172 |
+
"task_metrics": 0.14126789863362457,
|
| 173 |
+
"total": 0.3722763684662906,
|
| 174 |
+
"uncertainty": 3.5526475199510585e-05,
|
| 175 |
+
"visibility": 0.08942856323538405,
|
| 176 |
+
"world_model": 1.038636032379035
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
]
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/summary.json
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17",
|
| 3 |
+
"device": "cuda",
|
| 4 |
+
"best_checkpoint": "/workspace/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/checkpoint_best.pt",
|
| 5 |
+
"final_train_total": 0.38278010518927325,
|
| 6 |
+
"final_val_total": 0.3722763684662906,
|
| 7 |
+
"train_time_sec": 108.92269134521484,
|
| 8 |
+
"peak_gpu_memory_mb": 2451.3857421875,
|
| 9 |
+
"num_train_samples": 380,
|
| 10 |
+
"num_val_samples": 131,
|
| 11 |
+
"planner_mode": "trainable",
|
| 12 |
+
"frozen_modules": [],
|
| 13 |
+
"init_info": {
|
| 14 |
+
"path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 15 |
+
"loaded_keys": 828,
|
| 16 |
+
"skipped_shape_mismatch_keys": [
|
| 17 |
+
"decoder.proposal_mode_head.3.weight",
|
| 18 |
+
"decoder.proposal_mode_head.3.bias",
|
| 19 |
+
"decoder.proposal_mode_embeddings.weight"
|
| 20 |
+
],
|
| 21 |
+
"missing_keys": [
|
| 22 |
+
"decoder.task_embedding.weight",
|
| 23 |
+
"decoder.proposal_mode_head.3.weight",
|
| 24 |
+
"decoder.proposal_mode_head.3.bias",
|
| 25 |
+
"decoder.proposal_mode_embeddings.weight",
|
| 26 |
+
"decoder.mode_residual_heads.6.0.weight",
|
| 27 |
+
"decoder.mode_residual_heads.6.0.bias",
|
| 28 |
+
"decoder.mode_residual_heads.6.1.weight",
|
| 29 |
+
"decoder.mode_residual_heads.6.1.bias",
|
| 30 |
+
"decoder.mode_residual_heads.6.3.weight",
|
| 31 |
+
"decoder.mode_residual_heads.6.3.bias",
|
| 32 |
+
"elastic_state_head.decoder.task_embedding.weight",
|
| 33 |
+
"elastic_state_head.decoder.task_field_affine.weight",
|
| 34 |
+
"elastic_state_head.decoder.task_field_affine.bias",
|
| 35 |
+
"elastic_state_head.decoder.task_summary_adapter.0.weight",
|
| 36 |
+
"elastic_state_head.decoder.task_summary_adapter.0.bias",
|
| 37 |
+
"elastic_state_head.decoder.task_summary_adapter.1.weight",
|
| 38 |
+
"elastic_state_head.decoder.task_summary_adapter.1.bias",
|
| 39 |
+
"elastic_state_head.decoder.task_phase_head.weight",
|
| 40 |
+
"elastic_state_head.decoder.task_phase_head.bias",
|
| 41 |
+
"elastic_state_head.decoder.task_support_head.weight",
|
| 42 |
+
"elastic_state_head.decoder.task_support_head.bias",
|
| 43 |
+
"elastic_state_head.decoder.task_reocclusion_head.weight",
|
| 44 |
+
"elastic_state_head.decoder.task_reocclusion_head.bias",
|
| 45 |
+
"elastic_state_head.decoder.task_metric_head.0.weight",
|
| 46 |
+
"elastic_state_head.decoder.task_metric_head.0.bias",
|
| 47 |
+
"elastic_state_head.decoder.task_metric_head.1.weight",
|
| 48 |
+
"elastic_state_head.decoder.task_metric_head.1.bias",
|
| 49 |
+
"elastic_state_head.decoder.task_metric_head.3.weight",
|
| 50 |
+
"elastic_state_head.decoder.task_metric_head.3.bias",
|
| 51 |
+
"world_model.task_embedding.weight",
|
| 52 |
+
"world_model.spatial_field_encoder.0.weight",
|
| 53 |
+
"world_model.spatial_field_encoder.0.bias",
|
| 54 |
+
"world_model.spatial_field_encoder.2.weight",
|
| 55 |
+
"world_model.spatial_field_encoder.2.bias",
|
| 56 |
+
"world_model.spatial_context_proj.0.weight",
|
| 57 |
+
"world_model.spatial_context_proj.0.bias",
|
| 58 |
+
"world_model.spatial_context_proj.1.weight",
|
| 59 |
+
"world_model.spatial_context_proj.1.bias",
|
| 60 |
+
"world_model.spatial_gate_z.weight",
|
| 61 |
+
"world_model.spatial_gate_z.bias",
|
| 62 |
+
"world_model.spatial_gate_r.weight",
|
| 63 |
+
"world_model.spatial_gate_r.bias",
|
| 64 |
+
"world_model.spatial_candidate.weight",
|
| 65 |
+
"world_model.spatial_candidate.bias",
|
| 66 |
+
"world_model.spatial_summary_proj.0.weight",
|
| 67 |
+
"world_model.spatial_summary_proj.0.bias",
|
| 68 |
+
"world_model.spatial_summary_proj.1.weight",
|
| 69 |
+
"world_model.spatial_summary_proj.1.bias",
|
| 70 |
+
"world_model.spatial_phase_head.weight",
|
| 71 |
+
"world_model.spatial_phase_head.bias",
|
| 72 |
+
"world_model.spatial_support_mode_head.weight",
|
| 73 |
+
"world_model.spatial_support_mode_head.bias",
|
| 74 |
+
"world_model.spatial_arm_role_head.weight",
|
| 75 |
+
"world_model.spatial_arm_role_head.bias",
|
| 76 |
+
"world_model.spatial_reocclusion_head.weight",
|
| 77 |
+
"world_model.spatial_reocclusion_head.bias",
|
| 78 |
+
"world_model.spatial_target_belief_head.weight",
|
| 79 |
+
"world_model.spatial_target_belief_head.bias",
|
| 80 |
+
"world_model.spatial_visibility_head.weight",
|
| 81 |
+
"world_model.spatial_visibility_head.bias",
|
| 82 |
+
"world_model.spatial_clearance_head.weight",
|
| 83 |
+
"world_model.spatial_clearance_head.bias",
|
| 84 |
+
"world_model.spatial_occluder_contact_head.weight",
|
| 85 |
+
"world_model.spatial_occluder_contact_head.bias",
|
| 86 |
+
"world_model.spatial_grasp_affordance_head.weight",
|
| 87 |
+
"world_model.spatial_grasp_affordance_head.bias",
|
| 88 |
+
"world_model.spatial_support_stability_head.weight",
|
| 89 |
+
"world_model.spatial_support_stability_head.bias",
|
| 90 |
+
"world_model.spatial_persistence_head.weight",
|
| 91 |
+
"world_model.spatial_persistence_head.bias",
|
| 92 |
+
"world_model.spatial_reocclusion_field_head.weight",
|
| 93 |
+
"world_model.spatial_reocclusion_field_head.bias",
|
| 94 |
+
"world_model.spatial_disturbance_head.weight",
|
| 95 |
+
"world_model.spatial_disturbance_head.bias",
|
| 96 |
+
"world_model.spatial_uncertainty_head.weight",
|
| 97 |
+
"world_model.spatial_uncertainty_head.bias",
|
| 98 |
+
"world_model.spatial_access_head.weight",
|
| 99 |
+
"world_model.spatial_access_head.bias"
|
| 100 |
+
],
|
| 101 |
+
"unexpected_keys": []
|
| 102 |
+
}
|
| 103 |
+
}
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1bf52506b13c794b6d8f6f4738294947703c16c6a2c3b46dc8ea68fd14e0c12
|
| 3 |
+
size 940663118
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/config_resolved.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies:
|
| 9 |
+
- foliage_proxy
|
| 10 |
+
- bag_proxy
|
| 11 |
+
- cloth_proxy
|
| 12 |
+
resolution: 224
|
| 13 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state
|
| 14 |
+
train_episodes_per_proxy: 48
|
| 15 |
+
val_episodes_per_proxy: 16
|
| 16 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
|
| 17 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
|
| 18 |
+
rebuild_dataset: false
|
| 19 |
+
chunk_horizon: 8
|
| 20 |
+
rollout_horizon: 5
|
| 21 |
+
history_steps: 6
|
| 22 |
+
planner_candidates: 8
|
| 23 |
+
seed: 17
|
| 24 |
+
optim:
|
| 25 |
+
epochs: 5
|
| 26 |
+
batch_size: 4
|
| 27 |
+
num_workers: 24
|
| 28 |
+
lr: 0.00015
|
| 29 |
+
weight_decay: 0.0001
|
| 30 |
+
trainer:
|
| 31 |
+
policy_type: elastic_reveal
|
| 32 |
+
use_bf16: true
|
| 33 |
+
grad_clip_norm: 1.0
|
| 34 |
+
freeze_backbone: true
|
| 35 |
+
gradient_checkpointing: false
|
| 36 |
+
plan_during_train: true
|
| 37 |
+
plan_during_eval: true
|
| 38 |
+
support_mode_conditioning: true
|
| 39 |
+
planner_mode: trainable
|
| 40 |
+
use_depth: true
|
| 41 |
+
use_world_model: true
|
| 42 |
+
use_role_tokens: true
|
| 43 |
+
compute_equivariance_probe: false
|
| 44 |
+
policy:
|
| 45 |
+
backbone:
|
| 46 |
+
model_name: openai/clip-vit-base-patch32
|
| 47 |
+
hidden_dim: 512
|
| 48 |
+
max_text_tokens: 32
|
| 49 |
+
freeze_backbone: true
|
| 50 |
+
gradient_checkpointing: false
|
| 51 |
+
use_dummy_backbone: false
|
| 52 |
+
fusion:
|
| 53 |
+
hidden_dim: 512
|
| 54 |
+
num_cameras: 3
|
| 55 |
+
num_layers: 4
|
| 56 |
+
num_heads: 8
|
| 57 |
+
ff_dim: 2048
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
proprio_dim: 32
|
| 60 |
+
proprio_tokens: 1
|
| 61 |
+
memory:
|
| 62 |
+
hidden_dim: 512
|
| 63 |
+
action_dim: 14
|
| 64 |
+
history_steps: 6
|
| 65 |
+
scene_history_steps: 3
|
| 66 |
+
belief_history_steps: 8
|
| 67 |
+
num_layers: 2
|
| 68 |
+
dropout: 0.1
|
| 69 |
+
memory_bank_size: 4
|
| 70 |
+
scene_bank_size: 2
|
| 71 |
+
belief_bank_size: 2
|
| 72 |
+
num_heads: 8
|
| 73 |
+
max_history_steps: 8
|
| 74 |
+
decoder:
|
| 75 |
+
hidden_dim: 512
|
| 76 |
+
num_heads: 8
|
| 77 |
+
num_layers: 4
|
| 78 |
+
ff_dim: 2048
|
| 79 |
+
dropout: 0.1
|
| 80 |
+
chunk_size: 8
|
| 81 |
+
action_dim: 14
|
| 82 |
+
arm_action_dim: 7
|
| 83 |
+
num_candidates: 8
|
| 84 |
+
num_phases: 5
|
| 85 |
+
num_arm_roles: 4
|
| 86 |
+
num_proposal_modes: 7
|
| 87 |
+
planner_top_k: 4
|
| 88 |
+
reveal_head:
|
| 89 |
+
hidden_dim: 512
|
| 90 |
+
num_support_modes: 3
|
| 91 |
+
num_approach_templates: 32
|
| 92 |
+
rollout_horizon: 5
|
| 93 |
+
belief_map_size: 32
|
| 94 |
+
field_size: 16
|
| 95 |
+
num_heads: 8
|
| 96 |
+
predict_belief_map: true
|
| 97 |
+
num_phases: 5
|
| 98 |
+
num_arm_roles: 4
|
| 99 |
+
num_interaction_tokens: 8
|
| 100 |
+
num_tasks: 4
|
| 101 |
+
world_model:
|
| 102 |
+
hidden_dim: 512
|
| 103 |
+
action_dim: 14
|
| 104 |
+
num_support_modes: 3
|
| 105 |
+
num_approach_templates: 32
|
| 106 |
+
rollout_horizon: 5
|
| 107 |
+
field_size: 16
|
| 108 |
+
num_heads: 8
|
| 109 |
+
num_phases: 5
|
| 110 |
+
num_arm_roles: 4
|
| 111 |
+
num_interaction_tokens: 8
|
| 112 |
+
belief_map_size: 32
|
| 113 |
+
predict_belief_map: true
|
| 114 |
+
scene_bank_size: 2
|
| 115 |
+
belief_bank_size: 2
|
| 116 |
+
rollout_mode: spatial_rollout
|
| 117 |
+
num_tasks: 4
|
| 118 |
+
planner:
|
| 119 |
+
hidden_dim: 512
|
| 120 |
+
num_candidates: 8
|
| 121 |
+
action_dim: 14
|
| 122 |
+
num_support_modes: 3
|
| 123 |
+
utility_margin: 0.1
|
| 124 |
+
num_heads: 8
|
| 125 |
+
num_layers: 2
|
| 126 |
+
num_phases: 5
|
| 127 |
+
num_arm_roles: 4
|
| 128 |
+
top_k: 4
|
| 129 |
+
loss_weights:
|
| 130 |
+
action: 0.6
|
| 131 |
+
phase: 0.05
|
| 132 |
+
arm_role: 0.1
|
| 133 |
+
support_mode: 0.1
|
| 134 |
+
corridor: 0.15
|
| 135 |
+
persistence: 0.08
|
| 136 |
+
disturbance: 0.08
|
| 137 |
+
world_model: 0.35
|
| 138 |
+
belief: 0.05
|
| 139 |
+
visibility: 0.05
|
| 140 |
+
clearance: 0.08
|
| 141 |
+
support_stability: 0.08
|
| 142 |
+
reocclusion: 0.08
|
| 143 |
+
occluder_contact: 0.05
|
| 144 |
+
grasp_affordance: 0.05
|
| 145 |
+
planner_success: 0.25
|
| 146 |
+
planner_risk: 0.1
|
| 147 |
+
planner_ranking: 0.25
|
| 148 |
+
proposal_reconstruction: 0.05
|
| 149 |
+
proposal_success: 0.2
|
| 150 |
+
proposal_ranking: 0.25
|
| 151 |
+
proposal_diversity: 0.05
|
| 152 |
+
role_swap_consistency: 0.02
|
| 153 |
+
task_metrics: 0.1
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/metrics.json
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 0,
|
| 4 |
+
"train": {
|
| 5 |
+
"action": 0.01881810994328637,
|
| 6 |
+
"arm_role": 2.660249408922697e-07,
|
| 7 |
+
"belief": 0.11447437676159959,
|
| 8 |
+
"clearance": 0.08659498787632114,
|
| 9 |
+
"corridor": 0.2168508012632006,
|
| 10 |
+
"disturbance": 0.003011604138699017,
|
| 11 |
+
"grasp_affordance": 0.026647591468338904,
|
| 12 |
+
"occluder_contact": 0.23407171917589087,
|
| 13 |
+
"persistence": 0.715272118206332,
|
| 14 |
+
"phase": 0.3055733912869504,
|
| 15 |
+
"planner_ranking": 0.022415388478637062,
|
| 16 |
+
"planner_risk": 0.023148000005044436,
|
| 17 |
+
"planner_success": 0.10891202024527286,
|
| 18 |
+
"proposal_diversity": 0.0,
|
| 19 |
+
"proposal_ranking": 0.29254391558076204,
|
| 20 |
+
"proposal_reconstruction": 0.10000501573085785,
|
| 21 |
+
"proposal_success": 0.24436878922738528,
|
| 22 |
+
"reocclusion": 0.23760041894489212,
|
| 23 |
+
"role_swap_consistency": 0.0,
|
| 24 |
+
"support_mode": 0.029171819122214067,
|
| 25 |
+
"support_stability": 0.13612447682964174,
|
| 26 |
+
"task_metrics": 0.15792004442528673,
|
| 27 |
+
"total": 1.1042629627805007,
|
| 28 |
+
"uncertainty": 0.0002569015418885101,
|
| 29 |
+
"visibility": 0.11963542550802231,
|
| 30 |
+
"world_model": 2.1293361720285917
|
| 31 |
+
},
|
| 32 |
+
"val": {
|
| 33 |
+
"action": 0.01775987928902561,
|
| 34 |
+
"arm_role": 1.9868213740892315e-08,
|
| 35 |
+
"belief": 0.1041581260435509,
|
| 36 |
+
"clearance": 0.07728264966245854,
|
| 37 |
+
"corridor": 0.2031804034204194,
|
| 38 |
+
"disturbance": 0.0017973093819102469,
|
| 39 |
+
"grasp_affordance": 0.029909261431770796,
|
| 40 |
+
"occluder_contact": 0.23528439167774085,
|
| 41 |
+
"persistence": 0.775493811025764,
|
| 42 |
+
"phase": 0.3234691350636157,
|
| 43 |
+
"planner_ranking": 0.0003446909185647724,
|
| 44 |
+
"planner_risk": 0.01719488257147146,
|
| 45 |
+
"planner_success": 0.00382949538867582,
|
| 46 |
+
"proposal_diversity": 0.0,
|
| 47 |
+
"proposal_ranking": 0.12049920004651402,
|
| 48 |
+
"proposal_reconstruction": 0.07953478553981493,
|
| 49 |
+
"proposal_success": 0.08874417897878271,
|
| 50 |
+
"reocclusion": 0.2280347410476569,
|
| 51 |
+
"role_swap_consistency": 0.0,
|
| 52 |
+
"support_mode": 0.0012850317253844078,
|
| 53 |
+
"support_stability": 0.14258646694096652,
|
| 54 |
+
"task_metrics": 0.14477597267338724,
|
| 55 |
+
"total": 0.5612193951101014,
|
| 56 |
+
"uncertainty": 5.167457964654948e-05,
|
| 57 |
+
"visibility": 0.09895570508458397,
|
| 58 |
+
"world_model": 0.8950341885740106
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 1,
|
| 63 |
+
"train": {
|
| 64 |
+
"action": 0.011799810049859317,
|
| 65 |
+
"arm_role": 2.227331462659334e-08,
|
| 66 |
+
"belief": 0.10664581923108352,
|
| 67 |
+
"clearance": 0.08073694133444836,
|
| 68 |
+
"corridor": 0.20818061490886305,
|
| 69 |
+
"disturbance": 0.0034403698920198763,
|
| 70 |
+
"grasp_affordance": 0.02138785579682965,
|
| 71 |
+
"occluder_contact": 0.22385836949473933,
|
| 72 |
+
"persistence": 0.6393873089823358,
|
| 73 |
+
"phase": 0.23239254700510126,
|
| 74 |
+
"planner_ranking": 7.025458191932151e-05,
|
| 75 |
+
"planner_risk": 0.014862261340022087,
|
| 76 |
+
"planner_success": 0.0018582946357415303,
|
| 77 |
+
"proposal_diversity": 0.0,
|
| 78 |
+
"proposal_ranking": 0.07140745759304416,
|
| 79 |
+
"proposal_reconstruction": 0.07342678826106222,
|
| 80 |
+
"proposal_success": 0.052891033456513754,
|
| 81 |
+
"reocclusion": 0.22496669194415997,
|
| 82 |
+
"role_swap_consistency": 0.0,
|
| 83 |
+
"support_mode": 0.0026235160074735944,
|
| 84 |
+
"support_stability": 0.129776736858644,
|
| 85 |
+
"task_metrics": 0.14390431940555573,
|
| 86 |
+
"total": 0.5859858243088973,
|
| 87 |
+
"uncertainty": 4.795166801757564e-05,
|
| 88 |
+
"visibility": 0.0989507636741588,
|
| 89 |
+
"world_model": 1.0815125898311013
|
| 90 |
+
},
|
| 91 |
+
"val": {
|
| 92 |
+
"action": 0.012531062933813893,
|
| 93 |
+
"arm_role": 0.0,
|
| 94 |
+
"belief": 0.1047917457692551,
|
| 95 |
+
"clearance": 0.08090435149091663,
|
| 96 |
+
"corridor": 0.20609694809624643,
|
| 97 |
+
"disturbance": 0.004269244044487344,
|
| 98 |
+
"grasp_affordance": 0.03268951613625342,
|
| 99 |
+
"occluder_contact": 0.2295533585729021,
|
| 100 |
+
"persistence": 1.1918026357889175,
|
| 101 |
+
"phase": 0.28549350922306377,
|
| 102 |
+
"planner_ranking": 6.612739407914474e-05,
|
| 103 |
+
"planner_risk": 0.008759455501355908,
|
| 104 |
+
"planner_success": 0.00080455597895762,
|
| 105 |
+
"proposal_diversity": 0.0,
|
| 106 |
+
"proposal_ranking": 0.1058380516866843,
|
| 107 |
+
"proposal_reconstruction": 0.07328326593745839,
|
| 108 |
+
"proposal_success": 0.058020667765628205,
|
| 109 |
+
"reocclusion": 0.33030271168911096,
|
| 110 |
+
"role_swap_consistency": 0.0,
|
| 111 |
+
"support_mode": 0.007980566662312909,
|
| 112 |
+
"support_stability": 0.13863918806115785,
|
| 113 |
+
"task_metrics": 0.14007962495088577,
|
| 114 |
+
"total": 0.5981471421140613,
|
| 115 |
+
"uncertainty": 4.717415575242106e-05,
|
| 116 |
+
"visibility": 0.09760378188256061,
|
| 117 |
+
"world_model": 0.9283028335282297
|
| 118 |
+
}
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"epoch": 2,
|
| 122 |
+
"train": {
|
| 123 |
+
"action": 0.01041558725563319,
|
| 124 |
+
"arm_role": 8.783842387952302e-09,
|
| 125 |
+
"belief": 0.1059942942700888,
|
| 126 |
+
"clearance": 0.08004858184017632,
|
| 127 |
+
"corridor": 0.20489231364703492,
|
| 128 |
+
"disturbance": 0.0035354765677383464,
|
| 129 |
+
"grasp_affordance": 0.0182398099795376,
|
| 130 |
+
"occluder_contact": 0.2177388886087819,
|
| 131 |
+
"persistence": 0.7130741648352629,
|
| 132 |
+
"phase": 0.2232393227125469,
|
| 133 |
+
"planner_ranking": 9.544059870988445e-05,
|
| 134 |
+
"planner_risk": 0.007491795662300367,
|
| 135 |
+
"planner_success": 0.0006995215439115111,
|
| 136 |
+
"proposal_diversity": 0.0,
|
| 137 |
+
"proposal_ranking": 0.06466802976731408,
|
| 138 |
+
"proposal_reconstruction": 0.07109034053589168,
|
| 139 |
+
"proposal_success": 0.03924152674643617,
|
| 140 |
+
"reocclusion": 0.24476394773840807,
|
| 141 |
+
"role_swap_consistency": 0.0,
|
| 142 |
+
"support_mode": 0.0035586046545129073,
|
| 143 |
+
"support_stability": 0.1275901448373732,
|
| 144 |
+
"task_metrics": 0.14374801657701794,
|
| 145 |
+
"total": 0.5572637532886706,
|
| 146 |
+
"uncertainty": 3.1927780274169266e-05,
|
| 147 |
+
"visibility": 0.09604649849628147,
|
| 148 |
+
"world_model": 1.0012797054491545
|
| 149 |
+
},
|
| 150 |
+
"val": {
|
| 151 |
+
"action": 0.01250085191603637,
|
| 152 |
+
"arm_role": 1.9868213740892315e-08,
|
| 153 |
+
"belief": 0.10097009620883247,
|
| 154 |
+
"clearance": 0.07564825914574391,
|
| 155 |
+
"corridor": 0.19414961970213687,
|
| 156 |
+
"disturbance": 0.0028900962097946886,
|
| 157 |
+
"grasp_affordance": 0.014792599320185907,
|
| 158 |
+
"occluder_contact": 0.21100006365414822,
|
| 159 |
+
"persistence": 0.7430237780014673,
|
| 160 |
+
"phase": 0.21708492669418003,
|
| 161 |
+
"planner_ranking": 4.6006352427026066e-05,
|
| 162 |
+
"planner_risk": 0.004453302675039705,
|
| 163 |
+
"planner_success": 0.0002918489108472413,
|
| 164 |
+
"proposal_diversity": 0.0,
|
| 165 |
+
"proposal_ranking": 0.0847416734464015,
|
| 166 |
+
"proposal_reconstruction": 0.07325861490134036,
|
| 167 |
+
"proposal_success": 0.04800050914513342,
|
| 168 |
+
"reocclusion": 0.21185640358563626,
|
| 169 |
+
"role_swap_consistency": 0.0,
|
| 170 |
+
"support_mode": 0.0001279175175472064,
|
| 171 |
+
"support_stability": 0.1355795610808965,
|
| 172 |
+
"task_metrics": 0.13995845525553732,
|
| 173 |
+
"total": 0.5864624706181613,
|
| 174 |
+
"uncertainty": 3.328838372660605e-05,
|
| 175 |
+
"visibility": 0.0915211413168546,
|
| 176 |
+
"world_model": 1.0730400555061572
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 3,
|
| 181 |
+
"train": {
|
| 182 |
+
"action": 0.009735075987287258,
|
| 183 |
+
"arm_role": 6.901590447676809e-09,
|
| 184 |
+
"belief": 0.10281775146722794,
|
| 185 |
+
"clearance": 0.07848918745784383,
|
| 186 |
+
"corridor": 0.20201588633813355,
|
| 187 |
+
"disturbance": 0.0031908015084091425,
|
| 188 |
+
"grasp_affordance": 0.015698057898369276,
|
| 189 |
+
"occluder_contact": 0.2077378544368242,
|
| 190 |
+
"persistence": 0.6618055252505368,
|
| 191 |
+
"phase": 0.2315950472103922,
|
| 192 |
+
"planner_ranking": 4.5244067302076614e-05,
|
| 193 |
+
"planner_risk": 0.004139781545381993,
|
| 194 |
+
"planner_success": 0.00024409074271955575,
|
| 195 |
+
"proposal_diversity": 0.0,
|
| 196 |
+
"proposal_ranking": 0.054642489266361255,
|
| 197 |
+
"proposal_reconstruction": 0.0706897854020721,
|
| 198 |
+
"proposal_success": 0.02984069204448085,
|
| 199 |
+
"reocclusion": 0.21985463225527813,
|
| 200 |
+
"role_swap_consistency": 0.0,
|
| 201 |
+
"support_mode": 0.00018626294637981214,
|
| 202 |
+
"support_stability": 0.12553868676094632,
|
| 203 |
+
"task_metrics": 0.14241597934773093,
|
| 204 |
+
"total": 0.5318458660652763,
|
| 205 |
+
"uncertainty": 2.4771544117715837e-05,
|
| 206 |
+
"visibility": 0.09392199994702088,
|
| 207 |
+
"world_model": 0.9659816164719431
|
| 208 |
+
},
|
| 209 |
+
"val": {
|
| 210 |
+
"action": 0.010274204266765579,
|
| 211 |
+
"arm_role": 0.0,
|
| 212 |
+
"belief": 0.09872564831466386,
|
| 213 |
+
"clearance": 0.07471577507076842,
|
| 214 |
+
"corridor": 0.19408509180401312,
|
| 215 |
+
"disturbance": 0.00259435343862909,
|
| 216 |
+
"grasp_affordance": 0.012839911272749305,
|
| 217 |
+
"occluder_contact": 0.20571506023406982,
|
| 218 |
+
"persistence": 0.7026729972073527,
|
| 219 |
+
"phase": 0.26675827372254746,
|
| 220 |
+
"planner_ranking": 4.497824862124672e-05,
|
| 221 |
+
"planner_risk": 0.002781865681754425,
|
| 222 |
+
"planner_success": 0.00015440414228413084,
|
| 223 |
+
"proposal_diversity": 0.0,
|
| 224 |
+
"proposal_ranking": 0.07663222694783613,
|
| 225 |
+
"proposal_reconstruction": 0.07091993872414935,
|
| 226 |
+
"proposal_success": 0.03695757263763384,
|
| 227 |
+
"reocclusion": 0.21823022356539062,
|
| 228 |
+
"role_swap_consistency": 0.0,
|
| 229 |
+
"support_mode": 0.00015865708998965354,
|
| 230 |
+
"support_stability": 0.13456520460771793,
|
| 231 |
+
"task_metrics": 0.1404704556546428,
|
| 232 |
+
"total": 0.5019133334810083,
|
| 233 |
+
"uncertainty": 1.7504166497460435e-05,
|
| 234 |
+
"visibility": 0.09078080579638481,
|
| 235 |
+
"world_model": 0.8508174094286832
|
| 236 |
+
}
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"epoch": 4,
|
| 240 |
+
"train": {
|
| 241 |
+
"action": 0.00924236060757386,
|
| 242 |
+
"arm_role": 3.1370865671258223e-10,
|
| 243 |
+
"belief": 0.10063675104787476,
|
| 244 |
+
"clearance": 0.07763459076614757,
|
| 245 |
+
"corridor": 0.1999763826496507,
|
| 246 |
+
"disturbance": 0.0032564817505006337,
|
| 247 |
+
"grasp_affordance": 0.015768864574401003,
|
| 248 |
+
"occluder_contact": 0.20453226503572966,
|
| 249 |
+
"persistence": 0.6381541584980656,
|
| 250 |
+
"phase": 0.23467233871158802,
|
| 251 |
+
"planner_ranking": 0.002148842357724178,
|
| 252 |
+
"planner_risk": 0.005933802830986679,
|
| 253 |
+
"planner_success": 0.0012002266089487085,
|
| 254 |
+
"proposal_diversity": 0.0,
|
| 255 |
+
"proposal_ranking": 0.04519814905391908,
|
| 256 |
+
"proposal_reconstruction": 0.07035028267847865,
|
| 257 |
+
"proposal_success": 0.02132791725330447,
|
| 258 |
+
"reocclusion": 0.21220772236181226,
|
| 259 |
+
"role_swap_consistency": 0.0,
|
| 260 |
+
"support_mode": 0.00017794872585095856,
|
| 261 |
+
"support_stability": 0.12488894654732001,
|
| 262 |
+
"task_metrics": 0.14187381208727234,
|
| 263 |
+
"total": 0.5180026358679721,
|
| 264 |
+
"uncertainty": 1.678193236126677e-05,
|
| 265 |
+
"visibility": 0.09251636654922837,
|
| 266 |
+
"world_model": 0.9452698754636865
|
| 267 |
+
},
|
| 268 |
+
"val": {
|
| 269 |
+
"action": 0.009614509682058158,
|
| 270 |
+
"arm_role": 9.934106870446158e-09,
|
| 271 |
+
"belief": 0.0954468369935498,
|
| 272 |
+
"clearance": 0.07359647931474628,
|
| 273 |
+
"corridor": 0.19544327168753653,
|
| 274 |
+
"disturbance": 0.004919912796388168,
|
| 275 |
+
"grasp_affordance": 0.01787316725786888,
|
| 276 |
+
"occluder_contact": 0.2034355541973403,
|
| 277 |
+
"persistence": 0.8611267923631452,
|
| 278 |
+
"phase": 0.2359058087635221,
|
| 279 |
+
"planner_ranking": 7.647072767337125e-06,
|
| 280 |
+
"planner_risk": 0.0028961390207493396,
|
| 281 |
+
"planner_success": 0.00021185575601658925,
|
| 282 |
+
"proposal_diversity": 0.0,
|
| 283 |
+
"proposal_ranking": 0.06654375551398985,
|
| 284 |
+
"proposal_reconstruction": 0.07095728814601898,
|
| 285 |
+
"proposal_success": 0.02652511727347067,
|
| 286 |
+
"reocclusion": 0.28424677580143465,
|
| 287 |
+
"role_swap_consistency": 0.0,
|
| 288 |
+
"support_mode": 5.328719337491996e-05,
|
| 289 |
+
"support_stability": 0.13284552616603446,
|
| 290 |
+
"task_metrics": 0.1393586558600267,
|
| 291 |
+
"total": 0.5038685834769047,
|
| 292 |
+
"uncertainty": 1.3334046157483085e-05,
|
| 293 |
+
"visibility": 0.08824686581889789,
|
| 294 |
+
"world_model": 0.8240700237678759
|
| 295 |
+
}
|
| 296 |
+
}
|
| 297 |
+
]
|
artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/summary.json
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17",
|
| 3 |
+
"device": "cuda",
|
| 4 |
+
"best_checkpoint": "/workspace/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/checkpoint_best.pt",
|
| 5 |
+
"final_train_total": 0.5180026358679721,
|
| 6 |
+
"final_val_total": 0.5038685834769047,
|
| 7 |
+
"train_time_sec": 163.31340551376343,
|
| 8 |
+
"peak_gpu_memory_mb": 2924.82177734375,
|
| 9 |
+
"num_train_samples": 380,
|
| 10 |
+
"num_val_samples": 131,
|
| 11 |
+
"planner_mode": "trainable",
|
| 12 |
+
"frozen_modules": [],
|
| 13 |
+
"init_info": {
|
| 14 |
+
"path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 15 |
+
"loaded_keys": 828,
|
| 16 |
+
"skipped_shape_mismatch_keys": [
|
| 17 |
+
"decoder.proposal_mode_head.3.weight",
|
| 18 |
+
"decoder.proposal_mode_head.3.bias",
|
| 19 |
+
"decoder.proposal_mode_embeddings.weight"
|
| 20 |
+
],
|
| 21 |
+
"missing_keys": [
|
| 22 |
+
"decoder.task_embedding.weight",
|
| 23 |
+
"decoder.proposal_mode_head.3.weight",
|
| 24 |
+
"decoder.proposal_mode_head.3.bias",
|
| 25 |
+
"decoder.proposal_mode_embeddings.weight",
|
| 26 |
+
"decoder.mode_residual_heads.6.0.weight",
|
| 27 |
+
"decoder.mode_residual_heads.6.0.bias",
|
| 28 |
+
"decoder.mode_residual_heads.6.1.weight",
|
| 29 |
+
"decoder.mode_residual_heads.6.1.bias",
|
| 30 |
+
"decoder.mode_residual_heads.6.3.weight",
|
| 31 |
+
"decoder.mode_residual_heads.6.3.bias",
|
| 32 |
+
"elastic_state_head.decoder.task_embedding.weight",
|
| 33 |
+
"elastic_state_head.decoder.task_field_affine.weight",
|
| 34 |
+
"elastic_state_head.decoder.task_field_affine.bias",
|
| 35 |
+
"elastic_state_head.decoder.task_summary_adapter.0.weight",
|
| 36 |
+
"elastic_state_head.decoder.task_summary_adapter.0.bias",
|
| 37 |
+
"elastic_state_head.decoder.task_summary_adapter.1.weight",
|
| 38 |
+
"elastic_state_head.decoder.task_summary_adapter.1.bias",
|
| 39 |
+
"elastic_state_head.decoder.task_phase_head.weight",
|
| 40 |
+
"elastic_state_head.decoder.task_phase_head.bias",
|
| 41 |
+
"elastic_state_head.decoder.task_support_head.weight",
|
| 42 |
+
"elastic_state_head.decoder.task_support_head.bias",
|
| 43 |
+
"elastic_state_head.decoder.task_reocclusion_head.weight",
|
| 44 |
+
"elastic_state_head.decoder.task_reocclusion_head.bias",
|
| 45 |
+
"elastic_state_head.decoder.task_metric_head.0.weight",
|
| 46 |
+
"elastic_state_head.decoder.task_metric_head.0.bias",
|
| 47 |
+
"elastic_state_head.decoder.task_metric_head.1.weight",
|
| 48 |
+
"elastic_state_head.decoder.task_metric_head.1.bias",
|
| 49 |
+
"elastic_state_head.decoder.task_metric_head.3.weight",
|
| 50 |
+
"elastic_state_head.decoder.task_metric_head.3.bias",
|
| 51 |
+
"world_model.task_embedding.weight",
|
| 52 |
+
"world_model.spatial_field_encoder.0.weight",
|
| 53 |
+
"world_model.spatial_field_encoder.0.bias",
|
| 54 |
+
"world_model.spatial_field_encoder.2.weight",
|
| 55 |
+
"world_model.spatial_field_encoder.2.bias",
|
| 56 |
+
"world_model.spatial_context_proj.0.weight",
|
| 57 |
+
"world_model.spatial_context_proj.0.bias",
|
| 58 |
+
"world_model.spatial_context_proj.1.weight",
|
| 59 |
+
"world_model.spatial_context_proj.1.bias",
|
| 60 |
+
"world_model.spatial_gate_z.weight",
|
| 61 |
+
"world_model.spatial_gate_z.bias",
|
| 62 |
+
"world_model.spatial_gate_r.weight",
|
| 63 |
+
"world_model.spatial_gate_r.bias",
|
| 64 |
+
"world_model.spatial_candidate.weight",
|
| 65 |
+
"world_model.spatial_candidate.bias",
|
| 66 |
+
"world_model.spatial_summary_proj.0.weight",
|
| 67 |
+
"world_model.spatial_summary_proj.0.bias",
|
| 68 |
+
"world_model.spatial_summary_proj.1.weight",
|
| 69 |
+
"world_model.spatial_summary_proj.1.bias",
|
| 70 |
+
"world_model.spatial_phase_head.weight",
|
| 71 |
+
"world_model.spatial_phase_head.bias",
|
| 72 |
+
"world_model.spatial_support_mode_head.weight",
|
| 73 |
+
"world_model.spatial_support_mode_head.bias",
|
| 74 |
+
"world_model.spatial_arm_role_head.weight",
|
| 75 |
+
"world_model.spatial_arm_role_head.bias",
|
| 76 |
+
"world_model.spatial_reocclusion_head.weight",
|
| 77 |
+
"world_model.spatial_reocclusion_head.bias",
|
| 78 |
+
"world_model.spatial_target_belief_head.weight",
|
| 79 |
+
"world_model.spatial_target_belief_head.bias",
|
| 80 |
+
"world_model.spatial_visibility_head.weight",
|
| 81 |
+
"world_model.spatial_visibility_head.bias",
|
| 82 |
+
"world_model.spatial_clearance_head.weight",
|
| 83 |
+
"world_model.spatial_clearance_head.bias",
|
| 84 |
+
"world_model.spatial_occluder_contact_head.weight",
|
| 85 |
+
"world_model.spatial_occluder_contact_head.bias",
|
| 86 |
+
"world_model.spatial_grasp_affordance_head.weight",
|
| 87 |
+
"world_model.spatial_grasp_affordance_head.bias",
|
| 88 |
+
"world_model.spatial_support_stability_head.weight",
|
| 89 |
+
"world_model.spatial_support_stability_head.bias",
|
| 90 |
+
"world_model.spatial_persistence_head.weight",
|
| 91 |
+
"world_model.spatial_persistence_head.bias",
|
| 92 |
+
"world_model.spatial_reocclusion_field_head.weight",
|
| 93 |
+
"world_model.spatial_reocclusion_field_head.bias",
|
| 94 |
+
"world_model.spatial_disturbance_head.weight",
|
| 95 |
+
"world_model.spatial_disturbance_head.bias",
|
| 96 |
+
"world_model.spatial_uncertainty_head.weight",
|
| 97 |
+
"world_model.spatial_uncertainty_head.bias",
|
| 98 |
+
"world_model.spatial_access_head.weight",
|
| 99 |
+
"world_model.spatial_access_head.bias"
|
| 100 |
+
],
|
| 101 |
+
"unexpected_keys": []
|
| 102 |
+
}
|
| 103 |
+
}
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3968c7aaeace3aeeb9ba2a6343ab2b35b792acbd00911d5eb76d90cd3db80a1c
|
| 3 |
+
size 940662478
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/config_resolved.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff_phase
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies:
|
| 9 |
+
- foliage_proxy
|
| 10 |
+
- bag_proxy
|
| 11 |
+
- cloth_proxy
|
| 12 |
+
resolution: 224
|
| 13 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
|
| 14 |
+
train_episodes_per_proxy: 48
|
| 15 |
+
val_episodes_per_proxy: 16
|
| 16 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
|
| 17 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
|
| 18 |
+
rebuild_dataset: false
|
| 19 |
+
chunk_horizon: 8
|
| 20 |
+
rollout_horizon: 5
|
| 21 |
+
history_steps: 6
|
| 22 |
+
planner_candidates: 8
|
| 23 |
+
seed: 17
|
| 24 |
+
optim:
|
| 25 |
+
epochs: 3
|
| 26 |
+
batch_size: 4
|
| 27 |
+
num_workers: 24
|
| 28 |
+
lr: 0.0001
|
| 29 |
+
weight_decay: 0.0001
|
| 30 |
+
trainer:
|
| 31 |
+
policy_type: elastic_reveal
|
| 32 |
+
use_bf16: true
|
| 33 |
+
grad_clip_norm: 1.0
|
| 34 |
+
freeze_backbone: true
|
| 35 |
+
gradient_checkpointing: false
|
| 36 |
+
plan_during_train: true
|
| 37 |
+
plan_during_eval: true
|
| 38 |
+
support_mode_conditioning: true
|
| 39 |
+
planner_mode: trainable
|
| 40 |
+
use_depth: true
|
| 41 |
+
use_world_model: true
|
| 42 |
+
use_role_tokens: true
|
| 43 |
+
compute_equivariance_probe: false
|
| 44 |
+
policy:
|
| 45 |
+
backbone:
|
| 46 |
+
model_name: openai/clip-vit-base-patch32
|
| 47 |
+
hidden_dim: 512
|
| 48 |
+
max_text_tokens: 32
|
| 49 |
+
freeze_backbone: true
|
| 50 |
+
gradient_checkpointing: false
|
| 51 |
+
use_dummy_backbone: false
|
| 52 |
+
fusion:
|
| 53 |
+
hidden_dim: 512
|
| 54 |
+
num_cameras: 3
|
| 55 |
+
num_layers: 4
|
| 56 |
+
num_heads: 8
|
| 57 |
+
ff_dim: 2048
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
proprio_dim: 32
|
| 60 |
+
proprio_tokens: 1
|
| 61 |
+
memory:
|
| 62 |
+
hidden_dim: 512
|
| 63 |
+
action_dim: 14
|
| 64 |
+
history_steps: 6
|
| 65 |
+
scene_history_steps: 3
|
| 66 |
+
belief_history_steps: 8
|
| 67 |
+
num_layers: 2
|
| 68 |
+
dropout: 0.1
|
| 69 |
+
memory_bank_size: 4
|
| 70 |
+
scene_bank_size: 2
|
| 71 |
+
belief_bank_size: 2
|
| 72 |
+
num_heads: 8
|
| 73 |
+
max_history_steps: 8
|
| 74 |
+
decoder:
|
| 75 |
+
hidden_dim: 512
|
| 76 |
+
num_heads: 8
|
| 77 |
+
num_layers: 4
|
| 78 |
+
ff_dim: 2048
|
| 79 |
+
dropout: 0.1
|
| 80 |
+
chunk_size: 8
|
| 81 |
+
action_dim: 14
|
| 82 |
+
arm_action_dim: 7
|
| 83 |
+
num_candidates: 8
|
| 84 |
+
num_phases: 5
|
| 85 |
+
num_arm_roles: 4
|
| 86 |
+
num_proposal_modes: 7
|
| 87 |
+
planner_top_k: 4
|
| 88 |
+
reveal_head:
|
| 89 |
+
hidden_dim: 512
|
| 90 |
+
num_support_modes: 3
|
| 91 |
+
num_approach_templates: 32
|
| 92 |
+
rollout_horizon: 5
|
| 93 |
+
belief_map_size: 32
|
| 94 |
+
field_size: 16
|
| 95 |
+
num_heads: 8
|
| 96 |
+
predict_belief_map: true
|
| 97 |
+
num_phases: 5
|
| 98 |
+
num_arm_roles: 4
|
| 99 |
+
num_interaction_tokens: 8
|
| 100 |
+
num_tasks: 4
|
| 101 |
+
world_model:
|
| 102 |
+
hidden_dim: 512
|
| 103 |
+
action_dim: 14
|
| 104 |
+
num_support_modes: 3
|
| 105 |
+
num_approach_templates: 32
|
| 106 |
+
rollout_horizon: 5
|
| 107 |
+
field_size: 16
|
| 108 |
+
num_heads: 8
|
| 109 |
+
num_phases: 5
|
| 110 |
+
num_arm_roles: 4
|
| 111 |
+
num_interaction_tokens: 8
|
| 112 |
+
belief_map_size: 32
|
| 113 |
+
predict_belief_map: true
|
| 114 |
+
scene_bank_size: 2
|
| 115 |
+
belief_bank_size: 2
|
| 116 |
+
rollout_mode: compact_rollout
|
| 117 |
+
num_tasks: 4
|
| 118 |
+
planner:
|
| 119 |
+
hidden_dim: 512
|
| 120 |
+
num_candidates: 8
|
| 121 |
+
action_dim: 14
|
| 122 |
+
num_support_modes: 3
|
| 123 |
+
utility_margin: 0.1
|
| 124 |
+
num_heads: 8
|
| 125 |
+
num_layers: 2
|
| 126 |
+
num_phases: 5
|
| 127 |
+
num_arm_roles: 4
|
| 128 |
+
top_k: 4
|
| 129 |
+
loss_weights:
|
| 130 |
+
action: 1.0
|
| 131 |
+
phase: 0.08
|
| 132 |
+
arm_role: 0.1
|
| 133 |
+
support_mode: 0.1
|
| 134 |
+
corridor: 0.12
|
| 135 |
+
persistence: 0.06
|
| 136 |
+
disturbance: 0.06
|
| 137 |
+
world_model: 0.2
|
| 138 |
+
belief: 0.05
|
| 139 |
+
visibility: 0.05
|
| 140 |
+
clearance: 0.06
|
| 141 |
+
support_stability: 0.06
|
| 142 |
+
reocclusion: 0.06
|
| 143 |
+
occluder_contact: 0.05
|
| 144 |
+
grasp_affordance: 0.05
|
| 145 |
+
planner_success: 0.2
|
| 146 |
+
planner_risk: 0.08
|
| 147 |
+
planner_ranking: 0.2
|
| 148 |
+
proposal_reconstruction: 0.08
|
| 149 |
+
proposal_success: 0.12
|
| 150 |
+
proposal_ranking: 0.15
|
| 151 |
+
proposal_diversity: 0.05
|
| 152 |
+
role_swap_consistency: 0.02
|
| 153 |
+
task_metrics: 0.05
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/metrics.json
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 0,
|
| 4 |
+
"train": {
|
| 5 |
+
"action": 0.01903669923838032,
|
| 6 |
+
"arm_role": 0.0011954489507173237,
|
| 7 |
+
"belief": 0.11834000094156516,
|
| 8 |
+
"clearance": 0.07986288266746622,
|
| 9 |
+
"corridor": 0.2066003169864416,
|
| 10 |
+
"disturbance": 0.00205025365499559,
|
| 11 |
+
"grasp_affordance": 0.00966440967382177,
|
| 12 |
+
"occluder_contact": 0.2094391022857867,
|
| 13 |
+
"persistence": 0.6896675200305732,
|
| 14 |
+
"phase": 0.3597658646734137,
|
| 15 |
+
"planner_ranking": 0.010135862800770586,
|
| 16 |
+
"planner_risk": 0.02269953653020294,
|
| 17 |
+
"planner_success": 0.10877308378878393,
|
| 18 |
+
"proposal_diversity": 0.0,
|
| 19 |
+
"proposal_ranking": 0.3664713734858914,
|
| 20 |
+
"proposal_reconstruction": 0.10571438976024326,
|
| 21 |
+
"proposal_success": 0.3060418816773515,
|
| 22 |
+
"reocclusion": 0.23476032413738337,
|
| 23 |
+
"role_swap_consistency": 0.0,
|
| 24 |
+
"support_mode": 0.02608335896542198,
|
| 25 |
+
"support_stability": 0.1283740213434947,
|
| 26 |
+
"task_metrics": 0.16838834607287456,
|
| 27 |
+
"total": 0.7838476789625067,
|
| 28 |
+
"uncertainty": 9.599007572480907e-05,
|
| 29 |
+
"visibility": 0.11086608359688206,
|
| 30 |
+
"world_model": 2.4192019870406702
|
| 31 |
+
},
|
| 32 |
+
"val": {
|
| 33 |
+
"action": 0.016467795622619717,
|
| 34 |
+
"arm_role": 0.00044356281741232976,
|
| 35 |
+
"belief": 0.10317085593035727,
|
| 36 |
+
"clearance": 0.07557447807806911,
|
| 37 |
+
"corridor": 0.20408162474632263,
|
| 38 |
+
"disturbance": 0.0014927912058986046,
|
| 39 |
+
"grasp_affordance": 0.00985990883782506,
|
| 40 |
+
"occluder_contact": 0.20460259372537787,
|
| 41 |
+
"persistence": 0.9340643006179369,
|
| 42 |
+
"phase": 0.4205810320422505,
|
| 43 |
+
"planner_ranking": 0.0002936208506016004,
|
| 44 |
+
"planner_risk": 0.0162254377748027,
|
| 45 |
+
"planner_success": 0.013315262752726223,
|
| 46 |
+
"proposal_diversity": 0.0,
|
| 47 |
+
"proposal_ranking": 0.14499353143301877,
|
| 48 |
+
"proposal_reconstruction": 0.08181179179386659,
|
| 49 |
+
"proposal_success": 0.13043119690634988,
|
| 50 |
+
"reocclusion": 0.2904955812475898,
|
| 51 |
+
"role_swap_consistency": 0.0,
|
| 52 |
+
"support_mode": 0.0027611398766043058,
|
| 53 |
+
"support_stability": 0.13547231697223402,
|
| 54 |
+
"task_metrics": 0.1477968403787324,
|
| 55 |
+
"total": 0.6003618971867994,
|
| 56 |
+
"uncertainty": 0.00011726474721482109,
|
| 57 |
+
"visibility": 0.09890205435680621,
|
| 58 |
+
"world_model": 1.8151403015310115
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 1,
|
| 63 |
+
"train": {
|
| 64 |
+
"action": 0.011172730509976024,
|
| 65 |
+
"arm_role": 0.005047608049292314,
|
| 66 |
+
"belief": 0.10358183007491262,
|
| 67 |
+
"clearance": 0.07736518386947482,
|
| 68 |
+
"corridor": 0.19873161581589988,
|
| 69 |
+
"disturbance": 0.002192211224188979,
|
| 70 |
+
"grasp_affordance": 0.010328439559395376,
|
| 71 |
+
"occluder_contact": 0.20153354829863498,
|
| 72 |
+
"persistence": 0.6124827290352091,
|
| 73 |
+
"phase": 0.2595963217710194,
|
| 74 |
+
"planner_ranking": 0.00016108446396796545,
|
| 75 |
+
"planner_risk": 0.01525886312960402,
|
| 76 |
+
"planner_success": 0.006821451556721801,
|
| 77 |
+
"proposal_diversity": 0.0,
|
| 78 |
+
"proposal_ranking": 0.08119581977983839,
|
| 79 |
+
"proposal_reconstruction": 0.07775982775186238,
|
| 80 |
+
"proposal_success": 0.08087819657827679,
|
| 81 |
+
"reocclusion": 0.21806120462715625,
|
| 82 |
+
"role_swap_consistency": 0.0,
|
| 83 |
+
"support_mode": 0.004165554046630859,
|
| 84 |
+
"support_stability": 0.12518173395410964,
|
| 85 |
+
"task_metrics": 0.1470285534074432,
|
| 86 |
+
"total": 0.4728757431632594,
|
| 87 |
+
"uncertainty": 4.4979283201859e-05,
|
| 88 |
+
"visibility": 0.09486380823348697,
|
| 89 |
+
"world_model": 1.477295964642575
|
| 90 |
+
},
|
| 91 |
+
"val": {
|
| 92 |
+
"action": 0.014699659296170328,
|
| 93 |
+
"arm_role": 0.0032501910410990768,
|
| 94 |
+
"belief": 0.09604058076034892,
|
| 95 |
+
"clearance": 0.0737346127403505,
|
| 96 |
+
"corridor": 0.19246741181070154,
|
| 97 |
+
"disturbance": 0.002424581844631272,
|
| 98 |
+
"grasp_affordance": 0.011332590499836388,
|
| 99 |
+
"occluder_contact": 0.1972112443410989,
|
| 100 |
+
"persistence": 0.754733283637148,
|
| 101 |
+
"phase": 0.27422264163606364,
|
| 102 |
+
"planner_ranking": 6.61312957218439e-05,
|
| 103 |
+
"planner_risk": 0.014809876634513565,
|
| 104 |
+
"planner_success": 0.005034577334299684,
|
| 105 |
+
"proposal_diversity": 0.0,
|
| 106 |
+
"proposal_ranking": 0.10355714928697456,
|
| 107 |
+
"proposal_reconstruction": 0.07605304298075763,
|
| 108 |
+
"proposal_success": 0.07239359052795352,
|
| 109 |
+
"reocclusion": 0.2211838181723248,
|
| 110 |
+
"role_swap_consistency": 0.0,
|
| 111 |
+
"support_mode": 0.0030512072068328657,
|
| 112 |
+
"support_stability": 0.13321079302466277,
|
| 113 |
+
"task_metrics": 0.14215884109338126,
|
| 114 |
+
"total": 0.42770718534787494,
|
| 115 |
+
"uncertainty": 4.0278254969604006e-05,
|
| 116 |
+
"visibility": 0.09008839387785304,
|
| 117 |
+
"world_model": 1.1843715039166538
|
| 118 |
+
}
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"epoch": 2,
|
| 122 |
+
"train": {
|
| 123 |
+
"action": 0.009650313334637567,
|
| 124 |
+
"arm_role": 0.0029296059357492546,
|
| 125 |
+
"belief": 0.09819177420515764,
|
| 126 |
+
"clearance": 0.07597495360594046,
|
| 127 |
+
"corridor": 0.19736162976018692,
|
| 128 |
+
"disturbance": 0.002284994474719401,
|
| 129 |
+
"grasp_affordance": 0.011029037923895214,
|
| 130 |
+
"occluder_contact": 0.19601713622871197,
|
| 131 |
+
"persistence": 0.6342933682810613,
|
| 132 |
+
"phase": 0.2582773412528791,
|
| 133 |
+
"planner_ranking": 0.00024582118595916605,
|
| 134 |
+
"planner_risk": 0.011063184471506822,
|
| 135 |
+
"planner_success": 0.0019527532209299113,
|
| 136 |
+
"proposal_diversity": 0.0,
|
| 137 |
+
"proposal_ranking": 0.06367454680761224,
|
| 138 |
+
"proposal_reconstruction": 0.07267745521507765,
|
| 139 |
+
"proposal_success": 0.047251041174719205,
|
| 140 |
+
"reocclusion": 0.2153781190392022,
|
| 141 |
+
"role_swap_consistency": 0.0,
|
| 142 |
+
"support_mode": 0.0022934590515337494,
|
| 143 |
+
"support_stability": 0.12411213658357921,
|
| 144 |
+
"task_metrics": 0.14521450063115673,
|
| 145 |
+
"total": 0.3853033130106173,
|
| 146 |
+
"uncertainty": 2.1849837026509607e-05,
|
| 147 |
+
"visibility": 0.09172474040012611,
|
| 148 |
+
"world_model": 1.0909657170898035
|
| 149 |
+
},
|
| 150 |
+
"val": {
|
| 151 |
+
"action": 0.010615826838395813,
|
| 152 |
+
"arm_role": 0.002171783652474088,
|
| 153 |
+
"belief": 0.09422377767887982,
|
| 154 |
+
"clearance": 0.0732199704330979,
|
| 155 |
+
"corridor": 0.1925947627786434,
|
| 156 |
+
"disturbance": 0.001796035210700762,
|
| 157 |
+
"grasp_affordance": 0.010509083230951519,
|
| 158 |
+
"occluder_contact": 0.19385047753651938,
|
| 159 |
+
"persistence": 0.6774057000875473,
|
| 160 |
+
"phase": 0.35872898339188064,
|
| 161 |
+
"planner_ranking": 0.00021498550532586788,
|
| 162 |
+
"planner_risk": 0.009475005589510229,
|
| 163 |
+
"planner_success": 0.0013016004857931737,
|
| 164 |
+
"proposal_diversity": 0.0,
|
| 165 |
+
"proposal_ranking": 0.08124377544630658,
|
| 166 |
+
"proposal_reconstruction": 0.07418028581323045,
|
| 167 |
+
"proposal_success": 0.053315439402605545,
|
| 168 |
+
"reocclusion": 0.21817159472089825,
|
| 169 |
+
"role_swap_consistency": 0.0,
|
| 170 |
+
"support_mode": 0.0010789622478610413,
|
| 171 |
+
"support_stability": 0.13355377282608638,
|
| 172 |
+
"task_metrics": 0.14122987561153644,
|
| 173 |
+
"total": 0.3789277257341327,
|
| 174 |
+
"uncertainty": 3.114792006079259e-05,
|
| 175 |
+
"visibility": 0.08864631684440555,
|
| 176 |
+
"world_model": 0.9896242853366968
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
]
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/summary.json
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17",
|
| 3 |
+
"device": "cuda",
|
| 4 |
+
"best_checkpoint": "/workspace/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/checkpoint_best.pt",
|
| 5 |
+
"final_train_total": 0.3853033130106173,
|
| 6 |
+
"final_val_total": 0.3789277257341327,
|
| 7 |
+
"train_time_sec": 128.96558284759521,
|
| 8 |
+
"peak_gpu_memory_mb": 2450.287109375,
|
| 9 |
+
"num_train_samples": 380,
|
| 10 |
+
"num_val_samples": 131,
|
| 11 |
+
"planner_mode": "trainable",
|
| 12 |
+
"frozen_modules": [],
|
| 13 |
+
"init_info": {
|
| 14 |
+
"path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 15 |
+
"loaded_keys": 828,
|
| 16 |
+
"skipped_shape_mismatch_keys": [
|
| 17 |
+
"decoder.proposal_mode_head.3.weight",
|
| 18 |
+
"decoder.proposal_mode_head.3.bias",
|
| 19 |
+
"decoder.proposal_mode_embeddings.weight"
|
| 20 |
+
],
|
| 21 |
+
"missing_keys": [
|
| 22 |
+
"decoder.task_embedding.weight",
|
| 23 |
+
"decoder.proposal_mode_head.3.weight",
|
| 24 |
+
"decoder.proposal_mode_head.3.bias",
|
| 25 |
+
"decoder.proposal_mode_embeddings.weight",
|
| 26 |
+
"decoder.mode_residual_heads.6.0.weight",
|
| 27 |
+
"decoder.mode_residual_heads.6.0.bias",
|
| 28 |
+
"decoder.mode_residual_heads.6.1.weight",
|
| 29 |
+
"decoder.mode_residual_heads.6.1.bias",
|
| 30 |
+
"decoder.mode_residual_heads.6.3.weight",
|
| 31 |
+
"decoder.mode_residual_heads.6.3.bias",
|
| 32 |
+
"elastic_state_head.decoder.task_embedding.weight",
|
| 33 |
+
"elastic_state_head.decoder.task_field_affine.weight",
|
| 34 |
+
"elastic_state_head.decoder.task_field_affine.bias",
|
| 35 |
+
"elastic_state_head.decoder.task_summary_adapter.0.weight",
|
| 36 |
+
"elastic_state_head.decoder.task_summary_adapter.0.bias",
|
| 37 |
+
"elastic_state_head.decoder.task_summary_adapter.1.weight",
|
| 38 |
+
"elastic_state_head.decoder.task_summary_adapter.1.bias",
|
| 39 |
+
"elastic_state_head.decoder.task_phase_head.weight",
|
| 40 |
+
"elastic_state_head.decoder.task_phase_head.bias",
|
| 41 |
+
"elastic_state_head.decoder.task_support_head.weight",
|
| 42 |
+
"elastic_state_head.decoder.task_support_head.bias",
|
| 43 |
+
"elastic_state_head.decoder.task_reocclusion_head.weight",
|
| 44 |
+
"elastic_state_head.decoder.task_reocclusion_head.bias",
|
| 45 |
+
"elastic_state_head.decoder.task_metric_head.0.weight",
|
| 46 |
+
"elastic_state_head.decoder.task_metric_head.0.bias",
|
| 47 |
+
"elastic_state_head.decoder.task_metric_head.1.weight",
|
| 48 |
+
"elastic_state_head.decoder.task_metric_head.1.bias",
|
| 49 |
+
"elastic_state_head.decoder.task_metric_head.3.weight",
|
| 50 |
+
"elastic_state_head.decoder.task_metric_head.3.bias",
|
| 51 |
+
"world_model.task_embedding.weight",
|
| 52 |
+
"world_model.spatial_field_encoder.0.weight",
|
| 53 |
+
"world_model.spatial_field_encoder.0.bias",
|
| 54 |
+
"world_model.spatial_field_encoder.2.weight",
|
| 55 |
+
"world_model.spatial_field_encoder.2.bias",
|
| 56 |
+
"world_model.spatial_context_proj.0.weight",
|
| 57 |
+
"world_model.spatial_context_proj.0.bias",
|
| 58 |
+
"world_model.spatial_context_proj.1.weight",
|
| 59 |
+
"world_model.spatial_context_proj.1.bias",
|
| 60 |
+
"world_model.spatial_gate_z.weight",
|
| 61 |
+
"world_model.spatial_gate_z.bias",
|
| 62 |
+
"world_model.spatial_gate_r.weight",
|
| 63 |
+
"world_model.spatial_gate_r.bias",
|
| 64 |
+
"world_model.spatial_candidate.weight",
|
| 65 |
+
"world_model.spatial_candidate.bias",
|
| 66 |
+
"world_model.spatial_summary_proj.0.weight",
|
| 67 |
+
"world_model.spatial_summary_proj.0.bias",
|
| 68 |
+
"world_model.spatial_summary_proj.1.weight",
|
| 69 |
+
"world_model.spatial_summary_proj.1.bias",
|
| 70 |
+
"world_model.spatial_phase_head.weight",
|
| 71 |
+
"world_model.spatial_phase_head.bias",
|
| 72 |
+
"world_model.spatial_support_mode_head.weight",
|
| 73 |
+
"world_model.spatial_support_mode_head.bias",
|
| 74 |
+
"world_model.spatial_arm_role_head.weight",
|
| 75 |
+
"world_model.spatial_arm_role_head.bias",
|
| 76 |
+
"world_model.spatial_reocclusion_head.weight",
|
| 77 |
+
"world_model.spatial_reocclusion_head.bias",
|
| 78 |
+
"world_model.spatial_target_belief_head.weight",
|
| 79 |
+
"world_model.spatial_target_belief_head.bias",
|
| 80 |
+
"world_model.spatial_visibility_head.weight",
|
| 81 |
+
"world_model.spatial_visibility_head.bias",
|
| 82 |
+
"world_model.spatial_clearance_head.weight",
|
| 83 |
+
"world_model.spatial_clearance_head.bias",
|
| 84 |
+
"world_model.spatial_occluder_contact_head.weight",
|
| 85 |
+
"world_model.spatial_occluder_contact_head.bias",
|
| 86 |
+
"world_model.spatial_grasp_affordance_head.weight",
|
| 87 |
+
"world_model.spatial_grasp_affordance_head.bias",
|
| 88 |
+
"world_model.spatial_support_stability_head.weight",
|
| 89 |
+
"world_model.spatial_support_stability_head.bias",
|
| 90 |
+
"world_model.spatial_persistence_head.weight",
|
| 91 |
+
"world_model.spatial_persistence_head.bias",
|
| 92 |
+
"world_model.spatial_reocclusion_field_head.weight",
|
| 93 |
+
"world_model.spatial_reocclusion_field_head.bias",
|
| 94 |
+
"world_model.spatial_disturbance_head.weight",
|
| 95 |
+
"world_model.spatial_disturbance_head.bias",
|
| 96 |
+
"world_model.spatial_uncertainty_head.weight",
|
| 97 |
+
"world_model.spatial_uncertainty_head.bias",
|
| 98 |
+
"world_model.spatial_access_head.weight",
|
| 99 |
+
"world_model.spatial_access_head.bias"
|
| 100 |
+
],
|
| 101 |
+
"unexpected_keys": []
|
| 102 |
+
}
|
| 103 |
+
}
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc40df2f2c241001bf2e8c17a177cfbcda82acef7ae90997d8e145357d901349
|
| 3 |
+
size 940663118
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/config_resolved.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff_phase
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies:
|
| 9 |
+
- foliage_proxy
|
| 10 |
+
- bag_proxy
|
| 11 |
+
- cloth_proxy
|
| 12 |
+
resolution: 224
|
| 13 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
|
| 14 |
+
train_episodes_per_proxy: 48
|
| 15 |
+
val_episodes_per_proxy: 16
|
| 16 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
|
| 17 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
|
| 18 |
+
rebuild_dataset: false
|
| 19 |
+
chunk_horizon: 8
|
| 20 |
+
rollout_horizon: 5
|
| 21 |
+
history_steps: 6
|
| 22 |
+
planner_candidates: 8
|
| 23 |
+
seed: 17
|
| 24 |
+
optim:
|
| 25 |
+
epochs: 4
|
| 26 |
+
batch_size: 4
|
| 27 |
+
num_workers: 24
|
| 28 |
+
lr: 0.00015
|
| 29 |
+
weight_decay: 0.0001
|
| 30 |
+
trainer:
|
| 31 |
+
policy_type: elastic_reveal
|
| 32 |
+
use_bf16: true
|
| 33 |
+
grad_clip_norm: 1.0
|
| 34 |
+
freeze_backbone: true
|
| 35 |
+
gradient_checkpointing: false
|
| 36 |
+
plan_during_train: true
|
| 37 |
+
plan_during_eval: true
|
| 38 |
+
support_mode_conditioning: true
|
| 39 |
+
planner_mode: trainable
|
| 40 |
+
use_depth: true
|
| 41 |
+
use_world_model: true
|
| 42 |
+
use_role_tokens: true
|
| 43 |
+
compute_equivariance_probe: false
|
| 44 |
+
policy:
|
| 45 |
+
backbone:
|
| 46 |
+
model_name: openai/clip-vit-base-patch32
|
| 47 |
+
hidden_dim: 512
|
| 48 |
+
max_text_tokens: 32
|
| 49 |
+
freeze_backbone: true
|
| 50 |
+
gradient_checkpointing: false
|
| 51 |
+
use_dummy_backbone: false
|
| 52 |
+
fusion:
|
| 53 |
+
hidden_dim: 512
|
| 54 |
+
num_cameras: 3
|
| 55 |
+
num_layers: 4
|
| 56 |
+
num_heads: 8
|
| 57 |
+
ff_dim: 2048
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
proprio_dim: 32
|
| 60 |
+
proprio_tokens: 1
|
| 61 |
+
memory:
|
| 62 |
+
hidden_dim: 512
|
| 63 |
+
action_dim: 14
|
| 64 |
+
history_steps: 6
|
| 65 |
+
scene_history_steps: 3
|
| 66 |
+
belief_history_steps: 8
|
| 67 |
+
num_layers: 2
|
| 68 |
+
dropout: 0.1
|
| 69 |
+
memory_bank_size: 4
|
| 70 |
+
scene_bank_size: 2
|
| 71 |
+
belief_bank_size: 2
|
| 72 |
+
num_heads: 8
|
| 73 |
+
max_history_steps: 8
|
| 74 |
+
decoder:
|
| 75 |
+
hidden_dim: 512
|
| 76 |
+
num_heads: 8
|
| 77 |
+
num_layers: 4
|
| 78 |
+
ff_dim: 2048
|
| 79 |
+
dropout: 0.1
|
| 80 |
+
chunk_size: 8
|
| 81 |
+
action_dim: 14
|
| 82 |
+
arm_action_dim: 7
|
| 83 |
+
num_candidates: 8
|
| 84 |
+
num_phases: 5
|
| 85 |
+
num_arm_roles: 4
|
| 86 |
+
num_proposal_modes: 7
|
| 87 |
+
planner_top_k: 4
|
| 88 |
+
reveal_head:
|
| 89 |
+
hidden_dim: 512
|
| 90 |
+
num_support_modes: 3
|
| 91 |
+
num_approach_templates: 32
|
| 92 |
+
rollout_horizon: 5
|
| 93 |
+
belief_map_size: 32
|
| 94 |
+
field_size: 16
|
| 95 |
+
num_heads: 8
|
| 96 |
+
predict_belief_map: true
|
| 97 |
+
num_phases: 5
|
| 98 |
+
num_arm_roles: 4
|
| 99 |
+
num_interaction_tokens: 8
|
| 100 |
+
num_tasks: 4
|
| 101 |
+
world_model:
|
| 102 |
+
hidden_dim: 512
|
| 103 |
+
action_dim: 14
|
| 104 |
+
num_support_modes: 3
|
| 105 |
+
num_approach_templates: 32
|
| 106 |
+
rollout_horizon: 5
|
| 107 |
+
field_size: 16
|
| 108 |
+
num_heads: 8
|
| 109 |
+
num_phases: 5
|
| 110 |
+
num_arm_roles: 4
|
| 111 |
+
num_interaction_tokens: 8
|
| 112 |
+
belief_map_size: 32
|
| 113 |
+
predict_belief_map: true
|
| 114 |
+
scene_bank_size: 2
|
| 115 |
+
belief_bank_size: 2
|
| 116 |
+
rollout_mode: spatial_rollout
|
| 117 |
+
num_tasks: 4
|
| 118 |
+
planner:
|
| 119 |
+
hidden_dim: 512
|
| 120 |
+
num_candidates: 8
|
| 121 |
+
action_dim: 14
|
| 122 |
+
num_support_modes: 3
|
| 123 |
+
utility_margin: 0.1
|
| 124 |
+
num_heads: 8
|
| 125 |
+
num_layers: 2
|
| 126 |
+
num_phases: 5
|
| 127 |
+
num_arm_roles: 4
|
| 128 |
+
top_k: 4
|
| 129 |
+
loss_weights:
|
| 130 |
+
action: 0.6
|
| 131 |
+
phase: 0.08
|
| 132 |
+
arm_role: 0.1
|
| 133 |
+
support_mode: 0.1
|
| 134 |
+
corridor: 0.15
|
| 135 |
+
persistence: 0.08
|
| 136 |
+
disturbance: 0.08
|
| 137 |
+
world_model: 0.35
|
| 138 |
+
belief: 0.05
|
| 139 |
+
visibility: 0.05
|
| 140 |
+
clearance: 0.08
|
| 141 |
+
support_stability: 0.08
|
| 142 |
+
reocclusion: 0.08
|
| 143 |
+
occluder_contact: 0.05
|
| 144 |
+
grasp_affordance: 0.05
|
| 145 |
+
planner_success: 0.25
|
| 146 |
+
planner_risk: 0.1
|
| 147 |
+
planner_ranking: 0.25
|
| 148 |
+
proposal_reconstruction: 0.05
|
| 149 |
+
proposal_success: 0.2
|
| 150 |
+
proposal_ranking: 0.25
|
| 151 |
+
proposal_diversity: 0.05
|
| 152 |
+
role_swap_consistency: 0.02
|
| 153 |
+
task_metrics: 0.1
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/metrics.json
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 0,
|
| 4 |
+
"train": {
|
| 5 |
+
"action": 0.019181225144941555,
|
| 6 |
+
"arm_role": 3.1182640477230673e-07,
|
| 7 |
+
"belief": 0.11448187259467024,
|
| 8 |
+
"clearance": 0.0864047217133798,
|
| 9 |
+
"corridor": 0.21893725262856797,
|
| 10 |
+
"disturbance": 0.003079198535813607,
|
| 11 |
+
"grasp_affordance": 0.02621712978978298,
|
| 12 |
+
"occluder_contact": 0.2337783462122867,
|
| 13 |
+
"persistence": 0.719700569705425,
|
| 14 |
+
"phase": 0.3856928740677081,
|
| 15 |
+
"planner_ranking": 0.022916802166031102,
|
| 16 |
+
"planner_risk": 0.023334504154167678,
|
| 17 |
+
"planner_success": 0.10860285238411865,
|
| 18 |
+
"proposal_diversity": 0.0,
|
| 19 |
+
"proposal_ranking": 0.2922646708394352,
|
| 20 |
+
"proposal_reconstruction": 0.09971292505138799,
|
| 21 |
+
"proposal_success": 0.24467597556741613,
|
| 22 |
+
"reocclusion": 0.24054438805109576,
|
| 23 |
+
"role_swap_consistency": 0.0,
|
| 24 |
+
"support_mode": 0.03278662405515972,
|
| 25 |
+
"support_stability": 0.13613393926306774,
|
| 26 |
+
"task_metrics": 0.15838541047353494,
|
| 27 |
+
"total": 1.1187869517426743,
|
| 28 |
+
"uncertainty": 0.00028297689169975407,
|
| 29 |
+
"visibility": 0.11868352764531186,
|
| 30 |
+
"world_model": 2.122099159265819
|
| 31 |
+
},
|
| 32 |
+
"val": {
|
| 33 |
+
"action": 0.016976301219653,
|
| 34 |
+
"arm_role": 5.6895338003194025e-08,
|
| 35 |
+
"belief": 0.10437920215454968,
|
| 36 |
+
"clearance": 0.07773172505425685,
|
| 37 |
+
"corridor": 0.20573271156260461,
|
| 38 |
+
"disturbance": 0.002334372425979602,
|
| 39 |
+
"grasp_affordance": 0.04297696501298836,
|
| 40 |
+
"occluder_contact": 0.23979515650055624,
|
| 41 |
+
"persistence": 0.7753512069131389,
|
| 42 |
+
"phase": 0.44806069852502056,
|
| 43 |
+
"planner_ranking": 0.0006287269447208624,
|
| 44 |
+
"planner_risk": 0.018244338094849478,
|
| 45 |
+
"planner_success": 0.004400868512069185,
|
| 46 |
+
"proposal_diversity": 0.0,
|
| 47 |
+
"proposal_ranking": 0.12023697172602017,
|
| 48 |
+
"proposal_reconstruction": 0.07816453142599626,
|
| 49 |
+
"proposal_success": 0.08989288398262227,
|
| 50 |
+
"reocclusion": 0.23809522954803525,
|
| 51 |
+
"role_swap_consistency": 0.0,
|
| 52 |
+
"support_mode": 0.002169632747995131,
|
| 53 |
+
"support_stability": 0.1437560645016757,
|
| 54 |
+
"task_metrics": 0.14553586303284674,
|
| 55 |
+
"total": 0.6175428755355604,
|
| 56 |
+
"uncertainty": 7.00113979071444e-05,
|
| 57 |
+
"visibility": 0.09822999302184943,
|
| 58 |
+
"world_model": 0.9931504021991383
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 1,
|
| 63 |
+
"train": {
|
| 64 |
+
"action": 0.011922537655520597,
|
| 65 |
+
"arm_role": 3.544907820852179e-08,
|
| 66 |
+
"belief": 0.10706032749853636,
|
| 67 |
+
"clearance": 0.08102132846650324,
|
| 68 |
+
"corridor": 0.20774916157518564,
|
| 69 |
+
"disturbance": 0.003389558960426305,
|
| 70 |
+
"grasp_affordance": 0.021082493273149195,
|
| 71 |
+
"occluder_contact": 0.22566462457180023,
|
| 72 |
+
"persistence": 0.6525513453439712,
|
| 73 |
+
"phase": 0.2908584324937118,
|
| 74 |
+
"planner_ranking": 8.167305853045323e-05,
|
| 75 |
+
"planner_risk": 0.01491448436741178,
|
| 76 |
+
"planner_success": 0.0019715585316972513,
|
| 77 |
+
"proposal_diversity": 0.0,
|
| 78 |
+
"proposal_ranking": 0.06992289091607458,
|
| 79 |
+
"proposal_reconstruction": 0.07328974894787135,
|
| 80 |
+
"proposal_success": 0.05622971397089331,
|
| 81 |
+
"reocclusion": 0.2235310155016027,
|
| 82 |
+
"role_swap_consistency": 0.0,
|
| 83 |
+
"support_mode": 0.0016054740077570866,
|
| 84 |
+
"support_stability": 0.13015181959459657,
|
| 85 |
+
"task_metrics": 0.1444544498857699,
|
| 86 |
+
"total": 0.6000718728492135,
|
| 87 |
+
"uncertainty": 5.28337070471408e-05,
|
| 88 |
+
"visibility": 0.09962119711072821,
|
| 89 |
+
"world_model": 1.0844623948398389
|
| 90 |
+
},
|
| 91 |
+
"val": {
|
| 92 |
+
"action": 0.012654555594605026,
|
| 93 |
+
"arm_role": 0.0,
|
| 94 |
+
"belief": 0.10426627686529448,
|
| 95 |
+
"clearance": 0.08140931176868352,
|
| 96 |
+
"corridor": 0.21032546257430856,
|
| 97 |
+
"disturbance": 0.004191758795445132,
|
| 98 |
+
"grasp_affordance": 0.053641817603034506,
|
| 99 |
+
"occluder_contact": 0.23278299877137848,
|
| 100 |
+
"persistence": 1.2810496254400774,
|
| 101 |
+
"phase": 0.3091735607295325,
|
| 102 |
+
"planner_ranking": 0.00015768451346571743,
|
| 103 |
+
"planner_risk": 0.00882755185364548,
|
| 104 |
+
"planner_success": 0.0008127102125647732,
|
| 105 |
+
"proposal_diversity": 0.0,
|
| 106 |
+
"proposal_ranking": 0.1040246604490235,
|
| 107 |
+
"proposal_reconstruction": 0.074961857362227,
|
| 108 |
+
"proposal_success": 0.05817745603395231,
|
| 109 |
+
"reocclusion": 0.32169581723935675,
|
| 110 |
+
"role_swap_consistency": 0.0,
|
| 111 |
+
"support_mode": 0.0020671126813712444,
|
| 112 |
+
"support_stability": 0.13979825738704565,
|
| 113 |
+
"task_metrics": 0.14051661414630484,
|
| 114 |
+
"total": 0.6931917875102072,
|
| 115 |
+
"uncertainty": 7.29499135531748e-05,
|
| 116 |
+
"visibility": 0.09667146341367201,
|
| 117 |
+
"world_model": 1.1483498287923408
|
| 118 |
+
}
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"epoch": 2,
|
| 122 |
+
"train": {
|
| 123 |
+
"action": 0.009906618949025869,
|
| 124 |
+
"arm_role": 1.2548346268503289e-08,
|
| 125 |
+
"belief": 0.10543899598874544,
|
| 126 |
+
"clearance": 0.08008571463195902,
|
| 127 |
+
"corridor": 0.20356425615517718,
|
| 128 |
+
"disturbance": 0.003584925674122611,
|
| 129 |
+
"grasp_affordance": 0.02068159209662362,
|
| 130 |
+
"occluder_contact": 0.21795249863674765,
|
| 131 |
+
"persistence": 0.7022376383202515,
|
| 132 |
+
"phase": 0.2805413355952815,
|
| 133 |
+
"planner_ranking": 5.863887835115939e-05,
|
| 134 |
+
"planner_risk": 0.007802685193325344,
|
| 135 |
+
"planner_success": 0.0006161007331684232,
|
| 136 |
+
"proposal_diversity": 0.0,
|
| 137 |
+
"proposal_ranking": 0.059995323853371176,
|
| 138 |
+
"proposal_reconstruction": 0.0712106538446326,
|
| 139 |
+
"proposal_success": 0.03740891177011164,
|
| 140 |
+
"reocclusion": 0.23808821498992314,
|
| 141 |
+
"role_swap_consistency": 0.0,
|
| 142 |
+
"support_mode": 0.0014756444253419575,
|
| 143 |
+
"support_stability": 0.1280404823783197,
|
| 144 |
+
"task_metrics": 0.14326360319790088,
|
| 145 |
+
"total": 0.5650381593327773,
|
| 146 |
+
"uncertainty": 4.478374925335964e-05,
|
| 147 |
+
"visibility": 0.09581161878610912,
|
| 148 |
+
"world_model": 1.0014131157021773
|
| 149 |
+
},
|
| 150 |
+
"val": {
|
| 151 |
+
"action": 0.011506305104403786,
|
| 152 |
+
"arm_role": 0.0,
|
| 153 |
+
"belief": 0.10174731707031076,
|
| 154 |
+
"clearance": 0.07599064165895636,
|
| 155 |
+
"corridor": 0.19603406366976825,
|
| 156 |
+
"disturbance": 0.0034094253584925987,
|
| 157 |
+
"grasp_affordance": 0.01412029577080499,
|
| 158 |
+
"occluder_contact": 0.21309178299976117,
|
| 159 |
+
"persistence": 0.7315149356921514,
|
| 160 |
+
"phase": 0.2930864508433098,
|
| 161 |
+
"planner_ranking": 0.0013956731370778168,
|
| 162 |
+
"planner_risk": 0.0041357744308753,
|
| 163 |
+
"planner_success": 0.0003894640280067864,
|
| 164 |
+
"proposal_diversity": 0.0,
|
| 165 |
+
"proposal_ranking": 0.08693639387263719,
|
| 166 |
+
"proposal_reconstruction": 0.07113255470088034,
|
| 167 |
+
"proposal_success": 0.05217952732786988,
|
| 168 |
+
"reocclusion": 0.2135303650390018,
|
| 169 |
+
"role_swap_consistency": 0.0,
|
| 170 |
+
"support_mode": 0.0002804717289770699,
|
| 171 |
+
"support_stability": 0.13648639472596574,
|
| 172 |
+
"task_metrics": 0.13825015418908812,
|
| 173 |
+
"total": 0.621522765267979,
|
| 174 |
+
"uncertainty": 5.1016551851441335e-05,
|
| 175 |
+
"visibility": 0.0916972327412981,
|
| 176 |
+
"world_model": 1.1354843739307288
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 3,
|
| 181 |
+
"train": {
|
| 182 |
+
"action": 0.009076371156659565,
|
| 183 |
+
"arm_role": 3.764503880550987e-09,
|
| 184 |
+
"belief": 0.10279968522096936,
|
| 185 |
+
"clearance": 0.07841498155735041,
|
| 186 |
+
"corridor": 0.2006541040775023,
|
| 187 |
+
"disturbance": 0.00325027588436282,
|
| 188 |
+
"grasp_affordance": 0.014321190530532284,
|
| 189 |
+
"occluder_contact": 0.2105231849770797,
|
| 190 |
+
"persistence": 0.6405418016016483,
|
| 191 |
+
"phase": 0.24933996639753642,
|
| 192 |
+
"planner_ranking": 8.54384282053831e-05,
|
| 193 |
+
"planner_risk": 0.004359905376393152,
|
| 194 |
+
"planner_success": 0.00026648731834843365,
|
| 195 |
+
"proposal_diversity": 0.0,
|
| 196 |
+
"proposal_ranking": 0.048603357446968164,
|
| 197 |
+
"proposal_reconstruction": 0.07084132981927771,
|
| 198 |
+
"proposal_success": 0.02799873605958725,
|
| 199 |
+
"reocclusion": 0.2113562585295815,
|
| 200 |
+
"role_swap_consistency": 0.0,
|
| 201 |
+
"support_mode": 0.0005042697253980135,
|
| 202 |
+
"support_stability": 0.1260564218226232,
|
| 203 |
+
"task_metrics": 0.14196890086719866,
|
| 204 |
+
"total": 0.5253663646547417,
|
| 205 |
+
"uncertainty": 2.7578330574023744e-05,
|
| 206 |
+
"visibility": 0.09414561410483561,
|
| 207 |
+
"world_model": 0.9370055976666902
|
| 208 |
+
},
|
| 209 |
+
"val": {
|
| 210 |
+
"action": 0.01043125390159813,
|
| 211 |
+
"arm_role": 0.0,
|
| 212 |
+
"belief": 0.09857180962959926,
|
| 213 |
+
"clearance": 0.07481752595666682,
|
| 214 |
+
"corridor": 0.19474056340528256,
|
| 215 |
+
"disturbance": 0.00304531856664019,
|
| 216 |
+
"grasp_affordance": 0.01476043919242467,
|
| 217 |
+
"occluder_contact": 0.20771399185513006,
|
| 218 |
+
"persistence": 0.6766496115561688,
|
| 219 |
+
"phase": 0.3030792711546052,
|
| 220 |
+
"planner_ranking": 6.782512608514011e-05,
|
| 221 |
+
"planner_risk": 0.002957202364500104,
|
| 222 |
+
"planner_success": 0.00018708011846297956,
|
| 223 |
+
"proposal_diversity": 0.0,
|
| 224 |
+
"proposal_ranking": 0.06918713425561278,
|
| 225 |
+
"proposal_reconstruction": 0.07121785075375528,
|
| 226 |
+
"proposal_success": 0.028793677348982204,
|
| 227 |
+
"reocclusion": 0.21873644242684046,
|
| 228 |
+
"role_swap_consistency": 0.0,
|
| 229 |
+
"support_mode": 0.0007097468169928161,
|
| 230 |
+
"support_stability": 0.13461551465319865,
|
| 231 |
+
"task_metrics": 0.13820900722886575,
|
| 232 |
+
"total": 0.5076249196673884,
|
| 233 |
+
"uncertainty": 2.9063129700675333e-05,
|
| 234 |
+
"visibility": 0.09054145137920525,
|
| 235 |
+
"world_model": 0.850949106794415
|
| 236 |
+
}
|
| 237 |
+
}
|
| 238 |
+
]
|
artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/summary.json
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17",
|
| 3 |
+
"device": "cuda",
|
| 4 |
+
"best_checkpoint": "/workspace/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt",
|
| 5 |
+
"final_train_total": 0.5253663646547417,
|
| 6 |
+
"final_val_total": 0.5076249196673884,
|
| 7 |
+
"train_time_sec": 154.84144067764282,
|
| 8 |
+
"peak_gpu_memory_mb": 2926.07470703125,
|
| 9 |
+
"num_train_samples": 380,
|
| 10 |
+
"num_val_samples": 131,
|
| 11 |
+
"planner_mode": "trainable",
|
| 12 |
+
"frozen_modules": [],
|
| 13 |
+
"init_info": {
|
| 14 |
+
"path": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 15 |
+
"loaded_keys": 828,
|
| 16 |
+
"skipped_shape_mismatch_keys": [
|
| 17 |
+
"decoder.proposal_mode_head.3.weight",
|
| 18 |
+
"decoder.proposal_mode_head.3.bias",
|
| 19 |
+
"decoder.proposal_mode_embeddings.weight"
|
| 20 |
+
],
|
| 21 |
+
"missing_keys": [
|
| 22 |
+
"decoder.task_embedding.weight",
|
| 23 |
+
"decoder.proposal_mode_head.3.weight",
|
| 24 |
+
"decoder.proposal_mode_head.3.bias",
|
| 25 |
+
"decoder.proposal_mode_embeddings.weight",
|
| 26 |
+
"decoder.mode_residual_heads.6.0.weight",
|
| 27 |
+
"decoder.mode_residual_heads.6.0.bias",
|
| 28 |
+
"decoder.mode_residual_heads.6.1.weight",
|
| 29 |
+
"decoder.mode_residual_heads.6.1.bias",
|
| 30 |
+
"decoder.mode_residual_heads.6.3.weight",
|
| 31 |
+
"decoder.mode_residual_heads.6.3.bias",
|
| 32 |
+
"elastic_state_head.decoder.task_embedding.weight",
|
| 33 |
+
"elastic_state_head.decoder.task_field_affine.weight",
|
| 34 |
+
"elastic_state_head.decoder.task_field_affine.bias",
|
| 35 |
+
"elastic_state_head.decoder.task_summary_adapter.0.weight",
|
| 36 |
+
"elastic_state_head.decoder.task_summary_adapter.0.bias",
|
| 37 |
+
"elastic_state_head.decoder.task_summary_adapter.1.weight",
|
| 38 |
+
"elastic_state_head.decoder.task_summary_adapter.1.bias",
|
| 39 |
+
"elastic_state_head.decoder.task_phase_head.weight",
|
| 40 |
+
"elastic_state_head.decoder.task_phase_head.bias",
|
| 41 |
+
"elastic_state_head.decoder.task_support_head.weight",
|
| 42 |
+
"elastic_state_head.decoder.task_support_head.bias",
|
| 43 |
+
"elastic_state_head.decoder.task_reocclusion_head.weight",
|
| 44 |
+
"elastic_state_head.decoder.task_reocclusion_head.bias",
|
| 45 |
+
"elastic_state_head.decoder.task_metric_head.0.weight",
|
| 46 |
+
"elastic_state_head.decoder.task_metric_head.0.bias",
|
| 47 |
+
"elastic_state_head.decoder.task_metric_head.1.weight",
|
| 48 |
+
"elastic_state_head.decoder.task_metric_head.1.bias",
|
| 49 |
+
"elastic_state_head.decoder.task_metric_head.3.weight",
|
| 50 |
+
"elastic_state_head.decoder.task_metric_head.3.bias",
|
| 51 |
+
"world_model.task_embedding.weight",
|
| 52 |
+
"world_model.spatial_field_encoder.0.weight",
|
| 53 |
+
"world_model.spatial_field_encoder.0.bias",
|
| 54 |
+
"world_model.spatial_field_encoder.2.weight",
|
| 55 |
+
"world_model.spatial_field_encoder.2.bias",
|
| 56 |
+
"world_model.spatial_context_proj.0.weight",
|
| 57 |
+
"world_model.spatial_context_proj.0.bias",
|
| 58 |
+
"world_model.spatial_context_proj.1.weight",
|
| 59 |
+
"world_model.spatial_context_proj.1.bias",
|
| 60 |
+
"world_model.spatial_gate_z.weight",
|
| 61 |
+
"world_model.spatial_gate_z.bias",
|
| 62 |
+
"world_model.spatial_gate_r.weight",
|
| 63 |
+
"world_model.spatial_gate_r.bias",
|
| 64 |
+
"world_model.spatial_candidate.weight",
|
| 65 |
+
"world_model.spatial_candidate.bias",
|
| 66 |
+
"world_model.spatial_summary_proj.0.weight",
|
| 67 |
+
"world_model.spatial_summary_proj.0.bias",
|
| 68 |
+
"world_model.spatial_summary_proj.1.weight",
|
| 69 |
+
"world_model.spatial_summary_proj.1.bias",
|
| 70 |
+
"world_model.spatial_phase_head.weight",
|
| 71 |
+
"world_model.spatial_phase_head.bias",
|
| 72 |
+
"world_model.spatial_support_mode_head.weight",
|
| 73 |
+
"world_model.spatial_support_mode_head.bias",
|
| 74 |
+
"world_model.spatial_arm_role_head.weight",
|
| 75 |
+
"world_model.spatial_arm_role_head.bias",
|
| 76 |
+
"world_model.spatial_reocclusion_head.weight",
|
| 77 |
+
"world_model.spatial_reocclusion_head.bias",
|
| 78 |
+
"world_model.spatial_target_belief_head.weight",
|
| 79 |
+
"world_model.spatial_target_belief_head.bias",
|
| 80 |
+
"world_model.spatial_visibility_head.weight",
|
| 81 |
+
"world_model.spatial_visibility_head.bias",
|
| 82 |
+
"world_model.spatial_clearance_head.weight",
|
| 83 |
+
"world_model.spatial_clearance_head.bias",
|
| 84 |
+
"world_model.spatial_occluder_contact_head.weight",
|
| 85 |
+
"world_model.spatial_occluder_contact_head.bias",
|
| 86 |
+
"world_model.spatial_grasp_affordance_head.weight",
|
| 87 |
+
"world_model.spatial_grasp_affordance_head.bias",
|
| 88 |
+
"world_model.spatial_support_stability_head.weight",
|
| 89 |
+
"world_model.spatial_support_stability_head.bias",
|
| 90 |
+
"world_model.spatial_persistence_head.weight",
|
| 91 |
+
"world_model.spatial_persistence_head.bias",
|
| 92 |
+
"world_model.spatial_reocclusion_field_head.weight",
|
| 93 |
+
"world_model.spatial_reocclusion_field_head.bias",
|
| 94 |
+
"world_model.spatial_disturbance_head.weight",
|
| 95 |
+
"world_model.spatial_disturbance_head.bias",
|
| 96 |
+
"world_model.spatial_uncertainty_head.weight",
|
| 97 |
+
"world_model.spatial_uncertainty_head.bias",
|
| 98 |
+
"world_model.spatial_access_head.weight",
|
| 99 |
+
"world_model.spatial_access_head.bias"
|
| 100 |
+
],
|
| 101 |
+
"unexpected_keys": []
|
| 102 |
+
}
|
| 103 |
+
}
|
code/reveal_vla_bimanual/eval/ablations.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
MANDATORY_ABLATIONS: tuple[str, ...] = (
|
| 2 |
-
"
|
| 3 |
-
"
|
|
|
|
| 4 |
"no_planner",
|
| 5 |
-
"
|
| 6 |
-
"
|
|
|
|
| 7 |
)
|
|
|
|
# Ablation labels that every evaluation sweep must include.
# Tuple order is preserved when iterating; run_ablations.py prepends
# None (the un-ablated full model) before running this list.
MANDATORY_ABLATIONS: tuple[str, ...] = (
    "no_geometry",
    "no_spatial_memory",
    "compact_world_model",
    "no_planner",
    "gaussian_candidates_only",
    "no_task_head",
    "no_support_mode_conditioning",
)
|
code/reveal_vla_bimanual/eval/compare_rlbench_sweeps.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _load_summary(path: Path) -> dict[str, Any]:
|
| 10 |
+
payload = json.loads(path.read_text(encoding="utf-8"))
|
| 11 |
+
task_scores = {
|
| 12 |
+
task_name: float(task_payload.get("mean_success", 0.0))
|
| 13 |
+
for task_name, task_payload in payload.get("tasks", {}).items()
|
| 14 |
+
}
|
| 15 |
+
task_returns = {
|
| 16 |
+
task_name: float(task_payload.get("mean_return", 0.0))
|
| 17 |
+
for task_name, task_payload in payload.get("tasks", {}).items()
|
| 18 |
+
}
|
| 19 |
+
task_path_recoveries = {
|
| 20 |
+
task_name: float(sum(task_payload.get("path_recoveries", [])) / max(1, len(task_payload.get("path_recoveries", []))))
|
| 21 |
+
for task_name, task_payload in payload.get("tasks", {}).items()
|
| 22 |
+
}
|
| 23 |
+
task_noop_fallbacks = {
|
| 24 |
+
task_name: float(sum(task_payload.get("noop_fallbacks", [])) / max(1, len(task_payload.get("noop_fallbacks", []))))
|
| 25 |
+
for task_name, task_payload in payload.get("tasks", {}).items()
|
| 26 |
+
}
|
| 27 |
+
return {
|
| 28 |
+
"path": str(path),
|
| 29 |
+
"checkpoint": payload.get("checkpoint"),
|
| 30 |
+
"mean_success": float(payload.get("mean_success", 0.0)),
|
| 31 |
+
"mean_return": float(sum(task_returns.values()) / max(1, len(task_returns))),
|
| 32 |
+
"mean_path_recoveries": float(sum(task_path_recoveries.values()) / max(1, len(task_path_recoveries))),
|
| 33 |
+
"mean_noop_fallbacks": float(sum(task_noop_fallbacks.values()) / max(1, len(task_noop_fallbacks))),
|
| 34 |
+
"plan_requested": bool(payload.get("plan_requested", False)),
|
| 35 |
+
"plan_applied": bool(payload.get("plan_applied", False)),
|
| 36 |
+
"no_planner": bool(payload.get("no_planner", False)),
|
| 37 |
+
"no_geometry": bool(payload.get("no_geometry", False)),
|
| 38 |
+
"disable_task_conditioning": bool(payload.get("disable_task_conditioning", False)),
|
| 39 |
+
"compact_world_model": bool(payload.get("compact_world_model", False)),
|
| 40 |
+
"task_scores": task_scores,
|
| 41 |
+
"task_returns": task_returns,
|
| 42 |
+
"task_path_recoveries": task_path_recoveries,
|
| 43 |
+
"task_noop_fallbacks": task_noop_fallbacks,
|
| 44 |
+
"error_tasks": list(payload.get("error_tasks", [])),
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _pairwise_delta(reference: dict[str, Any], candidate: dict[str, Any]) -> dict[str, Any]:
|
| 49 |
+
shared_tasks = sorted(set(reference["task_scores"]) & set(candidate["task_scores"]))
|
| 50 |
+
if not shared_tasks:
|
| 51 |
+
return {
|
| 52 |
+
"shared_task_count": 0,
|
| 53 |
+
"mean_success_delta": 0.0,
|
| 54 |
+
"mean_return_delta": 0.0,
|
| 55 |
+
"mean_path_recoveries_delta": 0.0,
|
| 56 |
+
"mean_noop_fallbacks_delta": 0.0,
|
| 57 |
+
"per_task_delta": {},
|
| 58 |
+
}
|
| 59 |
+
per_task_delta = {
|
| 60 |
+
task_name: float(candidate["task_scores"][task_name] - reference["task_scores"][task_name])
|
| 61 |
+
for task_name in shared_tasks
|
| 62 |
+
}
|
| 63 |
+
return {
|
| 64 |
+
"shared_task_count": len(shared_tasks),
|
| 65 |
+
"mean_success_delta": float(candidate["mean_success"] - reference["mean_success"]),
|
| 66 |
+
"mean_return_delta": float(candidate["mean_return"] - reference["mean_return"]),
|
| 67 |
+
"mean_path_recoveries_delta": float(candidate["mean_path_recoveries"] - reference["mean_path_recoveries"]),
|
| 68 |
+
"mean_noop_fallbacks_delta": float(candidate["mean_noop_fallbacks"] - reference["mean_noop_fallbacks"]),
|
| 69 |
+
"per_task_delta": per_task_delta,
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _markdown_lines(reference_label: str, comparison: dict[str, Any]) -> list[str]:
|
| 74 |
+
lines = [
|
| 75 |
+
"# RLBench Sweep Comparison",
|
| 76 |
+
"",
|
| 77 |
+
f"- Reference: `{reference_label}`",
|
| 78 |
+
"",
|
| 79 |
+
"## Runs",
|
| 80 |
+
"",
|
| 81 |
+
]
|
| 82 |
+
for label, payload in comparison["runs"].items():
|
| 83 |
+
lines.append(
|
| 84 |
+
f"- `{label}`: mean_success={payload['mean_success']:.3f}, "
|
| 85 |
+
f"mean_return={payload['mean_return']:.3f}, "
|
| 86 |
+
f"mean_path_recoveries={payload['mean_path_recoveries']:.3f}, "
|
| 87 |
+
f"mean_noop_fallbacks={payload['mean_noop_fallbacks']:.3f}, "
|
| 88 |
+
f"plan_applied={payload['plan_applied']}, "
|
| 89 |
+
f"errors={len(payload['error_tasks'])}, "
|
| 90 |
+
f"path=`{payload['path']}`"
|
| 91 |
+
)
|
| 92 |
+
lines.extend(["", "## Pairwise Deltas", ""])
|
| 93 |
+
for label, payload in comparison["pairwise_against_reference"].items():
|
| 94 |
+
lines.append(
|
| 95 |
+
f"- `{label}`: mean_success_delta={payload['mean_success_delta']:.3f}, "
|
| 96 |
+
f"mean_return_delta={payload['mean_return_delta']:.3f}, "
|
| 97 |
+
f"mean_path_recoveries_delta={payload['mean_path_recoveries_delta']:.3f}, "
|
| 98 |
+
f"mean_noop_fallbacks_delta={payload['mean_noop_fallbacks_delta']:.3f}, "
|
| 99 |
+
f"shared_tasks={payload['shared_task_count']}"
|
| 100 |
+
)
|
| 101 |
+
return lines
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main() -> None:
    """CLI entry point: compare labelled RLBench sweep summaries.

    Accepts one or more ``--run label=/path/to/rollout_eval.json`` pairs,
    computes pairwise deltas of every run against the run named by
    ``--reference-label``, and writes ``rlbench_comparison.json`` and
    ``rlbench_comparison.md`` into ``--output-dir``.

    Raises
    ------
    ValueError
        If a ``--run`` argument is not of the form ``label=path``, or the
        reference label does not match any provided run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run", action="append", required=True, help="label=/abs/path/to/rollout_eval.json")
    parser.add_argument("--reference-label", required=True)
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    runs: dict[str, dict[str, Any]] = {}
    for item in args.run:
        # Fail early with a readable message instead of an opaque unpacking
        # error when the label=path separator is missing.
        if "=" not in item:
            raise ValueError(f"--run expects label=/abs/path/to/rollout_eval.json, got {item!r}")
        label, raw_path = item.split("=", 1)
        runs[label] = _load_summary(Path(raw_path).resolve())

    if args.reference_label not in runs:
        raise ValueError(f"Missing reference label {args.reference_label!r} in provided runs.")

    reference = runs[args.reference_label]
    comparison = {
        "reference_label": args.reference_label,
        "runs": runs,
        "pairwise_against_reference": {
            label: _pairwise_delta(reference, payload)
            for label, payload in runs.items()
            if label != args.reference_label
        },
    }

    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "rlbench_comparison.json").write_text(
        json.dumps(comparison, indent=2),
        encoding="utf-8",
    )
    (output_dir / "rlbench_comparison.md").write_text(
        "\n".join(_markdown_lines(args.reference_label, comparison)) + "\n",
        encoding="utf-8",
    )
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
if __name__ == "__main__":
|
| 143 |
+
main()
|
code/reveal_vla_bimanual/eval/run_ablations.py
CHANGED
|
@@ -7,7 +7,7 @@ import time
|
|
| 7 |
|
| 8 |
from eval.ablations import MANDATORY_ABLATIONS
|
| 9 |
from eval.report import write_comparison_report
|
| 10 |
-
from eval.run_reveal_benchmark import evaluate_model, load_model
|
| 11 |
from sim_reveal import available_proxy_names
|
| 12 |
|
| 13 |
import torch
|
|
@@ -20,6 +20,7 @@ def main() -> None:
|
|
| 20 |
parser.add_argument("--resolution", type=int, default=None)
|
| 21 |
parser.add_argument("--output-root", default="/workspace/reports/reveal_ablation")
|
| 22 |
parser.add_argument("--proxies", nargs="*", default=None)
|
|
|
|
| 23 |
parser.add_argument("--resume", action="store_true")
|
| 24 |
args = parser.parse_args()
|
| 25 |
|
|
@@ -29,17 +30,20 @@ def main() -> None:
|
|
| 29 |
proxies = list(args.proxies or available_proxy_names())
|
| 30 |
output_root = Path(args.output_root)
|
| 31 |
output_root.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 32 |
|
| 33 |
json_path = output_root / "ablations.json"
|
| 34 |
partial_path = output_root / "ablations.partial.json"
|
| 35 |
sections = {}
|
| 36 |
raw = {}
|
|
|
|
| 37 |
completed_labels: set[str] = set()
|
| 38 |
if args.resume and partial_path.exists():
|
| 39 |
partial = json.loads(partial_path.read_text(encoding="utf-8"))
|
| 40 |
raw = partial.get("raw", {})
|
| 41 |
sections = partial.get("sections", {})
|
| 42 |
completed_labels = set(raw)
|
|
|
|
| 43 |
print(json.dumps({"resume_from": str(partial_path), "completed": sorted(completed_labels)}, indent=2))
|
| 44 |
|
| 45 |
ablations = (None, *MANDATORY_ABLATIONS)
|
|
@@ -56,7 +60,9 @@ def main() -> None:
|
|
| 56 |
episodes=args.episodes,
|
| 57 |
resolution=resolution,
|
| 58 |
ablation=ablation,
|
|
|
|
| 59 |
)
|
|
|
|
| 60 |
raw[label] = {
|
| 61 |
"per_task_success": metrics.per_task_success,
|
| 62 |
"mean_success": metrics.mean_success,
|
|
@@ -65,6 +71,7 @@ def main() -> None:
|
|
| 65 |
"reocclusion_rate": metrics.reocclusion_rate,
|
| 66 |
"persistence_horizon_mae": metrics.persistence_horizon_mae,
|
| 67 |
"disturbance_cost": metrics.disturbance_cost,
|
|
|
|
| 68 |
}
|
| 69 |
sections[label] = {
|
| 70 |
"mean_success": metrics.mean_success,
|
|
@@ -73,12 +80,21 @@ def main() -> None:
|
|
| 73 |
"reocclusion_rate": metrics.reocclusion_rate or 0.0,
|
| 74 |
"persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
|
| 75 |
"disturbance_cost": metrics.disturbance_cost or 0.0,
|
|
|
|
| 76 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
partial_path.write_text(
|
| 78 |
json.dumps(
|
| 79 |
{
|
| 80 |
"checkpoint": args.checkpoint,
|
| 81 |
"episodes": args.episodes,
|
|
|
|
| 82 |
"sections": sections,
|
| 83 |
"raw": raw,
|
| 84 |
"elapsed_seconds": time.monotonic() - start_time,
|
|
|
|
| 7 |
|
| 8 |
from eval.ablations import MANDATORY_ABLATIONS
|
| 9 |
from eval.report import write_comparison_report
|
| 10 |
+
from eval.run_reveal_benchmark import _paired_seed_summary, evaluate_model, load_model
|
| 11 |
from sim_reveal import available_proxy_names
|
| 12 |
|
| 13 |
import torch
|
|
|
|
| 20 |
parser.add_argument("--resolution", type=int, default=None)
|
| 21 |
parser.add_argument("--output-root", default="/workspace/reports/reveal_ablation")
|
| 22 |
parser.add_argument("--proxies", nargs="*", default=None)
|
| 23 |
+
parser.add_argument("--chunk-commit-steps", type=int, default=0)
|
| 24 |
parser.add_argument("--resume", action="store_true")
|
| 25 |
args = parser.parse_args()
|
| 26 |
|
|
|
|
| 30 |
proxies = list(args.proxies or available_proxy_names())
|
| 31 |
output_root = Path(args.output_root)
|
| 32 |
output_root.mkdir(parents=True, exist_ok=True)
|
| 33 |
+
chunk_commit_steps = None if args.chunk_commit_steps <= 0 else args.chunk_commit_steps
|
| 34 |
|
| 35 |
json_path = output_root / "ablations.json"
|
| 36 |
partial_path = output_root / "ablations.partial.json"
|
| 37 |
sections = {}
|
| 38 |
raw = {}
|
| 39 |
+
full_episode_records: list[dict[str, float | int | str]] | None = None
|
| 40 |
completed_labels: set[str] = set()
|
| 41 |
if args.resume and partial_path.exists():
|
| 42 |
partial = json.loads(partial_path.read_text(encoding="utf-8"))
|
| 43 |
raw = partial.get("raw", {})
|
| 44 |
sections = partial.get("sections", {})
|
| 45 |
completed_labels = set(raw)
|
| 46 |
+
full_episode_records = raw.get("full_model", {}).get("episode_records")
|
| 47 |
print(json.dumps({"resume_from": str(partial_path), "completed": sorted(completed_labels)}, indent=2))
|
| 48 |
|
| 49 |
ablations = (None, *MANDATORY_ABLATIONS)
|
|
|
|
| 60 |
episodes=args.episodes,
|
| 61 |
resolution=resolution,
|
| 62 |
ablation=ablation,
|
| 63 |
+
chunk_commit_steps=chunk_commit_steps,
|
| 64 |
)
|
| 65 |
+
metrics, episode_records = metrics
|
| 66 |
raw[label] = {
|
| 67 |
"per_task_success": metrics.per_task_success,
|
| 68 |
"mean_success": metrics.mean_success,
|
|
|
|
| 71 |
"reocclusion_rate": metrics.reocclusion_rate,
|
| 72 |
"persistence_horizon_mae": metrics.persistence_horizon_mae,
|
| 73 |
"disturbance_cost": metrics.disturbance_cost,
|
| 74 |
+
"episode_records": episode_records,
|
| 75 |
}
|
| 76 |
sections[label] = {
|
| 77 |
"mean_success": metrics.mean_success,
|
|
|
|
| 80 |
"reocclusion_rate": metrics.reocclusion_rate or 0.0,
|
| 81 |
"persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
|
| 82 |
"disturbance_cost": metrics.disturbance_cost or 0.0,
|
| 83 |
+
"chunk_commit_steps": float(0 if chunk_commit_steps is None else chunk_commit_steps),
|
| 84 |
}
|
| 85 |
+
if label == "full_model":
|
| 86 |
+
full_episode_records = episode_records
|
| 87 |
+
elif full_episode_records is not None:
|
| 88 |
+
paired = _paired_seed_summary(full_episode_records, episode_records)
|
| 89 |
+
raw[label]["paired_seed_summary_vs_full_model"] = paired
|
| 90 |
+
for key, value in paired.items():
|
| 91 |
+
sections[label][f"paired_{key}_vs_full_model"] = value
|
| 92 |
partial_path.write_text(
|
| 93 |
json.dumps(
|
| 94 |
{
|
| 95 |
"checkpoint": args.checkpoint,
|
| 96 |
"episodes": args.episodes,
|
| 97 |
+
"chunk_commit_steps": 0 if chunk_commit_steps is None else chunk_commit_steps,
|
| 98 |
"sections": sections,
|
| 99 |
"raw": raw,
|
| 100 |
"elapsed_seconds": time.monotonic() - start_time,
|
code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py
CHANGED
|
@@ -25,6 +25,10 @@ def _run_task(
|
|
| 25 |
chunk_commit_steps: int,
|
| 26 |
allow_unsupervised_planning: bool,
|
| 27 |
disable_support_mode_conditioning: bool,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
) -> dict[str, Any]:
|
| 29 |
task_dir = output_dir / task_name
|
| 30 |
task_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -53,10 +57,18 @@ def _run_task(
|
|
| 53 |
command.append("--headless")
|
| 54 |
if plan:
|
| 55 |
command.append("--plan")
|
|
|
|
|
|
|
| 56 |
if allow_unsupervised_planning:
|
| 57 |
command.append("--allow-unsupervised-planning")
|
| 58 |
if disable_support_mode_conditioning:
|
| 59 |
command.append("--disable-support-mode-conditioning")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
completed = subprocess.run(
|
| 62 |
command,
|
|
@@ -105,6 +117,10 @@ def _write_summary_markdown(path: Path, payload: dict[str, Any]) -> None:
|
|
| 105 |
f"- Episodes per task: `{payload['episodes_per_task']}`",
|
| 106 |
f"- Episode length: `{payload['episode_length']}`",
|
| 107 |
f"- Resolution: `{payload['resolution']}`",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
f"- Task count: `{payload['task_count']}`",
|
| 109 |
f"- Error tasks: `{payload['error_tasks']}`",
|
| 110 |
f"- Mean success: `{payload['mean_success']:.3f}`",
|
|
@@ -144,6 +160,10 @@ def _run_mode(args: argparse.Namespace, plan: bool) -> Path:
|
|
| 144 |
"episode_length": args.episode_length,
|
| 145 |
"resolution": args.resolution,
|
| 146 |
"device": args.device,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
"tasks": {},
|
| 148 |
"subprocess_mode": "isolated_per_task",
|
| 149 |
}
|
|
@@ -165,6 +185,10 @@ def _run_mode(args: argparse.Namespace, plan: bool) -> Path:
|
|
| 165 |
chunk_commit_steps=args.chunk_commit_steps,
|
| 166 |
allow_unsupervised_planning=args.allow_unsupervised_planning,
|
| 167 |
disable_support_mode_conditioning=args.disable_support_mode_conditioning,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
)
|
| 169 |
|
| 170 |
task_scores = [float(task_payload["mean_success"]) for task_payload in summary["tasks"].values()]
|
|
@@ -194,6 +218,10 @@ def main() -> None:
|
|
| 194 |
parser.add_argument("--chunk-commit-steps", type=int, default=4)
|
| 195 |
parser.add_argument("--allow-unsupervised-planning", action="store_true")
|
| 196 |
parser.add_argument("--disable-support-mode-conditioning", action="store_true")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
parser.add_argument("--skip-noplan", action="store_true")
|
| 198 |
parser.add_argument("--skip-plan", action="store_true")
|
| 199 |
args = parser.parse_args()
|
|
|
|
| 25 |
chunk_commit_steps: int,
|
| 26 |
allow_unsupervised_planning: bool,
|
| 27 |
disable_support_mode_conditioning: bool,
|
| 28 |
+
disable_task_conditioning: bool,
|
| 29 |
+
no_geometry: bool,
|
| 30 |
+
compact_world_model: bool,
|
| 31 |
+
no_planner: bool,
|
| 32 |
) -> dict[str, Any]:
|
| 33 |
task_dir = output_dir / task_name
|
| 34 |
task_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 57 |
command.append("--headless")
|
| 58 |
if plan:
|
| 59 |
command.append("--plan")
|
| 60 |
+
if no_planner:
|
| 61 |
+
command.append("--no-planner")
|
| 62 |
if allow_unsupervised_planning:
|
| 63 |
command.append("--allow-unsupervised-planning")
|
| 64 |
if disable_support_mode_conditioning:
|
| 65 |
command.append("--disable-support-mode-conditioning")
|
| 66 |
+
if disable_task_conditioning:
|
| 67 |
+
command.append("--disable-task-conditioning")
|
| 68 |
+
if no_geometry:
|
| 69 |
+
command.append("--no-geometry")
|
| 70 |
+
if compact_world_model:
|
| 71 |
+
command.append("--compact-world-model")
|
| 72 |
|
| 73 |
completed = subprocess.run(
|
| 74 |
command,
|
|
|
|
| 117 |
f"- Episodes per task: `{payload['episodes_per_task']}`",
|
| 118 |
f"- Episode length: `{payload['episode_length']}`",
|
| 119 |
f"- Resolution: `{payload['resolution']}`",
|
| 120 |
+
f"- No planner: `{payload['no_planner']}`",
|
| 121 |
+
f"- Disable task conditioning: `{payload['disable_task_conditioning']}`",
|
| 122 |
+
f"- No geometry: `{payload['no_geometry']}`",
|
| 123 |
+
f"- Compact world model: `{payload['compact_world_model']}`",
|
| 124 |
f"- Task count: `{payload['task_count']}`",
|
| 125 |
f"- Error tasks: `{payload['error_tasks']}`",
|
| 126 |
f"- Mean success: `{payload['mean_success']:.3f}`",
|
|
|
|
| 160 |
"episode_length": args.episode_length,
|
| 161 |
"resolution": args.resolution,
|
| 162 |
"device": args.device,
|
| 163 |
+
"no_planner": args.no_planner,
|
| 164 |
+
"disable_task_conditioning": args.disable_task_conditioning,
|
| 165 |
+
"no_geometry": args.no_geometry,
|
| 166 |
+
"compact_world_model": args.compact_world_model,
|
| 167 |
"tasks": {},
|
| 168 |
"subprocess_mode": "isolated_per_task",
|
| 169 |
}
|
|
|
|
| 185 |
chunk_commit_steps=args.chunk_commit_steps,
|
| 186 |
allow_unsupervised_planning=args.allow_unsupervised_planning,
|
| 187 |
disable_support_mode_conditioning=args.disable_support_mode_conditioning,
|
| 188 |
+
disable_task_conditioning=args.disable_task_conditioning,
|
| 189 |
+
no_geometry=args.no_geometry,
|
| 190 |
+
compact_world_model=args.compact_world_model,
|
| 191 |
+
no_planner=args.no_planner,
|
| 192 |
)
|
| 193 |
|
| 194 |
task_scores = [float(task_payload["mean_success"]) for task_payload in summary["tasks"].values()]
|
|
|
|
| 218 |
parser.add_argument("--chunk-commit-steps", type=int, default=4)
|
| 219 |
parser.add_argument("--allow-unsupervised-planning", action="store_true")
|
| 220 |
parser.add_argument("--disable-support-mode-conditioning", action="store_true")
|
| 221 |
+
parser.add_argument("--disable-task-conditioning", action="store_true")
|
| 222 |
+
parser.add_argument("--no-geometry", action="store_true")
|
| 223 |
+
parser.add_argument("--compact-world-model", action="store_true")
|
| 224 |
+
parser.add_argument("--no-planner", action="store_true")
|
| 225 |
parser.add_argument("--skip-noplan", action="store_true")
|
| 226 |
parser.add_argument("--skip-plan", action="store_true")
|
| 227 |
args = parser.parse_args()
|
code/reveal_vla_bimanual/eval/run_reveal_benchmark.py
CHANGED
|
@@ -57,7 +57,12 @@ def load_model(checkpoint_path: str | Path, device: torch.device) -> tuple[torch
|
|
| 57 |
allowed_missing = {
|
| 58 |
key
|
| 59 |
for key in incompatible.missing_keys
|
| 60 |
-
if key.startswith("memory.action_proj.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
|
| 63 |
if missing_other or incompatible.unexpected_keys:
|
|
@@ -175,11 +180,18 @@ def select_chunk(
|
|
| 175 |
history_depths=batch.get("history_depths"),
|
| 176 |
history_depth_valid=batch.get("history_depth_valid"),
|
| 177 |
plan=True,
|
|
|
|
| 178 |
use_world_model=(ablation not in {"no_world_model", "no_planner"}),
|
| 179 |
use_planner=(ablation != "no_planner"),
|
| 180 |
use_depth=(ablation != "no_depth"),
|
|
|
|
|
|
|
| 181 |
use_role_tokens=(ablation not in {"no_role_tokens", "no_role_symmetry"}),
|
| 182 |
history_steps_override=(2 if ablation == "short_history" else None),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
)
|
| 184 |
if "planned_chunk" in outputs and ablation != "no_planner":
|
| 185 |
return outputs["planned_chunk"], outputs
|
|
@@ -204,6 +216,52 @@ def select_chunk(
|
|
| 204 |
return outputs["action_mean"], outputs
|
| 205 |
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
def evaluate_model(
|
| 208 |
model: torch.nn.Module,
|
| 209 |
device: torch.device,
|
|
@@ -212,24 +270,26 @@ def evaluate_model(
|
|
| 212 |
resolution: int,
|
| 213 |
ablation: str | None = None,
|
| 214 |
chunk_commit_steps: int | None = None,
|
| 215 |
-
) -> BenchmarkMetrics:
|
| 216 |
per_task_success: dict[str, float] = {}
|
| 217 |
visibility_scores = []
|
| 218 |
corridor_scores = []
|
| 219 |
reocclusion_scores = []
|
| 220 |
persistence_errors = []
|
| 221 |
disturbance_scores = []
|
|
|
|
| 222 |
history_steps = int(getattr(model.config.memory, "history_steps", 0)) if hasattr(model, "config") else 0
|
| 223 |
|
| 224 |
for proxy_offset, proxy_name in enumerate(proxies):
|
| 225 |
successes = []
|
| 226 |
for episode_idx in range(episodes):
|
|
|
|
| 227 |
env = make_proxy_env(
|
| 228 |
proxy_name=proxy_name,
|
| 229 |
resolution=resolution,
|
| 230 |
-
seed=
|
| 231 |
)
|
| 232 |
-
observation, privileged_state = env.reset(seed=
|
| 233 |
episode_visibility = [float(privileged_state["visibility"])]
|
| 234 |
episode_corridor = [float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any())]
|
| 235 |
episode_disturbance = [float(privileged_state["disturbance_cost"])]
|
|
@@ -287,20 +347,39 @@ def evaluate_model(
|
|
| 287 |
if done:
|
| 288 |
break
|
| 289 |
successes.append(float(privileged_state["retrieval_success"]))
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
per_task_success[proxy_name] = float(np.mean(successes))
|
| 295 |
|
| 296 |
-
return
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
| 304 |
)
|
| 305 |
|
| 306 |
|
|
@@ -319,7 +398,8 @@ def _metrics_to_dict(metrics: BenchmarkMetrics) -> dict[str, float | dict[str, f
|
|
| 319 |
def main() -> None:
|
| 320 |
parser = argparse.ArgumentParser()
|
| 321 |
parser.add_argument("--model", action="append", required=True, help="label=/abs/path/checkpoint.pt")
|
| 322 |
-
parser.add_argument("--episodes", type=int, default=
|
|
|
|
| 323 |
parser.add_argument("--resolution", type=int, default=None)
|
| 324 |
parser.add_argument("--ablation", default=None)
|
| 325 |
parser.add_argument("--output-root", default="/workspace/reports/reveal_eval")
|
|
@@ -329,37 +409,76 @@ def main() -> None:
|
|
| 329 |
|
| 330 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 331 |
proxies = list(args.proxies or available_proxy_names())
|
|
|
|
|
|
|
| 332 |
output_root = Path(args.output_root)
|
| 333 |
output_root.mkdir(parents=True, exist_ok=True)
|
| 334 |
|
| 335 |
sections: dict[str, dict[str, float | str]] = {}
|
| 336 |
-
raw_metrics: dict[str, dict[str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
for item in args.model:
|
| 338 |
label, checkpoint_path = item.split("=", maxsplit=1)
|
| 339 |
model, checkpoint = load_model(checkpoint_path, device=device)
|
| 340 |
resolution = int(args.resolution or checkpoint.get("data_resolution", 96))
|
| 341 |
-
metrics = evaluate_model(
|
| 342 |
model=model,
|
| 343 |
device=device,
|
| 344 |
proxies=proxies,
|
| 345 |
-
episodes=
|
| 346 |
resolution=resolution,
|
| 347 |
ablation=args.ablation,
|
| 348 |
-
chunk_commit_steps=
|
| 349 |
)
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
sections[label] = {
|
| 352 |
"checkpoint": checkpoint_path,
|
|
|
|
|
|
|
|
|
|
| 353 |
"mean_success": metrics.mean_success,
|
| 354 |
"visibility_integral": metrics.visibility_integral or 0.0,
|
| 355 |
"corridor_availability": metrics.corridor_availability or 0.0,
|
| 356 |
"reocclusion_rate": metrics.reocclusion_rate or 0.0,
|
| 357 |
"persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
|
| 358 |
"disturbance_cost": metrics.disturbance_cost or 0.0,
|
|
|
|
|
|
|
| 359 |
}
|
| 360 |
for task_name, score in metrics.per_task_success.items():
|
| 361 |
sections[label][f"{task_name}_success"] = score
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
json_path = output_root / "reveal_benchmark.json"
|
| 364 |
json_path.write_text(json.dumps(raw_metrics, indent=2), encoding="utf-8")
|
| 365 |
write_comparison_report(output_root / "reveal_benchmark.md", "Reveal Proxy Benchmark", sections)
|
|
|
|
| 57 |
allowed_missing = {
|
| 58 |
key
|
| 59 |
for key in incompatible.missing_keys
|
| 60 |
+
if key.startswith("memory.action_proj.")
|
| 61 |
+
or key.endswith("arm_identity.weight")
|
| 62 |
+
or key.endswith("task_embedding.weight")
|
| 63 |
+
or key.startswith("elastic_state_head.decoder.task_")
|
| 64 |
+
or key.startswith("world_model.task_")
|
| 65 |
+
or key.startswith("world_model.spatial_")
|
| 66 |
}
|
| 67 |
missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
|
| 68 |
if missing_other or incompatible.unexpected_keys:
|
|
|
|
| 180 |
history_depths=batch.get("history_depths"),
|
| 181 |
history_depth_valid=batch.get("history_depth_valid"),
|
| 182 |
plan=True,
|
| 183 |
+
support_mode_conditioning=(ablation != "no_support_mode_conditioning"),
|
| 184 |
use_world_model=(ablation not in {"no_world_model", "no_planner"}),
|
| 185 |
use_planner=(ablation != "no_planner"),
|
| 186 |
use_depth=(ablation != "no_depth"),
|
| 187 |
+
use_geometry_tokens=(ablation != "no_geometry"),
|
| 188 |
+
use_camera_pose_tokens=(ablation != "no_camera_pose"),
|
| 189 |
use_role_tokens=(ablation not in {"no_role_tokens", "no_role_symmetry"}),
|
| 190 |
history_steps_override=(2 if ablation == "short_history" else None),
|
| 191 |
+
use_memory=(ablation != "no_spatial_memory"),
|
| 192 |
+
use_task_conditioning=(ablation != "no_task_head"),
|
| 193 |
+
rollout_mode_override=("compact_rollout" if ablation == "compact_world_model" else None),
|
| 194 |
+
use_proposal_candidates=(ablation != "gaussian_candidates_only"),
|
| 195 |
)
|
| 196 |
if "planned_chunk" in outputs and ablation != "no_planner":
|
| 197 |
return outputs["planned_chunk"], outputs
|
|
|
|
| 216 |
return outputs["action_mean"], outputs
|
| 217 |
|
| 218 |
|
| 219 |
+
def _bootstrap_interval(values: list[float], bootstrap_samples: int = 1000) -> dict[str, float]:
|
| 220 |
+
if not values:
|
| 221 |
+
return {"mean": 0.0, "low": 0.0, "high": 0.0}
|
| 222 |
+
array = np.asarray(values, dtype=np.float32)
|
| 223 |
+
mean = float(array.mean())
|
| 224 |
+
if array.size == 1:
|
| 225 |
+
return {"mean": mean, "low": mean, "high": mean}
|
| 226 |
+
rng = np.random.default_rng(0)
|
| 227 |
+
sample_indices = rng.integers(0, array.size, size=(bootstrap_samples, array.size))
|
| 228 |
+
sampled_means = array[sample_indices].mean(axis=1)
|
| 229 |
+
return {
|
| 230 |
+
"mean": mean,
|
| 231 |
+
"low": float(np.percentile(sampled_means, 2.5)),
|
| 232 |
+
"high": float(np.percentile(sampled_means, 97.5)),
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def _paired_seed_summary(
|
| 237 |
+
reference_records: list[dict[str, float | int | str]],
|
| 238 |
+
candidate_records: list[dict[str, float | int | str]],
|
| 239 |
+
) -> dict[str, float]:
|
| 240 |
+
reference_by_key = {
|
| 241 |
+
(str(record["proxy_name"]), int(record["seed"])): record for record in reference_records
|
| 242 |
+
}
|
| 243 |
+
success_deltas = []
|
| 244 |
+
visibility_deltas = []
|
| 245 |
+
reocclusion_deltas = []
|
| 246 |
+
disturbance_deltas = []
|
| 247 |
+
for record in candidate_records:
|
| 248 |
+
key = (str(record["proxy_name"]), int(record["seed"]))
|
| 249 |
+
baseline = reference_by_key.get(key)
|
| 250 |
+
if baseline is None:
|
| 251 |
+
continue
|
| 252 |
+
success_deltas.append(float(record["success"]) - float(baseline["success"]))
|
| 253 |
+
visibility_deltas.append(float(record["visibility_integral"]) - float(baseline["visibility_integral"]))
|
| 254 |
+
reocclusion_deltas.append(float(record["reocclusion_rate"]) - float(baseline["reocclusion_rate"]))
|
| 255 |
+
disturbance_deltas.append(float(record["disturbance_cost"]) - float(baseline["disturbance_cost"]))
|
| 256 |
+
return {
|
| 257 |
+
"paired_episodes": float(len(success_deltas)),
|
| 258 |
+
"success_delta": float(np.mean(success_deltas)) if success_deltas else 0.0,
|
| 259 |
+
"visibility_delta": float(np.mean(visibility_deltas)) if visibility_deltas else 0.0,
|
| 260 |
+
"reocclusion_delta": float(np.mean(reocclusion_deltas)) if reocclusion_deltas else 0.0,
|
| 261 |
+
"disturbance_delta": float(np.mean(disturbance_deltas)) if disturbance_deltas else 0.0,
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
|
| 265 |
def evaluate_model(
|
| 266 |
model: torch.nn.Module,
|
| 267 |
device: torch.device,
|
|
|
|
| 270 |
resolution: int,
|
| 271 |
ablation: str | None = None,
|
| 272 |
chunk_commit_steps: int | None = None,
|
| 273 |
+
) -> tuple[BenchmarkMetrics, list[dict[str, float | int | str]]]:
|
| 274 |
per_task_success: dict[str, float] = {}
|
| 275 |
visibility_scores = []
|
| 276 |
corridor_scores = []
|
| 277 |
reocclusion_scores = []
|
| 278 |
persistence_errors = []
|
| 279 |
disturbance_scores = []
|
| 280 |
+
episode_records: list[dict[str, float | int | str]] = []
|
| 281 |
history_steps = int(getattr(model.config.memory, "history_steps", 0)) if hasattr(model, "config") else 0
|
| 282 |
|
| 283 |
for proxy_offset, proxy_name in enumerate(proxies):
|
| 284 |
successes = []
|
| 285 |
for episode_idx in range(episodes):
|
| 286 |
+
seed = proxy_offset * 10_000 + episode_idx
|
| 287 |
env = make_proxy_env(
|
| 288 |
proxy_name=proxy_name,
|
| 289 |
resolution=resolution,
|
| 290 |
+
seed=seed,
|
| 291 |
)
|
| 292 |
+
observation, privileged_state = env.reset(seed=seed)
|
| 293 |
episode_visibility = [float(privileged_state["visibility"])]
|
| 294 |
episode_corridor = [float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any())]
|
| 295 |
episode_disturbance = [float(privileged_state["disturbance_cost"])]
|
|
|
|
| 347 |
if done:
|
| 348 |
break
|
| 349 |
successes.append(float(privileged_state["retrieval_success"]))
|
| 350 |
+
episode_visibility_integral = visibility_integral(np.asarray(episode_visibility))
|
| 351 |
+
episode_corridor_availability = corridor_availability(np.asarray(episode_corridor))
|
| 352 |
+
episode_reocclusion = reocclusion_rate(np.asarray(episode_corridor))
|
| 353 |
+
episode_disturbance_cost = mean_disturbance_cost(np.asarray(episode_disturbance))
|
| 354 |
+
visibility_scores.append(episode_visibility_integral)
|
| 355 |
+
corridor_scores.append(episode_corridor_availability)
|
| 356 |
+
reocclusion_scores.append(episode_reocclusion)
|
| 357 |
+
disturbance_scores.append(episode_disturbance_cost)
|
| 358 |
+
episode_records.append(
|
| 359 |
+
{
|
| 360 |
+
"proxy_name": proxy_name,
|
| 361 |
+
"seed": seed,
|
| 362 |
+
"episode_index": episode_idx,
|
| 363 |
+
"success": float(privileged_state["retrieval_success"]),
|
| 364 |
+
"visibility_integral": episode_visibility_integral,
|
| 365 |
+
"corridor_availability": episode_corridor_availability,
|
| 366 |
+
"reocclusion_rate": episode_reocclusion,
|
| 367 |
+
"disturbance_cost": episode_disturbance_cost,
|
| 368 |
+
}
|
| 369 |
+
)
|
| 370 |
per_task_success[proxy_name] = float(np.mean(successes))
|
| 371 |
|
| 372 |
+
return (
|
| 373 |
+
BenchmarkMetrics(
|
| 374 |
+
per_task_success=per_task_success,
|
| 375 |
+
mean_success=mean_success(per_task_success),
|
| 376 |
+
visibility_integral=float(np.mean(visibility_scores)) if visibility_scores else None,
|
| 377 |
+
corridor_availability=float(np.mean(corridor_scores)) if corridor_scores else None,
|
| 378 |
+
reocclusion_rate=float(np.mean(reocclusion_scores)) if reocclusion_scores else None,
|
| 379 |
+
persistence_horizon_mae=float(np.mean(persistence_errors)) if persistence_errors else None,
|
| 380 |
+
disturbance_cost=float(np.mean(disturbance_scores)) if disturbance_scores else None,
|
| 381 |
+
),
|
| 382 |
+
episode_records,
|
| 383 |
)
|
| 384 |
|
| 385 |
|
|
|
|
| 398 |
def main() -> None:
|
| 399 |
parser = argparse.ArgumentParser()
|
| 400 |
parser.add_argument("--model", action="append", required=True, help="label=/abs/path/checkpoint.pt")
|
| 401 |
+
parser.add_argument("--episodes", type=int, default=None)
|
| 402 |
+
parser.add_argument("--benchmark-mode", choices=("smoke", "serious"), default="smoke")
|
| 403 |
parser.add_argument("--resolution", type=int, default=None)
|
| 404 |
parser.add_argument("--ablation", default=None)
|
| 405 |
parser.add_argument("--output-root", default="/workspace/reports/reveal_eval")
|
|
|
|
| 409 |
|
| 410 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 411 |
proxies = list(args.proxies or available_proxy_names())
|
| 412 |
+
episodes = int(args.episodes or (100 if args.benchmark_mode == "serious" else 24))
|
| 413 |
+
chunk_commit_steps = None if args.chunk_commit_steps <= 0 else args.chunk_commit_steps
|
| 414 |
output_root = Path(args.output_root)
|
| 415 |
output_root.mkdir(parents=True, exist_ok=True)
|
| 416 |
|
| 417 |
sections: dict[str, dict[str, float | str]] = {}
|
| 418 |
+
raw_metrics: dict[str, dict[str, Any]] = {
|
| 419 |
+
"benchmark_config": {
|
| 420 |
+
"episodes": episodes,
|
| 421 |
+
"benchmark_mode": args.benchmark_mode,
|
| 422 |
+
"ablation": args.ablation,
|
| 423 |
+
"proxies": proxies,
|
| 424 |
+
"chunk_commit_steps": 0 if chunk_commit_steps is None else chunk_commit_steps,
|
| 425 |
+
}
|
| 426 |
+
}
|
| 427 |
+
episode_records_by_label: dict[str, list[dict[str, float | int | str]]] = {}
|
| 428 |
for item in args.model:
|
| 429 |
label, checkpoint_path = item.split("=", maxsplit=1)
|
| 430 |
model, checkpoint = load_model(checkpoint_path, device=device)
|
| 431 |
resolution = int(args.resolution or checkpoint.get("data_resolution", 96))
|
| 432 |
+
metrics, episode_records = evaluate_model(
|
| 433 |
model=model,
|
| 434 |
device=device,
|
| 435 |
proxies=proxies,
|
| 436 |
+
episodes=episodes,
|
| 437 |
resolution=resolution,
|
| 438 |
ablation=args.ablation,
|
| 439 |
+
chunk_commit_steps=chunk_commit_steps,
|
| 440 |
)
|
| 441 |
+
episode_records_by_label[label] = episode_records
|
| 442 |
+
success_values = [float(record["success"]) for record in episode_records]
|
| 443 |
+
visibility_values = [float(record["visibility_integral"]) for record in episode_records]
|
| 444 |
+
reocclusion_values = [float(record["reocclusion_rate"]) for record in episode_records]
|
| 445 |
+
disturbance_values = [float(record["disturbance_cost"]) for record in episode_records]
|
| 446 |
+
raw_metrics[label] = {
|
| 447 |
+
**_metrics_to_dict(metrics),
|
| 448 |
+
"bootstrap_ci": {
|
| 449 |
+
"success": _bootstrap_interval(success_values),
|
| 450 |
+
"visibility_integral": _bootstrap_interval(visibility_values),
|
| 451 |
+
"reocclusion_rate": _bootstrap_interval(reocclusion_values),
|
| 452 |
+
"disturbance_cost": _bootstrap_interval(disturbance_values),
|
| 453 |
+
},
|
| 454 |
+
"episode_records": episode_records,
|
| 455 |
+
}
|
| 456 |
sections[label] = {
|
| 457 |
"checkpoint": checkpoint_path,
|
| 458 |
+
"benchmark_mode": args.benchmark_mode,
|
| 459 |
+
"episodes": float(episodes),
|
| 460 |
+
"chunk_commit_steps": float(0 if chunk_commit_steps is None else chunk_commit_steps),
|
| 461 |
"mean_success": metrics.mean_success,
|
| 462 |
"visibility_integral": metrics.visibility_integral or 0.0,
|
| 463 |
"corridor_availability": metrics.corridor_availability or 0.0,
|
| 464 |
"reocclusion_rate": metrics.reocclusion_rate or 0.0,
|
| 465 |
"persistence_horizon_mae": metrics.persistence_horizon_mae or 0.0,
|
| 466 |
"disturbance_cost": metrics.disturbance_cost or 0.0,
|
| 467 |
+
"success_ci_low": raw_metrics[label]["bootstrap_ci"]["success"]["low"],
|
| 468 |
+
"success_ci_high": raw_metrics[label]["bootstrap_ci"]["success"]["high"],
|
| 469 |
}
|
| 470 |
for task_name, score in metrics.per_task_success.items():
|
| 471 |
sections[label][f"{task_name}_success"] = score
|
| 472 |
|
| 473 |
+
labels = [item.split("=", maxsplit=1)[0] for item in args.model]
|
| 474 |
+
if labels:
|
| 475 |
+
reference_label = labels[0]
|
| 476 |
+
for label in labels[1:]:
|
| 477 |
+
summary = _paired_seed_summary(episode_records_by_label[reference_label], episode_records_by_label[label])
|
| 478 |
+
raw_metrics[label]["paired_seed_summary_vs_" + reference_label] = summary
|
| 479 |
+
for key, value in summary.items():
|
| 480 |
+
sections[label][f"paired_{key}_vs_{reference_label}"] = value
|
| 481 |
+
|
| 482 |
json_path = output_root / "reveal_benchmark.json"
|
| 483 |
json_path.write_text(json.dumps(raw_metrics, indent=2), encoding="utf-8")
|
| 484 |
write_comparison_report(output_root / "reveal_benchmark.md", "Reveal Proxy Benchmark", sections)
|
code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py
CHANGED
|
@@ -122,11 +122,15 @@ def main() -> None:
|
|
| 122 |
parser.add_argument("--resolution", type=int, default=224)
|
| 123 |
parser.add_argument("--device", default="cuda")
|
| 124 |
parser.add_argument("--plan", action="store_true")
|
|
|
|
| 125 |
parser.add_argument("--allow-unsupervised-planning", action="store_true")
|
| 126 |
parser.add_argument("--disable-support-mode-conditioning", action="store_true")
|
|
|
|
| 127 |
parser.add_argument("--headless", action="store_true", default=True)
|
| 128 |
parser.add_argument("--chunk-commit-steps", type=int, default=0)
|
| 129 |
parser.add_argument("--reset-retries", type=int, default=20)
|
|
|
|
|
|
|
| 130 |
args = parser.parse_args()
|
| 131 |
|
| 132 |
checkpoint = torch.load(Path(args.checkpoint), map_location="cpu", weights_only=False)
|
|
@@ -138,7 +142,12 @@ def main() -> None:
|
|
| 138 |
allowed_missing = {
|
| 139 |
key
|
| 140 |
for key in incompatible.missing_keys
|
| 141 |
-
if key.startswith("memory.action_proj.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
}
|
| 143 |
missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
|
| 144 |
if missing_other or incompatible.unexpected_keys:
|
|
@@ -147,7 +156,7 @@ def main() -> None:
|
|
| 147 |
f"Missing keys: {missing_other}. Unexpected keys: {list(incompatible.unexpected_keys)}"
|
| 148 |
)
|
| 149 |
model.eval()
|
| 150 |
-
plan_requested = bool(args.plan)
|
| 151 |
plan_applied = plan_requested and planner_enabled(trainer_config, during_eval=True)
|
| 152 |
planning_note = None
|
| 153 |
if plan_requested and not policy_supports_planning(trainer_config.policy_type):
|
|
@@ -166,6 +175,9 @@ def main() -> None:
|
|
| 166 |
"plan_applied": plan_applied,
|
| 167 |
"planner_mode": trainer_config.planner_mode,
|
| 168 |
"support_mode_conditioning": not args.disable_support_mode_conditioning,
|
|
|
|
|
|
|
|
|
|
| 169 |
"episodes_per_task": args.episodes_per_task,
|
| 170 |
"episode_length": args.episode_length,
|
| 171 |
"resolution": args.resolution,
|
|
@@ -196,6 +208,9 @@ def main() -> None:
|
|
| 196 |
env.launch()
|
| 197 |
task = env.get_task(task_class)
|
| 198 |
task_reset_retries: list[int] = []
|
|
|
|
|
|
|
|
|
|
| 199 |
for _ in range(args.episodes_per_task):
|
| 200 |
descriptions, obs, reset_retries = _reset_task_with_retries(task, max_attempts=max(1, args.reset_retries))
|
| 201 |
task_reset_retries.append(int(reset_retries))
|
|
@@ -204,6 +219,7 @@ def main() -> None:
|
|
| 204 |
success = 0.0
|
| 205 |
episode_recoveries = 0
|
| 206 |
episode_noop_fallbacks = 0
|
|
|
|
| 207 |
history_images: list[np.ndarray] = []
|
| 208 |
history_proprio: list[np.ndarray] = []
|
| 209 |
history_actions: list[np.ndarray] = []
|
|
@@ -256,6 +272,10 @@ def main() -> None:
|
|
| 256 |
history_actions=history_actions_tensor,
|
| 257 |
plan=plan_applied,
|
| 258 |
support_mode_conditioning=not args.disable_support_mode_conditioning,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
)
|
| 260 |
else:
|
| 261 |
outputs = model(
|
|
@@ -269,6 +289,20 @@ def main() -> None:
|
|
| 269 |
chosen_chunk = outputs["action_mean"]
|
| 270 |
if plan_applied and "planned_chunk" in outputs:
|
| 271 |
chosen_chunk = outputs["planned_chunk"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
chunk_np = chosen_chunk[0].detach().float().cpu().numpy()
|
| 273 |
commit_steps = chunk_np.shape[0] if args.chunk_commit_steps <= 0 else min(args.chunk_commit_steps, chunk_np.shape[0])
|
| 274 |
done = False
|
|
@@ -292,6 +326,17 @@ def main() -> None:
|
|
| 292 |
obs, reward, done, recovered_steps, noop_fallbacks = _step_bimanual_chunk(task, obs, step_action)
|
| 293 |
episode_recoveries += recovered_steps
|
| 294 |
episode_noop_fallbacks += noop_fallbacks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
total_reward += float(reward)
|
| 296 |
timestep += 1
|
| 297 |
if reward >= 1.0:
|
|
@@ -302,13 +347,21 @@ def main() -> None:
|
|
| 302 |
break
|
| 303 |
task_successes.append(success)
|
| 304 |
task_returns.append(total_reward)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
results["tasks"][task_name] = {
|
| 306 |
"task_class": task_class.__name__,
|
| 307 |
"successes": task_successes,
|
| 308 |
"returns": task_returns,
|
| 309 |
-
"path_recoveries":
|
| 310 |
-
"noop_fallbacks":
|
| 311 |
"reset_retries": task_reset_retries,
|
|
|
|
| 312 |
"mean_success": float(np.mean(task_successes)) if task_successes else 0.0,
|
| 313 |
"mean_return": float(np.mean(task_returns)) if task_returns else 0.0,
|
| 314 |
}
|
|
@@ -331,6 +384,9 @@ def main() -> None:
|
|
| 331 |
f"- Plan requested: `{results['plan_requested']}`",
|
| 332 |
f"- Plan applied: `{results['plan_applied']}`",
|
| 333 |
f"- Support-mode conditioning: `{results['support_mode_conditioning']}`",
|
|
|
|
|
|
|
|
|
|
| 334 |
f"- Mean success: `{results['mean_success']:.3f}`",
|
| 335 |
"",
|
| 336 |
"## Per-task",
|
|
|
|
| 122 |
parser.add_argument("--resolution", type=int, default=224)
|
| 123 |
parser.add_argument("--device", default="cuda")
|
| 124 |
parser.add_argument("--plan", action="store_true")
|
| 125 |
+
parser.add_argument("--no-planner", action="store_true")
|
| 126 |
parser.add_argument("--allow-unsupervised-planning", action="store_true")
|
| 127 |
parser.add_argument("--disable-support-mode-conditioning", action="store_true")
|
| 128 |
+
parser.add_argument("--disable-task-conditioning", action="store_true")
|
| 129 |
parser.add_argument("--headless", action="store_true", default=True)
|
| 130 |
parser.add_argument("--chunk-commit-steps", type=int, default=0)
|
| 131 |
parser.add_argument("--reset-retries", type=int, default=20)
|
| 132 |
+
parser.add_argument("--no-geometry", action="store_true")
|
| 133 |
+
parser.add_argument("--compact-world-model", action="store_true")
|
| 134 |
args = parser.parse_args()
|
| 135 |
|
| 136 |
checkpoint = torch.load(Path(args.checkpoint), map_location="cpu", weights_only=False)
|
|
|
|
| 142 |
allowed_missing = {
|
| 143 |
key
|
| 144 |
for key in incompatible.missing_keys
|
| 145 |
+
if key.startswith("memory.action_proj.")
|
| 146 |
+
or key.endswith("arm_identity.weight")
|
| 147 |
+
or key.endswith("task_embedding.weight")
|
| 148 |
+
or key.startswith("elastic_state_head.decoder.task_")
|
| 149 |
+
or key.startswith("world_model.task_")
|
| 150 |
+
or key.startswith("world_model.spatial_")
|
| 151 |
}
|
| 152 |
missing_other = sorted(set(incompatible.missing_keys) - allowed_missing)
|
| 153 |
if missing_other or incompatible.unexpected_keys:
|
|
|
|
| 156 |
f"Missing keys: {missing_other}. Unexpected keys: {list(incompatible.unexpected_keys)}"
|
| 157 |
)
|
| 158 |
model.eval()
|
| 159 |
+
plan_requested = bool(args.plan) and not bool(args.no_planner)
|
| 160 |
plan_applied = plan_requested and planner_enabled(trainer_config, during_eval=True)
|
| 161 |
planning_note = None
|
| 162 |
if plan_requested and not policy_supports_planning(trainer_config.policy_type):
|
|
|
|
| 175 |
"plan_applied": plan_applied,
|
| 176 |
"planner_mode": trainer_config.planner_mode,
|
| 177 |
"support_mode_conditioning": not args.disable_support_mode_conditioning,
|
| 178 |
+
"task_conditioning": not args.disable_task_conditioning,
|
| 179 |
+
"geometry_enabled": not args.no_geometry,
|
| 180 |
+
"world_model_mode": "compact_rollout" if args.compact_world_model else "checkpoint_default",
|
| 181 |
"episodes_per_task": args.episodes_per_task,
|
| 182 |
"episode_length": args.episode_length,
|
| 183 |
"resolution": args.resolution,
|
|
|
|
| 208 |
env.launch()
|
| 209 |
task = env.get_task(task_class)
|
| 210 |
task_reset_retries: list[int] = []
|
| 211 |
+
task_path_recoveries: list[int] = []
|
| 212 |
+
task_noop_fallbacks: list[int] = []
|
| 213 |
+
task_episode_traces: list[dict[str, Any]] = []
|
| 214 |
for _ in range(args.episodes_per_task):
|
| 215 |
descriptions, obs, reset_retries = _reset_task_with_retries(task, max_attempts=max(1, args.reset_retries))
|
| 216 |
task_reset_retries.append(int(reset_retries))
|
|
|
|
| 219 |
success = 0.0
|
| 220 |
episode_recoveries = 0
|
| 221 |
episode_noop_fallbacks = 0
|
| 222 |
+
episode_trace: dict[str, Any] = {"language_goal": language_goal, "steps": []}
|
| 223 |
history_images: list[np.ndarray] = []
|
| 224 |
history_proprio: list[np.ndarray] = []
|
| 225 |
history_actions: list[np.ndarray] = []
|
|
|
|
| 272 |
history_actions=history_actions_tensor,
|
| 273 |
plan=plan_applied,
|
| 274 |
support_mode_conditioning=not args.disable_support_mode_conditioning,
|
| 275 |
+
use_planner=not args.no_planner,
|
| 276 |
+
use_geometry_tokens=not args.no_geometry,
|
| 277 |
+
use_task_conditioning=not args.disable_task_conditioning,
|
| 278 |
+
rollout_mode_override=("compact_rollout" if args.compact_world_model else None),
|
| 279 |
)
|
| 280 |
else:
|
| 281 |
outputs = model(
|
|
|
|
| 289 |
chosen_chunk = outputs["action_mean"]
|
| 290 |
if plan_applied and "planned_chunk" in outputs:
|
| 291 |
chosen_chunk = outputs["planned_chunk"]
|
| 292 |
+
best_local = 0
|
| 293 |
+
if isinstance(outputs.get("ranking_diagnostics"), dict) and "best_local_indices" in outputs["ranking_diagnostics"]:
|
| 294 |
+
best_local = int(outputs["ranking_diagnostics"]["best_local_indices"][0].detach().cpu().item())
|
| 295 |
+
chosen_macro_mode = None
|
| 296 |
+
if "planner_topk_mode_names" in outputs and outputs["planner_topk_mode_names"]:
|
| 297 |
+
chosen_macro_mode = outputs["planner_topk_mode_names"][0][best_local]
|
| 298 |
+
predicted_reocclusion = None
|
| 299 |
+
if "planned_rollout" in outputs and "reocclusion_field" in outputs["planned_rollout"]:
|
| 300 |
+
predicted_reocclusion = float(
|
| 301 |
+
outputs["planned_rollout"]["reocclusion_field"][0, best_local].mean().detach().cpu().item()
|
| 302 |
+
)
|
| 303 |
+
planner_scores = None
|
| 304 |
+
if "planner_scores" in outputs:
|
| 305 |
+
planner_scores = outputs["planner_scores"][0].detach().cpu().tolist()
|
| 306 |
chunk_np = chosen_chunk[0].detach().float().cpu().numpy()
|
| 307 |
commit_steps = chunk_np.shape[0] if args.chunk_commit_steps <= 0 else min(args.chunk_commit_steps, chunk_np.shape[0])
|
| 308 |
done = False
|
|
|
|
| 326 |
obs, reward, done, recovered_steps, noop_fallbacks = _step_bimanual_chunk(task, obs, step_action)
|
| 327 |
episode_recoveries += recovered_steps
|
| 328 |
episode_noop_fallbacks += noop_fallbacks
|
| 329 |
+
episode_trace["steps"].append(
|
| 330 |
+
{
|
| 331 |
+
"timestep": int(timestep),
|
| 332 |
+
"chosen_macro_mode": chosen_macro_mode,
|
| 333 |
+
"planner_scores": planner_scores,
|
| 334 |
+
"predicted_reocclusion": predicted_reocclusion,
|
| 335 |
+
"support_mode_conditioning": not args.disable_support_mode_conditioning,
|
| 336 |
+
"path_recoveries": int(recovered_steps),
|
| 337 |
+
"noop_fallbacks": int(noop_fallbacks),
|
| 338 |
+
}
|
| 339 |
+
)
|
| 340 |
total_reward += float(reward)
|
| 341 |
timestep += 1
|
| 342 |
if reward >= 1.0:
|
|
|
|
| 347 |
break
|
| 348 |
task_successes.append(success)
|
| 349 |
task_returns.append(total_reward)
|
| 350 |
+
task_path_recoveries.append(int(episode_recoveries))
|
| 351 |
+
task_noop_fallbacks.append(int(episode_noop_fallbacks))
|
| 352 |
+
episode_trace["success"] = float(success)
|
| 353 |
+
episode_trace["return"] = float(total_reward)
|
| 354 |
+
episode_trace["path_recoveries"] = int(episode_recoveries)
|
| 355 |
+
episode_trace["noop_fallbacks"] = int(episode_noop_fallbacks)
|
| 356 |
+
task_episode_traces.append(episode_trace)
|
| 357 |
results["tasks"][task_name] = {
|
| 358 |
"task_class": task_class.__name__,
|
| 359 |
"successes": task_successes,
|
| 360 |
"returns": task_returns,
|
| 361 |
+
"path_recoveries": task_path_recoveries,
|
| 362 |
+
"noop_fallbacks": task_noop_fallbacks,
|
| 363 |
"reset_retries": task_reset_retries,
|
| 364 |
+
"episode_traces": task_episode_traces,
|
| 365 |
"mean_success": float(np.mean(task_successes)) if task_successes else 0.0,
|
| 366 |
"mean_return": float(np.mean(task_returns)) if task_returns else 0.0,
|
| 367 |
}
|
|
|
|
| 384 |
f"- Plan requested: `{results['plan_requested']}`",
|
| 385 |
f"- Plan applied: `{results['plan_applied']}`",
|
| 386 |
f"- Support-mode conditioning: `{results['support_mode_conditioning']}`",
|
| 387 |
+
f"- Task conditioning: `{results['task_conditioning']}`",
|
| 388 |
+
f"- Geometry enabled: `{results['geometry_enabled']}`",
|
| 389 |
+
f"- World-model mode: `{results['world_model_mode']}`",
|
| 390 |
f"- Mean success: `{results['mean_success']:.3f}`",
|
| 391 |
"",
|
| 392 |
"## Per-task",
|
code/reveal_vla_bimanual/eval/run_teacher_audit.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from eval.report import write_comparison_report
|
| 10 |
+
from sim_reveal.procedural_envs import available_proxy_names, make_proxy_env
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
BASELINES: tuple[str, ...] = ("teacher", "reveal_only", "retrieve_only", "no_hold", "random")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _evaluate_baseline(
    proxy_name: str,
    baseline_name: str,
    episodes: int,
    resolution: int,
    chunk_horizon: int,
    rollout_horizon: int,
) -> dict[str, float]:
    """Roll out one scripted baseline on a proxy environment and average its metrics.

    One episode is run per seed in ``range(episodes)``; each episode resets the
    environment, requests the baseline's action chunk, and scores it with
    ``env.evaluate_action_chunk``.

    Returns a dict of per-episode means for: success, reveal_achieved,
    visibility_integral, hold_persistence, reocclusion_rate, disturbance_cost,
    and a scalar utility. All-zero metrics are returned when ``episodes <= 0``
    (previously ``np.mean([])`` produced NaN plus a RuntimeWarning).
    """
    totals: dict[str, list[float]] = {
        "success": [],
        "reveal_achieved": [],
        "visibility_integral": [],
        "hold_persistence": [],
        "reocclusion_rate": [],
        "disturbance_cost": [],
        "utility": [],
    }
    for seed in range(episodes):
        env = make_proxy_env(proxy_name=proxy_name, resolution=resolution, seed=seed, rollout_horizon=rollout_horizon)
        _, _ = env.reset(seed=seed)
        chunk = env.baseline_action_chunk(baseline_name, chunk_horizon=chunk_horizon)
        outcome = env.evaluate_action_chunk(chunk, rollout_horizon=rollout_horizon)
        success_value = float(outcome["retrieval_success"])
        reveal_value = float(outcome["reveal_achieved"])
        visibility_value = float(outcome["visibility_integral"])
        hold_value = float(outcome["hold_persistence"])
        reocclusion_value = float(outcome["reocclusion_rate"])
        disturbance_value = float(outcome["final_disturbance_cost"])
        # Scalar utility: success dominates; reveal/hold/visibility are weighted
        # bonuses; reocclusion and disturbance are subtracted as penalties.
        utility_value = (
            success_value
            + 0.25 * reveal_value
            + 0.1 * hold_value
            + 0.05 * visibility_value
            - reocclusion_value
            - disturbance_value
        )
        totals["success"].append(success_value)
        totals["reveal_achieved"].append(reveal_value)
        totals["visibility_integral"].append(visibility_value)
        totals["hold_persistence"].append(hold_value)
        totals["reocclusion_rate"].append(reocclusion_value)
        totals["disturbance_cost"].append(disturbance_value)
        totals["utility"].append(utility_value)
    # Guard the episodes <= 0 case so the summary stays finite JSON-serializable.
    return {name: float(np.mean(values)) if values else 0.0 for name, values in totals.items()}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main() -> None:
    """CLI entry point: audit the scripted teacher against ablation baselines.

    For every proxy environment (all available ones unless ``--proxies`` narrows
    the set), evaluates each baseline in ``BASELINES``, dumps the raw per-baseline
    metric means to ``teacher_audit.json``, and writes a markdown comparison
    report of teacher-vs-baseline deltas.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--episodes", type=int, default=100)
    parser.add_argument("--resolution", type=int, default=96)
    parser.add_argument("--chunk-horizon", type=int, default=8)
    parser.add_argument("--rollout-horizon", type=int, default=5)
    parser.add_argument("--output-root", default="/workspace/reports/reveal_teacher_audit")
    parser.add_argument("--proxies", nargs="*", default=None)
    args = parser.parse_args()

    proxies = list(args.proxies or available_proxy_names())
    output_root = Path(args.output_root)
    output_root.mkdir(parents=True, exist_ok=True)

    # Report rows compare the teacher (BASELINES[0]) against each ablation.
    delta_fields = (
        ("success_delta", "success"),
        ("utility_delta", "utility"),
        ("hold_delta", "hold_persistence"),
        ("reocclusion_delta", "reocclusion_rate"),
        ("disturbance_delta", "disturbance_cost"),
    )

    raw: dict[str, dict[str, dict[str, float]]] = {}
    sections: dict[str, dict[str, float | str]] = {}
    for proxy_name in proxies:
        proxy_results: dict[str, dict[str, float]] = {}
        for baseline_name in BASELINES:
            proxy_results[baseline_name] = _evaluate_baseline(
                proxy_name=proxy_name,
                baseline_name=baseline_name,
                episodes=args.episodes,
                resolution=args.resolution,
                chunk_horizon=args.chunk_horizon,
                rollout_horizon=args.rollout_horizon,
            )
        raw[proxy_name] = proxy_results
        teacher_metrics = proxy_results["teacher"]
        for baseline_name in BASELINES[1:]:
            baseline_metrics = proxy_results[baseline_name]
            section: dict[str, float | str] = {
                "episodes": float(args.episodes),
                "teacher_success": teacher_metrics["success"],
                "baseline_success": baseline_metrics["success"],
                "teacher_utility": teacher_metrics["utility"],
                "baseline_utility": baseline_metrics["utility"],
            }
            for delta_key, metric_key in delta_fields:
                section[delta_key] = teacher_metrics[metric_key] - baseline_metrics[metric_key]
            sections[f"{proxy_name}:{baseline_name}"] = section

    json_path = output_root / "teacher_audit.json"
    json_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    write_comparison_report(output_root / "teacher_audit.md", "Reveal Teacher Audit", sections)
    print(json.dumps({"output_json": str(json_path), "sections": sections}, indent=2))


if __name__ == "__main__":
    main()
|
code/reveal_vla_bimanual/models/action_decoder.py
CHANGED
|
@@ -394,6 +394,79 @@ DEFAULT_PROPOSAL_MODES = (
|
|
| 394 |
"retrieve",
|
| 395 |
)
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
def swap_arm_action_order(action_chunk: Tensor) -> Tensor:
|
| 399 |
midpoint = action_chunk.shape[-1] // 2
|
|
@@ -416,6 +489,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 416 |
self.arm_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_layers)
|
| 417 |
self.query_embed = nn.Embedding(config.chunk_size, config.hidden_dim)
|
| 418 |
self.arm_identity = nn.Embedding(2, config.hidden_dim)
|
|
|
|
| 419 |
self.phase_adapter = nn.Linear(config.num_phases, config.hidden_dim)
|
| 420 |
self.role_adapter = nn.Linear(config.num_arm_roles, config.hidden_dim)
|
| 421 |
self.context_proj = nn.Sequential(
|
|
@@ -535,7 +609,8 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 535 |
self,
|
| 536 |
base_action: Tensor,
|
| 537 |
pooled_context: Tensor,
|
| 538 |
-
|
|
|
|
| 539 |
batch_size = pooled_context.shape[0]
|
| 540 |
mode_logits = self.proposal_mode_head(pooled_context)
|
| 541 |
mode_residuals = []
|
|
@@ -553,6 +628,13 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 553 |
)
|
| 554 |
proposal_candidates = []
|
| 555 |
proposal_logits = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
for slot_idx in range(self.config.num_candidates):
|
| 557 |
mode_idx = int(mode_assignments[slot_idx])
|
| 558 |
candidate = base_action + 0.35 * torch.tanh(mode_residuals[:, mode_idx]) + 0.05 * torch.tanh(slot_deltas[slot_idx]).unsqueeze(0)
|
|
@@ -571,7 +653,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 571 |
stacked_candidates = torch.stack(proposal_candidates, dim=1)
|
| 572 |
stacked_logits = torch.stack(proposal_logits, dim=1)
|
| 573 |
stacked_candidates[:, 0] = base_action
|
| 574 |
-
return stacked_candidates, stacked_logits, mode_logits
|
| 575 |
|
| 576 |
def forward(
|
| 577 |
self,
|
|
@@ -581,6 +663,7 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 581 |
reveal_tokens: Tensor | None = None,
|
| 582 |
memory_token: Tensor | None = None,
|
| 583 |
compute_equivariance_probe: bool = False,
|
|
|
|
| 584 |
) -> dict[str, Tensor]:
|
| 585 |
if memory_tokens is None:
|
| 586 |
memory_tokens = memory_token
|
|
@@ -602,6 +685,20 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 602 |
if memory_tokens is not None:
|
| 603 |
decoder_memory = torch.cat([decoder_memory, memory_tokens], dim=1)
|
| 604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
base_queries = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
|
| 606 |
arm_mean, arm_log_std, coordination = self._decode_arm_tokens(
|
| 607 |
queries=base_queries,
|
|
@@ -621,7 +718,11 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 621 |
],
|
| 622 |
dim=-1,
|
| 623 |
)
|
| 624 |
-
proposal_candidates, proposal_logits, proposal_mode_logits = self._proposal_outputs(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
outputs = {
|
| 627 |
"decoded_tokens": torch.cat([arm_mean[:, 0], arm_mean[:, 1]], dim=-1),
|
|
@@ -637,7 +738,8 @@ class SymmetricCoordinatedChunkDecoder(nn.Module):
|
|
| 637 |
self.config.num_candidates,
|
| 638 |
device=scene_tokens.device,
|
| 639 |
) % self.config.num_proposal_modes,
|
| 640 |
-
"proposal_mode_names":
|
|
|
|
| 641 |
}
|
| 642 |
if compute_equivariance_probe:
|
| 643 |
swapped_phase, swapped_roles, swapped_context = self._conditioning(
|
|
|
|
| 394 |
"retrieve",
|
| 395 |
)
|
| 396 |
|
| 397 |
+
# Per-task proposal-mode vocabularies. Each tuple is ordered from
# task-specific "opener" modes to the shared closing modes
# ("insert_actor", "retrieve"), which appear last in every vocabulary.
TASK_PROPOSAL_MODES = {
    "foliage": (
        "sweep_left",
        "sweep_right",
        "pin_canopy",
        "widen_gap",
        "maintain_gap",
        "insert_actor",
        "retrieve",
    ),
    "bag": (
        "pin_left_rim",
        "pin_right_rim",
        "widen_mouth",
        "maintain_mouth",
        "probe_inside",
        "insert_actor",
        "retrieve",
    ),
    "cloth": (
        "lift_edge",
        "separate_layer",
        "stabilize_fold",
        "maintain_lift",
        "insert_actor",
        "retrieve",
    ),
}

# Stable integer id per known task name (used as the task-embedding index).
TASK_INDEX = {"foliage": 0, "bag": 1, "cloth": 2}
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def infer_task_name_from_text(text: str | None) -> str:
    """Map a free-form instruction string to a canonical task name.

    Matching is case-insensitive substring search, checked in priority order:
    foliage keywords first, then bag, then cloth. Empty/missing text, or text
    matching no keyword list, yields "generic".
    """
    if not text:
        return "generic"
    lowered = text.lower()
    keyword_table = (
        ("foliage", ("foliage", "canopy", "leaf", "leaves", "snail")),
        ("bag", ("bag", "mouth", "rim", "aperture")),
        ("cloth", ("cloth", "fold", "layer", "suitcase", "garment")),
    )
    for task_name, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return task_name
    return "generic"
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def proposal_mode_vocab(task_name: str, num_modes: int) -> tuple[str, ...]:
    """Return a proposal-mode vocabulary of exactly ``num_modes`` names.

    "generic" uses ``DEFAULT_PROPOSAL_MODES``; known task names use
    ``TASK_PROPOSAL_MODES`` (KeyError for unknown non-generic names, matching
    existing callers that canonicalize via ``infer_task_name_from_text``).
    Oversized vocabularies are truncated — when at least 6 modes are requested,
    the first four task-specific modes plus the two shared closers are kept;
    otherwise a plain prefix is taken. Undersized vocabularies are padded by
    repeating the final mode ("retrieve" throughout if the vocabulary is empty).
    """
    if task_name == "generic":
        base_vocab = tuple(DEFAULT_PROPOSAL_MODES)
    else:
        vocab = TASK_PROPOSAL_MODES[task_name]
        if len(vocab) <= num_modes:
            base_vocab = vocab
        elif num_modes >= 6:
            # Preserve the shared closing modes while trimming the middle.
            base_vocab = (vocab[0], vocab[1], vocab[2], vocab[3], vocab[-2], vocab[-1])[:num_modes]
        else:
            base_vocab = vocab[:num_modes]
    if len(base_vocab) >= num_modes:
        return tuple(base_vocab[:num_modes])
    if not base_vocab:
        return tuple("retrieve" for _ in range(num_modes))
    # Pad by repeating the final mode until the vocabulary is full.
    return tuple(base_vocab) + (num_modes - len(base_vocab)) * (base_vocab[-1],)
|
| 469 |
+
|
| 470 |
|
| 471 |
def swap_arm_action_order(action_chunk: Tensor) -> Tensor:
|
| 472 |
midpoint = action_chunk.shape[-1] // 2
|
|
|
|
| 489 |
self.arm_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_layers)
|
| 490 |
self.query_embed = nn.Embedding(config.chunk_size, config.hidden_dim)
|
| 491 |
self.arm_identity = nn.Embedding(2, config.hidden_dim)
|
| 492 |
+
self.task_embedding = nn.Embedding(len(TASK_INDEX), config.hidden_dim)
|
| 493 |
self.phase_adapter = nn.Linear(config.num_phases, config.hidden_dim)
|
| 494 |
self.role_adapter = nn.Linear(config.num_arm_roles, config.hidden_dim)
|
| 495 |
self.context_proj = nn.Sequential(
|
|
|
|
| 609 |
self,
|
| 610 |
base_action: Tensor,
|
| 611 |
pooled_context: Tensor,
|
| 612 |
+
task_names: list[str],
|
| 613 |
+
) -> tuple[Tensor, Tensor, Tensor, list[list[str]]]:
|
| 614 |
batch_size = pooled_context.shape[0]
|
| 615 |
mode_logits = self.proposal_mode_head(pooled_context)
|
| 616 |
mode_residuals = []
|
|
|
|
| 628 |
)
|
| 629 |
proposal_candidates = []
|
| 630 |
proposal_logits = []
|
| 631 |
+
proposal_mode_names = [
|
| 632 |
+
[
|
| 633 |
+
proposal_mode_vocab(task_name, self.config.num_proposal_modes)[int(mode_assignments[slot_idx])]
|
| 634 |
+
for slot_idx in range(self.config.num_candidates)
|
| 635 |
+
]
|
| 636 |
+
for task_name in task_names
|
| 637 |
+
]
|
| 638 |
for slot_idx in range(self.config.num_candidates):
|
| 639 |
mode_idx = int(mode_assignments[slot_idx])
|
| 640 |
candidate = base_action + 0.35 * torch.tanh(mode_residuals[:, mode_idx]) + 0.05 * torch.tanh(slot_deltas[slot_idx]).unsqueeze(0)
|
|
|
|
| 653 |
stacked_candidates = torch.stack(proposal_candidates, dim=1)
|
| 654 |
stacked_logits = torch.stack(proposal_logits, dim=1)
|
| 655 |
stacked_candidates[:, 0] = base_action
|
| 656 |
+
return stacked_candidates, stacked_logits, mode_logits, proposal_mode_names
|
| 657 |
|
| 658 |
def forward(
|
| 659 |
self,
|
|
|
|
| 663 |
reveal_tokens: Tensor | None = None,
|
| 664 |
memory_token: Tensor | None = None,
|
| 665 |
compute_equivariance_probe: bool = False,
|
| 666 |
+
task_names: list[str] | None = None,
|
| 667 |
) -> dict[str, Tensor]:
|
| 668 |
if memory_tokens is None:
|
| 669 |
memory_tokens = memory_token
|
|
|
|
| 685 |
if memory_tokens is not None:
|
| 686 |
decoder_memory = torch.cat([decoder_memory, memory_tokens], dim=1)
|
| 687 |
|
| 688 |
+
canonical_task_names = [infer_task_name_from_text(name) for name in (task_names or ["generic"] * batch_size)]
|
| 689 |
+
task_ids = torch.as_tensor(
|
| 690 |
+
[TASK_INDEX[name] for name in canonical_task_names if name in TASK_INDEX],
|
| 691 |
+
device=scene_tokens.device,
|
| 692 |
+
dtype=torch.long,
|
| 693 |
+
)
|
| 694 |
+
if task_ids.numel() != batch_size:
|
| 695 |
+
task_ids = torch.as_tensor(
|
| 696 |
+
[TASK_INDEX.get(name, 0) for name in canonical_task_names],
|
| 697 |
+
device=scene_tokens.device,
|
| 698 |
+
dtype=torch.long,
|
| 699 |
+
)
|
| 700 |
+
interaction_context = interaction_context + self.task_embedding(task_ids)
|
| 701 |
+
|
| 702 |
base_queries = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
|
| 703 |
arm_mean, arm_log_std, coordination = self._decode_arm_tokens(
|
| 704 |
queries=base_queries,
|
|
|
|
| 718 |
],
|
| 719 |
dim=-1,
|
| 720 |
)
|
| 721 |
+
proposal_candidates, proposal_logits, proposal_mode_logits, proposal_mode_names = self._proposal_outputs(
|
| 722 |
+
action_mean,
|
| 723 |
+
pooled_context,
|
| 724 |
+
canonical_task_names,
|
| 725 |
+
)
|
| 726 |
|
| 727 |
outputs = {
|
| 728 |
"decoded_tokens": torch.cat([arm_mean[:, 0], arm_mean[:, 1]], dim=-1),
|
|
|
|
| 738 |
self.config.num_candidates,
|
| 739 |
device=scene_tokens.device,
|
| 740 |
) % self.config.num_proposal_modes,
|
| 741 |
+
"proposal_mode_names": proposal_mode_names,
|
| 742 |
+
"proposal_task_names": canonical_task_names,
|
| 743 |
}
|
| 744 |
if compute_equivariance_probe:
|
| 745 |
swapped_phase, swapped_roles, swapped_context = self._conditioning(
|
code/reveal_vla_bimanual/models/backbones.py
CHANGED
|
@@ -10,6 +10,8 @@ import torch
|
|
| 10 |
import torch.nn.functional as F
|
| 11 |
from torch import Tensor, nn
|
| 12 |
|
|
|
|
|
|
|
| 13 |
|
| 14 |
@dataclass
|
| 15 |
class FrozenVLBackboneConfig:
|
|
@@ -22,6 +24,9 @@ class FrozenVLBackboneConfig:
|
|
| 22 |
depth_patch_size: int = 16
|
| 23 |
geometry_feature_dim: int = 8
|
| 24 |
use_camera_geometry: bool = True
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
class DepthPatchAdapter(nn.Module):
|
|
@@ -65,63 +70,64 @@ class DepthPatchAdapter(nn.Module):
|
|
| 65 |
batch_views, _, height, width = depths.shape
|
| 66 |
grid_h = max(1, height // self.patch_size)
|
| 67 |
grid_w = max(1, width // self.patch_size)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
geometry_terms: list[Tensor] = [coords]
|
| 74 |
if camera_intrinsics is not None:
|
| 75 |
-
fx = camera_intrinsics[:, 0, 0].unsqueeze(-1)
|
| 76 |
-
fy = camera_intrinsics[:, 1, 1].unsqueeze(-1)
|
| 77 |
-
cx = camera_intrinsics[:, 0, 2].unsqueeze(-1)
|
| 78 |
-
cy = camera_intrinsics[:, 1, 2].unsqueeze(-1)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
cx.expand(-1, grid_h * grid_w, -1),
|
| 84 |
-
cy.expand(-1, grid_h * grid_w, -1),
|
| 85 |
-
],
|
| 86 |
-
dim=-1,
|
| 87 |
-
)
|
| 88 |
-
geometry_terms.append(intrinsic_features)
|
| 89 |
else:
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
if camera_extrinsics is not None:
|
| 93 |
-
|
| 94 |
-
translation =
|
| 95 |
-
|
|
|
|
| 96 |
else:
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
geometry = torch.cat(
|
| 100 |
if geometry.shape[-1] < self.geometry_feature_dim:
|
| 101 |
pad = self.geometry_feature_dim - geometry.shape[-1]
|
| 102 |
geometry = F.pad(geometry, (0, pad))
|
| 103 |
elif geometry.shape[-1] > self.geometry_feature_dim:
|
| 104 |
geometry = geometry[..., : self.geometry_feature_dim]
|
| 105 |
|
| 106 |
-
if camera_intrinsics is not None:
|
| 107 |
-
camera_summary = torch.cat(
|
| 108 |
-
[
|
| 109 |
-
camera_intrinsics[:, 0, 0:1],
|
| 110 |
-
camera_intrinsics[:, 1, 1:2],
|
| 111 |
-
camera_intrinsics[:, 0, 2:3],
|
| 112 |
-
camera_intrinsics[:, 1, 2:3],
|
| 113 |
-
],
|
| 114 |
-
dim=-1,
|
| 115 |
-
)
|
| 116 |
-
else:
|
| 117 |
-
camera_summary = torch.zeros(batch_views, 4, device=depths.device, dtype=depths.dtype)
|
| 118 |
if camera_extrinsics is not None:
|
| 119 |
-
|
| 120 |
else:
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
dim=-1,
|
| 124 |
-
)
|
| 125 |
return geometry, camera_summary
|
| 126 |
|
| 127 |
def forward(
|
|
@@ -130,6 +136,8 @@ class DepthPatchAdapter(nn.Module):
|
|
| 130 |
depth_valid: Tensor | None = None,
|
| 131 |
camera_intrinsics: Tensor | None = None,
|
| 132 |
camera_extrinsics: Tensor | None = None,
|
|
|
|
|
|
|
| 133 |
) -> dict[str, Tensor]:
|
| 134 |
if depths.ndim == 4:
|
| 135 |
depths = depths.unsqueeze(2)
|
|
@@ -161,7 +169,12 @@ class DepthPatchAdapter(nn.Module):
|
|
| 161 |
camera_intrinsics=flat_intrinsics,
|
| 162 |
camera_extrinsics=flat_extrinsics,
|
| 163 |
)
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
depth_tokens = self.depth_proj(token_inputs)
|
| 166 |
geometry_tokens = self.geometry_proj(geometry_features)
|
| 167 |
camera_tokens = self.camera_proj(camera_summary).unsqueeze(1)
|
|
@@ -324,12 +337,21 @@ class FrozenVLBackbone(nn.Module):
|
|
| 324 |
camera_intrinsics: Tensor | None = None,
|
| 325 |
camera_extrinsics: Tensor | None = None,
|
| 326 |
return_aux: bool = False,
|
|
|
|
|
|
|
|
|
|
| 327 |
) -> Tensor | dict[str, Tensor | None]:
|
| 328 |
rgb_tokens = self._encode_rgb_tokens(images)
|
| 329 |
wants_aux = return_aux or depths is not None or depth_valid is not None or camera_intrinsics is not None or camera_extrinsics is not None
|
| 330 |
if not wants_aux:
|
| 331 |
return rgb_tokens
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
depth_outputs: dict[str, Tensor | None] = {
|
| 334 |
"depth_tokens": None,
|
| 335 |
"geometry_tokens": None,
|
|
@@ -341,7 +363,15 @@ class FrozenVLBackbone(nn.Module):
|
|
| 341 |
depth_valid=depth_valid,
|
| 342 |
camera_intrinsics=camera_intrinsics,
|
| 343 |
camera_extrinsics=camera_extrinsics,
|
|
|
|
|
|
|
| 344 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
return {
|
| 347 |
"rgb_tokens": rgb_tokens,
|
|
|
|
| 10 |
import torch.nn.functional as F
|
| 11 |
from torch import Tensor, nn
|
| 12 |
|
| 13 |
+
from pytorch3d.transforms import matrix_to_quaternion
|
| 14 |
+
|
| 15 |
|
| 16 |
@dataclass
|
| 17 |
class FrozenVLBackboneConfig:
|
|
|
|
| 24 |
depth_patch_size: int = 16
|
| 25 |
geometry_feature_dim: int = 8
|
| 26 |
use_camera_geometry: bool = True
|
| 27 |
+
use_depth_tokens: bool = True
|
| 28 |
+
use_geometry_tokens: bool = True
|
| 29 |
+
use_camera_pose_tokens: bool = True
|
| 30 |
|
| 31 |
|
| 32 |
class DepthPatchAdapter(nn.Module):
|
|
|
|
| 70 |
batch_views, _, height, width = depths.shape
|
| 71 |
grid_h = max(1, height // self.patch_size)
|
| 72 |
grid_w = max(1, width // self.patch_size)
|
| 73 |
+
patch_center_y = torch.linspace(
|
| 74 |
+
self.patch_size * 0.5,
|
| 75 |
+
max(self.patch_size * 0.5, height - (self.patch_size * 0.5)),
|
| 76 |
+
steps=grid_h,
|
| 77 |
+
device=depths.device,
|
| 78 |
+
dtype=depths.dtype,
|
| 79 |
+
)
|
| 80 |
+
patch_center_x = torch.linspace(
|
| 81 |
+
self.patch_size * 0.5,
|
| 82 |
+
max(self.patch_size * 0.5, width - (self.patch_size * 0.5)),
|
| 83 |
+
steps=grid_w,
|
| 84 |
+
device=depths.device,
|
| 85 |
+
dtype=depths.dtype,
|
| 86 |
+
)
|
| 87 |
+
pixel_y, pixel_x = torch.meshgrid(patch_center_y, patch_center_x, indexing="ij")
|
| 88 |
+
norm_x = ((pixel_x / max(width - 1, 1)) * 2.0 - 1.0).reshape(1, grid_h * grid_w, 1)
|
| 89 |
+
norm_y = ((pixel_y / max(height - 1, 1)) * 2.0 - 1.0).reshape(1, grid_h * grid_w, 1)
|
| 90 |
+
coords = torch.cat([norm_x, norm_y], dim=-1).expand(batch_views, -1, -1)
|
| 91 |
|
|
|
|
| 92 |
if camera_intrinsics is not None:
|
| 93 |
+
fx = camera_intrinsics[:, 0, 0].unsqueeze(-1)
|
| 94 |
+
fy = camera_intrinsics[:, 1, 1].unsqueeze(-1)
|
| 95 |
+
cx = camera_intrinsics[:, 0, 2].unsqueeze(-1)
|
| 96 |
+
cy = camera_intrinsics[:, 1, 2].unsqueeze(-1)
|
| 97 |
+
patch_x = pixel_x.reshape(1, grid_h * grid_w).expand(batch_views, -1)
|
| 98 |
+
patch_y = pixel_y.reshape(1, grid_h * grid_w).expand(batch_views, -1)
|
| 99 |
+
ray_x = (patch_x - cx) / fx.clamp_min(1e-6)
|
| 100 |
+
ray_y = (patch_y - cy) / fy.clamp_min(1e-6)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
else:
|
| 102 |
+
ray_x = coords[..., 0]
|
| 103 |
+
ray_y = coords[..., 1]
|
| 104 |
+
ray_camera = torch.stack([ray_x, ray_y, torch.ones_like(ray_x)], dim=-1)
|
| 105 |
+
ray_camera = F.normalize(ray_camera, dim=-1)
|
| 106 |
|
| 107 |
if camera_extrinsics is not None:
|
| 108 |
+
rotation = camera_extrinsics[:, :3, :3]
|
| 109 |
+
translation = camera_extrinsics[:, :3, 3].unsqueeze(1).expand(-1, grid_h * grid_w, -1)
|
| 110 |
+
ray_world = torch.matmul(rotation, ray_camera.transpose(1, 2)).transpose(1, 2)
|
| 111 |
+
quaternion = matrix_to_quaternion(rotation)
|
| 112 |
else:
|
| 113 |
+
rotation = None
|
| 114 |
+
translation = torch.zeros(batch_views, grid_h * grid_w, 3, device=depths.device, dtype=depths.dtype)
|
| 115 |
+
ray_world = ray_camera
|
| 116 |
+
quaternion = torch.zeros(batch_views, 4, device=depths.device, dtype=depths.dtype)
|
| 117 |
+
quaternion[:, 0] = 1.0
|
| 118 |
|
| 119 |
+
geometry = torch.cat([coords, ray_world, translation], dim=-1)
|
| 120 |
if geometry.shape[-1] < self.geometry_feature_dim:
|
| 121 |
pad = self.geometry_feature_dim - geometry.shape[-1]
|
| 122 |
geometry = F.pad(geometry, (0, pad))
|
| 123 |
elif geometry.shape[-1] > self.geometry_feature_dim:
|
| 124 |
geometry = geometry[..., : self.geometry_feature_dim]
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
if camera_extrinsics is not None:
|
| 127 |
+
translation_summary = camera_extrinsics[:, :3, 3]
|
| 128 |
else:
|
| 129 |
+
translation_summary = torch.zeros(batch_views, 3, device=depths.device, dtype=depths.dtype)
|
| 130 |
+
camera_summary = torch.cat([quaternion, translation_summary], dim=-1)
|
|
|
|
|
|
|
| 131 |
return geometry, camera_summary
|
| 132 |
|
| 133 |
def forward(
|
|
|
|
| 136 |
depth_valid: Tensor | None = None,
|
| 137 |
camera_intrinsics: Tensor | None = None,
|
| 138 |
camera_extrinsics: Tensor | None = None,
|
| 139 |
+
include_geometry_features: bool = True,
|
| 140 |
+
include_camera_pose: bool = True,
|
| 141 |
) -> dict[str, Tensor]:
|
| 142 |
if depths.ndim == 4:
|
| 143 |
depths = depths.unsqueeze(2)
|
|
|
|
| 169 |
camera_intrinsics=flat_intrinsics,
|
| 170 |
camera_extrinsics=flat_extrinsics,
|
| 171 |
)
|
| 172 |
+
if not include_geometry_features:
|
| 173 |
+
geometry_features = torch.zeros_like(geometry_features)
|
| 174 |
+
if not include_camera_pose:
|
| 175 |
+
camera_summary = torch.zeros_like(camera_summary)
|
| 176 |
+
# Keep depth tokens depth-only so depth, geometry, and pose ablations are separable.
|
| 177 |
+
token_inputs = torch.cat([depth_patch, valid_patch, torch.zeros_like(geometry_features)], dim=-1)
|
| 178 |
depth_tokens = self.depth_proj(token_inputs)
|
| 179 |
geometry_tokens = self.geometry_proj(geometry_features)
|
| 180 |
camera_tokens = self.camera_proj(camera_summary).unsqueeze(1)
|
|
|
|
| 337 |
camera_intrinsics: Tensor | None = None,
|
| 338 |
camera_extrinsics: Tensor | None = None,
|
| 339 |
return_aux: bool = False,
|
| 340 |
+
use_depth_tokens: bool | None = None,
|
| 341 |
+
use_geometry_tokens: bool | None = None,
|
| 342 |
+
use_camera_pose_tokens: bool | None = None,
|
| 343 |
) -> Tensor | dict[str, Tensor | None]:
|
| 344 |
rgb_tokens = self._encode_rgb_tokens(images)
|
| 345 |
wants_aux = return_aux or depths is not None or depth_valid is not None or camera_intrinsics is not None or camera_extrinsics is not None
|
| 346 |
if not wants_aux:
|
| 347 |
return rgb_tokens
|
| 348 |
|
| 349 |
+
depth_enabled = self.config.use_depth_tokens if use_depth_tokens is None else use_depth_tokens
|
| 350 |
+
geometry_enabled = self.config.use_geometry_tokens if use_geometry_tokens is None else use_geometry_tokens
|
| 351 |
+
camera_pose_enabled = self.config.use_camera_pose_tokens if use_camera_pose_tokens is None else use_camera_pose_tokens
|
| 352 |
+
geometry_enabled = bool(self.config.use_camera_geometry and geometry_enabled)
|
| 353 |
+
camera_pose_enabled = bool(self.config.use_camera_geometry and camera_pose_enabled)
|
| 354 |
+
|
| 355 |
depth_outputs: dict[str, Tensor | None] = {
|
| 356 |
"depth_tokens": None,
|
| 357 |
"geometry_tokens": None,
|
|
|
|
| 363 |
depth_valid=depth_valid,
|
| 364 |
camera_intrinsics=camera_intrinsics,
|
| 365 |
camera_extrinsics=camera_extrinsics,
|
| 366 |
+
include_geometry_features=geometry_enabled,
|
| 367 |
+
include_camera_pose=camera_pose_enabled,
|
| 368 |
)
|
| 369 |
+
if not depth_enabled:
|
| 370 |
+
depth_outputs["depth_tokens"] = None
|
| 371 |
+
if not geometry_enabled:
|
| 372 |
+
depth_outputs["geometry_tokens"] = None
|
| 373 |
+
if not camera_pose_enabled:
|
| 374 |
+
depth_outputs["camera_tokens"] = None
|
| 375 |
|
| 376 |
return {
|
| 377 |
"rgb_tokens": rgb_tokens,
|
code/reveal_vla_bimanual/models/multiview_fusion.py
CHANGED
|
@@ -83,6 +83,7 @@ class MultiViewFusion(nn.Module):
|
|
| 83 |
proprio: Tensor,
|
| 84 |
language_tokens: Tensor,
|
| 85 |
depth_tokens: Tensor | None = None,
|
|
|
|
| 86 |
camera_tokens: Tensor | None = None,
|
| 87 |
return_aux: bool = False,
|
| 88 |
) -> Tensor | dict[str, Tensor]:
|
|
@@ -102,6 +103,8 @@ class MultiViewFusion(nn.Module):
|
|
| 102 |
geometry_sources = []
|
| 103 |
if depth_tokens is not None:
|
| 104 |
geometry_sources.append(depth_tokens[:, view_idx])
|
|
|
|
|
|
|
| 105 |
if camera_tokens is not None:
|
| 106 |
geometry_sources.append(camera_tokens[:, view_idx])
|
| 107 |
if geometry_sources:
|
|
@@ -119,7 +122,12 @@ class MultiViewFusion(nn.Module):
|
|
| 119 |
batch_size, self.config.proprio_tokens, hidden_dim
|
| 120 |
)
|
| 121 |
scene_tokens = torch.cat([fused, proprio_tokens, language_tokens], dim=1)
|
| 122 |
-
if not (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
return scene_tokens
|
| 124 |
return {
|
| 125 |
"scene_tokens": scene_tokens,
|
|
|
|
| 83 |
proprio: Tensor,
|
| 84 |
language_tokens: Tensor,
|
| 85 |
depth_tokens: Tensor | None = None,
|
| 86 |
+
geometry_tokens: Tensor | None = None,
|
| 87 |
camera_tokens: Tensor | None = None,
|
| 88 |
return_aux: bool = False,
|
| 89 |
) -> Tensor | dict[str, Tensor]:
|
|
|
|
| 103 |
geometry_sources = []
|
| 104 |
if depth_tokens is not None:
|
| 105 |
geometry_sources.append(depth_tokens[:, view_idx])
|
| 106 |
+
if geometry_tokens is not None:
|
| 107 |
+
geometry_sources.append(geometry_tokens[:, view_idx])
|
| 108 |
if camera_tokens is not None:
|
| 109 |
geometry_sources.append(camera_tokens[:, view_idx])
|
| 110 |
if geometry_sources:
|
|
|
|
| 122 |
batch_size, self.config.proprio_tokens, hidden_dim
|
| 123 |
)
|
| 124 |
scene_tokens = torch.cat([fused, proprio_tokens, language_tokens], dim=1)
|
| 125 |
+
if not (
|
| 126 |
+
return_aux
|
| 127 |
+
or depth_tokens is not None
|
| 128 |
+
or geometry_tokens is not None
|
| 129 |
+
or camera_tokens is not None
|
| 130 |
+
):
|
| 131 |
return scene_tokens
|
| 132 |
return {
|
| 133 |
"scene_tokens": scene_tokens,
|
code/reveal_vla_bimanual/models/observation_memory.py
CHANGED
|
@@ -234,6 +234,14 @@ class _SelectiveMemoryBank(nn.Module):
|
|
| 234 |
nn.GELU(),
|
| 235 |
)
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
def _truncate(self, history: Tensor | None) -> Tensor | None:
|
| 238 |
if history is None or history.numel() == 0:
|
| 239 |
return history
|
|
@@ -241,21 +249,48 @@ class _SelectiveMemoryBank(nn.Module):
|
|
| 241 |
return history
|
| 242 |
return history[:, -self.history_steps :]
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
def forward(
|
| 245 |
self,
|
| 246 |
-
|
| 247 |
history_scene_tokens: Tensor | None = None,
|
| 248 |
history_actions: Tensor | None = None,
|
| 249 |
) -> dict[str, Tensor]:
|
| 250 |
history_scene_tokens = self._truncate(history_scene_tokens)
|
| 251 |
-
|
|
|
|
| 252 |
if history_scene_tokens is not None and history_scene_tokens.numel() > 0:
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
| 254 |
if history_actions is not None and history_actions.numel() > 0:
|
| 255 |
history_actions = history_actions[:, -history_pooled.shape[1] :]
|
| 256 |
-
|
|
|
|
|
|
|
| 257 |
sequence = torch.cat([history_pooled, pooled_current], dim=1)
|
| 258 |
else:
|
|
|
|
| 259 |
history_pooled = pooled_current[:, :0]
|
| 260 |
sequence = pooled_current
|
| 261 |
if sequence.shape[1] > self.position_embedding.shape[1]:
|
|
@@ -264,17 +299,21 @@ class _SelectiveMemoryBank(nn.Module):
|
|
| 264 |
)
|
| 265 |
encoded = self.sequence_encoder(sequence + self.position_embedding[:, : sequence.shape[1]])
|
| 266 |
current_token = encoded[:, -1]
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
bank_tokens = self.token_proj(bank_tokens)
|
| 279 |
return {
|
| 280 |
"memory_tokens": bank_tokens,
|
|
@@ -332,14 +371,13 @@ class DualObservationMemory(nn.Module):
|
|
| 332 |
history_scene_tokens: Tensor | None = None,
|
| 333 |
history_actions: Tensor | None = None,
|
| 334 |
) -> dict[str, Tensor]:
|
| 335 |
-
pooled_current = scene_tokens.mean(dim=1)
|
| 336 |
scene_output = self.scene_memory(
|
| 337 |
-
|
| 338 |
history_scene_tokens=history_scene_tokens,
|
| 339 |
history_actions=history_actions,
|
| 340 |
)
|
| 341 |
belief_output = self.belief_memory(
|
| 342 |
-
|
| 343 |
history_scene_tokens=history_scene_tokens,
|
| 344 |
history_actions=history_actions,
|
| 345 |
)
|
|
|
|
| 234 |
nn.GELU(),
|
| 235 |
)
|
| 236 |
|
| 237 |
+
def _recency_weights(self, length: int, device: torch.device, dtype: torch.dtype) -> Tensor:
|
| 238 |
+
if length <= 0:
|
| 239 |
+
return torch.zeros((0,), device=device, dtype=dtype)
|
| 240 |
+
positions = torch.arange(length, device=device, dtype=dtype)
|
| 241 |
+
distances = (length - 1) - positions
|
| 242 |
+
weights = torch.exp(-0.5 * distances)
|
| 243 |
+
return weights / weights.sum().clamp_min(1e-6)
|
| 244 |
+
|
| 245 |
def _truncate(self, history: Tensor | None) -> Tensor | None:
|
| 246 |
if history is None or history.numel() == 0:
|
| 247 |
return history
|
|
|
|
| 249 |
return history
|
| 250 |
return history[:, -self.history_steps :]
|
| 251 |
|
| 252 |
+
def _chunk_pool(self, tokens: Tensor) -> Tensor:
|
| 253 |
+
batch_size, seq_len, hidden_dim = tokens.shape
|
| 254 |
+
chunk_size = max(1, (seq_len + self.bank_queries.shape[0] - 1) // self.bank_queries.shape[0])
|
| 255 |
+
slots = []
|
| 256 |
+
for slot_idx in range(self.bank_queries.shape[0]):
|
| 257 |
+
start = slot_idx * chunk_size
|
| 258 |
+
end = min(seq_len, start + chunk_size)
|
| 259 |
+
if start >= seq_len:
|
| 260 |
+
pooled = tokens[:, -1]
|
| 261 |
+
else:
|
| 262 |
+
pooled = tokens[:, start:end].mean(dim=1)
|
| 263 |
+
slots.append(pooled)
|
| 264 |
+
return torch.stack(slots, dim=1)
|
| 265 |
+
|
| 266 |
+
def _compress_tokens(self, tokens: Tensor) -> Tensor:
|
| 267 |
+
base_slots = self._chunk_pool(tokens)
|
| 268 |
+
queries = self.bank_queries.unsqueeze(0).expand(tokens.shape[0], -1, -1) + base_slots
|
| 269 |
+
attended, _ = self.bank_attention(queries, tokens, tokens)
|
| 270 |
+
return base_slots + 0.1 * attended
|
| 271 |
+
|
| 272 |
def forward(
|
| 273 |
self,
|
| 274 |
+
current_tokens: Tensor,
|
| 275 |
history_scene_tokens: Tensor | None = None,
|
| 276 |
history_actions: Tensor | None = None,
|
| 277 |
) -> dict[str, Tensor]:
|
| 278 |
history_scene_tokens = self._truncate(history_scene_tokens)
|
| 279 |
+
current_bank = self._compress_tokens(current_tokens)
|
| 280 |
+
pooled_current = current_bank.mean(dim=1, keepdim=True)
|
| 281 |
if history_scene_tokens is not None and history_scene_tokens.numel() > 0:
|
| 282 |
+
batch_size, history_steps = history_scene_tokens.shape[:2]
|
| 283 |
+
flat_history = history_scene_tokens.reshape(batch_size * history_steps, history_scene_tokens.shape[2], history_scene_tokens.shape[3])
|
| 284 |
+
history_bank = self._compress_tokens(flat_history).view(batch_size, history_steps, self.bank_queries.shape[0], self.hidden_dim)
|
| 285 |
+
history_pooled = history_bank.mean(dim=2)
|
| 286 |
if history_actions is not None and history_actions.numel() > 0:
|
| 287 |
history_actions = history_actions[:, -history_pooled.shape[1] :]
|
| 288 |
+
history_action_tokens = self.action_proj(history_actions).unsqueeze(2)
|
| 289 |
+
history_bank = history_bank + history_action_tokens
|
| 290 |
+
history_pooled = history_bank.mean(dim=2)
|
| 291 |
sequence = torch.cat([history_pooled, pooled_current], dim=1)
|
| 292 |
else:
|
| 293 |
+
history_bank = current_bank.unsqueeze(1)[:, :0]
|
| 294 |
history_pooled = pooled_current[:, :0]
|
| 295 |
sequence = pooled_current
|
| 296 |
if sequence.shape[1] > self.position_embedding.shape[1]:
|
|
|
|
| 299 |
)
|
| 300 |
encoded = self.sequence_encoder(sequence + self.position_embedding[:, : sequence.shape[1]])
|
| 301 |
current_token = encoded[:, -1]
|
| 302 |
+
if history_bank.shape[1] > 0:
|
| 303 |
+
recency = self._recency_weights(
|
| 304 |
+
history_bank.shape[1],
|
| 305 |
+
device=history_bank.device,
|
| 306 |
+
dtype=history_bank.dtype,
|
| 307 |
+
).view(1, -1, 1, 1)
|
| 308 |
+
prior_bank = (history_bank * recency).sum(dim=1)
|
| 309 |
+
else:
|
| 310 |
+
prior_bank = torch.zeros_like(current_bank)
|
| 311 |
+
novelty = torch.abs(current_bank - prior_bank)
|
| 312 |
+
gate_logit = self.write_gate(torch.cat([current_bank, prior_bank, novelty], dim=-1))
|
| 313 |
+
novelty_score = novelty.mean(dim=-1, keepdim=True)
|
| 314 |
+
novelty_gate = torch.sigmoid(12.0 * (novelty_score - self.write_threshold))
|
| 315 |
+
gate = (0.25 + 0.75 * torch.sigmoid(gate_logit)) * novelty_gate
|
| 316 |
+
bank_tokens = prior_bank * (1.0 - gate) + current_bank * gate
|
| 317 |
bank_tokens = self.token_proj(bank_tokens)
|
| 318 |
return {
|
| 319 |
"memory_tokens": bank_tokens,
|
|
|
|
| 371 |
history_scene_tokens: Tensor | None = None,
|
| 372 |
history_actions: Tensor | None = None,
|
| 373 |
) -> dict[str, Tensor]:
|
|
|
|
| 374 |
scene_output = self.scene_memory(
|
| 375 |
+
current_tokens=scene_tokens,
|
| 376 |
history_scene_tokens=history_scene_tokens,
|
| 377 |
history_actions=history_actions,
|
| 378 |
)
|
| 379 |
belief_output = self.belief_memory(
|
| 380 |
+
current_tokens=scene_tokens,
|
| 381 |
history_scene_tokens=history_scene_tokens,
|
| 382 |
history_actions=history_actions,
|
| 383 |
)
|
code/reveal_vla_bimanual/models/planner.py
CHANGED
|
@@ -250,10 +250,18 @@ class StructuredElasticUtility(nn.Module):
|
|
| 250 |
occluder_contact = self._field_mean(rollout_state["occluder_contact_field"]).mean(dim=-1)
|
| 251 |
grasp_affordance = self._field_mean(rollout_state["grasp_affordance_field"]).mean(dim=-1)
|
| 252 |
support_stability = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).mean(dim=-1)
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
retrieve_progress = torch.sigmoid(candidate_chunks[:, :, :, -1]).mean(dim=-1)
|
| 258 |
utility = (
|
| 259 |
self.config.belief_gain_weight * (belief_future - initial_belief)
|
|
@@ -278,8 +286,12 @@ class StructuredElasticUtility(nn.Module):
|
|
| 278 |
"persistence": persistence,
|
| 279 |
"support_stability": support_stability,
|
| 280 |
"reocclusion_penalty": reocclusion,
|
|
|
|
| 281 |
"disturbance_penalty": disturbance,
|
| 282 |
"access_quality": access_quality,
|
|
|
|
|
|
|
|
|
|
| 283 |
"task_progress": retrieve_progress,
|
| 284 |
"utility_structured": utility,
|
| 285 |
}
|
|
@@ -346,14 +358,42 @@ class CascadePlanner(nn.Module):
|
|
| 346 |
self.structured = StructuredElasticUtility(config)
|
| 347 |
self.residual = ResidualPlannerScorer(config)
|
| 348 |
|
| 349 |
-
def shortlist(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
batch_size, num_candidates = candidate_chunks.shape[:2]
|
| 351 |
top_k = min(max(1, self.config.top_k), num_candidates)
|
| 352 |
if proposal_logits is None:
|
| 353 |
cheap_scores = -candidate_chunks.square().mean(dim=(-1, -2))
|
| 354 |
else:
|
| 355 |
cheap_scores = proposal_logits
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
def select_best(
|
| 359 |
self,
|
|
@@ -362,6 +402,7 @@ class CascadePlanner(nn.Module):
|
|
| 362 |
rollout_state: dict[str, Tensor],
|
| 363 |
proposal_logits: Tensor | None = None,
|
| 364 |
candidate_indices: Tensor | None = None,
|
|
|
|
| 365 |
) -> dict[str, Tensor]:
|
| 366 |
structured = self.structured(
|
| 367 |
initial_state=initial_state,
|
|
@@ -375,6 +416,22 @@ class CascadePlanner(nn.Module):
|
|
| 375 |
)
|
| 376 |
utility_total = structured["utility_structured"] + self.config.residual_weight * residual["utility_residual"]
|
| 377 |
utility_total = utility_total + residual["success_logits"].sigmoid() - residual["risk_values"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
best_local = utility_total.argmax(dim=-1)
|
| 379 |
batch_indices = torch.arange(candidate_chunks.shape[0], device=candidate_chunks.device)
|
| 380 |
if candidate_indices is None:
|
|
@@ -386,6 +443,7 @@ class CascadePlanner(nn.Module):
|
|
| 386 |
**residual,
|
| 387 |
"utility_total": utility_total,
|
| 388 |
"utility_scores": utility_total,
|
|
|
|
| 389 |
"best_indices": best_indices,
|
| 390 |
"best_chunk": candidate_chunks[batch_indices, best_local],
|
| 391 |
"ranking_diagnostics": {
|
|
|
|
| 250 |
occluder_contact = self._field_mean(rollout_state["occluder_contact_field"]).mean(dim=-1)
|
| 251 |
grasp_affordance = self._field_mean(rollout_state["grasp_affordance_field"]).mean(dim=-1)
|
| 252 |
support_stability = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).mean(dim=-1)
|
| 253 |
+
persistence_traj = self._field_mean(rollout_state["persistence_field"])
|
| 254 |
+
reocclusion_traj = self._field_mean(rollout_state["reocclusion_field"])
|
| 255 |
+
disturbance_traj = self._field_mean(rollout_state["disturbance_field"])
|
| 256 |
+
access_traj = torch.sigmoid(self._field_mean(rollout_state["access_field"]))
|
| 257 |
+
persistence = persistence_traj.mean(dim=-1)
|
| 258 |
+
reocclusion = reocclusion_traj.mean(dim=-1)
|
| 259 |
+
disturbance = disturbance_traj.mean(dim=-1)
|
| 260 |
+
access_quality = access_traj.mean(dim=-1)
|
| 261 |
+
access_floor = access_traj.amin(dim=-1)
|
| 262 |
+
persistence_floor = persistence_traj.amin(dim=-1)
|
| 263 |
+
support_floor = torch.sigmoid(self._field_mean(rollout_state["support_stability_field"])).amin(dim=-1)
|
| 264 |
+
reocclusion_worst = reocclusion_traj.amax(dim=-1)
|
| 265 |
retrieve_progress = torch.sigmoid(candidate_chunks[:, :, :, -1]).mean(dim=-1)
|
| 266 |
utility = (
|
| 267 |
self.config.belief_gain_weight * (belief_future - initial_belief)
|
|
|
|
| 286 |
"persistence": persistence,
|
| 287 |
"support_stability": support_stability,
|
| 288 |
"reocclusion_penalty": reocclusion,
|
| 289 |
+
"reocclusion_worst": reocclusion_worst,
|
| 290 |
"disturbance_penalty": disturbance,
|
| 291 |
"access_quality": access_quality,
|
| 292 |
+
"access_floor": access_floor,
|
| 293 |
+
"persistence_floor": persistence_floor,
|
| 294 |
+
"support_floor": support_floor,
|
| 295 |
"task_progress": retrieve_progress,
|
| 296 |
"utility_structured": utility,
|
| 297 |
}
|
|
|
|
| 358 |
self.structured = StructuredElasticUtility(config)
|
| 359 |
self.residual = ResidualPlannerScorer(config)
|
| 360 |
|
| 361 |
+
def shortlist(
|
| 362 |
+
self,
|
| 363 |
+
proposal_logits: Tensor | None,
|
| 364 |
+
candidate_chunks: Tensor,
|
| 365 |
+
proposal_mode_assignments: Tensor | None = None,
|
| 366 |
+
) -> Tensor:
|
| 367 |
batch_size, num_candidates = candidate_chunks.shape[:2]
|
| 368 |
top_k = min(max(1, self.config.top_k), num_candidates)
|
| 369 |
if proposal_logits is None:
|
| 370 |
cheap_scores = -candidate_chunks.square().mean(dim=(-1, -2))
|
| 371 |
else:
|
| 372 |
cheap_scores = proposal_logits
|
| 373 |
+
if proposal_mode_assignments is None:
|
| 374 |
+
return cheap_scores.topk(top_k, dim=-1).indices
|
| 375 |
+
if proposal_mode_assignments.ndim == 1:
|
| 376 |
+
proposal_mode_assignments = proposal_mode_assignments.unsqueeze(0).expand(batch_size, -1)
|
| 377 |
+
|
| 378 |
+
shortlisted = []
|
| 379 |
+
for batch_idx in range(batch_size):
|
| 380 |
+
scores = cheap_scores[batch_idx]
|
| 381 |
+
mode_ids = proposal_mode_assignments[batch_idx]
|
| 382 |
+
mode_best: list[tuple[float, int]] = []
|
| 383 |
+
for mode_id in torch.unique(mode_ids):
|
| 384 |
+
mode_indices = torch.nonzero(mode_ids == mode_id, as_tuple=False).squeeze(-1)
|
| 385 |
+
best_local = mode_indices[scores[mode_indices].argmax()]
|
| 386 |
+
mode_best.append((float(scores[best_local]), int(best_local)))
|
| 387 |
+
mode_best.sort(key=lambda item: item[0], reverse=True)
|
| 388 |
+
chosen = [index for _, index in mode_best[:top_k]]
|
| 389 |
+
if len(chosen) < top_k:
|
| 390 |
+
for candidate_idx in scores.argsort(descending=True).tolist():
|
| 391 |
+
if candidate_idx not in chosen:
|
| 392 |
+
chosen.append(candidate_idx)
|
| 393 |
+
if len(chosen) >= top_k:
|
| 394 |
+
break
|
| 395 |
+
shortlisted.append(torch.as_tensor(chosen[:top_k], device=candidate_chunks.device, dtype=torch.long))
|
| 396 |
+
return torch.stack(shortlisted, dim=0)
|
| 397 |
|
| 398 |
def select_best(
|
| 399 |
self,
|
|
|
|
| 402 |
rollout_state: dict[str, Tensor],
|
| 403 |
proposal_logits: Tensor | None = None,
|
| 404 |
candidate_indices: Tensor | None = None,
|
| 405 |
+
proposal_mode_names: list[list[str]] | None = None,
|
| 406 |
) -> dict[str, Tensor]:
|
| 407 |
structured = self.structured(
|
| 408 |
initial_state=initial_state,
|
|
|
|
| 416 |
)
|
| 417 |
utility_total = structured["utility_structured"] + self.config.residual_weight * residual["utility_residual"]
|
| 418 |
utility_total = utility_total + residual["success_logits"].sigmoid() - residual["risk_values"]
|
| 419 |
+
feasibility_penalty = torch.zeros_like(utility_total)
|
| 420 |
+
if proposal_mode_names is not None:
|
| 421 |
+
retrieve_like = torch.zeros_like(utility_total, dtype=torch.bool)
|
| 422 |
+
for batch_idx, names in enumerate(proposal_mode_names):
|
| 423 |
+
for candidate_idx, name in enumerate(names[: utility_total.shape[1]]):
|
| 424 |
+
retrieve_like[batch_idx, candidate_idx] = any(
|
| 425 |
+
token in name for token in ("retrieve", "insert_actor", "probe_inside")
|
| 426 |
+
)
|
| 427 |
+
blocked = (
|
| 428 |
+
(structured["access_floor"] < 0.15)
|
| 429 |
+
| (structured["persistence_floor"] < 0.15)
|
| 430 |
+
| (structured["support_floor"] < 0.25)
|
| 431 |
+
| (structured["reocclusion_worst"] > 0.6)
|
| 432 |
+
)
|
| 433 |
+
feasibility_penalty = retrieve_like.to(dtype=utility_total.dtype) * blocked.to(dtype=utility_total.dtype) * 2.0
|
| 434 |
+
utility_total = utility_total - feasibility_penalty
|
| 435 |
best_local = utility_total.argmax(dim=-1)
|
| 436 |
batch_indices = torch.arange(candidate_chunks.shape[0], device=candidate_chunks.device)
|
| 437 |
if candidate_indices is None:
|
|
|
|
| 443 |
**residual,
|
| 444 |
"utility_total": utility_total,
|
| 445 |
"utility_scores": utility_total,
|
| 446 |
+
"feasibility_penalty": feasibility_penalty,
|
| 447 |
"best_indices": best_indices,
|
| 448 |
"best_chunk": candidate_chunks[batch_indices, best_local],
|
| 449 |
"ranking_diagnostics": {
|
code/reveal_vla_bimanual/models/policy.py
CHANGED
|
@@ -11,6 +11,7 @@ from models.action_decoder import (
|
|
| 11 |
ChunkDecoderConfig,
|
| 12 |
InteractionChunkDecoder,
|
| 13 |
SymmetricCoordinatedChunkDecoder,
|
|
|
|
| 14 |
)
|
| 15 |
from models.backbones import FrozenVLBackbone, FrozenVLBackboneConfig
|
| 16 |
from models.multiview_fusion import MultiViewFusion, MultiViewFusionConfig
|
|
@@ -65,6 +66,11 @@ class BackboneOnlyPolicy(nn.Module):
|
|
| 65 |
attention_mask=language_tokens["attention_mask"],
|
| 66 |
)
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def encode_scene(
|
| 69 |
self,
|
| 70 |
images: Tensor,
|
|
@@ -388,6 +394,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 388 |
camera_intrinsics: Tensor | None = None,
|
| 389 |
camera_extrinsics: Tensor | None = None,
|
| 390 |
use_depth: bool = True,
|
|
|
|
|
|
|
| 391 |
) -> dict[str, Tensor]:
|
| 392 |
encoded = self.backbone.encode_images(
|
| 393 |
images,
|
|
@@ -396,6 +404,9 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 396 |
camera_intrinsics=camera_intrinsics if use_depth else None,
|
| 397 |
camera_extrinsics=camera_extrinsics if use_depth else None,
|
| 398 |
return_aux=True,
|
|
|
|
|
|
|
|
|
|
| 399 |
)
|
| 400 |
assert isinstance(encoded, dict)
|
| 401 |
text_tokens = self._encode_language(images, texts=texts, language_tokens=language_tokens)
|
|
@@ -404,6 +415,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 404 |
proprio=proprio,
|
| 405 |
language_tokens=text_tokens,
|
| 406 |
depth_tokens=encoded.get("depth_tokens"),
|
|
|
|
| 407 |
camera_tokens=encoded.get("camera_tokens"),
|
| 408 |
return_aux=True,
|
| 409 |
)
|
|
@@ -413,6 +425,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 413 |
"view_summaries": fused["view_summaries"],
|
| 414 |
"geometry_summaries": fused["geometry_summaries"],
|
| 415 |
"depth_tokens": encoded.get("depth_tokens"),
|
|
|
|
| 416 |
"camera_tokens": encoded.get("camera_tokens"),
|
| 417 |
}
|
| 418 |
|
|
@@ -441,6 +454,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 441 |
camera_intrinsics: Tensor | None = None,
|
| 442 |
camera_extrinsics: Tensor | None = None,
|
| 443 |
use_depth: bool = True,
|
|
|
|
|
|
|
| 444 |
) -> Tensor | None:
|
| 445 |
if history_images is None or history_proprio is None or history_images.numel() == 0:
|
| 446 |
return None
|
|
@@ -469,6 +484,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 469 |
camera_intrinsics=None,
|
| 470 |
camera_extrinsics=None,
|
| 471 |
use_depth=use_depth,
|
|
|
|
|
|
|
| 472 |
)["scene_tokens"]
|
| 473 |
return history_scene.view(batch_size, history_steps, history_scene.shape[1], history_scene.shape[2])
|
| 474 |
|
|
@@ -495,6 +512,27 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 495 |
value = value.detach()
|
| 496 |
return value.unsqueeze(1).unsqueeze(2).expand(-1, num_candidates, horizon, *value.shape[1:])
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
def _identity_rollout(
|
| 499 |
self,
|
| 500 |
interaction_state: dict[str, Tensor],
|
|
@@ -531,7 +569,14 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 531 |
history_depths: Tensor | None = None,
|
| 532 |
history_depth_valid: Tensor | None = None,
|
| 533 |
compute_equivariance_probe: bool = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
) -> dict[str, Tensor]:
|
|
|
|
| 535 |
scene_output = self._encode_scene_with_optional_depth(
|
| 536 |
images=images,
|
| 537 |
proprio=proprio,
|
|
@@ -542,6 +587,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 542 |
camera_intrinsics=camera_intrinsics,
|
| 543 |
camera_extrinsics=camera_extrinsics,
|
| 544 |
use_depth=use_depth,
|
|
|
|
|
|
|
| 545 |
)
|
| 546 |
scene_tokens = scene_output["scene_tokens"]
|
| 547 |
history_scene_tokens = self.encode_history_with_optional_depth(
|
|
@@ -554,19 +601,26 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 554 |
camera_intrinsics=camera_intrinsics,
|
| 555 |
camera_extrinsics=camera_extrinsics,
|
| 556 |
use_depth=use_depth,
|
|
|
|
|
|
|
| 557 |
)
|
| 558 |
if history_steps_override is not None and history_scene_tokens is not None and history_scene_tokens.numel() > 0:
|
| 559 |
history_scene_tokens = history_scene_tokens[:, -history_steps_override:]
|
| 560 |
if history_actions is not None and history_actions.numel() > 0:
|
| 561 |
history_actions = history_actions[:, -history_steps_override:]
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
| 567 |
elastic_state = self.elastic_state_head(
|
| 568 |
scene_tokens,
|
| 569 |
memory_tokens=memory_output["memory_tokens"],
|
|
|
|
|
|
|
| 570 |
)
|
| 571 |
elastic_state["memory_tokens"] = memory_output["memory_tokens"]
|
| 572 |
elastic_state["memory_token"] = memory_output["memory_token"]
|
|
@@ -581,6 +635,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 581 |
interaction_state=elastic_state,
|
| 582 |
memory_tokens=memory_output["memory_tokens"],
|
| 583 |
compute_equivariance_probe=compute_equivariance_probe,
|
|
|
|
| 584 |
)
|
| 585 |
outputs = {
|
| 586 |
**decoded,
|
|
@@ -592,7 +647,11 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 592 |
"reveal_state": elastic_state,
|
| 593 |
"view_summaries": scene_output["view_summaries"],
|
| 594 |
"geometry_summaries": scene_output["geometry_summaries"],
|
|
|
|
|
|
|
|
|
|
| 595 |
"rollout_source": "none",
|
|
|
|
| 596 |
}
|
| 597 |
|
| 598 |
candidate_chunks = candidate_chunks_override
|
|
@@ -602,8 +661,10 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 602 |
outputs["action_mean"],
|
| 603 |
outputs["action_log_std"],
|
| 604 |
num_candidates=self.config.decoder.num_candidates,
|
| 605 |
-
proposal_candidates=outputs.get("proposal_candidates"),
|
| 606 |
)
|
|
|
|
|
|
|
| 607 |
else:
|
| 608 |
proposal_logits = None
|
| 609 |
outputs["candidate_chunks"] = candidate_chunks
|
|
@@ -625,13 +686,25 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 625 |
)
|
| 626 |
return outputs
|
| 627 |
|
| 628 |
-
shortlist_indices = self.planner.shortlist(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
outputs["planner_topk_indices"] = shortlist_indices
|
| 630 |
batch_size = candidate_chunks.shape[0]
|
| 631 |
batch_indices = torch.arange(batch_size, device=candidate_chunks.device).unsqueeze(-1)
|
| 632 |
topk_candidates = candidate_chunks[batch_indices, shortlist_indices]
|
| 633 |
num_topk = topk_candidates.shape[1]
|
| 634 |
outputs["planner_topk_candidates"] = topk_candidates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
if proposal_logits is not None:
|
| 636 |
topk_proposal_logits = proposal_logits.gather(1, shortlist_indices)
|
| 637 |
else:
|
|
@@ -653,6 +726,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 653 |
rollout_state=identity_rollout,
|
| 654 |
proposal_logits=topk_proposal_logits,
|
| 655 |
candidate_indices=shortlist_indices,
|
|
|
|
| 656 |
)
|
| 657 |
outputs["planned_rollout"] = identity_rollout
|
| 658 |
outputs["planned_chunk"] = selected["best_chunk"]
|
|
@@ -677,6 +751,8 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 677 |
memory_tokens=self._tile_tensor(memory_output["memory_tokens"], num_topk),
|
| 678 |
scene_memory_tokens=self._tile_tensor(memory_output["scene_memory_tokens"], num_topk),
|
| 679 |
belief_memory_tokens=self._tile_tensor(memory_output["belief_memory_tokens"], num_topk),
|
|
|
|
|
|
|
| 680 |
)
|
| 681 |
reshaped_rollout = {
|
| 682 |
key: value.view(batch_size, num_topk, *value.shape[1:]) for key, value in rollout.items()
|
|
@@ -687,6 +763,7 @@ class ElasticRevealBimanualPolicy(BackboneOnlyPolicy):
|
|
| 687 |
rollout_state=reshaped_rollout,
|
| 688 |
proposal_logits=topk_proposal_logits,
|
| 689 |
candidate_indices=shortlist_indices,
|
|
|
|
| 690 |
)
|
| 691 |
outputs["planned_rollout"] = reshaped_rollout
|
| 692 |
outputs["planned_chunk"] = selected["best_chunk"]
|
|
|
|
| 11 |
ChunkDecoderConfig,
|
| 12 |
InteractionChunkDecoder,
|
| 13 |
SymmetricCoordinatedChunkDecoder,
|
| 14 |
+
infer_task_name_from_text,
|
| 15 |
)
|
| 16 |
from models.backbones import FrozenVLBackbone, FrozenVLBackboneConfig
|
| 17 |
from models.multiview_fusion import MultiViewFusion, MultiViewFusionConfig
|
|
|
|
| 66 |
attention_mask=language_tokens["attention_mask"],
|
| 67 |
)
|
| 68 |
|
| 69 |
+
def _task_names(self, batch_size: int, texts: Sequence[str] | None = None) -> list[str]:
|
| 70 |
+
if texts is None:
|
| 71 |
+
return ["generic"] * batch_size
|
| 72 |
+
return [infer_task_name_from_text(text) for text in texts]
|
| 73 |
+
|
| 74 |
def encode_scene(
|
| 75 |
self,
|
| 76 |
images: Tensor,
|
|
|
|
| 394 |
camera_intrinsics: Tensor | None = None,
|
| 395 |
camera_extrinsics: Tensor | None = None,
|
| 396 |
use_depth: bool = True,
|
| 397 |
+
use_geometry_tokens: bool | None = None,
|
| 398 |
+
use_camera_pose_tokens: bool | None = None,
|
| 399 |
) -> dict[str, Tensor]:
|
| 400 |
encoded = self.backbone.encode_images(
|
| 401 |
images,
|
|
|
|
| 404 |
camera_intrinsics=camera_intrinsics if use_depth else None,
|
| 405 |
camera_extrinsics=camera_extrinsics if use_depth else None,
|
| 406 |
return_aux=True,
|
| 407 |
+
use_depth_tokens=use_depth,
|
| 408 |
+
use_geometry_tokens=use_geometry_tokens,
|
| 409 |
+
use_camera_pose_tokens=use_camera_pose_tokens,
|
| 410 |
)
|
| 411 |
assert isinstance(encoded, dict)
|
| 412 |
text_tokens = self._encode_language(images, texts=texts, language_tokens=language_tokens)
|
|
|
|
| 415 |
proprio=proprio,
|
| 416 |
language_tokens=text_tokens,
|
| 417 |
depth_tokens=encoded.get("depth_tokens"),
|
| 418 |
+
geometry_tokens=encoded.get("geometry_tokens"),
|
| 419 |
camera_tokens=encoded.get("camera_tokens"),
|
| 420 |
return_aux=True,
|
| 421 |
)
|
|
|
|
| 425 |
"view_summaries": fused["view_summaries"],
|
| 426 |
"geometry_summaries": fused["geometry_summaries"],
|
| 427 |
"depth_tokens": encoded.get("depth_tokens"),
|
| 428 |
+
"geometry_tokens": encoded.get("geometry_tokens"),
|
| 429 |
"camera_tokens": encoded.get("camera_tokens"),
|
| 430 |
}
|
| 431 |
|
|
|
|
| 454 |
camera_intrinsics: Tensor | None = None,
|
| 455 |
camera_extrinsics: Tensor | None = None,
|
| 456 |
use_depth: bool = True,
|
| 457 |
+
use_geometry_tokens: bool | None = None,
|
| 458 |
+
use_camera_pose_tokens: bool | None = None,
|
| 459 |
) -> Tensor | None:
|
| 460 |
if history_images is None or history_proprio is None or history_images.numel() == 0:
|
| 461 |
return None
|
|
|
|
| 484 |
camera_intrinsics=None,
|
| 485 |
camera_extrinsics=None,
|
| 486 |
use_depth=use_depth,
|
| 487 |
+
use_geometry_tokens=use_geometry_tokens,
|
| 488 |
+
use_camera_pose_tokens=use_camera_pose_tokens,
|
| 489 |
)["scene_tokens"]
|
| 490 |
return history_scene.view(batch_size, history_steps, history_scene.shape[1], history_scene.shape[2])
|
| 491 |
|
|
|
|
| 512 |
value = value.detach()
|
| 513 |
return value.unsqueeze(1).unsqueeze(2).expand(-1, num_candidates, horizon, *value.shape[1:])
|
| 514 |
|
| 515 |
+
def _zero_memory_output(self, scene_tokens: Tensor) -> dict[str, Tensor]:
|
| 516 |
+
batch_size, _, hidden_dim = scene_tokens.shape
|
| 517 |
+
scene_memory_tokens = scene_tokens.new_zeros((batch_size, self.config.memory.scene_bank_size, hidden_dim))
|
| 518 |
+
belief_memory_tokens = scene_tokens.new_zeros((batch_size, self.config.memory.belief_bank_size, hidden_dim))
|
| 519 |
+
memory_tokens = torch.cat([scene_memory_tokens, belief_memory_tokens], dim=1)
|
| 520 |
+
return {
|
| 521 |
+
"scene_memory_tokens": scene_memory_tokens,
|
| 522 |
+
"belief_memory_tokens": belief_memory_tokens,
|
| 523 |
+
"memory_tokens": memory_tokens,
|
| 524 |
+
"memory_token": memory_tokens.mean(dim=1, keepdim=True),
|
| 525 |
+
"memory_sequence": scene_tokens.new_zeros((batch_size, 0, hidden_dim)),
|
| 526 |
+
"memory_state": scene_tokens.new_zeros((batch_size, hidden_dim * 2)),
|
| 527 |
+
"memory_uncertainty": scene_tokens.new_zeros((batch_size,)),
|
| 528 |
+
"memory_write_rate": scene_tokens.new_zeros((batch_size,)),
|
| 529 |
+
"memory_saturation": scene_tokens.new_zeros((batch_size,)),
|
| 530 |
+
"scene_write_gate": scene_tokens.new_zeros((batch_size, self.config.memory.scene_bank_size)),
|
| 531 |
+
"belief_write_gate": scene_tokens.new_zeros((batch_size, self.config.memory.belief_bank_size)),
|
| 532 |
+
"memory_scene_state": scene_tokens.new_zeros((batch_size, hidden_dim)),
|
| 533 |
+
"memory_belief_state": scene_tokens.new_zeros((batch_size, hidden_dim)),
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
def _identity_rollout(
|
| 537 |
self,
|
| 538 |
interaction_state: dict[str, Tensor],
|
|
|
|
| 569 |
history_depths: Tensor | None = None,
|
| 570 |
history_depth_valid: Tensor | None = None,
|
| 571 |
compute_equivariance_probe: bool = False,
|
| 572 |
+
use_geometry_tokens: bool | None = None,
|
| 573 |
+
use_camera_pose_tokens: bool | None = None,
|
| 574 |
+
use_memory: bool = True,
|
| 575 |
+
use_task_conditioning: bool = True,
|
| 576 |
+
rollout_mode_override: str | None = None,
|
| 577 |
+
use_proposal_candidates: bool = True,
|
| 578 |
) -> dict[str, Tensor]:
|
| 579 |
+
task_names = self._task_names(images.shape[0], texts=texts)
|
| 580 |
scene_output = self._encode_scene_with_optional_depth(
|
| 581 |
images=images,
|
| 582 |
proprio=proprio,
|
|
|
|
| 587 |
camera_intrinsics=camera_intrinsics,
|
| 588 |
camera_extrinsics=camera_extrinsics,
|
| 589 |
use_depth=use_depth,
|
| 590 |
+
use_geometry_tokens=use_geometry_tokens,
|
| 591 |
+
use_camera_pose_tokens=use_camera_pose_tokens,
|
| 592 |
)
|
| 593 |
scene_tokens = scene_output["scene_tokens"]
|
| 594 |
history_scene_tokens = self.encode_history_with_optional_depth(
|
|
|
|
| 601 |
camera_intrinsics=camera_intrinsics,
|
| 602 |
camera_extrinsics=camera_extrinsics,
|
| 603 |
use_depth=use_depth,
|
| 604 |
+
use_geometry_tokens=use_geometry_tokens,
|
| 605 |
+
use_camera_pose_tokens=use_camera_pose_tokens,
|
| 606 |
)
|
| 607 |
if history_steps_override is not None and history_scene_tokens is not None and history_scene_tokens.numel() > 0:
|
| 608 |
history_scene_tokens = history_scene_tokens[:, -history_steps_override:]
|
| 609 |
if history_actions is not None and history_actions.numel() > 0:
|
| 610 |
history_actions = history_actions[:, -history_steps_override:]
|
| 611 |
+
if use_memory:
|
| 612 |
+
memory_output = self.memory(
|
| 613 |
+
scene_tokens,
|
| 614 |
+
history_scene_tokens=history_scene_tokens,
|
| 615 |
+
history_actions=history_actions,
|
| 616 |
+
)
|
| 617 |
+
else:
|
| 618 |
+
memory_output = self._zero_memory_output(scene_tokens)
|
| 619 |
elastic_state = self.elastic_state_head(
|
| 620 |
scene_tokens,
|
| 621 |
memory_tokens=memory_output["memory_tokens"],
|
| 622 |
+
task_names=task_names,
|
| 623 |
+
use_task_conditioning=use_task_conditioning,
|
| 624 |
)
|
| 625 |
elastic_state["memory_tokens"] = memory_output["memory_tokens"]
|
| 626 |
elastic_state["memory_token"] = memory_output["memory_token"]
|
|
|
|
| 635 |
interaction_state=elastic_state,
|
| 636 |
memory_tokens=memory_output["memory_tokens"],
|
| 637 |
compute_equivariance_probe=compute_equivariance_probe,
|
| 638 |
+
task_names=task_names,
|
| 639 |
)
|
| 640 |
outputs = {
|
| 641 |
**decoded,
|
|
|
|
| 647 |
"reveal_state": elastic_state,
|
| 648 |
"view_summaries": scene_output["view_summaries"],
|
| 649 |
"geometry_summaries": scene_output["geometry_summaries"],
|
| 650 |
+
"depth_tokens": scene_output["depth_tokens"],
|
| 651 |
+
"geometry_tokens": scene_output["geometry_tokens"],
|
| 652 |
+
"camera_tokens": scene_output["camera_tokens"],
|
| 653 |
"rollout_source": "none",
|
| 654 |
+
"task_names": task_names,
|
| 655 |
}
|
| 656 |
|
| 657 |
candidate_chunks = candidate_chunks_override
|
|
|
|
| 661 |
outputs["action_mean"],
|
| 662 |
outputs["action_log_std"],
|
| 663 |
num_candidates=self.config.decoder.num_candidates,
|
| 664 |
+
proposal_candidates=outputs.get("proposal_candidates") if use_proposal_candidates else None,
|
| 665 |
)
|
| 666 |
+
if not use_proposal_candidates:
|
| 667 |
+
proposal_logits = None
|
| 668 |
else:
|
| 669 |
proposal_logits = None
|
| 670 |
outputs["candidate_chunks"] = candidate_chunks
|
|
|
|
| 686 |
)
|
| 687 |
return outputs
|
| 688 |
|
| 689 |
+
shortlist_indices = self.planner.shortlist(
|
| 690 |
+
proposal_logits=proposal_logits,
|
| 691 |
+
candidate_chunks=candidate_chunks,
|
| 692 |
+
proposal_mode_assignments=outputs.get("proposal_mode_assignments") if use_proposal_candidates else None,
|
| 693 |
+
)
|
| 694 |
outputs["planner_topk_indices"] = shortlist_indices
|
| 695 |
batch_size = candidate_chunks.shape[0]
|
| 696 |
batch_indices = torch.arange(batch_size, device=candidate_chunks.device).unsqueeze(-1)
|
| 697 |
topk_candidates = candidate_chunks[batch_indices, shortlist_indices]
|
| 698 |
num_topk = topk_candidates.shape[1]
|
| 699 |
outputs["planner_topk_candidates"] = topk_candidates
|
| 700 |
+
proposal_mode_names = outputs.get("proposal_mode_names")
|
| 701 |
+
topk_proposal_mode_names = None
|
| 702 |
+
if proposal_mode_names is not None and use_proposal_candidates:
|
| 703 |
+
topk_proposal_mode_names = [
|
| 704 |
+
[proposal_mode_names[batch_idx][int(candidate_idx.item())] for candidate_idx in shortlist_indices[batch_idx]]
|
| 705 |
+
for batch_idx in range(batch_size)
|
| 706 |
+
]
|
| 707 |
+
outputs["planner_topk_mode_names"] = topk_proposal_mode_names
|
| 708 |
if proposal_logits is not None:
|
| 709 |
topk_proposal_logits = proposal_logits.gather(1, shortlist_indices)
|
| 710 |
else:
|
|
|
|
| 726 |
rollout_state=identity_rollout,
|
| 727 |
proposal_logits=topk_proposal_logits,
|
| 728 |
candidate_indices=shortlist_indices,
|
| 729 |
+
proposal_mode_names=topk_proposal_mode_names,
|
| 730 |
)
|
| 731 |
outputs["planned_rollout"] = identity_rollout
|
| 732 |
outputs["planned_chunk"] = selected["best_chunk"]
|
|
|
|
| 751 |
memory_tokens=self._tile_tensor(memory_output["memory_tokens"], num_topk),
|
| 752 |
scene_memory_tokens=self._tile_tensor(memory_output["scene_memory_tokens"], num_topk),
|
| 753 |
belief_memory_tokens=self._tile_tensor(memory_output["belief_memory_tokens"], num_topk),
|
| 754 |
+
task_names=[name for name in task_names for _ in range(num_topk)],
|
| 755 |
+
rollout_mode_override=rollout_mode_override,
|
| 756 |
)
|
| 757 |
reshaped_rollout = {
|
| 758 |
key: value.view(batch_size, num_topk, *value.shape[1:]) for key, value in rollout.items()
|
|
|
|
| 763 |
rollout_state=reshaped_rollout,
|
| 764 |
proposal_logits=topk_proposal_logits,
|
| 765 |
candidate_indices=shortlist_indices,
|
| 766 |
+
proposal_mode_names=topk_proposal_mode_names,
|
| 767 |
)
|
| 768 |
outputs["planned_rollout"] = reshaped_rollout
|
| 769 |
outputs["planned_chunk"] = selected["best_chunk"]
|
code/reveal_vla_bimanual/models/reveal_head.py
CHANGED
|
@@ -1,12 +1,103 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass
|
|
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
from torch import Tensor, nn
|
| 8 |
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
@dataclass
|
| 11 |
class RevealHeadConfig:
|
| 12 |
hidden_dim: int = 512
|
|
@@ -20,6 +111,7 @@ class RevealHeadConfig:
|
|
| 20 |
num_phases: int = 5
|
| 21 |
num_arm_roles: int = 4
|
| 22 |
num_interaction_tokens: int = 8
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class RevealStateHead(nn.Module):
|
|
@@ -379,6 +471,22 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 379 |
nn.GELU(),
|
| 380 |
nn.Linear(config.hidden_dim, config.num_support_modes),
|
| 381 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
def _pool_source(self, source_tokens: Tensor | None, fallback: Tensor) -> Tensor:
|
| 384 |
if source_tokens is None or source_tokens.numel() == 0:
|
|
@@ -403,6 +511,8 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 403 |
interaction_tokens: Tensor,
|
| 404 |
scene_tokens: Tensor | None = None,
|
| 405 |
memory_tokens: Tensor | None = None,
|
|
|
|
|
|
|
| 406 |
) -> dict[str, Tensor]:
|
| 407 |
batch_size = interaction_tokens.shape[0]
|
| 408 |
pooled_interaction = interaction_tokens.mean(dim=1)
|
|
@@ -423,6 +533,15 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 423 |
pooled_field = field_tokens.mean(dim=1)
|
| 424 |
summary_input = torch.cat([pooled_interaction, pooled_field, pooled_scene, pooled_memory], dim=-1)
|
| 425 |
latent_summary = self.summary_proj(summary_input)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
access_field = self.access_field(grid)
|
| 428 |
target_belief_field = self.target_belief_field(grid)
|
|
@@ -435,6 +554,23 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 435 |
reocclusion_field = torch.sigmoid(self.reocclusion_field(grid))
|
| 436 |
disturbance_field = torch.sigmoid(self.disturbance_field(grid))
|
| 437 |
uncertainty_field = F.softplus(self.uncertainty_field(grid))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
support_stability_prob = torch.sigmoid(support_stability_field)
|
| 440 |
risk_field = torch.sigmoid(
|
|
@@ -459,7 +595,7 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 459 |
arm_identity = self.arm_identity.weight.unsqueeze(0).expand(batch_size, -1, -1)
|
| 460 |
arm_tokens = pooled_interaction.unsqueeze(1).expand(-1, 2, -1) + arm_identity
|
| 461 |
arm_role_input = torch.cat(
|
| 462 |
-
[arm_tokens,
|
| 463 |
dim=-1,
|
| 464 |
)
|
| 465 |
arm_role_logits = self.arm_role_head(arm_role_input)
|
|
@@ -477,8 +613,8 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 477 |
risk_field.mean(dim=(-1, -2)).squeeze(1),
|
| 478 |
uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 479 |
access_prob.mean(dim=(-1, -2)).transpose(0, 1).transpose(0, 1),
|
| 480 |
-
self.support_mode(summary_input),
|
| 481 |
-
self.phase_head(summary_input),
|
| 482 |
arm_role_logits.reshape(batch_size, -1),
|
| 483 |
]
|
| 484 |
compact_state = torch.cat(
|
|
@@ -487,7 +623,7 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 487 |
)
|
| 488 |
|
| 489 |
output = {
|
| 490 |
-
"phase_logits": self.phase_head(summary_input),
|
| 491 |
"arm_role_logits": arm_role_logits,
|
| 492 |
"target_belief_field": target_belief_field,
|
| 493 |
"visibility_field": visibility_field,
|
|
@@ -502,20 +638,34 @@ class ElasticOcclusionFieldDecoder(nn.Module):
|
|
| 502 |
"uncertainty_field": uncertainty_field,
|
| 503 |
"interaction_tokens": interaction_tokens,
|
| 504 |
"field_tokens": field_tokens,
|
| 505 |
-
"latent_summary":
|
| 506 |
-
"support_mode_logits": self.support_mode(summary_input),
|
| 507 |
"corridor_logits": corridor_logits,
|
| 508 |
"persistence_horizon": persistence_horizon,
|
| 509 |
"disturbance_cost": disturbance_cost,
|
| 510 |
"belief_map": target_belief_map,
|
| 511 |
-
"reocclusion_logit": self.reocclusion_head(summary_input),
|
| 512 |
"persistence_uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 513 |
"access_field": access_field,
|
| 514 |
"uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 515 |
"compact_state": compact_state,
|
|
|
|
| 516 |
}
|
| 517 |
output["target_field"] = output["target_belief_field"]
|
| 518 |
output["actor_feasibility_field"] = output["clearance_field"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
return output
|
| 520 |
|
| 521 |
|
|
@@ -544,6 +694,8 @@ class ElasticOcclusionStateHead(nn.Module):
|
|
| 544 |
scene_tokens: Tensor,
|
| 545 |
memory_token: Tensor | None = None,
|
| 546 |
memory_tokens: Tensor | None = None,
|
|
|
|
|
|
|
| 547 |
) -> dict[str, Tensor]:
|
| 548 |
if memory_tokens is None:
|
| 549 |
memory_tokens = memory_token
|
|
@@ -558,4 +710,6 @@ class ElasticOcclusionStateHead(nn.Module):
|
|
| 558 |
interaction_tokens=interaction_tokens,
|
| 559 |
scene_tokens=scene_tokens,
|
| 560 |
memory_tokens=memory_tokens,
|
|
|
|
|
|
|
| 561 |
)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass
|
| 4 |
+
from typing import Sequence
|
| 5 |
|
| 6 |
import torch
|
| 7 |
import torch.nn.functional as F
|
| 8 |
from torch import Tensor, nn
|
| 9 |
|
| 10 |
|
| 11 |
+
# Task vocabulary shared by the reveal head and the world model.
HEAD_TASKS = ("generic", "foliage", "bag", "cloth")

# Scalar quality/risk metrics derived from the predicted occlusion fields.
TASK_METRIC_NAMES = (
    "opening_quality",
    "actor_feasibility_score",
    "gap_width",
    "damage_proxy",
    "release_collapse_rate",
    "target_visibility_confidence",
    "mouth_aperture",
    "hold_quality",
    "rim_slip_risk",
    "insertable_actor_corridor",
    "layer_separation_quality",
    "fold_preservation",
    "insertion_corridor",
    "top_layer_stability",
    "lift_too_much_risk",
)

# Task name -> embedding index; unknown names fall back to index 0 ("generic").
TASK_INDEX = dict(zip(HEAD_TASKS, range(len(HEAD_TASKS))))
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def task_ids_from_names(task_names: Sequence[str] | None, device: torch.device, batch_size: int) -> Tensor:
|
| 33 |
+
if task_names is None:
|
| 34 |
+
return torch.zeros(batch_size, device=device, dtype=torch.long)
|
| 35 |
+
return torch.as_tensor(
|
| 36 |
+
[TASK_INDEX.get(str(name), 0) for name in task_names],
|
| 37 |
+
device=device,
|
| 38 |
+
dtype=torch.long,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _mean_map(value: Tensor) -> Tensor:
|
| 43 |
+
return value.mean(dim=(-1, -2)).squeeze(1)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def compute_task_metrics_from_fields(
    *,
    access_field: Tensor,
    persistence_field: Tensor,
    disturbance_field: Tensor,
    reocclusion_field: Tensor,
    visibility_field: Tensor,
    clearance_field: Tensor,
    support_stability_field: Tensor,
    uncertainty_field: Tensor,
) -> dict[str, Tensor]:
    """Derive task-agnostic quality/risk metrics from the predicted occlusion fields.

    Logit-space inputs (access, visibility, clearance, support stability) are
    squashed with sigmoid here; persistence/disturbance/reocclusion are assumed
    already in [0, 1] and uncertainty non-negative — TODO confirm with the
    decoder that produces them. Returns four diagnostic maps plus one scalar
    per entry of ``TASK_METRIC_NAMES`` (each clamped to [0, 1]; ``gap_width``
    lives in [0.03, 0.24] before clamping).
    """

    def pooled(field: Tensor) -> Tensor:
        # Spatial mean of a single-channel map -> (B,) scalar per sample.
        return field.mean(dim=(-1, -2)).squeeze(1)

    access_prob = torch.sigmoid(access_field)
    # Best access probability across support modes at each cell.
    opening_mask = access_prob.amax(dim=1, keepdim=True)
    support_stability = torch.sigmoid(support_stability_field)
    visibility_prob = torch.sigmoid(visibility_field)
    # Average the (two-channel) clearance map down to one channel.
    clearance_prob = torch.sigmoid(clearance_field).mean(dim=1, keepdim=True)
    # Map unbounded uncertainty onto [0, 1): u / (1 + u).
    normalized_uncertainty = uncertainty_field / (1.0 + uncertainty_field)

    # Diagnostic maps.
    opening_quality_field = opening_mask * persistence_field * support_stability
    newly_revealed_field = torch.relu(visibility_prob - reocclusion_field)
    still_visible_field = visibility_prob * persistence_field
    reoccluded_field = reocclusion_field

    # Scalar summaries feeding the per-task metrics.
    opening_quality = pooled(opening_quality_field)
    actor_feasibility_score = 0.6 * pooled(clearance_prob) + 0.4 * pooled(opening_mask)
    base_gap = pooled(opening_mask)
    disturbance_cost = pooled(disturbance_field)
    support_quality = pooled(support_stability)
    visibility_confidence = pooled(visibility_prob * (1.0 - normalized_uncertainty))
    reocclusion_rate = pooled(reocclusion_field)
    persistence_score = pooled(persistence_field)

    metrics: dict[str, Tensor] = {
        "newly_revealed_field": newly_revealed_field,
        "still_visible_field": still_visible_field,
        "reoccluded_field": reoccluded_field,
        "opening_quality_field": opening_quality_field,
    }
    metrics["opening_quality"] = torch.clamp(opening_quality, 0.0, 1.0)
    metrics["actor_feasibility_score"] = torch.clamp(actor_feasibility_score, 0.0, 1.0)
    # Affine map of mean gap mass onto a nominal [0.03, 0.24] width.
    metrics["gap_width"] = 0.03 + 0.21 * torch.clamp(base_gap, 0.0, 1.0)
    metrics["damage_proxy"] = torch.clamp(disturbance_cost + 0.5 * (1.0 - support_quality), 0.0, 1.0)
    metrics["release_collapse_rate"] = torch.clamp(reocclusion_rate, 0.0, 1.0)
    metrics["target_visibility_confidence"] = torch.clamp(visibility_confidence, 0.0, 1.0)
    metrics["mouth_aperture"] = torch.clamp(base_gap, 0.0, 1.0)
    metrics["hold_quality"] = torch.clamp(0.5 * (persistence_score + support_quality), 0.0, 1.0)
    metrics["rim_slip_risk"] = torch.clamp(reocclusion_rate + 0.5 * (1.0 - support_quality), 0.0, 1.0)
    metrics["insertable_actor_corridor"] = torch.clamp(0.6 * actor_feasibility_score + 0.4 * base_gap, 0.0, 1.0)
    metrics["layer_separation_quality"] = torch.clamp(0.7 * base_gap + 0.3 * actor_feasibility_score, 0.0, 1.0)
    metrics["fold_preservation"] = torch.clamp(1.0 - disturbance_cost, 0.0, 1.0)
    metrics["insertion_corridor"] = torch.clamp(0.5 * actor_feasibility_score + 0.5 * base_gap, 0.0, 1.0)
    metrics["top_layer_stability"] = torch.clamp(support_quality, 0.0, 1.0)
    metrics["lift_too_much_risk"] = torch.clamp(disturbance_cost + 0.5 * torch.relu(base_gap - 0.5), 0.0, 1.0)
    return metrics
|
| 99 |
+
|
| 100 |
+
|
| 101 |
@dataclass
|
| 102 |
class RevealHeadConfig:
|
| 103 |
hidden_dim: int = 512
|
|
|
|
| 111 |
num_phases: int = 5
|
| 112 |
num_arm_roles: int = 4
|
| 113 |
num_interaction_tokens: int = 8
|
| 114 |
+
num_tasks: int = len(HEAD_TASKS)
|
| 115 |
|
| 116 |
|
| 117 |
class RevealStateHead(nn.Module):
|
|
|
|
| 471 |
nn.GELU(),
|
| 472 |
nn.Linear(config.hidden_dim, config.num_support_modes),
|
| 473 |
)
|
| 474 |
+
self.task_embedding = nn.Embedding(config.num_tasks, config.hidden_dim)
|
| 475 |
+
self.task_field_affine = nn.Linear(config.hidden_dim, config.hidden_dim * 2)
|
| 476 |
+
self.task_summary_adapter = nn.Sequential(
|
| 477 |
+
nn.LayerNorm(config.hidden_dim * 2),
|
| 478 |
+
nn.Linear(config.hidden_dim * 2, config.hidden_dim),
|
| 479 |
+
nn.GELU(),
|
| 480 |
+
)
|
| 481 |
+
self.task_phase_head = nn.Linear(config.hidden_dim, config.num_phases)
|
| 482 |
+
self.task_support_head = nn.Linear(config.hidden_dim, config.num_support_modes)
|
| 483 |
+
self.task_reocclusion_head = nn.Linear(config.hidden_dim, config.num_support_modes)
|
| 484 |
+
self.task_metric_head = nn.Sequential(
|
| 485 |
+
nn.LayerNorm(config.hidden_dim * 2),
|
| 486 |
+
nn.Linear(config.hidden_dim * 2, config.hidden_dim),
|
| 487 |
+
nn.GELU(),
|
| 488 |
+
nn.Linear(config.hidden_dim, len(TASK_METRIC_NAMES)),
|
| 489 |
+
)
|
| 490 |
|
| 491 |
def _pool_source(self, source_tokens: Tensor | None, fallback: Tensor) -> Tensor:
|
| 492 |
if source_tokens is None or source_tokens.numel() == 0:
|
|
|
|
| 511 |
interaction_tokens: Tensor,
|
| 512 |
scene_tokens: Tensor | None = None,
|
| 513 |
memory_tokens: Tensor | None = None,
|
| 514 |
+
task_names: Sequence[str] | None = None,
|
| 515 |
+
use_task_conditioning: bool = True,
|
| 516 |
) -> dict[str, Tensor]:
|
| 517 |
batch_size = interaction_tokens.shape[0]
|
| 518 |
pooled_interaction = interaction_tokens.mean(dim=1)
|
|
|
|
| 533 |
pooled_field = field_tokens.mean(dim=1)
|
| 534 |
summary_input = torch.cat([pooled_interaction, pooled_field, pooled_scene, pooled_memory], dim=-1)
|
| 535 |
latent_summary = self.summary_proj(summary_input)
|
| 536 |
+
task_ids = task_ids_from_names(task_names, interaction_tokens.device, batch_size)
|
| 537 |
+
task_embed = self.task_embedding(task_ids)
|
| 538 |
+
if use_task_conditioning:
|
| 539 |
+
scale, bias = self.task_field_affine(task_embed).chunk(2, dim=-1)
|
| 540 |
+
grid = grid * (1.0 + 0.1 * scale.view(batch_size, self.config.hidden_dim, 1, 1))
|
| 541 |
+
grid = grid + 0.1 * bias.view(batch_size, self.config.hidden_dim, 1, 1)
|
| 542 |
+
task_summary = latent_summary + 0.1 * self.task_summary_adapter(torch.cat([latent_summary, task_embed], dim=-1))
|
| 543 |
+
else:
|
| 544 |
+
task_summary = latent_summary
|
| 545 |
|
| 546 |
access_field = self.access_field(grid)
|
| 547 |
target_belief_field = self.target_belief_field(grid)
|
|
|
|
| 554 |
reocclusion_field = torch.sigmoid(self.reocclusion_field(grid))
|
| 555 |
disturbance_field = torch.sigmoid(self.disturbance_field(grid))
|
| 556 |
uncertainty_field = F.softplus(self.uncertainty_field(grid))
|
| 557 |
+
task_metrics = compute_task_metrics_from_fields(
|
| 558 |
+
access_field=access_field,
|
| 559 |
+
persistence_field=persistence_field,
|
| 560 |
+
disturbance_field=disturbance_field,
|
| 561 |
+
reocclusion_field=reocclusion_field,
|
| 562 |
+
visibility_field=visibility_field,
|
| 563 |
+
clearance_field=clearance_field,
|
| 564 |
+
support_stability_field=support_stability_field,
|
| 565 |
+
uncertainty_field=uncertainty_field,
|
| 566 |
+
)
|
| 567 |
+
metric_residuals = 0.05 * torch.tanh(
|
| 568 |
+
self.task_metric_head(torch.cat([task_summary, task_embed], dim=-1))
|
| 569 |
+
)
|
| 570 |
+
metric_residual_map = {
|
| 571 |
+
name: metric_residuals[:, idx]
|
| 572 |
+
for idx, name in enumerate(TASK_METRIC_NAMES)
|
| 573 |
+
}
|
| 574 |
|
| 575 |
support_stability_prob = torch.sigmoid(support_stability_field)
|
| 576 |
risk_field = torch.sigmoid(
|
|
|
|
| 595 |
arm_identity = self.arm_identity.weight.unsqueeze(0).expand(batch_size, -1, -1)
|
| 596 |
arm_tokens = pooled_interaction.unsqueeze(1).expand(-1, 2, -1) + arm_identity
|
| 597 |
arm_role_input = torch.cat(
|
| 598 |
+
[arm_tokens, task_summary.unsqueeze(1).expand(-1, 2, -1)],
|
| 599 |
dim=-1,
|
| 600 |
)
|
| 601 |
arm_role_logits = self.arm_role_head(arm_role_input)
|
|
|
|
| 613 |
risk_field.mean(dim=(-1, -2)).squeeze(1),
|
| 614 |
uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 615 |
access_prob.mean(dim=(-1, -2)).transpose(0, 1).transpose(0, 1),
|
| 616 |
+
self.support_mode(summary_input) + (self.task_support_head(task_summary) if use_task_conditioning else 0.0),
|
| 617 |
+
self.phase_head(summary_input) + (self.task_phase_head(task_summary) if use_task_conditioning else 0.0),
|
| 618 |
arm_role_logits.reshape(batch_size, -1),
|
| 619 |
]
|
| 620 |
compact_state = torch.cat(
|
|
|
|
| 623 |
)
|
| 624 |
|
| 625 |
output = {
|
| 626 |
+
"phase_logits": self.phase_head(summary_input) + (self.task_phase_head(task_summary) if use_task_conditioning else 0.0),
|
| 627 |
"arm_role_logits": arm_role_logits,
|
| 628 |
"target_belief_field": target_belief_field,
|
| 629 |
"visibility_field": visibility_field,
|
|
|
|
| 638 |
"uncertainty_field": uncertainty_field,
|
| 639 |
"interaction_tokens": interaction_tokens,
|
| 640 |
"field_tokens": field_tokens,
|
| 641 |
+
"latent_summary": task_summary,
|
| 642 |
+
"support_mode_logits": self.support_mode(summary_input) + (self.task_support_head(task_summary) if use_task_conditioning else 0.0),
|
| 643 |
"corridor_logits": corridor_logits,
|
| 644 |
"persistence_horizon": persistence_horizon,
|
| 645 |
"disturbance_cost": disturbance_cost,
|
| 646 |
"belief_map": target_belief_map,
|
| 647 |
+
"reocclusion_logit": self.reocclusion_head(summary_input) + (self.task_reocclusion_head(task_summary) if use_task_conditioning else 0.0),
|
| 648 |
"persistence_uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 649 |
"access_field": access_field,
|
| 650 |
"uncertainty": uncertainty_field.mean(dim=(-1, -2)).squeeze(1),
|
| 651 |
"compact_state": compact_state,
|
| 652 |
+
"task_ids": task_ids,
|
| 653 |
}
|
| 654 |
output["target_field"] = output["target_belief_field"]
|
| 655 |
output["actor_feasibility_field"] = output["clearance_field"]
|
| 656 |
+
output.update(
|
| 657 |
+
{
|
| 658 |
+
"newly_revealed_field": task_metrics["newly_revealed_field"],
|
| 659 |
+
"still_visible_field": task_metrics["still_visible_field"],
|
| 660 |
+
"reoccluded_field": task_metrics["reoccluded_field"],
|
| 661 |
+
"opening_quality_field": task_metrics["opening_quality_field"],
|
| 662 |
+
}
|
| 663 |
+
)
|
| 664 |
+
for name in TASK_METRIC_NAMES:
|
| 665 |
+
if name == "gap_width":
|
| 666 |
+
output[name] = torch.clamp(task_metrics[name] + 0.01 * metric_residual_map[name], 0.0, 1.0)
|
| 667 |
+
else:
|
| 668 |
+
output[name] = torch.clamp(task_metrics[name] + metric_residual_map[name], 0.0, 1.0)
|
| 669 |
return output
|
| 670 |
|
| 671 |
|
|
|
|
| 694 |
scene_tokens: Tensor,
|
| 695 |
memory_token: Tensor | None = None,
|
| 696 |
memory_tokens: Tensor | None = None,
|
| 697 |
+
task_names: Sequence[str] | None = None,
|
| 698 |
+
use_task_conditioning: bool = True,
|
| 699 |
) -> dict[str, Tensor]:
|
| 700 |
if memory_tokens is None:
|
| 701 |
memory_tokens = memory_token
|
|
|
|
| 710 |
interaction_tokens=interaction_tokens,
|
| 711 |
scene_tokens=scene_tokens,
|
| 712 |
memory_tokens=memory_tokens,
|
| 713 |
+
task_names=task_names,
|
| 714 |
+
use_task_conditioning=use_task_conditioning,
|
| 715 |
)
|
code/reveal_vla_bimanual/models/world_model.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass
|
|
|
|
| 4 |
|
| 5 |
import torch
|
|
|
|
| 6 |
from torch import Tensor, nn
|
| 7 |
|
| 8 |
-
from models.reveal_head import InteractionFieldDecoder
|
| 9 |
|
| 10 |
|
| 11 |
@dataclass
|
|
@@ -24,6 +26,8 @@ class RevealWMConfig:
|
|
| 24 |
predict_belief_map: bool = True
|
| 25 |
scene_bank_size: int = 2
|
| 26 |
belief_bank_size: int = 2
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class RevealWM(nn.Module):
|
|
@@ -167,6 +171,7 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 167 |
+ config.num_phases
|
| 168 |
+ (2 * config.num_arm_roles)
|
| 169 |
)
|
|
|
|
| 170 |
self.state_encoder = nn.Sequential(
|
| 171 |
nn.LayerNorm(compact_state_dim),
|
| 172 |
nn.Linear(compact_state_dim, config.hidden_dim),
|
|
@@ -203,6 +208,43 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 203 |
self.disturbance_head = nn.Linear(config.hidden_dim, field_elements)
|
| 204 |
self.uncertainty_head = nn.Linear(config.hidden_dim, field_elements)
|
| 205 |
self.access_head = nn.Linear(config.hidden_dim, config.num_support_modes * field_elements)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
def _compact_from_state(self, interaction_state: dict[str, Tensor]) -> Tensor:
|
| 208 |
if "compact_state" in interaction_state:
|
|
@@ -226,6 +268,32 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 226 |
]
|
| 227 |
return torch.cat([component if component.ndim > 1 else component.unsqueeze(-1) for component in components], dim=-1)
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
def _decode_fields(self, latent: Tensor) -> dict[str, Tensor]:
|
| 230 |
batch_size = latent.shape[0]
|
| 231 |
side = self.config.field_size
|
|
@@ -257,7 +325,7 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 257 |
weighted_persistence = (persistence_field.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
|
| 258 |
access_mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
|
| 259 |
persistence_horizon = self.config.rollout_horizon * weighted_persistence / access_mass
|
| 260 |
-
|
| 261 |
"target_belief_field": target_belief_field,
|
| 262 |
"visibility_field": visibility_field,
|
| 263 |
"clearance_field": clearance_field,
|
|
@@ -282,6 +350,96 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 282 |
"target_field": target_belief_field,
|
| 283 |
"actor_feasibility_field": clearance_field,
|
| 284 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
def forward(
|
| 287 |
self,
|
|
@@ -291,6 +449,8 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 291 |
memory_tokens: Tensor | None = None,
|
| 292 |
scene_memory_tokens: Tensor | None = None,
|
| 293 |
belief_memory_tokens: Tensor | None = None,
|
|
|
|
|
|
|
| 294 |
) -> dict[str, Tensor]:
|
| 295 |
if scene_memory_tokens is None:
|
| 296 |
scene_memory_tokens = interaction_state.get("scene_memory_tokens")
|
|
@@ -305,14 +465,57 @@ class ElasticOcclusionWorldModel(nn.Module):
|
|
| 305 |
if belief_memory_tokens is None:
|
| 306 |
belief_memory_tokens = scene_tokens[:, :1]
|
| 307 |
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
scene_memory = self.scene_memory_proj(scene_memory_tokens.mean(dim=1))
|
| 310 |
belief_memory = self.belief_memory_proj(belief_memory_tokens.mean(dim=1))
|
| 311 |
outputs: dict[str, list[Tensor]] = {}
|
| 312 |
scene_bias = scene_tokens.mean(dim=1)
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
for step in range(action_chunk.shape[1]):
|
| 315 |
-
action_latent = self.action_encoder(action_chunk[:, step])
|
| 316 |
transition_input = torch.cat([latent, action_latent, scene_memory, belief_memory], dim=-1)
|
| 317 |
latent = self.transition(transition_input, latent + 0.1 * scene_bias)
|
| 318 |
scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(latent))
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass
|
| 4 |
+
from typing import Sequence
|
| 5 |
|
| 6 |
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
from torch import Tensor, nn
|
| 9 |
|
| 10 |
+
from models.reveal_head import InteractionFieldDecoder, compute_task_metrics_from_fields, task_ids_from_names
|
| 11 |
|
| 12 |
|
| 13 |
@dataclass
|
|
|
|
| 26 |
predict_belief_map: bool = True
|
| 27 |
scene_bank_size: int = 2
|
| 28 |
belief_bank_size: int = 2
|
| 29 |
+
rollout_mode: str = "spatial_rollout"
|
| 30 |
+
num_tasks: int = 4
|
| 31 |
|
| 32 |
|
| 33 |
class RevealWM(nn.Module):
|
|
|
|
| 171 |
+ config.num_phases
|
| 172 |
+ (2 * config.num_arm_roles)
|
| 173 |
)
|
| 174 |
+
self.compact_state_dim = compact_state_dim
|
| 175 |
self.state_encoder = nn.Sequential(
|
| 176 |
nn.LayerNorm(compact_state_dim),
|
| 177 |
nn.Linear(compact_state_dim, config.hidden_dim),
|
|
|
|
| 208 |
self.disturbance_head = nn.Linear(config.hidden_dim, field_elements)
|
| 209 |
self.uncertainty_head = nn.Linear(config.hidden_dim, field_elements)
|
| 210 |
self.access_head = nn.Linear(config.hidden_dim, config.num_support_modes * field_elements)
|
| 211 |
+
field_channels = 12 + config.num_support_modes
|
| 212 |
+
spatial_hidden = max(32, config.hidden_dim // 2)
|
| 213 |
+
self.task_embedding = nn.Embedding(config.num_tasks, config.hidden_dim)
|
| 214 |
+
self.spatial_field_encoder = nn.Sequential(
|
| 215 |
+
nn.Conv2d(field_channels, spatial_hidden, kernel_size=3, padding=1),
|
| 216 |
+
nn.GELU(),
|
| 217 |
+
nn.Conv2d(spatial_hidden, config.hidden_dim, kernel_size=3, padding=1),
|
| 218 |
+
nn.GELU(),
|
| 219 |
+
)
|
| 220 |
+
self.spatial_context_proj = nn.Sequential(
|
| 221 |
+
nn.LayerNorm(config.hidden_dim * 4),
|
| 222 |
+
nn.Linear(config.hidden_dim * 4, config.hidden_dim),
|
| 223 |
+
nn.GELU(),
|
| 224 |
+
)
|
| 225 |
+
self.spatial_gate_z = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
|
| 226 |
+
self.spatial_gate_r = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
|
| 227 |
+
self.spatial_candidate = nn.Conv2d(config.hidden_dim * 2, config.hidden_dim, kernel_size=3, padding=1)
|
| 228 |
+
self.spatial_summary_proj = nn.Sequential(
|
| 229 |
+
nn.LayerNorm(config.hidden_dim * 3),
|
| 230 |
+
nn.Linear(config.hidden_dim * 3, config.hidden_dim),
|
| 231 |
+
nn.GELU(),
|
| 232 |
+
)
|
| 233 |
+
self.spatial_phase_head = nn.Linear(config.hidden_dim, config.num_phases)
|
| 234 |
+
self.spatial_support_mode_head = nn.Linear(config.hidden_dim, config.num_support_modes)
|
| 235 |
+
self.spatial_arm_role_head = nn.Linear(config.hidden_dim, 2 * config.num_arm_roles)
|
| 236 |
+
self.spatial_reocclusion_head = nn.Linear(config.hidden_dim, config.num_support_modes)
|
| 237 |
+
self.spatial_target_belief_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 238 |
+
self.spatial_visibility_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 239 |
+
self.spatial_clearance_head = nn.Conv2d(config.hidden_dim, 2, kernel_size=1)
|
| 240 |
+
self.spatial_occluder_contact_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 241 |
+
self.spatial_grasp_affordance_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 242 |
+
self.spatial_support_stability_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 243 |
+
self.spatial_persistence_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 244 |
+
self.spatial_reocclusion_field_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 245 |
+
self.spatial_disturbance_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 246 |
+
self.spatial_uncertainty_head = nn.Conv2d(config.hidden_dim, 1, kernel_size=1)
|
| 247 |
+
self.spatial_access_head = nn.Conv2d(config.hidden_dim, config.num_support_modes, kernel_size=1)
|
| 248 |
|
| 249 |
def _compact_from_state(self, interaction_state: dict[str, Tensor]) -> Tensor:
|
| 250 |
if "compact_state" in interaction_state:
|
|
|
|
| 268 |
]
|
| 269 |
return torch.cat([component if component.ndim > 1 else component.unsqueeze(-1) for component in components], dim=-1)
|
| 270 |
|
| 271 |
+
def _repeat_state_rollout(self, interaction_state: dict[str, Tensor], horizon: int) -> dict[str, Tensor]:
|
| 272 |
+
rollout: dict[str, Tensor] = {}
|
| 273 |
+
for key, value in interaction_state.items():
|
| 274 |
+
if isinstance(value, Tensor):
|
| 275 |
+
rollout[key] = value.unsqueeze(1).expand(-1, horizon, *value.shape[1:])
|
| 276 |
+
return rollout
|
| 277 |
+
|
| 278 |
+
def _stack_state_fields(self, interaction_state: dict[str, Tensor]) -> Tensor:
|
| 279 |
+
return torch.cat(
|
| 280 |
+
[
|
| 281 |
+
interaction_state["target_belief_field"],
|
| 282 |
+
interaction_state["visibility_field"],
|
| 283 |
+
interaction_state["clearance_field"],
|
| 284 |
+
interaction_state["occluder_contact_field"],
|
| 285 |
+
interaction_state["grasp_affordance_field"],
|
| 286 |
+
interaction_state["support_stability_field"],
|
| 287 |
+
interaction_state["persistence_field"],
|
| 288 |
+
interaction_state["reocclusion_field"],
|
| 289 |
+
interaction_state["disturbance_field"],
|
| 290 |
+
interaction_state["risk_field"],
|
| 291 |
+
interaction_state["uncertainty_field"],
|
| 292 |
+
interaction_state["access_field"],
|
| 293 |
+
],
|
| 294 |
+
dim=1,
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
def _decode_fields(self, latent: Tensor) -> dict[str, Tensor]:
|
| 298 |
batch_size = latent.shape[0]
|
| 299 |
side = self.config.field_size
|
|
|
|
| 325 |
weighted_persistence = (persistence_field.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
|
| 326 |
access_mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
|
| 327 |
persistence_horizon = self.config.rollout_horizon * weighted_persistence / access_mass
|
| 328 |
+
outputs = {
|
| 329 |
"target_belief_field": target_belief_field,
|
| 330 |
"visibility_field": visibility_field,
|
| 331 |
"clearance_field": clearance_field,
|
|
|
|
| 350 |
"target_field": target_belief_field,
|
| 351 |
"actor_feasibility_field": clearance_field,
|
| 352 |
}
|
| 353 |
+
outputs.update(
|
| 354 |
+
compute_task_metrics_from_fields(
|
| 355 |
+
access_field=access_field,
|
| 356 |
+
persistence_field=persistence_field,
|
| 357 |
+
disturbance_field=disturbance_field,
|
| 358 |
+
reocclusion_field=reocclusion_field,
|
| 359 |
+
visibility_field=visibility_field,
|
| 360 |
+
clearance_field=clearance_field,
|
| 361 |
+
support_stability_field=support_stability_field,
|
| 362 |
+
uncertainty_field=uncertainty_field,
|
| 363 |
+
)
|
| 364 |
+
)
|
| 365 |
+
return outputs
|
| 366 |
+
|
| 367 |
+
def _decode_spatial_fields(self, hidden: Tensor, summary: Tensor) -> dict[str, Tensor]:
    """Decode the spatial latent into the full occlusion-field output dict.

    ``hidden`` is a (B, C, H, W) spatial latent and ``summary`` a (B, D)
    pooled latent — assumed shapes, TODO confirm against the caller.
    Returns the same key set as the non-spatial decoder, plus the derived
    task metrics from ``compute_task_metrics_from_fields``.
    """
    # Raw (logit-space) per-cell maps.
    belief = self.spatial_target_belief_head(hidden)
    visibility = self.spatial_visibility_head(hidden)
    clearance = self.spatial_clearance_head(hidden)
    contact = self.spatial_occluder_contact_head(hidden)
    grasp = self.spatial_grasp_affordance_head(hidden)
    support = self.spatial_support_stability_head(hidden)
    access = self.spatial_access_head(hidden)
    # Bounded maps: probabilities via sigmoid, non-negative uncertainty via softplus.
    persistence = torch.sigmoid(self.spatial_persistence_head(hidden))
    reocclusion = torch.sigmoid(self.spatial_reocclusion_field_head(hidden))
    disturbance = torch.sigmoid(self.spatial_disturbance_head(hidden))
    uncertainty = F.softplus(self.spatial_uncertainty_head(hidden))

    support_prob = torch.sigmoid(support)
    # Aggregate hazard: disturbance + likely reocclusion + instability + model doubt.
    risk = torch.sigmoid(
        disturbance + 0.75 * reocclusion + 0.5 * (1.0 - support_prob) + 0.25 * uncertainty
    )

    # Collapse rows with a max, then resample the remaining axis onto the
    # approach-template grid to produce per-template corridor logits.
    corridor_logits = F.interpolate(
        access.amax(dim=-2),
        size=self.config.num_approach_templates,
        mode="linear",
        align_corners=False,
    )

    # Access-probability-weighted mean persistence, scaled to the rollout length.
    access_prob = torch.sigmoid(access)
    weighted = (persistence.expand_as(access_prob) * access_prob).sum(dim=(-1, -2))
    mass = access_prob.sum(dim=(-1, -2)).clamp_min(1e-4)
    horizon = self.config.rollout_horizon * weighted / mass

    batch = summary.shape[0]
    outputs: dict[str, Tensor] = {
        "target_belief_field": belief,
        "visibility_field": visibility,
        "clearance_field": clearance,
        "occluder_contact_field": contact,
        "grasp_affordance_field": grasp,
        "support_stability_field": support,
        "persistence_field": persistence,
        "reocclusion_field": reocclusion,
        "disturbance_field": disturbance,
        "risk_field": risk,
        "uncertainty_field": uncertainty,
        "access_field": access,
        "corridor_logits": corridor_logits,
        "persistence_horizon": horizon,
        "disturbance_cost": disturbance.mean(dim=(-1, -2)).squeeze(1),
        "belief_map": F.interpolate(
            belief,
            size=(self.config.belief_map_size, self.config.belief_map_size),
            mode="bilinear",
            align_corners=False,
        ),
        # Aliases kept for callers that use the older key names.
        "target_field": belief,
        "actor_feasibility_field": clearance,
        "compact_state": self.compact_decoder(summary),
        "phase_logits": self.spatial_phase_head(summary),
        "arm_role_logits": self.spatial_arm_role_head(summary).view(batch, 2, self.config.num_arm_roles),
        "support_mode_logits": self.spatial_support_mode_head(summary),
        "reocclusion_logit": self.spatial_reocclusion_head(summary),
        "uncertainty": uncertainty.mean(dim=(-1, -2)).squeeze(1),
    }
    outputs.update(
        compute_task_metrics_from_fields(
            access_field=access,
            persistence_field=persistence,
            disturbance_field=disturbance,
            reocclusion_field=reocclusion,
            visibility_field=visibility,
            clearance_field=clearance,
            support_stability_field=support,
            uncertainty_field=uncertainty,
        )
    )
    return outputs
|
| 443 |
|
| 444 |
def forward(
|
| 445 |
self,
|
|
|
|
| 449 |
memory_tokens: Tensor | None = None,
|
| 450 |
scene_memory_tokens: Tensor | None = None,
|
| 451 |
belief_memory_tokens: Tensor | None = None,
|
| 452 |
+
task_names: Sequence[str] | None = None,
|
| 453 |
+
rollout_mode_override: str | None = None,
|
| 454 |
) -> dict[str, Tensor]:
|
| 455 |
if scene_memory_tokens is None:
|
| 456 |
scene_memory_tokens = interaction_state.get("scene_memory_tokens")
|
|
|
|
| 465 |
if belief_memory_tokens is None:
|
| 466 |
belief_memory_tokens = scene_tokens[:, :1]
|
| 467 |
|
| 468 |
+
rollout_mode = rollout_mode_override or self.config.rollout_mode
|
| 469 |
+
horizon = action_chunk.shape[1]
|
| 470 |
+
if rollout_mode in {"null_rollout", "identity_rollout"}:
|
| 471 |
+
repeated = self._repeat_state_rollout(interaction_state, horizon)
|
| 472 |
+
if "scene_memory_tokens" not in repeated:
|
| 473 |
+
repeated["scene_memory_tokens"] = scene_memory_tokens.unsqueeze(1).expand(-1, horizon, *scene_memory_tokens.shape[1:])
|
| 474 |
+
if "belief_memory_tokens" not in repeated:
|
| 475 |
+
repeated["belief_memory_tokens"] = belief_memory_tokens.unsqueeze(1).expand(-1, horizon, *belief_memory_tokens.shape[1:])
|
| 476 |
+
if "memory_tokens" not in repeated:
|
| 477 |
+
repeated["memory_tokens"] = torch.cat(
|
| 478 |
+
[repeated["scene_memory_tokens"], repeated["belief_memory_tokens"]],
|
| 479 |
+
dim=2,
|
| 480 |
+
)
|
| 481 |
+
if "memory_token" not in repeated:
|
| 482 |
+
repeated["memory_token"] = repeated["memory_tokens"].mean(dim=2, keepdim=True)
|
| 483 |
+
return repeated
|
| 484 |
+
|
| 485 |
+
task_ids = task_ids_from_names(task_names, scene_tokens.device, scene_tokens.shape[0])
|
| 486 |
+
task_embed = self.task_embedding(task_ids)
|
| 487 |
+
latent = self.state_encoder(self._compact_from_state(interaction_state)) + 0.1 * task_embed
|
| 488 |
scene_memory = self.scene_memory_proj(scene_memory_tokens.mean(dim=1))
|
| 489 |
belief_memory = self.belief_memory_proj(belief_memory_tokens.mean(dim=1))
|
| 490 |
outputs: dict[str, list[Tensor]] = {}
|
| 491 |
scene_bias = scene_tokens.mean(dim=1)
|
| 492 |
|
| 493 |
+
if rollout_mode == "spatial_rollout":
|
| 494 |
+
hidden = self.spatial_field_encoder(self._stack_state_fields(interaction_state))
|
| 495 |
+
spatial_context = self.spatial_context_proj(torch.cat([scene_bias, scene_memory, belief_memory, task_embed], dim=-1))
|
| 496 |
+
hidden = hidden + spatial_context.unsqueeze(-1).unsqueeze(-1)
|
| 497 |
+
for step in range(horizon):
|
| 498 |
+
action_latent = self.action_encoder(action_chunk[:, step]) + 0.1 * task_embed
|
| 499 |
+
input_map = (action_latent + spatial_context).unsqueeze(-1).unsqueeze(-1).expand_as(hidden)
|
| 500 |
+
z = torch.sigmoid(self.spatial_gate_z(torch.cat([hidden, input_map], dim=1)))
|
| 501 |
+
r = torch.sigmoid(self.spatial_gate_r(torch.cat([hidden, input_map], dim=1)))
|
| 502 |
+
candidate = torch.tanh(self.spatial_candidate(torch.cat([r * hidden, input_map], dim=1)))
|
| 503 |
+
hidden = (1.0 - z) * hidden + z * candidate
|
| 504 |
+
pooled_hidden = hidden.mean(dim=(-1, -2))
|
| 505 |
+
scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(pooled_hidden))
|
| 506 |
+
belief_memory = 0.65 * belief_memory + 0.35 * torch.tanh(self.belief_memory_update(pooled_hidden))
|
| 507 |
+
summary = self.spatial_summary_proj(torch.cat([pooled_hidden, scene_bias, task_embed], dim=-1))
|
| 508 |
+
decoded = self._decode_spatial_fields(hidden, summary)
|
| 509 |
+
decoded["scene_memory_tokens"] = scene_memory.unsqueeze(1).expand(-1, self.config.scene_bank_size, -1)
|
| 510 |
+
decoded["belief_memory_tokens"] = belief_memory.unsqueeze(1).expand(-1, self.config.belief_bank_size, -1)
|
| 511 |
+
decoded["memory_tokens"] = torch.cat([decoded["scene_memory_tokens"], decoded["belief_memory_tokens"]], dim=1)
|
| 512 |
+
decoded["memory_token"] = decoded["memory_tokens"].mean(dim=1, keepdim=True)
|
| 513 |
+
for key, value in decoded.items():
|
| 514 |
+
outputs.setdefault(key, []).append(value)
|
| 515 |
+
return {key: torch.stack(values, dim=1) for key, values in outputs.items()}
|
| 516 |
+
|
| 517 |
for step in range(action_chunk.shape[1]):
|
| 518 |
+
action_latent = self.action_encoder(action_chunk[:, step]) + 0.1 * task_embed
|
| 519 |
transition_input = torch.cat([latent, action_latent, scene_memory, belief_memory], dim=-1)
|
| 520 |
latent = self.transition(transition_input, latent + 0.1 * scene_bias)
|
| 521 |
scene_memory = 0.75 * scene_memory + 0.25 * torch.tanh(self.scene_memory_update(latent))
|
code/reveal_vla_bimanual/scripts/run_rlbench_handoff_eval.sh
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
ROOT_DIR="${ROOT_DIR:-/workspace}"
|
| 5 |
+
PROJECT_DIR="${ROOT_DIR}/reveal_vla_bimanual"
|
| 6 |
+
PYTHON_BIN="${PYTHON_BIN:-${ROOT_DIR}/envs/rlbench/bin/python}"
|
| 7 |
+
OUTPUT_ROOT="${OUTPUT_ROOT:-${ROOT_DIR}/reports/rlbench_handoff_matrix}"
|
| 8 |
+
EPISODES_PER_TASK="${EPISODES_PER_TASK:-1}"
|
| 9 |
+
EPISODE_LENGTH="${EPISODE_LENGTH:-20}"
|
| 10 |
+
RESOLUTION="${RESOLUTION:-224}"
|
| 11 |
+
CHUNK_COMMIT_STEPS="${CHUNK_COMMIT_STEPS:-4}"
|
| 12 |
+
BASELINE_CHECKPOINT="${BASELINE_CHECKPOINT:-${ROOT_DIR}/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt}"
|
| 13 |
+
SPATIAL_CHECKPOINT="${SPATIAL_CHECKPOINT:-${ROOT_DIR}/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/checkpoint_best.pt}"
|
| 14 |
+
|
| 15 |
+
source "${ROOT_DIR}/VLAarchtests_work/environment/runtime_env_vars.sh"
|
| 16 |
+
|
| 17 |
+
run_sweep() {
|
| 18 |
+
local output_dir="$1"
|
| 19 |
+
shift
|
| 20 |
+
mkdir -p "${output_dir}"
|
| 21 |
+
(
|
| 22 |
+
cd "${PROJECT_DIR}"
|
| 23 |
+
PYTHONPATH="${PROJECT_DIR}" "${PYTHON_BIN}" -m eval.run_peract2_task_sweep "$@"
|
| 24 |
+
)
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
mkdir -p "${OUTPUT_ROOT}"
|
| 28 |
+
|
| 29 |
+
run_sweep \
|
| 30 |
+
"${OUTPUT_ROOT}/baseline" \
|
| 31 |
+
--checkpoint "${BASELINE_CHECKPOINT}" \
|
| 32 |
+
--output-root "${OUTPUT_ROOT}/baseline" \
|
| 33 |
+
--run-name-prefix baseline_rgbd_seed17 \
|
| 34 |
+
--episodes-per-task "${EPISODES_PER_TASK}" \
|
| 35 |
+
--episode-length "${EPISODE_LENGTH}" \
|
| 36 |
+
--resolution "${RESOLUTION}" \
|
| 37 |
+
--chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
|
| 38 |
+
--allow-unsupervised-planning \
|
| 39 |
+
--headless \
|
| 40 |
+
--skip-noplan
|
| 41 |
+
|
| 42 |
+
run_sweep \
|
| 43 |
+
"${OUTPUT_ROOT}/spatial_full" \
|
| 44 |
+
--checkpoint "${SPATIAL_CHECKPOINT}" \
|
| 45 |
+
--output-root "${OUTPUT_ROOT}/spatial_full" \
|
| 46 |
+
--run-name-prefix spatial_phase_seed17 \
|
| 47 |
+
--episodes-per-task "${EPISODES_PER_TASK}" \
|
| 48 |
+
--episode-length "${EPISODE_LENGTH}" \
|
| 49 |
+
--resolution "${RESOLUTION}" \
|
| 50 |
+
--chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
|
| 51 |
+
--allow-unsupervised-planning \
|
| 52 |
+
--headless
|
| 53 |
+
|
| 54 |
+
run_sweep \
|
| 55 |
+
"${OUTPUT_ROOT}/spatial_nogeom" \
|
| 56 |
+
--checkpoint "${SPATIAL_CHECKPOINT}" \
|
| 57 |
+
--output-root "${OUTPUT_ROOT}/spatial_nogeom" \
|
| 58 |
+
--run-name-prefix spatial_phase_nogeom_seed17 \
|
| 59 |
+
--episodes-per-task "${EPISODES_PER_TASK}" \
|
| 60 |
+
--episode-length "${EPISODE_LENGTH}" \
|
| 61 |
+
--resolution "${RESOLUTION}" \
|
| 62 |
+
--chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
|
| 63 |
+
--allow-unsupervised-planning \
|
| 64 |
+
--headless \
|
| 65 |
+
--no-geometry \
|
| 66 |
+
--skip-noplan
|
| 67 |
+
|
| 68 |
+
run_sweep \
|
| 69 |
+
"${OUTPUT_ROOT}/spatial_compactwm" \
|
| 70 |
+
--checkpoint "${SPATIAL_CHECKPOINT}" \
|
| 71 |
+
--output-root "${OUTPUT_ROOT}/spatial_compactwm" \
|
| 72 |
+
--run-name-prefix spatial_phase_compactwm_seed17 \
|
| 73 |
+
--episodes-per-task "${EPISODES_PER_TASK}" \
|
| 74 |
+
--episode-length "${EPISODE_LENGTH}" \
|
| 75 |
+
--resolution "${RESOLUTION}" \
|
| 76 |
+
--chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
|
| 77 |
+
--allow-unsupervised-planning \
|
| 78 |
+
--headless \
|
| 79 |
+
--compact-world-model \
|
| 80 |
+
--skip-noplan
|
| 81 |
+
|
| 82 |
+
run_sweep \
|
| 83 |
+
"${OUTPUT_ROOT}/spatial_notask" \
|
| 84 |
+
--checkpoint "${SPATIAL_CHECKPOINT}" \
|
| 85 |
+
--output-root "${OUTPUT_ROOT}/spatial_notask" \
|
| 86 |
+
--run-name-prefix spatial_phase_notask_seed17 \
|
| 87 |
+
--episodes-per-task "${EPISODES_PER_TASK}" \
|
| 88 |
+
--episode-length "${EPISODE_LENGTH}" \
|
| 89 |
+
--resolution "${RESOLUTION}" \
|
| 90 |
+
--chunk-commit-steps "${CHUNK_COMMIT_STEPS}" \
|
| 91 |
+
--allow-unsupervised-planning \
|
| 92 |
+
--headless \
|
| 93 |
+
--disable-task-conditioning \
|
| 94 |
+
--skip-noplan
|
| 95 |
+
|
| 96 |
+
(
|
| 97 |
+
cd "${PROJECT_DIR}"
|
| 98 |
+
PYTHONPATH="${PROJECT_DIR}" "${PYTHON_BIN}" -m eval.compare_rlbench_sweeps \
|
| 99 |
+
--reference-label baseline_plan \
|
| 100 |
+
--output-dir "${OUTPUT_ROOT}/comparison" \
|
| 101 |
+
--run "baseline_plan=${OUTPUT_ROOT}/baseline/baseline_rgbd_seed17_plan_split/rollout_eval.json" \
|
| 102 |
+
--run "spatial_noplan=${OUTPUT_ROOT}/spatial_full/spatial_phase_seed17_noplan_split/rollout_eval.json" \
|
| 103 |
+
--run "spatial_plan=${OUTPUT_ROOT}/spatial_full/spatial_phase_seed17_plan_split/rollout_eval.json" \
|
| 104 |
+
--run "spatial_nogeom=${OUTPUT_ROOT}/spatial_nogeom/spatial_phase_nogeom_seed17_plan_split/rollout_eval.json" \
|
| 105 |
+
--run "spatial_compactwm=${OUTPUT_ROOT}/spatial_compactwm/spatial_phase_compactwm_seed17_plan_split/rollout_eval.json" \
|
| 106 |
+
--run "spatial_notask=${OUTPUT_ROOT}/spatial_notask/spatial_phase_notask_seed17_plan_split/rollout_eval.json"
|
| 107 |
+
)
|
code/reveal_vla_bimanual/sim_reveal/dataset.py
CHANGED
|
@@ -26,6 +26,11 @@ LEGACY_PRIVILEGED_RENDER_KEYS = frozenset(
|
|
| 26 |
)
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def _assert_noleak_sample(sample: dict[str, Any]) -> None:
|
| 30 |
render_state = sample.get("render_state", {})
|
| 31 |
leaked_keys = sorted(LEGACY_PRIVILEGED_RENDER_KEYS.intersection(render_state))
|
|
@@ -102,6 +107,8 @@ def collect_teacher_dataset(
|
|
| 102 |
"language_goal": observation["text"],
|
| 103 |
"action_chunk": action_chunk.astype("float32"),
|
| 104 |
"support_mode": int(privileged_state["support_mode"]),
|
|
|
|
|
|
|
| 105 |
"corridor_feasible": privileged_state["corridor_feasible"].astype("float32"),
|
| 106 |
"persistence_horizon": privileged_state["persistence_horizon"].astype("float32"),
|
| 107 |
"disturbance_cost": float(privileged_state["disturbance_cost"]),
|
|
@@ -114,7 +121,21 @@ def collect_teacher_dataset(
|
|
| 114 |
"support_stability_map": privileged_state["support_stability_map"].astype("float32"),
|
| 115 |
"reocclusion_target": float(privileged_state["reocclusion_target"]),
|
| 116 |
"reocclusion_map": privileged_state["reocclusion_map"].astype("float32"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
"rollout_support_mode": rollout["rollout_support_mode"].astype("int64"),
|
|
|
|
| 118 |
"rollout_corridor_feasible": rollout["rollout_corridor_feasible"].astype("float32"),
|
| 119 |
"rollout_persistence_horizon": rollout["rollout_persistence_horizon"].astype("float32"),
|
| 120 |
"rollout_disturbance_cost": rollout["rollout_disturbance_cost"].astype("float32"),
|
|
@@ -189,7 +210,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 189 |
return len(self.samples)
|
| 190 |
|
| 191 |
def _render_cache_key(self, sample: dict[str, Any], render_state: dict[str, Any]) -> bytes:
|
| 192 |
-
include_depth = sample.get("dataset_version")
|
| 193 |
return pickle.dumps(
|
| 194 |
(sample["proxy_name"], self.resolution, include_depth, render_state),
|
| 195 |
protocol=4,
|
|
@@ -200,7 +221,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 200 |
cached = self._render_cache.get(cache_key)
|
| 201 |
if cached is not None:
|
| 202 |
return cached
|
| 203 |
-
include_depth = sample.get("dataset_version")
|
| 204 |
rendered = render_views_from_state(
|
| 205 |
proxy_name=sample["proxy_name"],
|
| 206 |
render_state=render_state,
|
|
@@ -216,6 +237,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 216 |
return cached_item
|
| 217 |
sample = self.samples[index]
|
| 218 |
_assert_noleak_sample(sample)
|
|
|
|
| 219 |
images = self._render_sample(sample, sample["render_state"])
|
| 220 |
history_images = []
|
| 221 |
history_depths = []
|
|
@@ -232,7 +254,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 232 |
dim=0,
|
| 233 |
)
|
| 234 |
)
|
| 235 |
-
if sample.get("dataset_version")
|
| 236 |
history_depths.append(
|
| 237 |
torch.stack(
|
| 238 |
[
|
|
@@ -267,7 +289,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 267 |
history_stacked = torch.stack(history_images, dim=0).permute(0, 1, 4, 2, 3).float() / 255.0
|
| 268 |
else:
|
| 269 |
history_stacked = torch.zeros((0, 3, 3, self.resolution, self.resolution), dtype=torch.float32)
|
| 270 |
-
if sample.get("dataset_version")
|
| 271 |
depths = torch.stack(
|
| 272 |
[
|
| 273 |
torch.from_numpy(images["front_depth"]),
|
|
@@ -317,6 +339,8 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 317 |
"texts": sample["language_goal"],
|
| 318 |
"action_chunk": torch.as_tensor(sample["action_chunk"], dtype=torch.float32),
|
| 319 |
"support_mode": torch.as_tensor(sample["support_mode"], dtype=torch.long),
|
|
|
|
|
|
|
| 320 |
"corridor_feasible": torch.as_tensor(sample["corridor_feasible"], dtype=torch.float32),
|
| 321 |
"persistence_horizon": torch.as_tensor(sample["persistence_horizon"], dtype=torch.float32),
|
| 322 |
"disturbance_cost": torch.as_tensor(sample["disturbance_cost"], dtype=torch.float32),
|
|
@@ -329,7 +353,21 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 329 |
"support_stability_map": torch.as_tensor(sample.get("support_stability_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
|
| 330 |
"reocclusion_target": torch.as_tensor(sample.get("reocclusion_target", 0.0), dtype=torch.float32),
|
| 331 |
"reocclusion_map": torch.as_tensor(sample.get("reocclusion_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
"rollout_support_mode": torch.as_tensor(sample["rollout_support_mode"], dtype=torch.long),
|
|
|
|
| 333 |
"rollout_corridor_feasible": torch.as_tensor(sample["rollout_corridor_feasible"], dtype=torch.float32),
|
| 334 |
"rollout_persistence_horizon": torch.as_tensor(sample["rollout_persistence_horizon"], dtype=torch.float32),
|
| 335 |
"rollout_disturbance_cost": torch.as_tensor(sample["rollout_disturbance_cost"], dtype=torch.float32),
|
|
@@ -342,6 +380,7 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 342 |
"rollout_grasp_affordance_map": torch.as_tensor(sample.get("rollout_grasp_affordance_map", np.zeros((0, 32, 32), dtype=np.float32)), dtype=torch.float32),
|
| 343 |
"candidate_action_chunks": torch.as_tensor(sample["candidate_action_chunks"], dtype=torch.float32),
|
| 344 |
"candidate_rollout_support_mode": torch.as_tensor(sample["candidate_rollout_support_mode"], dtype=torch.long),
|
|
|
|
| 345 |
"candidate_rollout_corridor_feasible": torch.as_tensor(sample["candidate_rollout_corridor_feasible"], dtype=torch.float32),
|
| 346 |
"candidate_rollout_persistence_horizon": torch.as_tensor(sample["candidate_rollout_persistence_horizon"], dtype=torch.float32),
|
| 347 |
"candidate_rollout_disturbance_cost": torch.as_tensor(sample["candidate_rollout_disturbance_cost"], dtype=torch.float32),
|
|
@@ -356,8 +395,23 @@ class RevealOfflineDataset(Dataset[dict[str, Any]]):
|
|
| 356 |
"candidate_final_disturbance_cost": torch.as_tensor(sample["candidate_final_disturbance_cost"], dtype=torch.float32),
|
| 357 |
"candidate_reocclusion_rate": torch.as_tensor(sample["candidate_reocclusion_rate"], dtype=torch.float32),
|
| 358 |
"candidate_visibility_integral": torch.as_tensor(sample["candidate_visibility_integral"], dtype=torch.float32),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
"candidate_risk": torch.as_tensor(sample["candidate_risk"], dtype=torch.float32),
|
| 360 |
"candidate_utility": torch.as_tensor(sample["candidate_utility"], dtype=torch.float32),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
"proxy_name": sample["proxy_name"],
|
| 362 |
"episode_id": sample["episode_id"],
|
| 363 |
}
|
|
|
|
| 26 |
)
|
| 27 |
|
| 28 |
|
| 29 |
+
def dataset_uses_rgbd(dataset_version: Any) -> bool:
|
| 30 |
+
version = str(dataset_version or "")
|
| 31 |
+
return version.startswith(RGBD_PROXY_DATASET_VERSION)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
def _assert_noleak_sample(sample: dict[str, Any]) -> None:
|
| 35 |
render_state = sample.get("render_state", {})
|
| 36 |
leaked_keys = sorted(LEGACY_PRIVILEGED_RENDER_KEYS.intersection(render_state))
|
|
|
|
| 107 |
"language_goal": observation["text"],
|
| 108 |
"action_chunk": action_chunk.astype("float32"),
|
| 109 |
"support_mode": int(privileged_state["support_mode"]),
|
| 110 |
+
"phase": int(privileged_state.get("phase_label", 0)),
|
| 111 |
+
"subgoal_progress": float(privileged_state.get("subgoal_progress", 0.0)),
|
| 112 |
"corridor_feasible": privileged_state["corridor_feasible"].astype("float32"),
|
| 113 |
"persistence_horizon": privileged_state["persistence_horizon"].astype("float32"),
|
| 114 |
"disturbance_cost": float(privileged_state["disturbance_cost"]),
|
|
|
|
| 121 |
"support_stability_map": privileged_state["support_stability_map"].astype("float32"),
|
| 122 |
"reocclusion_target": float(privileged_state["reocclusion_target"]),
|
| 123 |
"reocclusion_map": privileged_state["reocclusion_map"].astype("float32"),
|
| 124 |
+
"gap_width": float(privileged_state.get("gap_width", 0.0)),
|
| 125 |
+
"damage_proxy": float(privileged_state.get("damage_proxy", 0.0)),
|
| 126 |
+
"release_collapse_rate": float(privileged_state.get("release_collapse_rate", 0.0)),
|
| 127 |
+
"target_visibility_confidence": float(privileged_state.get("target_visibility_confidence", 0.0)),
|
| 128 |
+
"mouth_aperture": float(privileged_state.get("mouth_aperture", 0.0)),
|
| 129 |
+
"hold_quality": float(privileged_state.get("hold_quality", 0.0)),
|
| 130 |
+
"rim_slip_risk": float(privileged_state.get("rim_slip_risk", 0.0)),
|
| 131 |
+
"insertable_actor_corridor": float(privileged_state.get("insertable_actor_corridor", 0.0)),
|
| 132 |
+
"layer_separation_quality": float(privileged_state.get("layer_separation_quality", 0.0)),
|
| 133 |
+
"fold_preservation": float(privileged_state.get("fold_preservation", 0.0)),
|
| 134 |
+
"insertion_corridor": float(privileged_state.get("insertion_corridor", 0.0)),
|
| 135 |
+
"top_layer_stability": float(privileged_state.get("top_layer_stability", 0.0)),
|
| 136 |
+
"lift_too_much_risk": float(privileged_state.get("lift_too_much_risk", 0.0)),
|
| 137 |
"rollout_support_mode": rollout["rollout_support_mode"].astype("int64"),
|
| 138 |
+
"rollout_phase": rollout.get("rollout_phase", np.zeros((rollout["rollout_support_mode"].shape[0],), dtype=np.int64)).astype("int64"),
|
| 139 |
"rollout_corridor_feasible": rollout["rollout_corridor_feasible"].astype("float32"),
|
| 140 |
"rollout_persistence_horizon": rollout["rollout_persistence_horizon"].astype("float32"),
|
| 141 |
"rollout_disturbance_cost": rollout["rollout_disturbance_cost"].astype("float32"),
|
|
|
|
| 210 |
return len(self.samples)
|
| 211 |
|
| 212 |
def _render_cache_key(self, sample: dict[str, Any], render_state: dict[str, Any]) -> bytes:
|
| 213 |
+
include_depth = dataset_uses_rgbd(sample.get("dataset_version"))
|
| 214 |
return pickle.dumps(
|
| 215 |
(sample["proxy_name"], self.resolution, include_depth, render_state),
|
| 216 |
protocol=4,
|
|
|
|
| 221 |
cached = self._render_cache.get(cache_key)
|
| 222 |
if cached is not None:
|
| 223 |
return cached
|
| 224 |
+
include_depth = dataset_uses_rgbd(sample.get("dataset_version"))
|
| 225 |
rendered = render_views_from_state(
|
| 226 |
proxy_name=sample["proxy_name"],
|
| 227 |
render_state=render_state,
|
|
|
|
| 237 |
return cached_item
|
| 238 |
sample = self.samples[index]
|
| 239 |
_assert_noleak_sample(sample)
|
| 240 |
+
candidate_count = int(sample.get("candidate_action_chunks", np.zeros((0, 0, 0), dtype=np.float32)).shape[0])
|
| 241 |
images = self._render_sample(sample, sample["render_state"])
|
| 242 |
history_images = []
|
| 243 |
history_depths = []
|
|
|
|
| 254 |
dim=0,
|
| 255 |
)
|
| 256 |
)
|
| 257 |
+
if dataset_uses_rgbd(sample.get("dataset_version")):
|
| 258 |
history_depths.append(
|
| 259 |
torch.stack(
|
| 260 |
[
|
|
|
|
| 289 |
history_stacked = torch.stack(history_images, dim=0).permute(0, 1, 4, 2, 3).float() / 255.0
|
| 290 |
else:
|
| 291 |
history_stacked = torch.zeros((0, 3, 3, self.resolution, self.resolution), dtype=torch.float32)
|
| 292 |
+
if dataset_uses_rgbd(sample.get("dataset_version")):
|
| 293 |
depths = torch.stack(
|
| 294 |
[
|
| 295 |
torch.from_numpy(images["front_depth"]),
|
|
|
|
| 339 |
"texts": sample["language_goal"],
|
| 340 |
"action_chunk": torch.as_tensor(sample["action_chunk"], dtype=torch.float32),
|
| 341 |
"support_mode": torch.as_tensor(sample["support_mode"], dtype=torch.long),
|
| 342 |
+
"phase": torch.as_tensor(sample.get("phase", 0), dtype=torch.long),
|
| 343 |
+
"subgoal_progress": torch.as_tensor(sample.get("subgoal_progress", 0.0), dtype=torch.float32),
|
| 344 |
"corridor_feasible": torch.as_tensor(sample["corridor_feasible"], dtype=torch.float32),
|
| 345 |
"persistence_horizon": torch.as_tensor(sample["persistence_horizon"], dtype=torch.float32),
|
| 346 |
"disturbance_cost": torch.as_tensor(sample["disturbance_cost"], dtype=torch.float32),
|
|
|
|
| 353 |
"support_stability_map": torch.as_tensor(sample.get("support_stability_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
|
| 354 |
"reocclusion_target": torch.as_tensor(sample.get("reocclusion_target", 0.0), dtype=torch.float32),
|
| 355 |
"reocclusion_map": torch.as_tensor(sample.get("reocclusion_map", np.zeros((32, 32), dtype=np.float32)), dtype=torch.float32).unsqueeze(0),
|
| 356 |
+
"gap_width": torch.as_tensor(sample.get("gap_width", 0.0), dtype=torch.float32),
|
| 357 |
+
"damage_proxy": torch.as_tensor(sample.get("damage_proxy", 0.0), dtype=torch.float32),
|
| 358 |
+
"release_collapse_rate": torch.as_tensor(sample.get("release_collapse_rate", 0.0), dtype=torch.float32),
|
| 359 |
+
"target_visibility_confidence": torch.as_tensor(sample.get("target_visibility_confidence", 0.0), dtype=torch.float32),
|
| 360 |
+
"mouth_aperture": torch.as_tensor(sample.get("mouth_aperture", 0.0), dtype=torch.float32),
|
| 361 |
+
"hold_quality": torch.as_tensor(sample.get("hold_quality", 0.0), dtype=torch.float32),
|
| 362 |
+
"rim_slip_risk": torch.as_tensor(sample.get("rim_slip_risk", 0.0), dtype=torch.float32),
|
| 363 |
+
"insertable_actor_corridor": torch.as_tensor(sample.get("insertable_actor_corridor", 0.0), dtype=torch.float32),
|
| 364 |
+
"layer_separation_quality": torch.as_tensor(sample.get("layer_separation_quality", 0.0), dtype=torch.float32),
|
| 365 |
+
"fold_preservation": torch.as_tensor(sample.get("fold_preservation", 0.0), dtype=torch.float32),
|
| 366 |
+
"insertion_corridor": torch.as_tensor(sample.get("insertion_corridor", 0.0), dtype=torch.float32),
|
| 367 |
+
"top_layer_stability": torch.as_tensor(sample.get("top_layer_stability", 0.0), dtype=torch.float32),
|
| 368 |
+
"lift_too_much_risk": torch.as_tensor(sample.get("lift_too_much_risk", 0.0), dtype=torch.float32),
|
| 369 |
"rollout_support_mode": torch.as_tensor(sample["rollout_support_mode"], dtype=torch.long),
|
| 370 |
+
"rollout_phase": torch.as_tensor(sample.get("rollout_phase", np.zeros((0,), dtype=np.int64)), dtype=torch.long),
|
| 371 |
"rollout_corridor_feasible": torch.as_tensor(sample["rollout_corridor_feasible"], dtype=torch.float32),
|
| 372 |
"rollout_persistence_horizon": torch.as_tensor(sample["rollout_persistence_horizon"], dtype=torch.float32),
|
| 373 |
"rollout_disturbance_cost": torch.as_tensor(sample["rollout_disturbance_cost"], dtype=torch.float32),
|
|
|
|
| 380 |
"rollout_grasp_affordance_map": torch.as_tensor(sample.get("rollout_grasp_affordance_map", np.zeros((0, 32, 32), dtype=np.float32)), dtype=torch.float32),
|
| 381 |
"candidate_action_chunks": torch.as_tensor(sample["candidate_action_chunks"], dtype=torch.float32),
|
| 382 |
"candidate_rollout_support_mode": torch.as_tensor(sample["candidate_rollout_support_mode"], dtype=torch.long),
|
| 383 |
+
"candidate_rollout_phase": torch.as_tensor(sample.get("candidate_rollout_phase", np.zeros((0, 0), dtype=np.int64)), dtype=torch.long),
|
| 384 |
"candidate_rollout_corridor_feasible": torch.as_tensor(sample["candidate_rollout_corridor_feasible"], dtype=torch.float32),
|
| 385 |
"candidate_rollout_persistence_horizon": torch.as_tensor(sample["candidate_rollout_persistence_horizon"], dtype=torch.float32),
|
| 386 |
"candidate_rollout_disturbance_cost": torch.as_tensor(sample["candidate_rollout_disturbance_cost"], dtype=torch.float32),
|
|
|
|
| 395 |
"candidate_final_disturbance_cost": torch.as_tensor(sample["candidate_final_disturbance_cost"], dtype=torch.float32),
|
| 396 |
"candidate_reocclusion_rate": torch.as_tensor(sample["candidate_reocclusion_rate"], dtype=torch.float32),
|
| 397 |
"candidate_visibility_integral": torch.as_tensor(sample["candidate_visibility_integral"], dtype=torch.float32),
|
| 398 |
+
"candidate_actor_feasibility_auc": torch.as_tensor(sample.get("candidate_actor_feasibility_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 399 |
+
"candidate_reveal_achieved": torch.as_tensor(sample.get("candidate_reveal_achieved", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 400 |
+
"candidate_hold_persistence": torch.as_tensor(sample.get("candidate_hold_persistence", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 401 |
+
"candidate_support_stability_auc": torch.as_tensor(sample.get("candidate_support_stability_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 402 |
+
"candidate_disturbance_auc": torch.as_tensor(sample.get("candidate_disturbance_auc", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 403 |
+
"candidate_macro_ids": torch.as_tensor(sample.get("candidate_macro_ids", np.zeros((candidate_count,), dtype=np.int64)), dtype=torch.long),
|
| 404 |
+
"candidate_is_hard_negative": torch.as_tensor(sample.get("candidate_is_hard_negative", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 405 |
"candidate_risk": torch.as_tensor(sample["candidate_risk"], dtype=torch.float32),
|
| 406 |
"candidate_utility": torch.as_tensor(sample["candidate_utility"], dtype=torch.float32),
|
| 407 |
+
"candidate_gap_width": torch.as_tensor(sample.get("candidate_gap_width", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 408 |
+
"candidate_damage_proxy": torch.as_tensor(sample.get("candidate_damage_proxy", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 409 |
+
"candidate_mouth_aperture": torch.as_tensor(sample.get("candidate_mouth_aperture", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 410 |
+
"candidate_hold_quality": torch.as_tensor(sample.get("candidate_hold_quality", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 411 |
+
"candidate_rim_slip_risk": torch.as_tensor(sample.get("candidate_rim_slip_risk", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 412 |
+
"candidate_fold_preservation": torch.as_tensor(sample.get("candidate_fold_preservation", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 413 |
+
"candidate_layer_separation_quality": torch.as_tensor(sample.get("candidate_layer_separation_quality", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 414 |
+
"candidate_lift_too_much_risk": torch.as_tensor(sample.get("candidate_lift_too_much_risk", np.zeros((candidate_count,), dtype=np.float32)), dtype=torch.float32),
|
| 415 |
"proxy_name": sample["proxy_name"],
|
| 416 |
"episode_id": sample["episode_id"],
|
| 417 |
}
|
code/reveal_vla_bimanual/sim_reveal/procedural_envs.py
CHANGED
|
@@ -347,6 +347,53 @@ class ProceduralRevealEnv:
|
|
| 347 |
horizon_ratio = persistence[current_mode] / float(max(1, self.rollout_horizon))
|
| 348 |
return float(np.clip(1.0 - horizon_ratio + 0.35 * self.disturbance, 0.0, 1.0))
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
def _grasp_affordance_map(
|
| 351 |
self,
|
| 352 |
belief_map: np.ndarray,
|
|
@@ -374,6 +421,40 @@ class ProceduralRevealEnv:
|
|
| 374 |
reocclusion_target = self._reocclusion_target(persistence)
|
| 375 |
reocclusion_map = np.full((32, 32), reocclusion_target, dtype=np.float32)
|
| 376 |
grasp_affordance_map = self._grasp_affordance_map(belief_map, visibility_map, clearance_map)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
return {
|
| 378 |
"support_mode": support_mode,
|
| 379 |
"corridor_feasible": corridor,
|
|
@@ -391,6 +472,9 @@ class ProceduralRevealEnv:
|
|
| 391 |
"visibility": visibility,
|
| 392 |
"retrieval_success": bool(self.retrieved),
|
| 393 |
"target_template": self.target_template,
|
|
|
|
|
|
|
|
|
|
| 394 |
}
|
| 395 |
|
| 396 |
def render_state(self, privileged_state: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
@@ -467,6 +551,99 @@ class ProceduralRevealEnv:
|
|
| 467 |
action[13] = np.float32(1.0 if retrieve else -1.0)
|
| 468 |
return action
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
def teacher_chunk_and_rollout(
|
| 471 |
self,
|
| 472 |
chunk_horizon: int = 8,
|
|
@@ -486,6 +663,7 @@ class ProceduralRevealEnv:
|
|
| 486 |
rollout_reocclusion = []
|
| 487 |
rollout_occluder_contact = []
|
| 488 |
rollout_grasp_affordance = []
|
|
|
|
| 489 |
for step in range(chunk_horizon):
|
| 490 |
action = self.teacher_action()
|
| 491 |
action_chunk.append(action)
|
|
@@ -502,6 +680,7 @@ class ProceduralRevealEnv:
|
|
| 502 |
rollout_reocclusion.append(privileged_state["reocclusion_target"])
|
| 503 |
rollout_occluder_contact.append(privileged_state["occluder_contact_map"])
|
| 504 |
rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"])
|
|
|
|
| 505 |
if terminated or truncated:
|
| 506 |
break
|
| 507 |
while len(action_chunk) < chunk_horizon:
|
|
@@ -519,6 +698,7 @@ class ProceduralRevealEnv:
|
|
| 519 |
rollout_reocclusion.append(current["reocclusion_target"])
|
| 520 |
rollout_occluder_contact.append(current["occluder_contact_map"])
|
| 521 |
rollout_grasp_affordance.append(current["grasp_affordance_map"])
|
|
|
|
| 522 |
self.restore_state(snapshot)
|
| 523 |
return np.stack(action_chunk, axis=0).astype(np.float32), {
|
| 524 |
"rollout_support_mode": np.asarray(rollout_support_mode, dtype=np.int64),
|
|
@@ -532,6 +712,7 @@ class ProceduralRevealEnv:
|
|
| 532 |
"rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
|
| 533 |
"rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
|
| 534 |
"rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
|
|
|
|
| 535 |
}
|
| 536 |
|
| 537 |
def evaluate_action_chunk(
|
|
@@ -552,8 +733,12 @@ class ProceduralRevealEnv:
|
|
| 552 |
rollout_reocclusion: list[float] = []
|
| 553 |
rollout_occluder_contact: list[np.ndarray] = []
|
| 554 |
rollout_grasp_affordance: list[np.ndarray] = []
|
|
|
|
| 555 |
corridor_open_trace = [float(self.get_privileged_state()["corridor_feasible"][self._current_support_mode()].any())]
|
| 556 |
visibility_trace = [float(self.get_privileged_state()["visibility"])]
|
|
|
|
|
|
|
|
|
|
| 557 |
terminated = False
|
| 558 |
truncated = False
|
| 559 |
privileged_state = self.get_privileged_state()
|
|
@@ -571,8 +756,12 @@ class ProceduralRevealEnv:
|
|
| 571 |
rollout_reocclusion.append(float(privileged_state["reocclusion_target"]))
|
| 572 |
rollout_occluder_contact.append(privileged_state["occluder_contact_map"].astype(np.float32))
|
| 573 |
rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"].astype(np.float32))
|
|
|
|
| 574 |
corridor_open_trace.append(float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any()))
|
| 575 |
visibility_trace.append(float(privileged_state["visibility"]))
|
|
|
|
|
|
|
|
|
|
| 576 |
if terminated or truncated:
|
| 577 |
break
|
| 578 |
while len(rollout_support_mode) < rollout_horizon:
|
|
@@ -588,11 +777,17 @@ class ProceduralRevealEnv:
|
|
| 588 |
rollout_reocclusion.append(float(current["reocclusion_target"]))
|
| 589 |
rollout_occluder_contact.append(current["occluder_contact_map"].astype(np.float32))
|
| 590 |
rollout_grasp_affordance.append(current["grasp_affordance_map"].astype(np.float32))
|
|
|
|
| 591 |
final_state = self.get_privileged_state()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
reocclusion = float(
|
| 593 |
np.logical_and(
|
| 594 |
-
|
| 595 |
-
|
| 596 |
).mean()
|
| 597 |
) if len(corridor_open_trace) > 1 else 0.0
|
| 598 |
result: dict[str, np.ndarray | float] = {
|
|
@@ -607,11 +802,29 @@ class ProceduralRevealEnv:
|
|
| 607 |
"rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
|
| 608 |
"rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
|
| 609 |
"rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
|
|
|
|
| 610 |
"retrieval_success": float(final_state["retrieval_success"]),
|
| 611 |
"final_disturbance_cost": float(final_state["disturbance_cost"]),
|
| 612 |
"reocclusion_rate": reocclusion,
|
| 613 |
-
"visibility_integral": float(np.sum(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
self.restore_state(snapshot)
|
| 616 |
return result
|
| 617 |
|
|
@@ -625,16 +838,70 @@ class ProceduralRevealEnv:
|
|
| 625 |
teacher_chunk = np.asarray(teacher_chunk, dtype=np.float32)
|
| 626 |
candidates = [teacher_chunk.astype(np.float32)]
|
| 627 |
outcomes = [self.evaluate_action_chunk(teacher_chunk, rollout_horizon=rollout_horizon)]
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
candidates.append(candidate.astype(np.float32))
|
| 635 |
outcomes.append(self.evaluate_action_chunk(candidate, rollout_horizon=rollout_horizon))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
stacked_outcomes = {
|
| 637 |
"candidate_rollout_support_mode": np.stack([item["rollout_support_mode"] for item in outcomes], axis=0).astype(np.int64),
|
|
|
|
| 638 |
"candidate_rollout_corridor_feasible": np.stack(
|
| 639 |
[item["rollout_corridor_feasible"] for item in outcomes], axis=0
|
| 640 |
).astype(np.float32),
|
|
@@ -671,6 +938,13 @@ class ProceduralRevealEnv:
|
|
| 671 |
),
|
| 672 |
"candidate_reocclusion_rate": np.asarray([item["reocclusion_rate"] for item in outcomes], dtype=np.float32),
|
| 673 |
"candidate_visibility_integral": np.asarray([item["visibility_integral"] for item in outcomes], dtype=np.float32),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
}
|
| 675 |
stacked_outcomes["candidate_risk"] = np.clip(
|
| 676 |
stacked_outcomes["candidate_final_disturbance_cost"] + stacked_outcomes["candidate_reocclusion_rate"],
|
|
@@ -680,6 +954,19 @@ class ProceduralRevealEnv:
|
|
| 680 |
stacked_outcomes["candidate_utility"] = (
|
| 681 |
stacked_outcomes["candidate_retrieval_success"] - stacked_outcomes["candidate_risk"]
|
| 682 |
).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
return np.stack(candidates, axis=0).astype(np.float32), stacked_outcomes
|
| 684 |
|
| 685 |
def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
|
|
|
|
| 347 |
horizon_ratio = persistence[current_mode] / float(max(1, self.rollout_horizon))
|
| 348 |
return float(np.clip(1.0 - horizon_ratio + 0.35 * self.disturbance, 0.0, 1.0))
|
| 349 |
|
| 350 |
+
def _phase_label(
|
| 351 |
+
self,
|
| 352 |
+
visibility: float,
|
| 353 |
+
corridor: np.ndarray,
|
| 354 |
+
persistence: np.ndarray,
|
| 355 |
+
disturbance_cost: float,
|
| 356 |
+
) -> int:
|
| 357 |
+
support_mode = int(self._current_support_mode())
|
| 358 |
+
corridor_ready = bool(corridor[support_mode, self.target_template] > 0.5)
|
| 359 |
+
persistence_ratio = persistence[support_mode] / float(max(1, self.rollout_horizon))
|
| 360 |
+
opening_ready = self.opening >= (0.75 * self.dynamics.desired_opening)
|
| 361 |
+
retrieve_ready = (
|
| 362 |
+
corridor_ready
|
| 363 |
+
and visibility >= self.dynamics.retrieve_visibility_threshold
|
| 364 |
+
and self.actor_progress >= 0.55
|
| 365 |
+
)
|
| 366 |
+
recovering = disturbance_cost >= 0.55 or (opening_ready and persistence_ratio < 0.35)
|
| 367 |
+
if retrieve_ready:
|
| 368 |
+
return 3
|
| 369 |
+
if recovering:
|
| 370 |
+
return 4
|
| 371 |
+
if opening_ready and persistence_ratio >= 0.6:
|
| 372 |
+
return 2
|
| 373 |
+
if self.opening < self.dynamics.desired_opening or visibility < self.dynamics.retrieve_visibility_threshold:
|
| 374 |
+
return 1
|
| 375 |
+
return 0
|
| 376 |
+
|
| 377 |
+
def _subgoal_progress(
|
| 378 |
+
self,
|
| 379 |
+
visibility: float,
|
| 380 |
+
corridor: np.ndarray,
|
| 381 |
+
persistence: np.ndarray,
|
| 382 |
+
) -> float:
|
| 383 |
+
support_mode = int(self._current_support_mode())
|
| 384 |
+
corridor_mass = float(corridor[support_mode].mean())
|
| 385 |
+
persistence_ratio = float(persistence[support_mode] / float(max(1, self.rollout_horizon)))
|
| 386 |
+
return float(
|
| 387 |
+
np.clip(
|
| 388 |
+
0.35 * self.opening
|
| 389 |
+
+ 0.25 * visibility
|
| 390 |
+
+ 0.20 * corridor_mass
|
| 391 |
+
+ 0.20 * persistence_ratio,
|
| 392 |
+
0.0,
|
| 393 |
+
1.0,
|
| 394 |
+
)
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
def _grasp_affordance_map(
|
| 398 |
self,
|
| 399 |
belief_map: np.ndarray,
|
|
|
|
| 421 |
reocclusion_target = self._reocclusion_target(persistence)
|
| 422 |
reocclusion_map = np.full((32, 32), reocclusion_target, dtype=np.float32)
|
| 423 |
grasp_affordance_map = self._grasp_affordance_map(belief_map, visibility_map, clearance_map)
|
| 424 |
+
task_metrics: dict[str, float] = {}
|
| 425 |
+
if self.proxy_name == FOLIAGE_PROXY.name:
|
| 426 |
+
task_metrics = {
|
| 427 |
+
"gap_width": float(np.clip(0.03 + 0.16 * self.opening, 0.03, 0.24)),
|
| 428 |
+
"damage_proxy": disturbance_cost,
|
| 429 |
+
"release_collapse_rate": reocclusion_target,
|
| 430 |
+
"target_visibility_confidence": visibility,
|
| 431 |
+
}
|
| 432 |
+
elif self.proxy_name == BAG_PROXY.name:
|
| 433 |
+
task_metrics = {
|
| 434 |
+
"mouth_aperture": float(self.opening),
|
| 435 |
+
"hold_quality": support_stability,
|
| 436 |
+
"rim_slip_risk": reocclusion_target,
|
| 437 |
+
"insertable_actor_corridor": float(corridor[support_mode, self.target_template]),
|
| 438 |
+
}
|
| 439 |
+
elif self.proxy_name == CLOTH_PROXY.name:
|
| 440 |
+
task_metrics = {
|
| 441 |
+
"layer_separation_quality": float(np.clip(self.opening * (1.0 - 0.20 * self.disturbance), 0.0, 1.0)),
|
| 442 |
+
"fold_preservation": float(np.clip(1.0 - disturbance_cost, 0.0, 1.0)),
|
| 443 |
+
"insertion_corridor": float(corridor[support_mode, self.target_template]),
|
| 444 |
+
"top_layer_stability": support_stability,
|
| 445 |
+
"lift_too_much_risk": float(np.clip(max(0.0, self.opening - self.dynamics.desired_opening), 0.0, 1.0)),
|
| 446 |
+
}
|
| 447 |
+
phase_label = self._phase_label(
|
| 448 |
+
visibility=visibility,
|
| 449 |
+
corridor=corridor,
|
| 450 |
+
persistence=persistence,
|
| 451 |
+
disturbance_cost=disturbance_cost,
|
| 452 |
+
)
|
| 453 |
+
subgoal_progress = self._subgoal_progress(
|
| 454 |
+
visibility=visibility,
|
| 455 |
+
corridor=corridor,
|
| 456 |
+
persistence=persistence,
|
| 457 |
+
)
|
| 458 |
return {
|
| 459 |
"support_mode": support_mode,
|
| 460 |
"corridor_feasible": corridor,
|
|
|
|
| 472 |
"visibility": visibility,
|
| 473 |
"retrieval_success": bool(self.retrieved),
|
| 474 |
"target_template": self.target_template,
|
| 475 |
+
"phase_label": int(phase_label),
|
| 476 |
+
"subgoal_progress": float(subgoal_progress),
|
| 477 |
+
**task_metrics,
|
| 478 |
}
|
| 479 |
|
| 480 |
def render_state(self, privileged_state: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
|
|
| 551 |
action[13] = np.float32(1.0 if retrieve else -1.0)
|
| 552 |
return action
|
| 553 |
|
| 554 |
+
def _set_mode_bits(self, action: np.ndarray, mode: SupportMode) -> None:
|
| 555 |
+
action[1] = np.float32(1.0 if mode == SupportMode.TRANSFER else -1.0)
|
| 556 |
+
action[2] = np.float32(1.0 if mode == SupportMode.PASSIVE else -1.0)
|
| 557 |
+
action[6] = np.float32(1.0 if mode == SupportMode.HOLD else -1.0)
|
| 558 |
+
|
| 559 |
+
    def macro_action_chunk(self, macro_name: str, chunk_horizon: int = 8) -> np.ndarray:
        """Build a (chunk_horizon, 14) action chunk for a named macro behavior.

        Each step starts from the teacher action, then overrides the retrieve
        flag (index 13), the actor-advance channel (index 8), the effort
        channel (index 0), support-mode bits (via ``_set_mode_bits``) and the
        approach-template channel (index 7) per the requested macro. Macros
        cover both positive behaviors (widen/maintain/insert/retrieve) and
        deliberate hard negatives (premature retrieve, wrong-side reveal,
        over-disturbance, ...). Unknown macro names fall back to the plain
        teacher action. All actions are clipped to [-1, 1].

        Args:
            macro_name: name of the macro behavior to synthesize.
            chunk_horizon: number of steps in the returned chunk.

        Returns:
            float32 array of shape (chunk_horizon, 14).
        """
        preferred_mode = self.dynamics.preferred_mode
        hold_mode = SupportMode.HOLD
        passive_mode = SupportMode.PASSIVE
        target_index = self.target_template
        # Lateral sweep targets: 4 templates to either side of the target, clamped.
        left_index = max(0, target_index - 4)
        right_index = min(self.num_templates - 1, target_index + 4)
        # "Wrong" reveal aims at the far end of the template range from the target.
        wrong_index = 0 if target_index > (self.num_templates // 2) else self.num_templates - 1

        chunk = np.zeros((chunk_horizon, 14), dtype=np.float32)
        for step_idx in range(chunk_horizon):
            # Default scaffold: teacher action with retrieve off and low actor advance.
            action = self.teacher_action()
            action[13] = np.float32(-1.0)
            action[8] = np.float32(0.2)
            self._set_mode_bits(action, preferred_mode)

            if macro_name in {"widen_gap", "widen_mouth", "lift_edge", "separate_layer"}:
                # Aggressive opening under an active hold.
                self._set_mode_bits(action, hold_mode)
                action[0] = np.float32(0.95)
            elif macro_name in {"maintain_gap", "maintain_mouth", "maintain_lift", "stabilize_fold", "pin_canopy"}:
                # Low-effort maintenance in the proxy's preferred support mode.
                self._set_mode_bits(action, preferred_mode)
                action[0] = np.float32(0.12)
            elif macro_name in {"sweep_left", "pin_left_rim"}:
                self._set_mode_bits(action, hold_mode)
                action[0] = np.float32(0.75)
                action[7] = np.float32(self._normalized_template(left_index))
            elif macro_name in {"sweep_right", "pin_right_rim"}:
                self._set_mode_bits(action, hold_mode)
                action[0] = np.float32(0.75)
                action[7] = np.float32(self._normalized_template(right_index))
            elif macro_name == "probe_inside":
                # Gentle probe: minimal effort, partial actor advance.
                self._set_mode_bits(action, preferred_mode)
                action[0] = np.float32(0.10)
                action[8] = np.float32(0.75)
            elif macro_name == "insert_actor":
                self._set_mode_bits(action, preferred_mode)
                action[0] = np.float32(0.10)
                action[8] = np.float32(1.0)
            elif macro_name == "retrieve":
                self._set_mode_bits(action, preferred_mode)
                action[0] = np.float32(0.05)
                action[8] = np.float32(1.0)
                action[13] = np.float32(1.0)
            elif macro_name == "premature_retrieve":
                # Hard negative: retrieve while passive, without a proper reveal.
                self._set_mode_bits(action, passive_mode)
                action[0] = np.float32(-0.20)
                action[8] = np.float32(1.0)
                action[13] = np.float32(1.0)
            elif macro_name in {"reveal_with_release", "foliage_immediate_reocclusion"}:
                # Hard negative: open during the first half of the chunk, then release.
                reveal_phase = step_idx < max(1, chunk_horizon // 2)
                self._set_mode_bits(action, hold_mode if reveal_phase else passive_mode)
                action[0] = np.float32(0.95 if reveal_phase else -0.35)
                action[8] = np.float32(0.2)
            elif macro_name in {"wrong_side_reveal", "wrong_edge_reveal", "wrong_layer_reveal"}:
                # Hard negative: reveal effort aimed at the wrong approach template.
                self._set_mode_bits(action, hold_mode)
                action[0] = np.float32(0.65)
                action[7] = np.float32(self._normalized_template(wrong_index))
            elif macro_name in {"over_disturbance", "cloth_lift_high"}:
                # Hard negative: maximum effort while passive (excess disturbance).
                self._set_mode_bits(action, passive_mode)
                action[0] = np.float32(1.0)
                action[8] = np.float32(1.0 if macro_name == "over_disturbance" else 0.2)
            elif macro_name == "delayed_actor_entry":
                # Actor stays back until the final step of the chunk.
                self._set_mode_bits(action, preferred_mode)
                action[0] = np.float32(0.10)
                action[8] = np.float32(0.2 if step_idx < (chunk_horizon - 1) else 1.0)
            elif macro_name in {"weak_corridor_insert", "bag_fabric_probe"}:
                # Hard negative: full insertion attempt without establishing support.
                self._set_mode_bits(action, passive_mode)
                action[0] = np.float32(0.02)
                action[8] = np.float32(1.0)
            else:
                # Unknown macro: fall back to the unmodified teacher action.
                action = self.teacher_action()
            chunk[step_idx] = np.clip(action, -1.0, 1.0)
        return chunk
|
| 632 |
+
|
| 633 |
+
def baseline_action_chunk(self, baseline_name: str, chunk_horizon: int = 8) -> np.ndarray:
|
| 634 |
+
if baseline_name == "teacher":
|
| 635 |
+
chunk, _ = self.teacher_chunk_and_rollout(chunk_horizon=chunk_horizon, rollout_horizon=self.rollout_horizon)
|
| 636 |
+
return chunk
|
| 637 |
+
if baseline_name == "reveal_only":
|
| 638 |
+
return self.macro_action_chunk("widen_gap" if self.proxy_name == FOLIAGE_PROXY.name else ("widen_mouth" if self.proxy_name == BAG_PROXY.name else "lift_edge"), chunk_horizon=chunk_horizon)
|
| 639 |
+
if baseline_name == "retrieve_only":
|
| 640 |
+
return self.macro_action_chunk("premature_retrieve", chunk_horizon=chunk_horizon)
|
| 641 |
+
if baseline_name == "no_hold":
|
| 642 |
+
return self.macro_action_chunk("reveal_with_release", chunk_horizon=chunk_horizon)
|
| 643 |
+
if baseline_name == "random":
|
| 644 |
+
return self.rng.uniform(-1.0, 1.0, size=(chunk_horizon, 14)).astype(np.float32)
|
| 645 |
+
raise KeyError(f"Unknown baseline chunk: {baseline_name}")
|
| 646 |
+
|
| 647 |
def teacher_chunk_and_rollout(
|
| 648 |
self,
|
| 649 |
chunk_horizon: int = 8,
|
|
|
|
| 663 |
rollout_reocclusion = []
|
| 664 |
rollout_occluder_contact = []
|
| 665 |
rollout_grasp_affordance = []
|
| 666 |
+
rollout_phase = []
|
| 667 |
for step in range(chunk_horizon):
|
| 668 |
action = self.teacher_action()
|
| 669 |
action_chunk.append(action)
|
|
|
|
| 680 |
rollout_reocclusion.append(privileged_state["reocclusion_target"])
|
| 681 |
rollout_occluder_contact.append(privileged_state["occluder_contact_map"])
|
| 682 |
rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"])
|
| 683 |
+
rollout_phase.append(int(privileged_state["phase_label"]))
|
| 684 |
if terminated or truncated:
|
| 685 |
break
|
| 686 |
while len(action_chunk) < chunk_horizon:
|
|
|
|
| 698 |
rollout_reocclusion.append(current["reocclusion_target"])
|
| 699 |
rollout_occluder_contact.append(current["occluder_contact_map"])
|
| 700 |
rollout_grasp_affordance.append(current["grasp_affordance_map"])
|
| 701 |
+
rollout_phase.append(int(current["phase_label"]))
|
| 702 |
self.restore_state(snapshot)
|
| 703 |
return np.stack(action_chunk, axis=0).astype(np.float32), {
|
| 704 |
"rollout_support_mode": np.asarray(rollout_support_mode, dtype=np.int64),
|
|
|
|
| 712 |
"rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
|
| 713 |
"rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
|
| 714 |
"rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
|
| 715 |
+
"rollout_phase": np.asarray(rollout_phase, dtype=np.int64),
|
| 716 |
}
|
| 717 |
|
| 718 |
def evaluate_action_chunk(
|
|
|
|
| 733 |
rollout_reocclusion: list[float] = []
|
| 734 |
rollout_occluder_contact: list[np.ndarray] = []
|
| 735 |
rollout_grasp_affordance: list[np.ndarray] = []
|
| 736 |
+
rollout_phase: list[int] = []
|
| 737 |
corridor_open_trace = [float(self.get_privileged_state()["corridor_feasible"][self._current_support_mode()].any())]
|
| 738 |
visibility_trace = [float(self.get_privileged_state()["visibility"])]
|
| 739 |
+
disturbance_trace = [float(self.get_privileged_state()["disturbance_cost"])]
|
| 740 |
+
support_trace = [float(self.get_privileged_state()["support_stability"])]
|
| 741 |
+
opening_trace = [float(self.opening)]
|
| 742 |
terminated = False
|
| 743 |
truncated = False
|
| 744 |
privileged_state = self.get_privileged_state()
|
|
|
|
| 756 |
rollout_reocclusion.append(float(privileged_state["reocclusion_target"]))
|
| 757 |
rollout_occluder_contact.append(privileged_state["occluder_contact_map"].astype(np.float32))
|
| 758 |
rollout_grasp_affordance.append(privileged_state["grasp_affordance_map"].astype(np.float32))
|
| 759 |
+
rollout_phase.append(int(privileged_state["phase_label"]))
|
| 760 |
corridor_open_trace.append(float(privileged_state["corridor_feasible"][privileged_state["support_mode"]].any()))
|
| 761 |
visibility_trace.append(float(privileged_state["visibility"]))
|
| 762 |
+
disturbance_trace.append(float(privileged_state["disturbance_cost"]))
|
| 763 |
+
support_trace.append(float(privileged_state["support_stability"]))
|
| 764 |
+
opening_trace.append(float(self.opening))
|
| 765 |
if terminated or truncated:
|
| 766 |
break
|
| 767 |
while len(rollout_support_mode) < rollout_horizon:
|
|
|
|
| 777 |
rollout_reocclusion.append(float(current["reocclusion_target"]))
|
| 778 |
rollout_occluder_contact.append(current["occluder_contact_map"].astype(np.float32))
|
| 779 |
rollout_grasp_affordance.append(current["grasp_affordance_map"].astype(np.float32))
|
| 780 |
+
rollout_phase.append(int(current["phase_label"]))
|
| 781 |
final_state = self.get_privileged_state()
|
| 782 |
+
corridor_curve = np.asarray(corridor_open_trace, dtype=np.float32)
|
| 783 |
+
visibility_curve = np.asarray(visibility_trace, dtype=np.float32)
|
| 784 |
+
disturbance_curve = np.asarray(disturbance_trace, dtype=np.float32)
|
| 785 |
+
support_curve = np.asarray(support_trace, dtype=np.float32)
|
| 786 |
+
opening_curve = np.asarray(opening_trace, dtype=np.float32)
|
| 787 |
reocclusion = float(
|
| 788 |
np.logical_and(
|
| 789 |
+
corridor_curve[:-1] > 0.5,
|
| 790 |
+
corridor_curve[1:] <= 0.5,
|
| 791 |
).mean()
|
| 792 |
) if len(corridor_open_trace) > 1 else 0.0
|
| 793 |
result: dict[str, np.ndarray | float] = {
|
|
|
|
| 802 |
"rollout_reocclusion_target": np.asarray(rollout_reocclusion, dtype=np.float32),
|
| 803 |
"rollout_occluder_contact_map": np.asarray(rollout_occluder_contact, dtype=np.float32),
|
| 804 |
"rollout_grasp_affordance_map": np.asarray(rollout_grasp_affordance, dtype=np.float32),
|
| 805 |
+
"rollout_phase": np.asarray(rollout_phase, dtype=np.int64),
|
| 806 |
"retrieval_success": float(final_state["retrieval_success"]),
|
| 807 |
"final_disturbance_cost": float(final_state["disturbance_cost"]),
|
| 808 |
"reocclusion_rate": reocclusion,
|
| 809 |
+
"visibility_integral": float(np.sum(visibility_curve)),
|
| 810 |
+
"actor_feasibility_auc": float(corridor_curve.mean()),
|
| 811 |
+
"reveal_achieved": float(visibility_curve.max() >= self.dynamics.retrieve_visibility_threshold),
|
| 812 |
+
"hold_persistence": float(corridor_curve.mean()),
|
| 813 |
+
"support_stability_auc": float(support_curve.mean()),
|
| 814 |
+
"disturbance_auc": float(disturbance_curve.mean()),
|
| 815 |
+
"opening_peak": float(opening_curve.max()),
|
| 816 |
}
|
| 817 |
+
if self.proxy_name == FOLIAGE_PROXY.name:
|
| 818 |
+
result["candidate_gap_width"] = float(final_state.get("gap_width", opening_curve.max()))
|
| 819 |
+
result["candidate_damage_proxy"] = float(final_state.get("damage_proxy", final_state["disturbance_cost"]))
|
| 820 |
+
elif self.proxy_name == BAG_PROXY.name:
|
| 821 |
+
result["candidate_mouth_aperture"] = float(final_state.get("mouth_aperture", opening_curve.max()))
|
| 822 |
+
result["candidate_hold_quality"] = float(final_state.get("hold_quality", support_curve.mean()))
|
| 823 |
+
result["candidate_rim_slip_risk"] = float(final_state.get("rim_slip_risk", reocclusion))
|
| 824 |
+
elif self.proxy_name == CLOTH_PROXY.name:
|
| 825 |
+
result["candidate_fold_preservation"] = float(final_state.get("fold_preservation", 1.0 - final_state["disturbance_cost"]))
|
| 826 |
+
result["candidate_layer_separation_quality"] = float(final_state.get("layer_separation_quality", opening_curve.max()))
|
| 827 |
+
result["candidate_lift_too_much_risk"] = float(final_state.get("lift_too_much_risk", max(0.0, opening_curve.max() - self.dynamics.desired_opening)))
|
| 828 |
self.restore_state(snapshot)
|
| 829 |
return result
|
| 830 |
|
|
|
|
| 838 |
teacher_chunk = np.asarray(teacher_chunk, dtype=np.float32)
|
| 839 |
candidates = [teacher_chunk.astype(np.float32)]
|
| 840 |
outcomes = [self.evaluate_action_chunk(teacher_chunk, rollout_horizon=rollout_horizon)]
|
| 841 |
+
candidate_macro_ids = [0]
|
| 842 |
+
candidate_is_hard_negative = [0.0]
|
| 843 |
+
candidate_macro_names = ["teacher"]
|
| 844 |
+
candidate_negative_families = ["teacher"]
|
| 845 |
+
if self.proxy_name == FOLIAGE_PROXY.name:
|
| 846 |
+
semantic_specs = [
|
| 847 |
+
("pin_canopy", "positive"),
|
| 848 |
+
("maintain_gap", "positive"),
|
| 849 |
+
("premature_retrieve", "premature_retrieve"),
|
| 850 |
+
("reveal_with_release", "reveal_with_release"),
|
| 851 |
+
("wrong_side_reveal", "wrong_side_reveal"),
|
| 852 |
+
("foliage_immediate_reocclusion", "immediate_reocclusion"),
|
| 853 |
+
("over_disturbance", "over_disturbance"),
|
| 854 |
+
("weak_corridor_insert", "weak_corridor_insert"),
|
| 855 |
+
("insert_actor", "positive"),
|
| 856 |
+
("retrieve", "positive"),
|
| 857 |
+
]
|
| 858 |
+
elif self.proxy_name == BAG_PROXY.name:
|
| 859 |
+
semantic_specs = [
|
| 860 |
+
("widen_mouth", "positive"),
|
| 861 |
+
("maintain_mouth", "positive"),
|
| 862 |
+
("premature_retrieve", "premature_retrieve"),
|
| 863 |
+
("reveal_with_release", "reveal_with_release"),
|
| 864 |
+
("wrong_edge_reveal", "wrong_side_reveal"),
|
| 865 |
+
("pin_left_rim", "one_rim_slip"),
|
| 866 |
+
("bag_fabric_probe", "fabric_probe"),
|
| 867 |
+
("weak_corridor_insert", "weak_corridor_insert"),
|
| 868 |
+
("insert_actor", "positive"),
|
| 869 |
+
("retrieve", "positive"),
|
| 870 |
+
]
|
| 871 |
+
else:
|
| 872 |
+
semantic_specs = [
|
| 873 |
+
("lift_edge", "positive"),
|
| 874 |
+
("stabilize_fold", "positive"),
|
| 875 |
+
("premature_retrieve", "premature_retrieve"),
|
| 876 |
+
("reveal_with_release", "reveal_with_release"),
|
| 877 |
+
("cloth_lift_high", "lift_too_high"),
|
| 878 |
+
("wrong_layer_reveal", "wrong_layer_reveal"),
|
| 879 |
+
("delayed_actor_entry", "delayed_actor_entry"),
|
| 880 |
+
("weak_corridor_insert", "weak_corridor_insert"),
|
| 881 |
+
("insert_actor", "positive"),
|
| 882 |
+
("retrieve", "positive"),
|
| 883 |
+
]
|
| 884 |
+
|
| 885 |
+
for spec_idx, (macro_name, family_name) in enumerate(semantic_specs[: max(0, num_candidates - 1)], start=1):
|
| 886 |
+
candidate = self.macro_action_chunk(macro_name, chunk_horizon=teacher_chunk.shape[0])
|
| 887 |
candidates.append(candidate.astype(np.float32))
|
| 888 |
outcomes.append(self.evaluate_action_chunk(candidate, rollout_horizon=rollout_horizon))
|
| 889 |
+
candidate_macro_ids.append(spec_idx)
|
| 890 |
+
candidate_macro_names.append(macro_name)
|
| 891 |
+
candidate_negative_families.append(family_name)
|
| 892 |
+
candidate_is_hard_negative.append(0.0 if family_name == "positive" else 1.0)
|
| 893 |
+
|
| 894 |
+
while len(candidates) < num_candidates:
|
| 895 |
+
random_chunk = self.rng.uniform(-1.0, 1.0, size=teacher_chunk.shape).astype(np.float32)
|
| 896 |
+
candidates.append(random_chunk)
|
| 897 |
+
outcomes.append(self.evaluate_action_chunk(random_chunk, rollout_horizon=rollout_horizon))
|
| 898 |
+
candidate_macro_ids.append(len(candidate_macro_ids))
|
| 899 |
+
candidate_macro_names.append("random")
|
| 900 |
+
candidate_negative_families.append("random")
|
| 901 |
+
candidate_is_hard_negative.append(1.0)
|
| 902 |
stacked_outcomes = {
|
| 903 |
"candidate_rollout_support_mode": np.stack([item["rollout_support_mode"] for item in outcomes], axis=0).astype(np.int64),
|
| 904 |
+
"candidate_rollout_phase": np.stack([item["rollout_phase"] for item in outcomes], axis=0).astype(np.int64),
|
| 905 |
"candidate_rollout_corridor_feasible": np.stack(
|
| 906 |
[item["rollout_corridor_feasible"] for item in outcomes], axis=0
|
| 907 |
).astype(np.float32),
|
|
|
|
| 938 |
),
|
| 939 |
"candidate_reocclusion_rate": np.asarray([item["reocclusion_rate"] for item in outcomes], dtype=np.float32),
|
| 940 |
"candidate_visibility_integral": np.asarray([item["visibility_integral"] for item in outcomes], dtype=np.float32),
|
| 941 |
+
"candidate_actor_feasibility_auc": np.asarray([item["actor_feasibility_auc"] for item in outcomes], dtype=np.float32),
|
| 942 |
+
"candidate_reveal_achieved": np.asarray([item["reveal_achieved"] for item in outcomes], dtype=np.float32),
|
| 943 |
+
"candidate_hold_persistence": np.asarray([item["hold_persistence"] for item in outcomes], dtype=np.float32),
|
| 944 |
+
"candidate_support_stability_auc": np.asarray([item["support_stability_auc"] for item in outcomes], dtype=np.float32),
|
| 945 |
+
"candidate_disturbance_auc": np.asarray([item["disturbance_auc"] for item in outcomes], dtype=np.float32),
|
| 946 |
+
"candidate_macro_ids": np.asarray(candidate_macro_ids, dtype=np.int64),
|
| 947 |
+
"candidate_is_hard_negative": np.asarray(candidate_is_hard_negative, dtype=np.float32),
|
| 948 |
}
|
| 949 |
stacked_outcomes["candidate_risk"] = np.clip(
|
| 950 |
stacked_outcomes["candidate_final_disturbance_cost"] + stacked_outcomes["candidate_reocclusion_rate"],
|
|
|
|
| 954 |
stacked_outcomes["candidate_utility"] = (
|
| 955 |
stacked_outcomes["candidate_retrieval_success"] - stacked_outcomes["candidate_risk"]
|
| 956 |
).astype(np.float32)
|
| 957 |
+
stacked_outcomes["candidate_macro_names"] = candidate_macro_names
|
| 958 |
+
stacked_outcomes["candidate_negative_families"] = candidate_negative_families
|
| 959 |
+
if self.proxy_name == FOLIAGE_PROXY.name:
|
| 960 |
+
stacked_outcomes["candidate_gap_width"] = np.asarray([item["candidate_gap_width"] for item in outcomes], dtype=np.float32)
|
| 961 |
+
stacked_outcomes["candidate_damage_proxy"] = np.asarray([item["candidate_damage_proxy"] for item in outcomes], dtype=np.float32)
|
| 962 |
+
elif self.proxy_name == BAG_PROXY.name:
|
| 963 |
+
stacked_outcomes["candidate_mouth_aperture"] = np.asarray([item["candidate_mouth_aperture"] for item in outcomes], dtype=np.float32)
|
| 964 |
+
stacked_outcomes["candidate_hold_quality"] = np.asarray([item["candidate_hold_quality"] for item in outcomes], dtype=np.float32)
|
| 965 |
+
stacked_outcomes["candidate_rim_slip_risk"] = np.asarray([item["candidate_rim_slip_risk"] for item in outcomes], dtype=np.float32)
|
| 966 |
+
elif self.proxy_name == CLOTH_PROXY.name:
|
| 967 |
+
stacked_outcomes["candidate_fold_preservation"] = np.asarray([item["candidate_fold_preservation"] for item in outcomes], dtype=np.float32)
|
| 968 |
+
stacked_outcomes["candidate_layer_separation_quality"] = np.asarray([item["candidate_layer_separation_quality"] for item in outcomes], dtype=np.float32)
|
| 969 |
+
stacked_outcomes["candidate_lift_too_much_risk"] = np.asarray([item["candidate_lift_too_much_risk"] for item in outcomes], dtype=np.float32)
|
| 970 |
return np.stack(candidates, axis=0).astype(np.float32), stacked_outcomes
|
| 971 |
|
| 972 |
def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
|
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact.yaml
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies: [foliage_proxy, bag_proxy, cloth_proxy]
|
| 9 |
+
resolution: 224
|
| 10 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state
|
| 11 |
+
train_episodes_per_proxy: 48
|
| 12 |
+
val_episodes_per_proxy: 16
|
| 13 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
|
| 14 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
|
| 15 |
+
rebuild_dataset: false
|
| 16 |
+
chunk_horizon: 8
|
| 17 |
+
rollout_horizon: 5
|
| 18 |
+
history_steps: 6
|
| 19 |
+
planner_candidates: 8
|
| 20 |
+
seed: 17
|
| 21 |
+
optim:
|
| 22 |
+
epochs: 3
|
| 23 |
+
batch_size: 4
|
| 24 |
+
num_workers: 24
|
| 25 |
+
lr: 0.0001
|
| 26 |
+
weight_decay: 0.0001
|
| 27 |
+
trainer:
|
| 28 |
+
policy_type: elastic_reveal
|
| 29 |
+
use_bf16: true
|
| 30 |
+
grad_clip_norm: 1.0
|
| 31 |
+
freeze_backbone: true
|
| 32 |
+
gradient_checkpointing: false
|
| 33 |
+
plan_during_train: true
|
| 34 |
+
plan_during_eval: true
|
| 35 |
+
support_mode_conditioning: true
|
| 36 |
+
planner_mode: trainable
|
| 37 |
+
use_depth: true
|
| 38 |
+
use_world_model: true
|
| 39 |
+
use_role_tokens: true
|
| 40 |
+
compute_equivariance_probe: false
|
| 41 |
+
policy:
|
| 42 |
+
backbone:
|
| 43 |
+
model_name: openai/clip-vit-base-patch32
|
| 44 |
+
hidden_dim: 512
|
| 45 |
+
max_text_tokens: 32
|
| 46 |
+
freeze_backbone: true
|
| 47 |
+
gradient_checkpointing: false
|
| 48 |
+
use_dummy_backbone: false
|
| 49 |
+
fusion:
|
| 50 |
+
hidden_dim: 512
|
| 51 |
+
num_cameras: 3
|
| 52 |
+
num_layers: 4
|
| 53 |
+
num_heads: 8
|
| 54 |
+
ff_dim: 2048
|
| 55 |
+
dropout: 0.1
|
| 56 |
+
proprio_dim: 32
|
| 57 |
+
proprio_tokens: 1
|
| 58 |
+
memory:
|
| 59 |
+
hidden_dim: 512
|
| 60 |
+
action_dim: 14
|
| 61 |
+
history_steps: 6
|
| 62 |
+
scene_history_steps: 3
|
| 63 |
+
belief_history_steps: 8
|
| 64 |
+
num_layers: 2
|
| 65 |
+
dropout: 0.1
|
| 66 |
+
memory_bank_size: 4
|
| 67 |
+
scene_bank_size: 2
|
| 68 |
+
belief_bank_size: 2
|
| 69 |
+
num_heads: 8
|
| 70 |
+
max_history_steps: 8
|
| 71 |
+
decoder:
|
| 72 |
+
hidden_dim: 512
|
| 73 |
+
num_heads: 8
|
| 74 |
+
num_layers: 4
|
| 75 |
+
ff_dim: 2048
|
| 76 |
+
dropout: 0.1
|
| 77 |
+
chunk_size: 8
|
| 78 |
+
action_dim: 14
|
| 79 |
+
arm_action_dim: 7
|
| 80 |
+
num_candidates: 8
|
| 81 |
+
num_phases: 5
|
| 82 |
+
num_arm_roles: 4
|
| 83 |
+
num_proposal_modes: 7
|
| 84 |
+
planner_top_k: 4
|
| 85 |
+
reveal_head:
|
| 86 |
+
hidden_dim: 512
|
| 87 |
+
num_support_modes: 3
|
| 88 |
+
num_approach_templates: 32
|
| 89 |
+
rollout_horizon: 5
|
| 90 |
+
belief_map_size: 32
|
| 91 |
+
field_size: 16
|
| 92 |
+
num_heads: 8
|
| 93 |
+
predict_belief_map: true
|
| 94 |
+
num_phases: 5
|
| 95 |
+
num_arm_roles: 4
|
| 96 |
+
num_interaction_tokens: 8
|
| 97 |
+
num_tasks: 4
|
| 98 |
+
world_model:
|
| 99 |
+
hidden_dim: 512
|
| 100 |
+
action_dim: 14
|
| 101 |
+
num_support_modes: 3
|
| 102 |
+
num_approach_templates: 32
|
| 103 |
+
rollout_horizon: 5
|
| 104 |
+
field_size: 16
|
| 105 |
+
num_heads: 8
|
| 106 |
+
num_phases: 5
|
| 107 |
+
num_arm_roles: 4
|
| 108 |
+
num_interaction_tokens: 8
|
| 109 |
+
belief_map_size: 32
|
| 110 |
+
predict_belief_map: true
|
| 111 |
+
scene_bank_size: 2
|
| 112 |
+
belief_bank_size: 2
|
| 113 |
+
rollout_mode: compact_rollout
|
| 114 |
+
num_tasks: 4
|
| 115 |
+
planner:
|
| 116 |
+
hidden_dim: 512
|
| 117 |
+
num_candidates: 8
|
| 118 |
+
action_dim: 14
|
| 119 |
+
num_support_modes: 3
|
| 120 |
+
utility_margin: 0.1
|
| 121 |
+
num_heads: 8
|
| 122 |
+
num_layers: 2
|
| 123 |
+
num_phases: 5
|
| 124 |
+
num_arm_roles: 4
|
| 125 |
+
top_k: 4
|
| 126 |
+
loss_weights:
|
| 127 |
+
action: 1.0
|
| 128 |
+
phase: 0.05
|
| 129 |
+
arm_role: 0.1
|
| 130 |
+
support_mode: 0.1
|
| 131 |
+
corridor: 0.12
|
| 132 |
+
persistence: 0.06
|
| 133 |
+
disturbance: 0.06
|
| 134 |
+
world_model: 0.2
|
| 135 |
+
belief: 0.05
|
| 136 |
+
visibility: 0.05
|
| 137 |
+
clearance: 0.06
|
| 138 |
+
support_stability: 0.06
|
| 139 |
+
reocclusion: 0.06
|
| 140 |
+
occluder_contact: 0.05
|
| 141 |
+
grasp_affordance: 0.05
|
| 142 |
+
planner_success: 0.2
|
| 143 |
+
planner_risk: 0.08
|
| 144 |
+
planner_ranking: 0.2
|
| 145 |
+
proposal_reconstruction: 0.08
|
| 146 |
+
proposal_success: 0.12
|
| 147 |
+
proposal_ranking: 0.15
|
| 148 |
+
proposal_diversity: 0.05
|
| 149 |
+
role_swap_consistency: 0.02
|
| 150 |
+
task_metrics: 0.05
|
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase.yaml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff_phase
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies: [foliage_proxy, bag_proxy, cloth_proxy]
|
| 9 |
+
resolution: 224
|
| 10 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
|
| 11 |
+
train_episodes_per_proxy: 48
|
| 12 |
+
val_episodes_per_proxy: 16
|
| 13 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
|
| 14 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
|
| 15 |
+
rebuild_dataset: false
|
| 16 |
+
chunk_horizon: 8
|
| 17 |
+
rollout_horizon: 5
|
| 18 |
+
history_steps: 6
|
| 19 |
+
planner_candidates: 8
|
| 20 |
+
seed: 17
|
| 21 |
+
optim:
|
| 22 |
+
epochs: 3
|
| 23 |
+
batch_size: 4
|
| 24 |
+
num_workers: 24
|
| 25 |
+
lr: 0.0001
|
| 26 |
+
weight_decay: 0.0001
|
| 27 |
+
trainer:
|
| 28 |
+
policy_type: elastic_reveal
|
| 29 |
+
use_bf16: true
|
| 30 |
+
grad_clip_norm: 1.0
|
| 31 |
+
freeze_backbone: true
|
| 32 |
+
gradient_checkpointing: false
|
| 33 |
+
plan_during_train: true
|
| 34 |
+
plan_during_eval: true
|
| 35 |
+
support_mode_conditioning: true
|
| 36 |
+
planner_mode: trainable
|
| 37 |
+
use_depth: true
|
| 38 |
+
use_world_model: true
|
| 39 |
+
use_role_tokens: true
|
| 40 |
+
compute_equivariance_probe: false
|
| 41 |
+
policy:
|
| 42 |
+
backbone: {model_name: openai/clip-vit-base-patch32, hidden_dim: 512, max_text_tokens: 32, freeze_backbone: true, gradient_checkpointing: false, use_dummy_backbone: false}
|
| 43 |
+
fusion: {hidden_dim: 512, num_cameras: 3, num_layers: 4, num_heads: 8, ff_dim: 2048, dropout: 0.1, proprio_dim: 32, proprio_tokens: 1}
|
| 44 |
+
memory: {hidden_dim: 512, action_dim: 14, history_steps: 6, scene_history_steps: 3, belief_history_steps: 8, num_layers: 2, dropout: 0.1, memory_bank_size: 4, scene_bank_size: 2, belief_bank_size: 2, num_heads: 8, max_history_steps: 8}
|
| 45 |
+
decoder: {hidden_dim: 512, num_heads: 8, num_layers: 4, ff_dim: 2048, dropout: 0.1, chunk_size: 8, action_dim: 14, arm_action_dim: 7, num_candidates: 8, num_phases: 5, num_arm_roles: 4, num_proposal_modes: 7, planner_top_k: 4}
|
| 46 |
+
reveal_head: {hidden_dim: 512, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, belief_map_size: 32, field_size: 16, num_heads: 8, predict_belief_map: true, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, num_tasks: 4}
|
| 47 |
+
world_model: {hidden_dim: 512, action_dim: 14, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, field_size: 16, num_heads: 8, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, belief_map_size: 32, predict_belief_map: true, scene_bank_size: 2, belief_bank_size: 2, rollout_mode: compact_rollout, num_tasks: 4}
|
| 48 |
+
planner: {hidden_dim: 512, num_candidates: 8, action_dim: 14, num_support_modes: 3, utility_margin: 0.1, num_heads: 8, num_layers: 2, num_phases: 5, num_arm_roles: 4, top_k: 4}
|
| 49 |
+
loss_weights:
|
| 50 |
+
action: 1.0
|
| 51 |
+
phase: 0.08
|
| 52 |
+
arm_role: 0.1
|
| 53 |
+
support_mode: 0.1
|
| 54 |
+
corridor: 0.12
|
| 55 |
+
persistence: 0.06
|
| 56 |
+
disturbance: 0.06
|
| 57 |
+
world_model: 0.2
|
| 58 |
+
belief: 0.05
|
| 59 |
+
visibility: 0.05
|
| 60 |
+
clearance: 0.06
|
| 61 |
+
support_stability: 0.06
|
| 62 |
+
reocclusion: 0.06
|
| 63 |
+
occluder_contact: 0.05
|
| 64 |
+
grasp_affordance: 0.05
|
| 65 |
+
planner_success: 0.2
|
| 66 |
+
planner_risk: 0.08
|
| 67 |
+
planner_ranking: 0.2
|
| 68 |
+
proposal_reconstruction: 0.08
|
| 69 |
+
proposal_success: 0.12
|
| 70 |
+
proposal_ranking: 0.15
|
| 71 |
+
proposal_diversity: 0.05
|
| 72 |
+
role_swap_consistency: 0.02
|
| 73 |
+
task_metrics: 0.05
|
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial.yaml
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies: [foliage_proxy, bag_proxy, cloth_proxy]
|
| 9 |
+
resolution: 224
|
| 10 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state
|
| 11 |
+
train_episodes_per_proxy: 48
|
| 12 |
+
val_episodes_per_proxy: 16
|
| 13 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3.pt
|
| 14 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3.pt
|
| 15 |
+
rebuild_dataset: false
|
| 16 |
+
chunk_horizon: 8
|
| 17 |
+
rollout_horizon: 5
|
| 18 |
+
history_steps: 6
|
| 19 |
+
planner_candidates: 8
|
| 20 |
+
seed: 17
|
| 21 |
+
optim:
|
| 22 |
+
epochs: 5
|
| 23 |
+
batch_size: 4
|
| 24 |
+
num_workers: 24
|
| 25 |
+
lr: 0.00015
|
| 26 |
+
weight_decay: 0.0001
|
| 27 |
+
trainer:
|
| 28 |
+
policy_type: elastic_reveal
|
| 29 |
+
use_bf16: true
|
| 30 |
+
grad_clip_norm: 1.0
|
| 31 |
+
freeze_backbone: true
|
| 32 |
+
gradient_checkpointing: false
|
| 33 |
+
plan_during_train: true
|
| 34 |
+
plan_during_eval: true
|
| 35 |
+
support_mode_conditioning: true
|
| 36 |
+
planner_mode: trainable
|
| 37 |
+
use_depth: true
|
| 38 |
+
use_world_model: true
|
| 39 |
+
use_role_tokens: true
|
| 40 |
+
compute_equivariance_probe: false
|
| 41 |
+
policy:
|
| 42 |
+
backbone:
|
| 43 |
+
model_name: openai/clip-vit-base-patch32
|
| 44 |
+
hidden_dim: 512
|
| 45 |
+
max_text_tokens: 32
|
| 46 |
+
freeze_backbone: true
|
| 47 |
+
gradient_checkpointing: false
|
| 48 |
+
use_dummy_backbone: false
|
| 49 |
+
fusion:
|
| 50 |
+
hidden_dim: 512
|
| 51 |
+
num_cameras: 3
|
| 52 |
+
num_layers: 4
|
| 53 |
+
num_heads: 8
|
| 54 |
+
ff_dim: 2048
|
| 55 |
+
dropout: 0.1
|
| 56 |
+
proprio_dim: 32
|
| 57 |
+
proprio_tokens: 1
|
| 58 |
+
memory:
|
| 59 |
+
hidden_dim: 512
|
| 60 |
+
action_dim: 14
|
| 61 |
+
history_steps: 6
|
| 62 |
+
scene_history_steps: 3
|
| 63 |
+
belief_history_steps: 8
|
| 64 |
+
num_layers: 2
|
| 65 |
+
dropout: 0.1
|
| 66 |
+
memory_bank_size: 4
|
| 67 |
+
scene_bank_size: 2
|
| 68 |
+
belief_bank_size: 2
|
| 69 |
+
num_heads: 8
|
| 70 |
+
max_history_steps: 8
|
| 71 |
+
decoder:
|
| 72 |
+
hidden_dim: 512
|
| 73 |
+
num_heads: 8
|
| 74 |
+
num_layers: 4
|
| 75 |
+
ff_dim: 2048
|
| 76 |
+
dropout: 0.1
|
| 77 |
+
chunk_size: 8
|
| 78 |
+
action_dim: 14
|
| 79 |
+
arm_action_dim: 7
|
| 80 |
+
num_candidates: 8
|
| 81 |
+
num_phases: 5
|
| 82 |
+
num_arm_roles: 4
|
| 83 |
+
num_proposal_modes: 7
|
| 84 |
+
planner_top_k: 4
|
| 85 |
+
reveal_head:
|
| 86 |
+
hidden_dim: 512
|
| 87 |
+
num_support_modes: 3
|
| 88 |
+
num_approach_templates: 32
|
| 89 |
+
rollout_horizon: 5
|
| 90 |
+
belief_map_size: 32
|
| 91 |
+
field_size: 16
|
| 92 |
+
num_heads: 8
|
| 93 |
+
predict_belief_map: true
|
| 94 |
+
num_phases: 5
|
| 95 |
+
num_arm_roles: 4
|
| 96 |
+
num_interaction_tokens: 8
|
| 97 |
+
num_tasks: 4
|
| 98 |
+
world_model:
|
| 99 |
+
hidden_dim: 512
|
| 100 |
+
action_dim: 14
|
| 101 |
+
num_support_modes: 3
|
| 102 |
+
num_approach_templates: 32
|
| 103 |
+
rollout_horizon: 5
|
| 104 |
+
field_size: 16
|
| 105 |
+
num_heads: 8
|
| 106 |
+
num_phases: 5
|
| 107 |
+
num_arm_roles: 4
|
| 108 |
+
num_interaction_tokens: 8
|
| 109 |
+
belief_map_size: 32
|
| 110 |
+
predict_belief_map: true
|
| 111 |
+
scene_bank_size: 2
|
| 112 |
+
belief_bank_size: 2
|
| 113 |
+
rollout_mode: spatial_rollout
|
| 114 |
+
num_tasks: 4
|
| 115 |
+
planner:
|
| 116 |
+
hidden_dim: 512
|
| 117 |
+
num_candidates: 8
|
| 118 |
+
action_dim: 14
|
| 119 |
+
num_support_modes: 3
|
| 120 |
+
utility_margin: 0.1
|
| 121 |
+
num_heads: 8
|
| 122 |
+
num_layers: 2
|
| 123 |
+
num_phases: 5
|
| 124 |
+
num_arm_roles: 4
|
| 125 |
+
top_k: 4
|
| 126 |
+
loss_weights:
|
| 127 |
+
action: 0.6
|
| 128 |
+
phase: 0.05
|
| 129 |
+
arm_role: 0.1
|
| 130 |
+
support_mode: 0.1
|
| 131 |
+
corridor: 0.15
|
| 132 |
+
persistence: 0.08
|
| 133 |
+
disturbance: 0.08
|
| 134 |
+
world_model: 0.35
|
| 135 |
+
belief: 0.05
|
| 136 |
+
visibility: 0.05
|
| 137 |
+
clearance: 0.08
|
| 138 |
+
support_stability: 0.08
|
| 139 |
+
reocclusion: 0.08
|
| 140 |
+
occluder_contact: 0.05
|
| 141 |
+
grasp_affordance: 0.05
|
| 142 |
+
planner_success: 0.25
|
| 143 |
+
planner_risk: 0.1
|
| 144 |
+
planner_ranking: 0.25
|
| 145 |
+
proposal_reconstruction: 0.05
|
| 146 |
+
proposal_success: 0.2
|
| 147 |
+
proposal_ranking: 0.25
|
| 148 |
+
proposal_diversity: 0.05
|
| 149 |
+
role_swap_consistency: 0.02
|
| 150 |
+
task_metrics: 0.1
|
code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase.yaml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17
|
| 2 |
+
output_dir: /workspace/outputs/r3d_handoff_phase
|
| 3 |
+
device: cuda
|
| 4 |
+
seed: 17
|
| 5 |
+
init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt
|
| 6 |
+
init_strict: false
|
| 7 |
+
data:
|
| 8 |
+
proxies: [foliage_proxy, bag_proxy, cloth_proxy]
|
| 9 |
+
resolution: 224
|
| 10 |
+
dataset_version: reveal_proxy_v6_rgbd_elastic_state_phase
|
| 11 |
+
train_episodes_per_proxy: 48
|
| 12 |
+
val_episodes_per_proxy: 16
|
| 13 |
+
train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt
|
| 14 |
+
val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt
|
| 15 |
+
rebuild_dataset: false
|
| 16 |
+
chunk_horizon: 8
|
| 17 |
+
rollout_horizon: 5
|
| 18 |
+
history_steps: 6
|
| 19 |
+
planner_candidates: 8
|
| 20 |
+
seed: 17
|
| 21 |
+
optim:
|
| 22 |
+
epochs: 4
|
| 23 |
+
batch_size: 4
|
| 24 |
+
num_workers: 24
|
| 25 |
+
lr: 0.00015
|
| 26 |
+
weight_decay: 0.0001
|
| 27 |
+
trainer:
|
| 28 |
+
policy_type: elastic_reveal
|
| 29 |
+
use_bf16: true
|
| 30 |
+
grad_clip_norm: 1.0
|
| 31 |
+
freeze_backbone: true
|
| 32 |
+
gradient_checkpointing: false
|
| 33 |
+
plan_during_train: true
|
| 34 |
+
plan_during_eval: true
|
| 35 |
+
support_mode_conditioning: true
|
| 36 |
+
planner_mode: trainable
|
| 37 |
+
use_depth: true
|
| 38 |
+
use_world_model: true
|
| 39 |
+
use_role_tokens: true
|
| 40 |
+
compute_equivariance_probe: false
|
| 41 |
+
policy:
|
| 42 |
+
backbone: {model_name: openai/clip-vit-base-patch32, hidden_dim: 512, max_text_tokens: 32, freeze_backbone: true, gradient_checkpointing: false, use_dummy_backbone: false}
|
| 43 |
+
fusion: {hidden_dim: 512, num_cameras: 3, num_layers: 4, num_heads: 8, ff_dim: 2048, dropout: 0.1, proprio_dim: 32, proprio_tokens: 1}
|
| 44 |
+
memory: {hidden_dim: 512, action_dim: 14, history_steps: 6, scene_history_steps: 3, belief_history_steps: 8, num_layers: 2, dropout: 0.1, memory_bank_size: 4, scene_bank_size: 2, belief_bank_size: 2, num_heads: 8, max_history_steps: 8}
|
| 45 |
+
decoder: {hidden_dim: 512, num_heads: 8, num_layers: 4, ff_dim: 2048, dropout: 0.1, chunk_size: 8, action_dim: 14, arm_action_dim: 7, num_candidates: 8, num_phases: 5, num_arm_roles: 4, num_proposal_modes: 7, planner_top_k: 4}
|
| 46 |
+
reveal_head: {hidden_dim: 512, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, belief_map_size: 32, field_size: 16, num_heads: 8, predict_belief_map: true, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, num_tasks: 4}
|
| 47 |
+
world_model: {hidden_dim: 512, action_dim: 14, num_support_modes: 3, num_approach_templates: 32, rollout_horizon: 5, field_size: 16, num_heads: 8, num_phases: 5, num_arm_roles: 4, num_interaction_tokens: 8, belief_map_size: 32, predict_belief_map: true, scene_bank_size: 2, belief_bank_size: 2, rollout_mode: spatial_rollout, num_tasks: 4}
|
| 48 |
+
planner: {hidden_dim: 512, num_candidates: 8, action_dim: 14, num_support_modes: 3, utility_margin: 0.1, num_heads: 8, num_layers: 2, num_phases: 5, num_arm_roles: 4, top_k: 4}
|
| 49 |
+
loss_weights:
|
| 50 |
+
action: 0.6
|
| 51 |
+
phase: 0.08
|
| 52 |
+
arm_role: 0.1
|
| 53 |
+
support_mode: 0.1
|
| 54 |
+
corridor: 0.15
|
| 55 |
+
persistence: 0.08
|
| 56 |
+
disturbance: 0.08
|
| 57 |
+
world_model: 0.35
|
| 58 |
+
belief: 0.05
|
| 59 |
+
visibility: 0.05
|
| 60 |
+
clearance: 0.08
|
| 61 |
+
support_stability: 0.08
|
| 62 |
+
reocclusion: 0.08
|
| 63 |
+
occluder_contact: 0.05
|
| 64 |
+
grasp_affordance: 0.05
|
| 65 |
+
planner_success: 0.25
|
| 66 |
+
planner_risk: 0.1
|
| 67 |
+
planner_ranking: 0.25
|
| 68 |
+
proposal_reconstruction: 0.05
|
| 69 |
+
proposal_success: 0.2
|
| 70 |
+
proposal_ranking: 0.25
|
| 71 |
+
proposal_diversity: 0.05
|
| 72 |
+
role_swap_consistency: 0.02
|
| 73 |
+
task_metrics: 0.1
|
code/reveal_vla_bimanual/train/losses.py
CHANGED
|
@@ -32,6 +32,7 @@ class LossWeights:
|
|
| 32 |
proposal_ranking: float = 0.05
|
| 33 |
proposal_diversity: float = 0.05
|
| 34 |
role_swap_consistency: float = 0.05
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
def chunk_bc_loss(pred_actions: Tensor, target_actions: Tensor, mask: Tensor | None = None) -> Tensor:
|
|
@@ -113,12 +114,15 @@ def _resize_like(target: Tensor, prediction: Tensor) -> Tensor:
|
|
| 113 |
def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weights: LossWeights) -> dict[str, Tensor]:
|
| 114 |
losses = {}
|
| 115 |
if "phase_logits" in pred:
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
phase_target = infer_phase_targets_from_actions(action_chunk[:, 0])
|
| 119 |
else:
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
losses["phase"] = F.cross_entropy(pred["phase_logits"], phase_target)
|
| 123 |
else:
|
| 124 |
losses["phase"] = pred["support_mode_logits"].new_tensor(0.0)
|
|
@@ -190,6 +194,33 @@ def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weight
|
|
| 190 |
losses["uncertainty"] = pred["persistence_uncertainty"].mean()
|
| 191 |
else:
|
| 192 |
losses["uncertainty"] = pred["support_mode_logits"].new_tensor(0.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
return losses
|
| 194 |
|
| 195 |
|
|
@@ -221,6 +252,8 @@ def world_model_rollout_consistency_loss(pred_rollout: dict[str, Tensor], target
|
|
| 221 |
"disturbance_cost": _expand_target(target_rollout["disturbance_cost"][..., :horizon]),
|
| 222 |
"action_chunk": _expand_target(target_rollout["action_chunk"][..., :horizon, :]),
|
| 223 |
}
|
|
|
|
|
|
|
| 224 |
loss = (
|
| 225 |
F.cross_entropy(
|
| 226 |
pred_rollout["support_mode_logits"].reshape(-1, pred_rollout["support_mode_logits"].shape[-1]),
|
|
@@ -234,7 +267,9 @@ def world_model_rollout_consistency_loss(pred_rollout: dict[str, Tensor], target
|
|
| 234 |
+ F.mse_loss(pred_rollout["disturbance_cost"], target_rollout["disturbance_cost"].float())
|
| 235 |
)
|
| 236 |
if "phase_logits" in pred_rollout:
|
| 237 |
-
phase_target =
|
|
|
|
|
|
|
| 238 |
loss = loss + 0.5 * F.cross_entropy(
|
| 239 |
pred_rollout["phase_logits"].reshape(-1, pred_rollout["phase_logits"].shape[-1]),
|
| 240 |
phase_target.reshape(-1),
|
|
@@ -300,6 +335,7 @@ def compute_total_loss(
|
|
| 300 |
+ weights.occluder_contact * reveal_losses["occluder_contact"]
|
| 301 |
+ weights.grasp_affordance * reveal_losses["grasp_affordance"]
|
| 302 |
+ weights.reocclusion * reveal_losses["reocclusion"]
|
|
|
|
| 303 |
+ 0.01 * reveal_losses["uncertainty"]
|
| 304 |
)
|
| 305 |
|
|
@@ -314,6 +350,8 @@ def compute_total_loss(
|
|
| 314 |
"disturbance_cost": batch["candidate_rollout_disturbance_cost"],
|
| 315 |
"action_chunk": batch["candidate_action_chunks"],
|
| 316 |
}
|
|
|
|
|
|
|
| 317 |
for optional_key in (
|
| 318 |
"candidate_rollout_belief_map",
|
| 319 |
"candidate_rollout_visibility_map",
|
|
@@ -344,6 +382,8 @@ def compute_total_loss(
|
|
| 344 |
"disturbance_cost": batch["rollout_disturbance_cost"],
|
| 345 |
"action_chunk": batch["action_chunk"],
|
| 346 |
}
|
|
|
|
|
|
|
| 347 |
for optional_key in (
|
| 348 |
"rollout_belief_map",
|
| 349 |
"rollout_visibility_map",
|
|
|
|
| 32 |
proposal_ranking: float = 0.05
|
| 33 |
proposal_diversity: float = 0.05
|
| 34 |
role_swap_consistency: float = 0.05
|
| 35 |
+
task_metrics: float = 0.05
|
| 36 |
|
| 37 |
|
| 38 |
def chunk_bc_loss(pred_actions: Tensor, target_actions: Tensor, mask: Tensor | None = None) -> Tensor:
|
|
|
|
| 114 |
def reveal_state_loss(pred: dict[str, Tensor], target: dict[str, Tensor], weights: LossWeights) -> dict[str, Tensor]:
|
| 115 |
losses = {}
|
| 116 |
if "phase_logits" in pred:
|
| 117 |
+
if "phase" in target:
|
| 118 |
+
phase_target = target["phase"].long()
|
|
|
|
| 119 |
else:
|
| 120 |
+
action_chunk = target.get("action_chunk")
|
| 121 |
+
if action_chunk is not None:
|
| 122 |
+
phase_target = infer_phase_targets_from_actions(action_chunk[:, 0])
|
| 123 |
+
else:
|
| 124 |
+
phase_map = torch.as_tensor([2, 3, 0], device=target["support_mode"].device, dtype=torch.long)
|
| 125 |
+
phase_target = phase_map[target["support_mode"].long()]
|
| 126 |
losses["phase"] = F.cross_entropy(pred["phase_logits"], phase_target)
|
| 127 |
else:
|
| 128 |
losses["phase"] = pred["support_mode_logits"].new_tensor(0.0)
|
|
|
|
| 194 |
losses["uncertainty"] = pred["persistence_uncertainty"].mean()
|
| 195 |
else:
|
| 196 |
losses["uncertainty"] = pred["support_mode_logits"].new_tensor(0.0)
|
| 197 |
+
task_metric_pairs = (
|
| 198 |
+
"opening_quality",
|
| 199 |
+
"actor_feasibility_score",
|
| 200 |
+
"gap_width",
|
| 201 |
+
"damage_proxy",
|
| 202 |
+
"release_collapse_rate",
|
| 203 |
+
"target_visibility_confidence",
|
| 204 |
+
"mouth_aperture",
|
| 205 |
+
"hold_quality",
|
| 206 |
+
"rim_slip_risk",
|
| 207 |
+
"insertable_actor_corridor",
|
| 208 |
+
"layer_separation_quality",
|
| 209 |
+
"fold_preservation",
|
| 210 |
+
"insertion_corridor",
|
| 211 |
+
"top_layer_stability",
|
| 212 |
+
"lift_too_much_risk",
|
| 213 |
+
)
|
| 214 |
+
task_losses = [
|
| 215 |
+
F.mse_loss(pred[key].float(), target[key].float())
|
| 216 |
+
for key in task_metric_pairs
|
| 217 |
+
if key in pred and key in target
|
| 218 |
+
]
|
| 219 |
+
losses["task_metrics"] = (
|
| 220 |
+
torch.stack(task_losses).mean()
|
| 221 |
+
if task_losses
|
| 222 |
+
else pred["support_mode_logits"].new_tensor(0.0)
|
| 223 |
+
)
|
| 224 |
return losses
|
| 225 |
|
| 226 |
|
|
|
|
| 252 |
"disturbance_cost": _expand_target(target_rollout["disturbance_cost"][..., :horizon]),
|
| 253 |
"action_chunk": _expand_target(target_rollout["action_chunk"][..., :horizon, :]),
|
| 254 |
}
|
| 255 |
+
if "phase" in target_rollout:
|
| 256 |
+
target_rollout["phase"] = _expand_target(target_rollout["phase"][..., :horizon])
|
| 257 |
loss = (
|
| 258 |
F.cross_entropy(
|
| 259 |
pred_rollout["support_mode_logits"].reshape(-1, pred_rollout["support_mode_logits"].shape[-1]),
|
|
|
|
| 267 |
+ F.mse_loss(pred_rollout["disturbance_cost"], target_rollout["disturbance_cost"].float())
|
| 268 |
)
|
| 269 |
if "phase_logits" in pred_rollout:
|
| 270 |
+
phase_target = target_rollout.get("phase")
|
| 271 |
+
if phase_target is None:
|
| 272 |
+
phase_target = infer_phase_targets_from_actions(target_rollout["action_chunk"])
|
| 273 |
loss = loss + 0.5 * F.cross_entropy(
|
| 274 |
pred_rollout["phase_logits"].reshape(-1, pred_rollout["phase_logits"].shape[-1]),
|
| 275 |
phase_target.reshape(-1),
|
|
|
|
| 335 |
+ weights.occluder_contact * reveal_losses["occluder_contact"]
|
| 336 |
+ weights.grasp_affordance * reveal_losses["grasp_affordance"]
|
| 337 |
+ weights.reocclusion * reveal_losses["reocclusion"]
|
| 338 |
+
+ weights.task_metrics * reveal_losses["task_metrics"]
|
| 339 |
+ 0.01 * reveal_losses["uncertainty"]
|
| 340 |
)
|
| 341 |
|
|
|
|
| 350 |
"disturbance_cost": batch["candidate_rollout_disturbance_cost"],
|
| 351 |
"action_chunk": batch["candidate_action_chunks"],
|
| 352 |
}
|
| 353 |
+
if "candidate_rollout_phase" in batch:
|
| 354 |
+
rollout_target["phase"] = batch["candidate_rollout_phase"]
|
| 355 |
for optional_key in (
|
| 356 |
"candidate_rollout_belief_map",
|
| 357 |
"candidate_rollout_visibility_map",
|
|
|
|
| 382 |
"disturbance_cost": batch["rollout_disturbance_cost"],
|
| 383 |
"action_chunk": batch["action_chunk"],
|
| 384 |
}
|
| 385 |
+
if "rollout_phase" in batch:
|
| 386 |
+
rollout_target["phase"] = batch["rollout_phase"]
|
| 387 |
for optional_key in (
|
| 388 |
"rollout_belief_map",
|
| 389 |
"rollout_visibility_map",
|
results/2026-03-25-runpod/README.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 2026-03-25 Runpod Raw Index
|
| 2 |
+
|
| 3 |
+
This directory contains the source handoff instructions, raw report files copied from `/workspace/reports`, and a raw index for the artifacts produced in the `2026-03-25 UTC` session.
|
| 4 |
+
|
| 5 |
+
## Source Handoff
|
| 6 |
+
|
| 7 |
+
- `instructions.md`
|
| 8 |
+
|
| 9 |
+
## Test Suite
|
| 10 |
+
|
| 11 |
+
- Command:
|
| 12 |
+
- `PYTHONPATH=/workspace/VLAarchtests_work/code/reveal_vla_bimanual python -m pytest -q /workspace/VLAarchtests_work/tests`
|
| 13 |
+
- Result:
|
| 14 |
+
- `33 passed`
|
| 15 |
+
|
| 16 |
+
## Generated Datasets
|
| 17 |
+
|
| 18 |
+
| Path | Size (bytes) |
|
| 19 |
+
| --- | ---: |
|
| 20 |
+
| `artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_phase.pt` | 583377508 |
|
| 21 |
+
| `artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_phase.pt` | 200844508 |
|
| 22 |
+
|
| 23 |
+
## Generated Checkpoints And Training Summaries
|
| 24 |
+
|
| 25 |
+
| Directory | final_train_total | final_val_total | train_time_sec | peak_gpu_memory_mb |
|
| 26 |
+
| --- | ---: | ---: | ---: | ---: |
|
| 27 |
+
| `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_seed17/` | 0.382780 | 0.372276 | 108.922691 | 2451.385742 |
|
| 28 |
+
| `artifacts/outputs/r3d_handoff/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_seed17/` | 0.518003 | 0.503869 | 163.313406 | 2924.821777 |
|
| 29 |
+
| `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_seed17/` | 0.385303 | 0.378928 | 128.965583 | 2450.287109 |
|
| 30 |
+
| `artifacts/outputs/r3d_handoff_phase/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase_seed17/` | 0.525366 | 0.507625 | 154.841441 | 2926.074707 |
|
| 31 |
+
|
| 32 |
+
## Proxy Result Files
|
| 33 |
+
|
| 34 |
+
### Serious Comparisons
|
| 35 |
+
|
| 36 |
+
| File | Reference mean success | Compared mean success | Compared foliage | Compared bag | Compared cloth |
|
| 37 |
+
| --- | ---: | ---: | ---: | ---: | ---: |
|
| 38 |
+
| `reports/reveal_handoff_compare_serious/reveal_benchmark.json` | 0.583333 | 0.216667 | 0.330000 | 0.150000 | 0.170000 |
|
| 39 |
+
| `reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.520000 | 0.660000 | 0.320000 | 0.580000 |
|
| 40 |
+
| `reports/reveal_phase_compare_serious_compact/reveal_benchmark.json` | 0.583333 | 0.513333 | 0.570000 | 0.420000 | 0.550000 |
|
| 41 |
+
| `reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json` | 0.583333 | 0.493333 | 0.640000 | 0.330000 | 0.510000 |
|
| 42 |
+
|
| 43 |
+
### Compact-Phase Ablation Matrix
|
| 44 |
+
|
| 45 |
+
| Ablation | mean_success | visibility_integral | reocclusion_rate | disturbance_cost |
|
| 46 |
+
| --- | ---: | ---: | ---: | ---: |
|
| 47 |
+
| `full_model` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
|
| 48 |
+
| `no_geometry` | 0.513333 | 39.983892 | 0.000000 | 0.343637 |
|
| 49 |
+
| `no_spatial_memory` | 0.496667 | 37.758093 | 0.002417 | 0.417673 |
|
| 50 |
+
| `compact_world_model` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
|
| 51 |
+
| `no_planner` | 0.433333 | 20.634101 | 0.019708 | 0.185775 |
|
| 52 |
+
| `gaussian_candidates_only` | 0.466667 | 16.719086 | 0.029561 | 0.477573 |
|
| 53 |
+
| `no_task_head` | 0.513333 | 38.128876 | 0.000000 | 0.344494 |
|
| 54 |
+
| `no_support_mode_conditioning` | 0.513333 | 39.978670 | 0.000000 | 0.343669 |
|
| 55 |
+
|
| 56 |
+
Files:
|
| 57 |
+
|
| 58 |
+
- `reports/reveal_phase_ablations_compact/ablations.json`
|
| 59 |
+
- `reports/reveal_phase_ablations_compact/ablations.md`
|
| 60 |
+
- `reports/reveal_phase_ablations_compact/ablations.partial.json`
|
| 61 |
+
- `reports/reveal_phase_ablations_spatial/ablations.partial.json`
|
| 62 |
+
|
| 63 |
+
### Teacher Audit
|
| 64 |
+
|
| 65 |
+
| Proxy | Baseline | teacher_success | baseline_success | success_delta | teacher_utility | baseline_utility | utility_delta |
|
| 66 |
+
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
| 67 |
+
| `foliage_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | 0.140528 | 1.057904 |
|
| 68 |
+
| `foliage_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | -0.099185 | 1.297617 |
|
| 69 |
+
| `foliage_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | 0.084153 | 1.114280 |
|
| 70 |
+
| `foliage_proxy` | `random` | 1.000000 | 0.000000 | 1.000000 | 1.198432 | -0.138381 | 1.336814 |
|
| 71 |
+
| `bag_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | 0.167617 | 1.028974 |
|
| 72 |
+
| `bag_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | -0.161481 | 1.358072 |
|
| 73 |
+
| `bag_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | 0.078455 | 1.118136 |
|
| 74 |
+
| `bag_proxy` | `random` | 1.000000 | 0.000000 | 1.000000 | 1.196591 | -0.181732 | 1.378323 |
|
| 75 |
+
| `cloth_proxy` | `reveal_only` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.347192 | 0.929249 |
|
| 76 |
+
| `cloth_proxy` | `retrieve_only` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.001142 | 1.275299 |
|
| 77 |
+
| `cloth_proxy` | `no_hold` | 1.000000 | 0.000000 | 1.000000 | 1.276440 | 0.507900 | 0.768540 |
|
| 78 |
+
| `cloth_proxy` | `random` | 1.000000 | 0.010000 | 0.990000 | 1.276440 | 0.166110 | 1.110330 |
|
| 79 |
+
|
| 80 |
+
Files:
|
| 81 |
+
|
| 82 |
+
- `reports/reveal_teacher_audit_serious/teacher_audit.json`
|
| 83 |
+
- `reports/reveal_teacher_audit_serious/teacher_audit.md`
|
| 84 |
+
|
| 85 |
+
### Additional Proxy Report Files
|
| 86 |
+
|
| 87 |
+
- `reports/reveal_smoke_mod/reveal_benchmark.json`
|
| 88 |
+
- `reports/reveal_smoke_nogeom/reveal_benchmark.json`
|
| 89 |
+
- `reports/reveal_smoke_noplanner/reveal_benchmark.json`
|
| 90 |
+
- `reports/reveal_handoff_compact_probe/reveal_benchmark.json`
|
| 91 |
+
- `reports/reveal_handoff_compact_train_probe/reveal_benchmark.json`
|
| 92 |
+
- `reports/reveal_phase_probe_compact/reveal_benchmark.json`
|
| 93 |
+
- `reports/reveal_phase_probe_spatial/reveal_benchmark.json`
|
| 94 |
+
- `reports/reveal_phase_probe_spatial_compactwm/reveal_benchmark.json`
|
| 95 |
+
|
| 96 |
+
## RLBench Result Files
|
| 97 |
+
|
| 98 |
+
### Full-Split PerAct2 Rollout Outputs
|
| 99 |
+
|
| 100 |
+
| File | plan_requested | plan_applied | mean_success |
|
| 101 |
+
| --- | --- | --- | ---: |
|
| 102 |
+
| `reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/rollout_eval.json` | `true` | `true` | 0.000000 |
|
| 103 |
+
| `reports/peract2_spatial_full_ep1/spatial_phase_seed17_noplan_split/rollout_eval.json` | `false` | `false` | 0.000000 |
|
| 104 |
+
| `reports/peract2_spatial_full_ep1/spatial_phase_seed17_plan_split/rollout_eval.json` | `true` | `true` | 0.000000 |
|
| 105 |
+
|
| 106 |
+
### Single-Task RLBench Debug Outputs
|
| 107 |
+
|
| 108 |
+
- `reports/rlbench_debug_baseline_pushbox/rollout_eval.json`
|
| 109 |
+
- `reports/rlbench_debug_compact_pushbox/rollout_eval.json`
|
| 110 |
+
- `reports/rlbench_debug_spatial_pushbox_nogeom/rollout_eval.json`
|
| 111 |
+
|
| 112 |
+
## Environment Recreation Files
|
| 113 |
+
|
| 114 |
+
- `environment/README.md`
|
| 115 |
+
- `environment/setup_same_machine.sh`
|
| 116 |
+
- `environment/validate_same_machine.sh`
|
| 117 |
+
- `environment/runtime_env_vars.sh`
|
| 118 |
+
- `environment/hardware_snapshot.txt`
|
| 119 |
+
- `environment/glxinfo_B.txt`
|
| 120 |
+
- `environment/upstream_revisions.txt`
|
| 121 |
+
- `environment/system_packages_same_machine.txt`
|
| 122 |
+
- `environment/rlbench_env_export.yaml`
|
| 123 |
+
- `environment/rlbench_env_explicit.txt`
|
| 124 |
+
- `environment/rlbench_pip_freeze.txt`
|
results/2026-03-25-runpod/instructions.md
ADDED
|
@@ -0,0 +1,717 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Developer handoff: structured bimanual reveal-and-retrieve under elastic occlusion
|
| 2 |
+
|
| 3 |
+
Repo target: `lsnu/VLAarchtests` (current `main`, latest post-fix state). This handoff is written against the current `elastic_reveal` stack, not the older intermediate variants.
|
| 4 |
+
|
| 5 |
+
## 1. Project introduction
|
| 6 |
+
|
| 7 |
+
This project is a structured bimanual policy stack for reveal-and-retrieve tasks under partial observability and deformable or elastic occlusion. The eventual real-world targets are three Dobot X-trainer environments. The first is dense live foliage with hidden fake snails, where one arm must create and maintain a canopy gap while the other arm retrieves the target safely. The second is bag opening and retrieval, where one arm must open and hold the bag mouth while the other arm retrieves the target item. The third is suitcase or folded-cloth retrieval, where one arm must slightly lift and stabilize clothing layers while the other arm retrieves a hidden item without destroying the fold structure.
|
| 8 |
+
|
| 9 |
+
The current repo already contains the right broad decomposition for this task family. It has a multi-view visual backbone, RGB-D support, an explicit reveal state head, observation memory, a compact world model, a coordinated bimanual action decoder, and a planner. The problem is not the structural idea. The problem is that several important pieces are only partially wired, too compact, or only validated on teacher-shaped proxy data. The current code is a good scaffold. It is not yet strong enough to justify “beats SOTA” claims on either public benchmarks or the three target task families.
|
| 10 |
+
|
| 11 |
+
The current public evidence should be read narrowly. The most credible positive result in the repo is that RGB-D helps on the proxy benchmark. The planner, world model, and role-symmetry components are not yet validated strongly enough to claim they are the source of the gains. The RLBench / PerAct2 integration is also still mostly a launch and plumbing layer, not a mature benchmark suite.
|
| 12 |
+
|
| 13 |
+
This handoff therefore has one purpose. Keep the structured reveal-and-retrieve idea, but harden the architecture and evaluation until there is a realistic chance of beating strong bimanual baselines on the three target environments.
|
| 14 |
+
|
| 15 |
+
## 2. Current repo status (what exists, what is missing)
|
| 16 |
+
|
| 17 |
+
The current core files are:
|
| 18 |
+
|
| 19 |
+
`code/reveal_vla_bimanual/models/backbones.py`
|
| 20 |
+
`code/reveal_vla_bimanual/models/multiview_fusion.py`
|
| 21 |
+
`code/reveal_vla_bimanual/models/observation_memory.py`
|
| 22 |
+
`code/reveal_vla_bimanual/models/reveal_head.py`
|
| 23 |
+
`code/reveal_vla_bimanual/models/world_model.py`
|
| 24 |
+
`code/reveal_vla_bimanual/models/action_decoder.py`
|
| 25 |
+
`code/reveal_vla_bimanual/models/planner.py`
|
| 26 |
+
`code/reveal_vla_bimanual/models/policy.py`
|
| 27 |
+
`code/reveal_vla_bimanual/train/losses.py`
|
| 28 |
+
`code/reveal_vla_bimanual/sim_reveal/dataset.py`
|
| 29 |
+
`code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
|
| 30 |
+
`code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
|
| 31 |
+
`code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py`
|
| 32 |
+
`code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py`
|
| 33 |
+
|
| 34 |
+
The current proxy benchmark already uses the correct three abstract task types (`foliage`, `bag`, `cloth`). That is good. The current dataset code also has explicit no-leak assertions, which is also good.
|
| 35 |
+
|
| 36 |
+
The current weaknesses are specific and fixable.
|
| 37 |
+
|
| 38 |
+
First, the geometry path is only partially wired. The backbone produces `depth_tokens`, `geometry_tokens`, and `camera_tokens`, but the policy only forwards RGB, depth, and camera tokens into fusion. The explicit `geometry_tokens` are dropped before fusion. In addition, camera geometry is incomplete. The current depth adapter encodes intrinsics and camera translation, but not an equally explicit camera rotation representation. For three-camera reveal tasks this is a real omission.
|
| 39 |
+
|
| 40 |
+
Second, memory is too pooled and too global. The current memory path reduces scene history to pooled tokens before write decisions and bank updates. That is a novelty-gated summary memory. It is not a spatial occlusion memory. That is not enough for “hold the opening”, “the target is still probably behind this flap”, or “reveal progress will collapse if the revealer arm releases now”.
|
| 41 |
+
|
| 42 |
+
Third, the world model is too compact. It is useful as a scaffold, but not as the state-transition core for elastic foliage, bag apertures, or layered cloth. It currently rolls a compact hidden state rather than a spatial field state. That makes it too weak for counterfactual planning over opening persistence, reocclusion, and safe actor insertion.
|
| 43 |
+
|
| 44 |
+
Fourth, the planner is not trained on hard enough candidates. The current proxy data generation uses the teacher chunk and mostly Gaussian perturbations around it. That is enough to test ranking near a teacher, but not enough to teach the planner the actual failure modes that matter in these tasks (premature retrieval, releasing the opening, over-disturbing the scene, lifting the wrong cloth edge, etc.).
|
| 45 |
+
|
| 46 |
+
Fifth, the state head is still too generic. It predicts a useful set of reveal-related fields, but it does not yet expose the right task-specific latent variables for foliage, bag, and folded cloth. Those tasks are not the same. They share the same reveal-and-retrieve pattern, but they do not share the same dominant failure modes.
|
| 47 |
+
|
| 48 |
+
Sixth, the test suite is mostly contract-level. Those tests are useful, but they do not yet prove that the structured components work behaviorally. The RLBench side is similar. The launch smoke is only a plumbing check. The actual rollout evaluator exists, but it needs to become the main public benchmark path.
|
| 49 |
+
|
| 50 |
+
## 3. The main design decision
|
| 51 |
+
|
| 52 |
+
Do not collapse this into a generic monolithic VLA. That is not the likely win condition for these tasks.
|
| 53 |
+
|
| 54 |
+
The highest-probability path is a stronger visual backbone plus an explicit structured reveal-and-retrieve stack. The reason is simple. Your target tasks are asymmetric, partially observable, persistence-sensitive, and reocclusion-sensitive. One arm often has to create and maintain a temporary affordance that only exists because of that arm’s continued state. Generic end-to-end BC can sometimes imitate the behavior, but these tasks strongly reward explicit representations of opening quality, hold persistence, target belief, reocclusion risk, and actor feasibility.
|
| 55 |
+
|
| 56 |
+
The structured architecture should stay. It should just become spatial, task-aware, and evaluated honestly.
|
| 57 |
+
|
| 58 |
+
## 4. Mandatory code changes
|
| 59 |
+
|
| 60 |
+
### 4.1 Fix and strengthen the geometry path
|
| 61 |
+
|
| 62 |
+
Files to change:
|
| 63 |
+
|
| 64 |
+
`models/backbones.py`
|
| 65 |
+
`models/multiview_fusion.py`
|
| 66 |
+
`models/policy.py`
|
| 67 |
+
`tests/test_rgbd_forward_contract.py` (extend)
|
| 68 |
+
Add new tests: `tests/test_geometry_tokens_propagate.py`, `tests/test_camera_rotation_geometry.py`
|
| 69 |
+
|
| 70 |
+
Exact changes:
|
| 71 |
+
|
| 72 |
+
In `models/policy.py`, update the image encoding path so that `geometry_tokens` are passed from `backbone.encode_images(..., return_aux=True)` into the fusion module. Right now the policy forwards `rgb_tokens`, `depth_tokens`, and `camera_tokens`, but not `geometry_tokens`. This should be corrected first because it is an actual information-drop bug.
|
| 73 |
+
|
| 74 |
+
In `models/multiview_fusion.py`, update the fusion interface to accept explicit `geometry_tokens`. The geometry attention path should fuse from a real concatenation or gated combination of `[depth_tokens, geometry_tokens, camera_tokens]`, rather than synthesizing “geometry” only from the surviving depth and camera paths. Keep the existing gated cross-attention pattern, but make the geometry path explicit and inspectable.
|
| 75 |
+
|
| 76 |
+
In `models/backbones.py`, upgrade `DepthPatchAdapter` so that geometry features include camera orientation. Use a 6D rotation representation or a normalized quaternion plus translation. Also add per-patch viewing ray directions derived from intrinsics and camera pose. The three target environments all rely on view geometry and persistent multi-view correspondence. The current translation-only pose treatment is too weak.
|
| 77 |
+
|
| 78 |
+
Add config flags that actually do something. The current `use_camera_geometry` style config needs to gate a real path, not just exist as a dormant option. Add separate switches for `use_depth_tokens`, `use_geometry_tokens`, and `use_camera_pose_tokens` so ablations are clean.
|
| 79 |
+
|
| 80 |
+
Why this matters: the foliage and bag tasks are especially sensitive to camera geometry because small apparent gaps can be fake from one viewpoint and usable from another. The actor feasibility estimate should depend on geometry, not just appearance.
|
| 81 |
+
|
| 82 |
+
### 4.2 Replace pooled novelty memory with spatial reveal memory
|
| 83 |
+
|
| 84 |
+
Files to change:
|
| 85 |
+
|
| 86 |
+
`models/observation_memory.py`
|
| 87 |
+
`models/policy.py`
|
| 88 |
+
`models/reveal_head.py`
|
| 89 |
+
Add new tests: `tests/test_spatial_memory_occlusion_persistence.py`, `tests/test_memory_slot_write_gating.py`, `tests/test_reocclusion_memory_regression.py`
|
| 90 |
+
|
| 91 |
+
Exact changes:
|
| 92 |
+
|
| 93 |
+
Keep the current memory modules as a fallback baseline, but add a new default path that stores low-resolution spatial memory instead of only pooled history summaries. The simplest realistic version is a two-branch memory:
|
| 94 |
+
|
| 95 |
+
1. scene memory: a small bank of view-conditioned or canonicalized spatial tokens for persistent geometry and support structure;
|
| 96 |
+
2. belief memory: a spatial target-belief / reveal-state memory that carries uncertainty explicitly.
|
| 97 |
+
|
| 98 |
+
The memory does not need to be large. An 8×8 or 12×12 field token grid per view (or a shared canonical field) is enough. The key requirement is that the write gate becomes spatial or slot-wise, not global only. The model must be able to update “the mouth is open here” without overwriting “the target is probably still here”.
|
| 99 |
+
|
| 100 |
+
Add explicit channels or latent heads for:
|
| 101 |
+
- newly revealed regions
|
| 102 |
+
- still-visible regions
|
| 103 |
+
- reoccluded regions
|
| 104 |
+
- persistent hold or opening quality
|
| 105 |
+
- target belief uncertainty
|
| 106 |
+
|
| 107 |
+
The world model and planner should consume this spatial memory directly. Do not average it away before planning.
|
| 108 |
+
|
| 109 |
+
Why this matters: a reveal-and-retrieve policy that forgets where the useful opening was, or where the hidden object probably still is, will look competent in one-step imitation and fail in multi-step retrieval.
|
| 110 |
+
|
| 111 |
+
### 4.3 Replace the compact world model with a spatial rollout model
|
| 112 |
+
|
| 113 |
+
Files to change:
|
| 114 |
+
|
| 115 |
+
`models/world_model.py`
|
| 116 |
+
`models/policy.py`
|
| 117 |
+
`train/losses.py`
|
| 118 |
+
Add new tests: `tests/test_world_model_null_rollout.py`, `tests/test_world_model_identity_rollout.py`, `tests/test_world_model_field_consistency.py`, `tests/test_world_model_task_adapter.py`
|
| 119 |
+
|
| 120 |
+
Exact changes:
|
| 121 |
+
|
| 122 |
+
Keep the current compact GRU world model only as an ablation. The default model should become a spatial latent rollout over field tokens or low-resolution maps. A realistic implementation is a ConvGRU or a token-wise recurrent transformer over a low-resolution field state. The world-model state should contain at least:
|
| 123 |
+
|
| 124 |
+
- target belief field
|
| 125 |
+
- visibility or reveal field
|
| 126 |
+
- actor feasibility / corridor field
|
| 127 |
+
- opening quality or hold quality field
|
| 128 |
+
- persistence field
|
| 129 |
+
- disturbance / damage risk field
|
| 130 |
+
- reocclusion risk field
|
| 131 |
+
- support stability field
|
| 132 |
+
|
| 133 |
+
Add task conditioning directly into the world model. A learned task embedding (`foliage`, `bag`, `cloth`) should modulate the transition. The dynamics are not the same and should not be forced into one unstructured transition model.
|
| 134 |
+
|
| 135 |
+
Retain explicit ablation modes inside `models/world_model.py`:
|
| 136 |
+
- `identity_rollout`
|
| 137 |
+
- `null_rollout`
|
| 138 |
+
- `compact_rollout` (the current baseline)
|
| 139 |
+
- `spatial_rollout` (new default)
|
| 140 |
+
|
| 141 |
+
These ablations must be real and deterministic. The world-model ablation confusion in the current repo shows why this needs to be explicit and unit-tested.
|
| 142 |
+
|
| 143 |
+
Why this matters: the planner can only beat a simple decoder if its counterfactual rollouts capture persistence and collapse. Without a spatial world model, the “maintain opening while actor advances” pattern will be under-modeled.
|
| 144 |
+
|
| 145 |
+
### 4.4 Make the reveal head task-aware
|
| 146 |
+
|
| 147 |
+
Files to change:
|
| 148 |
+
|
| 149 |
+
`models/reveal_head.py`
|
| 150 |
+
`train/losses.py`
|
| 151 |
+
`sim_reveal/dataset.py`
|
| 152 |
+
`sim_reveal/procedural_envs.py`
|
| 153 |
+
Add new tests: `tests/test_task_conditioned_head_shapes.py`, `tests/test_task_metric_monotonicity.py`
|
| 154 |
+
|
| 155 |
+
Exact changes:
|
| 156 |
+
|
| 157 |
+
Add a task embedding to the reveal head. Keep the shared trunk, but use task-specific adapters or low-rank heads for the final outputs. The head should still produce common fields, but each task must also expose the state variables that actually matter.
|
| 158 |
+
|
| 159 |
+
For foliage, add:
|
| 160 |
+
- gap width or reveal corridor width
|
| 161 |
+
- canopy strain / damage risk
|
| 162 |
+
- occluder return tendency (reocclusion after release)
|
| 163 |
+
- target visibility confidence under flexible occluders
|
| 164 |
+
|
| 165 |
+
For bag, add:
|
| 166 |
+
- mouth aperture width or area
|
| 167 |
+
- rim endpoint or rim grasp quality
|
| 168 |
+
- hold quality
|
| 169 |
+
- rim slip risk
|
| 170 |
+
- insertable actor corridor
|
| 171 |
+
|
| 172 |
+
For cloth or suitcase, add:
|
| 173 |
+
- layer separation quality
|
| 174 |
+
- fold-preservation score
|
| 175 |
+
- insertion corridor
|
| 176 |
+
- top-layer stability
|
| 177 |
+
- “lift too much” risk
|
| 178 |
+
|
| 179 |
+
The current generic fields (`actor_feasibility_field`, `persistence_field`, `risk_field`, `uncertainty_field`, `reocclusion`) are useful, but they are not enough. The planner needs the task-specific variables because the right action for bag opening is not the right action for layered cloth.
|
| 180 |
+
|
| 181 |
+
### 4.5 Replace Gaussian candidate noise with semantic macro candidates plus continuous refinement
|
| 182 |
+
|
| 183 |
+
Files to change:
|
| 184 |
+
|
| 185 |
+
`models/action_decoder.py`
|
| 186 |
+
`models/planner.py`
|
| 187 |
+
`sim_reveal/dataset.py`
|
| 188 |
+
`sim_reveal/procedural_envs.py`
|
| 189 |
+
Add new tests: `tests/test_candidate_macro_coverage.py`, `tests/test_planner_reocclusion_gating.py`, `tests/test_proposal_semantic_diversity.py`
|
| 190 |
+
|
| 191 |
+
Exact changes:
|
| 192 |
+
|
| 193 |
+
Keep the current proposal mechanism as a fallback. The default candidate set should become a set of semantic macro modes, each refined by continuous deltas.
|
| 194 |
+
|
| 195 |
+
The candidate vocabulary should be task-aware.
|
| 196 |
+
|
| 197 |
+
For foliage:
|
| 198 |
+
- `sweep_left`
|
| 199 |
+
- `sweep_right`
|
| 200 |
+
- `pin_canopy`
|
| 201 |
+
- `widen_gap`
|
| 202 |
+
- `maintain_gap`
|
| 203 |
+
- `insert_actor`
|
| 204 |
+
- `retrieve`
|
| 205 |
+
|
| 206 |
+
For bag:
|
| 207 |
+
- `pin_left_rim`
|
| 208 |
+
- `pin_right_rim`
|
| 209 |
+
- `widen_mouth`
|
| 210 |
+
- `maintain_mouth`
|
| 211 |
+
- `probe_inside`
|
| 212 |
+
- `insert_actor`
|
| 213 |
+
- `retrieve`
|
| 214 |
+
|
| 215 |
+
For cloth:
|
| 216 |
+
- `lift_edge`
|
| 217 |
+
- `separate_layer`
|
| 218 |
+
- `stabilize_fold`
|
| 219 |
+
- `maintain_lift`
|
| 220 |
+
- `insert_actor`
|
| 221 |
+
- `retrieve`
|
| 222 |
+
|
| 223 |
+
Represent these as discrete proposal tokens or a macro head in `action_decoder.py`, then produce continuous chunk deltas conditioned on the chosen macro. The planner should shortlist across macro families first and refine within each family second. That prevents “all candidates are tiny perturbations around the same wrong idea”.
|
| 224 |
+
|
| 225 |
+
In `models/planner.py`, add hard feasibility gates before utility aggregation. Do not let the planner prefer “retrieve now” if actor feasibility, hold quality, or support stability are below threshold. Use worst-step or CVaR-style penalties for reocclusion and collapse, rather than only mean penalties. These tasks fail on bad tails, not just on averages.
|
| 226 |
+
|
| 227 |
+
Why this matters: the current planner is too dependent on easy local ranking. Real reveal-and-retrieve requires semantically different plans, not just slightly different noise vectors.
|
| 228 |
+
|
| 229 |
+
### 4.6 Change the loss stack to supervise what actually matters
|
| 230 |
+
|
| 231 |
+
Files to change:
|
| 232 |
+
|
| 233 |
+
`train/losses.py`
|
| 234 |
+
`train/trainer.py` (if needed for logging)
|
| 235 |
+
Add new tests: `tests/test_candidate_ranking_loss.py`, `tests/test_phase_labels_not_action_only.py`, `tests/test_planner_gradient_flow.py`
|
| 236 |
+
|
| 237 |
+
Exact changes:
|
| 238 |
+
|
| 239 |
+
Reduce dependence on heuristic phase labels inferred from the current action chunk. That heuristic is acceptable for early bootstrapping, but it should not remain the main source of phase supervision. Prefer simulator-side phase or subgoal labels where available. If those are not reliable, phase should be a weak auxiliary, not a strong driver.
|
| 240 |
+
|
| 241 |
+
Add pairwise or listwise ranking loss over candidate action chunks using actual rollout utility labels. These labels should come from simulated outcomes, not just from “teacher is first, noise is worse”.
|
| 242 |
+
|
| 243 |
+
Add consistency losses:
|
| 244 |
+
- predicted opening quality should correlate with rollout persistence
|
| 245 |
+
- predicted reocclusion should correlate with actual collapse after release
|
| 246 |
+
- predicted uncertainty should be calibrated against outcome uncertainty or visibility error
|
| 247 |
+
|
| 248 |
+
Lower the relative weight of pure behavior cloning once ranking and rollout supervision are reliable. This project should not stay as BC-with-many-auxiliaries.
|
| 249 |
+
|
| 250 |
+
## 5. Mandatory data-generation changes
|
| 251 |
+
|
| 252 |
+
Files to change:
|
| 253 |
+
|
| 254 |
+
`sim_reveal/dataset.py`
|
| 255 |
+
`sim_reveal/procedural_envs.py`
|
| 256 |
+
Add new tests: `tests/test_dataset_hard_negative_presence.py`, `tests/test_no_leak_with_new_labels.py`, `tests/test_teacher_audit.py`
|
| 257 |
+
|
| 258 |
+
Exact changes:
|
| 259 |
+
|
| 260 |
+
The dataset generation path must stop relying on teacher-plus-Gaussian-noise as the dominant source of planner candidates. Keep the teacher as one source, but add hard negative families that reflect actual task failures.
|
| 261 |
+
|
| 262 |
+
Required negative families for all three tasks:
|
| 263 |
+
|
| 264 |
+
1. premature retrieve: actor attempts retrieval before corridor and hold quality are sufficient;
|
| 265 |
+
2. reveal-with-release: revealer creates an opening but fails to maintain it;
|
| 266 |
+
3. over-disturbance: revealer opens aggressively but causes collapse or damage risk;
|
| 267 |
+
4. wrong-side or wrong-edge reveal: the opening is created in a useless place;
|
| 268 |
+
5. delayed actor entry: revealer holds too long and wastes time or destabilizes the scene;
|
| 269 |
+
6. actor path through weak corridor: actor enters where access exists visually but not safely.
|
| 270 |
+
|
| 271 |
+
Required task-specific negative families:
|
| 272 |
+
|
| 273 |
+
For foliage:
|
| 274 |
+
- swipe that increases visibility briefly but induces immediate reocclusion;
|
| 275 |
+
- push direction that hides the target from the actor side;
|
| 276 |
+
- gap on the wrong side of the target.
|
| 277 |
+
|
| 278 |
+
For bag:
|
| 279 |
+
- one-rim lift that slips instead of widening the mouth;
|
| 280 |
+
- opening wide enough visually but not stable enough for actor insertion;
|
| 281 |
+
- actor reaches through the fabric instead of through the aperture.
|
| 282 |
+
|
| 283 |
+
For cloth:
|
| 284 |
+
- lift too high and destroy fold structure;
|
| 285 |
+
- lift the wrong layer;
|
| 286 |
+
- retrieve path that drags clothing and unfolds the stack.
|
| 287 |
+
|
| 288 |
+
The dataset should record candidate-level rollout outcomes for every candidate chunk:
|
| 289 |
+
- success
|
| 290 |
+
- reveal achieved
|
| 291 |
+
- visibility AUC
|
| 292 |
+
- hold persistence
|
| 293 |
+
- reocclusion rate
|
| 294 |
+
- disturbance cost
|
| 295 |
+
- fold-preservation (cloth)
|
| 296 |
+
- mouth aperture / hold quality (bag)
|
| 297 |
+
- damage proxy / gap width (foliage)
|
| 298 |
+
|
| 299 |
+
This candidate-level outcome table should be the source of planner labels.
|
| 300 |
+
|
| 301 |
+
Also add a teacher audit report. The current teacher is a useful bootstrap, but it is not enough to assume it is good. The audit should compare the teacher against reveal-only, retrieve-only, no-hold, and random policy baselines on the current proxy suite.
|
| 302 |
+
|
| 303 |
+
## 6. Small but mandatory engineering cleanups
|
| 304 |
+
|
| 305 |
+
These changes do not change model quality directly, but they reduce evaluation ambiguity and future regressions.
|
| 306 |
+
|
| 307 |
+
In `tests/conftest.py`, remove the hardcoded `/workspace/VLAarchtests/code/reveal_vla_bimanual` path. Replace it with a path derived from `Path(__file__).resolve()` so tests run anywhere.
|
| 308 |
+
|
| 309 |
+
In `eval/run_rlbench_rollout_eval.py`, preserve richer episode traces. Save chosen macro mode, planner scores, confidence, predicted reocclusion, path recoveries, noop fallbacks, and whether support-mode conditioning was enabled.
|
| 310 |
+
|
| 311 |
+
In `eval/run_reveal_benchmark.py`, stop using only the default 24 episodes for serious comparisons. Keep 24 as a smoke benchmark, but add a “serious” mode at 100 or 200 episodes per proxy.
|
| 312 |
+
|
| 313 |
+
In `eval/run_reveal_benchmark.py`, explicitly report `chunk_commit_steps` and do not leave the main reveal benchmark at a commit horizon of zero by default. These tasks are not purely one-step reactive.
|
| 314 |
+
|
| 315 |
+
In the eval reporting utilities, add bootstrap confidence intervals and paired-seed comparisons. The differences you care about are often a few percentage points. Unpaired noisy comparisons are not enough.
|
| 316 |
+
|
| 317 |
+
## 7. Exact new tests to verify the implementation
|
| 318 |
+
|
| 319 |
+
The current repo has contract tests. Keep them. Add the following behavioral tests.
|
| 320 |
+
|
| 321 |
+
### 7.1 Geometry and fusion tests
|
| 322 |
+
|
| 323 |
+
`tests/test_geometry_tokens_propagate.py`
|
| 324 |
+
|
| 325 |
+
Construct a tiny batch with fixed RGB and depth. Modify only camera rotation. Verify that:
|
| 326 |
+
1. `geometry_tokens` change,
|
| 327 |
+
2. the fused scene representation changes when geometry is enabled,
|
| 328 |
+
3. the fused scene representation does not change when geometry is disabled.
|
| 329 |
+
|
| 330 |
+
`tests/test_camera_rotation_geometry.py`
|
| 331 |
+
|
| 332 |
+
Use two cameras with identical translation and different rotation. Verify that the policy representation is rotation-sensitive after the geometry fix. This should fail on the current code and pass after the change.
|
| 333 |
+
|
| 334 |
+
### 7.2 Spatial memory tests
|
| 335 |
+
|
| 336 |
+
`tests/test_spatial_memory_occlusion_persistence.py`
|
| 337 |
+
|
| 338 |
+
Use a scripted proxy sequence where the target is briefly visible, then fully occluded, then visible again. Verify that belief memory retains a localized target belief during occlusion and sharpens it after reappearance. This should test both persistence and uncertainty.
|
| 339 |
+
|
| 340 |
+
`tests/test_memory_slot_write_gating.py`
|
| 341 |
+
|
| 342 |
+
Feed a scene where only the opening region changes. Verify that only a minority of memory slots or cells update. This prevents global overwriting.
|
| 343 |
+
|
| 344 |
+
`tests/test_reocclusion_memory_regression.py`
|
| 345 |
+
|
| 346 |
+
Create a scripted “open then release” sequence. Verify that memory tracks reocclusion and that predicted hold quality declines.
|
| 347 |
+
|
| 348 |
+
### 7.3 World-model tests
|
| 349 |
+
|
| 350 |
+
`tests/test_world_model_null_rollout.py`
|
| 351 |
+
|
| 352 |
+
Assert that `null_rollout` returns an exact or near-exact identity state and does not apply unintended updates.
|
| 353 |
+
|
| 354 |
+
`tests/test_world_model_identity_rollout.py`
|
| 355 |
+
|
| 356 |
+
Assert that `identity_rollout` preserves state across steps while leaving logging fields consistent.
|
| 357 |
+
|
| 358 |
+
`tests/test_world_model_field_consistency.py`
|
| 359 |
+
|
| 360 |
+
Roll out one deterministic proxy step and compare predicted next-step fields against simulator privileged fields. Enforce MAE thresholds per field, not only a single scalar.
|
| 361 |
+
|
| 362 |
+
`tests/test_world_model_task_adapter.py`
|
| 363 |
+
|
| 364 |
+
Use the same initial field state with different task embeddings. Verify that transitions differ in a consistent way. This catches dead task-conditioning code paths.
|
| 365 |
+
|
| 366 |
+
### 7.4 Candidate and planner tests
|
| 367 |
+
|
| 368 |
+
`tests/test_candidate_macro_coverage.py`
|
| 369 |
+
|
| 370 |
+
Verify that the proposal generator returns at least one candidate from each required macro family when requested.
|
| 371 |
+
|
| 372 |
+
`tests/test_planner_reocclusion_gating.py`
|
| 373 |
+
|
| 374 |
+
Create a scripted case where one candidate retrieves immediately but causes opening collapse, and another candidate maintains the opening first. Verify that the planner picks the maintain-first plan.
|
| 375 |
+
|
| 376 |
+
`tests/test_proposal_semantic_diversity.py`
|
| 377 |
+
|
| 378 |
+
Do not measure diversity only by vector distance. Also verify macro-family diversity and rollout outcome diversity.
|
| 379 |
+
|
| 380 |
+
### 7.5 Task-head tests
|
| 381 |
+
|
| 382 |
+
`tests/test_task_conditioned_head_shapes.py`
|
| 383 |
+
|
| 384 |
+
Verify output presence and shapes for all common fields and all task-specific fields.
|
| 385 |
+
|
| 386 |
+
`tests/test_task_metric_monotonicity.py`
|
| 387 |
+
|
| 388 |
+
Use small synthetic perturbations:
|
| 389 |
+
- increase aperture in bag: `opening_quality` should increase;
|
| 390 |
+
- increase canopy gap in foliage: `actor_feasibility` should increase;
|
| 391 |
+
- over-lift cloth: `fold_preservation` should decrease.
|
| 392 |
+
|
| 393 |
+
These are not full scientific tests, but they catch dead or miswired heads quickly.
|
| 394 |
+
|
| 395 |
+
### 7.6 Dataset and leakage tests
|
| 396 |
+
|
| 397 |
+
`tests/test_dataset_hard_negative_presence.py`
|
| 398 |
+
|
| 399 |
+
Sample dataset items and verify that candidate sets contain hard negative families, not just teacher-centered noise.
|
| 400 |
+
|
| 401 |
+
`tests/test_no_leak_with_new_labels.py`
|
| 402 |
+
|
| 403 |
+
Extend the no-leak assertions to cover all new task-specific labels and maps. The proxy dataset must keep using rendered observations only on the input side.
|
| 404 |
+
|
| 405 |
+
`tests/test_teacher_audit.py`
|
| 406 |
+
|
| 407 |
+
Require the teacher to beat random, retrieve-only, and reveal-only on the proxy metrics. If the teacher itself is weak, the whole planner training signal is questionable.
|
| 408 |
+
|
| 409 |
+
### 7.7 Scripted proxy behavior suite
|
| 410 |
+
|
| 411 |
+
Add a new deterministic behavioral test suite, for example under `tests/test_proxy_scripted_bench.py`.
|
| 412 |
+
|
| 413 |
+
This suite should include 10 to 20 deterministic seeds per task with hand-designed initial states. The expected winner should be obvious.
|
| 414 |
+
|
| 415 |
+
Required scripted cases:
|
| 416 |
+
- bag: `maintain_mouth` should beat an immediate `retrieve` on hold persistence and success;
|
| 417 |
+
- foliage: `pin_canopy` should beat `random_swipe` on reocclusion and visibility AUC;
|
| 418 |
+
- cloth: `stabilize_fold` should beat `lift_high` on fold-preservation and success.
|
| 419 |
+
|
| 420 |
+
The full model does not need to be perfect on these, but the planner should select the intended candidate at least 80 percent of the time.
|
| 421 |
+
|
| 422 |
+
## 8. Exact benchmark plan to estimate performance
|
| 423 |
+
|
| 424 |
+
Separate the benchmarks into two layers. The first layer verifies that the implementation behaves correctly. The second estimates real performance against baselines.
|
| 425 |
+
|
| 426 |
+
### 8.1 Layer A: implementation-verification benchmarks
|
| 427 |
+
|
| 428 |
+
These are not publication benchmarks. They are gates.
|
| 429 |
+
|
| 430 |
+
Run the full unit and integration suite after every architecture milestone:
|
| 431 |
+
|
| 432 |
+
```bash
|
| 433 |
+
PYTHONPATH=code/reveal_vla_bimanual pytest tests -q
|
| 434 |
+
```
|
| 435 |
+
|
| 436 |
+
After the new behavioral tests are added, require all of the following before moving on:
|
| 437 |
+
- all geometry propagation tests pass;
|
| 438 |
+
- the scripted proxy suite passes;
|
| 439 |
+
- world-model null and identity ablations pass exactly;
|
| 440 |
+
- candidate macro coverage passes;
|
| 441 |
+
- no-leak assertions pass with new task fields.
|
| 442 |
+
|
| 443 |
+
Then run a deterministic proxy smoke benchmark on fixed seeds (for example 10 per task) to catch obvious regressions:
|
| 444 |
+
|
| 445 |
+
```bash
|
| 446 |
+
cd code/reveal_vla_bimanual
|
| 447 |
+
python -m eval.run_reveal_benchmark \
|
| 448 |
+
--model full=/abs/path/checkpoint.pt \
|
| 449 |
+
--episodes 10 \
|
| 450 |
+
--proxies foliage bag cloth \
|
| 451 |
+
--chunk-commit-steps 4 \
|
| 452 |
+
--output-root /abs/path/reports/reveal_smoke
|
| 453 |
+
```
|
| 454 |
+
|
| 455 |
+
This benchmark is only for regression detection. It is not a performance claim.
|
| 456 |
+
|
| 457 |
+
### 8.2 Layer B: strengthened proxy benchmark (main task-aligned benchmark now)
|
| 458 |
+
|
| 459 |
+
This should become the main internal benchmark until real teleop data exists.
|
| 460 |
+
|
| 461 |
+
Use the existing `foliage`, `bag`, and `cloth` proxies, but strengthen them and evaluate seriously:
|
| 462 |
+
- at least 100 deterministic seeds per proxy for final comparisons;
|
| 463 |
+
- paired-seed evaluation across all ablations;
|
| 464 |
+
- chunk commit horizons of at least 4, and also report one sweep over commit horizons 0, 2, and 4;
|
| 465 |
+
- no teacher involvement during evaluation.
|
| 466 |
+
|
| 467 |
+
Run the base benchmark:
|
| 468 |
+
|
| 469 |
+
```bash
|
| 470 |
+
cd code/reveal_vla_bimanual
|
| 471 |
+
python -m eval.run_reveal_benchmark \
|
| 472 |
+
--model full=/abs/path/checkpoint.pt \
|
| 473 |
+
--episodes 100 \
|
| 474 |
+
--proxies foliage bag cloth \
|
| 475 |
+
--chunk-commit-steps 4 \
|
| 476 |
+
--output-root /abs/path/reports/reveal_full
|
| 477 |
+
```
|
| 478 |
+
|
| 479 |
+
Run required paired ablations from the same checkpoint family or retrained checkpoints:
|
| 480 |
+
- no geometry tokens
|
| 481 |
+
- no spatial memory
|
| 482 |
+
- compact world model instead of spatial
|
| 483 |
+
- no planner
|
| 484 |
+
- planner with Gaussian candidates only
|
| 485 |
+
- no task-conditioned head
|
| 486 |
+
- no support-mode conditioning
|
| 487 |
+
|
| 488 |
+
The proxy benchmark must report at least these metrics:
|
| 489 |
+
- retrieve success
|
| 490 |
+
- reveal success
|
| 491 |
+
- target visibility AUC
|
| 492 |
+
- actor-feasibility AUC
|
| 493 |
+
- hold persistence
|
| 494 |
+
- reocclusion rate
|
| 495 |
+
- disturbance cost
|
| 496 |
+
- planner top-1 on candidate rollouts
|
| 497 |
+
- world-model next-step MAE
|
| 498 |
+
- uncertainty calibration
|
| 499 |
+
- candidate ranking NDCG
|
| 500 |
+
|
| 501 |
+
Add task-specific metrics:
|
| 502 |
+
- foliage: gap width, damage proxy, release-collapse rate
|
| 503 |
+
- bag: aperture width or area, rim slip rate, insertion success
|
| 504 |
+
- cloth: fold-preservation score, layer separation quality, drag-induced disturbance
|
| 505 |
+
|
| 506 |
+
Acceptance gate for continuing toward public baseline comparison:
|
| 507 |
+
- the full model should beat the current repo’s RGB-D baseline on mean proxy success and on at least two of the three proxies;
|
| 508 |
+
- planner-on should beat planner-off on at least two of the three proxies and on hard-negative candidate ranking;
|
| 509 |
+
- spatial world model should beat compact and null rollouts on persistence and reocclusion prediction;
|
| 510 |
+
- task-conditioned head should beat generic head on at least one task-specific metric per target task.
|
| 511 |
+
|
| 512 |
+
### 8.3 Layer C: RLBench / PerAct2 bimanual rollout benchmark
|
| 513 |
+
|
| 514 |
+
The repo already has the right hook for this. Use `run_rlbench_rollout_eval.py` and `run_peract2_task_sweep.py` as the main public benchmark entry points. Do not treat `run_peract2_launch_smoke.py` as evaluation. It is only a launch check.
|
| 515 |
+
|
| 516 |
+
Run the full existing PerAct2 13-task split from `sim_rlbench/task_splits.py::PERACT2_BIMANUAL_TASKS`:
|
| 517 |
+
|
| 518 |
+
```bash
|
| 519 |
+
cd code/reveal_vla_bimanual
|
| 520 |
+
python -m eval.run_peract2_task_sweep \
|
| 521 |
+
--checkpoint /abs/path/checkpoint.pt \
|
| 522 |
+
--output-root /abs/path/reports/peract2_13 \
|
| 523 |
+
--episodes-per-task 25 \
|
| 524 |
+
--episode-length 20 \
|
| 525 |
+
--resolution 224 \
|
| 526 |
+
--chunk-commit-steps 4 \
|
| 527 |
+
--allow-unsupervised-planning \
|
| 528 |
+
--headless
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
Also run direct single-task evaluations when debugging:
|
| 532 |
+
|
| 533 |
+
```bash
|
| 534 |
+
cd code/reveal_vla_bimanual
|
| 535 |
+
python -m eval.run_rlbench_rollout_eval \
|
| 536 |
+
--checkpoint /abs/path/checkpoint.pt \
|
| 537 |
+
--output-dir /abs/path/reports/rlbench_debug \
|
| 538 |
+
--tasks RightOpenDrawer \
|
| 539 |
+
--episodes-per-task 25 \
|
| 540 |
+
--episode-length 20 \
|
| 541 |
+
--resolution 224 \
|
| 542 |
+
--plan \
|
| 543 |
+
--chunk-commit-steps 4 \
|
| 544 |
+
--allow-unsupervised-planning \
|
| 545 |
+
--headless
|
| 546 |
+
```
|
| 547 |
+
|
| 548 |
+
This benchmark is not a direct match to the three target tasks, but it is the main public bimanual sanity check. It measures whether the structured modifications hurt or help general bimanual competence.
|
| 549 |
+
|
| 550 |
+
Required comparisons on this benchmark:
|
| 551 |
+
- current repo best checkpoint
|
| 552 |
+
- full improved model
|
| 553 |
+
- no-planner ablation
|
| 554 |
+
- compact world model ablation
|
| 555 |
+
- no geometry ablation
|
| 556 |
+
- no task-conditioning ablation
|
| 557 |
+
|
| 558 |
+
If external baseline code is available, evaluate against:
|
| 559 |
+
- PerAct2
|
| 560 |
+
- InterACT
|
| 561 |
+
- VoxAct-B
|
| 562 |
+
- AnyBimanual
|
| 563 |
+
|
| 564 |
+
If compute allows, also compare against foundation-scale baselines as a separate category:
|
| 565 |
+
- TwinVLA
|
| 566 |
+
- RDT-1B
|
| 567 |
+
|
| 568 |
+
Fairness requirements:
|
| 569 |
+
- same camera setup if possible (front plus both wrists);
|
| 570 |
+
- same resolution;
|
| 571 |
+
- same episode length and reset policy;
|
| 572 |
+
- same task list;
|
| 573 |
+
- same number of evaluation episodes;
|
| 574 |
+
- report whether baselines use extra large-scale pretraining.
|
| 575 |
+
|
| 576 |
+
This benchmark should report:
|
| 577 |
+
- per-task success
|
| 578 |
+
- mean success
|
| 579 |
+
- mean return
|
| 580 |
+
- path recoveries
|
| 581 |
+
- noop fallbacks
|
| 582 |
+
- plan-on vs plan-off
|
| 583 |
+
- per-episode planner traces for error analysis
|
| 584 |
+
|
| 585 |
+
### 8.4 Layer D: deformable-manipulation public benchmarks
|
| 586 |
+
|
| 587 |
+
You do not yet have custom teleop data, so the closest public matches for bag and cloth should be used now.
|
| 588 |
+
|
| 589 |
+
Recommended benchmarks:
|
| 590 |
+
- DeformableRavens
|
| 591 |
+
- SoftGym cloth tasks
|
| 592 |
+
- DaXBench cloth tasks
|
| 593 |
+
|
| 594 |
+
The exact subset should be chosen based on available tasks, but the mapping is straightforward. Bag-like opening and insertion tasks are the closest public proxy for the bag environment. Cloth lifting, separation, and manipulation tasks are the closest public proxy for the suitcase environment. There is no equally good public foliage benchmark, so the strengthened foliage proxy will remain the main stand-in until custom data exists.
|
| 595 |
+
|
| 596 |
+
Required evaluation protocol:
|
| 597 |
+
- same observation modalities across methods;
|
| 598 |
+
- same action horizon where possible;
|
| 599 |
+
- same random seeds;
|
| 600 |
+
- same episode budgets;
|
| 601 |
+
- report both success and task-specific deformation metrics.
|
| 602 |
+
|
| 603 |
+
Add at least these extra metrics on the deformable benchmarks:
|
| 604 |
+
- opening quality or aperture quality
|
| 605 |
+
- hold persistence under actor motion
|
| 606 |
+
- reocclusion or collapse rate
|
| 607 |
+
- disturbance cost
|
| 608 |
+
- fold-preservation or structural-preservation score
|
| 609 |
+
|
| 610 |
+
### 8.5 Layer E: optional exploratory / active-perception benchmark
|
| 611 |
+
|
| 612 |
+
If EFM-10 or BAP code and data are actually available when implementation starts, add them. That benchmark is conceptually close to your task family because it measures exploratory plus focused manipulation under occlusion. Do not block the project on it if code is not readily usable.
|
| 613 |
+
|
| 614 |
+
### 8.6 Layer F: optional broad generalization benchmark
|
| 615 |
+
|
| 616 |
+
If time allows, add RoboTwin 2.0 as a general bimanual breadth check. It is not a direct target-task match, but it is useful for checking whether the structured reveal-and-retrieve bias damages general bimanual transfer.
|
| 617 |
+
|
| 618 |
+
## 9. Baseline strategy
|
| 619 |
+
|
| 620 |
+
There are two baseline groups and they should not be mixed carelessly.
|
| 621 |
+
|
| 622 |
+
The first group is matched-data or matched-setting baselines. These are the most useful for fair engineering comparison. Use PerAct2, InterACT, VoxAct-B, and AnyBimanual if code is available in a compatible evaluation setting.
|
| 623 |
+
|
| 624 |
+
The second group is foundation-scale baselines. These are useful, but they are not apples-to-apples unless you disclose the pretraining and model scale difference clearly. Use TwinVLA and RDT-1B in this category if compute allows.
|
| 625 |
+
|
| 626 |
+
Do not declare victory because the improved model beats the current repo checkpoint. That is a necessary condition, not the target claim.
|
| 627 |
+
|
| 628 |
+
## 10. Acceptance criteria for “ready to collect real data”
|
| 629 |
+
|
| 630 |
+
Do not move into expensive teleop collection until all of the following are true.
|
| 631 |
+
|
| 632 |
+
First, the geometry and spatial memory tests pass and stay green for multiple checkpoints.
|
| 633 |
+
|
| 634 |
+
Second, the strengthened proxy benchmark shows that the full model beats the current repo baseline convincingly. The minimum bar should be improvement in overall proxy success plus improvement on at least two of the three task types.
|
| 635 |
+
|
| 636 |
+
Third, planner-on must beat planner-off on hard-negative ranking and on task success. If the planner does not beat the decoder baseline, then the explicit planning stack is not yet earning its complexity.
|
| 637 |
+
|
| 638 |
+
Fourth, the spatial world model must beat compact and null baselines on persistence and reocclusion prediction. If it does not, the planning story is still too weak.
|
| 639 |
+
|
| 640 |
+
Fifth, the improved model should at least match strong public baselines on the RLBench / PerAct2 suite, and ideally exceed them on the tasks most related to opening, holding, uncovering, and coordinated retrieval. If it is significantly behind there, the architecture is still too immature.
|
| 641 |
+
|
| 642 |
+
## 11. Recommended implementation order
|
| 643 |
+
|
| 644 |
+
Phase 1 should fix information flow and evaluation trustworthiness. Implement geometry propagation, camera orientation encoding, and path cleanup in `tests/conftest.py`. Then add the new geometry tests and rerun the current proxy benchmark.
|
| 645 |
+
|
| 646 |
+
Phase 2 should add task-aware semantic candidates and hard-negative data generation. This is the fastest path to making the planner meaningful without yet rewriting the full memory and world model stack.
|
| 647 |
+
|
| 648 |
+
Phase 3 should add task-conditioned reveal outputs and the strengthened proxy metrics. At this stage the proxy benchmark should start reflecting the real task failure modes.
|
| 649 |
+
|
| 650 |
+
Phase 4 should replace pooled memory and compact rollout with the new spatial memory and spatial world model. This is the biggest change and should only happen after the eval harness can tell whether it helped.
|
| 651 |
+
|
| 652 |
+
Phase 5 should run the full internal ablation suite, then RLBench / PerAct2, then deformable public benchmarks, and only then decide whether the architecture is strong enough to justify real-data collection.
|
| 653 |
+
|
| 654 |
+
## 12. What to avoid
|
| 655 |
+
|
| 656 |
+
Do not treat launch smoke as performance evaluation.
|
| 657 |
+
|
| 658 |
+
Do not keep teacher-centered Gaussian candidates as the main planner supervision source.
|
| 659 |
+
|
| 660 |
+
Do not remove task structure in favor of a generic monolithic BC model unless the structured architecture clearly fails. Nothing in the current repo proves that.
|
| 661 |
+
|
| 662 |
+
Do not use only mean success. These tasks need persistence, reocclusion, and structural-preservation metrics.
|
| 663 |
+
|
| 664 |
+
Do not claim the current planner or current world model are validated. They are not, yet.
|
| 665 |
+
|
| 666 |
+
## 13. Minimal first patch set (the first pull request)
|
| 667 |
+
|
| 668 |
+
If only one implementation sprint is possible before deeper refactors, the first pull request should contain exactly this:
|
| 669 |
+
|
| 670 |
+
1. fix `geometry_tokens` propagation from backbone to fusion to policy output;
|
| 671 |
+
2. add camera rotation encoding in `DepthPatchAdapter`;
|
| 672 |
+
3. add `tests/test_geometry_tokens_propagate.py` and `tests/test_camera_rotation_geometry.py`;
|
| 673 |
+
4. replace hardcoded path logic in `tests/conftest.py`;
|
| 674 |
+
5. extend `run_reveal_benchmark.py` reporting to save `chunk_commit_steps`, bootstrap confidence intervals, and paired-seed summaries;
|
| 675 |
+
6. add semantic macro candidates in `action_decoder.py` without yet deleting the Gaussian fallback;
|
| 676 |
+
7. add hard negative candidate generation in `sim_reveal/procedural_envs.py`;
|
| 677 |
+
8. add the deterministic scripted proxy benchmark suite.
|
| 678 |
+
|
| 679 |
+
This first patch set will not make the model SOTA. It will make the repo trustworthy enough to support the larger refactor.
|
| 680 |
+
|
| 681 |
+
## 14. Reference links
|
| 682 |
+
|
| 683 |
+
Repo root:
|
| 684 |
+
https://huggingface.co/lsnu/VLAarchtests/tree/main
|
| 685 |
+
|
| 686 |
+
Core files:
|
| 687 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/backbones.py
|
| 688 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/multiview_fusion.py
|
| 689 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/observation_memory.py
|
| 690 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/reveal_head.py
|
| 691 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/world_model.py
|
| 692 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/action_decoder.py
|
| 693 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/planner.py
|
| 694 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/models/policy.py
|
| 695 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/train/losses.py
|
| 696 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/sim_reveal/dataset.py
|
| 697 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/sim_reveal/procedural_envs.py
|
| 698 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_reveal_benchmark.py
|
| 699 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py
|
| 700 |
+
https://huggingface.co/lsnu/VLAarchtests/blob/main/code/reveal_vla_bimanual/eval/run_peract2_task_sweep.py
|
| 701 |
+
|
| 702 |
+
Public benchmark / baseline references to align against:
|
| 703 |
+
PerAct2 / RLBench2 bimanual benchmark: https://bimanual.github.io/
|
| 704 |
+
InterACT: https://dannyran123.github.io/interact/
|
| 705 |
+
VoxAct-B: https://voxact-b.github.io/
|
| 706 |
+
AnyBimanual: https://anybimanual.github.io/
|
| 707 |
+
TwinVLA: https://twinvla.github.io/
|
| 708 |
+
RDT-1B: https://rdt-robotics.github.io/rdt-robotics/
|
| 709 |
+
DeformableRavens: https://deformableravens.github.io/
|
| 710 |
+
SoftGym: https://sites.google.com/view/softgym/home
|
| 711 |
+
DaXBench: https://daxbench.github.io/
|
| 712 |
+
EFM / BAP: https://efmanipulation.github.io/
|
| 713 |
+
RoboTwin 2.0: https://robotwin-platform.github.io/
|
| 714 |
+
|
| 715 |
+
## 15. Final recommendation
|
| 716 |
+
|
| 717 |
+
The architecture should be pursued, but only in a narrower and more explicit form: task-structured bimanual reveal-and-retrieve under elastic occlusion. The current repo is close enough to that idea to be worth continuing. The most important next step is not collecting real data yet. It is making the geometry path real, making the planner learn from hard failure cases, and making the world model spatial enough that “maintain the opening while the other arm retrieves” is something the system can actually predict rather than merely imitate.
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/workspace/envs/rlbench/bin/python -m eval.run_rlbench_rollout_eval --checkpoint /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt --output-dir /workspace/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons --tasks bimanual_dual_push_buttons --episodes-per-task 1 --episode-length 20 --resolution 224 --device cuda --chunk-commit-steps 4 --headless --plan --allow-unsupervised-planning
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.json
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 3 |
+
"plan_requested": true,
|
| 4 |
+
"plan_applied": true,
|
| 5 |
+
"planner_mode": "trainable",
|
| 6 |
+
"support_mode_conditioning": true,
|
| 7 |
+
"task_conditioning": true,
|
| 8 |
+
"geometry_enabled": true,
|
| 9 |
+
"world_model_mode": "checkpoint_default",
|
| 10 |
+
"episodes_per_task": 1,
|
| 11 |
+
"episode_length": 20,
|
| 12 |
+
"resolution": 224,
|
| 13 |
+
"reset_retries": 20,
|
| 14 |
+
"cameras": [
|
| 15 |
+
"front",
|
| 16 |
+
"wrist_left",
|
| 17 |
+
"wrist_right"
|
| 18 |
+
],
|
| 19 |
+
"tasks": {
|
| 20 |
+
"bimanual_dual_push_buttons": {
|
| 21 |
+
"task_class": "BimanualDualPushButtons",
|
| 22 |
+
"successes": [
|
| 23 |
+
0.0
|
| 24 |
+
],
|
| 25 |
+
"returns": [
|
| 26 |
+
0.0
|
| 27 |
+
],
|
| 28 |
+
"path_recoveries": [
|
| 29 |
+
0
|
| 30 |
+
],
|
| 31 |
+
"noop_fallbacks": [
|
| 32 |
+
0
|
| 33 |
+
],
|
| 34 |
+
"reset_retries": [
|
| 35 |
+
0
|
| 36 |
+
],
|
| 37 |
+
"episode_traces": [
|
| 38 |
+
{
|
| 39 |
+
"language_goal": "push the olive and the orange buttons",
|
| 40 |
+
"steps": [
|
| 41 |
+
{
|
| 42 |
+
"timestep": 0,
|
| 43 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 44 |
+
"planner_scores": [
|
| 45 |
+
11.197153091430664,
|
| 46 |
+
11.241825103759766,
|
| 47 |
+
11.236907005310059,
|
| 48 |
+
11.205011367797852
|
| 49 |
+
],
|
| 50 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 51 |
+
"support_mode_conditioning": true,
|
| 52 |
+
"path_recoveries": 0,
|
| 53 |
+
"noop_fallbacks": 0
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"timestep": 1,
|
| 57 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 58 |
+
"planner_scores": [
|
| 59 |
+
11.197153091430664,
|
| 60 |
+
11.241825103759766,
|
| 61 |
+
11.236907005310059,
|
| 62 |
+
11.205011367797852
|
| 63 |
+
],
|
| 64 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 65 |
+
"support_mode_conditioning": true,
|
| 66 |
+
"path_recoveries": 0,
|
| 67 |
+
"noop_fallbacks": 0
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"timestep": 2,
|
| 71 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 72 |
+
"planner_scores": [
|
| 73 |
+
11.197153091430664,
|
| 74 |
+
11.241825103759766,
|
| 75 |
+
11.236907005310059,
|
| 76 |
+
11.205011367797852
|
| 77 |
+
],
|
| 78 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 79 |
+
"support_mode_conditioning": true,
|
| 80 |
+
"path_recoveries": 0,
|
| 81 |
+
"noop_fallbacks": 0
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"timestep": 3,
|
| 85 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 86 |
+
"planner_scores": [
|
| 87 |
+
11.197153091430664,
|
| 88 |
+
11.241825103759766,
|
| 89 |
+
11.236907005310059,
|
| 90 |
+
11.205011367797852
|
| 91 |
+
],
|
| 92 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 93 |
+
"support_mode_conditioning": true,
|
| 94 |
+
"path_recoveries": 0,
|
| 95 |
+
"noop_fallbacks": 0
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"timestep": 4,
|
| 99 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 100 |
+
"planner_scores": [
|
| 101 |
+
11.1263427734375,
|
| 102 |
+
11.163692474365234,
|
| 103 |
+
11.160633087158203,
|
| 104 |
+
11.130797386169434
|
| 105 |
+
],
|
| 106 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 107 |
+
"support_mode_conditioning": true,
|
| 108 |
+
"path_recoveries": 0,
|
| 109 |
+
"noop_fallbacks": 0
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"timestep": 5,
|
| 113 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 114 |
+
"planner_scores": [
|
| 115 |
+
11.1263427734375,
|
| 116 |
+
11.163692474365234,
|
| 117 |
+
11.160633087158203,
|
| 118 |
+
11.130797386169434
|
| 119 |
+
],
|
| 120 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 121 |
+
"support_mode_conditioning": true,
|
| 122 |
+
"path_recoveries": 0,
|
| 123 |
+
"noop_fallbacks": 0
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"timestep": 6,
|
| 127 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 128 |
+
"planner_scores": [
|
| 129 |
+
11.1263427734375,
|
| 130 |
+
11.163692474365234,
|
| 131 |
+
11.160633087158203,
|
| 132 |
+
11.130797386169434
|
| 133 |
+
],
|
| 134 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 135 |
+
"support_mode_conditioning": true,
|
| 136 |
+
"path_recoveries": 0,
|
| 137 |
+
"noop_fallbacks": 0
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"timestep": 7,
|
| 141 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 142 |
+
"planner_scores": [
|
| 143 |
+
11.1263427734375,
|
| 144 |
+
11.163692474365234,
|
| 145 |
+
11.160633087158203,
|
| 146 |
+
11.130797386169434
|
| 147 |
+
],
|
| 148 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 149 |
+
"support_mode_conditioning": true,
|
| 150 |
+
"path_recoveries": 0,
|
| 151 |
+
"noop_fallbacks": 0
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"timestep": 8,
|
| 155 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 156 |
+
"planner_scores": [
|
| 157 |
+
11.077136039733887,
|
| 158 |
+
11.114724159240723,
|
| 159 |
+
11.111690521240234,
|
| 160 |
+
11.081738471984863
|
| 161 |
+
],
|
| 162 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 163 |
+
"support_mode_conditioning": true,
|
| 164 |
+
"path_recoveries": 0,
|
| 165 |
+
"noop_fallbacks": 0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"timestep": 9,
|
| 169 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 170 |
+
"planner_scores": [
|
| 171 |
+
11.077136039733887,
|
| 172 |
+
11.114724159240723,
|
| 173 |
+
11.111690521240234,
|
| 174 |
+
11.081738471984863
|
| 175 |
+
],
|
| 176 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 177 |
+
"support_mode_conditioning": true,
|
| 178 |
+
"path_recoveries": 0,
|
| 179 |
+
"noop_fallbacks": 0
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"timestep": 10,
|
| 183 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 184 |
+
"planner_scores": [
|
| 185 |
+
11.077136039733887,
|
| 186 |
+
11.114724159240723,
|
| 187 |
+
11.111690521240234,
|
| 188 |
+
11.081738471984863
|
| 189 |
+
],
|
| 190 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 191 |
+
"support_mode_conditioning": true,
|
| 192 |
+
"path_recoveries": 0,
|
| 193 |
+
"noop_fallbacks": 0
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"timestep": 11,
|
| 197 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 198 |
+
"planner_scores": [
|
| 199 |
+
11.077136039733887,
|
| 200 |
+
11.114724159240723,
|
| 201 |
+
11.111690521240234,
|
| 202 |
+
11.081738471984863
|
| 203 |
+
],
|
| 204 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 205 |
+
"support_mode_conditioning": true,
|
| 206 |
+
"path_recoveries": 0,
|
| 207 |
+
"noop_fallbacks": 0
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"timestep": 12,
|
| 211 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 212 |
+
"planner_scores": [
|
| 213 |
+
11.042268753051758,
|
| 214 |
+
11.08004379272461,
|
| 215 |
+
11.07697868347168,
|
| 216 |
+
11.046956062316895
|
| 217 |
+
],
|
| 218 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 219 |
+
"support_mode_conditioning": true,
|
| 220 |
+
"path_recoveries": 0,
|
| 221 |
+
"noop_fallbacks": 0
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"timestep": 13,
|
| 225 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 226 |
+
"planner_scores": [
|
| 227 |
+
11.042268753051758,
|
| 228 |
+
11.08004379272461,
|
| 229 |
+
11.07697868347168,
|
| 230 |
+
11.046956062316895
|
| 231 |
+
],
|
| 232 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 233 |
+
"support_mode_conditioning": true,
|
| 234 |
+
"path_recoveries": 0,
|
| 235 |
+
"noop_fallbacks": 0
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"timestep": 14,
|
| 239 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 240 |
+
"planner_scores": [
|
| 241 |
+
11.042268753051758,
|
| 242 |
+
11.08004379272461,
|
| 243 |
+
11.07697868347168,
|
| 244 |
+
11.046956062316895
|
| 245 |
+
],
|
| 246 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 247 |
+
"support_mode_conditioning": true,
|
| 248 |
+
"path_recoveries": 0,
|
| 249 |
+
"noop_fallbacks": 0
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"timestep": 15,
|
| 253 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 254 |
+
"planner_scores": [
|
| 255 |
+
11.042268753051758,
|
| 256 |
+
11.08004379272461,
|
| 257 |
+
11.07697868347168,
|
| 258 |
+
11.046956062316895
|
| 259 |
+
],
|
| 260 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 261 |
+
"support_mode_conditioning": true,
|
| 262 |
+
"path_recoveries": 0,
|
| 263 |
+
"noop_fallbacks": 0
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"timestep": 16,
|
| 267 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 268 |
+
"planner_scores": [
|
| 269 |
+
11.03925895690918,
|
| 270 |
+
11.076944351196289,
|
| 271 |
+
11.073898315429688,
|
| 272 |
+
11.043900489807129
|
| 273 |
+
],
|
| 274 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 275 |
+
"support_mode_conditioning": true,
|
| 276 |
+
"path_recoveries": 0,
|
| 277 |
+
"noop_fallbacks": 0
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"timestep": 17,
|
| 281 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 282 |
+
"planner_scores": [
|
| 283 |
+
11.03925895690918,
|
| 284 |
+
11.076944351196289,
|
| 285 |
+
11.073898315429688,
|
| 286 |
+
11.043900489807129
|
| 287 |
+
],
|
| 288 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 289 |
+
"support_mode_conditioning": true,
|
| 290 |
+
"path_recoveries": 0,
|
| 291 |
+
"noop_fallbacks": 0
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"timestep": 18,
|
| 295 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 296 |
+
"planner_scores": [
|
| 297 |
+
11.03925895690918,
|
| 298 |
+
11.076944351196289,
|
| 299 |
+
11.073898315429688,
|
| 300 |
+
11.043900489807129
|
| 301 |
+
],
|
| 302 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 303 |
+
"support_mode_conditioning": true,
|
| 304 |
+
"path_recoveries": 0,
|
| 305 |
+
"noop_fallbacks": 0
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"timestep": 19,
|
| 309 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 310 |
+
"planner_scores": [
|
| 311 |
+
11.03925895690918,
|
| 312 |
+
11.076944351196289,
|
| 313 |
+
11.073898315429688,
|
| 314 |
+
11.043900489807129
|
| 315 |
+
],
|
| 316 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 317 |
+
"support_mode_conditioning": true,
|
| 318 |
+
"path_recoveries": 0,
|
| 319 |
+
"noop_fallbacks": 0
|
| 320 |
+
}
|
| 321 |
+
],
|
| 322 |
+
"success": 0.0,
|
| 323 |
+
"return": 0.0,
|
| 324 |
+
"path_recoveries": 0,
|
| 325 |
+
"noop_fallbacks": 0
|
| 326 |
+
}
|
| 327 |
+
],
|
| 328 |
+
"mean_success": 0.0,
|
| 329 |
+
"mean_return": 0.0
|
| 330 |
+
}
|
| 331 |
+
},
|
| 332 |
+
"mean_success": 0.0
|
| 333 |
+
}
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/rollout_eval.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RLBench Rollout Eval
|
| 2 |
+
|
| 3 |
+
- Checkpoint: `/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt`
|
| 4 |
+
- Plan requested: `True`
|
| 5 |
+
- Plan applied: `True`
|
| 6 |
+
- Support-mode conditioning: `True`
|
| 7 |
+
- Task conditioning: `True`
|
| 8 |
+
- Geometry enabled: `True`
|
| 9 |
+
- World-model mode: `checkpoint_default`
|
| 10 |
+
- Mean success: `0.000`
|
| 11 |
+
|
| 12 |
+
## Per-task
|
| 13 |
+
|
| 14 |
+
- `bimanual_dual_push_buttons`: mean_success=0.000, returns=[0.0]
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stderr.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/workspace/envs/rlbench/lib/python3.10/site-packages/torch/nn/modules/transformer.py:306: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
|
| 2 |
+
warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
|
| 3 |
+
qt.qpa.xcb: QXcbConnection: XCB error: 148 (Unknown), sequence: 181, resource id: 0, major code: 140 (Unknown), minor code: 20
|
| 4 |
+
WARNING:root:not sure how _robot_shapes are used is used.
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_dual_push_buttons/stdout.txt
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt",
|
| 3 |
+
"plan_requested": true,
|
| 4 |
+
"plan_applied": true,
|
| 5 |
+
"planner_mode": "trainable",
|
| 6 |
+
"support_mode_conditioning": true,
|
| 7 |
+
"task_conditioning": true,
|
| 8 |
+
"geometry_enabled": true,
|
| 9 |
+
"world_model_mode": "checkpoint_default",
|
| 10 |
+
"episodes_per_task": 1,
|
| 11 |
+
"episode_length": 20,
|
| 12 |
+
"resolution": 224,
|
| 13 |
+
"reset_retries": 20,
|
| 14 |
+
"cameras": [
|
| 15 |
+
"front",
|
| 16 |
+
"wrist_left",
|
| 17 |
+
"wrist_right"
|
| 18 |
+
],
|
| 19 |
+
"tasks": {
|
| 20 |
+
"bimanual_dual_push_buttons": {
|
| 21 |
+
"task_class": "BimanualDualPushButtons",
|
| 22 |
+
"successes": [
|
| 23 |
+
0.0
|
| 24 |
+
],
|
| 25 |
+
"returns": [
|
| 26 |
+
0.0
|
| 27 |
+
],
|
| 28 |
+
"path_recoveries": [
|
| 29 |
+
0
|
| 30 |
+
],
|
| 31 |
+
"noop_fallbacks": [
|
| 32 |
+
0
|
| 33 |
+
],
|
| 34 |
+
"reset_retries": [
|
| 35 |
+
0
|
| 36 |
+
],
|
| 37 |
+
"episode_traces": [
|
| 38 |
+
{
|
| 39 |
+
"language_goal": "push the olive and the orange buttons",
|
| 40 |
+
"steps": [
|
| 41 |
+
{
|
| 42 |
+
"timestep": 0,
|
| 43 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 44 |
+
"planner_scores": [
|
| 45 |
+
11.197153091430664,
|
| 46 |
+
11.241825103759766,
|
| 47 |
+
11.236907005310059,
|
| 48 |
+
11.205011367797852
|
| 49 |
+
],
|
| 50 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 51 |
+
"support_mode_conditioning": true,
|
| 52 |
+
"path_recoveries": 0,
|
| 53 |
+
"noop_fallbacks": 0
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"timestep": 1,
|
| 57 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 58 |
+
"planner_scores": [
|
| 59 |
+
11.197153091430664,
|
| 60 |
+
11.241825103759766,
|
| 61 |
+
11.236907005310059,
|
| 62 |
+
11.205011367797852
|
| 63 |
+
],
|
| 64 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 65 |
+
"support_mode_conditioning": true,
|
| 66 |
+
"path_recoveries": 0,
|
| 67 |
+
"noop_fallbacks": 0
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"timestep": 2,
|
| 71 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 72 |
+
"planner_scores": [
|
| 73 |
+
11.197153091430664,
|
| 74 |
+
11.241825103759766,
|
| 75 |
+
11.236907005310059,
|
| 76 |
+
11.205011367797852
|
| 77 |
+
],
|
| 78 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 79 |
+
"support_mode_conditioning": true,
|
| 80 |
+
"path_recoveries": 0,
|
| 81 |
+
"noop_fallbacks": 0
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"timestep": 3,
|
| 85 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 86 |
+
"planner_scores": [
|
| 87 |
+
11.197153091430664,
|
| 88 |
+
11.241825103759766,
|
| 89 |
+
11.236907005310059,
|
| 90 |
+
11.205011367797852
|
| 91 |
+
],
|
| 92 |
+
"predicted_reocclusion": 0.5305227041244507,
|
| 93 |
+
"support_mode_conditioning": true,
|
| 94 |
+
"path_recoveries": 0,
|
| 95 |
+
"noop_fallbacks": 0
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"timestep": 4,
|
| 99 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 100 |
+
"planner_scores": [
|
| 101 |
+
11.1263427734375,
|
| 102 |
+
11.163692474365234,
|
| 103 |
+
11.160633087158203,
|
| 104 |
+
11.130797386169434
|
| 105 |
+
],
|
| 106 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 107 |
+
"support_mode_conditioning": true,
|
| 108 |
+
"path_recoveries": 0,
|
| 109 |
+
"noop_fallbacks": 0
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"timestep": 5,
|
| 113 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 114 |
+
"planner_scores": [
|
| 115 |
+
11.1263427734375,
|
| 116 |
+
11.163692474365234,
|
| 117 |
+
11.160633087158203,
|
| 118 |
+
11.130797386169434
|
| 119 |
+
],
|
| 120 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 121 |
+
"support_mode_conditioning": true,
|
| 122 |
+
"path_recoveries": 0,
|
| 123 |
+
"noop_fallbacks": 0
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"timestep": 6,
|
| 127 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 128 |
+
"planner_scores": [
|
| 129 |
+
11.1263427734375,
|
| 130 |
+
11.163692474365234,
|
| 131 |
+
11.160633087158203,
|
| 132 |
+
11.130797386169434
|
| 133 |
+
],
|
| 134 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 135 |
+
"support_mode_conditioning": true,
|
| 136 |
+
"path_recoveries": 0,
|
| 137 |
+
"noop_fallbacks": 0
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"timestep": 7,
|
| 141 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 142 |
+
"planner_scores": [
|
| 143 |
+
11.1263427734375,
|
| 144 |
+
11.163692474365234,
|
| 145 |
+
11.160633087158203,
|
| 146 |
+
11.130797386169434
|
| 147 |
+
],
|
| 148 |
+
"predicted_reocclusion": 0.5315501689910889,
|
| 149 |
+
"support_mode_conditioning": true,
|
| 150 |
+
"path_recoveries": 0,
|
| 151 |
+
"noop_fallbacks": 0
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"timestep": 8,
|
| 155 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 156 |
+
"planner_scores": [
|
| 157 |
+
11.077136039733887,
|
| 158 |
+
11.114724159240723,
|
| 159 |
+
11.111690521240234,
|
| 160 |
+
11.081738471984863
|
| 161 |
+
],
|
| 162 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 163 |
+
"support_mode_conditioning": true,
|
| 164 |
+
"path_recoveries": 0,
|
| 165 |
+
"noop_fallbacks": 0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"timestep": 9,
|
| 169 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 170 |
+
"planner_scores": [
|
| 171 |
+
11.077136039733887,
|
| 172 |
+
11.114724159240723,
|
| 173 |
+
11.111690521240234,
|
| 174 |
+
11.081738471984863
|
| 175 |
+
],
|
| 176 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 177 |
+
"support_mode_conditioning": true,
|
| 178 |
+
"path_recoveries": 0,
|
| 179 |
+
"noop_fallbacks": 0
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"timestep": 10,
|
| 183 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 184 |
+
"planner_scores": [
|
| 185 |
+
11.077136039733887,
|
| 186 |
+
11.114724159240723,
|
| 187 |
+
11.111690521240234,
|
| 188 |
+
11.081738471984863
|
| 189 |
+
],
|
| 190 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 191 |
+
"support_mode_conditioning": true,
|
| 192 |
+
"path_recoveries": 0,
|
| 193 |
+
"noop_fallbacks": 0
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"timestep": 11,
|
| 197 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 198 |
+
"planner_scores": [
|
| 199 |
+
11.077136039733887,
|
| 200 |
+
11.114724159240723,
|
| 201 |
+
11.111690521240234,
|
| 202 |
+
11.081738471984863
|
| 203 |
+
],
|
| 204 |
+
"predicted_reocclusion": 0.5313586592674255,
|
| 205 |
+
"support_mode_conditioning": true,
|
| 206 |
+
"path_recoveries": 0,
|
| 207 |
+
"noop_fallbacks": 0
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"timestep": 12,
|
| 211 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 212 |
+
"planner_scores": [
|
| 213 |
+
11.042268753051758,
|
| 214 |
+
11.08004379272461,
|
| 215 |
+
11.07697868347168,
|
| 216 |
+
11.046956062316895
|
| 217 |
+
],
|
| 218 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 219 |
+
"support_mode_conditioning": true,
|
| 220 |
+
"path_recoveries": 0,
|
| 221 |
+
"noop_fallbacks": 0
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"timestep": 13,
|
| 225 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 226 |
+
"planner_scores": [
|
| 227 |
+
11.042268753051758,
|
| 228 |
+
11.08004379272461,
|
| 229 |
+
11.07697868347168,
|
| 230 |
+
11.046956062316895
|
| 231 |
+
],
|
| 232 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 233 |
+
"support_mode_conditioning": true,
|
| 234 |
+
"path_recoveries": 0,
|
| 235 |
+
"noop_fallbacks": 0
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"timestep": 14,
|
| 239 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 240 |
+
"planner_scores": [
|
| 241 |
+
11.042268753051758,
|
| 242 |
+
11.08004379272461,
|
| 243 |
+
11.07697868347168,
|
| 244 |
+
11.046956062316895
|
| 245 |
+
],
|
| 246 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 247 |
+
"support_mode_conditioning": true,
|
| 248 |
+
"path_recoveries": 0,
|
| 249 |
+
"noop_fallbacks": 0
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"timestep": 15,
|
| 253 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 254 |
+
"planner_scores": [
|
| 255 |
+
11.042268753051758,
|
| 256 |
+
11.08004379272461,
|
| 257 |
+
11.07697868347168,
|
| 258 |
+
11.046956062316895
|
| 259 |
+
],
|
| 260 |
+
"predicted_reocclusion": 0.5312807559967041,
|
| 261 |
+
"support_mode_conditioning": true,
|
| 262 |
+
"path_recoveries": 0,
|
| 263 |
+
"noop_fallbacks": 0
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"timestep": 16,
|
| 267 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 268 |
+
"planner_scores": [
|
| 269 |
+
11.03925895690918,
|
| 270 |
+
11.076944351196289,
|
| 271 |
+
11.073898315429688,
|
| 272 |
+
11.043900489807129
|
| 273 |
+
],
|
| 274 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 275 |
+
"support_mode_conditioning": true,
|
| 276 |
+
"path_recoveries": 0,
|
| 277 |
+
"noop_fallbacks": 0
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"timestep": 17,
|
| 281 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 282 |
+
"planner_scores": [
|
| 283 |
+
11.03925895690918,
|
| 284 |
+
11.076944351196289,
|
| 285 |
+
11.073898315429688,
|
| 286 |
+
11.043900489807129
|
| 287 |
+
],
|
| 288 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 289 |
+
"support_mode_conditioning": true,
|
| 290 |
+
"path_recoveries": 0,
|
| 291 |
+
"noop_fallbacks": 0
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"timestep": 18,
|
| 295 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 296 |
+
"planner_scores": [
|
| 297 |
+
11.03925895690918,
|
| 298 |
+
11.076944351196289,
|
| 299 |
+
11.073898315429688,
|
| 300 |
+
11.043900489807129
|
| 301 |
+
],
|
| 302 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 303 |
+
"support_mode_conditioning": true,
|
| 304 |
+
"path_recoveries": 0,
|
| 305 |
+
"noop_fallbacks": 0
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"timestep": 19,
|
| 309 |
+
"chosen_macro_mode": "lift_support_layer",
|
| 310 |
+
"planner_scores": [
|
| 311 |
+
11.03925895690918,
|
| 312 |
+
11.076944351196289,
|
| 313 |
+
11.073898315429688,
|
| 314 |
+
11.043900489807129
|
| 315 |
+
],
|
| 316 |
+
"predicted_reocclusion": 0.5312473773956299,
|
| 317 |
+
"support_mode_conditioning": true,
|
| 318 |
+
"path_recoveries": 0,
|
| 319 |
+
"noop_fallbacks": 0
|
| 320 |
+
}
|
| 321 |
+
],
|
| 322 |
+
"success": 0.0,
|
| 323 |
+
"return": 0.0,
|
| 324 |
+
"path_recoveries": 0,
|
| 325 |
+
"noop_fallbacks": 0
|
| 326 |
+
}
|
| 327 |
+
],
|
| 328 |
+
"mean_success": 0.0,
|
| 329 |
+
"mean_return": 0.0
|
| 330 |
+
}
|
| 331 |
+
},
|
| 332 |
+
"mean_success": 0.0
|
| 333 |
+
}
|
| 334 |
+
[CoppeliaSim:loadinfo] done.
|
results/2026-03-25-runpod/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/workspace/envs/rlbench/bin/python -m eval.run_rlbench_rollout_eval --checkpoint /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt --output-dir /workspace/reports/peract2_baseline_ep1/baseline_rgbd_seed17_plan_split/bimanual_handover_item --tasks bimanual_handover_item --episodes-per-task 1 --episode-length 20 --resolution 224 --device cuda --chunk-commit-steps 4 --headless --plan --allow-unsupervised-planning
|