lsnu committed on
Commit
f7a3ee4
·
verified ·
1 Parent(s): ccf25b1

Add files using upload-large-folder tool

Browse files
Files changed (33) hide show
  1. README.md +74 -1
  2. artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md +51 -0
  3. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt +2 -0
  4. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt +3 -0
  5. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt +32 -0
  6. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt +23 -0
  7. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt +223 -0
  8. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt +1 -0
  9. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt +82 -0
  10. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt +2 -0
  11. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log +9 -0
  12. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log +11 -0
  13. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log +15 -0
  14. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log +16 -0
  15. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log +15 -0
  16. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt +6 -0
  17. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log +52 -0
  18. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt +1 -0
  19. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt +1 -0
  20. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt +223 -0
  21. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt +1 -0
  22. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt +1 -0
  23. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt +1 -0
  24. openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt +3 -0
  25. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors +3 -0
  26. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt +3 -0
  27. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt +3 -0
  28. openpi/run_logs/split_independent_real_smoke20.log +0 -0
  29. openpi/run_logs/split_independent_real_smoke3.log +104 -0
  30. openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py +913 -0
  31. openpi/scripts/prune_stepcmp_checkpoints.py +53 -0
  32. openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh +558 -0
  33. run_logs/hf_upload_20260310.log +0 -0
README.md CHANGED
@@ -12,6 +12,7 @@ Three runs are included:
12
  1. an initial `2K` baseline-vs-parallel comparison
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
 
15
 
16
  This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
17
 
@@ -57,6 +58,41 @@ Dual-push `128` screening results:
57
 
58
  The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  ## Warm-start note
61
 
62
  The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
@@ -107,11 +143,43 @@ New bring-up artifact bundle:
107
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
108
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
109
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  - `artifacts/twin_split_expert_bringup_20260310/`
111
- - split-expert warm-start checkpoints, sanity checks, and bring-up repro commands
 
 
 
 
 
 
 
 
112
  - `artifacts/pi05_base_params/`
113
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
114
 
 
 
 
 
 
 
 
 
 
 
 
115
  ## Key files
116
 
117
  - Full report: `REPORT.md`
@@ -122,6 +190,11 @@ New bring-up artifact bundle:
122
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
123
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
124
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
 
 
 
 
 
125
  - split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
126
  - split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
127
  - split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
 
12
  1. an initial `2K` baseline-vs-parallel comparison
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
15
+ 4. a `2K` dual-push `128` four-way step comparison across `shared`, `head_only_parallel`, `split_independent`, and `split_communicating`
16
 
17
  This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
18
 
 
58
 
59
  The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
60
 
61
+ Dual-push `128` four-way `2K` step comparison raw results:
62
+
63
+ Step-0 teacher-forced masked validation loss:
64
+
65
+ | Model | Step-0 val loss | Step-0 left/right imbalance |
66
+ | --- | ---: | ---: |
67
+ | Shared | `1.084735` | `0.505345` |
68
+ | Head-only parallel | `1.082985` | `0.501182` |
69
+ | Split independent | `1.328262` | `0.448843` |
70
+ | Split communicating | `1.783048` | `0.671085` |
71
+
72
+ Step-2000 teacher-forced masked validation loss:
73
+
74
+ | Model | Step-2000 val loss | Step-2000 left/right imbalance |
75
+ | --- | ---: | ---: |
76
+ | Shared | `0.055329` | `0.069564` |
77
+ | Head-only parallel | `0.055297` | `0.069380` |
78
+ | Split independent | `0.063537` | `0.092029` |
79
+ | Split communicating | `0.059952` | `0.080435` |
80
+
81
+ Step-2000 sample masked MAE:
82
+
83
+ | Model | 1-step MAE | 4-step MAE | 16-step MAE |
84
+ | --- | ---: | ---: | ---: |
85
+ | Shared | `0.087330` | `0.078164` | `0.085222` |
86
+ | Head-only parallel | `0.086764` | `0.078301` | `0.085272` |
87
+ | Split independent | `0.079100` | `0.070436` | `0.075281` |
88
+ | Split communicating | `0.078618` | `0.071087` | `0.075570` |
89
+
90
+ Full raw tables for the `0/100/500/2000` sweep live in:
91
+
92
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
93
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
94
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
95
+
96
  ## Warm-start note
97
 
98
  The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
 
143
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
144
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
145
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
146
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
147
+ - dual-push `128` four-way `2K` step-comparison bundle with metrics, logs, repro manifests, and environment snapshot
148
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
149
+ - small preflight/debug snapshot from the interrupted bring-up path; useful for debugging the runner, not the canonical result bundle
150
+ - `artifacts/twin_split_expert_bringup_20260310/`
151
+ - split-expert bring-up bundle committed with summary README, repro commands, detached run logs, and sanity checks
152
+
153
+ ## Committed artifact note
154
+
155
+ For this update, the committed artifact payloads are:
156
+
157
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
158
+ - the official finalized `4`-model dual-push `2K` step-comparison bundle
159
  - `artifacts/twin_split_expert_bringup_20260310/`
160
+ - the split-expert bring-up bundle used as the sanity and warm-start reference
161
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
162
+ - a small debug-only environment snapshot from the failed/resumed bring-up sequence
163
+
164
+ The debug bundle is intentionally committed only as runner diagnostics. The canonical study outputs are the non-`_debug` step-comparison bundle plus the split bring-up bundle.
165
+ - `openpi/run_logs/`
166
+ - raw local split bring-up logs kept for completeness; the canonical copies for the finalized bring-up record live under `artifacts/twin_split_expert_bringup_20260310/run_logs/`
167
+ - `openpi/scripts/upload_stepcmp_bundle_to_hf.py`
168
+ - the committed high-throughput HF uploader for the step-comparison bundle and retained checkpoints; it uses `huggingface_hub.HfApi.upload_large_folder(...)`
169
  - `artifacts/pi05_base_params/`
170
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
171
 
172
+ ## Future commit/upload workflow
173
+
174
+ When adding new experiment results to this repo:
175
+
176
+ - keep the canonical bundle under `artifacts/<study_name>/` and only retain the checkpoint steps that are scientifically required under `openpi/checkpoints/`
177
+ - before claiming the repo is fully committed, audit ignored artifact paths explicitly:
178
+ - `git ls-files --others -i --exclude-standard --directory -- openpi/checkpoints artifacts openpi/run_logs run_logs`
179
+ - if a result is intentionally kept in an ignored path such as `openpi/checkpoints/` or `openpi/run_logs/`, force-add it explicitly with `git add --sparse -f ...`
180
+ - use `openpi/scripts/upload_stepcmp_bundle_to_hf.py` for large HF uploads; it uses `huggingface_hub.HfApi.upload_large_folder(...)` and is the preferred path for checkpoint-heavy updates
181
+ - never hardcode HF credentials in scripts, logs, or READMEs; keep the credential in `HF_TOKEN` or load it from `HF_TOKEN_FILE`, and check for literal `hf_...` strings before committing
182
+
183
  ## Key files
184
 
185
  - Full report: `REPORT.md`
 
190
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
191
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
192
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
193
+ - dual-push `2K` step-comparison summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/summary.json`
194
+ - dual-push `2K` step-comparison README: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md`
195
+ - dual-push `2K` teacher-forced table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
196
+ - dual-push `2K` sample eval table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
197
+ - dual-push `2K` training summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
198
  - split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
199
  - split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
200
  - split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # twin_dual_push_128_stepcmp_2k_20260311
2
+
3
+ Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.
4
+
5
+ ## Quick answers
6
+ - Smallest step-0 teacher-forced jump vs `shared`: `head_only_parallel` (`-0.001750`).
7
+ - Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `head_only_parallel` (`+0.000005`).
8
+ - Best teacher-forced result by step `2000`: `head_only_parallel`.
9
+ - Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `split_communicating`.
10
+ - Split vs head-only by step `2000`: teacher-forced beat flags `split_independent=False`, `split_communicating=False`; sample beat flags `split_independent=True`, `split_communicating=True`.
11
+ - `split_communicating` vs `split_independent` at `2000`: teacher delta `-0.003585`, sample-average delta `-0.000257`.
12
+
13
+ ## Step-0 teacher-forced comparison
14
+ | model | mean_val_loss | delta_vs_shared | left_right_imbalance |
15
+ | --- | --- | --- | --- |
16
+ | shared | 1.084735 | +0.000000 | 0.505345 |
17
+ | head_only_parallel | 1.082985 | -0.001750 | 0.501182 |
18
+ | split_independent | 1.328262 | +0.243527 | 0.448843 |
19
+ | split_communicating | 1.783048 | +0.698313 | 0.671085 |
20
+
21
+ ## Step-2000 comparison
22
+ | model | mean_val_loss | 0_to_2000_improvement | left_right_imbalance |
23
+ | --- | --- | --- | --- |
24
+ | shared | 0.055329 | 1.029406 | 0.069564 |
25
+ | head_only_parallel | 0.055297 | 1.027688 | 0.069380 |
26
+ | split_independent | 0.063537 | 1.264725 | 0.092029 |
27
+ | split_communicating | 0.059952 | 1.723096 | 0.080435 |
28
+
29
+ | model | 1-step_mae | 4-step_mae | 16-step_mae |
30
+ | --- | --- | --- | --- |
31
+ | shared | 0.087330 | 0.078164 | 0.085222 |
32
+ | head_only_parallel | 0.086764 | 0.078301 | 0.085272 |
33
+ | split_independent | 0.079100 | 0.070436 | 0.075281 |
34
+ | split_communicating | 0.078618 | 0.071087 | 0.075570 |
35
+
36
+ ## Stability notes
37
+ - Sample batch size used for all official evals: `16`.
38
+ - Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.
39
+ - Peak training VRAM by model: shared=35.23GB, head_only_parallel=35.27GB, split_independent=41.73GB, split_communicating=41.73GB.
40
+ - `split_communicating` communication path: active=`True`, `grad_cross_arm_comm_max=0.394700`, `attention_mass_mean=0.009074`, `gate_abs_max=0.003900`.
41
+
42
+ ## Regression check vs prior dual-push screen
43
+ - Prior `5K` study at step `2000` had `baseline=0.083194` and `parallel=0.082729` with head-only edge `+0.000465`. This rerun has `shared=0.055329` and `head_only_parallel=0.055297` with head-only edge `+0.000032`; direction match=`True`.
44
+ - Prior `5K` study `4`-step MAE at step `2000` had `baseline=0.069732` and `parallel=0.069053` with head-only edge `+0.000679`. This rerun has `shared=0.078164` and `head_only_parallel=0.078301` with head-only edge `-0.000137`; direction match=`False`.
45
+
46
+ ## Files
47
+ - `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.
48
+ - `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.
49
+ - `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.
50
+ - `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.
51
+ - `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Filesystem Size Used Avail Use% Mounted on
2
+ mfs#us-mo-1.runpod.net:9421 154T 129T 25T 84% /workspace
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PYTHONPATH=/workspace/pi05tests/openpi/src
2
+ TOKENIZERS_PARALLELISM=false
3
+ XDG_CACHE_HOME=/workspace/.cache
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wed Mar 11 23:32:47 2026
2
+ +-----------------------------------------------------------------------------------------+
3
+ | NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 |
4
+ +-----------------------------------------+------------------------+----------------------+
5
+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
6
+ | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
7
+ | | | MIG M. |
8
+ |=========================================+========================+======================|
9
+ | 0 NVIDIA H100 80GB HBM3 On | 00000000:2A:00.0 Off | 0 |
10
+ | N/A 30C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
11
+ | | | Disabled |
12
+ +-----------------------------------------+------------------------+----------------------+
13
+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 |
14
+ | N/A 29C P0 71W / 700W | 0MiB / 81559MiB | 0% Default |
15
+ | | | Disabled |
16
+ +-----------------------------------------+------------------------+----------------------+
17
+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:9A:00.0 Off | 0 |
18
+ | N/A 28C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
19
+ | | | Disabled |
20
+ +-----------------------------------------+------------------------+----------------------+
21
+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:AB:00.0 Off | 0 |
22
+ | N/A 29C P0 74W / 700W | 0MiB / 81559MiB | 0% Default |
23
+ | | | Disabled |
24
+ +-----------------------------------------+------------------------+----------------------+
25
+
26
+ +-----------------------------------------------------------------------------------------+
27
+ | Processes: |
28
+ | GPU GI CI PID Type Process name GPU Memory |
29
+ | ID ID Usage |
30
+ |=========================================================================================|
31
+ | No running processes found |
32
+ +-----------------------------------------------------------------------------------------+
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GPU0 GPU1 GPU2 GPU3 NIC0 NIC1 CPU Affinity NUMA Affinity GPU NUMA ID
2
+ GPU0 X NV18 NV18 NV18 NODE NODE 0-51,104-155 0 N/A
3
+ GPU1 NV18 X NV18 NV18 NODE NODE 0-51,104-155 0 N/A
4
+ GPU2 NV18 NV18 X NV18 SYS SYS 52-103,156-207 1 N/A
5
+ GPU3 NV18 NV18 NV18 X SYS SYS 52-103,156-207 1 N/A
6
+ NIC0 NODE NODE SYS SYS X PIX
7
+ NIC1 NODE NODE SYS SYS PIX X
8
+
9
+ Legend:
10
+
11
+ X = Self
12
+ SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
13
+ NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
14
+ PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
15
+ PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
16
+ PIX = Connection traversing at most a single PCIe bridge
17
+ NV# = Connection traversing a bonded set of # NVLinks
18
+
19
+ NIC Legend:
20
+
21
+ NIC0: mlx5_3
22
+ NIC1: mlx5_4
23
+
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.4.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.6.0
8
+ argon2-cffi==23.1.0
9
+ argon2-cffi-bindings==21.2.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ async-lru==2.0.4
13
+ attrs==25.4.0
14
+ augmax==0.4.1
15
+ av==16.1.0
16
+ babel==2.16.0
17
+ beartype==0.19.0
18
+ beautifulsoup4==4.12.3
19
+ bleach==6.1.0
20
+ certifi==2026.2.25
21
+ cffi==2.0.0
22
+ charset-normalizer==3.4.5
23
+ comm==0.2.2
24
+ cryptography==46.0.5
25
+ datasets==4.7.0
26
+ debugpy==1.8.5
27
+ decorator==5.1.1
28
+ deepdiff==8.6.1
29
+ defusedxml==0.7.1
30
+ dill==0.4.0
31
+ dm-tree==0.1.9
32
+ docstring_parser==0.17.0
33
+ draccus==0.10.0
34
+ einops==0.8.2
35
+ entrypoints==0.4
36
+ equinox==0.13.6
37
+ etils==1.14.0
38
+ executing==2.1.0
39
+ fastjsonschema==2.20.0
40
+ filelock==3.25.1
41
+ flatbuffers==25.12.19
42
+ flax==0.10.2
43
+ fqdn==1.5.1
44
+ frozenlist==1.8.0
45
+ fsspec==2026.2.0
46
+ gcsfs==2026.2.0
47
+ google-api-core==2.30.0
48
+ google-auth==2.49.0
49
+ google-auth-oauthlib==1.3.0
50
+ google-cloud-core==2.5.0
51
+ google-cloud-storage==3.9.0
52
+ google-cloud-storage-control==1.10.0
53
+ google-crc32c==1.8.0
54
+ google-resumable-media==2.8.0
55
+ googleapis-common-protos==1.73.0
56
+ grpc-google-iam-v1==0.14.3
57
+ grpcio==1.78.0
58
+ grpcio-status==1.78.0
59
+ h11==0.14.0
60
+ hf-xet==1.3.2
61
+ httpcore==1.0.5
62
+ httpx==0.27.2
63
+ huggingface_hub==0.36.2
64
+ humanize==4.15.0
65
+ idna==3.11
66
+ ImageIO==2.37.3
67
+ ipykernel==6.29.5
68
+ ipython==8.27.0
69
+ ipython-genutils==0.2.0
70
+ ipywidgets==8.1.5
71
+ isoduration==20.11.0
72
+ jax==0.5.3
73
+ jaxlib==0.5.3
74
+ jaxtyping==0.2.36
75
+ jedi==0.19.1
76
+ Jinja2==3.1.3
77
+ json5==0.9.25
78
+ jsonlines==4.0.0
79
+ jsonpointer==3.0.0
80
+ jsonschema==4.23.0
81
+ jsonschema-specifications==2023.12.1
82
+ jupyter-archive==3.4.0
83
+ jupyter-events==0.10.0
84
+ jupyter-highlight-selected-word==0.2.0
85
+ jupyter-lsp==2.2.5
86
+ jupyter_client==7.4.9
87
+ jupyter_contrib_core==0.4.2
88
+ jupyter_contrib_nbextensions==0.7.0
89
+ jupyter_core==5.7.2
90
+ jupyter_nbextensions_configurator==0.6.4
91
+ jupyter_server==2.14.2
92
+ jupyter_server_terminals==0.5.3
93
+ jupyterlab==4.2.5
94
+ jupyterlab_pygments==0.3.0
95
+ jupyterlab_server==2.27.3
96
+ jupyterlab_widgets==3.0.13
97
+ lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
98
+ lxml==5.3.0
99
+ markdown-it-py==4.0.0
100
+ MarkupSafe==2.1.5
101
+ matplotlib-inline==0.1.7
102
+ mdurl==0.1.2
103
+ mergedeep==1.3.4
104
+ mistune==3.0.2
105
+ ml_collections==1.0.0
106
+ ml_dtypes==0.5.4
107
+ mpmath==1.3.0
108
+ msgpack==1.1.2
109
+ multidict==6.7.1
110
+ multiprocess==0.70.18
111
+ mypy_extensions==1.1.0
112
+ nbclassic==1.1.0
113
+ nbclient==0.10.0
114
+ nbconvert==7.16.4
115
+ nbformat==5.10.4
116
+ nest-asyncio==1.6.0
117
+ networkx==3.2.1
118
+ notebook==6.5.5
119
+ notebook_shim==0.2.4
120
+ numpy==1.26.4
121
+ numpydantic==1.8.0
122
+ nvidia-cublas-cu12==12.4.2.65
123
+ nvidia-cuda-cupti-cu12==12.4.99
124
+ nvidia-cuda-nvrtc-cu12==12.4.99
125
+ nvidia-cuda-runtime-cu12==12.4.99
126
+ nvidia-cudnn-cu12==9.1.0.70
127
+ nvidia-cufft-cu12==11.2.0.44
128
+ nvidia-curand-cu12==10.3.5.119
129
+ nvidia-cusolver-cu12==11.6.0.99
130
+ nvidia-cusparse-cu12==12.3.0.142
131
+ nvidia-nccl-cu12==2.20.5
132
+ nvidia-nvjitlink-cu12==12.4.99
133
+ nvidia-nvtx-cu12==12.4.99
134
+ oauthlib==3.3.1
135
+ omegaconf==2.3.0
136
+ opencv-python==4.11.0.86
137
+ openpi-client==0.1.1
138
+ opt_einsum==3.4.0
139
+ optax==0.2.7
140
+ orbax-checkpoint==0.11.13
141
+ orderly-set==5.5.0
142
+ overrides==7.7.0
143
+ packaging==26.0
144
+ pandas==3.0.1
145
+ pandocfilters==1.5.1
146
+ parso==0.8.4
147
+ pexpect==4.9.0
148
+ pillow==12.1.1
149
+ platformdirs==4.3.6
150
+ prometheus_client==0.21.0
151
+ prompt_toolkit==3.0.47
152
+ propcache==0.4.1
153
+ proto-plus==1.27.1
154
+ protobuf==6.33.5
155
+ psutil==6.0.0
156
+ ptyprocess==0.7.0
157
+ pure_eval==0.2.3
158
+ pyarrow==23.0.1
159
+ pyasn1==0.6.2
160
+ pyasn1_modules==0.4.2
161
+ pycparser==2.22
162
+ pydantic==2.12.5
163
+ pydantic_core==2.41.5
164
+ Pygments==2.19.2
165
+ python-dateutil==2.9.0.post0
166
+ python-json-logger==2.0.7
167
+ PyYAML==6.0.3
168
+ pyyaml-include==1.4.1
169
+ pyzmq==24.0.1
170
+ referencing==0.35.1
171
+ regex==2026.2.28
172
+ requests==2.32.5
173
+ requests-oauthlib==2.0.0
174
+ rfc3339-validator==0.1.4
175
+ rfc3986-validator==0.1.1
176
+ rich==14.3.3
177
+ rpds-py==0.20.0
178
+ rsa==4.9.1
179
+ safetensors==0.7.0
180
+ scipy==1.17.1
181
+ Send2Trash==1.8.3
182
+ sentencepiece==0.2.1
183
+ simplejson==3.20.2
184
+ six==1.17.0
185
+ sniffio==1.3.1
186
+ soupsieve==2.6
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorstore==0.1.81
190
+ termcolor==3.3.0
191
+ terminado==0.18.1
192
+ tinycss2==1.3.0
193
+ tokenizers==0.21.4
194
+ toml==0.10.2
195
+ torch==2.4.1+cu124
196
+ torchaudio==2.4.1+cu124
197
+ torchvision==0.19.1+cu124
198
+ tornado==6.4.1
199
+ tqdm==4.67.3
200
+ tqdm-loggable==0.3
201
+ traitlets==5.14.3
202
+ transformers==4.53.2
203
+ treescope==0.1.10
204
+ triton==3.0.0
205
+ typeguard==4.5.1
206
+ types-python-dateutil==2.9.0.20240906
207
+ typing-inspect==0.9.0
208
+ typing-inspection==0.4.2
209
+ typing_extensions==4.15.0
210
+ tyro==1.0.8
211
+ uri-template==1.3.0
212
+ urllib3==2.6.3
213
+ wadler_lindig==0.1.7
214
+ wcwidth==0.2.13
215
+ webcolors==24.8.0
216
+ webencodings==0.5.1
217
+ websocket-client==1.8.0
218
+ websockets==16.0
219
+ widgetsnbextension==4.0.13
220
+ wrapt==2.1.2
221
+ xxhash==3.6.0
222
+ yarl==1.23.0
223
+ zipp==3.23.0
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Python 3.11.10
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
2
+ Collecting environment information...
3
+ PyTorch version: 2.4.1+cu124
4
+ Is debug build: False
5
+ CUDA used to build PyTorch: 12.4
6
+ ROCM used to build PyTorch: N/A
7
+
8
+ OS: Ubuntu 22.04.5 LTS (x86_64)
9
+ GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
10
+ Clang version: Could not collect
11
+ CMake version: Could not collect
12
+ Libc version: glibc-2.35
13
+
14
+ Python version: 3.11.10 (main, Sep 7 2024, 18:35:41) [GCC 11.4.0] (64-bit runtime)
15
+ Python platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
16
+ Is CUDA available: True
17
+ CUDA runtime version: 12.4.131
18
+ CUDA_MODULE_LOADING set to: LAZY
19
+ GPU models and configuration:
20
+ GPU 0: NVIDIA H100 80GB HBM3
21
+ GPU 1: NVIDIA H100 80GB HBM3
22
+ GPU 2: NVIDIA H100 80GB HBM3
23
+ GPU 3: NVIDIA H100 80GB HBM3
24
+
25
+ Nvidia driver version: 580.126.09
26
+ cuDNN version: Could not collect
27
+ HIP runtime version: N/A
28
+ MIOpen runtime version: N/A
29
+ Is XNNPACK available: True
30
+
31
+ CPU:
32
+ Architecture: x86_64
33
+ CPU op-mode(s): 32-bit, 64-bit
34
+ Address sizes: 46 bits physical, 57 bits virtual
35
+ Byte Order: Little Endian
36
+ CPU(s): 208
37
+ On-line CPU(s) list: 0-207
38
+ Vendor ID: GenuineIntel
39
+ Model name: Intel(R) Xeon(R) Platinum 8470
40
+ CPU family: 6
41
+ Model: 143
42
+ Thread(s) per core: 2
43
+ Core(s) per socket: 52
44
+ Socket(s): 2
45
+ Stepping: 8
46
+ CPU max MHz: 3800.0000
47
+ CPU min MHz: 800.0000
48
+ BogoMIPS: 4000.00
49
+ Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect user_shstk avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts vnmi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities ibpb_exit_to_user
50
+ Virtualization: VT-x
51
+ L1d cache: 4.9 MiB (104 instances)
52
+ L1i cache: 3.3 MiB (104 instances)
53
+ L2 cache: 208 MiB (104 instances)
54
+ L3 cache: 210 MiB (2 instances)
55
+ NUMA node(s): 2
56
+ NUMA node0 CPU(s): 0-51,104-155
57
+ NUMA node1 CPU(s): 52-103,156-207
58
+ Vulnerability Gather data sampling: Not affected
59
+ Vulnerability Itlb multihit: Not affected
60
+ Vulnerability L1tf: Not affected
61
+ Vulnerability Mds: Not affected
62
+ Vulnerability Meltdown: Not affected
63
+ Vulnerability Mmio stale data: Not affected
64
+ Vulnerability Reg file data sampling: Not affected
65
+ Vulnerability Retbleed: Not affected
66
+ Vulnerability Spec rstack overflow: Not affected
67
+ Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
68
+ Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
69
+ Vulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; RSB filling; PBRSB-eIBRS SW sequence; BHI BHI_DIS_S
70
+ Vulnerability Srbds: Not affected
71
+ Vulnerability Tsx async abort: Not affected
72
+ Vulnerability Vmscape: Mitigation; IBPB before exit to userspace
73
+
74
+ Versions of relevant libraries:
75
+ [pip3] mypy_extensions==1.1.0
76
+ [pip3] numpy==1.26.4
77
+ [pip3] numpydantic==1.8.0
78
+ [pip3] torch==2.4.1+cu124
79
+ [pip3] torchaudio==2.4.1+cu124
80
+ [pip3] torchvision==0.19.1+cu124
81
+ [pip3] triton==3.0.0
82
+ [conda] Could not collect
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ regenerated_any=0
2
+ regenerated_split=0
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
2
+ checkpoint_dir: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
3
+ action_expert_mode: split_communicating
4
+ weight_loading_missing_keys: []
5
+ weight_loading_unexpected_keys: []
6
+ identical_branch_suffix_max_abs_diff: 0.00000000
7
+ identical_branch_suffix_match: True
8
+ left_branch_invariance_max_abs_diff: skipped_for_split_communicating
9
+ right_branch_invariance_max_abs_diff: skipped_for_split_communicating
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
2
+ checkpoint_dir: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
3
+ action_expert_mode: split_independent
4
+ weight_loading_missing_keys: []
5
+ weight_loading_unexpected_keys: []
6
+ identical_branch_suffix_max_abs_diff: 0.00000000
7
+ identical_branch_suffix_match: True
8
+ left_branch_invariance_max_abs_diff: 0.00000000
9
+ right_branch_invariance_max_abs_diff: 0.00000000
10
+ left_branch_invariant: True
11
+ right_branch_invariant: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_parallel_pytorch_5k
2
+ action_expert_mode: head_only_parallel
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_parallel_packed_from_single
5
+ load_state_missing_keys_count: 11
6
+ load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'arm_token_fuse.weight', 'arm_token_fuse.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 4
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight']
9
+ input_projection_max_abs_diff: 9.5367431640625e-07
10
+ left_input_projection_max_abs_diff: 0.0
11
+ left_output_projection_max_abs_diff: 0.0
12
+ output_projection_max_abs_diff: 8.344650268554688e-07
13
+ right_input_projection_max_abs_diff: 0.0
14
+ right_output_projection_max_abs_diff: 0.0
15
+ warm_start_exact: False
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
2
+ action_expert_mode: split_communicating
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
5
+ load_state_missing_keys_count: 412
6
+ load_state_missing_keys: ['paligemma_with_expert.cross_arm_comm', 'paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 205
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
9
+ cross_arm_comm_init: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
10
+ left_expert_max_abs_diff: 0.0
11
+ left_input_projection_max_abs_diff: 0.0
12
+ left_output_projection_max_abs_diff: 0.0
13
+ right_expert_max_abs_diff: 0.0
14
+ right_input_projection_max_abs_diff: 0.0
15
+ right_output_projection_max_abs_diff: 0.0
16
+ warm_start_exact: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
2
+ action_expert_mode: split_independent
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
5
+ load_state_missing_keys_count: 411
6
+ load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 205
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
9
+ left_expert_max_abs_diff: 0.0
10
+ left_input_projection_max_abs_diff: 0.0
11
+ left_output_projection_max_abs_diff: 0.0
12
+ right_expert_max_abs_diff: 0.0
13
+ right_input_projection_max_abs_diff: 0.0
14
+ right_output_projection_max_abs_diff: 0.0
15
+ warm_start_exact: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ canonical_source=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json
2
+ canonical_sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
3
+ shared=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_baseline_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
4
+ head_only_parallel=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_parallel_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
5
+ split_independent=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
6
+ split_communicating=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starting_eval config=pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k checkpoint=/workspace/checkpoints/pi05_base_split_communicating_packed_from_single repo_id=lsnu/twin_dual_push_128_val
2
+ eval_loader batch_size=16 num_batches=1 num_workers=0
3
+ teacher_forced_eval_seed: 123
4
+ sample_eval enabled=True batch_size=16 num_batches=1 num_steps=[16] seed=321
5
+ WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
6
+ weight_loading missing=0 unexpected=0 device=cuda:0
7
+ eval_batch=1 loss=2.309001 left_arm_loss=1.740481 right_arm_loss=2.877522 imbalance=1.137041 batch_time_s=0.6439
8
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
9
+ checkpoint_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
10
+ repo_id_used: lsnu/twin_dual_push_128_val
11
+ num_batches: 1
12
+ mean_val_loss: 2.309001
13
+ std_val_loss: 0.000000
14
+ mean_left_arm_loss: 1.740481
15
+ std_left_arm_loss: 0.000000
16
+ mean_right_arm_loss: 2.877522
17
+ std_right_arm_loss: 0.000000
18
+ mean_left_joint_loss: 1.680031
19
+ std_left_joint_loss: 0.000000
20
+ mean_left_gripper_loss: 2.163631
21
+ std_left_gripper_loss: 0.000000
22
+ mean_right_joint_loss: 2.108088
23
+ std_right_joint_loss: 0.000000
24
+ mean_right_gripper_loss: 8.263555
25
+ std_right_gripper_loss: 0.000000
26
+ mean_left_right_imbalance: 1.137041
27
+ std_left_right_imbalance: 0.000000
28
+ per_batch_timing_seconds: mean=0.6439 std=0.0000 min=0.6439 max=0.6439
29
+ active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
30
+ masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]
31
+ weight_loading_missing_keys: []
32
+ weight_loading_unexpected_keys: []
33
+ WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
34
+ sample_eval_batch=1 num_steps=16 masked_mae=0.611553 left_arm_mae=0.679956 right_arm_mae=0.543150 imbalance_mae=0.136806 batch_time_s=3.3573
35
+ sample_eval_num_steps_16_num_batches: 1
36
+ sample_eval_num_steps_16_mean_masked_mae: 0.611553
37
+ sample_eval_num_steps_16_std_masked_mae: 0.000000
38
+ sample_eval_num_steps_16_mean_left_arm_mae: 0.679956
39
+ sample_eval_num_steps_16_std_left_arm_mae: 0.000000
40
+ sample_eval_num_steps_16_mean_right_arm_mae: 0.543150
41
+ sample_eval_num_steps_16_std_right_arm_mae: 0.000000
42
+ sample_eval_num_steps_16_mean_left_joint_mae: 0.648674
43
+ sample_eval_num_steps_16_std_left_joint_mae: 0.000000
44
+ sample_eval_num_steps_16_mean_left_gripper_mae: 0.898926
45
+ sample_eval_num_steps_16_std_left_gripper_mae: 0.000000
46
+ sample_eval_num_steps_16_mean_right_joint_mae: 0.478297
47
+ sample_eval_num_steps_16_std_right_joint_mae: 0.000000
48
+ sample_eval_num_steps_16_mean_right_gripper_mae: 0.997122
49
+ sample_eval_num_steps_16_std_right_gripper_mae: 0.000000
50
+ sample_eval_num_steps_16_mean_left_right_imbalance_mae: 0.136806
51
+ sample_eval_num_steps_16_std_left_right_imbalance_mae: 0.000000
52
+ sample_eval_num_steps_16_per_batch_timing_seconds: mean=3.3573 std=0.0000 min=3.3573 max=3.3573
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 2026-03-11 17:33:48 UTC
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.4.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.6.0
8
+ argon2-cffi==23.1.0
9
+ argon2-cffi-bindings==21.2.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ async-lru==2.0.4
13
+ attrs==25.4.0
14
+ augmax==0.4.1
15
+ av==16.1.0
16
+ babel==2.16.0
17
+ beartype==0.19.0
18
+ beautifulsoup4==4.12.3
19
+ bleach==6.1.0
20
+ certifi==2026.2.25
21
+ cffi==2.0.0
22
+ charset-normalizer==3.4.5
23
+ comm==0.2.2
24
+ cryptography==46.0.5
25
+ datasets==4.7.0
26
+ debugpy==1.8.5
27
+ decorator==5.1.1
28
+ deepdiff==8.6.1
29
+ defusedxml==0.7.1
30
+ dill==0.4.0
31
+ dm-tree==0.1.9
32
+ docstring_parser==0.17.0
33
+ draccus==0.10.0
34
+ einops==0.8.2
35
+ entrypoints==0.4
36
+ equinox==0.13.6
37
+ etils==1.14.0
38
+ executing==2.1.0
39
+ fastjsonschema==2.20.0
40
+ filelock==3.25.1
41
+ flatbuffers==25.12.19
42
+ flax==0.10.2
43
+ fqdn==1.5.1
44
+ frozenlist==1.8.0
45
+ fsspec==2026.2.0
46
+ gcsfs==2026.2.0
47
+ google-api-core==2.30.0
48
+ google-auth==2.49.0
49
+ google-auth-oauthlib==1.3.0
50
+ google-cloud-core==2.5.0
51
+ google-cloud-storage==3.9.0
52
+ google-cloud-storage-control==1.10.0
53
+ google-crc32c==1.8.0
54
+ google-resumable-media==2.8.0
55
+ googleapis-common-protos==1.73.0
56
+ grpc-google-iam-v1==0.14.3
57
+ grpcio==1.78.0
58
+ grpcio-status==1.78.0
59
+ h11==0.14.0
60
+ hf-xet==1.3.2
61
+ httpcore==1.0.5
62
+ httpx==0.27.2
63
+ huggingface_hub==0.36.2
64
+ humanize==4.15.0
65
+ idna==3.11
66
+ ImageIO==2.37.3
67
+ ipykernel==6.29.5
68
+ ipython==8.27.0
69
+ ipython-genutils==0.2.0
70
+ ipywidgets==8.1.5
71
+ isoduration==20.11.0
72
+ jax==0.5.3
73
+ jaxlib==0.5.3
74
+ jaxtyping==0.2.36
75
+ jedi==0.19.1
76
+ Jinja2==3.1.3
77
+ json5==0.9.25
78
+ jsonlines==4.0.0
79
+ jsonpointer==3.0.0
80
+ jsonschema==4.23.0
81
+ jsonschema-specifications==2023.12.1
82
+ jupyter-archive==3.4.0
83
+ jupyter-events==0.10.0
84
+ jupyter-highlight-selected-word==0.2.0
85
+ jupyter-lsp==2.2.5
86
+ jupyter_client==7.4.9
87
+ jupyter_contrib_core==0.4.2
88
+ jupyter_contrib_nbextensions==0.7.0
89
+ jupyter_core==5.7.2
90
+ jupyter_nbextensions_configurator==0.6.4
91
+ jupyter_server==2.14.2
92
+ jupyter_server_terminals==0.5.3
93
+ jupyterlab==4.2.5
94
+ jupyterlab_pygments==0.3.0
95
+ jupyterlab_server==2.27.3
96
+ jupyterlab_widgets==3.0.13
97
+ lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
98
+ lxml==5.3.0
99
+ markdown-it-py==4.0.0
100
+ MarkupSafe==2.1.5
101
+ matplotlib-inline==0.1.7
102
+ mdurl==0.1.2
103
+ mergedeep==1.3.4
104
+ mistune==3.0.2
105
+ ml_collections==1.0.0
106
+ ml_dtypes==0.5.4
107
+ mpmath==1.3.0
108
+ msgpack==1.1.2
109
+ multidict==6.7.1
110
+ multiprocess==0.70.18
111
+ mypy_extensions==1.1.0
112
+ nbclassic==1.1.0
113
+ nbclient==0.10.0
114
+ nbconvert==7.16.4
115
+ nbformat==5.10.4
116
+ nest-asyncio==1.6.0
117
+ networkx==3.2.1
118
+ notebook==6.5.5
119
+ notebook_shim==0.2.4
120
+ numpy==1.26.4
121
+ numpydantic==1.8.0
122
+ nvidia-cublas-cu12==12.4.2.65
123
+ nvidia-cuda-cupti-cu12==12.4.99
124
+ nvidia-cuda-nvrtc-cu12==12.4.99
125
+ nvidia-cuda-runtime-cu12==12.4.99
126
+ nvidia-cudnn-cu12==9.1.0.70
127
+ nvidia-cufft-cu12==11.2.0.44
128
+ nvidia-curand-cu12==10.3.5.119
129
+ nvidia-cusolver-cu12==11.6.0.99
130
+ nvidia-cusparse-cu12==12.3.0.142
131
+ nvidia-nccl-cu12==2.20.5
132
+ nvidia-nvjitlink-cu12==12.4.99
133
+ nvidia-nvtx-cu12==12.4.99
134
+ oauthlib==3.3.1
135
+ omegaconf==2.3.0
136
+ opencv-python==4.11.0.86
137
+ openpi-client==0.1.1
138
+ opt_einsum==3.4.0
139
+ optax==0.2.7
140
+ orbax-checkpoint==0.11.13
141
+ orderly-set==5.5.0
142
+ overrides==7.7.0
143
+ packaging==26.0
144
+ pandas==3.0.1
145
+ pandocfilters==1.5.1
146
+ parso==0.8.4
147
+ pexpect==4.9.0
148
+ pillow==12.1.1
149
+ platformdirs==4.3.6
150
+ prometheus_client==0.21.0
151
+ prompt_toolkit==3.0.47
152
+ propcache==0.4.1
153
+ proto-plus==1.27.1
154
+ protobuf==6.33.5
155
+ psutil==6.0.0
156
+ ptyprocess==0.7.0
157
+ pure_eval==0.2.3
158
+ pyarrow==23.0.1
159
+ pyasn1==0.6.2
160
+ pyasn1_modules==0.4.2
161
+ pycparser==2.22
162
+ pydantic==2.12.5
163
+ pydantic_core==2.41.5
164
+ Pygments==2.19.2
165
+ python-dateutil==2.9.0.post0
166
+ python-json-logger==2.0.7
167
+ PyYAML==6.0.3
168
+ pyyaml-include==1.4.1
169
+ pyzmq==24.0.1
170
+ referencing==0.35.1
171
+ regex==2026.2.28
172
+ requests==2.32.5
173
+ requests-oauthlib==2.0.0
174
+ rfc3339-validator==0.1.4
175
+ rfc3986-validator==0.1.1
176
+ rich==14.3.3
177
+ rpds-py==0.20.0
178
+ rsa==4.9.1
179
+ safetensors==0.7.0
180
+ scipy==1.17.1
181
+ Send2Trash==1.8.3
182
+ sentencepiece==0.2.1
183
+ simplejson==3.20.2
184
+ six==1.17.0
185
+ sniffio==1.3.1
186
+ soupsieve==2.6
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorstore==0.1.81
190
+ termcolor==3.3.0
191
+ terminado==0.18.1
192
+ tinycss2==1.3.0
193
+ tokenizers==0.21.4
194
+ toml==0.10.2
195
+ torch==2.4.1+cu124
196
+ torchaudio==2.4.1+cu124
197
+ torchvision==0.19.1+cu124
198
+ tornado==6.4.1
199
+ tqdm==4.67.3
200
+ tqdm-loggable==0.3
201
+ traitlets==5.14.3
202
+ transformers==4.53.2
203
+ treescope==0.1.10
204
+ triton==3.0.0
205
+ typeguard==4.5.1
206
+ types-python-dateutil==2.9.0.20240906
207
+ typing-inspect==0.9.0
208
+ typing-inspection==0.4.2
209
+ typing_extensions==4.15.0
210
+ tyro==1.0.8
211
+ uri-template==1.3.0
212
+ urllib3==2.6.3
213
+ wadler_lindig==0.1.7
214
+ wcwidth==0.2.13
215
+ webcolors==24.8.0
216
+ webencodings==0.5.1
217
+ websocket-client==1.8.0
218
+ websockets==16.0
219
+ widgetsnbextension==4.0.13
220
+ wrapt==2.1.2
221
+ xxhash==3.6.0
222
+ yarl==1.23.0
223
+ zipp==3.23.0
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Python 3.11.10
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ <frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Linux f87904697f84 6.8.0-90-generic #91-Ubuntu SMP PREEMPT_DYNAMIC Tue Nov 18 14:14:30 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9fa98e4f1c6159fd9b956a11323f5990b8d92aae3553eb4785ee7341c79a680
3
+ size 3438041490
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d726e07e1c039c30cab249de7f558f75d04a5e3c7151fb9e08ab7b9c804d7342
3
+ size 1850670584
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f3c7cbab597ad818c2570f4920d8a9b7d396053543ee538a8b97b4ba623bfe5
3
+ size 3438040655
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c68994d9d8e008f11db503c715bbbf739ca7b9f5144ac90c4822f90a55d003f
3
+ size 3438040655
openpi/run_logs/split_independent_real_smoke20.log ADDED
File without changes
openpi/run_logs/split_independent_real_smoke3.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:24:48.871 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3 (19525:train_pytorch.py:533)
2
+ 19:24:48.874 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (19525:train_pytorch.py:552)
3
+ 19:24:48.988 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (19525:config.py:234)
4
+ 19:24:48.990 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x70f9ab2c5d10>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (19525:data_loader.py:284)
29
+ 19:24:57.449 [I] JAX version 0.5.3 available. (19525:config.py:125)
30
+ 19:25:32.845 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (19525:data_loader.py:148)
31
+ 19:25:33.031 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (19525:video_utils.py:36)
32
+ 19:26:39.374 [I] local_batch_size: 1 (19525:data_loader.py:365)
33
+ 19:28:30.580 [I] Enabled gradient checkpointing for PI0Pytorch model (19525:pi0_pytorch.py:138)
34
+ 19:28:30.582 [I] Enabled gradient checkpointing for memory optimization (19525:train_pytorch.py:624)
35
+ 19:28:30.583 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (19525:train_pytorch.py:493)
36
+ 19:28:30.583 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:653)
37
+ 19:28:34.495 [I] Weight loading missing key count: 0 (19525:train_pytorch.py:657)
38
+ 19:28:34.495 [I] Weight loading missing keys: set() (19525:train_pytorch.py:658)
39
+ 19:28:34.496 [I] Weight loading unexpected key count: 0 (19525:train_pytorch.py:659)
40
+ 19:28:34.496 [I] Weight loading unexpected keys: [] (19525:train_pytorch.py:660)
41
+ 19:28:34.497 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:661)
42
+ 19:28:34.501 [I] Running on: 963c158043aa | world_size=1 (19525:train_pytorch.py:701)
43
+ 19:28:34.501 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (19525:train_pytorch.py:702)
44
+ 19:28:34.502 [I] Memory optimizations: gradient_checkpointing=True (19525:train_pytorch.py:705)
45
+ 19:28:34.502 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (19525:train_pytorch.py:706)
46
+ 19:28:34.502 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (19525:train_pytorch.py:707)
47
+ 19:28:34.503 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (19525:train_pytorch.py:710)
48
+ 19:28:34.503 [I] EMA is not supported for PyTorch training (19525:train_pytorch.py:713)
49
+ 19:28:34.504 [I] Training precision: float32 (19525:train_pytorch.py:714)
50
+ 19:28:34.516 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (19525:train_pytorch.py:308)
51
+ 19:28:34.516 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (19525:train_pytorch.py:309)
52
+ 19:28:34.517 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (19525:train_pytorch.py:310)
53
+ 19:28:34.518 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (19525:train_pytorch.py:311)
54
+ 19:28:34.518 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:312)
55
+ 19:28:34.519 [I] Model type: split_independent (19525:train_pytorch.py:313)
56
+ 19:28:34.519 [I] Packed transforms active: True (19525:train_pytorch.py:314)
57
+ 19:28:34.519 [I] World size: 1 (19525:train_pytorch.py:315)
58
+ 19:28:34.520 [I] Batch size: local=1, global=1 (19525:train_pytorch.py:316)
59
+ 19:28:34.520 [I] num_workers: 0 (19525:train_pytorch.py:317)
60
+ 19:28:34.521 [I] Precision: float32 (19525:train_pytorch.py:318)
61
+ 19:28:34.521 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (19525:train_pytorch.py:319)
62
+ 19:28:34.522 [I] Save/log intervals: save_interval=3, log_interval=1 (19525:train_pytorch.py:326)
63
+ 19:28:34.522 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (19525:train_pytorch.py:327)
64
+ 19:28:34.522 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (19525:train_pytorch.py:328)
65
+ 19:28:34.522 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (19525:train_pytorch.py:329)
66
+ 19:28:34.523 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (19525:train_pytorch.py:722)
67
+
68
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 939, in <module>
69
+ main()
70
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 935, in main
71
+ train_loop(config)
72
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 747, in train_loop
73
+ for observation, actions in loader:
74
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 596, in __iter__
75
+ for batch in self._data_loader:
76
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 510, in __iter__
77
+ batch = next(data_iter)
78
+ ^^^^^^^^^^^^^^^
79
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 630, in __next__
80
+ data = self._next_data()
81
+ ^^^^^^^^^^^^^^^^^
82
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 673, in _next_data
83
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
84
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
86
+ data = [self.dataset[idx] for idx in possibly_batched_index]
87
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
88
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
89
+ data = [self.dataset[idx] for idx in possibly_batched_index]
90
+ ~~~~~~~~~~~~^^^^^
91
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 67, in __getitem__
92
+ return self._transform(self._dataset[index])
93
+ ~~~~~~~~~~~~~^^^^^^^
94
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 742, in __getitem__
95
+ query_result = self._query_hf_dataset(query_indices)
96
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
97
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 707, in _query_hf_dataset
98
+ return {
99
+ ^
100
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 708, in <dictcomp>
101
+ key: torch.stack(self.hf_dataset.select(q_idx)[key])
102
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
103
+ TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Column
104
+
openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py ADDED
@@ -0,0 +1,913 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import ast
7
+ import csv
8
+ import dataclasses
9
+ import json
10
+ import math
11
+ import pathlib
12
+ import re
13
+ import statistics
14
+ from collections import defaultdict
15
+ from typing import Any
16
+
17
+
18
+ OPENPI_ROOT = pathlib.Path(__file__).resolve().parents[1]
19
+ STEP_ORDER = (0, 100, 500, 2000)
20
+ SAMPLE_STEPS = (1, 2, 4, 8, 16)
21
+ BASE_TRAIN_COLUMNS = [
22
+ "model_variant",
23
+ "config_name",
24
+ "exp_name",
25
+ "step",
26
+ "loss",
27
+ "smoothed_loss",
28
+ "lr",
29
+ "grad_norm",
30
+ "step_time_s",
31
+ "data_time_s",
32
+ "items_per_second",
33
+ "eta_seconds",
34
+ "max_cuda_memory_gb",
35
+ ]
36
+ TIMING_RE = re.compile(r"mean=([0-9.]+)\s+std=([0-9.]+)\s+min=([0-9.]+)\s+max=([0-9.]+)")
37
+ KV_PAIR_RE = re.compile(r"([A-Za-z0-9_./+-]+)=([^\s]+)")
38
+ SOURCE_SUFFIX_RE = re.compile(r"\s+\(\d+:[^)]+\)$")
39
+ TIMESTAMPED_INFO_RE = re.compile(r"^\d{2}:\d{2}:\d{2}\.\d{3} \[I\] (.*)$")
40
+ NORMALIZE_RE = re.compile(r"[^a-z0-9]+")
41
+ ERROR_MARKERS = ("Traceback", "FloatingPointError", "CUDA out of memory", "Non-finite", "RuntimeError:")
42
+
43
+
44
+ @dataclasses.dataclass(frozen=True)
45
+ class ModelSpec:
46
+ key: str
47
+ model_variant: str
48
+ config_name: str
49
+ exp_name: str
50
+ step0_checkpoint: str
51
+
52
+
53
+ MODEL_SPECS = (
54
+ ModelSpec(
55
+ key="shared",
56
+ model_variant="shared",
57
+ config_name="pi05_twin_dual_push_128_packed_baseline_pytorch_5k",
58
+ exp_name="dual_push_128_stepcmp_shared_2k",
59
+ step0_checkpoint="/workspace/checkpoints/pi05_base_single_pytorch",
60
+ ),
61
+ ModelSpec(
62
+ key="head_only",
63
+ model_variant="head_only_parallel",
64
+ config_name="pi05_twin_dual_push_128_packed_parallel_pytorch_5k",
65
+ exp_name="dual_push_128_stepcmp_head_only_2k",
66
+ step0_checkpoint="/workspace/checkpoints/pi05_base_parallel_packed_from_single",
67
+ ),
68
+ ModelSpec(
69
+ key="split_ind",
70
+ model_variant="split_independent",
71
+ config_name="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k",
72
+ exp_name="dual_push_128_stepcmp_split_ind_2k",
73
+ step0_checkpoint="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
74
+ ),
75
+ ModelSpec(
76
+ key="split_comm",
77
+ model_variant="split_communicating",
78
+ config_name="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k",
79
+ exp_name="dual_push_128_stepcmp_split_comm_2k",
80
+ step0_checkpoint="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
81
+ ),
82
+ )
83
+
84
+ STARTUP_LABEL_MAP = {
85
+ "resolved_config_name": "config_name",
86
+ "dataset_repo_id": "dataset_repo_id",
87
+ "norm_stats_file_path": "norm_stats_file",
88
+ "norm_stats_summary": "norm_stats_summary",
89
+ "checkpoint_source_path": "checkpoint_source",
90
+ "model_type": "model_type",
91
+ "packed_transforms_active": "packed_transforms",
92
+ "world_size": "world_size",
93
+ "batch_size": "batch_size",
94
+ "num_workers": "num_workers",
95
+ "training_precision": "precision",
96
+ "lr_schedule": "lr_schedule",
97
+ "save_log_intervals": "save_log_intervals",
98
+ "action_loss_mask": "action_loss_mask",
99
+ "active_action_loss_dims": "active_mask_dims",
100
+ "masked_padded_dims": "masked_dims",
101
+ "gradient_bucket_diagnostics": "gradient_buckets",
102
+ "weight_loading_missing_key_count": "weight_missing_count",
103
+ "weight_loading_missing_keys": "weight_missing_keys",
104
+ "weight_loading_unexpected_key_count": "weight_unexpected_count",
105
+ "weight_loading_unexpected_keys": "weight_unexpected_keys",
106
+ }
107
+
108
+
109
+ def parse_args() -> argparse.Namespace:
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument("--artifact_root", required=True)
112
+ parser.add_argument("--prior_metrics_root", default="")
113
+ return parser.parse_args()
114
+
115
+
116
+ def normalize_label(label: str) -> str:
117
+ return NORMALIZE_RE.sub("_", label.lower()).strip("_")
118
+
119
+
120
+ def strip_source_suffix(text: str) -> str:
121
+ return SOURCE_SUFFIX_RE.sub("", text.rstrip())
122
+
123
+
124
+ def extract_info_body(line: str) -> str | None:
125
+ match = TIMESTAMPED_INFO_RE.match(strip_source_suffix(line))
126
+ return match.group(1) if match else None
127
+
128
+
129
+ def natural_key(text: str) -> list[Any]:
130
+ parts = re.split(r"(\d+)", text)
131
+ key: list[Any] = []
132
+ for part in parts:
133
+ if not part:
134
+ continue
135
+ key.append(int(part) if part.isdigit() else part)
136
+ return key
137
+
138
+
139
+ def format_float(value: float | None, digits: int = 6) -> str:
140
+ if value is None or math.isnan(value):
141
+ return "n/a"
142
+ return f"{value:.{digits}f}"
143
+
144
+
145
+ def format_delta(value: float | None, digits: int = 6) -> str:
146
+ if value is None or math.isnan(value):
147
+ return "n/a"
148
+ return f"{value:+.{digits}f}"
149
+
150
+
151
+ def resolve_checkpoint_path(raw_path: str) -> str:
152
+ path = pathlib.Path(raw_path)
153
+ if path.is_absolute():
154
+ return str(path.resolve())
155
+ return str((OPENPI_ROOT / path).resolve())
156
+
157
+
158
+ def parse_float(text: str) -> float:
159
+ return float(text)
160
+
161
+
162
+ def parse_optional_float(text: str | None) -> float | None:
163
+ if text is None or text == "":
164
+ return None
165
+ return float(text)
166
+
167
+
168
+ def parse_timing(text: str) -> dict[str, float]:
169
+ match = TIMING_RE.search(text)
170
+ if not match:
171
+ raise ValueError(f"Unable to parse timing summary: {text!r}")
172
+ return {
173
+ "mean": float(match.group(1)),
174
+ "std": float(match.group(2)),
175
+ "min": float(match.group(3)),
176
+ "max": float(match.group(4)),
177
+ }
178
+
179
+
180
+ def parse_literal_count(text: str) -> int:
181
+ text = text.strip()
182
+ if text == "set()":
183
+ return 0
184
+ value = ast.literal_eval(text)
185
+ return len(value)
186
+
187
+
188
+ def read_text_lines(path: pathlib.Path) -> list[str]:
189
+ return path.read_text(encoding="utf-8", errors="replace").splitlines()
190
+
191
+
192
+ def detect_errors(lines: list[str]) -> list[str]:
193
+ errors = []
194
+ for line in lines:
195
+ if any(marker in line for marker in ERROR_MARKERS):
196
+ errors.append(line.strip())
197
+ return errors
198
+
199
+
200
+ def parse_train_log(path: pathlib.Path, spec: ModelSpec) -> tuple[dict[str, str], list[dict[str, Any]], list[str]]:
201
+ lines = read_text_lines(path)
202
+ startup: dict[str, str] = {}
203
+ step_rows: list[dict[str, Any]] = []
204
+ errors = detect_errors(lines)
205
+
206
+ for line in lines:
207
+ body = extract_info_body(line)
208
+ if not body:
209
+ continue
210
+ if body.startswith("step="):
211
+ metrics = {key: value for key, value in KV_PAIR_RE.findall(body)}
212
+ row = {
213
+ "model_variant": spec.model_variant,
214
+ "config_name": spec.config_name,
215
+ "exp_name": spec.exp_name,
216
+ "step": int(metrics["step"]),
217
+ "loss": parse_float(metrics["loss"]),
218
+ "smoothed_loss": parse_float(metrics["smoothed_loss"]),
219
+ "lr": parse_float(metrics["lr"]),
220
+ "grad_norm": parse_float(metrics["grad_norm"]),
221
+ "step_time_s": parse_float(metrics["step_time"].rstrip("s")),
222
+ "data_time_s": parse_float(metrics["data_time"].rstrip("s")),
223
+ "items_per_second": parse_float(metrics["it/s"]),
224
+ "eta_seconds": parse_float(next(value for key, value in metrics.items() if key.startswith("eta_to_")).rstrip("s")),
225
+ "max_cuda_memory_gb": parse_float(metrics["max_cuda_memory"].rstrip("GB")),
226
+ }
227
+ for key, value in metrics.items():
228
+ if key in {"step", "loss", "smoothed_loss", "lr", "grad_norm", "step_time", "data_time", "it/s", "max_cuda_memory"}:
229
+ continue
230
+ if key.startswith("eta_to_"):
231
+ continue
232
+ row[key] = parse_float(value)
233
+ step_rows.append(row)
234
+ continue
235
+
236
+ if ": " not in body:
237
+ continue
238
+ label, value = body.split(": ", 1)
239
+ startup_key = STARTUP_LABEL_MAP.get(normalize_label(label))
240
+ if startup_key:
241
+ startup[startup_key] = value.strip()
242
+
243
+ return startup, step_rows, errors
244
+
245
+
246
+ def parse_eval_log(path: pathlib.Path) -> tuple[dict[str, str], int | None, list[str]]:
247
+ lines = read_text_lines(path)
248
+ metrics: dict[str, str] = {}
249
+ sample_batch_size: int | None = None
250
+ errors = detect_errors(lines)
251
+
252
+ for line in lines:
253
+ stripped = strip_source_suffix(line.strip())
254
+ if stripped.startswith("sample_eval enabled="):
255
+ kv = {key: value for key, value in KV_PAIR_RE.findall(stripped)}
256
+ if "batch_size" in kv:
257
+ sample_batch_size = int(kv["batch_size"])
258
+ if ": " not in stripped:
259
+ continue
260
+ key, value = stripped.split(": ", 1)
261
+ metrics[key.strip()] = value.strip()
262
+
263
+ return metrics, sample_batch_size, errors
264
+
265
+
266
+ def write_csv(path: pathlib.Path, rows: list[dict[str, Any]], columns: list[str]) -> None:
267
+ with path.open("w", encoding="utf-8", newline="") as handle:
268
+ writer = csv.DictWriter(handle, fieldnames=columns)
269
+ writer.writeheader()
270
+ for row in rows:
271
+ writer.writerow({column: row.get(column, "") for column in columns})
272
+
273
+
274
+ def build_teacher_row(spec: ModelSpec, step: int, metrics: dict[str, str]) -> dict[str, Any]:
275
+ timing = parse_timing(metrics["per_batch_timing_seconds"])
276
+ return {
277
+ "model_variant": spec.model_variant,
278
+ "config_name": spec.config_name,
279
+ "exp_name": spec.exp_name,
280
+ "checkpoint_step": step,
281
+ "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
282
+ "repo_id": metrics["repo_id_used"],
283
+ "num_batches": int(metrics["num_batches"]),
284
+ "mean_val_loss": parse_float(metrics["mean_val_loss"]),
285
+ "std_val_loss": parse_float(metrics["std_val_loss"]),
286
+ "mean_left_arm_loss": parse_float(metrics["mean_left_arm_loss"]),
287
+ "std_left_arm_loss": parse_float(metrics["std_left_arm_loss"]),
288
+ "mean_right_arm_loss": parse_float(metrics["mean_right_arm_loss"]),
289
+ "std_right_arm_loss": parse_float(metrics["std_right_arm_loss"]),
290
+ "mean_left_joint_loss": parse_float(metrics["mean_left_joint_loss"]),
291
+ "std_left_joint_loss": parse_float(metrics["std_left_joint_loss"]),
292
+ "mean_left_gripper_loss": parse_float(metrics["mean_left_gripper_loss"]),
293
+ "std_left_gripper_loss": parse_float(metrics["std_left_gripper_loss"]),
294
+ "mean_right_joint_loss": parse_float(metrics["mean_right_joint_loss"]),
295
+ "std_right_joint_loss": parse_float(metrics["std_right_joint_loss"]),
296
+ "mean_right_gripper_loss": parse_float(metrics["mean_right_gripper_loss"]),
297
+ "std_right_gripper_loss": parse_float(metrics["std_right_gripper_loss"]),
298
+ "mean_left_right_imbalance": parse_float(metrics["mean_left_right_imbalance"]),
299
+ "std_left_right_imbalance": parse_float(metrics["std_left_right_imbalance"]),
300
+ "per_batch_time_mean_s": timing["mean"],
301
+ "per_batch_time_std_s": timing["std"],
302
+ "per_batch_time_min_s": timing["min"],
303
+ "per_batch_time_max_s": timing["max"],
304
+ "weight_loading_missing_count": parse_literal_count(metrics["weight_loading_missing_keys"]),
305
+ "weight_loading_unexpected_count": parse_literal_count(metrics["weight_loading_unexpected_keys"]),
306
+ }
307
+
308
+
309
+ def build_sample_rows(spec: ModelSpec, step: int, metrics: dict[str, str]) -> list[dict[str, Any]]:
310
+ rows = []
311
+ for sample_steps in SAMPLE_STEPS:
312
+ prefix = f"sample_eval_num_steps_{sample_steps}_"
313
+ timing = parse_timing(metrics[f"{prefix}per_batch_timing_seconds"])
314
+ rows.append(
315
+ {
316
+ "model_variant": spec.model_variant,
317
+ "config_name": spec.config_name,
318
+ "exp_name": spec.exp_name,
319
+ "checkpoint_step": step,
320
+ "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
321
+ "repo_id": metrics["repo_id_used"],
322
+ "sample_num_steps": sample_steps,
323
+ "sample_num_batches": int(metrics[f"{prefix}num_batches"]),
324
+ "mean_masked_mae": parse_float(metrics[f"{prefix}mean_masked_mae"]),
325
+ "std_masked_mae": parse_float(metrics[f"{prefix}std_masked_mae"]),
326
+ "mean_left_arm_mae": parse_float(metrics[f"{prefix}mean_left_arm_mae"]),
327
+ "std_left_arm_mae": parse_float(metrics[f"{prefix}std_left_arm_mae"]),
328
+ "mean_right_arm_mae": parse_float(metrics[f"{prefix}mean_right_arm_mae"]),
329
+ "std_right_arm_mae": parse_float(metrics[f"{prefix}std_right_arm_mae"]),
330
+ "mean_left_joint_mae": parse_float(metrics[f"{prefix}mean_left_joint_mae"]),
331
+ "std_left_joint_mae": parse_float(metrics[f"{prefix}std_left_joint_mae"]),
332
+ "mean_left_gripper_mae": parse_float(metrics[f"{prefix}mean_left_gripper_mae"]),
333
+ "std_left_gripper_mae": parse_float(metrics[f"{prefix}std_left_gripper_mae"]),
334
+ "mean_right_joint_mae": parse_float(metrics[f"{prefix}mean_right_joint_mae"]),
335
+ "std_right_joint_mae": parse_float(metrics[f"{prefix}std_right_joint_mae"]),
336
+ "mean_right_gripper_mae": parse_float(metrics[f"{prefix}mean_right_gripper_mae"]),
337
+ "std_right_gripper_mae": parse_float(metrics[f"{prefix}std_right_gripper_mae"]),
338
+ "mean_left_right_imbalance_mae": parse_float(metrics[f"{prefix}mean_left_right_imbalance_mae"]),
339
+ "std_left_right_imbalance_mae": parse_float(metrics[f"{prefix}std_left_right_imbalance_mae"]),
340
+ "per_batch_time_mean_s": timing["mean"],
341
+ "per_batch_time_std_s": timing["std"],
342
+ "per_batch_time_min_s": timing["min"],
343
+ "per_batch_time_max_s": timing["max"],
344
+ }
345
+ )
346
+ return rows
347
+
348
+
349
+ def row_index(rows: list[dict[str, Any]], *keys: str) -> dict[tuple[Any, ...], dict[str, Any]]:
350
+ return {tuple(row[key] for key in keys): row for row in rows}
351
+
352
+
353
+ def average(values: list[float]) -> float | None:
354
+ return statistics.fmean(values) if values else None
355
+
356
+
357
+ def summarise_stability(train_rows: list[dict[str, Any]]) -> dict[str, Any]:
358
+ by_variant: dict[str, list[dict[str, Any]]] = defaultdict(list)
359
+ for row in train_rows:
360
+ by_variant[row["model_variant"]].append(row)
361
+
362
+ summary: dict[str, Any] = {}
363
+ for variant, rows in by_variant.items():
364
+ grad_columns = [column for column in rows[0] if column.startswith("grad_")]
365
+ dead_columns = []
366
+ for column in grad_columns:
367
+ if max(abs(float(row.get(column, 0.0) or 0.0)) for row in rows) == 0.0:
368
+ dead_columns.append(column)
369
+ summary[variant] = {
370
+ "max_cuda_memory_gb": max(row["max_cuda_memory_gb"] for row in rows),
371
+ "dead_gradient_columns": dead_columns,
372
+ }
373
+
374
+ split_comm_rows = by_variant.get("split_communicating", [])
375
+ gate_columns = sorted(
376
+ {column for row in split_comm_rows for column in row if column.startswith("cross_arm_comm_gate_layer_")},
377
+ key=natural_key,
378
+ )
379
+ attn_columns = sorted(
380
+ {column for row in split_comm_rows for column in row if column.startswith("cross_arm_attention_mass_layer_")},
381
+ key=natural_key,
382
+ )
383
+ if split_comm_rows:
384
+ gate_values = [abs(float(row[column])) for row in split_comm_rows for column in gate_columns if column in row]
385
+ attn_values = [float(row[column]) for row in split_comm_rows for column in attn_columns if column in row]
386
+ grad_comm_values = [float(row.get("grad_cross_arm_comm", 0.0)) for row in split_comm_rows]
387
+ summary["split_communicating"]["communication"] = {
388
+ "gate_abs_max": max(gate_values) if gate_values else 0.0,
389
+ "gate_abs_mean": average(gate_values) or 0.0,
390
+ "attention_mass_mean": average(attn_values) or 0.0,
391
+ "attention_mass_max": max(attn_values) if attn_values else 0.0,
392
+ "grad_cross_arm_comm_mean": average(grad_comm_values) or 0.0,
393
+ "grad_cross_arm_comm_max": max(grad_comm_values) if grad_comm_values else 0.0,
394
+ "active": bool(attn_values and max(attn_values) > 0.0 and max(grad_comm_values, default=0.0) > 0.0),
395
+ }
396
+ return summary
397
+
398
+
399
+ def load_prior_metrics(prior_metrics_root: pathlib.Path) -> dict[str, Any]:
400
+ result: dict[str, Any] = {}
401
+ if not prior_metrics_root.exists():
402
+ return result
403
+
404
+ teacher_path = prior_metrics_root / "teacher_forced_eval_table.csv"
405
+ sample_path = prior_metrics_root / "sample_eval_table.csv"
406
+
407
+ if teacher_path.exists():
408
+ with teacher_path.open(encoding="utf-8", newline="") as handle:
409
+ rows = list(csv.DictReader(handle))
410
+ teacher_2000 = {row["model"]: row for row in rows if int(row["checkpoint_step"]) == 2000}
411
+ result["teacher_2000"] = {
412
+ "baseline": parse_optional_float(teacher_2000.get("baseline", {}).get("mean_val_loss")),
413
+ "parallel": parse_optional_float(teacher_2000.get("parallel", {}).get("mean_val_loss")),
414
+ }
415
+
416
+ if sample_path.exists():
417
+ with sample_path.open(encoding="utf-8", newline="") as handle:
418
+ rows = list(csv.DictReader(handle))
419
+ sample_2000_step4 = {
420
+ row["model"]: row
421
+ for row in rows
422
+ if int(row["checkpoint_step"]) == 2000 and int(row["num_steps"]) == 4
423
+ }
424
+ result["sample_2000_step4"] = {
425
+ "baseline": parse_optional_float(sample_2000_step4.get("baseline", {}).get("mean_masked_mae")),
426
+ "parallel": parse_optional_float(sample_2000_step4.get("parallel", {}).get("mean_masked_mae")),
427
+ }
428
+
429
+ return result
430
+
431
+
432
+ def build_summary(
433
+ artifact_root: pathlib.Path,
434
+ teacher_rows: list[dict[str, Any]],
435
+ sample_rows: list[dict[str, Any]],
436
+ train_rows: list[dict[str, Any]],
437
+ startup_summaries: dict[str, dict[str, str]],
438
+ log_errors: dict[str, list[str]],
439
+ sample_batch_size_used: str,
440
+ prior_metrics: dict[str, Any],
441
+ ) -> dict[str, Any]:
442
+ teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
443
+ sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")
444
+
445
+ step0_teacher_gaps = {}
446
+ step0_sample_gaps = {}
447
+ shared_step0_teacher = teacher_by_key[("shared", 0)]["mean_val_loss"]
448
+ for spec in MODEL_SPECS:
449
+ variant = spec.model_variant
450
+ teacher_value = teacher_by_key[(variant, 0)]["mean_val_loss"]
451
+ step0_teacher_gaps[variant] = teacher_value - shared_step0_teacher
452
+ sample_deltas = []
453
+ for sample_steps in SAMPLE_STEPS:
454
+ variant_row = sample_by_key[(variant, 0, sample_steps)]
455
+ shared_row = sample_by_key[("shared", 0, sample_steps)]
456
+ sample_deltas.append(variant_row["mean_masked_mae"] - shared_row["mean_masked_mae"])
457
+ step0_sample_gaps[variant] = {
458
+ "average_delta_vs_shared": average(sample_deltas),
459
+ "per_steps_delta_vs_shared": {str(step): delta for step, delta in zip(SAMPLE_STEPS, sample_deltas, strict=True)},
460
+ }
461
+
462
+ warm_variants = [spec.model_variant for spec in MODEL_SPECS if spec.model_variant != "shared"]
463
+ smallest_teacher_variant = min(warm_variants, key=lambda variant: abs(step0_teacher_gaps[variant]))
464
+ smallest_sample_variant = min(
465
+ warm_variants,
466
+ key=lambda variant: abs(step0_sample_gaps[variant]["average_delta_vs_shared"] or 0.0),
467
+ )
468
+
469
+ teacher_improvements: dict[str, dict[str, float]] = defaultdict(dict)
470
+ sample_improvements: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict))
471
+ for spec in MODEL_SPECS:
472
+ variant = spec.model_variant
473
+ for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
474
+ teacher_improvements[variant][f"{start_step}_to_{end_step}"] = (
475
+ teacher_by_key[(variant, start_step)]["mean_val_loss"] - teacher_by_key[(variant, end_step)]["mean_val_loss"]
476
+ )
477
+ teacher_improvements[variant]["0_to_2000"] = (
478
+ teacher_by_key[(variant, 0)]["mean_val_loss"] - teacher_by_key[(variant, 2000)]["mean_val_loss"]
479
+ )
480
+ for sample_steps in SAMPLE_STEPS:
481
+ for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
482
+ sample_improvements[variant][str(sample_steps)][f"{start_step}_to_{end_step}"] = (
483
+ sample_by_key[(variant, start_step, sample_steps)]["mean_masked_mae"]
484
+ - sample_by_key[(variant, end_step, sample_steps)]["mean_masked_mae"]
485
+ )
486
+ sample_improvements[variant][str(sample_steps)]["0_to_2000"] = (
487
+ sample_by_key[(variant, 0, sample_steps)]["mean_masked_mae"]
488
+ - sample_by_key[(variant, 2000, sample_steps)]["mean_masked_mae"]
489
+ )
490
+
491
+ teacher_2k_ranking = sorted(
492
+ (
493
+ {
494
+ "model_variant": spec.model_variant,
495
+ "mean_val_loss": teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
496
+ "mean_left_right_imbalance": teacher_by_key[(spec.model_variant, 2000)]["mean_left_right_imbalance"],
497
+ "improvement_0_to_2000": teacher_by_key[(spec.model_variant, 0)]["mean_val_loss"]
498
+ - teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
499
+ }
500
+ for spec in MODEL_SPECS
501
+ ),
502
+ key=lambda row: row["mean_val_loss"],
503
+ )
504
+
505
+ sample_2k_ranking = sorted(
506
+ (
507
+ {
508
+ "model_variant": spec.model_variant,
509
+ "mean_masked_mae_step_4": sample_by_key[(spec.model_variant, 2000, 4)]["mean_masked_mae"],
510
+ "mean_masked_mae_step_16": sample_by_key[(spec.model_variant, 2000, 16)]["mean_masked_mae"],
511
+ "mean_masked_mae_average": statistics.fmean(
512
+ sample_by_key[(spec.model_variant, 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
513
+ ),
514
+ }
515
+ for spec in MODEL_SPECS
516
+ ),
517
+ key=lambda row: row["mean_masked_mae_average"],
518
+ )
519
+
520
+ stability = summarise_stability(train_rows)
521
+ bootstrap_sanity = {
522
+ spec.model_variant: {
523
+ "step0_weight_loading_missing_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_missing_count"],
524
+ "step0_weight_loading_unexpected_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_unexpected_count"],
525
+ }
526
+ for spec in MODEL_SPECS
527
+ }
528
+ invariant_logs = {
529
+ "split_independent": (artifact_root / "sanity_checks/check_split_independent_invariants.log").exists(),
530
+ "split_communicating": (artifact_root / "sanity_checks/check_split_communicating_invariants.log").exists(),
531
+ }
532
+
533
+ prior_regression = {}
534
+ teacher_prior = prior_metrics.get("teacher_2000", {})
535
+ sample_prior = prior_metrics.get("sample_2000_step4", {})
536
+ current_shared_teacher_2k = teacher_by_key[("shared", 2000)]["mean_val_loss"]
537
+ current_head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]
538
+ if teacher_prior:
539
+ prior_delta = (teacher_prior.get("baseline") or 0.0) - (teacher_prior.get("parallel") or 0.0)
540
+ current_delta = current_shared_teacher_2k - current_head_only_teacher_2k
541
+ prior_regression["teacher_forced_2000"] = {
542
+ "prior_baseline": teacher_prior.get("baseline"),
543
+ "prior_parallel": teacher_prior.get("parallel"),
544
+ "prior_parallel_edge": prior_delta,
545
+ "current_shared": current_shared_teacher_2k,
546
+ "current_head_only_parallel": current_head_only_teacher_2k,
547
+ "current_head_only_edge": current_delta,
548
+ "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
549
+ }
550
+ if sample_prior:
551
+ current_shared_sample_2k = sample_by_key[("shared", 2000, 4)]["mean_masked_mae"]
552
+ current_head_only_sample_2k = sample_by_key[("head_only_parallel", 2000, 4)]["mean_masked_mae"]
553
+ prior_delta = (sample_prior.get("baseline") or 0.0) - (sample_prior.get("parallel") or 0.0)
554
+ current_delta = current_shared_sample_2k - current_head_only_sample_2k
555
+ prior_regression["sample_step4_2000"] = {
556
+ "prior_baseline": sample_prior.get("baseline"),
557
+ "prior_parallel": sample_prior.get("parallel"),
558
+ "prior_parallel_edge": prior_delta,
559
+ "current_shared": current_shared_sample_2k,
560
+ "current_head_only_parallel": current_head_only_sample_2k,
561
+ "current_head_only_edge": current_delta,
562
+ "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
563
+ }
564
+
565
+ split_ind_teacher_2k = teacher_by_key[("split_independent", 2000)]["mean_val_loss"]
566
+ split_comm_teacher_2k = teacher_by_key[("split_communicating", 2000)]["mean_val_loss"]
567
+ head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]
568
+
569
+ split_ind_sample_avg_2k = statistics.fmean(
570
+ sample_by_key[("split_independent", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
571
+ )
572
+ split_comm_sample_avg_2k = statistics.fmean(
573
+ sample_by_key[("split_communicating", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
574
+ )
575
+ head_only_sample_avg_2k = statistics.fmean(
576
+ sample_by_key[("head_only_parallel", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
577
+ )
578
+
579
+ return {
580
+ "study_name": artifact_root.name,
581
+ "artifact_root": str(artifact_root),
582
+ "hardware": "4x H100 80GB",
583
+ "precision": "bfloat16",
584
+ "train_repo_id": "lsnu/twin_dual_push_128_train",
585
+ "val_repo_id": "lsnu/twin_dual_push_128_val",
586
+ "packed_layout": "[L8, 0x8, R8, 0x8]",
587
+ "sample_batch_size_used": sample_batch_size_used,
588
+ "bootstrap_sanity": {
589
+ "step0_weight_loading": bootstrap_sanity,
590
+ "invariant_logs_present": invariant_logs,
591
+ },
592
+ "step0_gap_analysis": {
593
+ "teacher_forced_delta_vs_shared": step0_teacher_gaps,
594
+ "sample_avg_delta_vs_shared": {
595
+ variant: payload["average_delta_vs_shared"] for variant, payload in step0_sample_gaps.items()
596
+ },
597
+ "sample_delta_vs_shared_by_steps": step0_sample_gaps,
598
+ "smallest_teacher_forced_jump": smallest_teacher_variant,
599
+ "smallest_sample_jump": smallest_sample_variant,
600
+ },
601
+ "teacher_improvements": teacher_improvements,
602
+ "sample_improvements": sample_improvements,
603
+ "teacher_2k_ranking": teacher_2k_ranking,
604
+ "sample_2k_ranking": sample_2k_ranking,
605
+ "optimization_stability": {
606
+ "summary": stability,
607
+ "log_errors": log_errors,
608
+ },
609
+ "head_only_vs_prior_5k_study": prior_regression,
610
+ "answer_summary": {
611
+ "teacher_2k_best": teacher_2k_ranking[0]["model_variant"],
612
+ "sample_2k_best": sample_2k_ranking[0]["model_variant"],
613
+ "split_models_beat_head_only_teacher_2k": {
614
+ "split_independent": split_ind_teacher_2k < head_only_teacher_2k,
615
+ "split_communicating": split_comm_teacher_2k < head_only_teacher_2k,
616
+ },
617
+ "split_models_beat_head_only_sample_2k_avg": {
618
+ "split_independent": split_ind_sample_avg_2k < head_only_sample_avg_2k,
619
+ "split_communicating": split_comm_sample_avg_2k < head_only_sample_avg_2k,
620
+ },
621
+ "split_comm_vs_split_ind_teacher_2k_delta": split_comm_teacher_2k - split_ind_teacher_2k,
622
+ "split_comm_vs_split_ind_sample_2k_avg_delta": split_comm_sample_avg_2k - split_ind_sample_avg_2k,
623
+ },
624
+ "startup_summaries": startup_summaries,
625
+ }
626
+
627
+
628
def write_startup_summaries(path: pathlib.Path, startup_summaries: dict[str, dict[str, str]]) -> None:
    """Render per-variant startup summaries into a plain-text report at ``path``.

    One ``[model_variant]`` section is written per entry in MODEL_SPECS; within a
    section only the keys present in that variant's summary appear, in a fixed
    canonical order, as ``key: value`` lines.
    """
    # Canonical display order for startup keys; absent keys are simply skipped.
    ordered_keys = (
        "weight_missing_count",
        "weight_missing_keys",
        "weight_unexpected_count",
        "weight_unexpected_keys",
        "config_name",
        "dataset_repo_id",
        "norm_stats_file",
        "norm_stats_summary",
        "checkpoint_source",
        "model_type",
        "packed_transforms",
        "world_size",
        "batch_size",
        "num_workers",
        "precision",
        "lr_schedule",
        "save_log_intervals",
        "action_loss_mask",
        "active_mask_dims",
        "masked_dims",
        "gradient_buckets",
    )
    report_lines: list[str] = []
    for spec in MODEL_SPECS:
        report_lines.append(f"[{spec.model_variant}]")
        variant_summary = startup_summaries.get(spec.model_variant, {})
        report_lines.extend(
            f"{key}: {variant_summary[key]}" for key in ordered_keys if key in variant_summary
        )
        report_lines.append("")
    path.write_text("\n".join(report_lines).rstrip() + "\n", encoding="utf-8")
661
+
662
+
663
def build_markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Format ``headers`` and ``rows`` as a GitHub-flavoured markdown table string."""

    def render_row(cells) -> str:
        # Every markdown row is pipe-delimited with single spaces as padding.
        return "| " + " | ".join(cells) + " |"

    header_line = render_row(headers)
    divider_line = render_row("---" for _ in headers)
    body_lines = "\n".join(render_row(cells) for cells in rows)
    return "\n".join([header_line, divider_line, body_lines])
668
+
669
+
670
def write_readme(
    path: pathlib.Path,
    summary: dict[str, Any],
    teacher_rows: list[dict[str, Any]],
    sample_rows: list[dict[str, Any]],
) -> None:
    """Render the study README.md at ``path`` from the summary dict and metric rows.

    Sections: quick answers, step-0 teacher-forced table, step-2000 tables
    (teacher-forced and sample MAE), stability notes, a regression check against
    the prior 5K study, and a file map of the artifact directory.
    """
    teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
    sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")

    # Per-variant rows for the three markdown tables (step 0 and step 2000).
    step0_table_rows = []
    final_teacher_rows = []
    final_sample_rows = []
    for spec in MODEL_SPECS:
        variant = spec.model_variant
        step0 = teacher_by_key[(variant, 0)]
        final_teacher = teacher_by_key[(variant, 2000)]
        step0_table_rows.append(
            [
                variant,
                format_float(step0["mean_val_loss"]),
                format_delta(summary["step0_gap_analysis"]["teacher_forced_delta_vs_shared"][variant]),
                format_float(step0["mean_left_right_imbalance"]),
            ]
        )
        final_teacher_rows.append(
            [
                variant,
                format_float(final_teacher["mean_val_loss"]),
                format_float(summary["teacher_improvements"][variant]["0_to_2000"]),
                format_float(final_teacher["mean_left_right_imbalance"]),
            ]
        )
        # Sample MAE at fixed sample-step counts 1/4/16 for the step-2000 checkpoint.
        final_sample_rows.append(
            [
                variant,
                format_float(sample_by_key[(variant, 2000, 1)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 4)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 16)]["mean_masked_mae"]),
            ]
        )

    stability = summary["optimization_stability"]["summary"]
    # One "variant=X.XXGB" fragment per model, joined for the stability notes.
    memory_note = ", ".join(
        f"{variant}={format_float(stability[variant]['max_cuda_memory_gb'], digits=2)}GB"
        for variant in [spec.model_variant for spec in MODEL_SPECS]
    )
    split_comm_comm = stability.get("split_communicating", {}).get("communication", {})
    # Prior-study comparison payloads may be absent; handled below with fallbacks.
    prior_teacher = summary.get("head_only_vs_prior_5k_study", {}).get("teacher_forced_2000")
    prior_sample = summary.get("head_only_vs_prior_5k_study", {}).get("sample_step4_2000")

    readme = [
        f"# {summary['study_name']}",
        "",
        "Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.",
        "",
        "## Quick answers",
        f"- Smallest step-0 teacher-forced jump vs `shared`: `{summary['step0_gap_analysis']['smallest_teacher_forced_jump']}` (`{format_delta(summary['step0_gap_analysis']['teacher_forced_delta_vs_shared'][summary['step0_gap_analysis']['smallest_teacher_forced_jump']])}`).",
        f"- Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `{summary['step0_gap_analysis']['smallest_sample_jump']}` (`{format_delta(summary['step0_gap_analysis']['sample_avg_delta_vs_shared'][summary['step0_gap_analysis']['smallest_sample_jump']])}`).",
        f"- Best teacher-forced result by step `2000`: `{summary['answer_summary']['teacher_2k_best']}`.",
        f"- Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `{summary['answer_summary']['sample_2k_best']}`.",
        f"- Split vs head-only by step `2000`: teacher-forced beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_communicating']}`; sample beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_communicating']}`.",
        f"- `split_communicating` vs `split_independent` at `2000`: teacher delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_teacher_2k_delta'])}`, sample-average delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_sample_2k_avg_delta'])}`.",
        "",
        "## Step-0 teacher-forced comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "delta_vs_shared", "left_right_imbalance"],
            step0_table_rows,
        ),
        "",
        "## Step-2000 comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "0_to_2000_improvement", "left_right_imbalance"],
            final_teacher_rows,
        ),
        "",
        build_markdown_table(
            ["model", "1-step_mae", "4-step_mae", "16-step_mae"],
            final_sample_rows,
        ),
        "",
        "## Stability notes",
        f"- Sample batch size used for all official evals: `{summary['sample_batch_size_used']}`.",
        f"- Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.",
        f"- Peak training VRAM by model: {memory_note}.",
        f"- `split_communicating` communication path: active=`{split_comm_comm.get('active', False)}`, `grad_cross_arm_comm_max={format_float(split_comm_comm.get('grad_cross_arm_comm_max'))}`, `attention_mass_mean={format_float(split_comm_comm.get('attention_mass_mean'))}`, `gate_abs_max={format_float(split_comm_comm.get('gate_abs_max'))}`.",
        "",
        "## Regression check vs prior dual-push screen",
    ]

    # Prior-study bullets are appended conditionally since the prior metrics
    # directory may be missing or incomplete.
    if prior_teacher:
        readme.append(
            f"- Prior `5K` study at step `2000` had `baseline={format_float(prior_teacher['prior_baseline'])}` and `parallel={format_float(prior_teacher['prior_parallel'])}` with head-only edge `{format_delta(prior_teacher['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_teacher['current_shared'])}` and `head_only_parallel={format_float(prior_teacher['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_teacher['current_head_only_edge'])}`; direction match=`{prior_teacher['direction_matches']}`."
        )
    else:
        readme.append("- Prior teacher-forced comparison was unavailable.")

    if prior_sample:
        readme.append(
            f"- Prior `5K` study `4`-step MAE at step `2000` had `baseline={format_float(prior_sample['prior_baseline'])}` and `parallel={format_float(prior_sample['prior_parallel'])}` with head-only edge `{format_delta(prior_sample['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_sample['current_shared'])}` and `head_only_parallel={format_float(prior_sample['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_sample['current_head_only_edge'])}`; direction match=`{prior_sample['direction_matches']}`."
        )
    else:
        readme.append("- Prior sample-based comparison was unavailable.")

    readme.extend(
        [
            "",
            "## Files",
            "- `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.",
            "- `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.",
            "- `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.",
        ]
    )

    path.write_text("\n".join(readme).rstrip() + "\n", encoding="utf-8")
786
+
787
+
788
def main() -> None:
    """Parse all train/eval logs under the artifact root and emit metric tables.

    Outputs written under ``<artifact_root>/metrics``: the teacher-forced and
    sample eval CSVs, the training-summary CSV, the startup-summary text file,
    and ``summary.json``; the study README.md is written at the artifact root.
    """
    args = parse_args()
    artifact_root = pathlib.Path(args.artifact_root).resolve()
    run_logs_dir = artifact_root / "run_logs"
    metrics_dir = artifact_root / "metrics"
    metrics_dir.mkdir(parents=True, exist_ok=True)

    teacher_rows: list[dict[str, Any]] = []
    sample_rows: list[dict[str, Any]] = []
    train_rows: list[dict[str, Any]] = []
    startup_summaries: dict[str, dict[str, str]] = {}
    log_errors: dict[str, list[str]] = {}
    # Overwritten with the last batch size observed in any eval log; "unknown"
    # only if no eval log reports one.
    sample_batch_size_used = "unknown"

    # Columns seen in train logs beyond the fixed BASE_TRAIN_COLUMNS set
    # (model-specific gradient diagnostics differ per variant).
    extra_train_columns: set[str] = set()

    for spec in MODEL_SPECS:
        train_log = run_logs_dir / f"{spec.exp_name}.log"
        startup, train_log_rows, train_errors = parse_train_log(train_log, spec)
        startup_summaries[spec.model_variant] = startup
        train_rows.extend(train_log_rows)
        extra_train_columns.update(column for row in train_log_rows for column in row if column not in BASE_TRAIN_COLUMNS)
        if train_errors:
            log_errors[f"{spec.model_variant}:train"] = train_errors

        # One eval log per checkpoint step (0, 100, 500, 2000).
        for step in STEP_ORDER:
            eval_log = run_logs_dir / f"{spec.exp_name}_val_{step}.log"
            eval_metrics, eval_sample_batch_size, eval_errors = parse_eval_log(eval_log)
            if eval_sample_batch_size is not None:
                sample_batch_size_used = str(eval_sample_batch_size)
            teacher_rows.append(build_teacher_row(spec, step, eval_metrics))
            sample_rows.extend(build_sample_rows(spec, step, eval_metrics))
            if eval_errors:
                log_errors[f"{spec.model_variant}:eval:{step}"] = eval_errors

    # Deterministic row order for stable CSV diffs across reruns.
    teacher_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"]))
    sample_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"], row["sample_num_steps"]))
    train_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["step"]))

    # Fixed column orders for the two eval CSVs.
    teacher_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "num_batches",
        "mean_val_loss",
        "std_val_loss",
        "mean_left_arm_loss",
        "std_left_arm_loss",
        "mean_right_arm_loss",
        "std_right_arm_loss",
        "mean_left_joint_loss",
        "std_left_joint_loss",
        "mean_left_gripper_loss",
        "std_left_gripper_loss",
        "mean_right_joint_loss",
        "std_right_joint_loss",
        "mean_right_gripper_loss",
        "std_right_gripper_loss",
        "mean_left_right_imbalance",
        "std_left_right_imbalance",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
        "weight_loading_missing_count",
        "weight_loading_unexpected_count",
    ]
    sample_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "sample_num_steps",
        "sample_num_batches",
        "mean_masked_mae",
        "std_masked_mae",
        "mean_left_arm_mae",
        "std_left_arm_mae",
        "mean_right_arm_mae",
        "std_right_arm_mae",
        "mean_left_joint_mae",
        "std_left_joint_mae",
        "mean_left_gripper_mae",
        "std_left_gripper_mae",
        "mean_right_joint_mae",
        "std_right_joint_mae",
        "mean_right_gripper_mae",
        "std_right_gripper_mae",
        "mean_left_right_imbalance_mae",
        "std_left_right_imbalance_mae",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
    ]

    # Variant-specific train columns go after the base columns, natural-sorted.
    ordered_extra_train_columns = sorted(extra_train_columns, key=natural_key)
    train_columns = BASE_TRAIN_COLUMNS + ordered_extra_train_columns

    write_csv(metrics_dir / "teacher_forced_eval_table.csv", teacher_rows, teacher_columns)
    write_csv(metrics_dir / "sample_eval_table.csv", sample_rows, sample_columns)
    write_csv(metrics_dir / "training_summary.csv", train_rows, train_columns)
    write_startup_summaries(metrics_dir / "startup_summaries.txt", startup_summaries)

    # Prior-study metrics are optional; build_summary tolerates an empty dict.
    prior_metrics = load_prior_metrics(pathlib.Path(args.prior_metrics_root)) if args.prior_metrics_root else {}
    summary = build_summary(
        artifact_root,
        teacher_rows,
        sample_rows,
        train_rows,
        startup_summaries,
        log_errors,
        sample_batch_size_used,
        prior_metrics,
    )
    (metrics_dir / "summary.json").write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_readme(artifact_root / "README.md", summary, teacher_rows, sample_rows)


if __name__ == "__main__":
    main()
openpi/scripts/prune_stepcmp_checkpoints.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import pathlib
5
+ import shutil
6
+ import time
7
+ from datetime import datetime, timezone
8
+
9
+
10
def utc_ts() -> str:
    """Return the current UTC time as a 'YYYY-MM-DD HH:MM:SS UTC' string."""
    current = datetime.now(timezone.utc)
    return current.strftime("%Y-%m-%d %H:%M:%S UTC")
12
+
13
+
14
def prune_once(roots: list[pathlib.Path], keep_steps: set[str]) -> int:
    """Remove numeric step directories under ``roots`` that are not in ``keep_steps``.

    Entries that are not directories, start with ``tmp_`` (in-flight saves), or
    have non-numeric names are left alone. Returns the count of removed dirs.
    """
    removed = 0
    for root in roots:
        if not root.is_dir():
            continue
        for child in root.iterdir():
            prunable = (
                child.is_dir()
                and not child.name.startswith("tmp_")
                and child.name.isdigit()
                and child.name not in keep_steps
            )
            if not prunable:
                continue
            # Best-effort removal: a concurrent writer must not crash the pruner.
            shutil.rmtree(child, ignore_errors=True)
            print(f"[{utc_ts()}] pruned {child}", flush=True)
            removed += 1
    return removed
32
+
33
+
34
def main() -> None:
    """Run the checkpoint-retention pruner loop.

    Periodically scans the given checkpoint roots and deletes numeric step
    directories not listed in ``--keep-steps``. Intended to run alongside
    training and be killed by the orchestrating script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--interval-seconds", type=int, default=30)
    parser.add_argument("--keep-steps", nargs="+", default=["100", "500", "2000"])
    parser.add_argument("roots", nargs="+")
    args = parser.parse_args()

    roots = [pathlib.Path(root) for root in args.roots]
    keep_steps = set(args.keep_steps)
    print(
        f"[{utc_ts()}] retention pruner started interval_s={args.interval_seconds} keep_steps={sorted(keep_steps)}",
        flush=True,
    )
    # Intentional infinite loop: the pruner has no exit condition of its own.
    while True:
        prune_once(roots, keep_steps)
        time.sleep(args.interval_seconds)


if __name__ == "__main__":
    main()
openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Resolve the repo root relative to this script so it can run from any CWD.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venv/bin/activate"
PYTHON_BIN="$ROOT/.venv/bin/python"
# Artifact directory is UTC-date-stamped; both pieces are overridable via env.
ARTIFACT_DATE="${ARTIFACT_DATE:-$(date -u +%Y%m%d)}"
ARTIFACT_ROOT="${ARTIFACT_ROOT:-/workspace/pi05tests/artifacts/twin_dual_push_128_stepcmp_2k_${ARTIFACT_DATE}}"
RUN_LOG_DIR="$ARTIFACT_ROOT/run_logs"
METRICS_DIR="$ARTIFACT_ROOT/metrics"
REPRO_DIR="$ARTIFACT_ROOT/repro"
ENV_DIR="$ARTIFACT_ROOT/environment"
SANITY_DIR="$ARTIFACT_ROOT/sanity_checks"
mkdir -p "$RUN_LOG_DIR" "$METRICS_DIR" "$REPRO_DIR" "$ENV_DIR" "$SANITY_DIR" "$ROOT/run_logs"

# Hugging Face and cache locations pinned to the persistent /workspace volume.
export HF_TOKEN="${HF_TOKEN:-}"
export HF_HOME=/workspace/.hf
export HF_HUB_CACHE=/workspace/.hf/hub
export HF_DATASETS_CACHE=/workspace/.hf/datasets
export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
export XDG_CACHE_HOME=/workspace/.cache
export OPENPI_LEROBOT_HOME=/workspace/lerobot
export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT/src"
# Keep CPU math thread pools at 1 so dataloader workers don't oversubscribe.
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export PYTHONFAULTHANDLER=1

cd "$ROOT"
source "$VENV"

# Fixed dataset/eval settings shared by every run in this study.
TRAIN_REPO="lsnu/twin_dual_push_128_train"
VAL_REPO="lsnu/twin_dual_push_128_val"
TEACHER_VAL_BATCHES=100
SAMPLE_VAL_BATCHES=64
SAMPLE_NUM_STEPS="1,2,4,8,16"
PRIOR_METRICS_ROOT="/workspace/pi05tests/artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics"

# Short model key -> reporting variant name used in metric tables.
declare -A MODEL_VARIANT=(
  [shared]="shared"
  [head_only]="head_only_parallel"
  [split_ind]="split_independent"
  [split_comm]="split_communicating"
)

# Short model key -> training config name.
declare -A CONFIG_NAME=(
  [shared]="pi05_twin_dual_push_128_packed_baseline_pytorch_5k"
  [head_only]="pi05_twin_dual_push_128_packed_parallel_pytorch_5k"
  [split_ind]="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k"
  [split_comm]="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k"
)

# Short model key -> warm-start (step-0) checkpoint directory.
declare -A STEP0_CKPT=(
  [shared]="/workspace/checkpoints/pi05_base_single_pytorch"
  [head_only]="/workspace/checkpoints/pi05_base_parallel_packed_from_single"
  [split_ind]="/workspace/checkpoints/pi05_base_split_independent_packed_from_single"
  [split_comm]="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single"
)

# Short model key -> experiment name (drives log and checkpoint paths).
declare -A EXP_NAME=(
  [shared]="dual_push_128_stepcmp_shared_2k"
  [head_only]="dual_push_128_stepcmp_head_only_2k"
  [split_ind]="dual_push_128_stepcmp_split_ind_2k"
  [split_comm]="dual_push_128_stepcmp_split_comm_2k"
)

# Bookkeeping for background eval jobs and the retention pruner process.
eval_pids=()
eval_labels=()
checkpoint_pruner_pid=""
74
# Print a message to stdout prefixed with a UTC timestamp.
log() {
  local ts
  ts="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  printf '[%s] %s\n' "$ts" "$*"
}
77
+
78
# Append a UTC-timestamped message to the retention-pruner log file.
pruner_log() {
  local ts
  ts="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  printf '[%s] %s\n' "$ts" "$*" >>"$RUN_LOG_DIR/checkpoint_retention_pruner.log"
}
81
+
82
+ trim_env_snapshot() {
83
+ env | sort | grep -E '^(HF_|HUGGINGFACE_|OPENPI_|PYTORCH_|PYTHONPATH|TOKENIZERS_PARALLELISM|XDG_CACHE_HOME)=' >"$ENV_DIR/env_selected.txt"
84
+ }
85
+
86
# Record the full host/software environment under $ENV_DIR for reproducibility:
# date, kernel, Python, pip freeze, torch environment, GPU state, and disk usage.
save_environment_snapshot() {
  date -u '+%Y-%m-%d %H:%M:%S UTC' >"$ENV_DIR/date_utc.txt"
  uname -a >"$ENV_DIR/uname.txt"
  python --version >"$ENV_DIR/python_version.txt" 2>&1
  pip freeze >"$ENV_DIR/pip_freeze.txt"
  # collect_env can hang on some drivers; cap it and note the timeout in-file.
  if ! timeout 120s python -m torch.utils.collect_env >"$ENV_DIR/torch_env.txt" 2>&1; then
    echo "torch.utils.collect_env timed out after 120 seconds" >>"$ENV_DIR/torch_env.txt"
  fi
  nvidia-smi >"$ENV_DIR/nvidia_smi.txt"
  nvidia-smi topo -m >"$ENV_DIR/nvidia_smi_topo.txt"
  df -h /workspace >"$ENV_DIR/df_workspace.txt"
  trim_env_snapshot
}
99
+
100
# Copy this script plus the metrics collector into $REPRO_DIR and write a
# manifest mapping every variant to its step-0 and training checkpoint paths.
copy_repro_manifests() {
  cp "$0" "$REPRO_DIR/commands_stepcmp.sh"
  cp "$ROOT/scripts/collect_twin_dual_push_128_stepcmp_metrics.py" "$REPRO_DIR/collect_twin_dual_push_128_stepcmp_metrics.py"
  cat >"$REPRO_DIR/checkpoint_locations.txt" <<EOF
shared_step0=${STEP0_CKPT[shared]}
head_only_step0=${STEP0_CKPT[head_only]}
split_independent_step0=${STEP0_CKPT[split_ind]}
split_communicating_step0=${STEP0_CKPT[split_comm]}
shared_train_root=$ROOT/checkpoints/${CONFIG_NAME[shared]}/${EXP_NAME[shared]}
head_only_train_root=$ROOT/checkpoints/${CONFIG_NAME[head_only]}/${EXP_NAME[head_only]}
split_independent_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_ind]}/${EXP_NAME[split_ind]}
split_communicating_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_comm]}/${EXP_NAME[split_comm]}
artifact_root=$ARTIFACT_ROOT
EOF
}
115
+
116
# Abort the whole run when a required regular file is absent.
require_file() {
  local path="$1"
  [[ -f "$path" ]] && return 0
  log "required file missing: $path"
  exit 1
}
123
+
124
# Abort the whole run when a required directory is absent.
require_dir() {
  local path="$1"
  [[ -d "$path" ]] && return 0
  log "required directory missing: $path"
  exit 1
}
131
+
132
# Print the canonical norm-stats JSON path for a model key
# (shared / head_only / split_ind / split_comm).
norm_stats_path_for_key() {
  local model_key="$1"
  printf '%s\n' "$ROOT/assets/${CONFIG_NAME[$model_key]}/$TRAIN_REPO/norm_stats.json"
}
136
+
137
# Verify that both split configs share byte-identical norm stats, then fan the
# canonical copy out to all four configs, recording every sha256 in
# $SANITY_DIR/norm_stats_status.txt. Exits non-zero on any mismatch.
ensure_packed_dual_push_norm_stats() {
  local split_ind_stats
  local split_comm_stats
  split_ind_stats="$(norm_stats_path_for_key split_ind)"
  split_comm_stats="$(norm_stats_path_for_key split_comm)"

  # Both split-config stats files must already exist.
  require_file "$split_ind_stats"
  require_file "$split_comm_stats"

  local split_ind_sha
  local split_comm_sha
  split_ind_sha="$(sha256sum "$split_ind_stats" | awk '{print $1}')"
  split_comm_sha="$(sha256sum "$split_comm_stats" | awk '{print $1}')"

  # The two split configs must agree byte-for-byte before anything is copied.
  if [[ "$split_ind_sha" != "$split_comm_sha" ]]; then
    log "packed dual-push split norm stats differ across split configs"
    echo "split_ind=$split_ind_stats sha256=$split_ind_sha" >"$SANITY_DIR/norm_stats_status.txt"
    echo "split_comm=$split_comm_stats sha256=$split_comm_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    exit 1
  fi

  local canonical_stats="$split_ind_stats"
  local key
  # Truncate the status file, then record the canonical source and checksum.
  : >"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_source=$canonical_stats" >>"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_sha256=$split_ind_sha" >>"$SANITY_DIR/norm_stats_status.txt"

  for key in shared head_only split_ind split_comm; do
    local dst
    local dst_sha
    dst="$(norm_stats_path_for_key "$key")"
    # Restore missing per-config copies from the canonical file.
    if [[ ! -f "$dst" ]]; then
      mkdir -p "$(dirname "$dst")"
      cp "$canonical_stats" "$dst"
      log "restored missing packed dual-push norm stats for ${MODEL_VARIANT[$key]} -> $dst"
    fi
    dst_sha="$(sha256sum "$dst" | awk '{print $1}')"
    echo "${MODEL_VARIANT[$key]}=$dst sha256=$dst_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    # Every config's copy must match the canonical checksum.
    if [[ "$dst_sha" != "$split_ind_sha" ]]; then
      log "packed dual-push norm stats mismatch for ${MODEL_VARIANT[$key]}: $dst"
      exit 1
    fi
  done
}
181
+
182
# Ensure every warm-start (step-0) checkpoint exists, regenerating any that are
# missing from the single-model base checkpoint. If a split checkpoint had to be
# regenerated, rerun the split invariant checks. Regeneration status is recorded
# in $SANITY_DIR/bootstrap_regeneration_status.txt.
ensure_bootstrap_checkpoints() {
  local regenerated_any=0
  local regenerated_split=0

  # The single-model base is the source for all regenerations; it must exist.
  require_file "/workspace/checkpoints/pi05_base_single_pytorch/model.safetensors"

  if [[ ! -f "${STEP0_CKPT[head_only]}/model.safetensors" ]]; then
    log "regenerating head-only packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[head_only]}" \
      --output_path "${STEP0_CKPT[head_only]}" \
      >"$SANITY_DIR/init_head_only.log" 2>&1
    regenerated_any=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_ind]}/model.safetensors" ]]; then
    log "regenerating split-independent packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --output_path "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/init_split_independent.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_comm]}/model.safetensors" ]]; then
    log "regenerating split-communicating packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --output_path "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/init_split_communicating.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  # All three derived checkpoints must now be present (regenerated or not).
  require_file "${STEP0_CKPT[head_only]}/model.safetensors"
  require_file "${STEP0_CKPT[split_ind]}/model.safetensors"
  require_file "${STEP0_CKPT[split_comm]}/model.safetensors"

  # Freshly regenerated split checkpoints must pass the invariant checks again.
  if [[ "$regenerated_split" -eq 1 ]]; then
    log "rerunning split invariant checks after bootstrap regeneration"
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --checkpoint_dir "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/check_split_independent_invariants.log" 2>&1
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --checkpoint_dir "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/check_split_communicating_invariants.log" 2>&1
  fi

  printf 'regenerated_any=%s\nregenerated_split=%s\n' "$regenerated_any" "$regenerated_split" >"$SANITY_DIR/bootstrap_regeneration_status.txt"
}
238
+
239
# Extra CLI args appended to eval invocations to cap the sampling batch size;
# an empty array means "use the eval script's default batch size".
sample_batch_size_arg=()
# Human-readable record of the chosen size ("default" = no override); also
# persisted to $SANITY_DIR/sample_batch_size_used.txt for resumed runs.
sample_batch_size_value="default"
241
+
242
# Run a single-batch, single-sample probe eval on the split_communicating
# step-0 checkpoint to check whether a given sampling batch size fits on GPU 0.
# Arguments: $1 - batch size to try; "" probes the eval script's default
# Outputs:   full eval output captured in $SANITY_DIR/sample_batch_size_probe.log
# Returns:   exit status of the probe eval run
run_sample_batch_probe() {
  local requested_size="$1"
  local probe_log="$SANITY_DIR/sample_batch_size_probe.log"
  local override_args=()
  if [[ -n "$requested_size" ]]; then
    override_args=(--sample_batch_size "$requested_size")
  fi
  CUDA_VISIBLE_DEVICES=0 python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[split_comm]}" \
    --checkpoint_dir "${STEP0_CKPT[split_comm]}" \
    --repo_id "$VAL_REPO" \
    --num_batches 1 \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches 1 \
    --sample_num_steps "16" \
    --sample_seed 321 \
    "${override_args[@]}" \
    >"$probe_log" 2>&1
}
262
+
263
# Record the chosen sampling batch size in the globals and mirror it into the
# sanity dir so resumed runs (SKIP_SAMPLE_BATCH_PROBE=1) can reload it.
# "default" means "no --sample_batch_size override" — consistent with how
# load_saved_sample_batch_size interprets the persisted file.
# Arguments: $1 - batch size value, or "default"
commit_sample_batch_size() {
  local value="$1"
  sample_batch_size_value="$value"
  if [[ "$value" == "default" ]]; then
    sample_batch_size_arg=()
  else
    sample_batch_size_arg=(--sample_batch_size "$value")
  fi
  echo "$sample_batch_size_value" >"$SANITY_DIR/sample_batch_size_used.txt"
}

# Returns 0 when the most recent probe log shows an out-of-memory failure.
probe_failed_with_oom() {
  grep -qi 'out of memory' "$SANITY_DIR/sample_batch_size_probe.log"
}

# Pick a sampling batch size for the eval sweeps: honor an explicit
# SAMPLE_BATCH_SIZE_OVERRIDE, otherwise probe the default and halve a
# candidate (8, 4, 2, 1) on each OOM until one fits. Any non-OOM probe
# failure, or exhausting all candidates, aborts the run.
determine_sample_batch_size() {
  local override="${SAMPLE_BATCH_SIZE_OVERRIDE:-}"
  if [[ -n "$override" ]]; then
    commit_sample_batch_size "$override"
    return
  fi

  log "probing sample-eval batch size on split_communicating step-0 checkpoint"
  if run_sample_batch_probe ""; then
    commit_sample_batch_size "default"
    return
  fi

  if ! probe_failed_with_oom; then
    log "sample batch size probe failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
    exit 1
  fi

  local candidate=8
  while [[ "$candidate" -ge 1 ]]; do
    log "retrying sample-eval probe with --sample_batch_size=$candidate"
    if run_sample_batch_probe "$candidate"; then
      commit_sample_batch_size "$candidate"
      return
    fi
    if ! probe_failed_with_oom; then
      log "sample batch size retry failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
      exit 1
    fi
    candidate=$((candidate / 2))
  done

  log "unable to find a viable sample batch size; see $SANITY_DIR/sample_batch_size_probe.log"
  exit 1
}
304
+
305
# Restore the sampling batch size chosen by a previous run (used when the
# probe is skipped via SKIP_SAMPLE_BATch_PROBE=1... see main). Falls back to
# "default" when the sanity file is missing OR empty, and rewrites the file
# so later steps always find a value.
load_saved_sample_batch_size() {
  local saved_value="default"
  if [[ -f "$SANITY_DIR/sample_batch_size_used.txt" ]]; then
    saved_value="$(<"$SANITY_DIR/sample_batch_size_used.txt")"
    # A truncated/empty file would otherwise yield a bogus
    # `--sample_batch_size ""` argument; treat it as the default.
    if [[ -z "$saved_value" ]]; then
      saved_value="default"
      echo "$saved_value" >"$SANITY_DIR/sample_batch_size_used.txt"
    fi
  else
    echo "$saved_value" >"$SANITY_DIR/sample_batch_size_used.txt"
  fi

  sample_batch_size_value="$saved_value"
  sample_batch_size_arg=()
  if [[ "$saved_value" != "default" ]]; then
    sample_batch_size_arg=(--sample_batch_size "$saved_value")
  fi
}
319
+
320
# Kick off one validation eval in the background on a dedicated GPU, and
# record its PID/label so wait_for_eval_jobs can reap and report on it.
# Arguments: $1 - gpu index, $2 - variant key, $3 - step label,
#            $4 - checkpoint dir, $5 - log file path
launch_eval_async() {
  local gpu_id="$1" variant="$2" step_label="$3" ckpt="$4" out_log="$5"
  CUDA_VISIBLE_DEVICES="$gpu_id" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$variant]}" \
    --checkpoint_dir "$ckpt" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$out_log" 2>&1 &
  eval_pids+=("$!")
  eval_labels+=("${MODEL_VARIANT[$variant]} step=${step_label} gpu=${gpu_id}")
}
341
+
342
# Run one validation eval in the foreground on the given GPU.
# Arguments: $1 - gpu index, $2 - variant key, $3 - step label (kept for
#            call-site symmetry with launch_eval_async), $4 - checkpoint dir,
#            $5 - log file path
run_eval_sync() {
  local gpu_id="$1" variant="$2" step_label="$3" ckpt="$4" out_log="$5"
  CUDA_VISIBLE_DEVICES="$gpu_id" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$variant]}" \
    --checkpoint_dir "$ckpt" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$out_log" 2>&1
}
361
+
362
# Reap every background eval job recorded in eval_pids, logging each
# failure with its label. The tracking arrays are always cleared; if any
# job failed the whole run aborts after all have been waited on.
wait_for_eval_jobs() {
  local any_failed=0
  local i
  for i in "${!eval_pids[@]}"; do
    wait "${eval_pids[$i]}" || {
      log "evaluation failed: ${eval_labels[$i]}"
      any_failed=1
    }
  done
  eval_pids=()
  eval_labels=()
  if [[ "$any_failed" -ne 0 ]]; then
    exit 1
  fi
}
377
+
378
# True when an eval log exists and ends with the sample-eval timing marker,
# i.e. the eval ran to completion and can be skipped on resume.
# The marker line embeds the sampling step count, so match the configured
# SAMPLE_NUM_STEPS (falling back to 16, the previously hard-coded value):
# with a literal 16 here, runs configured with a different step count would
# never recognize finished evals and would redo every sweep on resume.
# Arguments: $1 - eval log path
eval_log_complete() {
  local log_path="$1"
  [[ -f "$log_path" ]] && grep -q "^sample_eval_num_steps_${SAMPLE_NUM_STEPS:-16}_per_batch_timing_seconds:" "$log_path"
}
382
+
383
# Print the per-variant checkpoint root directories, one per line, in the
# fixed variant order used throughout the runner.
checkpoint_roots() {
  local variant
  for variant in shared head_only split_ind split_comm; do
    printf '%s\n' "$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  done
}
390
+
391
# One pruning pass: under every checkpoint root, delete purely numeric step
# directories except the retained evaluation steps (100/500/2000); tmp_*
# (in-progress saves) and non-numeric entries are left alone.
prune_checkpoint_roots_once() {
  local ckpt_root entry step_name
  while read -r ckpt_root; do
    [[ -d "$ckpt_root" ]] || continue
    for entry in "$ckpt_root"/*; do
      [[ -d "$entry" ]] || continue
      step_name="$(basename "$entry")"
      case "$step_name" in
        100 | 500 | 2000 | tmp_*)
          continue
          ;;
      esac
      if [[ "$step_name" =~ ^[0-9]+$ ]]; then
        rm -rf -- "$entry"
        pruner_log "pruned $entry"
      fi
    done
  done < <(checkpoint_roots)
}
408
+
409
# Start a detached subshell that prunes intermediate checkpoints every 30s,
# keeping only steps 100/500/2000 (and tmp_* in-progress saves).
# The subshell PID is stored in checkpoint_pruner_pid so that
# stop_checkpoint_pruner (installed as an EXIT trap in main) can kill it.
start_checkpoint_pruner() {
  pruner_log "runner checkpoint pruner started interval_s=30 keep_steps=[100,500,2000]"
  (
    while true; do
      prune_checkpoint_roots_once
      sleep 30
    done
  ) &
  checkpoint_pruner_pid="$!"
}
419
+
420
# Stop the background pruner loop, if one is running, and clear its PID.
# kill/wait failures are tolerated so this is safe to call from the EXIT
# trap even when the process has already died.
stop_checkpoint_pruner() {
  if [[ -z "$checkpoint_pruner_pid" ]]; then
    return 0
  fi
  kill "$checkpoint_pruner_pid" >/dev/null 2>&1 || true
  wait "$checkpoint_pruner_pid" 2>/dev/null || true
  checkpoint_pruner_pid=""
}
427
+
428
# Evaluate all four warm-start (step-0) checkpoints, one variant per GPU.
# Evals whose log already carries the completion marker are skipped; with
# PARALLEL_EVALS=1 (default) the remaining ones run concurrently, otherwise
# sequentially on their assigned GPUs.
run_step0_evals() {
  log "starting step-0 evaluation sweep"
  local launched_any=0
  local parallel="${PARALLEL_EVALS:-1}"
  local row gpu key ckpt log_path
  # Each row: <gpu> <variant-key> <checkpoint-dir> <log-path>
  local rows=(
    "0 shared ${STEP0_CKPT[shared]} $RUN_LOG_DIR/${EXP_NAME[shared]}_val_0.log"
    "1 head_only ${STEP0_CKPT[head_only]} $RUN_LOG_DIR/${EXP_NAME[head_only]}_val_0.log"
    "2 split_ind ${STEP0_CKPT[split_ind]} $RUN_LOG_DIR/${EXP_NAME[split_ind]}_val_0.log"
    "3 split_comm ${STEP0_CKPT[split_comm]} $RUN_LOG_DIR/${EXP_NAME[split_comm]}_val_0.log"
  )
  for row in "${rows[@]}"; do
    read -r gpu key ckpt log_path <<<"$row"
    if eval_log_complete "$log_path"; then
      log "step-0 eval already complete for ${MODEL_VARIANT[$key]}"
      continue
    fi
    if [[ "$parallel" == "1" ]]; then
      launch_eval_async "$gpu" "$key" 0 "$ckpt" "$log_path"
      launched_any=1
    else
      run_eval_sync "$gpu" "$key" 0 "$ckpt" "$log_path"
    fi
  done
  if [[ "$parallel" == "1" && "$launched_any" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished step-0 evaluation sweep"
}
454
+
455
# Train one model variant for 2000 steps on 4 GPUs via torch.distributed.run.
# With SKIP_COMPLETED_TRAIN=1 the run is skipped when the final step-2000
# checkpoint already exists, reusing the existing checkpoint root.
# Arguments: $1 - variant key (shared|head_only|split_ind|split_comm)
train_variant() {
  local variant="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  local train_log="$RUN_LOG_DIR/${EXP_NAME[$variant]}.log"
  if [[ "${SKIP_COMPLETED_TRAIN:-0}" == "1" && -d "$ckpt_root/2000" ]]; then
    log "training already complete for model_variant=${MODEL_VARIANT[$variant]}; skipping train and reusing $ckpt_root"
    return
  fi
  log "training start model_variant=${MODEL_VARIANT[$variant]} exp_name=${EXP_NAME[$variant]}"
  "$PYTHON_BIN" -m torch.distributed.run --standalone --nproc_per_node=4 scripts/train_pytorch.py \
    "${CONFIG_NAME[$variant]}" \
    --exp_name "${EXP_NAME[$variant]}" \
    --overwrite \
    --num_train_steps 2000 \
    --save_interval 100 \
    --log_interval 10 \
    >"$train_log" 2>&1
  log "training finished model_variant=${MODEL_VARIANT[$variant]}"
}
474
+
475
# Evaluate the 100/500/2000-step checkpoints of one trained variant, each on
# its own GPU. All three checkpoints must exist. Completed evals are skipped;
# with PARALLEL_EVALS=1 (default) the rest run concurrently.
# Arguments: $1 - variant key
run_post_train_evals() {
  local variant="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  require_dir "$ckpt_root/100"
  require_dir "$ckpt_root/500"
  require_dir "$ckpt_root/2000"

  log "starting post-train evaluation sweep for ${MODEL_VARIANT[$variant]}"
  local launched_any=0
  local parallel="${PARALLEL_EVALS:-1}"
  local row gpu step ckpt log_path
  # Each row: <gpu> <step> <checkpoint-dir> <log-path>
  local rows=(
    "0 100 $ckpt_root/100 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_100.log"
    "1 500 $ckpt_root/500 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_500.log"
    "2 2000 $ckpt_root/2000 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_2000.log"
  )
  for row in "${rows[@]}"; do
    read -r gpu step ckpt log_path <<<"$row"
    if eval_log_complete "$log_path"; then
      log "post-train eval already complete for ${MODEL_VARIANT[$variant]} step=$step"
      continue
    fi
    if [[ "$parallel" == "1" ]]; then
      launch_eval_async "$gpu" "$variant" "$step" "$ckpt" "$log_path"
      launched_any=1
    else
      run_eval_sync "$gpu" "$variant" "$step" "$ckpt" "$log_path"
    fi
  done
  if [[ "$parallel" == "1" && "$launched_any" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished post-train evaluation sweep for ${MODEL_VARIANT[$variant]}"
}
506
+
507
# Aggregate the eval logs under $ARTIFACT_ROOT into the step-comparison
# metrics, merging in metrics from the prior run at $PRIOR_METRICS_ROOT.
# Collector output is captured in $RUN_LOG_DIR/collect_metrics.log.
collect_metrics() {
  log "collecting step-comparison metrics"
  python -u scripts/collect_twin_dual_push_128_stepcmp_metrics.py \
    --artifact_root "$ARTIFACT_ROOT" \
    --prior_metrics_root "$PRIOR_METRICS_ROOT" \
    >"$RUN_LOG_DIR/collect_metrics.log" 2>&1
  log "metrics collection finished"
}
515
+
516
# Top-level pipeline: snapshot the environment, bootstrap warm-start
# checkpoints and norm stats, pick a sample-eval batch size, run step-0
# evals, then train + evaluate each selected variant, and finally collect
# metrics. SKIP_* env toggles and MODEL_KEYS support resuming / restricting
# a partially completed run.
main() {
  # Guarantee the background pruner is torn down on every exit path.
  trap stop_checkpoint_pruner EXIT
  log "packed dual-push 128 step comparison runner started"
  if [[ "${SKIP_ENV_SNAPSHOT:-0}" == "1" ]]; then
    log "skipping environment snapshot (SKIP_ENV_SNAPSHOT=1)"
  else
    save_environment_snapshot
  fi
  copy_repro_manifests
  ensure_bootstrap_checkpoints
  ensure_packed_dual_push_norm_stats
  start_checkpoint_pruner
  if [[ "${SKIP_SAMPLE_BATCH_PROBE:-0}" == "1" ]]; then
    log "skipping sample-eval batch-size probe (SKIP_SAMPLE_BATCH_PROBE=1)"
    # Reuse the batch size recorded by an earlier run instead of probing.
    load_saved_sample_batch_size
  else
    determine_sample_batch_size
  fi
  if [[ "${SKIP_STEP0_EVALS:-0}" == "1" ]]; then
    log "skipping step-0 evaluation sweep (SKIP_STEP0_EVALS=1)"
  else
    run_step0_evals
  fi

  # MODEL_KEYS (space-separated) may restrict the sweep to a subset of
  # variants; unknown keys abort before any training starts.
  local selected_keys="${MODEL_KEYS:-shared head_only split_ind split_comm}"
  local key
  for key in $selected_keys; do
    case "$key" in
      shared|head_only|split_ind|split_comm) ;;
      *)
        log "unknown model key in MODEL_KEYS: $key"
        exit 1
        ;;
    esac
    train_variant "$key"
    run_post_train_evals "$key"
  done

  collect_metrics
  log "packed dual-push 128 step comparison runner finished successfully"
}
557
+
558
+ main "$@"
run_logs/hf_upload_20260310.log ADDED
The diff for this file is too large to render. See raw diff