lsnu committed on
Commit
f7a3ee4
·
verified ·
1 Parent(s): ccf25b1

Add files using upload-large-folder tool

Browse files
Files changed (33) hide show
  1. README.md +74 -1
  2. artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md +51 -0
  3. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt +2 -0
  4. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt +3 -0
  5. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt +32 -0
  6. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt +23 -0
  7. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt +223 -0
  8. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt +1 -0
  9. artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt +82 -0
  10. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt +2 -0
  11. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log +9 -0
  12. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log +11 -0
  13. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log +15 -0
  14. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log +16 -0
  15. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log +15 -0
  16. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt +6 -0
  17. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log +52 -0
  18. artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt +1 -0
  19. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt +1 -0
  20. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt +223 -0
  21. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt +1 -0
  22. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt +1 -0
  23. artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt +1 -0
  24. openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt +3 -0
  25. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors +3 -0
  26. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt +3 -0
  27. openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt +3 -0
  28. openpi/run_logs/split_independent_real_smoke20.log +0 -0
  29. openpi/run_logs/split_independent_real_smoke3.log +104 -0
  30. openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py +913 -0
  31. openpi/scripts/prune_stepcmp_checkpoints.py +53 -0
  32. openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh +558 -0
  33. run_logs/hf_upload_20260310.log +0 -0
README.md CHANGED
@@ -12,6 +12,7 @@ Three runs are included:
12
  1. an initial `2K` baseline-vs-parallel comparison
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
 
15
 
16
  This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
17
 
@@ -57,6 +58,41 @@ Dual-push `128` screening results:
57
 
58
  The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  ## Warm-start note
61
 
62
  The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
@@ -107,11 +143,43 @@ New bring-up artifact bundle:
107
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
108
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
109
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  - `artifacts/twin_split_expert_bringup_20260310/`
111
- - split-expert warm-start checkpoints, sanity checks, and bring-up repro commands
 
 
 
 
 
 
 
 
112
  - `artifacts/pi05_base_params/`
113
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
114
 
 
 
 
 
 
 
 
 
 
 
 
115
  ## Key files
116
 
117
  - Full report: `REPORT.md`
@@ -122,6 +190,11 @@ New bring-up artifact bundle:
122
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
123
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
124
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
 
 
 
 
 
125
  - split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
126
  - split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
127
  - split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
 
12
  1. an initial `2K` baseline-vs-parallel comparison
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
15
+ 4. a `2K` dual-push `128` four-way step comparison across `shared`, `head_only_parallel`, `split_independent`, and `split_communicating`
16
 
17
  This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
18
 
 
58
 
59
  The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
60
 
61
+ Dual-push `128` four-way `2K` step comparison raw results:
62
+
63
+ Step-0 teacher-forced masked validation loss:
64
+
65
+ | Model | Step-0 val loss | Step-0 left/right imbalance |
66
+ | --- | ---: | ---: |
67
+ | Shared | `1.084735` | `0.505345` |
68
+ | Head-only parallel | `1.082985` | `0.501182` |
69
+ | Split independent | `1.328262` | `0.448843` |
70
+ | Split communicating | `1.783048` | `0.671085` |
71
+
72
+ Step-2000 teacher-forced masked validation loss:
73
+
74
+ | Model | Step-2000 val loss | Step-2000 left/right imbalance |
75
+ | --- | ---: | ---: |
76
+ | Shared | `0.055329` | `0.069564` |
77
+ | Head-only parallel | `0.055297` | `0.069380` |
78
+ | Split independent | `0.063537` | `0.092029` |
79
+ | Split communicating | `0.059952` | `0.080435` |
80
+
81
+ Step-2000 sample masked MAE:
82
+
83
+ | Model | 1-step MAE | 4-step MAE | 16-step MAE |
84
+ | --- | ---: | ---: | ---: |
85
+ | Shared | `0.087330` | `0.078164` | `0.085222` |
86
+ | Head-only parallel | `0.086764` | `0.078301` | `0.085272` |
87
+ | Split independent | `0.079100` | `0.070436` | `0.075281` |
88
+ | Split communicating | `0.078618` | `0.071087` | `0.075570` |
89
+
90
+ Full raw tables for the `0/100/500/2000` sweep live in:
91
+
92
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
93
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
94
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
95
+
96
  ## Warm-start note
97
 
98
  The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
 
143
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
144
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
145
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
146
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
147
+ - dual-push `128` four-way `2K` step-comparison bundle with metrics, logs, repro manifests, and environment snapshot
148
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
149
+ - small preflight/debug snapshot from the interrupted bring-up path; useful for debugging the runner, not the canonical result bundle
150
+ - `artifacts/twin_split_expert_bringup_20260310/`
151
+ - split-expert bring-up bundle committed with summary README, repro commands, detached run logs, and sanity checks
152
+
153
+ ## Committed artifact note
154
+
155
+ For this update, the committed artifact payloads are:
156
+
157
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
158
+ - the official finalized `4`-model dual-push `2K` step-comparison bundle
159
  - `artifacts/twin_split_expert_bringup_20260310/`
160
+ - the split-expert bring-up bundle used as the sanity and warm-start reference
161
+ - `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
162
+ - a small debug-only environment snapshot from the failed/resumed bring-up sequence
163
+
164
+ The debug bundle is intentionally committed only as runner diagnostics. The canonical study outputs are the non-`_debug` step-comparison bundle plus the split bring-up bundle.
165
+ - `openpi/run_logs/`
166
+ - raw local split bring-up logs kept for completeness; the canonical copies for the finalized bring-up record live under `artifacts/twin_split_expert_bringup_20260310/run_logs/`
167
+ - `openpi/scripts/upload_stepcmp_bundle_to_hf.py`
168
+ - the committed high-throughput HF uploader for the step-comparison bundle and retained checkpoints; it uses `huggingface_hub.HfApi.upload_large_folder(...)`
169
  - `artifacts/pi05_base_params/`
170
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
171
 
172
+ ## Future commit/upload workflow
173
+
174
+ When adding new experiment results to this repo:
175
+
176
+ - keep the canonical bundle under `artifacts/<study_name>/` and only retain the checkpoint steps that are scientifically required under `openpi/checkpoints/`
177
+ - before claiming the repo is fully committed, audit ignored artifact paths explicitly:
178
+ - `git ls-files --others -i --exclude-standard --directory -- openpi/checkpoints artifacts openpi/run_logs run_logs`
179
+ - if a result is intentionally kept in an ignored path such as `openpi/checkpoints/` or `openpi/run_logs/`, force-add it explicitly with `git add --sparse -f ...`
180
+ - use `openpi/scripts/upload_stepcmp_bundle_to_hf.py` for large HF uploads; it uses `huggingface_hub.HfApi.upload_large_folder(...)` and is the preferred path for checkpoint-heavy updates
181
+ - never hardcode HF credentials in scripts, logs, or READMEs; keep the credential in `HF_TOKEN` or load it from `HF_TOKEN_FILE`, and check for literal `hf_...` strings before committing
182
+
183
  ## Key files
184
 
185
  - Full report: `REPORT.md`
 
190
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
191
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
192
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
193
+ - dual-push `2K` step-comparison summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/summary.json`
194
+ - dual-push `2K` step-comparison README: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md`
195
+ - dual-push `2K` teacher-forced table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
196
+ - dual-push `2K` sample eval table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
197
+ - dual-push `2K` training summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
198
  - split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
199
  - split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
200
  - split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # twin_dual_push_128_stepcmp_2k_20260311
2
+
3
+ Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.
4
+
5
+ ## Quick answers
6
+ - Smallest step-0 teacher-forced jump vs `shared`: `head_only_parallel` (`-0.001750`).
7
+ - Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `head_only_parallel` (`+0.000005`).
8
+ - Best teacher-forced result by step `2000`: `head_only_parallel`.
9
+ - Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `split_communicating`.
10
+ - Split vs head-only by step `2000`: teacher-forced beat flags `split_independent=False`, `split_communicating=False`; sample beat flags `split_independent=True`, `split_communicating=True`.
11
+ - `split_communicating` vs `split_independent` at `2000`: teacher delta `-0.003585`, sample-average delta `-0.000257`.
12
+
13
+ ## Step-0 teacher-forced comparison
14
+ | model | mean_val_loss | delta_vs_shared | left_right_imbalance |
15
+ | --- | --- | --- | --- |
16
+ | shared | 1.084735 | +0.000000 | 0.505345 |
17
+ | head_only_parallel | 1.082985 | -0.001750 | 0.501182 |
18
+ | split_independent | 1.328262 | +0.243527 | 0.448843 |
19
+ | split_communicating | 1.783048 | +0.698313 | 0.671085 |
20
+
21
+ ## Step-2000 comparison
22
+ | model | mean_val_loss | 0_to_2000_improvement | left_right_imbalance |
23
+ | --- | --- | --- | --- |
24
+ | shared | 0.055329 | 1.029406 | 0.069564 |
25
+ | head_only_parallel | 0.055297 | 1.027688 | 0.069380 |
26
+ | split_independent | 0.063537 | 1.264725 | 0.092029 |
27
+ | split_communicating | 0.059952 | 1.723096 | 0.080435 |
28
+
29
+ | model | 1-step_mae | 4-step_mae | 16-step_mae |
30
+ | --- | --- | --- | --- |
31
+ | shared | 0.087330 | 0.078164 | 0.085222 |
32
+ | head_only_parallel | 0.086764 | 0.078301 | 0.085272 |
33
+ | split_independent | 0.079100 | 0.070436 | 0.075281 |
34
+ | split_communicating | 0.078618 | 0.071087 | 0.075570 |
35
+
36
+ ## Stability notes
37
+ - Sample batch size used for all official evals: `16`.
38
+ - Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.
39
+ - Peak training VRAM by model: shared=35.23GB, head_only_parallel=35.27GB, split_independent=41.73GB, split_communicating=41.73GB.
40
+ - `split_communicating` communication path: active=`True`, `grad_cross_arm_comm_max=0.394700`, `attention_mass_mean=0.009074`, `gate_abs_max=0.003900`.
41
+
42
+ ## Regression check vs prior dual-push screen
43
+ - Prior `5K` study at step `2000` had `baseline=0.083194` and `parallel=0.082729` with head-only edge `+0.000465`. This rerun has `shared=0.055329` and `head_only_parallel=0.055297` with head-only edge `+0.000032`; direction match=`True`.
44
+ - Prior `5K` study `4`-step MAE at step `2000` had `baseline=0.069732` and `parallel=0.069053` with head-only edge `+0.000679`. This rerun has `shared=0.078164` and `head_only_parallel=0.078301` with head-only edge `-0.000137`; direction match=`False`.
45
+
46
+ ## Files
47
+ - `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.
48
+ - `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.
49
+ - `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.
50
+ - `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.
51
+ - `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Filesystem Size Used Avail Use% Mounted on
2
+ mfs#us-mo-1.runpod.net:9421 154T 129T 25T 84% /workspace
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PYTHONPATH=/workspace/pi05tests/openpi/src
2
+ TOKENIZERS_PARALLELISM=false
3
+ XDG_CACHE_HOME=/workspace/.cache
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wed Mar 11 23:32:47 2026
2
+ +-----------------------------------------------------------------------------------------+
3
+ | NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 |
4
+ +-----------------------------------------+------------------------+----------------------+
5
+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
6
+ | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
7
+ | | | MIG M. |
8
+ |=========================================+========================+======================|
9
+ | 0 NVIDIA H100 80GB HBM3 On | 00000000:2A:00.0 Off | 0 |
10
+ | N/A 30C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
11
+ | | | Disabled |
12
+ +-----------------------------------------+------------------------+----------------------+
13
+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 |
14
+ | N/A 29C P0 71W / 700W | 0MiB / 81559MiB | 0% Default |
15
+ | | | Disabled |
16
+ +-----------------------------------------+------------------------+----------------------+
17
+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:9A:00.0 Off | 0 |
18
+ | N/A 28C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
19
+ | | | Disabled |
20
+ +-----------------------------------------+------------------------+----------------------+
21
+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:AB:00.0 Off | 0 |
22
+ | N/A 29C P0 74W / 700W | 0MiB / 81559MiB | 0% Default |
23
+ | | | Disabled |
24
+ +-----------------------------------------+------------------------+----------------------+
25
+
26
+ +-----------------------------------------------------------------------------------------+
27
+ | Processes: |
28
+ | GPU GI CI PID Type Process name GPU Memory |
29
+ | ID ID Usage |
30
+ |=========================================================================================|
31
+ | No running processes found |
32
+ +-----------------------------------------------------------------------------------------+
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GPU0 GPU1 GPU2 GPU3 NIC0 NIC1 CPU Affinity NUMA Affinity GPU NUMA ID
2
+ GPU0 X NV18 NV18 NV18 NODE NODE 0-51,104-155 0 N/A
3
+ GPU1 NV18 X NV18 NV18 NODE NODE 0-51,104-155 0 N/A
4
+ GPU2 NV18 NV18 X NV18 SYS SYS 52-103,156-207 1 N/A
5
+ GPU3 NV18 NV18 NV18 X SYS SYS 52-103,156-207 1 N/A
6
+ NIC0 NODE NODE SYS SYS X PIX
7
+ NIC1 NODE NODE SYS SYS PIX X
8
+
9
+ Legend:
10
+
11
+ X = Self
12
+ SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
13
+ NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
14
+ PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
15
+ PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
16
+ PIX = Connection traversing at most a single PCIe bridge
17
+ NV# = Connection traversing a bonded set of # NVLinks
18
+
19
+ NIC Legend:
20
+
21
+ NIC0: mlx5_3
22
+ NIC1: mlx5_4
23
+
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.4.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.6.0
8
+ argon2-cffi==23.1.0
9
+ argon2-cffi-bindings==21.2.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ async-lru==2.0.4
13
+ attrs==25.4.0
14
+ augmax==0.4.1
15
+ av==16.1.0
16
+ babel==2.16.0
17
+ beartype==0.19.0
18
+ beautifulsoup4==4.12.3
19
+ bleach==6.1.0
20
+ certifi==2026.2.25
21
+ cffi==2.0.0
22
+ charset-normalizer==3.4.5
23
+ comm==0.2.2
24
+ cryptography==46.0.5
25
+ datasets==4.7.0
26
+ debugpy==1.8.5
27
+ decorator==5.1.1
28
+ deepdiff==8.6.1
29
+ defusedxml==0.7.1
30
+ dill==0.4.0
31
+ dm-tree==0.1.9
32
+ docstring_parser==0.17.0
33
+ draccus==0.10.0
34
+ einops==0.8.2
35
+ entrypoints==0.4
36
+ equinox==0.13.6
37
+ etils==1.14.0
38
+ executing==2.1.0
39
+ fastjsonschema==2.20.0
40
+ filelock==3.25.1
41
+ flatbuffers==25.12.19
42
+ flax==0.10.2
43
+ fqdn==1.5.1
44
+ frozenlist==1.8.0
45
+ fsspec==2026.2.0
46
+ gcsfs==2026.2.0
47
+ google-api-core==2.30.0
48
+ google-auth==2.49.0
49
+ google-auth-oauthlib==1.3.0
50
+ google-cloud-core==2.5.0
51
+ google-cloud-storage==3.9.0
52
+ google-cloud-storage-control==1.10.0
53
+ google-crc32c==1.8.0
54
+ google-resumable-media==2.8.0
55
+ googleapis-common-protos==1.73.0
56
+ grpc-google-iam-v1==0.14.3
57
+ grpcio==1.78.0
58
+ grpcio-status==1.78.0
59
+ h11==0.14.0
60
+ hf-xet==1.3.2
61
+ httpcore==1.0.5
62
+ httpx==0.27.2
63
+ huggingface_hub==0.36.2
64
+ humanize==4.15.0
65
+ idna==3.11
66
+ ImageIO==2.37.3
67
+ ipykernel==6.29.5
68
+ ipython==8.27.0
69
+ ipython-genutils==0.2.0
70
+ ipywidgets==8.1.5
71
+ isoduration==20.11.0
72
+ jax==0.5.3
73
+ jaxlib==0.5.3
74
+ jaxtyping==0.2.36
75
+ jedi==0.19.1
76
+ Jinja2==3.1.3
77
+ json5==0.9.25
78
+ jsonlines==4.0.0
79
+ jsonpointer==3.0.0
80
+ jsonschema==4.23.0
81
+ jsonschema-specifications==2023.12.1
82
+ jupyter-archive==3.4.0
83
+ jupyter-events==0.10.0
84
+ jupyter-highlight-selected-word==0.2.0
85
+ jupyter-lsp==2.2.5
86
+ jupyter_client==7.4.9
87
+ jupyter_contrib_core==0.4.2
88
+ jupyter_contrib_nbextensions==0.7.0
89
+ jupyter_core==5.7.2
90
+ jupyter_nbextensions_configurator==0.6.4
91
+ jupyter_server==2.14.2
92
+ jupyter_server_terminals==0.5.3
93
+ jupyterlab==4.2.5
94
+ jupyterlab_pygments==0.3.0
95
+ jupyterlab_server==2.27.3
96
+ jupyterlab_widgets==3.0.13
97
+ lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
98
+ lxml==5.3.0
99
+ markdown-it-py==4.0.0
100
+ MarkupSafe==2.1.5
101
+ matplotlib-inline==0.1.7
102
+ mdurl==0.1.2
103
+ mergedeep==1.3.4
104
+ mistune==3.0.2
105
+ ml_collections==1.0.0
106
+ ml_dtypes==0.5.4
107
+ mpmath==1.3.0
108
+ msgpack==1.1.2
109
+ multidict==6.7.1
110
+ multiprocess==0.70.18
111
+ mypy_extensions==1.1.0
112
+ nbclassic==1.1.0
113
+ nbclient==0.10.0
114
+ nbconvert==7.16.4
115
+ nbformat==5.10.4
116
+ nest-asyncio==1.6.0
117
+ networkx==3.2.1
118
+ notebook==6.5.5
119
+ notebook_shim==0.2.4
120
+ numpy==1.26.4
121
+ numpydantic==1.8.0
122
+ nvidia-cublas-cu12==12.4.2.65
123
+ nvidia-cuda-cupti-cu12==12.4.99
124
+ nvidia-cuda-nvrtc-cu12==12.4.99
125
+ nvidia-cuda-runtime-cu12==12.4.99
126
+ nvidia-cudnn-cu12==9.1.0.70
127
+ nvidia-cufft-cu12==11.2.0.44
128
+ nvidia-curand-cu12==10.3.5.119
129
+ nvidia-cusolver-cu12==11.6.0.99
130
+ nvidia-cusparse-cu12==12.3.0.142
131
+ nvidia-nccl-cu12==2.20.5
132
+ nvidia-nvjitlink-cu12==12.4.99
133
+ nvidia-nvtx-cu12==12.4.99
134
+ oauthlib==3.3.1
135
+ omegaconf==2.3.0
136
+ opencv-python==4.11.0.86
137
+ openpi-client==0.1.1
138
+ opt_einsum==3.4.0
139
+ optax==0.2.7
140
+ orbax-checkpoint==0.11.13
141
+ orderly-set==5.5.0
142
+ overrides==7.7.0
143
+ packaging==26.0
144
+ pandas==3.0.1
145
+ pandocfilters==1.5.1
146
+ parso==0.8.4
147
+ pexpect==4.9.0
148
+ pillow==12.1.1
149
+ platformdirs==4.3.6
150
+ prometheus_client==0.21.0
151
+ prompt_toolkit==3.0.47
152
+ propcache==0.4.1
153
+ proto-plus==1.27.1
154
+ protobuf==6.33.5
155
+ psutil==6.0.0
156
+ ptyprocess==0.7.0
157
+ pure_eval==0.2.3
158
+ pyarrow==23.0.1
159
+ pyasn1==0.6.2
160
+ pyasn1_modules==0.4.2
161
+ pycparser==2.22
162
+ pydantic==2.12.5
163
+ pydantic_core==2.41.5
164
+ Pygments==2.19.2
165
+ python-dateutil==2.9.0.post0
166
+ python-json-logger==2.0.7
167
+ PyYAML==6.0.3
168
+ pyyaml-include==1.4.1
169
+ pyzmq==24.0.1
170
+ referencing==0.35.1
171
+ regex==2026.2.28
172
+ requests==2.32.5
173
+ requests-oauthlib==2.0.0
174
+ rfc3339-validator==0.1.4
175
+ rfc3986-validator==0.1.1
176
+ rich==14.3.3
177
+ rpds-py==0.20.0
178
+ rsa==4.9.1
179
+ safetensors==0.7.0
180
+ scipy==1.17.1
181
+ Send2Trash==1.8.3
182
+ sentencepiece==0.2.1
183
+ simplejson==3.20.2
184
+ six==1.17.0
185
+ sniffio==1.3.1
186
+ soupsieve==2.6
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorstore==0.1.81
190
+ termcolor==3.3.0
191
+ terminado==0.18.1
192
+ tinycss2==1.3.0
193
+ tokenizers==0.21.4
194
+ toml==0.10.2
195
+ torch==2.4.1+cu124
196
+ torchaudio==2.4.1+cu124
197
+ torchvision==0.19.1+cu124
198
+ tornado==6.4.1
199
+ tqdm==4.67.3
200
+ tqdm-loggable==0.3
201
+ traitlets==5.14.3
202
+ transformers==4.53.2
203
+ treescope==0.1.10
204
+ triton==3.0.0
205
+ typeguard==4.5.1
206
+ types-python-dateutil==2.9.0.20240906
207
+ typing-inspect==0.9.0
208
+ typing-inspection==0.4.2
209
+ typing_extensions==4.15.0
210
+ tyro==1.0.8
211
+ uri-template==1.3.0
212
+ urllib3==2.6.3
213
+ wadler_lindig==0.1.7
214
+ wcwidth==0.2.13
215
+ webcolors==24.8.0
216
+ webencodings==0.5.1
217
+ websocket-client==1.8.0
218
+ websockets==16.0
219
+ widgetsnbextension==4.0.13
220
+ wrapt==2.1.2
221
+ xxhash==3.6.0
222
+ yarl==1.23.0
223
+ zipp==3.23.0
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Python 3.11.10
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
2
+ Collecting environment information...
3
+ PyTorch version: 2.4.1+cu124
4
+ Is debug build: False
5
+ CUDA used to build PyTorch: 12.4
6
+ ROCM used to build PyTorch: N/A
7
+
8
+ OS: Ubuntu 22.04.5 LTS (x86_64)
9
+ GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
10
+ Clang version: Could not collect
11
+ CMake version: Could not collect
12
+ Libc version: glibc-2.35
13
+
14
+ Python version: 3.11.10 (main, Sep 7 2024, 18:35:41) [GCC 11.4.0] (64-bit runtime)
15
+ Python platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
16
+ Is CUDA available: True
17
+ CUDA runtime version: 12.4.131
18
+ CUDA_MODULE_LOADING set to: LAZY
19
+ GPU models and configuration:
20
+ GPU 0: NVIDIA H100 80GB HBM3
21
+ GPU 1: NVIDIA H100 80GB HBM3
22
+ GPU 2: NVIDIA H100 80GB HBM3
23
+ GPU 3: NVIDIA H100 80GB HBM3
24
+
25
+ Nvidia driver version: 580.126.09
26
+ cuDNN version: Could not collect
27
+ HIP runtime version: N/A
28
+ MIOpen runtime version: N/A
29
+ Is XNNPACK available: True
30
+
31
+ CPU:
32
+ Architecture: x86_64
33
+ CPU op-mode(s): 32-bit, 64-bit
34
+ Address sizes: 46 bits physical, 57 bits virtual
35
+ Byte Order: Little Endian
36
+ CPU(s): 208
37
+ On-line CPU(s) list: 0-207
38
+ Vendor ID: GenuineIntel
39
+ Model name: Intel(R) Xeon(R) Platinum 8470
40
+ CPU family: 6
41
+ Model: 143
42
+ Thread(s) per core: 2
43
+ Core(s) per socket: 52
44
+ Socket(s): 2
45
+ Stepping: 8
46
+ CPU max MHz: 3800.0000
47
+ CPU min MHz: 800.0000
48
+ BogoMIPS: 4000.00
49
+ Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect user_shstk avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts vnmi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities ibpb_exit_to_user
50
+ Virtualization: VT-x
51
+ L1d cache: 4.9 MiB (104 instances)
52
+ L1i cache: 3.3 MiB (104 instances)
53
+ L2 cache: 208 MiB (104 instances)
54
+ L3 cache: 210 MiB (2 instances)
55
+ NUMA node(s): 2
56
+ NUMA node0 CPU(s): 0-51,104-155
57
+ NUMA node1 CPU(s): 52-103,156-207
58
+ Vulnerability Gather data sampling: Not affected
59
+ Vulnerability Itlb multihit: Not affected
60
+ Vulnerability L1tf: Not affected
61
+ Vulnerability Mds: Not affected
62
+ Vulnerability Meltdown: Not affected
63
+ Vulnerability Mmio stale data: Not affected
64
+ Vulnerability Reg file data sampling: Not affected
65
+ Vulnerability Retbleed: Not affected
66
+ Vulnerability Spec rstack overflow: Not affected
67
+ Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
68
+ Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
69
+ Vulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; RSB filling; PBRSB-eIBRS SW sequence; BHI BHI_DIS_S
70
+ Vulnerability Srbds: Not affected
71
+ Vulnerability Tsx async abort: Not affected
72
+ Vulnerability Vmscape: Mitigation; IBPB before exit to userspace
73
+
74
+ Versions of relevant libraries:
75
+ [pip3] mypy_extensions==1.1.0
76
+ [pip3] numpy==1.26.4
77
+ [pip3] numpydantic==1.8.0
78
+ [pip3] torch==2.4.1+cu124
79
+ [pip3] torchaudio==2.4.1+cu124
80
+ [pip3] torchvision==0.19.1+cu124
81
+ [pip3] triton==3.0.0
82
+ [conda] Could not collect
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ regenerated_any=0
2
+ regenerated_split=0
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
2
+ checkpoint_dir: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
3
+ action_expert_mode: split_communicating
4
+ weight_loading_missing_keys: []
5
+ weight_loading_unexpected_keys: []
6
+ identical_branch_suffix_max_abs_diff: 0.00000000
7
+ identical_branch_suffix_match: True
8
+ left_branch_invariance_max_abs_diff: skipped_for_split_communicating
9
+ right_branch_invariance_max_abs_diff: skipped_for_split_communicating
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
2
+ checkpoint_dir: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
3
+ action_expert_mode: split_independent
4
+ weight_loading_missing_keys: []
5
+ weight_loading_unexpected_keys: []
6
+ identical_branch_suffix_max_abs_diff: 0.00000000
7
+ identical_branch_suffix_match: True
8
+ left_branch_invariance_max_abs_diff: 0.00000000
9
+ right_branch_invariance_max_abs_diff: 0.00000000
10
+ left_branch_invariant: True
11
+ right_branch_invariant: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_parallel_pytorch_5k
2
+ action_expert_mode: head_only_parallel
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_parallel_packed_from_single
5
+ load_state_missing_keys_count: 11
6
+ load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'arm_token_fuse.weight', 'arm_token_fuse.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 4
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight']
9
+ input_projection_max_abs_diff: 9.5367431640625e-07
10
+ left_input_projection_max_abs_diff: 0.0
11
+ left_output_projection_max_abs_diff: 0.0
12
+ output_projection_max_abs_diff: 8.344650268554688e-07
13
+ right_input_projection_max_abs_diff: 0.0
14
+ right_output_projection_max_abs_diff: 0.0
15
+ warm_start_exact: False
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
2
+ action_expert_mode: split_communicating
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
5
+ load_state_missing_keys_count: 412
6
+ load_state_missing_keys: ['paligemma_with_expert.cross_arm_comm', 'paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 205
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
9
+ cross_arm_comm_init: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
10
+ left_expert_max_abs_diff: 0.0
11
+ left_input_projection_max_abs_diff: 0.0
12
+ left_output_projection_max_abs_diff: 0.0
13
+ right_expert_max_abs_diff: 0.0
14
+ right_input_projection_max_abs_diff: 0.0
15
+ right_output_projection_max_abs_diff: 0.0
16
+ warm_start_exact: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
2
+ action_expert_mode: split_independent
3
+ single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
4
+ output_path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
5
+ load_state_missing_keys_count: 411
6
+ load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
7
+ load_state_unexpected_keys_count: 205
8
+ load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
9
+ left_expert_max_abs_diff: 0.0
10
+ left_input_projection_max_abs_diff: 0.0
11
+ left_output_projection_max_abs_diff: 0.0
12
+ right_expert_max_abs_diff: 0.0
13
+ right_input_projection_max_abs_diff: 0.0
14
+ right_output_projection_max_abs_diff: 0.0
15
+ warm_start_exact: True
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ canonical_source=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json
2
+ canonical_sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
3
+ shared=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_baseline_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
4
+ head_only_parallel=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_parallel_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
5
+ split_independent=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
6
+ split_communicating=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starting_eval config=pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k checkpoint=/workspace/checkpoints/pi05_base_split_communicating_packed_from_single repo_id=lsnu/twin_dual_push_128_val
2
+ eval_loader batch_size=16 num_batches=1 num_workers=0
3
+ teacher_forced_eval_seed: 123
4
+ sample_eval enabled=True batch_size=16 num_batches=1 num_steps=[16] seed=321
5
+ WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
6
+ weight_loading missing=0 unexpected=0 device=cuda:0
7
+ eval_batch=1 loss=2.309001 left_arm_loss=1.740481 right_arm_loss=2.877522 imbalance=1.137041 batch_time_s=0.6439
8
+ config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
9
+ checkpoint_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
10
+ repo_id_used: lsnu/twin_dual_push_128_val
11
+ num_batches: 1
12
+ mean_val_loss: 2.309001
13
+ std_val_loss: 0.000000
14
+ mean_left_arm_loss: 1.740481
15
+ std_left_arm_loss: 0.000000
16
+ mean_right_arm_loss: 2.877522
17
+ std_right_arm_loss: 0.000000
18
+ mean_left_joint_loss: 1.680031
19
+ std_left_joint_loss: 0.000000
20
+ mean_left_gripper_loss: 2.163631
21
+ std_left_gripper_loss: 0.000000
22
+ mean_right_joint_loss: 2.108088
23
+ std_right_joint_loss: 0.000000
24
+ mean_right_gripper_loss: 8.263555
25
+ std_right_gripper_loss: 0.000000
26
+ mean_left_right_imbalance: 1.137041
27
+ std_left_right_imbalance: 0.000000
28
+ per_batch_timing_seconds: mean=0.6439 std=0.0000 min=0.6439 max=0.6439
29
+ active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
30
+ masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]
31
+ weight_loading_missing_keys: []
32
+ weight_loading_unexpected_keys: []
33
+ WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
34
+ sample_eval_batch=1 num_steps=16 masked_mae=0.611553 left_arm_mae=0.679956 right_arm_mae=0.543150 imbalance_mae=0.136806 batch_time_s=3.3573
35
+ sample_eval_num_steps_16_num_batches: 1
36
+ sample_eval_num_steps_16_mean_masked_mae: 0.611553
37
+ sample_eval_num_steps_16_std_masked_mae: 0.000000
38
+ sample_eval_num_steps_16_mean_left_arm_mae: 0.679956
39
+ sample_eval_num_steps_16_std_left_arm_mae: 0.000000
40
+ sample_eval_num_steps_16_mean_right_arm_mae: 0.543150
41
+ sample_eval_num_steps_16_std_right_arm_mae: 0.000000
42
+ sample_eval_num_steps_16_mean_left_joint_mae: 0.648674
43
+ sample_eval_num_steps_16_std_left_joint_mae: 0.000000
44
+ sample_eval_num_steps_16_mean_left_gripper_mae: 0.898926
45
+ sample_eval_num_steps_16_std_left_gripper_mae: 0.000000
46
+ sample_eval_num_steps_16_mean_right_joint_mae: 0.478297
47
+ sample_eval_num_steps_16_std_right_joint_mae: 0.000000
48
+ sample_eval_num_steps_16_mean_right_gripper_mae: 0.997122
49
+ sample_eval_num_steps_16_std_right_gripper_mae: 0.000000
50
+ sample_eval_num_steps_16_mean_left_right_imbalance_mae: 0.136806
51
+ sample_eval_num_steps_16_std_left_right_imbalance_mae: 0.000000
52
+ sample_eval_num_steps_16_per_batch_timing_seconds: mean=3.3573 std=0.0000 min=3.3573 max=3.3573
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 2026-03-11 17:33:48 UTC
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.4.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.6.0
8
+ argon2-cffi==23.1.0
9
+ argon2-cffi-bindings==21.2.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ async-lru==2.0.4
13
+ attrs==25.4.0
14
+ augmax==0.4.1
15
+ av==16.1.0
16
+ babel==2.16.0
17
+ beartype==0.19.0
18
+ beautifulsoup4==4.12.3
19
+ bleach==6.1.0
20
+ certifi==2026.2.25
21
+ cffi==2.0.0
22
+ charset-normalizer==3.4.5
23
+ comm==0.2.2
24
+ cryptography==46.0.5
25
+ datasets==4.7.0
26
+ debugpy==1.8.5
27
+ decorator==5.1.1
28
+ deepdiff==8.6.1
29
+ defusedxml==0.7.1
30
+ dill==0.4.0
31
+ dm-tree==0.1.9
32
+ docstring_parser==0.17.0
33
+ draccus==0.10.0
34
+ einops==0.8.2
35
+ entrypoints==0.4
36
+ equinox==0.13.6
37
+ etils==1.14.0
38
+ executing==2.1.0
39
+ fastjsonschema==2.20.0
40
+ filelock==3.25.1
41
+ flatbuffers==25.12.19
42
+ flax==0.10.2
43
+ fqdn==1.5.1
44
+ frozenlist==1.8.0
45
+ fsspec==2026.2.0
46
+ gcsfs==2026.2.0
47
+ google-api-core==2.30.0
48
+ google-auth==2.49.0
49
+ google-auth-oauthlib==1.3.0
50
+ google-cloud-core==2.5.0
51
+ google-cloud-storage==3.9.0
52
+ google-cloud-storage-control==1.10.0
53
+ google-crc32c==1.8.0
54
+ google-resumable-media==2.8.0
55
+ googleapis-common-protos==1.73.0
56
+ grpc-google-iam-v1==0.14.3
57
+ grpcio==1.78.0
58
+ grpcio-status==1.78.0
59
+ h11==0.14.0
60
+ hf-xet==1.3.2
61
+ httpcore==1.0.5
62
+ httpx==0.27.2
63
+ huggingface_hub==0.36.2
64
+ humanize==4.15.0
65
+ idna==3.11
66
+ ImageIO==2.37.3
67
+ ipykernel==6.29.5
68
+ ipython==8.27.0
69
+ ipython-genutils==0.2.0
70
+ ipywidgets==8.1.5
71
+ isoduration==20.11.0
72
+ jax==0.5.3
73
+ jaxlib==0.5.3
74
+ jaxtyping==0.2.36
75
+ jedi==0.19.1
76
+ Jinja2==3.1.3
77
+ json5==0.9.25
78
+ jsonlines==4.0.0
79
+ jsonpointer==3.0.0
80
+ jsonschema==4.23.0
81
+ jsonschema-specifications==2023.12.1
82
+ jupyter-archive==3.4.0
83
+ jupyter-events==0.10.0
84
+ jupyter-highlight-selected-word==0.2.0
85
+ jupyter-lsp==2.2.5
86
+ jupyter_client==7.4.9
87
+ jupyter_contrib_core==0.4.2
88
+ jupyter_contrib_nbextensions==0.7.0
89
+ jupyter_core==5.7.2
90
+ jupyter_nbextensions_configurator==0.6.4
91
+ jupyter_server==2.14.2
92
+ jupyter_server_terminals==0.5.3
93
+ jupyterlab==4.2.5
94
+ jupyterlab_pygments==0.3.0
95
+ jupyterlab_server==2.27.3
96
+ jupyterlab_widgets==3.0.13
97
+ lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
98
+ lxml==5.3.0
99
+ markdown-it-py==4.0.0
100
+ MarkupSafe==2.1.5
101
+ matplotlib-inline==0.1.7
102
+ mdurl==0.1.2
103
+ mergedeep==1.3.4
104
+ mistune==3.0.2
105
+ ml_collections==1.0.0
106
+ ml_dtypes==0.5.4
107
+ mpmath==1.3.0
108
+ msgpack==1.1.2
109
+ multidict==6.7.1
110
+ multiprocess==0.70.18
111
+ mypy_extensions==1.1.0
112
+ nbclassic==1.1.0
113
+ nbclient==0.10.0
114
+ nbconvert==7.16.4
115
+ nbformat==5.10.4
116
+ nest-asyncio==1.6.0
117
+ networkx==3.2.1
118
+ notebook==6.5.5
119
+ notebook_shim==0.2.4
120
+ numpy==1.26.4
121
+ numpydantic==1.8.0
122
+ nvidia-cublas-cu12==12.4.2.65
123
+ nvidia-cuda-cupti-cu12==12.4.99
124
+ nvidia-cuda-nvrtc-cu12==12.4.99
125
+ nvidia-cuda-runtime-cu12==12.4.99
126
+ nvidia-cudnn-cu12==9.1.0.70
127
+ nvidia-cufft-cu12==11.2.0.44
128
+ nvidia-curand-cu12==10.3.5.119
129
+ nvidia-cusolver-cu12==11.6.0.99
130
+ nvidia-cusparse-cu12==12.3.0.142
131
+ nvidia-nccl-cu12==2.20.5
132
+ nvidia-nvjitlink-cu12==12.4.99
133
+ nvidia-nvtx-cu12==12.4.99
134
+ oauthlib==3.3.1
135
+ omegaconf==2.3.0
136
+ opencv-python==4.11.0.86
137
+ openpi-client==0.1.1
138
+ opt_einsum==3.4.0
139
+ optax==0.2.7
140
+ orbax-checkpoint==0.11.13
141
+ orderly-set==5.5.0
142
+ overrides==7.7.0
143
+ packaging==26.0
144
+ pandas==3.0.1
145
+ pandocfilters==1.5.1
146
+ parso==0.8.4
147
+ pexpect==4.9.0
148
+ pillow==12.1.1
149
+ platformdirs==4.3.6
150
+ prometheus_client==0.21.0
151
+ prompt_toolkit==3.0.47
152
+ propcache==0.4.1
153
+ proto-plus==1.27.1
154
+ protobuf==6.33.5
155
+ psutil==6.0.0
156
+ ptyprocess==0.7.0
157
+ pure_eval==0.2.3
158
+ pyarrow==23.0.1
159
+ pyasn1==0.6.2
160
+ pyasn1_modules==0.4.2
161
+ pycparser==2.22
162
+ pydantic==2.12.5
163
+ pydantic_core==2.41.5
164
+ Pygments==2.19.2
165
+ python-dateutil==2.9.0.post0
166
+ python-json-logger==2.0.7
167
+ PyYAML==6.0.3
168
+ pyyaml-include==1.4.1
169
+ pyzmq==24.0.1
170
+ referencing==0.35.1
171
+ regex==2026.2.28
172
+ requests==2.32.5
173
+ requests-oauthlib==2.0.0
174
+ rfc3339-validator==0.1.4
175
+ rfc3986-validator==0.1.1
176
+ rich==14.3.3
177
+ rpds-py==0.20.0
178
+ rsa==4.9.1
179
+ safetensors==0.7.0
180
+ scipy==1.17.1
181
+ Send2Trash==1.8.3
182
+ sentencepiece==0.2.1
183
+ simplejson==3.20.2
184
+ six==1.17.0
185
+ sniffio==1.3.1
186
+ soupsieve==2.6
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorstore==0.1.81
190
+ termcolor==3.3.0
191
+ terminado==0.18.1
192
+ tinycss2==1.3.0
193
+ tokenizers==0.21.4
194
+ toml==0.10.2
195
+ torch==2.4.1+cu124
196
+ torchaudio==2.4.1+cu124
197
+ torchvision==0.19.1+cu124
198
+ tornado==6.4.1
199
+ tqdm==4.67.3
200
+ tqdm-loggable==0.3
201
+ traitlets==5.14.3
202
+ transformers==4.53.2
203
+ treescope==0.1.10
204
+ triton==3.0.0
205
+ typeguard==4.5.1
206
+ types-python-dateutil==2.9.0.20240906
207
+ typing-inspect==0.9.0
208
+ typing-inspection==0.4.2
209
+ typing_extensions==4.15.0
210
+ tyro==1.0.8
211
+ uri-template==1.3.0
212
+ urllib3==2.6.3
213
+ wadler_lindig==0.1.7
214
+ wcwidth==0.2.13
215
+ webcolors==24.8.0
216
+ webencodings==0.5.1
217
+ websocket-client==1.8.0
218
+ websockets==16.0
219
+ widgetsnbextension==4.0.13
220
+ wrapt==2.1.2
221
+ xxhash==3.6.0
222
+ yarl==1.23.0
223
+ zipp==3.23.0
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Python 3.11.10
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ <frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Linux f87904697f84 6.8.0-90-generic #91-Ubuntu SMP PREEMPT_DYNAMIC Tue Nov 18 14:14:30 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9fa98e4f1c6159fd9b956a11323f5990b8d92aae3553eb4785ee7341c79a680
3
+ size 3438041490
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d726e07e1c039c30cab249de7f558f75d04a5e3c7151fb9e08ab7b9c804d7342
3
+ size 1850670584
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f3c7cbab597ad818c2570f4920d8a9b7d396053543ee538a8b97b4ba623bfe5
3
+ size 3438040655
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c68994d9d8e008f11db503c715bbbf739ca7b9f5144ac90c4822f90a55d003f
3
+ size 3438040655
openpi/run_logs/split_independent_real_smoke20.log ADDED
File without changes
openpi/run_logs/split_independent_real_smoke3.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:24:48.871 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3 (19525:train_pytorch.py:533)
2
+ 19:24:48.874 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (19525:train_pytorch.py:552)
3
+ 19:24:48.988 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (19525:config.py:234)
4
+ 19:24:48.990 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x70f9ab2c5d10>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (19525:data_loader.py:284)
29
+ 19:24:57.449 [I] JAX version 0.5.3 available. (19525:config.py:125)
30
+ 19:25:32.845 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (19525:data_loader.py:148)
31
+ 19:25:33.031 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (19525:video_utils.py:36)
32
+ 19:26:39.374 [I] local_batch_size: 1 (19525:data_loader.py:365)
33
+ 19:28:30.580 [I] Enabled gradient checkpointing for PI0Pytorch model (19525:pi0_pytorch.py:138)
34
+ 19:28:30.582 [I] Enabled gradient checkpointing for memory optimization (19525:train_pytorch.py:624)
35
+ 19:28:30.583 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (19525:train_pytorch.py:493)
36
+ 19:28:30.583 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:653)
37
+ 19:28:34.495 [I] Weight loading missing key count: 0 (19525:train_pytorch.py:657)
38
+ 19:28:34.495 [I] Weight loading missing keys: set() (19525:train_pytorch.py:658)
39
+ 19:28:34.496 [I] Weight loading unexpected key count: 0 (19525:train_pytorch.py:659)
40
+ 19:28:34.496 [I] Weight loading unexpected keys: [] (19525:train_pytorch.py:660)
41
+ 19:28:34.497 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:661)
42
+ 19:28:34.501 [I] Running on: 963c158043aa | world_size=1 (19525:train_pytorch.py:701)
43
+ 19:28:34.501 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (19525:train_pytorch.py:702)
44
+ 19:28:34.502 [I] Memory optimizations: gradient_checkpointing=True (19525:train_pytorch.py:705)
45
+ 19:28:34.502 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (19525:train_pytorch.py:706)
46
+ 19:28:34.502 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (19525:train_pytorch.py:707)
47
+ 19:28:34.503 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (19525:train_pytorch.py:710)
48
+ 19:28:34.503 [I] EMA is not supported for PyTorch training (19525:train_pytorch.py:713)
49
+ 19:28:34.504 [I] Training precision: float32 (19525:train_pytorch.py:714)
50
+ 19:28:34.516 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (19525:train_pytorch.py:308)
51
+ 19:28:34.516 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (19525:train_pytorch.py:309)
52
+ 19:28:34.517 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (19525:train_pytorch.py:310)
53
+ 19:28:34.518 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (19525:train_pytorch.py:311)
54
+ 19:28:34.518 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:312)
55
+ 19:28:34.519 [I] Model type: split_independent (19525:train_pytorch.py:313)
56
+ 19:28:34.519 [I] Packed transforms active: True (19525:train_pytorch.py:314)
57
+ 19:28:34.519 [I] World size: 1 (19525:train_pytorch.py:315)
58
+ 19:28:34.520 [I] Batch size: local=1, global=1 (19525:train_pytorch.py:316)
59
+ 19:28:34.520 [I] num_workers: 0 (19525:train_pytorch.py:317)
60
+ 19:28:34.521 [I] Precision: float32 (19525:train_pytorch.py:318)
61
+ 19:28:34.521 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (19525:train_pytorch.py:319)
62
+ 19:28:34.522 [I] Save/log intervals: save_interval=3, log_interval=1 (19525:train_pytorch.py:326)
63
+ 19:28:34.522 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (19525:train_pytorch.py:327)
64
+ 19:28:34.522 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (19525:train_pytorch.py:328)
65
+ 19:28:34.522 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (19525:train_pytorch.py:329)
66
+ 19:28:34.523 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (19525:train_pytorch.py:722)
67
+
68
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 939, in <module>
69
+ main()
70
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 935, in main
71
+ train_loop(config)
72
+ File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 747, in train_loop
73
+ for observation, actions in loader:
74
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 596, in __iter__
75
+ for batch in self._data_loader:
76
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 510, in __iter__
77
+ batch = next(data_iter)
78
+ ^^^^^^^^^^^^^^^
79
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 630, in __next__
80
+ data = self._next_data()
81
+ ^^^^^^^^^^^^^^^^^
82
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 673, in _next_data
83
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
84
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
86
+ data = [self.dataset[idx] for idx in possibly_batched_index]
87
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
88
+ File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
89
+ data = [self.dataset[idx] for idx in possibly_batched_index]
90
+ ~~~~~~~~~~~~^^^^^
91
+ File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 67, in __getitem__
92
+ return self._transform(self._dataset[index])
93
+ ~~~~~~~~~~~~~^^^^^^^
94
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 742, in __getitem__
95
+ query_result = self._query_hf_dataset(query_indices)
96
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
97
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 707, in _query_hf_dataset
98
+ return {
99
+ ^
100
+ File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 708, in <dictcomp>
101
+ key: torch.stack(self.hf_dataset.select(q_idx)[key])
102
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
103
+ TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Column
104
+
openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py ADDED
@@ -0,0 +1,913 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import ast
7
+ import csv
8
+ import dataclasses
9
+ import json
10
+ import math
11
+ import pathlib
12
+ import re
13
+ import statistics
14
+ from collections import defaultdict
15
+ from typing import Any
16
+
17
+
18
+ OPENPI_ROOT = pathlib.Path(__file__).resolve().parents[1]
19
+ STEP_ORDER = (0, 100, 500, 2000)
20
+ SAMPLE_STEPS = (1, 2, 4, 8, 16)
21
+ BASE_TRAIN_COLUMNS = [
22
+ "model_variant",
23
+ "config_name",
24
+ "exp_name",
25
+ "step",
26
+ "loss",
27
+ "smoothed_loss",
28
+ "lr",
29
+ "grad_norm",
30
+ "step_time_s",
31
+ "data_time_s",
32
+ "items_per_second",
33
+ "eta_seconds",
34
+ "max_cuda_memory_gb",
35
+ ]
36
+ TIMING_RE = re.compile(r"mean=([0-9.]+)\s+std=([0-9.]+)\s+min=([0-9.]+)\s+max=([0-9.]+)")
37
+ KV_PAIR_RE = re.compile(r"([A-Za-z0-9_./+-]+)=([^\s]+)")
38
+ SOURCE_SUFFIX_RE = re.compile(r"\s+\(\d+:[^)]+\)$")
39
+ TIMESTAMPED_INFO_RE = re.compile(r"^\d{2}:\d{2}:\d{2}\.\d{3} \[I\] (.*)$")
40
+ NORMALIZE_RE = re.compile(r"[^a-z0-9]+")
41
+ ERROR_MARKERS = ("Traceback", "FloatingPointError", "CUDA out of memory", "Non-finite", "RuntimeError:")
42
+
43
+
44
+ @dataclasses.dataclass(frozen=True)
45
+ class ModelSpec:
46
+ key: str
47
+ model_variant: str
48
+ config_name: str
49
+ exp_name: str
50
+ step0_checkpoint: str
51
+
52
+
53
+ MODEL_SPECS = (
54
+ ModelSpec(
55
+ key="shared",
56
+ model_variant="shared",
57
+ config_name="pi05_twin_dual_push_128_packed_baseline_pytorch_5k",
58
+ exp_name="dual_push_128_stepcmp_shared_2k",
59
+ step0_checkpoint="/workspace/checkpoints/pi05_base_single_pytorch",
60
+ ),
61
+ ModelSpec(
62
+ key="head_only",
63
+ model_variant="head_only_parallel",
64
+ config_name="pi05_twin_dual_push_128_packed_parallel_pytorch_5k",
65
+ exp_name="dual_push_128_stepcmp_head_only_2k",
66
+ step0_checkpoint="/workspace/checkpoints/pi05_base_parallel_packed_from_single",
67
+ ),
68
+ ModelSpec(
69
+ key="split_ind",
70
+ model_variant="split_independent",
71
+ config_name="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k",
72
+ exp_name="dual_push_128_stepcmp_split_ind_2k",
73
+ step0_checkpoint="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
74
+ ),
75
+ ModelSpec(
76
+ key="split_comm",
77
+ model_variant="split_communicating",
78
+ config_name="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k",
79
+ exp_name="dual_push_128_stepcmp_split_comm_2k",
80
+ step0_checkpoint="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
81
+ ),
82
+ )
83
+
84
+ STARTUP_LABEL_MAP = {
85
+ "resolved_config_name": "config_name",
86
+ "dataset_repo_id": "dataset_repo_id",
87
+ "norm_stats_file_path": "norm_stats_file",
88
+ "norm_stats_summary": "norm_stats_summary",
89
+ "checkpoint_source_path": "checkpoint_source",
90
+ "model_type": "model_type",
91
+ "packed_transforms_active": "packed_transforms",
92
+ "world_size": "world_size",
93
+ "batch_size": "batch_size",
94
+ "num_workers": "num_workers",
95
+ "training_precision": "precision",
96
+ "lr_schedule": "lr_schedule",
97
+ "save_log_intervals": "save_log_intervals",
98
+ "action_loss_mask": "action_loss_mask",
99
+ "active_action_loss_dims": "active_mask_dims",
100
+ "masked_padded_dims": "masked_dims",
101
+ "gradient_bucket_diagnostics": "gradient_buckets",
102
+ "weight_loading_missing_key_count": "weight_missing_count",
103
+ "weight_loading_missing_keys": "weight_missing_keys",
104
+ "weight_loading_unexpected_key_count": "weight_unexpected_count",
105
+ "weight_loading_unexpected_keys": "weight_unexpected_keys",
106
+ }
107
+
108
+
109
+ def parse_args() -> argparse.Namespace:
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument("--artifact_root", required=True)
112
+ parser.add_argument("--prior_metrics_root", default="")
113
+ return parser.parse_args()
114
+
115
+
116
+ def normalize_label(label: str) -> str:
117
+ return NORMALIZE_RE.sub("_", label.lower()).strip("_")
118
+
119
+
120
+ def strip_source_suffix(text: str) -> str:
121
+ return SOURCE_SUFFIX_RE.sub("", text.rstrip())
122
+
123
+
124
+ def extract_info_body(line: str) -> str | None:
125
+ match = TIMESTAMPED_INFO_RE.match(strip_source_suffix(line))
126
+ return match.group(1) if match else None
127
+
128
+
129
+ def natural_key(text: str) -> list[Any]:
130
+ parts = re.split(r"(\d+)", text)
131
+ key: list[Any] = []
132
+ for part in parts:
133
+ if not part:
134
+ continue
135
+ key.append(int(part) if part.isdigit() else part)
136
+ return key
137
+
138
+
139
+ def format_float(value: float | None, digits: int = 6) -> str:
140
+ if value is None or math.isnan(value):
141
+ return "n/a"
142
+ return f"{value:.{digits}f}"
143
+
144
+
145
+ def format_delta(value: float | None, digits: int = 6) -> str:
146
+ if value is None or math.isnan(value):
147
+ return "n/a"
148
+ return f"{value:+.{digits}f}"
149
+
150
+
151
+ def resolve_checkpoint_path(raw_path: str) -> str:
152
+ path = pathlib.Path(raw_path)
153
+ if path.is_absolute():
154
+ return str(path.resolve())
155
+ return str((OPENPI_ROOT / path).resolve())
156
+
157
+
158
+ def parse_float(text: str) -> float:
159
+ return float(text)
160
+
161
+
162
+ def parse_optional_float(text: str | None) -> float | None:
163
+ if text is None or text == "":
164
+ return None
165
+ return float(text)
166
+
167
+
168
+ def parse_timing(text: str) -> dict[str, float]:
169
+ match = TIMING_RE.search(text)
170
+ if not match:
171
+ raise ValueError(f"Unable to parse timing summary: {text!r}")
172
+ return {
173
+ "mean": float(match.group(1)),
174
+ "std": float(match.group(2)),
175
+ "min": float(match.group(3)),
176
+ "max": float(match.group(4)),
177
+ }
178
+
179
+
180
+ def parse_literal_count(text: str) -> int:
181
+ text = text.strip()
182
+ if text == "set()":
183
+ return 0
184
+ value = ast.literal_eval(text)
185
+ return len(value)
186
+
187
+
188
+ def read_text_lines(path: pathlib.Path) -> list[str]:
189
+ return path.read_text(encoding="utf-8", errors="replace").splitlines()
190
+
191
+
192
+ def detect_errors(lines: list[str]) -> list[str]:
193
+ errors = []
194
+ for line in lines:
195
+ if any(marker in line for marker in ERROR_MARKERS):
196
+ errors.append(line.strip())
197
+ return errors
198
+
199
+
200
+ def parse_train_log(path: pathlib.Path, spec: ModelSpec) -> tuple[dict[str, str], list[dict[str, Any]], list[str]]:
201
+ lines = read_text_lines(path)
202
+ startup: dict[str, str] = {}
203
+ step_rows: list[dict[str, Any]] = []
204
+ errors = detect_errors(lines)
205
+
206
+ for line in lines:
207
+ body = extract_info_body(line)
208
+ if not body:
209
+ continue
210
+ if body.startswith("step="):
211
+ metrics = {key: value for key, value in KV_PAIR_RE.findall(body)}
212
+ row = {
213
+ "model_variant": spec.model_variant,
214
+ "config_name": spec.config_name,
215
+ "exp_name": spec.exp_name,
216
+ "step": int(metrics["step"]),
217
+ "loss": parse_float(metrics["loss"]),
218
+ "smoothed_loss": parse_float(metrics["smoothed_loss"]),
219
+ "lr": parse_float(metrics["lr"]),
220
+ "grad_norm": parse_float(metrics["grad_norm"]),
221
+ "step_time_s": parse_float(metrics["step_time"].rstrip("s")),
222
+ "data_time_s": parse_float(metrics["data_time"].rstrip("s")),
223
+ "items_per_second": parse_float(metrics["it/s"]),
224
+ "eta_seconds": parse_float(next(value for key, value in metrics.items() if key.startswith("eta_to_")).rstrip("s")),
225
+ "max_cuda_memory_gb": parse_float(metrics["max_cuda_memory"].rstrip("GB")),
226
+ }
227
+ for key, value in metrics.items():
228
+ if key in {"step", "loss", "smoothed_loss", "lr", "grad_norm", "step_time", "data_time", "it/s", "max_cuda_memory"}:
229
+ continue
230
+ if key.startswith("eta_to_"):
231
+ continue
232
+ row[key] = parse_float(value)
233
+ step_rows.append(row)
234
+ continue
235
+
236
+ if ": " not in body:
237
+ continue
238
+ label, value = body.split(": ", 1)
239
+ startup_key = STARTUP_LABEL_MAP.get(normalize_label(label))
240
+ if startup_key:
241
+ startup[startup_key] = value.strip()
242
+
243
+ return startup, step_rows, errors
244
+
245
+
246
+ def parse_eval_log(path: pathlib.Path) -> tuple[dict[str, str], int | None, list[str]]:
247
+ lines = read_text_lines(path)
248
+ metrics: dict[str, str] = {}
249
+ sample_batch_size: int | None = None
250
+ errors = detect_errors(lines)
251
+
252
+ for line in lines:
253
+ stripped = strip_source_suffix(line.strip())
254
+ if stripped.startswith("sample_eval enabled="):
255
+ kv = {key: value for key, value in KV_PAIR_RE.findall(stripped)}
256
+ if "batch_size" in kv:
257
+ sample_batch_size = int(kv["batch_size"])
258
+ if ": " not in stripped:
259
+ continue
260
+ key, value = stripped.split(": ", 1)
261
+ metrics[key.strip()] = value.strip()
262
+
263
+ return metrics, sample_batch_size, errors
264
+
265
+
266
+ def write_csv(path: pathlib.Path, rows: list[dict[str, Any]], columns: list[str]) -> None:
267
+ with path.open("w", encoding="utf-8", newline="") as handle:
268
+ writer = csv.DictWriter(handle, fieldnames=columns)
269
+ writer.writeheader()
270
+ for row in rows:
271
+ writer.writerow({column: row.get(column, "") for column in columns})
272
+
273
+
274
+ def build_teacher_row(spec: ModelSpec, step: int, metrics: dict[str, str]) -> dict[str, Any]:
275
+ timing = parse_timing(metrics["per_batch_timing_seconds"])
276
+ return {
277
+ "model_variant": spec.model_variant,
278
+ "config_name": spec.config_name,
279
+ "exp_name": spec.exp_name,
280
+ "checkpoint_step": step,
281
+ "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
282
+ "repo_id": metrics["repo_id_used"],
283
+ "num_batches": int(metrics["num_batches"]),
284
+ "mean_val_loss": parse_float(metrics["mean_val_loss"]),
285
+ "std_val_loss": parse_float(metrics["std_val_loss"]),
286
+ "mean_left_arm_loss": parse_float(metrics["mean_left_arm_loss"]),
287
+ "std_left_arm_loss": parse_float(metrics["std_left_arm_loss"]),
288
+ "mean_right_arm_loss": parse_float(metrics["mean_right_arm_loss"]),
289
+ "std_right_arm_loss": parse_float(metrics["std_right_arm_loss"]),
290
+ "mean_left_joint_loss": parse_float(metrics["mean_left_joint_loss"]),
291
+ "std_left_joint_loss": parse_float(metrics["std_left_joint_loss"]),
292
+ "mean_left_gripper_loss": parse_float(metrics["mean_left_gripper_loss"]),
293
+ "std_left_gripper_loss": parse_float(metrics["std_left_gripper_loss"]),
294
+ "mean_right_joint_loss": parse_float(metrics["mean_right_joint_loss"]),
295
+ "std_right_joint_loss": parse_float(metrics["std_right_joint_loss"]),
296
+ "mean_right_gripper_loss": parse_float(metrics["mean_right_gripper_loss"]),
297
+ "std_right_gripper_loss": parse_float(metrics["std_right_gripper_loss"]),
298
+ "mean_left_right_imbalance": parse_float(metrics["mean_left_right_imbalance"]),
299
+ "std_left_right_imbalance": parse_float(metrics["std_left_right_imbalance"]),
300
+ "per_batch_time_mean_s": timing["mean"],
301
+ "per_batch_time_std_s": timing["std"],
302
+ "per_batch_time_min_s": timing["min"],
303
+ "per_batch_time_max_s": timing["max"],
304
+ "weight_loading_missing_count": parse_literal_count(metrics["weight_loading_missing_keys"]),
305
+ "weight_loading_unexpected_count": parse_literal_count(metrics["weight_loading_unexpected_keys"]),
306
+ }
307
+
308
+
309
+ def build_sample_rows(spec: ModelSpec, step: int, metrics: dict[str, str]) -> list[dict[str, Any]]:
310
+ rows = []
311
+ for sample_steps in SAMPLE_STEPS:
312
+ prefix = f"sample_eval_num_steps_{sample_steps}_"
313
+ timing = parse_timing(metrics[f"{prefix}per_batch_timing_seconds"])
314
+ rows.append(
315
+ {
316
+ "model_variant": spec.model_variant,
317
+ "config_name": spec.config_name,
318
+ "exp_name": spec.exp_name,
319
+ "checkpoint_step": step,
320
+ "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
321
+ "repo_id": metrics["repo_id_used"],
322
+ "sample_num_steps": sample_steps,
323
+ "sample_num_batches": int(metrics[f"{prefix}num_batches"]),
324
+ "mean_masked_mae": parse_float(metrics[f"{prefix}mean_masked_mae"]),
325
+ "std_masked_mae": parse_float(metrics[f"{prefix}std_masked_mae"]),
326
+ "mean_left_arm_mae": parse_float(metrics[f"{prefix}mean_left_arm_mae"]),
327
+ "std_left_arm_mae": parse_float(metrics[f"{prefix}std_left_arm_mae"]),
328
+ "mean_right_arm_mae": parse_float(metrics[f"{prefix}mean_right_arm_mae"]),
329
+ "std_right_arm_mae": parse_float(metrics[f"{prefix}std_right_arm_mae"]),
330
+ "mean_left_joint_mae": parse_float(metrics[f"{prefix}mean_left_joint_mae"]),
331
+ "std_left_joint_mae": parse_float(metrics[f"{prefix}std_left_joint_mae"]),
332
+ "mean_left_gripper_mae": parse_float(metrics[f"{prefix}mean_left_gripper_mae"]),
333
+ "std_left_gripper_mae": parse_float(metrics[f"{prefix}std_left_gripper_mae"]),
334
+ "mean_right_joint_mae": parse_float(metrics[f"{prefix}mean_right_joint_mae"]),
335
+ "std_right_joint_mae": parse_float(metrics[f"{prefix}std_right_joint_mae"]),
336
+ "mean_right_gripper_mae": parse_float(metrics[f"{prefix}mean_right_gripper_mae"]),
337
+ "std_right_gripper_mae": parse_float(metrics[f"{prefix}std_right_gripper_mae"]),
338
+ "mean_left_right_imbalance_mae": parse_float(metrics[f"{prefix}mean_left_right_imbalance_mae"]),
339
+ "std_left_right_imbalance_mae": parse_float(metrics[f"{prefix}std_left_right_imbalance_mae"]),
340
+ "per_batch_time_mean_s": timing["mean"],
341
+ "per_batch_time_std_s": timing["std"],
342
+ "per_batch_time_min_s": timing["min"],
343
+ "per_batch_time_max_s": timing["max"],
344
+ }
345
+ )
346
+ return rows
347
+
348
+
349
+ def row_index(rows: list[dict[str, Any]], *keys: str) -> dict[tuple[Any, ...], dict[str, Any]]:
350
+ return {tuple(row[key] for key in keys): row for row in rows}
351
+
352
+
353
+ def average(values: list[float]) -> float | None:
354
+ return statistics.fmean(values) if values else None
355
+
356
+
357
+ def summarise_stability(train_rows: list[dict[str, Any]]) -> dict[str, Any]:
358
+ by_variant: dict[str, list[dict[str, Any]]] = defaultdict(list)
359
+ for row in train_rows:
360
+ by_variant[row["model_variant"]].append(row)
361
+
362
+ summary: dict[str, Any] = {}
363
+ for variant, rows in by_variant.items():
364
+ grad_columns = [column for column in rows[0] if column.startswith("grad_")]
365
+ dead_columns = []
366
+ for column in grad_columns:
367
+ if max(abs(float(row.get(column, 0.0) or 0.0)) for row in rows) == 0.0:
368
+ dead_columns.append(column)
369
+ summary[variant] = {
370
+ "max_cuda_memory_gb": max(row["max_cuda_memory_gb"] for row in rows),
371
+ "dead_gradient_columns": dead_columns,
372
+ }
373
+
374
+ split_comm_rows = by_variant.get("split_communicating", [])
375
+ gate_columns = sorted(
376
+ {column for row in split_comm_rows for column in row if column.startswith("cross_arm_comm_gate_layer_")},
377
+ key=natural_key,
378
+ )
379
+ attn_columns = sorted(
380
+ {column for row in split_comm_rows for column in row if column.startswith("cross_arm_attention_mass_layer_")},
381
+ key=natural_key,
382
+ )
383
+ if split_comm_rows:
384
+ gate_values = [abs(float(row[column])) for row in split_comm_rows for column in gate_columns if column in row]
385
+ attn_values = [float(row[column]) for row in split_comm_rows for column in attn_columns if column in row]
386
+ grad_comm_values = [float(row.get("grad_cross_arm_comm", 0.0)) for row in split_comm_rows]
387
+ summary["split_communicating"]["communication"] = {
388
+ "gate_abs_max": max(gate_values) if gate_values else 0.0,
389
+ "gate_abs_mean": average(gate_values) or 0.0,
390
+ "attention_mass_mean": average(attn_values) or 0.0,
391
+ "attention_mass_max": max(attn_values) if attn_values else 0.0,
392
+ "grad_cross_arm_comm_mean": average(grad_comm_values) or 0.0,
393
+ "grad_cross_arm_comm_max": max(grad_comm_values) if grad_comm_values else 0.0,
394
+ "active": bool(attn_values and max(attn_values) > 0.0 and max(grad_comm_values, default=0.0) > 0.0),
395
+ }
396
+ return summary
397
+
398
+
399
+ def load_prior_metrics(prior_metrics_root: pathlib.Path) -> dict[str, Any]:
400
+ result: dict[str, Any] = {}
401
+ if not prior_metrics_root.exists():
402
+ return result
403
+
404
+ teacher_path = prior_metrics_root / "teacher_forced_eval_table.csv"
405
+ sample_path = prior_metrics_root / "sample_eval_table.csv"
406
+
407
+ if teacher_path.exists():
408
+ with teacher_path.open(encoding="utf-8", newline="") as handle:
409
+ rows = list(csv.DictReader(handle))
410
+ teacher_2000 = {row["model"]: row for row in rows if int(row["checkpoint_step"]) == 2000}
411
+ result["teacher_2000"] = {
412
+ "baseline": parse_optional_float(teacher_2000.get("baseline", {}).get("mean_val_loss")),
413
+ "parallel": parse_optional_float(teacher_2000.get("parallel", {}).get("mean_val_loss")),
414
+ }
415
+
416
+ if sample_path.exists():
417
+ with sample_path.open(encoding="utf-8", newline="") as handle:
418
+ rows = list(csv.DictReader(handle))
419
+ sample_2000_step4 = {
420
+ row["model"]: row
421
+ for row in rows
422
+ if int(row["checkpoint_step"]) == 2000 and int(row["num_steps"]) == 4
423
+ }
424
+ result["sample_2000_step4"] = {
425
+ "baseline": parse_optional_float(sample_2000_step4.get("baseline", {}).get("mean_masked_mae")),
426
+ "parallel": parse_optional_float(sample_2000_step4.get("parallel", {}).get("mean_masked_mae")),
427
+ }
428
+
429
+ return result
430
+
431
+
432
+ def build_summary(
433
+ artifact_root: pathlib.Path,
434
+ teacher_rows: list[dict[str, Any]],
435
+ sample_rows: list[dict[str, Any]],
436
+ train_rows: list[dict[str, Any]],
437
+ startup_summaries: dict[str, dict[str, str]],
438
+ log_errors: dict[str, list[str]],
439
+ sample_batch_size_used: str,
440
+ prior_metrics: dict[str, Any],
441
+ ) -> dict[str, Any]:
442
+ teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
443
+ sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")
444
+
445
+ step0_teacher_gaps = {}
446
+ step0_sample_gaps = {}
447
+ shared_step0_teacher = teacher_by_key[("shared", 0)]["mean_val_loss"]
448
+ for spec in MODEL_SPECS:
449
+ variant = spec.model_variant
450
+ teacher_value = teacher_by_key[(variant, 0)]["mean_val_loss"]
451
+ step0_teacher_gaps[variant] = teacher_value - shared_step0_teacher
452
+ sample_deltas = []
453
+ for sample_steps in SAMPLE_STEPS:
454
+ variant_row = sample_by_key[(variant, 0, sample_steps)]
455
+ shared_row = sample_by_key[("shared", 0, sample_steps)]
456
+ sample_deltas.append(variant_row["mean_masked_mae"] - shared_row["mean_masked_mae"])
457
+ step0_sample_gaps[variant] = {
458
+ "average_delta_vs_shared": average(sample_deltas),
459
+ "per_steps_delta_vs_shared": {str(step): delta for step, delta in zip(SAMPLE_STEPS, sample_deltas, strict=True)},
460
+ }
461
+
462
+ warm_variants = [spec.model_variant for spec in MODEL_SPECS if spec.model_variant != "shared"]
463
+ smallest_teacher_variant = min(warm_variants, key=lambda variant: abs(step0_teacher_gaps[variant]))
464
+ smallest_sample_variant = min(
465
+ warm_variants,
466
+ key=lambda variant: abs(step0_sample_gaps[variant]["average_delta_vs_shared"] or 0.0),
467
+ )
468
+
469
+ teacher_improvements: dict[str, dict[str, float]] = defaultdict(dict)
470
+ sample_improvements: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict))
471
+ for spec in MODEL_SPECS:
472
+ variant = spec.model_variant
473
+ for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
474
+ teacher_improvements[variant][f"{start_step}_to_{end_step}"] = (
475
+ teacher_by_key[(variant, start_step)]["mean_val_loss"] - teacher_by_key[(variant, end_step)]["mean_val_loss"]
476
+ )
477
+ teacher_improvements[variant]["0_to_2000"] = (
478
+ teacher_by_key[(variant, 0)]["mean_val_loss"] - teacher_by_key[(variant, 2000)]["mean_val_loss"]
479
+ )
480
+ for sample_steps in SAMPLE_STEPS:
481
+ for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
482
+ sample_improvements[variant][str(sample_steps)][f"{start_step}_to_{end_step}"] = (
483
+ sample_by_key[(variant, start_step, sample_steps)]["mean_masked_mae"]
484
+ - sample_by_key[(variant, end_step, sample_steps)]["mean_masked_mae"]
485
+ )
486
+ sample_improvements[variant][str(sample_steps)]["0_to_2000"] = (
487
+ sample_by_key[(variant, 0, sample_steps)]["mean_masked_mae"]
488
+ - sample_by_key[(variant, 2000, sample_steps)]["mean_masked_mae"]
489
+ )
490
+
491
+ teacher_2k_ranking = sorted(
492
+ (
493
+ {
494
+ "model_variant": spec.model_variant,
495
+ "mean_val_loss": teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
496
+ "mean_left_right_imbalance": teacher_by_key[(spec.model_variant, 2000)]["mean_left_right_imbalance"],
497
+ "improvement_0_to_2000": teacher_by_key[(spec.model_variant, 0)]["mean_val_loss"]
498
+ - teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
499
+ }
500
+ for spec in MODEL_SPECS
501
+ ),
502
+ key=lambda row: row["mean_val_loss"],
503
+ )
504
+
505
+ sample_2k_ranking = sorted(
506
+ (
507
+ {
508
+ "model_variant": spec.model_variant,
509
+ "mean_masked_mae_step_4": sample_by_key[(spec.model_variant, 2000, 4)]["mean_masked_mae"],
510
+ "mean_masked_mae_step_16": sample_by_key[(spec.model_variant, 2000, 16)]["mean_masked_mae"],
511
+ "mean_masked_mae_average": statistics.fmean(
512
+ sample_by_key[(spec.model_variant, 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
513
+ ),
514
+ }
515
+ for spec in MODEL_SPECS
516
+ ),
517
+ key=lambda row: row["mean_masked_mae_average"],
518
+ )
519
+
520
+ stability = summarise_stability(train_rows)
521
+ bootstrap_sanity = {
522
+ spec.model_variant: {
523
+ "step0_weight_loading_missing_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_missing_count"],
524
+ "step0_weight_loading_unexpected_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_unexpected_count"],
525
+ }
526
+ for spec in MODEL_SPECS
527
+ }
528
+ invariant_logs = {
529
+ "split_independent": (artifact_root / "sanity_checks/check_split_independent_invariants.log").exists(),
530
+ "split_communicating": (artifact_root / "sanity_checks/check_split_communicating_invariants.log").exists(),
531
+ }
532
+
533
+ prior_regression = {}
534
+ teacher_prior = prior_metrics.get("teacher_2000", {})
535
+ sample_prior = prior_metrics.get("sample_2000_step4", {})
536
+ current_shared_teacher_2k = teacher_by_key[("shared", 2000)]["mean_val_loss"]
537
+ current_head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]
538
+ if teacher_prior:
539
+ prior_delta = (teacher_prior.get("baseline") or 0.0) - (teacher_prior.get("parallel") or 0.0)
540
+ current_delta = current_shared_teacher_2k - current_head_only_teacher_2k
541
+ prior_regression["teacher_forced_2000"] = {
542
+ "prior_baseline": teacher_prior.get("baseline"),
543
+ "prior_parallel": teacher_prior.get("parallel"),
544
+ "prior_parallel_edge": prior_delta,
545
+ "current_shared": current_shared_teacher_2k,
546
+ "current_head_only_parallel": current_head_only_teacher_2k,
547
+ "current_head_only_edge": current_delta,
548
+ "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
549
+ }
550
+ if sample_prior:
551
+ current_shared_sample_2k = sample_by_key[("shared", 2000, 4)]["mean_masked_mae"]
552
+ current_head_only_sample_2k = sample_by_key[("head_only_parallel", 2000, 4)]["mean_masked_mae"]
553
+ prior_delta = (sample_prior.get("baseline") or 0.0) - (sample_prior.get("parallel") or 0.0)
554
+ current_delta = current_shared_sample_2k - current_head_only_sample_2k
555
+ prior_regression["sample_step4_2000"] = {
556
+ "prior_baseline": sample_prior.get("baseline"),
557
+ "prior_parallel": sample_prior.get("parallel"),
558
+ "prior_parallel_edge": prior_delta,
559
+ "current_shared": current_shared_sample_2k,
560
+ "current_head_only_parallel": current_head_only_sample_2k,
561
+ "current_head_only_edge": current_delta,
562
+ "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
563
+ }
564
+
565
+ split_ind_teacher_2k = teacher_by_key[("split_independent", 2000)]["mean_val_loss"]
566
+ split_comm_teacher_2k = teacher_by_key[("split_communicating", 2000)]["mean_val_loss"]
567
+ head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]
568
+
569
+ split_ind_sample_avg_2k = statistics.fmean(
570
+ sample_by_key[("split_independent", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
571
+ )
572
+ split_comm_sample_avg_2k = statistics.fmean(
573
+ sample_by_key[("split_communicating", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
574
+ )
575
+ head_only_sample_avg_2k = statistics.fmean(
576
+ sample_by_key[("head_only_parallel", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
577
+ )
578
+
579
+ return {
580
+ "study_name": artifact_root.name,
581
+ "artifact_root": str(artifact_root),
582
+ "hardware": "4x H100 80GB",
583
+ "precision": "bfloat16",
584
+ "train_repo_id": "lsnu/twin_dual_push_128_train",
585
+ "val_repo_id": "lsnu/twin_dual_push_128_val",
586
+ "packed_layout": "[L8, 0x8, R8, 0x8]",
587
+ "sample_batch_size_used": sample_batch_size_used,
588
+ "bootstrap_sanity": {
589
+ "step0_weight_loading": bootstrap_sanity,
590
+ "invariant_logs_present": invariant_logs,
591
+ },
592
+ "step0_gap_analysis": {
593
+ "teacher_forced_delta_vs_shared": step0_teacher_gaps,
594
+ "sample_avg_delta_vs_shared": {
595
+ variant: payload["average_delta_vs_shared"] for variant, payload in step0_sample_gaps.items()
596
+ },
597
+ "sample_delta_vs_shared_by_steps": step0_sample_gaps,
598
+ "smallest_teacher_forced_jump": smallest_teacher_variant,
599
+ "smallest_sample_jump": smallest_sample_variant,
600
+ },
601
+ "teacher_improvements": teacher_improvements,
602
+ "sample_improvements": sample_improvements,
603
+ "teacher_2k_ranking": teacher_2k_ranking,
604
+ "sample_2k_ranking": sample_2k_ranking,
605
+ "optimization_stability": {
606
+ "summary": stability,
607
+ "log_errors": log_errors,
608
+ },
609
+ "head_only_vs_prior_5k_study": prior_regression,
610
+ "answer_summary": {
611
+ "teacher_2k_best": teacher_2k_ranking[0]["model_variant"],
612
+ "sample_2k_best": sample_2k_ranking[0]["model_variant"],
613
+ "split_models_beat_head_only_teacher_2k": {
614
+ "split_independent": split_ind_teacher_2k < head_only_teacher_2k,
615
+ "split_communicating": split_comm_teacher_2k < head_only_teacher_2k,
616
+ },
617
+ "split_models_beat_head_only_sample_2k_avg": {
618
+ "split_independent": split_ind_sample_avg_2k < head_only_sample_avg_2k,
619
+ "split_communicating": split_comm_sample_avg_2k < head_only_sample_avg_2k,
620
+ },
621
+ "split_comm_vs_split_ind_teacher_2k_delta": split_comm_teacher_2k - split_ind_teacher_2k,
622
+ "split_comm_vs_split_ind_sample_2k_avg_delta": split_comm_sample_avg_2k - split_ind_sample_avg_2k,
623
+ },
624
+ "startup_summaries": startup_summaries,
625
+ }
626
+
627
+
628
def write_startup_summaries(path: pathlib.Path, startup_summaries: dict[str, dict[str, str]]) -> None:
    """Render per-variant startup summaries into a plain-text report at ``path``.

    One ``[model_variant]`` section is written per entry in MODEL_SPECS; within a
    section only the keys present in that variant's summary appear, in a fixed
    canonical order, as ``key: value`` lines.
    """
    # Canonical display order for startup keys; absent keys are simply skipped.
    ordered_keys = (
        "weight_missing_count",
        "weight_missing_keys",
        "weight_unexpected_count",
        "weight_unexpected_keys",
        "config_name",
        "dataset_repo_id",
        "norm_stats_file",
        "norm_stats_summary",
        "checkpoint_source",
        "model_type",
        "packed_transforms",
        "world_size",
        "batch_size",
        "num_workers",
        "precision",
        "lr_schedule",
        "save_log_intervals",
        "action_loss_mask",
        "active_mask_dims",
        "masked_dims",
        "gradient_buckets",
    )
    report_lines: list[str] = []
    for spec in MODEL_SPECS:
        report_lines.append(f"[{spec.model_variant}]")
        variant_summary = startup_summaries.get(spec.model_variant, {})
        report_lines.extend(
            f"{key}: {variant_summary[key]}" for key in ordered_keys if key in variant_summary
        )
        report_lines.append("")
    path.write_text("\n".join(report_lines).rstrip() + "\n", encoding="utf-8")
661
+
662
+
663
def build_markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Format ``headers`` and ``rows`` as a GitHub-flavoured markdown table string."""

    def render_row(cells) -> str:
        # Every markdown row is pipe-delimited with single spaces as padding.
        return "| " + " | ".join(cells) + " |"

    header_line = render_row(headers)
    divider_line = render_row("---" for _ in headers)
    body_lines = "\n".join(render_row(cells) for cells in rows)
    return "\n".join([header_line, divider_line, body_lines])
668
+
669
+
670
def write_readme(
    path: pathlib.Path,
    summary: dict[str, Any],
    teacher_rows: list[dict[str, Any]],
    sample_rows: list[dict[str, Any]],
) -> None:
    """Render the study README.md at ``path`` from the summary dict and metric rows.

    Sections: quick answers, step-0 teacher-forced table, step-2000 tables
    (teacher-forced and sample MAE), stability notes, a regression check against
    the prior 5K study, and a file map of the artifact directory.
    """
    teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
    sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")

    # Per-variant rows for the three markdown tables (step 0 and step 2000).
    step0_table_rows = []
    final_teacher_rows = []
    final_sample_rows = []
    for spec in MODEL_SPECS:
        variant = spec.model_variant
        step0 = teacher_by_key[(variant, 0)]
        final_teacher = teacher_by_key[(variant, 2000)]
        step0_table_rows.append(
            [
                variant,
                format_float(step0["mean_val_loss"]),
                format_delta(summary["step0_gap_analysis"]["teacher_forced_delta_vs_shared"][variant]),
                format_float(step0["mean_left_right_imbalance"]),
            ]
        )
        final_teacher_rows.append(
            [
                variant,
                format_float(final_teacher["mean_val_loss"]),
                format_float(summary["teacher_improvements"][variant]["0_to_2000"]),
                format_float(final_teacher["mean_left_right_imbalance"]),
            ]
        )
        # Sample MAE at fixed sample-step counts 1/4/16 for the step-2000 checkpoint.
        final_sample_rows.append(
            [
                variant,
                format_float(sample_by_key[(variant, 2000, 1)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 4)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 16)]["mean_masked_mae"]),
            ]
        )

    stability = summary["optimization_stability"]["summary"]
    # One "variant=X.XXGB" fragment per model, joined for the stability notes.
    memory_note = ", ".join(
        f"{variant}={format_float(stability[variant]['max_cuda_memory_gb'], digits=2)}GB"
        for variant in [spec.model_variant for spec in MODEL_SPECS]
    )
    split_comm_comm = stability.get("split_communicating", {}).get("communication", {})
    # Prior-study comparison payloads may be absent; handled below with fallbacks.
    prior_teacher = summary.get("head_only_vs_prior_5k_study", {}).get("teacher_forced_2000")
    prior_sample = summary.get("head_only_vs_prior_5k_study", {}).get("sample_step4_2000")

    readme = [
        f"# {summary['study_name']}",
        "",
        "Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.",
        "",
        "## Quick answers",
        f"- Smallest step-0 teacher-forced jump vs `shared`: `{summary['step0_gap_analysis']['smallest_teacher_forced_jump']}` (`{format_delta(summary['step0_gap_analysis']['teacher_forced_delta_vs_shared'][summary['step0_gap_analysis']['smallest_teacher_forced_jump']])}`).",
        f"- Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `{summary['step0_gap_analysis']['smallest_sample_jump']}` (`{format_delta(summary['step0_gap_analysis']['sample_avg_delta_vs_shared'][summary['step0_gap_analysis']['smallest_sample_jump']])}`).",
        f"- Best teacher-forced result by step `2000`: `{summary['answer_summary']['teacher_2k_best']}`.",
        f"- Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `{summary['answer_summary']['sample_2k_best']}`.",
        f"- Split vs head-only by step `2000`: teacher-forced beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_communicating']}`; sample beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_communicating']}`.",
        f"- `split_communicating` vs `split_independent` at `2000`: teacher delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_teacher_2k_delta'])}`, sample-average delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_sample_2k_avg_delta'])}`.",
        "",
        "## Step-0 teacher-forced comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "delta_vs_shared", "left_right_imbalance"],
            step0_table_rows,
        ),
        "",
        "## Step-2000 comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "0_to_2000_improvement", "left_right_imbalance"],
            final_teacher_rows,
        ),
        "",
        build_markdown_table(
            ["model", "1-step_mae", "4-step_mae", "16-step_mae"],
            final_sample_rows,
        ),
        "",
        "## Stability notes",
        f"- Sample batch size used for all official evals: `{summary['sample_batch_size_used']}`.",
        f"- Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.",
        f"- Peak training VRAM by model: {memory_note}.",
        f"- `split_communicating` communication path: active=`{split_comm_comm.get('active', False)}`, `grad_cross_arm_comm_max={format_float(split_comm_comm.get('grad_cross_arm_comm_max'))}`, `attention_mass_mean={format_float(split_comm_comm.get('attention_mass_mean'))}`, `gate_abs_max={format_float(split_comm_comm.get('gate_abs_max'))}`.",
        "",
        "## Regression check vs prior dual-push screen",
    ]

    # Prior-study bullets are appended conditionally since the prior metrics
    # directory may be missing or incomplete.
    if prior_teacher:
        readme.append(
            f"- Prior `5K` study at step `2000` had `baseline={format_float(prior_teacher['prior_baseline'])}` and `parallel={format_float(prior_teacher['prior_parallel'])}` with head-only edge `{format_delta(prior_teacher['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_teacher['current_shared'])}` and `head_only_parallel={format_float(prior_teacher['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_teacher['current_head_only_edge'])}`; direction match=`{prior_teacher['direction_matches']}`."
        )
    else:
        readme.append("- Prior teacher-forced comparison was unavailable.")

    if prior_sample:
        readme.append(
            f"- Prior `5K` study `4`-step MAE at step `2000` had `baseline={format_float(prior_sample['prior_baseline'])}` and `parallel={format_float(prior_sample['prior_parallel'])}` with head-only edge `{format_delta(prior_sample['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_sample['current_shared'])}` and `head_only_parallel={format_float(prior_sample['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_sample['current_head_only_edge'])}`; direction match=`{prior_sample['direction_matches']}`."
        )
    else:
        readme.append("- Prior sample-based comparison was unavailable.")

    readme.extend(
        [
            "",
            "## Files",
            "- `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.",
            "- `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.",
            "- `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.",
        ]
    )

    path.write_text("\n".join(readme).rstrip() + "\n", encoding="utf-8")
786
+
787
+
788
def main() -> None:
    """Parse all train/eval logs under the artifact root and emit metric tables.

    Outputs written under ``<artifact_root>/metrics``: the teacher-forced and
    sample eval CSVs, the training-summary CSV, the startup-summary text file,
    and ``summary.json``; the study README.md is written at the artifact root.
    """
    args = parse_args()
    artifact_root = pathlib.Path(args.artifact_root).resolve()
    run_logs_dir = artifact_root / "run_logs"
    metrics_dir = artifact_root / "metrics"
    metrics_dir.mkdir(parents=True, exist_ok=True)

    teacher_rows: list[dict[str, Any]] = []
    sample_rows: list[dict[str, Any]] = []
    train_rows: list[dict[str, Any]] = []
    startup_summaries: dict[str, dict[str, str]] = {}
    log_errors: dict[str, list[str]] = {}
    # Overwritten with the last batch size observed in any eval log; "unknown"
    # only if no eval log reports one.
    sample_batch_size_used = "unknown"

    # Columns seen in train logs beyond the fixed BASE_TRAIN_COLUMNS set
    # (model-specific gradient diagnostics differ per variant).
    extra_train_columns: set[str] = set()

    for spec in MODEL_SPECS:
        train_log = run_logs_dir / f"{spec.exp_name}.log"
        startup, train_log_rows, train_errors = parse_train_log(train_log, spec)
        startup_summaries[spec.model_variant] = startup
        train_rows.extend(train_log_rows)
        extra_train_columns.update(column for row in train_log_rows for column in row if column not in BASE_TRAIN_COLUMNS)
        if train_errors:
            log_errors[f"{spec.model_variant}:train"] = train_errors

        # One eval log per checkpoint step (0, 100, 500, 2000).
        for step in STEP_ORDER:
            eval_log = run_logs_dir / f"{spec.exp_name}_val_{step}.log"
            eval_metrics, eval_sample_batch_size, eval_errors = parse_eval_log(eval_log)
            if eval_sample_batch_size is not None:
                sample_batch_size_used = str(eval_sample_batch_size)
            teacher_rows.append(build_teacher_row(spec, step, eval_metrics))
            sample_rows.extend(build_sample_rows(spec, step, eval_metrics))
            if eval_errors:
                log_errors[f"{spec.model_variant}:eval:{step}"] = eval_errors

    # Deterministic row order for stable CSV diffs across reruns.
    teacher_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"]))
    sample_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"], row["sample_num_steps"]))
    train_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["step"]))

    # Fixed column orders for the two eval CSVs.
    teacher_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "num_batches",
        "mean_val_loss",
        "std_val_loss",
        "mean_left_arm_loss",
        "std_left_arm_loss",
        "mean_right_arm_loss",
        "std_right_arm_loss",
        "mean_left_joint_loss",
        "std_left_joint_loss",
        "mean_left_gripper_loss",
        "std_left_gripper_loss",
        "mean_right_joint_loss",
        "std_right_joint_loss",
        "mean_right_gripper_loss",
        "std_right_gripper_loss",
        "mean_left_right_imbalance",
        "std_left_right_imbalance",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
        "weight_loading_missing_count",
        "weight_loading_unexpected_count",
    ]
    sample_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "sample_num_steps",
        "sample_num_batches",
        "mean_masked_mae",
        "std_masked_mae",
        "mean_left_arm_mae",
        "std_left_arm_mae",
        "mean_right_arm_mae",
        "std_right_arm_mae",
        "mean_left_joint_mae",
        "std_left_joint_mae",
        "mean_left_gripper_mae",
        "std_left_gripper_mae",
        "mean_right_joint_mae",
        "std_right_joint_mae",
        "mean_right_gripper_mae",
        "std_right_gripper_mae",
        "mean_left_right_imbalance_mae",
        "std_left_right_imbalance_mae",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
    ]

    # Variant-specific train columns go after the base columns, natural-sorted.
    ordered_extra_train_columns = sorted(extra_train_columns, key=natural_key)
    train_columns = BASE_TRAIN_COLUMNS + ordered_extra_train_columns

    write_csv(metrics_dir / "teacher_forced_eval_table.csv", teacher_rows, teacher_columns)
    write_csv(metrics_dir / "sample_eval_table.csv", sample_rows, sample_columns)
    write_csv(metrics_dir / "training_summary.csv", train_rows, train_columns)
    write_startup_summaries(metrics_dir / "startup_summaries.txt", startup_summaries)

    # Prior-study metrics are optional; build_summary tolerates an empty dict.
    prior_metrics = load_prior_metrics(pathlib.Path(args.prior_metrics_root)) if args.prior_metrics_root else {}
    summary = build_summary(
        artifact_root,
        teacher_rows,
        sample_rows,
        train_rows,
        startup_summaries,
        log_errors,
        sample_batch_size_used,
        prior_metrics,
    )
    (metrics_dir / "summary.json").write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_readme(artifact_root / "README.md", summary, teacher_rows, sample_rows)


if __name__ == "__main__":
    main()
openpi/scripts/prune_stepcmp_checkpoints.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import pathlib
5
+ import shutil
6
+ import time
7
+ from datetime import datetime, timezone
8
+
9
+
10
def utc_ts() -> str:
    """Return the current UTC time as a 'YYYY-MM-DD HH:MM:SS UTC' string."""
    current = datetime.now(timezone.utc)
    return current.strftime("%Y-%m-%d %H:%M:%S UTC")
12
+
13
+
14
def prune_once(roots: list[pathlib.Path], keep_steps: set[str]) -> int:
    """Remove numeric step directories under ``roots`` that are not in ``keep_steps``.

    Entries that are not directories, start with ``tmp_`` (in-flight saves), or
    have non-numeric names are left alone. Returns the count of removed dirs.
    """
    removed = 0
    for root in roots:
        if not root.is_dir():
            continue
        for child in root.iterdir():
            prunable = (
                child.is_dir()
                and not child.name.startswith("tmp_")
                and child.name.isdigit()
                and child.name not in keep_steps
            )
            if not prunable:
                continue
            # Best-effort removal: a concurrent writer must not crash the pruner.
            shutil.rmtree(child, ignore_errors=True)
            print(f"[{utc_ts()}] pruned {child}", flush=True)
            removed += 1
    return removed
32
+
33
+
34
def main() -> None:
    """Run the checkpoint-retention pruner loop.

    Periodically scans the given checkpoint roots and deletes numeric step
    directories not listed in ``--keep-steps``. Intended to run alongside
    training and be killed by the orchestrating script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--interval-seconds", type=int, default=30)
    parser.add_argument("--keep-steps", nargs="+", default=["100", "500", "2000"])
    parser.add_argument("roots", nargs="+")
    args = parser.parse_args()

    roots = [pathlib.Path(root) for root in args.roots]
    keep_steps = set(args.keep_steps)
    print(
        f"[{utc_ts()}] retention pruner started interval_s={args.interval_seconds} keep_steps={sorted(keep_steps)}",
        flush=True,
    )
    # Intentional infinite loop: the pruner has no exit condition of its own.
    while True:
        prune_once(roots, keep_steps)
        time.sleep(args.interval_seconds)


if __name__ == "__main__":
    main()
openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Resolve the repo root relative to this script so it can run from any CWD.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venv/bin/activate"
PYTHON_BIN="$ROOT/.venv/bin/python"
# Artifact directory is UTC-date-stamped; both pieces are overridable via env.
ARTIFACT_DATE="${ARTIFACT_DATE:-$(date -u +%Y%m%d)}"
ARTIFACT_ROOT="${ARTIFACT_ROOT:-/workspace/pi05tests/artifacts/twin_dual_push_128_stepcmp_2k_${ARTIFACT_DATE}}"
RUN_LOG_DIR="$ARTIFACT_ROOT/run_logs"
METRICS_DIR="$ARTIFACT_ROOT/metrics"
REPRO_DIR="$ARTIFACT_ROOT/repro"
ENV_DIR="$ARTIFACT_ROOT/environment"
SANITY_DIR="$ARTIFACT_ROOT/sanity_checks"
mkdir -p "$RUN_LOG_DIR" "$METRICS_DIR" "$REPRO_DIR" "$ENV_DIR" "$SANITY_DIR" "$ROOT/run_logs"

# Hugging Face and cache locations pinned to the persistent /workspace volume.
export HF_TOKEN="${HF_TOKEN:-}"
export HF_HOME=/workspace/.hf
export HF_HUB_CACHE=/workspace/.hf/hub
export HF_DATASETS_CACHE=/workspace/.hf/datasets
export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
export XDG_CACHE_HOME=/workspace/.cache
export OPENPI_LEROBOT_HOME=/workspace/lerobot
export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT/src"
# Keep CPU math thread pools at 1 so dataloader workers don't oversubscribe.
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export PYTHONFAULTHANDLER=1

cd "$ROOT"
source "$VENV"

# Fixed dataset/eval settings shared by every run in this study.
TRAIN_REPO="lsnu/twin_dual_push_128_train"
VAL_REPO="lsnu/twin_dual_push_128_val"
TEACHER_VAL_BATCHES=100
SAMPLE_VAL_BATCHES=64
SAMPLE_NUM_STEPS="1,2,4,8,16"
PRIOR_METRICS_ROOT="/workspace/pi05tests/artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics"

# Short model key -> reporting variant name used in metric tables.
declare -A MODEL_VARIANT=(
  [shared]="shared"
  [head_only]="head_only_parallel"
  [split_ind]="split_independent"
  [split_comm]="split_communicating"
)

# Short model key -> training config name.
declare -A CONFIG_NAME=(
  [shared]="pi05_twin_dual_push_128_packed_baseline_pytorch_5k"
  [head_only]="pi05_twin_dual_push_128_packed_parallel_pytorch_5k"
  [split_ind]="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k"
  [split_comm]="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k"
)

# Short model key -> warm-start (step-0) checkpoint directory.
declare -A STEP0_CKPT=(
  [shared]="/workspace/checkpoints/pi05_base_single_pytorch"
  [head_only]="/workspace/checkpoints/pi05_base_parallel_packed_from_single"
  [split_ind]="/workspace/checkpoints/pi05_base_split_independent_packed_from_single"
  [split_comm]="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single"
)

# Short model key -> experiment name (drives log and checkpoint paths).
declare -A EXP_NAME=(
  [shared]="dual_push_128_stepcmp_shared_2k"
  [head_only]="dual_push_128_stepcmp_head_only_2k"
  [split_ind]="dual_push_128_stepcmp_split_ind_2k"
  [split_comm]="dual_push_128_stepcmp_split_comm_2k"
)

# Bookkeeping for background eval jobs and the retention pruner process.
eval_pids=()
eval_labels=()
checkpoint_pruner_pid=""
74
# Print a message to stdout prefixed with a UTC timestamp.
log() {
  local ts
  ts="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  printf '[%s] %s\n' "$ts" "$*"
}
77
+
78
# Append a UTC-timestamped message to the retention-pruner log file.
pruner_log() {
  local ts
  ts="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  printf '[%s] %s\n' "$ts" "$*" >>"$RUN_LOG_DIR/checkpoint_retention_pruner.log"
}
81
+
82
+ trim_env_snapshot() {
83
+ env | sort | grep -E '^(HF_|HUGGINGFACE_|OPENPI_|PYTORCH_|PYTHONPATH|TOKENIZERS_PARALLELISM|XDG_CACHE_HOME)=' >"$ENV_DIR/env_selected.txt"
84
+ }
85
+
86
# Record the full host/software environment under $ENV_DIR for reproducibility:
# date, kernel, Python, pip freeze, torch environment, GPU state, and disk usage.
save_environment_snapshot() {
  date -u '+%Y-%m-%d %H:%M:%S UTC' >"$ENV_DIR/date_utc.txt"
  uname -a >"$ENV_DIR/uname.txt"
  python --version >"$ENV_DIR/python_version.txt" 2>&1
  pip freeze >"$ENV_DIR/pip_freeze.txt"
  # collect_env can hang on some drivers; cap it and note the timeout in-file.
  if ! timeout 120s python -m torch.utils.collect_env >"$ENV_DIR/torch_env.txt" 2>&1; then
    echo "torch.utils.collect_env timed out after 120 seconds" >>"$ENV_DIR/torch_env.txt"
  fi
  nvidia-smi >"$ENV_DIR/nvidia_smi.txt"
  nvidia-smi topo -m >"$ENV_DIR/nvidia_smi_topo.txt"
  df -h /workspace >"$ENV_DIR/df_workspace.txt"
  trim_env_snapshot
}
99
+
100
# Copy this script plus the metrics collector into $REPRO_DIR and write a
# manifest mapping every variant to its step-0 and training checkpoint paths.
copy_repro_manifests() {
  cp "$0" "$REPRO_DIR/commands_stepcmp.sh"
  cp "$ROOT/scripts/collect_twin_dual_push_128_stepcmp_metrics.py" "$REPRO_DIR/collect_twin_dual_push_128_stepcmp_metrics.py"
  cat >"$REPRO_DIR/checkpoint_locations.txt" <<EOF
shared_step0=${STEP0_CKPT[shared]}
head_only_step0=${STEP0_CKPT[head_only]}
split_independent_step0=${STEP0_CKPT[split_ind]}
split_communicating_step0=${STEP0_CKPT[split_comm]}
shared_train_root=$ROOT/checkpoints/${CONFIG_NAME[shared]}/${EXP_NAME[shared]}
head_only_train_root=$ROOT/checkpoints/${CONFIG_NAME[head_only]}/${EXP_NAME[head_only]}
split_independent_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_ind]}/${EXP_NAME[split_ind]}
split_communicating_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_comm]}/${EXP_NAME[split_comm]}
artifact_root=$ARTIFACT_ROOT
EOF
}
115
+
116
# Abort the whole run when a required regular file is absent.
require_file() {
  local path="$1"
  [[ -f "$path" ]] && return 0
  log "required file missing: $path"
  exit 1
}
123
+
124
# Abort the whole run when a required directory is absent.
require_dir() {
  local path="$1"
  [[ -d "$path" ]] && return 0
  log "required directory missing: $path"
  exit 1
}
131
+
132
# Print the canonical norm-stats JSON path for a model key
# (shared / head_only / split_ind / split_comm).
norm_stats_path_for_key() {
  local model_key="$1"
  printf '%s\n' "$ROOT/assets/${CONFIG_NAME[$model_key]}/$TRAIN_REPO/norm_stats.json"
}
136
+
137
# Verify that both split configs share byte-identical norm stats, then fan the
# canonical copy out to all four configs, recording every sha256 in
# $SANITY_DIR/norm_stats_status.txt. Exits non-zero on any mismatch.
ensure_packed_dual_push_norm_stats() {
  local split_ind_stats
  local split_comm_stats
  split_ind_stats="$(norm_stats_path_for_key split_ind)"
  split_comm_stats="$(norm_stats_path_for_key split_comm)"

  # Both split-config stats files must already exist.
  require_file "$split_ind_stats"
  require_file "$split_comm_stats"

  local split_ind_sha
  local split_comm_sha
  split_ind_sha="$(sha256sum "$split_ind_stats" | awk '{print $1}')"
  split_comm_sha="$(sha256sum "$split_comm_stats" | awk '{print $1}')"

  # The two split configs must agree byte-for-byte before anything is copied.
  if [[ "$split_ind_sha" != "$split_comm_sha" ]]; then
    log "packed dual-push split norm stats differ across split configs"
    echo "split_ind=$split_ind_stats sha256=$split_ind_sha" >"$SANITY_DIR/norm_stats_status.txt"
    echo "split_comm=$split_comm_stats sha256=$split_comm_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    exit 1
  fi

  local canonical_stats="$split_ind_stats"
  local key
  # Truncate the status file, then record the canonical source and checksum.
  : >"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_source=$canonical_stats" >>"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_sha256=$split_ind_sha" >>"$SANITY_DIR/norm_stats_status.txt"

  for key in shared head_only split_ind split_comm; do
    local dst
    local dst_sha
    dst="$(norm_stats_path_for_key "$key")"
    # Restore missing per-config copies from the canonical file.
    if [[ ! -f "$dst" ]]; then
      mkdir -p "$(dirname "$dst")"
      cp "$canonical_stats" "$dst"
      log "restored missing packed dual-push norm stats for ${MODEL_VARIANT[$key]} -> $dst"
    fi
    dst_sha="$(sha256sum "$dst" | awk '{print $1}')"
    echo "${MODEL_VARIANT[$key]}=$dst sha256=$dst_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    # Every config's copy must match the canonical checksum.
    if [[ "$dst_sha" != "$split_ind_sha" ]]; then
      log "packed dual-push norm stats mismatch for ${MODEL_VARIANT[$key]}: $dst"
      exit 1
    fi
  done
}
181
+
182
# Ensure every warm-start (step-0) checkpoint exists, regenerating any that are
# missing from the single-model base checkpoint. If a split checkpoint had to be
# regenerated, rerun the split invariant checks. Regeneration status is recorded
# in $SANITY_DIR/bootstrap_regeneration_status.txt.
ensure_bootstrap_checkpoints() {
  local regenerated_any=0
  local regenerated_split=0

  # The single-model base is the source for all regenerations; it must exist.
  require_file "/workspace/checkpoints/pi05_base_single_pytorch/model.safetensors"

  if [[ ! -f "${STEP0_CKPT[head_only]}/model.safetensors" ]]; then
    log "regenerating head-only packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[head_only]}" \
      --output_path "${STEP0_CKPT[head_only]}" \
      >"$SANITY_DIR/init_head_only.log" 2>&1
    regenerated_any=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_ind]}/model.safetensors" ]]; then
    log "regenerating split-independent packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --output_path "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/init_split_independent.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_comm]}/model.safetensors" ]]; then
    log "regenerating split-communicating packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --output_path "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/init_split_communicating.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  # All three derived checkpoints must now be present (regenerated or not).
  require_file "${STEP0_CKPT[head_only]}/model.safetensors"
  require_file "${STEP0_CKPT[split_ind]}/model.safetensors"
  require_file "${STEP0_CKPT[split_comm]}/model.safetensors"

  # Freshly regenerated split checkpoints must pass the invariant checks again.
  if [[ "$regenerated_split" -eq 1 ]]; then
    log "rerunning split invariant checks after bootstrap regeneration"
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --checkpoint_dir "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/check_split_independent_invariants.log" 2>&1
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --checkpoint_dir "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/check_split_communicating_invariants.log" 2>&1
  fi

  printf 'regenerated_any=%s\nregenerated_split=%s\n' "$regenerated_any" "$regenerated_split" >"$SANITY_DIR/bootstrap_regeneration_status.txt"
}
238
+
239
# Extra CLI args appended to eval invocations to cap the sampling batch size;
# an empty array means "use the eval script's default batch size".
sample_batch_size_arg=()
# Human-readable record of the chosen size ("default" = no override); also
# persisted to $SANITY_DIR/sample_batch_size_used.txt for resumed runs.
sample_batch_size_value="default"
241
+
242
# Run a single-batch, single-sample probe eval on the split_communicating
# step-0 checkpoint to check whether a given sampling batch size fits on GPU 0.
# Arguments: $1 - batch size to try; "" probes the eval script's default
# Outputs:   full eval output captured in $SANITY_DIR/sample_batch_size_probe.log
# Returns:   exit status of the probe eval run
run_sample_batch_probe() {
  local requested_size="$1"
  local probe_log="$SANITY_DIR/sample_batch_size_probe.log"
  local override_args=()
  if [[ -n "$requested_size" ]]; then
    override_args=(--sample_batch_size "$requested_size")
  fi
  CUDA_VISIBLE_DEVICES=0 python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[split_comm]}" \
    --checkpoint_dir "${STEP0_CKPT[split_comm]}" \
    --repo_id "$VAL_REPO" \
    --num_batches 1 \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches 1 \
    --sample_num_steps "16" \
    --sample_seed 321 \
    "${override_args[@]}" \
    >"$probe_log" 2>&1
}
262
+
263
# Record the chosen sampling batch size in the globals and mirror it into the
# sanity dir so resumed runs (SKIP_SAMPLE_BATCH_PROBE=1) can reload it.
# "default" means "no --sample_batch_size override" — consistent with how
# load_saved_sample_batch_size interprets the persisted file.
# Arguments: $1 - batch size value, or "default"
commit_sample_batch_size() {
  local value="$1"
  sample_batch_size_value="$value"
  if [[ "$value" == "default" ]]; then
    sample_batch_size_arg=()
  else
    sample_batch_size_arg=(--sample_batch_size "$value")
  fi
  echo "$sample_batch_size_value" >"$SANITY_DIR/sample_batch_size_used.txt"
}

# Returns 0 when the most recent probe log shows an out-of-memory failure.
probe_failed_with_oom() {
  grep -qi 'out of memory' "$SANITY_DIR/sample_batch_size_probe.log"
}

# Pick a sampling batch size for the eval sweeps: honor an explicit
# SAMPLE_BATCH_SIZE_OVERRIDE, otherwise probe the default and halve a
# candidate (8, 4, 2, 1) on each OOM until one fits. Any non-OOM probe
# failure, or exhausting all candidates, aborts the run.
determine_sample_batch_size() {
  local override="${SAMPLE_BATCH_SIZE_OVERRIDE:-}"
  if [[ -n "$override" ]]; then
    commit_sample_batch_size "$override"
    return
  fi

  log "probing sample-eval batch size on split_communicating step-0 checkpoint"
  if run_sample_batch_probe ""; then
    commit_sample_batch_size "default"
    return
  fi

  if ! probe_failed_with_oom; then
    log "sample batch size probe failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
    exit 1
  fi

  local candidate=8
  while [[ "$candidate" -ge 1 ]]; do
    log "retrying sample-eval probe with --sample_batch_size=$candidate"
    if run_sample_batch_probe "$candidate"; then
      commit_sample_batch_size "$candidate"
      return
    fi
    if ! probe_failed_with_oom; then
      log "sample batch size retry failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
      exit 1
    fi
    candidate=$((candidate / 2))
  done

  log "unable to find a viable sample batch size; see $SANITY_DIR/sample_batch_size_probe.log"
  exit 1
}
304
+
305
# Restore the sampling batch size chosen by a previous run (used when the
# probe is skipped via SKIP_SAMPLE_BATch_PROBE=1... see main). Falls back to
# "default" when the sanity file is missing OR empty, and rewrites the file
# so later steps always find a value.
load_saved_sample_batch_size() {
  local saved_value="default"
  if [[ -f "$SANITY_DIR/sample_batch_size_used.txt" ]]; then
    saved_value="$(<"$SANITY_DIR/sample_batch_size_used.txt")"
    # A truncated/empty file would otherwise yield a bogus
    # `--sample_batch_size ""` argument; treat it as the default.
    if [[ -z "$saved_value" ]]; then
      saved_value="default"
      echo "$saved_value" >"$SANITY_DIR/sample_batch_size_used.txt"
    fi
  else
    echo "$saved_value" >"$SANITY_DIR/sample_batch_size_used.txt"
  fi

  sample_batch_size_value="$saved_value"
  sample_batch_size_arg=()
  if [[ "$saved_value" != "default" ]]; then
    sample_batch_size_arg=(--sample_batch_size "$saved_value")
  fi
}
319
+
320
# Kick off one validation eval in the background on a dedicated GPU, and
# record its PID/label so wait_for_eval_jobs can reap and report on it.
# Arguments: $1 - gpu index, $2 - variant key, $3 - step label,
#            $4 - checkpoint dir, $5 - log file path
launch_eval_async() {
  local gpu_id="$1" variant="$2" step_label="$3" ckpt="$4" out_log="$5"
  CUDA_VISIBLE_DEVICES="$gpu_id" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$variant]}" \
    --checkpoint_dir "$ckpt" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$out_log" 2>&1 &
  eval_pids+=("$!")
  eval_labels+=("${MODEL_VARIANT[$variant]} step=${step_label} gpu=${gpu_id}")
}
341
+
342
# Run one validation eval in the foreground on the given GPU.
# Arguments: $1 - gpu index, $2 - variant key, $3 - step label (kept for
#            call-site symmetry with launch_eval_async), $4 - checkpoint dir,
#            $5 - log file path
run_eval_sync() {
  local gpu_id="$1" variant="$2" step_label="$3" ckpt="$4" out_log="$5"
  CUDA_VISIBLE_DEVICES="$gpu_id" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$variant]}" \
    --checkpoint_dir "$ckpt" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$out_log" 2>&1
}
361
+
362
# Reap every background eval job recorded in eval_pids, logging each
# failure with its label. The tracking arrays are always cleared; if any
# job failed the whole run aborts after all have been waited on.
wait_for_eval_jobs() {
  local any_failed=0
  local i
  for i in "${!eval_pids[@]}"; do
    wait "${eval_pids[$i]}" || {
      log "evaluation failed: ${eval_labels[$i]}"
      any_failed=1
    }
  done
  eval_pids=()
  eval_labels=()
  if [[ "$any_failed" -ne 0 ]]; then
    exit 1
  fi
}
377
+
378
# True when an eval log exists and ends with the sample-eval timing marker,
# i.e. the eval ran to completion and can be skipped on resume.
# The marker line embeds the sampling step count, so match the configured
# SAMPLE_NUM_STEPS (falling back to 16, the previously hard-coded value):
# with a literal 16 here, runs configured with a different step count would
# never recognize finished evals and would redo every sweep on resume.
# Arguments: $1 - eval log path
eval_log_complete() {
  local log_path="$1"
  [[ -f "$log_path" ]] && grep -q "^sample_eval_num_steps_${SAMPLE_NUM_STEPS:-16}_per_batch_timing_seconds:" "$log_path"
}
382
+
383
# Print the per-variant checkpoint root directories, one per line, in the
# fixed variant order used throughout the runner.
checkpoint_roots() {
  local variant
  for variant in shared head_only split_ind split_comm; do
    printf '%s\n' "$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  done
}
390
+
391
# One pruning pass: under every checkpoint root, delete purely numeric step
# directories except the retained evaluation steps (100/500/2000); tmp_*
# (in-progress saves) and non-numeric entries are left alone.
prune_checkpoint_roots_once() {
  local ckpt_root entry step_name
  while read -r ckpt_root; do
    [[ -d "$ckpt_root" ]] || continue
    for entry in "$ckpt_root"/*; do
      [[ -d "$entry" ]] || continue
      step_name="$(basename "$entry")"
      case "$step_name" in
        100 | 500 | 2000 | tmp_*)
          continue
          ;;
      esac
      if [[ "$step_name" =~ ^[0-9]+$ ]]; then
        rm -rf -- "$entry"
        pruner_log "pruned $entry"
      fi
    done
  done < <(checkpoint_roots)
}
408
+
409
# Start a detached subshell that prunes intermediate checkpoints every 30s,
# keeping only steps 100/500/2000 (and tmp_* in-progress saves).
# The subshell PID is stored in checkpoint_pruner_pid so that
# stop_checkpoint_pruner (installed as an EXIT trap in main) can kill it.
start_checkpoint_pruner() {
  pruner_log "runner checkpoint pruner started interval_s=30 keep_steps=[100,500,2000]"
  (
    while true; do
      prune_checkpoint_roots_once
      sleep 30
    done
  ) &
  checkpoint_pruner_pid="$!"
}
419
+
420
# Stop the background pruner loop, if one is running, and clear its PID.
# kill/wait failures are tolerated so this is safe to call from the EXIT
# trap even when the process has already died.
stop_checkpoint_pruner() {
  if [[ -z "$checkpoint_pruner_pid" ]]; then
    return 0
  fi
  kill "$checkpoint_pruner_pid" >/dev/null 2>&1 || true
  wait "$checkpoint_pruner_pid" 2>/dev/null || true
  checkpoint_pruner_pid=""
}
427
+
428
# Evaluate all four warm-start (step-0) checkpoints, one variant per GPU.
# Evals whose log already carries the completion marker are skipped; with
# PARALLEL_EVALS=1 (default) the remaining ones run concurrently, otherwise
# sequentially on their assigned GPUs.
run_step0_evals() {
  log "starting step-0 evaluation sweep"
  local launched_any=0
  local parallel="${PARALLEL_EVALS:-1}"
  local row gpu key ckpt log_path
  # Each row: <gpu> <variant-key> <checkpoint-dir> <log-path>
  local rows=(
    "0 shared ${STEP0_CKPT[shared]} $RUN_LOG_DIR/${EXP_NAME[shared]}_val_0.log"
    "1 head_only ${STEP0_CKPT[head_only]} $RUN_LOG_DIR/${EXP_NAME[head_only]}_val_0.log"
    "2 split_ind ${STEP0_CKPT[split_ind]} $RUN_LOG_DIR/${EXP_NAME[split_ind]}_val_0.log"
    "3 split_comm ${STEP0_CKPT[split_comm]} $RUN_LOG_DIR/${EXP_NAME[split_comm]}_val_0.log"
  )
  for row in "${rows[@]}"; do
    read -r gpu key ckpt log_path <<<"$row"
    if eval_log_complete "$log_path"; then
      log "step-0 eval already complete for ${MODEL_VARIANT[$key]}"
      continue
    fi
    if [[ "$parallel" == "1" ]]; then
      launch_eval_async "$gpu" "$key" 0 "$ckpt" "$log_path"
      launched_any=1
    else
      run_eval_sync "$gpu" "$key" 0 "$ckpt" "$log_path"
    fi
  done
  if [[ "$parallel" == "1" && "$launched_any" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished step-0 evaluation sweep"
}
454
+
455
# Train one model variant for 2000 steps on 4 GPUs via torch.distributed.run.
# With SKIP_COMPLETED_TRAIN=1 the run is skipped when the final step-2000
# checkpoint already exists, reusing the existing checkpoint root.
# Arguments: $1 - variant key (shared|head_only|split_ind|split_comm)
train_variant() {
  local variant="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  local train_log="$RUN_LOG_DIR/${EXP_NAME[$variant]}.log"
  if [[ "${SKIP_COMPLETED_TRAIN:-0}" == "1" && -d "$ckpt_root/2000" ]]; then
    log "training already complete for model_variant=${MODEL_VARIANT[$variant]}; skipping train and reusing $ckpt_root"
    return
  fi
  log "training start model_variant=${MODEL_VARIANT[$variant]} exp_name=${EXP_NAME[$variant]}"
  "$PYTHON_BIN" -m torch.distributed.run --standalone --nproc_per_node=4 scripts/train_pytorch.py \
    "${CONFIG_NAME[$variant]}" \
    --exp_name "${EXP_NAME[$variant]}" \
    --overwrite \
    --num_train_steps 2000 \
    --save_interval 100 \
    --log_interval 10 \
    >"$train_log" 2>&1
  log "training finished model_variant=${MODEL_VARIANT[$variant]}"
}
474
+
475
# Evaluate the 100/500/2000-step checkpoints of one trained variant, each on
# its own GPU. All three checkpoints must exist. Completed evals are skipped;
# with PARALLEL_EVALS=1 (default) the rest run concurrently.
# Arguments: $1 - variant key
run_post_train_evals() {
  local variant="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$variant]}/${EXP_NAME[$variant]}"
  require_dir "$ckpt_root/100"
  require_dir "$ckpt_root/500"
  require_dir "$ckpt_root/2000"

  log "starting post-train evaluation sweep for ${MODEL_VARIANT[$variant]}"
  local launched_any=0
  local parallel="${PARALLEL_EVALS:-1}"
  local row gpu step ckpt log_path
  # Each row: <gpu> <step> <checkpoint-dir> <log-path>
  local rows=(
    "0 100 $ckpt_root/100 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_100.log"
    "1 500 $ckpt_root/500 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_500.log"
    "2 2000 $ckpt_root/2000 $RUN_LOG_DIR/${EXP_NAME[$variant]}_val_2000.log"
  )
  for row in "${rows[@]}"; do
    read -r gpu step ckpt log_path <<<"$row"
    if eval_log_complete "$log_path"; then
      log "post-train eval already complete for ${MODEL_VARIANT[$variant]} step=$step"
      continue
    fi
    if [[ "$parallel" == "1" ]]; then
      launch_eval_async "$gpu" "$variant" "$step" "$ckpt" "$log_path"
      launched_any=1
    else
      run_eval_sync "$gpu" "$variant" "$step" "$ckpt" "$log_path"
    fi
  done
  if [[ "$parallel" == "1" && "$launched_any" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished post-train evaluation sweep for ${MODEL_VARIANT[$variant]}"
}
506
+
507
# Aggregate the eval logs under $ARTIFACT_ROOT into the step-comparison
# metrics, merging in metrics from the prior run at $PRIOR_METRICS_ROOT.
# Collector output is captured in $RUN_LOG_DIR/collect_metrics.log.
collect_metrics() {
  log "collecting step-comparison metrics"
  python -u scripts/collect_twin_dual_push_128_stepcmp_metrics.py \
    --artifact_root "$ARTIFACT_ROOT" \
    --prior_metrics_root "$PRIOR_METRICS_ROOT" \
    >"$RUN_LOG_DIR/collect_metrics.log" 2>&1
  log "metrics collection finished"
}
515
+
516
# Top-level pipeline: snapshot the environment, bootstrap warm-start
# checkpoints and norm stats, pick a sample-eval batch size, run step-0
# evals, then train + evaluate each selected variant, and finally collect
# metrics. SKIP_* env toggles and MODEL_KEYS support resuming / restricting
# a partially completed run.
main() {
  # Guarantee the background pruner is torn down on every exit path.
  trap stop_checkpoint_pruner EXIT
  log "packed dual-push 128 step comparison runner started"
  if [[ "${SKIP_ENV_SNAPSHOT:-0}" == "1" ]]; then
    log "skipping environment snapshot (SKIP_ENV_SNAPSHOT=1)"
  else
    save_environment_snapshot
  fi
  copy_repro_manifests
  ensure_bootstrap_checkpoints
  ensure_packed_dual_push_norm_stats
  start_checkpoint_pruner
  if [[ "${SKIP_SAMPLE_BATCH_PROBE:-0}" == "1" ]]; then
    log "skipping sample-eval batch-size probe (SKIP_SAMPLE_BATCH_PROBE=1)"
    # Reuse the batch size recorded by an earlier run instead of probing.
    load_saved_sample_batch_size
  else
    determine_sample_batch_size
  fi
  if [[ "${SKIP_STEP0_EVALS:-0}" == "1" ]]; then
    log "skipping step-0 evaluation sweep (SKIP_STEP0_EVALS=1)"
  else
    run_step0_evals
  fi

  # MODEL_KEYS (space-separated) may restrict the sweep to a subset of
  # variants; unknown keys abort before any training starts.
  local selected_keys="${MODEL_KEYS:-shared head_only split_ind split_comm}"
  local key
  for key in $selected_keys; do
    case "$key" in
      shared|head_only|split_ind|split_comm) ;;
      *)
        log "unknown model key in MODEL_KEYS: $key"
        exit 1
        ;;
    esac
    train_variant "$key"
    run_post_train_evals "$key"
  done

  collect_metrics
  log "packed dual-push 128 step comparison runner finished successfully"
}
557
+
558
+ main "$@"
run_logs/hf_upload_20260310.log ADDED
The diff for this file is too large to render. See raw diff