Add files using upload-large-folder tool
Browse files- README.md +74 -1
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md +51 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt +2 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt +3 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt +32 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt +23 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt +223 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt +1 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt +82 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt +2 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log +9 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log +11 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log +15 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log +16 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log +15 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt +6 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log +52 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt +1 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt +1 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt +223 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt +1 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt +1 -0
- artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt +1 -0
- openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt +3 -0
- openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors +3 -0
- openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt +3 -0
- openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt +3 -0
- openpi/run_logs/split_independent_real_smoke20.log +0 -0
- openpi/run_logs/split_independent_real_smoke3.log +104 -0
- openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py +913 -0
- openpi/scripts/prune_stepcmp_checkpoints.py +53 -0
- openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh +558 -0
- run_logs/hf_upload_20260310.log +0 -0
README.md
CHANGED
|
@@ -12,6 +12,7 @@ Three runs are included:
|
|
| 12 |
1. an initial `2K` baseline-vs-parallel comparison
|
| 13 |
2. a longer `10K` follow-up on the same packed setup
|
| 14 |
3. a `5K` dual-push `128` screening study on the same packed path
|
|
|
|
| 15 |
|
| 16 |
This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
|
| 17 |
|
|
@@ -57,6 +58,41 @@ Dual-push `128` screening results:
|
|
| 57 |
|
| 58 |
The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
## Warm-start note
|
| 61 |
|
| 62 |
The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
|
|
@@ -107,11 +143,43 @@ New bring-up artifact bundle:
|
|
| 107 |
- `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
|
| 108 |
- `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
|
| 109 |
- dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
- `artifacts/twin_split_expert_bringup_20260310/`
|
| 111 |
-
- split-expert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
- `artifacts/pi05_base_params/`
|
| 113 |
- staged base parameter snapshot used during JAX-to-PyTorch conversion
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
## Key files
|
| 116 |
|
| 117 |
- Full report: `REPORT.md`
|
|
@@ -122,6 +190,11 @@ New bring-up artifact bundle:
|
|
| 122 |
- dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
|
| 123 |
- dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
|
| 124 |
- dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
- split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
|
| 126 |
- split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
|
| 127 |
- split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
|
|
|
|
| 12 |
1. an initial `2K` baseline-vs-parallel comparison
|
| 13 |
2. a longer `10K` follow-up on the same packed setup
|
| 14 |
3. a `5K` dual-push `128` screening study on the same packed path
|
| 15 |
+
4. a `2K` dual-push `128` four-way step comparison across `shared`, `head_only_parallel`, `split_independent`, and `split_communicating`
|
| 16 |
|
| 17 |
This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
|
| 18 |
|
|
|
|
| 58 |
|
| 59 |
The dual-push screening run shows a small but consistent parallel edge at `1K`, `2K`, and `5K` on both teacher-forced validation loss and fixed-subset sample MAE.
|
| 60 |
|
| 61 |
+
Dual-push `128` four-way `2K` step comparison raw results:
|
| 62 |
+
|
| 63 |
+
Step-0 teacher-forced masked validation loss:
|
| 64 |
+
|
| 65 |
+
| Model | Step-0 val loss | Step-0 left/right imbalance |
|
| 66 |
+
| --- | ---: | ---: |
|
| 67 |
+
| Shared | `1.084735` | `0.505345` |
|
| 68 |
+
| Head-only parallel | `1.082985` | `0.501182` |
|
| 69 |
+
| Split independent | `1.328262` | `0.448843` |
|
| 70 |
+
| Split communicating | `1.783048` | `0.671085` |
|
| 71 |
+
|
| 72 |
+
Step-2000 teacher-forced masked validation loss:
|
| 73 |
+
|
| 74 |
+
| Model | Step-2000 val loss | Step-2000 left/right imbalance |
|
| 75 |
+
| --- | ---: | ---: |
|
| 76 |
+
| Shared | `0.055329` | `0.069564` |
|
| 77 |
+
| Head-only parallel | `0.055297` | `0.069380` |
|
| 78 |
+
| Split independent | `0.063537` | `0.092029` |
|
| 79 |
+
| Split communicating | `0.059952` | `0.080435` |
|
| 80 |
+
|
| 81 |
+
Step-2000 sample masked MAE:
|
| 82 |
+
|
| 83 |
+
| Model | 1-step MAE | 4-step MAE | 16-step MAE |
|
| 84 |
+
| --- | ---: | ---: | ---: |
|
| 85 |
+
| Shared | `0.087330` | `0.078164` | `0.085222` |
|
| 86 |
+
| Head-only parallel | `0.086764` | `0.078301` | `0.085272` |
|
| 87 |
+
| Split independent | `0.079100` | `0.070436` | `0.075281` |
|
| 88 |
+
| Split communicating | `0.078618` | `0.071087` | `0.075570` |
|
| 89 |
+
|
| 90 |
+
Full raw tables for the `0/100/500/2000` sweep live in:
|
| 91 |
+
|
| 92 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
|
| 93 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
|
| 94 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
|
| 95 |
+
|
| 96 |
## Warm-start note
|
| 97 |
|
| 98 |
The packed parallel warm-start uses the slice/fuse mapping implemented in `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`, but the added step-0 numerical checks show it is not exactly identical end-to-end on a real batch:
|
|
|
|
| 143 |
- `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
|
| 144 |
- `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
|
| 145 |
- dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
|
| 146 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
|
| 147 |
+
- dual-push `128` four-way `2K` step-comparison bundle with metrics, logs, repro manifests, and environment snapshot
|
| 148 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
|
| 149 |
+
- small preflight/debug snapshot from the interrupted bring-up path; useful for debugging the runner, not the canonical result bundle
|
| 150 |
+
- `artifacts/twin_split_expert_bringup_20260310/`
|
| 151 |
+
- split-expert bring-up bundle committed with summary README, repro commands, detached run logs, and sanity checks
|
| 152 |
+
|
| 153 |
+
## Committed artifact note
|
| 154 |
+
|
| 155 |
+
For this update, the committed artifact payloads are:
|
| 156 |
+
|
| 157 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311/`
|
| 158 |
+
- the official finalized `4`-model dual-push `2K` step-comparison bundle
|
| 159 |
- `artifacts/twin_split_expert_bringup_20260310/`
|
| 160 |
+
- the split-expert bring-up bundle used as the sanity and warm-start reference
|
| 161 |
+
- `artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/`
|
| 162 |
+
- a small debug-only environment snapshot from the failed/resumed bring-up sequence
|
| 163 |
+
|
| 164 |
+
The debug bundle is intentionally committed only as runner diagnostics. The canonical study outputs are the non-`_debug` step-comparison bundle plus the split bring-up bundle.
|
| 165 |
+
- `openpi/run_logs/`
|
| 166 |
+
- raw local split bring-up logs kept for completeness; the canonical copies for the finalized bring-up record live under `artifacts/twin_split_expert_bringup_20260310/run_logs/`
|
| 167 |
+
- `openpi/scripts/upload_stepcmp_bundle_to_hf.py`
|
| 168 |
+
- the committed high-throughput HF uploader for the step-comparison bundle and retained checkpoints; it uses `huggingface_hub.HfApi.upload_large_folder(...)`
|
| 169 |
- `artifacts/pi05_base_params/`
|
| 170 |
- staged base parameter snapshot used during JAX-to-PyTorch conversion
|
| 171 |
|
| 172 |
+
## Future commit/upload workflow
|
| 173 |
+
|
| 174 |
+
When adding new experiment results to this repo:
|
| 175 |
+
|
| 176 |
+
- keep the canonical bundle under `artifacts/<study_name>/` and only retain the checkpoint steps that are scientifically required under `openpi/checkpoints/`
|
| 177 |
+
- before claiming the repo is fully committed, audit ignored artifact paths explicitly:
|
| 178 |
+
- `git ls-files --others -i --exclude-standard --directory -- openpi/checkpoints artifacts openpi/run_logs run_logs`
|
| 179 |
+
- if a result is intentionally kept in an ignored path such as `openpi/checkpoints/` or `openpi/run_logs/`, force-add it explicitly with `git add --sparse -f ...`
|
| 180 |
+
- use `openpi/scripts/upload_stepcmp_bundle_to_hf.py` for large HF uploads; it uses `huggingface_hub.HfApi.upload_large_folder(...)` and is the preferred path for checkpoint-heavy updates
|
| 181 |
+
- never hardcode HF credentials in scripts, logs, or READMEs; keep the credential in `HF_TOKEN` or load it from `HF_TOKEN_FILE`, and check for literal `hf_...` strings before committing
|
| 182 |
+
|
| 183 |
## Key files
|
| 184 |
|
| 185 |
- Full report: `REPORT.md`
|
|
|
|
| 190 |
- dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
|
| 191 |
- dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
|
| 192 |
- dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
|
| 193 |
+
- dual-push `2K` step-comparison summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/summary.json`
|
| 194 |
+
- dual-push `2K` step-comparison README: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md`
|
| 195 |
+
- dual-push `2K` teacher-forced table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/teacher_forced_eval_table.csv`
|
| 196 |
+
- dual-push `2K` sample eval table: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/sample_eval_table.csv`
|
| 197 |
+
- dual-push `2K` training summary: `artifacts/twin_dual_push_128_stepcmp_2k_20260311/metrics/training_summary.csv`
|
| 198 |
- split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
|
| 199 |
- split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
|
| 200 |
- split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# twin_dual_push_128_stepcmp_2k_20260311
|
| 2 |
+
|
| 3 |
+
Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.
|
| 4 |
+
|
| 5 |
+
## Quick answers
|
| 6 |
+
- Smallest step-0 teacher-forced jump vs `shared`: `head_only_parallel` (`-0.001750`).
|
| 7 |
+
- Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `head_only_parallel` (`+0.000005`).
|
| 8 |
+
- Best teacher-forced result by step `2000`: `head_only_parallel`.
|
| 9 |
+
- Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `split_communicating`.
|
| 10 |
+
- Split vs head-only by step `2000`: teacher-forced beat flags `split_independent=False`, `split_communicating=False`; sample beat flags `split_independent=True`, `split_communicating=True`.
|
| 11 |
+
- `split_communicating` vs `split_independent` at `2000`: teacher delta `-0.003585`, sample-average delta `-0.000257`.
|
| 12 |
+
|
| 13 |
+
## Step-0 teacher-forced comparison
|
| 14 |
+
| model | mean_val_loss | delta_vs_shared | left_right_imbalance |
|
| 15 |
+
| --- | --- | --- | --- |
|
| 16 |
+
| shared | 1.084735 | +0.000000 | 0.505345 |
|
| 17 |
+
| head_only_parallel | 1.082985 | -0.001750 | 0.501182 |
|
| 18 |
+
| split_independent | 1.328262 | +0.243527 | 0.448843 |
|
| 19 |
+
| split_communicating | 1.783048 | +0.698313 | 0.671085 |
|
| 20 |
+
|
| 21 |
+
## Step-2000 comparison
|
| 22 |
+
| model | mean_val_loss | 0_to_2000_improvement | left_right_imbalance |
|
| 23 |
+
| --- | --- | --- | --- |
|
| 24 |
+
| shared | 0.055329 | 1.029406 | 0.069564 |
|
| 25 |
+
| head_only_parallel | 0.055297 | 1.027688 | 0.069380 |
|
| 26 |
+
| split_independent | 0.063537 | 1.264725 | 0.092029 |
|
| 27 |
+
| split_communicating | 0.059952 | 1.723096 | 0.080435 |
|
| 28 |
+
|
| 29 |
+
| model | 1-step_mae | 4-step_mae | 16-step_mae |
|
| 30 |
+
| --- | --- | --- | --- |
|
| 31 |
+
| shared | 0.087330 | 0.078164 | 0.085222 |
|
| 32 |
+
| head_only_parallel | 0.086764 | 0.078301 | 0.085272 |
|
| 33 |
+
| split_independent | 0.079100 | 0.070436 | 0.075281 |
|
| 34 |
+
| split_communicating | 0.078618 | 0.071087 | 0.075570 |
|
| 35 |
+
|
| 36 |
+
## Stability notes
|
| 37 |
+
- Sample batch size used for all official evals: `16`.
|
| 38 |
+
- Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.
|
| 39 |
+
- Peak training VRAM by model: shared=35.23GB, head_only_parallel=35.27GB, split_independent=41.73GB, split_communicating=41.73GB.
|
| 40 |
+
- `split_communicating` communication path: active=`True`, `grad_cross_arm_comm_max=0.394700`, `attention_mass_mean=0.009074`, `gate_abs_max=0.003900`.
|
| 41 |
+
|
| 42 |
+
## Regression check vs prior dual-push screen
|
| 43 |
+
- Prior `5K` study at step `2000` had `baseline=0.083194` and `parallel=0.082729` with head-only edge `+0.000465`. This rerun has `shared=0.055329` and `head_only_parallel=0.055297` with head-only edge `+0.000032`; direction match=`True`.
|
| 44 |
+
- Prior `5K` study `4`-step MAE at step `2000` had `baseline=0.069732` and `parallel=0.069053` with head-only edge `+0.000679`. This rerun has `shared=0.078164` and `head_only_parallel=0.078301` with head-only edge `-0.000137`; direction match=`False`.
|
| 45 |
+
|
| 46 |
+
## Files
|
| 47 |
+
- `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.
|
| 48 |
+
- `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.
|
| 49 |
+
- `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.
|
| 50 |
+
- `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.
|
| 51 |
+
- `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/df_workspace.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Filesystem Size Used Avail Use% Mounted on
|
| 2 |
+
mfs#us-mo-1.runpod.net:9421 154T 129T 25T 84% /workspace
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/env_selected.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PYTHONPATH=/workspace/pi05tests/openpi/src
|
| 2 |
+
TOKENIZERS_PARALLELISM=false
|
| 3 |
+
XDG_CACHE_HOME=/workspace/.cache
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi.txt
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wed Mar 11 23:32:47 2026
|
| 2 |
+
+-----------------------------------------------------------------------------------------+
|
| 3 |
+
| NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 |
|
| 4 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 5 |
+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 6 |
+
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 7 |
+
| | | MIG M. |
|
| 8 |
+
|=========================================+========================+======================|
|
| 9 |
+
| 0 NVIDIA H100 80GB HBM3 On | 00000000:2A:00.0 Off | 0 |
|
| 10 |
+
| N/A 30C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
|
| 11 |
+
| | | Disabled |
|
| 12 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 13 |
+
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 |
|
| 14 |
+
| N/A 29C P0 71W / 700W | 0MiB / 81559MiB | 0% Default |
|
| 15 |
+
| | | Disabled |
|
| 16 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 17 |
+
| 2 NVIDIA H100 80GB HBM3 On | 00000000:9A:00.0 Off | 0 |
|
| 18 |
+
| N/A 28C P0 69W / 700W | 0MiB / 81559MiB | 0% Default |
|
| 19 |
+
| | | Disabled |
|
| 20 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 21 |
+
| 3 NVIDIA H100 80GB HBM3 On | 00000000:AB:00.0 Off | 0 |
|
| 22 |
+
| N/A 29C P0 74W / 700W | 0MiB / 81559MiB | 0% Default |
|
| 23 |
+
| | | Disabled |
|
| 24 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 25 |
+
|
| 26 |
+
+-----------------------------------------------------------------------------------------+
|
| 27 |
+
| Processes: |
|
| 28 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 29 |
+
| ID ID Usage |
|
| 30 |
+
|=========================================================================================|
|
| 31 |
+
| No running processes found |
|
| 32 |
+
+-----------------------------------------------------------------------------------------+
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/nvidia_smi_topo.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[4mGPU0 GPU1 GPU2 GPU3 NIC0 NIC1 CPU Affinity NUMA Affinity GPU NUMA ID[0m
|
| 2 |
+
GPU0 X NV18 NV18 NV18 NODE NODE 0-51,104-155 0 N/A
|
| 3 |
+
GPU1 NV18 X NV18 NV18 NODE NODE 0-51,104-155 0 N/A
|
| 4 |
+
GPU2 NV18 NV18 X NV18 SYS SYS 52-103,156-207 1 N/A
|
| 5 |
+
GPU3 NV18 NV18 NV18 X SYS SYS 52-103,156-207 1 N/A
|
| 6 |
+
NIC0 NODE NODE SYS SYS X PIX
|
| 7 |
+
NIC1 NODE NODE SYS SYS PIX X
|
| 8 |
+
|
| 9 |
+
Legend:
|
| 10 |
+
|
| 11 |
+
X = Self
|
| 12 |
+
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
|
| 13 |
+
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
|
| 14 |
+
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
| 15 |
+
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
| 16 |
+
PIX = Connection traversing at most a single PCIe bridge
|
| 17 |
+
NV# = Connection traversing a bonded set of # NVLinks
|
| 18 |
+
|
| 19 |
+
NIC Legend:
|
| 20 |
+
|
| 21 |
+
NIC0: mlx5_3
|
| 22 |
+
NIC1: mlx5_4
|
| 23 |
+
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/pip_freeze.txt
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.4.0
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.13.3
|
| 4 |
+
aiosignal==1.4.0
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
antlr4-python3-runtime==4.9.3
|
| 7 |
+
anyio==4.6.0
|
| 8 |
+
argon2-cffi==23.1.0
|
| 9 |
+
argon2-cffi-bindings==21.2.0
|
| 10 |
+
arrow==1.3.0
|
| 11 |
+
asttokens==2.4.1
|
| 12 |
+
async-lru==2.0.4
|
| 13 |
+
attrs==25.4.0
|
| 14 |
+
augmax==0.4.1
|
| 15 |
+
av==16.1.0
|
| 16 |
+
babel==2.16.0
|
| 17 |
+
beartype==0.19.0
|
| 18 |
+
beautifulsoup4==4.12.3
|
| 19 |
+
bleach==6.1.0
|
| 20 |
+
certifi==2026.2.25
|
| 21 |
+
cffi==2.0.0
|
| 22 |
+
charset-normalizer==3.4.5
|
| 23 |
+
comm==0.2.2
|
| 24 |
+
cryptography==46.0.5
|
| 25 |
+
datasets==4.7.0
|
| 26 |
+
debugpy==1.8.5
|
| 27 |
+
decorator==5.1.1
|
| 28 |
+
deepdiff==8.6.1
|
| 29 |
+
defusedxml==0.7.1
|
| 30 |
+
dill==0.4.0
|
| 31 |
+
dm-tree==0.1.9
|
| 32 |
+
docstring_parser==0.17.0
|
| 33 |
+
draccus==0.10.0
|
| 34 |
+
einops==0.8.2
|
| 35 |
+
entrypoints==0.4
|
| 36 |
+
equinox==0.13.6
|
| 37 |
+
etils==1.14.0
|
| 38 |
+
executing==2.1.0
|
| 39 |
+
fastjsonschema==2.20.0
|
| 40 |
+
filelock==3.25.1
|
| 41 |
+
flatbuffers==25.12.19
|
| 42 |
+
flax==0.10.2
|
| 43 |
+
fqdn==1.5.1
|
| 44 |
+
frozenlist==1.8.0
|
| 45 |
+
fsspec==2026.2.0
|
| 46 |
+
gcsfs==2026.2.0
|
| 47 |
+
google-api-core==2.30.0
|
| 48 |
+
google-auth==2.49.0
|
| 49 |
+
google-auth-oauthlib==1.3.0
|
| 50 |
+
google-cloud-core==2.5.0
|
| 51 |
+
google-cloud-storage==3.9.0
|
| 52 |
+
google-cloud-storage-control==1.10.0
|
| 53 |
+
google-crc32c==1.8.0
|
| 54 |
+
google-resumable-media==2.8.0
|
| 55 |
+
googleapis-common-protos==1.73.0
|
| 56 |
+
grpc-google-iam-v1==0.14.3
|
| 57 |
+
grpcio==1.78.0
|
| 58 |
+
grpcio-status==1.78.0
|
| 59 |
+
h11==0.14.0
|
| 60 |
+
hf-xet==1.3.2
|
| 61 |
+
httpcore==1.0.5
|
| 62 |
+
httpx==0.27.2
|
| 63 |
+
huggingface_hub==0.36.2
|
| 64 |
+
humanize==4.15.0
|
| 65 |
+
idna==3.11
|
| 66 |
+
ImageIO==2.37.3
|
| 67 |
+
ipykernel==6.29.5
|
| 68 |
+
ipython==8.27.0
|
| 69 |
+
ipython-genutils==0.2.0
|
| 70 |
+
ipywidgets==8.1.5
|
| 71 |
+
isoduration==20.11.0
|
| 72 |
+
jax==0.5.3
|
| 73 |
+
jaxlib==0.5.3
|
| 74 |
+
jaxtyping==0.2.36
|
| 75 |
+
jedi==0.19.1
|
| 76 |
+
Jinja2==3.1.3
|
| 77 |
+
json5==0.9.25
|
| 78 |
+
jsonlines==4.0.0
|
| 79 |
+
jsonpointer==3.0.0
|
| 80 |
+
jsonschema==4.23.0
|
| 81 |
+
jsonschema-specifications==2023.12.1
|
| 82 |
+
jupyter-archive==3.4.0
|
| 83 |
+
jupyter-events==0.10.0
|
| 84 |
+
jupyter-highlight-selected-word==0.2.0
|
| 85 |
+
jupyter-lsp==2.2.5
|
| 86 |
+
jupyter_client==7.4.9
|
| 87 |
+
jupyter_contrib_core==0.4.2
|
| 88 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 89 |
+
jupyter_core==5.7.2
|
| 90 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 91 |
+
jupyter_server==2.14.2
|
| 92 |
+
jupyter_server_terminals==0.5.3
|
| 93 |
+
jupyterlab==4.2.5
|
| 94 |
+
jupyterlab_pygments==0.3.0
|
| 95 |
+
jupyterlab_server==2.27.3
|
| 96 |
+
jupyterlab_widgets==3.0.13
|
| 97 |
+
lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
|
| 98 |
+
lxml==5.3.0
|
| 99 |
+
markdown-it-py==4.0.0
|
| 100 |
+
MarkupSafe==2.1.5
|
| 101 |
+
matplotlib-inline==0.1.7
|
| 102 |
+
mdurl==0.1.2
|
| 103 |
+
mergedeep==1.3.4
|
| 104 |
+
mistune==3.0.2
|
| 105 |
+
ml_collections==1.0.0
|
| 106 |
+
ml_dtypes==0.5.4
|
| 107 |
+
mpmath==1.3.0
|
| 108 |
+
msgpack==1.1.2
|
| 109 |
+
multidict==6.7.1
|
| 110 |
+
multiprocess==0.70.18
|
| 111 |
+
mypy_extensions==1.1.0
|
| 112 |
+
nbclassic==1.1.0
|
| 113 |
+
nbclient==0.10.0
|
| 114 |
+
nbconvert==7.16.4
|
| 115 |
+
nbformat==5.10.4
|
| 116 |
+
nest-asyncio==1.6.0
|
| 117 |
+
networkx==3.2.1
|
| 118 |
+
notebook==6.5.5
|
| 119 |
+
notebook_shim==0.2.4
|
| 120 |
+
numpy==1.26.4
|
| 121 |
+
numpydantic==1.8.0
|
| 122 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 123 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 124 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 125 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 126 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 127 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 128 |
+
nvidia-curand-cu12==10.3.5.119
|
| 129 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 130 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 131 |
+
nvidia-nccl-cu12==2.20.5
|
| 132 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 133 |
+
nvidia-nvtx-cu12==12.4.99
|
| 134 |
+
oauthlib==3.3.1
|
| 135 |
+
omegaconf==2.3.0
|
| 136 |
+
opencv-python==4.11.0.86
|
| 137 |
+
openpi-client==0.1.1
|
| 138 |
+
opt_einsum==3.4.0
|
| 139 |
+
optax==0.2.7
|
| 140 |
+
orbax-checkpoint==0.11.13
|
| 141 |
+
orderly-set==5.5.0
|
| 142 |
+
overrides==7.7.0
|
| 143 |
+
packaging==26.0
|
| 144 |
+
pandas==3.0.1
|
| 145 |
+
pandocfilters==1.5.1
|
| 146 |
+
parso==0.8.4
|
| 147 |
+
pexpect==4.9.0
|
| 148 |
+
pillow==12.1.1
|
| 149 |
+
platformdirs==4.3.6
|
| 150 |
+
prometheus_client==0.21.0
|
| 151 |
+
prompt_toolkit==3.0.47
|
| 152 |
+
propcache==0.4.1
|
| 153 |
+
proto-plus==1.27.1
|
| 154 |
+
protobuf==6.33.5
|
| 155 |
+
psutil==6.0.0
|
| 156 |
+
ptyprocess==0.7.0
|
| 157 |
+
pure_eval==0.2.3
|
| 158 |
+
pyarrow==23.0.1
|
| 159 |
+
pyasn1==0.6.2
|
| 160 |
+
pyasn1_modules==0.4.2
|
| 161 |
+
pycparser==2.22
|
| 162 |
+
pydantic==2.12.5
|
| 163 |
+
pydantic_core==2.41.5
|
| 164 |
+
Pygments==2.19.2
|
| 165 |
+
python-dateutil==2.9.0.post0
|
| 166 |
+
python-json-logger==2.0.7
|
| 167 |
+
PyYAML==6.0.3
|
| 168 |
+
pyyaml-include==1.4.1
|
| 169 |
+
pyzmq==24.0.1
|
| 170 |
+
referencing==0.35.1
|
| 171 |
+
regex==2026.2.28
|
| 172 |
+
requests==2.32.5
|
| 173 |
+
requests-oauthlib==2.0.0
|
| 174 |
+
rfc3339-validator==0.1.4
|
| 175 |
+
rfc3986-validator==0.1.1
|
| 176 |
+
rich==14.3.3
|
| 177 |
+
rpds-py==0.20.0
|
| 178 |
+
rsa==4.9.1
|
| 179 |
+
safetensors==0.7.0
|
| 180 |
+
scipy==1.17.1
|
| 181 |
+
Send2Trash==1.8.3
|
| 182 |
+
sentencepiece==0.2.1
|
| 183 |
+
simplejson==3.20.2
|
| 184 |
+
six==1.17.0
|
| 185 |
+
sniffio==1.3.1
|
| 186 |
+
soupsieve==2.6
|
| 187 |
+
stack-data==0.6.3
|
| 188 |
+
sympy==1.12
|
| 189 |
+
tensorstore==0.1.81
|
| 190 |
+
termcolor==3.3.0
|
| 191 |
+
terminado==0.18.1
|
| 192 |
+
tinycss2==1.3.0
|
| 193 |
+
tokenizers==0.21.4
|
| 194 |
+
toml==0.10.2
|
| 195 |
+
torch==2.4.1+cu124
|
| 196 |
+
torchaudio==2.4.1+cu124
|
| 197 |
+
torchvision==0.19.1+cu124
|
| 198 |
+
tornado==6.4.1
|
| 199 |
+
tqdm==4.67.3
|
| 200 |
+
tqdm-loggable==0.3
|
| 201 |
+
traitlets==5.14.3
|
| 202 |
+
transformers==4.53.2
|
| 203 |
+
treescope==0.1.10
|
| 204 |
+
triton==3.0.0
|
| 205 |
+
typeguard==4.5.1
|
| 206 |
+
types-python-dateutil==2.9.0.20240906
|
| 207 |
+
typing-inspect==0.9.0
|
| 208 |
+
typing-inspection==0.4.2
|
| 209 |
+
typing_extensions==4.15.0
|
| 210 |
+
tyro==1.0.8
|
| 211 |
+
uri-template==1.3.0
|
| 212 |
+
urllib3==2.6.3
|
| 213 |
+
wadler_lindig==0.1.7
|
| 214 |
+
wcwidth==0.2.13
|
| 215 |
+
webcolors==24.8.0
|
| 216 |
+
webencodings==0.5.1
|
| 217 |
+
websocket-client==1.8.0
|
| 218 |
+
websockets==16.0
|
| 219 |
+
widgetsnbextension==4.0.13
|
| 220 |
+
wrapt==2.1.2
|
| 221 |
+
xxhash==3.6.0
|
| 222 |
+
yarl==1.23.0
|
| 223 |
+
zipp==3.23.0
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/python_version.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Python 3.11.10
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/environment/torch_env.txt
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
|
| 2 |
+
Collecting environment information...
|
| 3 |
+
PyTorch version: 2.4.1+cu124
|
| 4 |
+
Is debug build: False
|
| 5 |
+
CUDA used to build PyTorch: 12.4
|
| 6 |
+
ROCM used to build PyTorch: N/A
|
| 7 |
+
|
| 8 |
+
OS: Ubuntu 22.04.5 LTS (x86_64)
|
| 9 |
+
GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
|
| 10 |
+
Clang version: Could not collect
|
| 11 |
+
CMake version: Could not collect
|
| 12 |
+
Libc version: glibc-2.35
|
| 13 |
+
|
| 14 |
+
Python version: 3.11.10 (main, Sep 7 2024, 18:35:41) [GCC 11.4.0] (64-bit runtime)
|
| 15 |
+
Python platform: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
|
| 16 |
+
Is CUDA available: True
|
| 17 |
+
CUDA runtime version: 12.4.131
|
| 18 |
+
CUDA_MODULE_LOADING set to: LAZY
|
| 19 |
+
GPU models and configuration:
|
| 20 |
+
GPU 0: NVIDIA H100 80GB HBM3
|
| 21 |
+
GPU 1: NVIDIA H100 80GB HBM3
|
| 22 |
+
GPU 2: NVIDIA H100 80GB HBM3
|
| 23 |
+
GPU 3: NVIDIA H100 80GB HBM3
|
| 24 |
+
|
| 25 |
+
Nvidia driver version: 580.126.09
|
| 26 |
+
cuDNN version: Could not collect
|
| 27 |
+
HIP runtime version: N/A
|
| 28 |
+
MIOpen runtime version: N/A
|
| 29 |
+
Is XNNPACK available: True
|
| 30 |
+
|
| 31 |
+
CPU:
|
| 32 |
+
Architecture: x86_64
|
| 33 |
+
CPU op-mode(s): 32-bit, 64-bit
|
| 34 |
+
Address sizes: 46 bits physical, 57 bits virtual
|
| 35 |
+
Byte Order: Little Endian
|
| 36 |
+
CPU(s): 208
|
| 37 |
+
On-line CPU(s) list: 0-207
|
| 38 |
+
Vendor ID: GenuineIntel
|
| 39 |
+
Model name: Intel(R) Xeon(R) Platinum 8470
|
| 40 |
+
CPU family: 6
|
| 41 |
+
Model: 143
|
| 42 |
+
Thread(s) per core: 2
|
| 43 |
+
Core(s) per socket: 52
|
| 44 |
+
Socket(s): 2
|
| 45 |
+
Stepping: 8
|
| 46 |
+
CPU max MHz: 3800.0000
|
| 47 |
+
CPU min MHz: 800.0000
|
| 48 |
+
BogoMIPS: 4000.00
|
| 49 |
+
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect user_shstk avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts vnmi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities ibpb_exit_to_user
|
| 50 |
+
Virtualization: VT-x
|
| 51 |
+
L1d cache: 4.9 MiB (104 instances)
|
| 52 |
+
L1i cache: 3.3 MiB (104 instances)
|
| 53 |
+
L2 cache: 208 MiB (104 instances)
|
| 54 |
+
L3 cache: 210 MiB (2 instances)
|
| 55 |
+
NUMA node(s): 2
|
| 56 |
+
NUMA node0 CPU(s): 0-51,104-155
|
| 57 |
+
NUMA node1 CPU(s): 52-103,156-207
|
| 58 |
+
Vulnerability Gather data sampling: Not affected
|
| 59 |
+
Vulnerability Itlb multihit: Not affected
|
| 60 |
+
Vulnerability L1tf: Not affected
|
| 61 |
+
Vulnerability Mds: Not affected
|
| 62 |
+
Vulnerability Meltdown: Not affected
|
| 63 |
+
Vulnerability Mmio stale data: Not affected
|
| 64 |
+
Vulnerability Reg file data sampling: Not affected
|
| 65 |
+
Vulnerability Retbleed: Not affected
|
| 66 |
+
Vulnerability Spec rstack overflow: Not affected
|
| 67 |
+
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
|
| 68 |
+
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
|
| 69 |
+
Vulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; RSB filling; PBRSB-eIBRS SW sequence; BHI BHI_DIS_S
|
| 70 |
+
Vulnerability Srbds: Not affected
|
| 71 |
+
Vulnerability Tsx async abort: Not affected
|
| 72 |
+
Vulnerability Vmscape: Mitigation; IBPB before exit to userspace
|
| 73 |
+
|
| 74 |
+
Versions of relevant libraries:
|
| 75 |
+
[pip3] mypy_extensions==1.1.0
|
| 76 |
+
[pip3] numpy==1.26.4
|
| 77 |
+
[pip3] numpydantic==1.8.0
|
| 78 |
+
[pip3] torch==2.4.1+cu124
|
| 79 |
+
[pip3] torchaudio==2.4.1+cu124
|
| 80 |
+
[pip3] torchvision==0.19.1+cu124
|
| 81 |
+
[pip3] triton==3.0.0
|
| 82 |
+
[conda] Could not collect
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/bootstrap_regeneration_status.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
regenerated_any=0
|
| 2 |
+
regenerated_split=0
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_communicating_invariants.log
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
|
| 2 |
+
checkpoint_dir: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
|
| 3 |
+
action_expert_mode: split_communicating
|
| 4 |
+
weight_loading_missing_keys: []
|
| 5 |
+
weight_loading_unexpected_keys: []
|
| 6 |
+
identical_branch_suffix_max_abs_diff: 0.00000000
|
| 7 |
+
identical_branch_suffix_match: True
|
| 8 |
+
left_branch_invariance_max_abs_diff: skipped_for_split_communicating
|
| 9 |
+
right_branch_invariance_max_abs_diff: skipped_for_split_communicating
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/check_split_independent_invariants.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
|
| 2 |
+
checkpoint_dir: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
|
| 3 |
+
action_expert_mode: split_independent
|
| 4 |
+
weight_loading_missing_keys: []
|
| 5 |
+
weight_loading_unexpected_keys: []
|
| 6 |
+
identical_branch_suffix_max_abs_diff: 0.00000000
|
| 7 |
+
identical_branch_suffix_match: True
|
| 8 |
+
left_branch_invariance_max_abs_diff: 0.00000000
|
| 9 |
+
right_branch_invariance_max_abs_diff: 0.00000000
|
| 10 |
+
left_branch_invariant: True
|
| 11 |
+
right_branch_invariant: True
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_head_only.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config_name: pi05_twin_dual_push_128_packed_parallel_pytorch_5k
|
| 2 |
+
action_expert_mode: head_only_parallel
|
| 3 |
+
single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
|
| 4 |
+
output_path: /workspace/checkpoints/pi05_base_parallel_packed_from_single
|
| 5 |
+
load_state_missing_keys_count: 11
|
| 6 |
+
load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'arm_token_fuse.weight', 'arm_token_fuse.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
|
| 7 |
+
load_state_unexpected_keys_count: 4
|
| 8 |
+
load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight']
|
| 9 |
+
input_projection_max_abs_diff: 9.5367431640625e-07
|
| 10 |
+
left_input_projection_max_abs_diff: 0.0
|
| 11 |
+
left_output_projection_max_abs_diff: 0.0
|
| 12 |
+
output_projection_max_abs_diff: 8.344650268554688e-07
|
| 13 |
+
right_input_projection_max_abs_diff: 0.0
|
| 14 |
+
right_output_projection_max_abs_diff: 0.0
|
| 15 |
+
warm_start_exact: False
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_communicating.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
|
| 2 |
+
action_expert_mode: split_communicating
|
| 3 |
+
single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
|
| 4 |
+
output_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
|
| 5 |
+
load_state_missing_keys_count: 412
|
| 6 |
+
load_state_missing_keys: ['paligemma_with_expert.cross_arm_comm', 'paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
|
| 7 |
+
load_state_unexpected_keys_count: 205
|
| 8 |
+
load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
|
| 9 |
+
cross_arm_comm_init: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
| 10 |
+
left_expert_max_abs_diff: 0.0
|
| 11 |
+
left_input_projection_max_abs_diff: 0.0
|
| 12 |
+
left_output_projection_max_abs_diff: 0.0
|
| 13 |
+
right_expert_max_abs_diff: 0.0
|
| 14 |
+
right_input_projection_max_abs_diff: 0.0
|
| 15 |
+
right_output_projection_max_abs_diff: 0.0
|
| 16 |
+
warm_start_exact: True
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/init_split_independent.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config_name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k
|
| 2 |
+
action_expert_mode: split_independent
|
| 3 |
+
single_ckpt: /workspace/checkpoints/pi05_base_single_pytorch
|
| 4 |
+
output_path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single
|
| 5 |
+
load_state_missing_keys_count: 411
|
| 6 |
+
load_state_missing_keys: ['paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight', 
'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.left_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.left_gemma_expert.lm_head.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 
'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight', 
'paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.weight', 'paligemma_with_expert.right_gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.right_gemma_expert.lm_head.weight', 'action_in_proj_arms.0.weight', 'action_in_proj_arms.0.bias', 'action_in_proj_arms.1.weight', 'action_in_proj_arms.1.bias', 'action_out_proj_arms.0.weight', 'action_out_proj_arms.0.bias', 'action_out_proj_arms.1.weight', 'action_out_proj_arms.1.bias']
|
| 7 |
+
load_state_unexpected_keys_count: 205
|
| 8 |
+
load_state_unexpected_keys: ['action_in_proj.bias', 'action_in_proj.weight', 'action_out_proj.bias', 'action_out_proj.weight', 'paligemma_with_expert.gemma_expert.lm_head.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight', 
'paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias', 'paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight', 
'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight', 'paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight', 'paligemma_with_expert.gemma_expert.model.norm.dense.bias', 'paligemma_with_expert.gemma_expert.model.norm.dense.weight']
|
| 9 |
+
left_expert_max_abs_diff: 0.0
|
| 10 |
+
left_input_projection_max_abs_diff: 0.0
|
| 11 |
+
left_output_projection_max_abs_diff: 0.0
|
| 12 |
+
right_expert_max_abs_diff: 0.0
|
| 13 |
+
right_input_projection_max_abs_diff: 0.0
|
| 14 |
+
right_output_projection_max_abs_diff: 0.0
|
| 15 |
+
warm_start_exact: True
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/norm_stats_status.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
canonical_source=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json
|
| 2 |
+
canonical_sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
|
| 3 |
+
shared=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_baseline_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
|
| 4 |
+
head_only_parallel=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_parallel_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
|
| 5 |
+
split_independent=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
|
| 6 |
+
split_communicating=/workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json sha256=9dc7c777a62f956cca6799726a34596ba3774cda0d9795cc068dbd47222d86a5
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_probe.log
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
starting_eval config=pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k checkpoint=/workspace/checkpoints/pi05_base_split_communicating_packed_from_single repo_id=lsnu/twin_dual_push_128_val
|
| 2 |
+
eval_loader batch_size=16 num_batches=1 num_workers=0
|
| 3 |
+
teacher_forced_eval_seed: 123
|
| 4 |
+
sample_eval enabled=True batch_size=16 num_batches=1 num_steps=[16] seed=321
|
| 5 |
+
WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
|
| 6 |
+
weight_loading missing=0 unexpected=0 device=cuda:0
|
| 7 |
+
eval_batch=1 loss=2.309001 left_arm_loss=1.740481 right_arm_loss=2.877522 imbalance=1.137041 batch_time_s=0.6439
|
| 8 |
+
config_name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k
|
| 9 |
+
checkpoint_path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single
|
| 10 |
+
repo_id_used: lsnu/twin_dual_push_128_val
|
| 11 |
+
num_batches: 1
|
| 12 |
+
mean_val_loss: 2.309001
|
| 13 |
+
std_val_loss: 0.000000
|
| 14 |
+
mean_left_arm_loss: 1.740481
|
| 15 |
+
std_left_arm_loss: 0.000000
|
| 16 |
+
mean_right_arm_loss: 2.877522
|
| 17 |
+
std_right_arm_loss: 0.000000
|
| 18 |
+
mean_left_joint_loss: 1.680031
|
| 19 |
+
std_left_joint_loss: 0.000000
|
| 20 |
+
mean_left_gripper_loss: 2.163631
|
| 21 |
+
std_left_gripper_loss: 0.000000
|
| 22 |
+
mean_right_joint_loss: 2.108088
|
| 23 |
+
std_right_joint_loss: 0.000000
|
| 24 |
+
mean_right_gripper_loss: 8.263555
|
| 25 |
+
std_right_gripper_loss: 0.000000
|
| 26 |
+
mean_left_right_imbalance: 1.137041
|
| 27 |
+
std_left_right_imbalance: 0.000000
|
| 28 |
+
per_batch_timing_seconds: mean=0.6439 std=0.0000 min=0.6439 max=0.6439
|
| 29 |
+
active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
|
| 30 |
+
masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]
|
| 31 |
+
weight_loading_missing_keys: []
|
| 32 |
+
weight_loading_unexpected_keys: []
|
| 33 |
+
WARNING:root:'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder
|
| 34 |
+
sample_eval_batch=1 num_steps=16 masked_mae=0.611553 left_arm_mae=0.679956 right_arm_mae=0.543150 imbalance_mae=0.136806 batch_time_s=3.3573
|
| 35 |
+
sample_eval_num_steps_16_num_batches: 1
|
| 36 |
+
sample_eval_num_steps_16_mean_masked_mae: 0.611553
|
| 37 |
+
sample_eval_num_steps_16_std_masked_mae: 0.000000
|
| 38 |
+
sample_eval_num_steps_16_mean_left_arm_mae: 0.679956
|
| 39 |
+
sample_eval_num_steps_16_std_left_arm_mae: 0.000000
|
| 40 |
+
sample_eval_num_steps_16_mean_right_arm_mae: 0.543150
|
| 41 |
+
sample_eval_num_steps_16_std_right_arm_mae: 0.000000
|
| 42 |
+
sample_eval_num_steps_16_mean_left_joint_mae: 0.648674
|
| 43 |
+
sample_eval_num_steps_16_std_left_joint_mae: 0.000000
|
| 44 |
+
sample_eval_num_steps_16_mean_left_gripper_mae: 0.898926
|
| 45 |
+
sample_eval_num_steps_16_std_left_gripper_mae: 0.000000
|
| 46 |
+
sample_eval_num_steps_16_mean_right_joint_mae: 0.478297
|
| 47 |
+
sample_eval_num_steps_16_std_right_joint_mae: 0.000000
|
| 48 |
+
sample_eval_num_steps_16_mean_right_gripper_mae: 0.997122
|
| 49 |
+
sample_eval_num_steps_16_std_right_gripper_mae: 0.000000
|
| 50 |
+
sample_eval_num_steps_16_mean_left_right_imbalance_mae: 0.136806
|
| 51 |
+
sample_eval_num_steps_16_std_left_right_imbalance_mae: 0.000000
|
| 52 |
+
sample_eval_num_steps_16_per_batch_timing_seconds: mean=3.3573 std=0.0000 min=3.3573 max=3.3573
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311/sanity_checks/sample_batch_size_used.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
default
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/date_utc.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
2026-03-11 17:33:48 UTC
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/pip_freeze.txt
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.4.0
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.13.3
|
| 4 |
+
aiosignal==1.4.0
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
antlr4-python3-runtime==4.9.3
|
| 7 |
+
anyio==4.6.0
|
| 8 |
+
argon2-cffi==23.1.0
|
| 9 |
+
argon2-cffi-bindings==21.2.0
|
| 10 |
+
arrow==1.3.0
|
| 11 |
+
asttokens==2.4.1
|
| 12 |
+
async-lru==2.0.4
|
| 13 |
+
attrs==25.4.0
|
| 14 |
+
augmax==0.4.1
|
| 15 |
+
av==16.1.0
|
| 16 |
+
babel==2.16.0
|
| 17 |
+
beartype==0.19.0
|
| 18 |
+
beautifulsoup4==4.12.3
|
| 19 |
+
bleach==6.1.0
|
| 20 |
+
certifi==2026.2.25
|
| 21 |
+
cffi==2.0.0
|
| 22 |
+
charset-normalizer==3.4.5
|
| 23 |
+
comm==0.2.2
|
| 24 |
+
cryptography==46.0.5
|
| 25 |
+
datasets==4.7.0
|
| 26 |
+
debugpy==1.8.5
|
| 27 |
+
decorator==5.1.1
|
| 28 |
+
deepdiff==8.6.1
|
| 29 |
+
defusedxml==0.7.1
|
| 30 |
+
dill==0.4.0
|
| 31 |
+
dm-tree==0.1.9
|
| 32 |
+
docstring_parser==0.17.0
|
| 33 |
+
draccus==0.10.0
|
| 34 |
+
einops==0.8.2
|
| 35 |
+
entrypoints==0.4
|
| 36 |
+
equinox==0.13.6
|
| 37 |
+
etils==1.14.0
|
| 38 |
+
executing==2.1.0
|
| 39 |
+
fastjsonschema==2.20.0
|
| 40 |
+
filelock==3.25.1
|
| 41 |
+
flatbuffers==25.12.19
|
| 42 |
+
flax==0.10.2
|
| 43 |
+
fqdn==1.5.1
|
| 44 |
+
frozenlist==1.8.0
|
| 45 |
+
fsspec==2026.2.0
|
| 46 |
+
gcsfs==2026.2.0
|
| 47 |
+
google-api-core==2.30.0
|
| 48 |
+
google-auth==2.49.0
|
| 49 |
+
google-auth-oauthlib==1.3.0
|
| 50 |
+
google-cloud-core==2.5.0
|
| 51 |
+
google-cloud-storage==3.9.0
|
| 52 |
+
google-cloud-storage-control==1.10.0
|
| 53 |
+
google-crc32c==1.8.0
|
| 54 |
+
google-resumable-media==2.8.0
|
| 55 |
+
googleapis-common-protos==1.73.0
|
| 56 |
+
grpc-google-iam-v1==0.14.3
|
| 57 |
+
grpcio==1.78.0
|
| 58 |
+
grpcio-status==1.78.0
|
| 59 |
+
h11==0.14.0
|
| 60 |
+
hf-xet==1.3.2
|
| 61 |
+
httpcore==1.0.5
|
| 62 |
+
httpx==0.27.2
|
| 63 |
+
huggingface_hub==0.36.2
|
| 64 |
+
humanize==4.15.0
|
| 65 |
+
idna==3.11
|
| 66 |
+
ImageIO==2.37.3
|
| 67 |
+
ipykernel==6.29.5
|
| 68 |
+
ipython==8.27.0
|
| 69 |
+
ipython-genutils==0.2.0
|
| 70 |
+
ipywidgets==8.1.5
|
| 71 |
+
isoduration==20.11.0
|
| 72 |
+
jax==0.5.3
|
| 73 |
+
jaxlib==0.5.3
|
| 74 |
+
jaxtyping==0.2.36
|
| 75 |
+
jedi==0.19.1
|
| 76 |
+
Jinja2==3.1.3
|
| 77 |
+
json5==0.9.25
|
| 78 |
+
jsonlines==4.0.0
|
| 79 |
+
jsonpointer==3.0.0
|
| 80 |
+
jsonschema==4.23.0
|
| 81 |
+
jsonschema-specifications==2023.12.1
|
| 82 |
+
jupyter-archive==3.4.0
|
| 83 |
+
jupyter-events==0.10.0
|
| 84 |
+
jupyter-highlight-selected-word==0.2.0
|
| 85 |
+
jupyter-lsp==2.2.5
|
| 86 |
+
jupyter_client==7.4.9
|
| 87 |
+
jupyter_contrib_core==0.4.2
|
| 88 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 89 |
+
jupyter_core==5.7.2
|
| 90 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 91 |
+
jupyter_server==2.14.2
|
| 92 |
+
jupyter_server_terminals==0.5.3
|
| 93 |
+
jupyterlab==4.2.5
|
| 94 |
+
jupyterlab_pygments==0.3.0
|
| 95 |
+
jupyterlab_server==2.27.3
|
| 96 |
+
jupyterlab_widgets==3.0.13
|
| 97 |
+
lerobot @ git+https://github.com/huggingface/lerobot@0cf864870cf29f4738d3ade893e6fd13fbd7cdb5
|
| 98 |
+
lxml==5.3.0
|
| 99 |
+
markdown-it-py==4.0.0
|
| 100 |
+
MarkupSafe==2.1.5
|
| 101 |
+
matplotlib-inline==0.1.7
|
| 102 |
+
mdurl==0.1.2
|
| 103 |
+
mergedeep==1.3.4
|
| 104 |
+
mistune==3.0.2
|
| 105 |
+
ml_collections==1.0.0
|
| 106 |
+
ml_dtypes==0.5.4
|
| 107 |
+
mpmath==1.3.0
|
| 108 |
+
msgpack==1.1.2
|
| 109 |
+
multidict==6.7.1
|
| 110 |
+
multiprocess==0.70.18
|
| 111 |
+
mypy_extensions==1.1.0
|
| 112 |
+
nbclassic==1.1.0
|
| 113 |
+
nbclient==0.10.0
|
| 114 |
+
nbconvert==7.16.4
|
| 115 |
+
nbformat==5.10.4
|
| 116 |
+
nest-asyncio==1.6.0
|
| 117 |
+
networkx==3.2.1
|
| 118 |
+
notebook==6.5.5
|
| 119 |
+
notebook_shim==0.2.4
|
| 120 |
+
numpy==1.26.4
|
| 121 |
+
numpydantic==1.8.0
|
| 122 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 123 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 124 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 125 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 126 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 127 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 128 |
+
nvidia-curand-cu12==10.3.5.119
|
| 129 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 130 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 131 |
+
nvidia-nccl-cu12==2.20.5
|
| 132 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 133 |
+
nvidia-nvtx-cu12==12.4.99
|
| 134 |
+
oauthlib==3.3.1
|
| 135 |
+
omegaconf==2.3.0
|
| 136 |
+
opencv-python==4.11.0.86
|
| 137 |
+
openpi-client==0.1.1
|
| 138 |
+
opt_einsum==3.4.0
|
| 139 |
+
optax==0.2.7
|
| 140 |
+
orbax-checkpoint==0.11.13
|
| 141 |
+
orderly-set==5.5.0
|
| 142 |
+
overrides==7.7.0
|
| 143 |
+
packaging==26.0
|
| 144 |
+
pandas==3.0.1
|
| 145 |
+
pandocfilters==1.5.1
|
| 146 |
+
parso==0.8.4
|
| 147 |
+
pexpect==4.9.0
|
| 148 |
+
pillow==12.1.1
|
| 149 |
+
platformdirs==4.3.6
|
| 150 |
+
prometheus_client==0.21.0
|
| 151 |
+
prompt_toolkit==3.0.47
|
| 152 |
+
propcache==0.4.1
|
| 153 |
+
proto-plus==1.27.1
|
| 154 |
+
protobuf==6.33.5
|
| 155 |
+
psutil==6.0.0
|
| 156 |
+
ptyprocess==0.7.0
|
| 157 |
+
pure_eval==0.2.3
|
| 158 |
+
pyarrow==23.0.1
|
| 159 |
+
pyasn1==0.6.2
|
| 160 |
+
pyasn1_modules==0.4.2
|
| 161 |
+
pycparser==2.22
|
| 162 |
+
pydantic==2.12.5
|
| 163 |
+
pydantic_core==2.41.5
|
| 164 |
+
Pygments==2.19.2
|
| 165 |
+
python-dateutil==2.9.0.post0
|
| 166 |
+
python-json-logger==2.0.7
|
| 167 |
+
PyYAML==6.0.3
|
| 168 |
+
pyyaml-include==1.4.1
|
| 169 |
+
pyzmq==24.0.1
|
| 170 |
+
referencing==0.35.1
|
| 171 |
+
regex==2026.2.28
|
| 172 |
+
requests==2.32.5
|
| 173 |
+
requests-oauthlib==2.0.0
|
| 174 |
+
rfc3339-validator==0.1.4
|
| 175 |
+
rfc3986-validator==0.1.1
|
| 176 |
+
rich==14.3.3
|
| 177 |
+
rpds-py==0.20.0
|
| 178 |
+
rsa==4.9.1
|
| 179 |
+
safetensors==0.7.0
|
| 180 |
+
scipy==1.17.1
|
| 181 |
+
Send2Trash==1.8.3
|
| 182 |
+
sentencepiece==0.2.1
|
| 183 |
+
simplejson==3.20.2
|
| 184 |
+
six==1.17.0
|
| 185 |
+
sniffio==1.3.1
|
| 186 |
+
soupsieve==2.6
|
| 187 |
+
stack-data==0.6.3
|
| 188 |
+
sympy==1.12
|
| 189 |
+
tensorstore==0.1.81
|
| 190 |
+
termcolor==3.3.0
|
| 191 |
+
terminado==0.18.1
|
| 192 |
+
tinycss2==1.3.0
|
| 193 |
+
tokenizers==0.21.4
|
| 194 |
+
toml==0.10.2
|
| 195 |
+
torch==2.4.1+cu124
|
| 196 |
+
torchaudio==2.4.1+cu124
|
| 197 |
+
torchvision==0.19.1+cu124
|
| 198 |
+
tornado==6.4.1
|
| 199 |
+
tqdm==4.67.3
|
| 200 |
+
tqdm-loggable==0.3
|
| 201 |
+
traitlets==5.14.3
|
| 202 |
+
transformers==4.53.2
|
| 203 |
+
treescope==0.1.10
|
| 204 |
+
triton==3.0.0
|
| 205 |
+
typeguard==4.5.1
|
| 206 |
+
types-python-dateutil==2.9.0.20240906
|
| 207 |
+
typing-inspect==0.9.0
|
| 208 |
+
typing-inspection==0.4.2
|
| 209 |
+
typing_extensions==4.15.0
|
| 210 |
+
tyro==1.0.8
|
| 211 |
+
uri-template==1.3.0
|
| 212 |
+
urllib3==2.6.3
|
| 213 |
+
wadler_lindig==0.1.7
|
| 214 |
+
wcwidth==0.2.13
|
| 215 |
+
webcolors==24.8.0
|
| 216 |
+
webencodings==0.5.1
|
| 217 |
+
websocket-client==1.8.0
|
| 218 |
+
websockets==16.0
|
| 219 |
+
widgetsnbextension==4.0.13
|
| 220 |
+
wrapt==2.1.2
|
| 221 |
+
xxhash==3.6.0
|
| 222 |
+
yarl==1.23.0
|
| 223 |
+
zipp==3.23.0
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/python_version.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Python 3.11.10
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/torch_env.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
|
artifacts/twin_dual_push_128_stepcmp_2k_20260311_debug/environment/uname.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Linux f87904697f84 6.8.0-90-generic #91-Ubuntu SMP PREEMPT_DYNAMIC Tue Nov 18 14:14:30 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
|
openpi/checkpoints/debug_pi05_split_communicating_pytorch_smoke/debug_pi05_split_communicating_pytorch_smoke/1/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9fa98e4f1c6159fd9b956a11323f5990b8d92aae3553eb4785ee7341c79a680
|
| 3 |
+
size 3438041490
|
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d726e07e1c039c30cab249de7f558f75d04a5e3c7151fb9e08ab7b9c804d7342
|
| 3 |
+
size 1850670584
|
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/1/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f3c7cbab597ad818c2570f4920d8a9b7d396053543ee538a8b97b4ba623bfe5
|
| 3 |
+
size 3438040655
|
openpi/checkpoints/debug_pi05_split_independent_pytorch_smoke/debug_pi05_split_independent_pytorch_smoke/2/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c68994d9d8e008f11db503c715bbbf739ca7b9f5144ac90c4822f90a55d003f
|
| 3 |
+
size 3438040655
|
openpi/run_logs/split_independent_real_smoke20.log
ADDED
|
File without changes
|
openpi/run_logs/split_independent_real_smoke3.log
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
19:24:48.871 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3 (19525:train_pytorch.py:533)
|
| 2 |
+
19:24:48.874 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (19525:train_pytorch.py:552)
|
| 3 |
+
19:24:48.988 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (19525:config.py:234)
|
| 4 |
+
19:24:48.990 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
|
| 5 |
+
2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
|
| 6 |
+
-0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
|
| 7 |
+
0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
|
| 8 |
+
0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
|
| 9 |
+
0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
|
| 10 |
+
0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
|
| 11 |
+
-2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
|
| 12 |
+
-6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
|
| 13 |
+
-9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
|
| 14 |
+
2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
|
| 15 |
+
0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
|
| 16 |
+
0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
|
| 17 |
+
0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
|
| 18 |
+
-0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
|
| 19 |
+
0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
|
| 20 |
+
0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
|
| 21 |
+
0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
|
| 22 |
+
0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
|
| 23 |
+
-3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
|
| 24 |
+
-2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
|
| 25 |
+
-3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
|
| 26 |
+
0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
|
| 27 |
+
0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
|
| 28 |
+
0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x70f9ab2c5d10>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (19525:data_loader.py:284)
|
| 29 |
+
19:24:57.449 [I] JAX version 0.5.3 available. (19525:config.py:125)
|
| 30 |
+
19:25:32.845 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (19525:data_loader.py:148)
|
| 31 |
+
19:25:33.031 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (19525:video_utils.py:36)
|
| 32 |
+
19:26:39.374 [I] local_batch_size: 1 (19525:data_loader.py:365)
|
| 33 |
+
19:28:30.580 [I] Enabled gradient checkpointing for PI0Pytorch model (19525:pi0_pytorch.py:138)
|
| 34 |
+
19:28:30.582 [I] Enabled gradient checkpointing for memory optimization (19525:train_pytorch.py:624)
|
| 35 |
+
19:28:30.583 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (19525:train_pytorch.py:493)
|
| 36 |
+
19:28:30.583 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:653)
|
| 37 |
+
19:28:34.495 [I] Weight loading missing key count: 0 (19525:train_pytorch.py:657)
|
| 38 |
+
19:28:34.495 [I] Weight loading missing keys: set() (19525:train_pytorch.py:658)
|
| 39 |
+
19:28:34.496 [I] Weight loading unexpected key count: 0 (19525:train_pytorch.py:659)
|
| 40 |
+
19:28:34.496 [I] Weight loading unexpected keys: [] (19525:train_pytorch.py:660)
|
| 41 |
+
19:28:34.497 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:661)
|
| 42 |
+
19:28:34.501 [I] Running on: 963c158043aa | world_size=1 (19525:train_pytorch.py:701)
|
| 43 |
+
19:28:34.501 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (19525:train_pytorch.py:702)
|
| 44 |
+
19:28:34.502 [I] Memory optimizations: gradient_checkpointing=True (19525:train_pytorch.py:705)
|
| 45 |
+
19:28:34.502 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (19525:train_pytorch.py:706)
|
| 46 |
+
19:28:34.502 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (19525:train_pytorch.py:707)
|
| 47 |
+
19:28:34.503 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (19525:train_pytorch.py:710)
|
| 48 |
+
19:28:34.503 [I] EMA is not supported for PyTorch training (19525:train_pytorch.py:713)
|
| 49 |
+
19:28:34.504 [I] Training precision: float32 (19525:train_pytorch.py:714)
|
| 50 |
+
19:28:34.516 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (19525:train_pytorch.py:308)
|
| 51 |
+
19:28:34.516 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (19525:train_pytorch.py:309)
|
| 52 |
+
19:28:34.517 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (19525:train_pytorch.py:310)
|
| 53 |
+
19:28:34.518 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (19525:train_pytorch.py:311)
|
| 54 |
+
19:28:34.518 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (19525:train_pytorch.py:312)
|
| 55 |
+
19:28:34.519 [I] Model type: split_independent (19525:train_pytorch.py:313)
|
| 56 |
+
19:28:34.519 [I] Packed transforms active: True (19525:train_pytorch.py:314)
|
| 57 |
+
19:28:34.519 [I] World size: 1 (19525:train_pytorch.py:315)
|
| 58 |
+
19:28:34.520 [I] Batch size: local=1, global=1 (19525:train_pytorch.py:316)
|
| 59 |
+
19:28:34.520 [I] num_workers: 0 (19525:train_pytorch.py:317)
|
| 60 |
+
19:28:34.521 [I] Precision: float32 (19525:train_pytorch.py:318)
|
| 61 |
+
19:28:34.521 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (19525:train_pytorch.py:319)
|
| 62 |
+
19:28:34.522 [I] Save/log intervals: save_interval=3, log_interval=1 (19525:train_pytorch.py:326)
|
| 63 |
+
19:28:34.522 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (19525:train_pytorch.py:327)
|
| 64 |
+
19:28:34.522 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (19525:train_pytorch.py:328)
|
| 65 |
+
19:28:34.522 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (19525:train_pytorch.py:329)
|
| 66 |
+
19:28:34.523 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (19525:train_pytorch.py:722)
|
| 67 |
+
|
| 68 |
+
File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 939, in <module>
|
| 69 |
+
main()
|
| 70 |
+
File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 935, in main
|
| 71 |
+
train_loop(config)
|
| 72 |
+
File "/workspace/pi05tests/openpi/scripts/train_pytorch.py", line 747, in train_loop
|
| 73 |
+
for observation, actions in loader:
|
| 74 |
+
File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 596, in __iter__
|
| 75 |
+
for batch in self._data_loader:
|
| 76 |
+
File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 510, in __iter__
|
| 77 |
+
batch = next(data_iter)
|
| 78 |
+
^^^^^^^^^^^^^^^
|
| 79 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 630, in __next__
|
| 80 |
+
data = self._next_data()
|
| 81 |
+
^^^^^^^^^^^^^^^^^
|
| 82 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 673, in _next_data
|
| 83 |
+
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
|
| 84 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 85 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
|
| 86 |
+
data = [self.dataset[idx] for idx in possibly_batched_index]
|
| 87 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 88 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
|
| 89 |
+
data = [self.dataset[idx] for idx in possibly_batched_index]
|
| 90 |
+
~~~~~~~~~~~~^^^^^
|
| 91 |
+
File "/workspace/pi05tests/openpi/src/openpi/training/data_loader.py", line 67, in __getitem__
|
| 92 |
+
return self._transform(self._dataset[index])
|
| 93 |
+
~~~~~~~~~~~~~^^^^^^^
|
| 94 |
+
File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 742, in __getitem__
|
| 95 |
+
query_result = self._query_hf_dataset(query_indices)
|
| 96 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 97 |
+
File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 707, in _query_hf_dataset
|
| 98 |
+
return {
|
| 99 |
+
^
|
| 100 |
+
File "/workspace/pi05tests/openpi/.venv/lib/python3.11/site-packages/lerobot/common/datasets/lerobot_dataset.py", line 708, in <dictcomp>
|
| 101 |
+
key: torch.stack(self.hf_dataset.select(q_idx)[key])
|
| 102 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 103 |
+
TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Column
|
| 104 |
+
|
openpi/scripts/collect_twin_dual_push_128_stepcmp_metrics.py
ADDED
|
@@ -0,0 +1,913 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import ast
|
| 7 |
+
import csv
|
| 8 |
+
import dataclasses
|
| 9 |
+
import json
|
| 10 |
+
import math
|
| 11 |
+
import pathlib
|
| 12 |
+
import re
|
| 13 |
+
import statistics
|
| 14 |
+
from collections import defaultdict
|
| 15 |
+
from typing import Any
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
OPENPI_ROOT = pathlib.Path(__file__).resolve().parents[1]
|
| 19 |
+
STEP_ORDER = (0, 100, 500, 2000)
|
| 20 |
+
SAMPLE_STEPS = (1, 2, 4, 8, 16)
|
| 21 |
+
BASE_TRAIN_COLUMNS = [
|
| 22 |
+
"model_variant",
|
| 23 |
+
"config_name",
|
| 24 |
+
"exp_name",
|
| 25 |
+
"step",
|
| 26 |
+
"loss",
|
| 27 |
+
"smoothed_loss",
|
| 28 |
+
"lr",
|
| 29 |
+
"grad_norm",
|
| 30 |
+
"step_time_s",
|
| 31 |
+
"data_time_s",
|
| 32 |
+
"items_per_second",
|
| 33 |
+
"eta_seconds",
|
| 34 |
+
"max_cuda_memory_gb",
|
| 35 |
+
]
|
| 36 |
+
TIMING_RE = re.compile(r"mean=([0-9.]+)\s+std=([0-9.]+)\s+min=([0-9.]+)\s+max=([0-9.]+)")
|
| 37 |
+
KV_PAIR_RE = re.compile(r"([A-Za-z0-9_./+-]+)=([^\s]+)")
|
| 38 |
+
SOURCE_SUFFIX_RE = re.compile(r"\s+\(\d+:[^)]+\)$")
|
| 39 |
+
TIMESTAMPED_INFO_RE = re.compile(r"^\d{2}:\d{2}:\d{2}\.\d{3} \[I\] (.*)$")
|
| 40 |
+
NORMALIZE_RE = re.compile(r"[^a-z0-9]+")
|
| 41 |
+
ERROR_MARKERS = ("Traceback", "FloatingPointError", "CUDA out of memory", "Non-finite", "RuntimeError:")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclasses.dataclass(frozen=True)
class ModelSpec:
    """Static description of one trained model variant in the step comparison."""

    # Short identifier keying this variant (e.g. "split_ind"); used to look up
    # per-variant artifacts.
    key: str
    # Human-readable variant name stamped into every emitted metrics row.
    model_variant: str
    # Training config name, matched against the "Resolved config name" startup log line.
    config_name: str
    # Experiment name under which this variant's run artifacts were written.
    exp_name: str
    # Path to the step-0 (pre-finetuning) checkpoint for this variant.
    step0_checkpoint: str
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# The four model variants compared by this script. Tuple order determines the
# order in which variants appear in the emitted tables.
MODEL_SPECS = (
    ModelSpec(
        key="shared",
        model_variant="shared",
        config_name="pi05_twin_dual_push_128_packed_baseline_pytorch_5k",
        exp_name="dual_push_128_stepcmp_shared_2k",
        step0_checkpoint="/workspace/checkpoints/pi05_base_single_pytorch",
    ),
    ModelSpec(
        key="head_only",
        model_variant="head_only_parallel",
        config_name="pi05_twin_dual_push_128_packed_parallel_pytorch_5k",
        exp_name="dual_push_128_stepcmp_head_only_2k",
        step0_checkpoint="/workspace/checkpoints/pi05_base_parallel_packed_from_single",
    ),
    ModelSpec(
        key="split_ind",
        model_variant="split_independent",
        config_name="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k",
        exp_name="dual_push_128_stepcmp_split_ind_2k",
        step0_checkpoint="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
    ),
    ModelSpec(
        key="split_comm",
        model_variant="split_communicating",
        config_name="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k",
        exp_name="dual_push_128_stepcmp_split_comm_2k",
        step0_checkpoint="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
    ),
)
|
| 83 |
+
|
| 84 |
+
# Maps normalised startup-log labels (after normalize_label) to the short keys
# used in the startup summary dict built by parse_train_log.
STARTUP_LABEL_MAP = {
    "resolved_config_name": "config_name",
    "dataset_repo_id": "dataset_repo_id",
    "norm_stats_file_path": "norm_stats_file",
    "norm_stats_summary": "norm_stats_summary",
    "checkpoint_source_path": "checkpoint_source",
    "model_type": "model_type",
    "packed_transforms_active": "packed_transforms",
    "world_size": "world_size",
    "batch_size": "batch_size",
    "num_workers": "num_workers",
    "training_precision": "precision",
    "lr_schedule": "lr_schedule",
    "save_log_intervals": "save_log_intervals",
    "action_loss_mask": "action_loss_mask",
    "active_action_loss_dims": "active_mask_dims",
    "masked_padded_dims": "masked_dims",
    "gradient_bucket_diagnostics": "gradient_buckets",
    "weight_loading_missing_key_count": "weight_missing_count",
    "weight_loading_missing_keys": "weight_missing_keys",
    "weight_loading_unexpected_key_count": "weight_unexpected_count",
    "weight_loading_unexpected_keys": "weight_unexpected_keys",
}
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Optional explicit argument list (mainly for tests). When None,
            argparse falls back to ``sys.argv[1:]``, preserving the original
            zero-argument call ``parse_args()``.

    Returns:
        Namespace with ``artifact_root`` (required) and ``prior_metrics_root``
        (optional; empty string when not supplied).
    """
    parser = argparse.ArgumentParser(
        description="Collect step-comparison metrics from training/eval artifacts."
    )
    parser.add_argument(
        "--artifact_root",
        required=True,
        help="Root directory containing the run artifacts to parse.",
    )
    parser.add_argument(
        "--prior_metrics_root",
        default="",
        help="Optional directory holding prior-run metric CSVs for comparison.",
    )
    return parser.parse_args(argv)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def normalize_label(label: str) -> str:
    """Lower-case *label* and collapse every non-alphanumeric run into "_"."""
    collapsed = re.sub(r"[^a-z0-9]+", "_", label.lower())
    return collapsed.strip("_")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def strip_source_suffix(text: str) -> str:
    """Drop a trailing " (pid:file:line)" log-source marker, if one is present."""
    return re.sub(r"\s+\(\d+:[^)]+\)$", "", text.rstrip())
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def extract_info_body(line: str) -> str | None:
    """Return the message body of a timestamped "[I]" log line, else None.

    The trailing "(pid:file:line)" source marker is removed first so the
    returned body contains only the log message text.
    """
    match = TIMESTAMPED_INFO_RE.match(strip_source_suffix(line))
    return match.group(1) if match else None
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def natural_key(text: str) -> list[Any]:
    """Sort key that orders embedded integer runs numerically (natural sort)."""
    return [
        int(chunk) if chunk.isdigit() else chunk
        for chunk in re.split(r"(\d+)", text)
        if chunk
    ]
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def format_float(value: float | None, digits: int = 6) -> str:
    """Render *value* with *digits* decimal places; None/NaN become "n/a"."""
    if value is not None and not math.isnan(value):
        return f"{value:.{digits}f}"
    return "n/a"
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def format_delta(value: float | None, digits: int = 6) -> str:
    """Render a signed delta (e.g. "+1.50" / "-0.25"); None/NaN become "n/a"."""
    if value is not None and not math.isnan(value):
        return f"{value:+.{digits}f}"
    return "n/a"
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def resolve_checkpoint_path(raw_path: str) -> str:
    """Resolve *raw_path* to an absolute, symlink-resolved path string.

    Absolute inputs are resolved as-is; relative inputs are interpreted
    relative to the openpi repository root (OPENPI_ROOT), not the CWD.
    """
    path = pathlib.Path(raw_path)
    if path.is_absolute():
        return str(path.resolve())
    return str((OPENPI_ROOT / path).resolve())
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def parse_float(text: str) -> float:
    """Convert a metric string to float (thin named wrapper for readability)."""
    return float(text)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def parse_optional_float(text: str | None) -> float | None:
    """Convert *text* to float; None and the empty string map to None."""
    if text:
        return float(text)
    return None
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def parse_timing(text: str) -> dict[str, float]:
    """Parse a "mean=… std=… min=… max=…" timing summary into a dict.

    Returns:
        Dict with keys "mean", "std", "min", "max" (floats).

    Raises:
        ValueError: if *text* does not contain the expected summary format.
    """
    found = re.search(
        r"mean=([0-9.]+)\s+std=([0-9.]+)\s+min=([0-9.]+)\s+max=([0-9.]+)", text
    )
    if found is None:
        raise ValueError(f"Unable to parse timing summary: {text!r}")
    mean, std, lo, hi = (float(group) for group in found.groups())
    return {"mean": mean, "std": std, "min": lo, "max": hi}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def parse_literal_count(text: str) -> int:
    """Return the element count of a Python-literal collection string.

    "set()" (the repr of an empty set, which ast.literal_eval rejects because
    it is a call) is special-cased to 0.
    """
    stripped = text.strip()
    if stripped == "set()":
        return 0
    return len(ast.literal_eval(stripped))
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def read_text_lines(path: pathlib.Path) -> list[str]:
    """Read *path* as UTF-8 (replacing undecodable bytes) and split into lines."""
    return path.read_text(encoding="utf-8", errors="replace").splitlines()
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def detect_errors(lines: list[str]) -> list[str]:
    """Collect (stripped) lines containing any of the known error markers."""
    return [
        line.strip()
        for line in lines
        if any(marker in line for marker in ERROR_MARKERS)
    ]
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def parse_train_log(path: pathlib.Path, spec: ModelSpec) -> tuple[dict[str, str], list[dict[str, Any]], list[str]]:
    """Parse one training log into startup metadata, per-step rows, and errors.

    Args:
        path: Training log file for one model variant.
        spec: ModelSpec whose identity fields are stamped into each step row.

    Returns:
        (startup, step_rows, errors) where startup maps STARTUP_LABEL_MAP keys
        to the logged values, step_rows holds one numeric dict per "step=" log
        line, and errors lists lines matching the error markers.

    Raises:
        KeyError / ValueError if a "step=" line lacks the expected fields —
        malformed step lines are not tolerated.
    """
    lines = read_text_lines(path)
    startup: dict[str, str] = {}
    step_rows: list[dict[str, Any]] = []
    errors = detect_errors(lines)

    for line in lines:
        body = extract_info_body(line)
        if not body:
            continue
        if body.startswith("step="):
            # Per-step metrics line: split into key=value pairs.
            metrics = {key: value for key, value in KV_PAIR_RE.findall(body)}
            row = {
                "model_variant": spec.model_variant,
                "config_name": spec.config_name,
                "exp_name": spec.exp_name,
                "step": int(metrics["step"]),
                "loss": parse_float(metrics["loss"]),
                "smoothed_loss": parse_float(metrics["smoothed_loss"]),
                "lr": parse_float(metrics["lr"]),
                "grad_norm": parse_float(metrics["grad_norm"]),
                # Timings carry a trailing "s" unit suffix in the log.
                "step_time_s": parse_float(metrics["step_time"].rstrip("s")),
                "data_time_s": parse_float(metrics["data_time"].rstrip("s")),
                "items_per_second": parse_float(metrics["it/s"]),
                # ETA key name varies (eta_to_<target>); take the first match.
                "eta_seconds": parse_float(next(value for key, value in metrics.items() if key.startswith("eta_to_")).rstrip("s")),
                "max_cuda_memory_gb": parse_float(metrics["max_cuda_memory"].rstrip("GB")),
            }
            # Any extra key=value pairs (e.g. per-bucket gradient diagnostics)
            # are passed through as floats under their original key.
            for key, value in metrics.items():
                if key in {"step", "loss", "smoothed_loss", "lr", "grad_norm", "step_time", "data_time", "it/s", "max_cuda_memory"}:
                    continue
                if key.startswith("eta_to_"):
                    continue
                row[key] = parse_float(value)
            step_rows.append(row)
            continue

        # Otherwise treat the line as a "Label: value" startup line and keep
        # it only if the normalised label is one we track.
        if ": " not in body:
            continue
        label, value = body.split(": ", 1)
        startup_key = STARTUP_LABEL_MAP.get(normalize_label(label))
        if startup_key:
            startup[startup_key] = value.strip()

    return startup, step_rows, errors
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def parse_eval_log(path: pathlib.Path) -> tuple[dict[str, str], int | None, list[str]]:
    """Parse an eval log into a key/value metrics dict, sample batch size, errors.

    Returns:
        (metrics, sample_batch_size, errors): metrics maps each "key: value"
        line to its raw string value (later duplicates overwrite earlier
        ones); sample_batch_size is taken from the last "sample_eval
        enabled=…" line carrying a batch_size field, or None; errors lists
        lines matching the error markers.
    """
    lines = read_text_lines(path)
    metrics: dict[str, str] = {}
    sample_batch_size: int | None = None
    errors = detect_errors(lines)

    for line in lines:
        stripped = strip_source_suffix(line.strip())
        if stripped.startswith("sample_eval enabled="):
            kv = {key: value for key, value in KV_PAIR_RE.findall(stripped)}
            if "batch_size" in kv:
                sample_batch_size = int(kv["batch_size"])
        # Note: a "sample_eval enabled=…" line may also contain ": " and then
        # additionally lands in metrics — intentional pass-through.
        if ": " not in stripped:
            continue
        key, value = stripped.split(": ", 1)
        metrics[key.strip()] = value.strip()

    return metrics, sample_batch_size, errors
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def write_csv(path: pathlib.Path, rows: list[dict[str, Any]], columns: list[str]) -> None:
|
| 267 |
+
with path.open("w", encoding="utf-8", newline="") as handle:
|
| 268 |
+
writer = csv.DictWriter(handle, fieldnames=columns)
|
| 269 |
+
writer.writeheader()
|
| 270 |
+
for row in rows:
|
| 271 |
+
writer.writerow({column: row.get(column, "") for column in columns})
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def build_teacher_row(spec: ModelSpec, step: int, metrics: dict[str, str]) -> dict[str, Any]:
    """Build one teacher-forced eval table row from parsed eval-log metrics.

    Args:
        spec: Variant identity to stamp into the row.
        step: Checkpoint step the eval was run at.
        metrics: Raw string metrics from parse_eval_log; all referenced keys
            must be present (a missing key raises KeyError).

    Returns:
        Flat dict of identity fields plus parsed numeric loss/timing columns.
    """
    # Per-batch timing comes as a "mean=… std=… min=… max=…" summary string.
    timing = parse_timing(metrics["per_batch_timing_seconds"])
    return {
        "model_variant": spec.model_variant,
        "config_name": spec.config_name,
        "exp_name": spec.exp_name,
        "checkpoint_step": step,
        "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
        "repo_id": metrics["repo_id_used"],
        "num_batches": int(metrics["num_batches"]),
        "mean_val_loss": parse_float(metrics["mean_val_loss"]),
        "std_val_loss": parse_float(metrics["std_val_loss"]),
        "mean_left_arm_loss": parse_float(metrics["mean_left_arm_loss"]),
        "std_left_arm_loss": parse_float(metrics["std_left_arm_loss"]),
        "mean_right_arm_loss": parse_float(metrics["mean_right_arm_loss"]),
        "std_right_arm_loss": parse_float(metrics["std_right_arm_loss"]),
        "mean_left_joint_loss": parse_float(metrics["mean_left_joint_loss"]),
        "std_left_joint_loss": parse_float(metrics["std_left_joint_loss"]),
        "mean_left_gripper_loss": parse_float(metrics["mean_left_gripper_loss"]),
        "std_left_gripper_loss": parse_float(metrics["std_left_gripper_loss"]),
        "mean_right_joint_loss": parse_float(metrics["mean_right_joint_loss"]),
        "std_right_joint_loss": parse_float(metrics["std_right_joint_loss"]),
        "mean_right_gripper_loss": parse_float(metrics["mean_right_gripper_loss"]),
        "std_right_gripper_loss": parse_float(metrics["std_right_gripper_loss"]),
        "mean_left_right_imbalance": parse_float(metrics["mean_left_right_imbalance"]),
        "std_left_right_imbalance": parse_float(metrics["std_left_right_imbalance"]),
        "per_batch_time_mean_s": timing["mean"],
        "per_batch_time_std_s": timing["std"],
        "per_batch_time_min_s": timing["min"],
        "per_batch_time_max_s": timing["max"],
        # Counts are derived from the logged key collections' literal reprs.
        "weight_loading_missing_count": parse_literal_count(metrics["weight_loading_missing_keys"]),
        "weight_loading_unexpected_count": parse_literal_count(metrics["weight_loading_unexpected_keys"]),
    }
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def build_sample_rows(spec: ModelSpec, step: int, metrics: dict[str, str]) -> list[dict[str, Any]]:
    """Build sample-eval table rows, one per diffusion sample-step count.

    Args:
        spec: Variant identity to stamp into each row.
        step: Checkpoint step the eval was run at.
        metrics: Raw string metrics from parse_eval_log; each SAMPLE_STEPS
            value must have its full "sample_eval_num_steps_<n>_*" key set
            present (a missing key raises KeyError).

    Returns:
        One row per entry in SAMPLE_STEPS, in SAMPLE_STEPS order.
    """
    rows = []
    for sample_steps in SAMPLE_STEPS:
        # All metric keys for this sample-step count share one prefix.
        prefix = f"sample_eval_num_steps_{sample_steps}_"
        timing = parse_timing(metrics[f"{prefix}per_batch_timing_seconds"])
        rows.append(
            {
                "model_variant": spec.model_variant,
                "config_name": spec.config_name,
                "exp_name": spec.exp_name,
                "checkpoint_step": step,
                "checkpoint_path": resolve_checkpoint_path(metrics["checkpoint_path"]),
                "repo_id": metrics["repo_id_used"],
                "sample_num_steps": sample_steps,
                "sample_num_batches": int(metrics[f"{prefix}num_batches"]),
                "mean_masked_mae": parse_float(metrics[f"{prefix}mean_masked_mae"]),
                "std_masked_mae": parse_float(metrics[f"{prefix}std_masked_mae"]),
                "mean_left_arm_mae": parse_float(metrics[f"{prefix}mean_left_arm_mae"]),
                "std_left_arm_mae": parse_float(metrics[f"{prefix}std_left_arm_mae"]),
                "mean_right_arm_mae": parse_float(metrics[f"{prefix}mean_right_arm_mae"]),
                "std_right_arm_mae": parse_float(metrics[f"{prefix}std_right_arm_mae"]),
                "mean_left_joint_mae": parse_float(metrics[f"{prefix}mean_left_joint_mae"]),
                "std_left_joint_mae": parse_float(metrics[f"{prefix}std_left_joint_mae"]),
                "mean_left_gripper_mae": parse_float(metrics[f"{prefix}mean_left_gripper_mae"]),
                "std_left_gripper_mae": parse_float(metrics[f"{prefix}std_left_gripper_mae"]),
                "mean_right_joint_mae": parse_float(metrics[f"{prefix}mean_right_joint_mae"]),
                "std_right_joint_mae": parse_float(metrics[f"{prefix}std_right_joint_mae"]),
                "mean_right_gripper_mae": parse_float(metrics[f"{prefix}mean_right_gripper_mae"]),
                "std_right_gripper_mae": parse_float(metrics[f"{prefix}std_right_gripper_mae"]),
                "mean_left_right_imbalance_mae": parse_float(metrics[f"{prefix}mean_left_right_imbalance_mae"]),
                "std_left_right_imbalance_mae": parse_float(metrics[f"{prefix}std_left_right_imbalance_mae"]),
                "per_batch_time_mean_s": timing["mean"],
                "per_batch_time_std_s": timing["std"],
                "per_batch_time_min_s": timing["min"],
                "per_batch_time_max_s": timing["max"],
            }
        )
    return rows
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def row_index(rows: list[dict[str, Any]], *keys: str) -> dict[tuple[Any, ...], dict[str, Any]]:
    """Index *rows* by the tuple of their values at *keys* (later rows win on ties)."""
    indexed: dict[tuple[Any, ...], dict[str, Any]] = {}
    for entry in rows:
        indexed[tuple(entry[key] for key in keys)] = entry
    return indexed
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def average(values: list[float]) -> float | None:
    """Arithmetic mean of *values*, or None when the list is empty."""
    if not values:
        return None
    return statistics.fmean(values)
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def summarise_stability(train_rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarise training stability per model variant from parsed step rows.

    For every variant: peak CUDA memory and "dead" gradient columns (grad_*
    columns that are zero/absent across all steps). For the
    "split_communicating" variant additionally a "communication" sub-dict with
    gate / attention-mass / comm-gradient aggregates and an "active" flag.
    """
    by_variant: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for row in train_rows:
        by_variant[row["model_variant"]].append(row)

    summary: dict[str, Any] = {}
    for variant, rows in by_variant.items():
        # Only columns present in the FIRST row are considered; assumes step
        # rows of one variant share a column set — TODO confirm.
        grad_columns = [column for column in rows[0] if column.startswith("grad_")]
        dead_columns = []
        for column in grad_columns:
            # "Dead" = never non-zero in any step (missing/None counts as 0).
            if max(abs(float(row.get(column, 0.0) or 0.0)) for row in rows) == 0.0:
                dead_columns.append(column)
        summary[variant] = {
            "max_cuda_memory_gb": max(row["max_cuda_memory_gb"] for row in rows),
            "dead_gradient_columns": dead_columns,
        }

    split_comm_rows = by_variant.get("split_communicating", [])
    # Union of per-layer diagnostic columns across all rows, natural-sorted so
    # layer_10 follows layer_9.
    gate_columns = sorted(
        {column for row in split_comm_rows for column in row if column.startswith("cross_arm_comm_gate_layer_")},
        key=natural_key,
    )
    attn_columns = sorted(
        {column for row in split_comm_rows for column in row if column.startswith("cross_arm_attention_mass_layer_")},
        key=natural_key,
    )
    if split_comm_rows:
        gate_values = [abs(float(row[column])) for row in split_comm_rows for column in gate_columns if column in row]
        attn_values = [float(row[column]) for row in split_comm_rows for column in attn_columns if column in row]
        grad_comm_values = [float(row.get("grad_cross_arm_comm", 0.0)) for row in split_comm_rows]
        # NOTE: "or 0.0" converts a None mean (empty list) to 0.0; it also
        # maps an exact-zero mean to 0.0, which is the same value.
        summary["split_communicating"]["communication"] = {
            "gate_abs_max": max(gate_values) if gate_values else 0.0,
            "gate_abs_mean": average(gate_values) or 0.0,
            "attention_mass_mean": average(attn_values) or 0.0,
            "attention_mass_max": max(attn_values) if attn_values else 0.0,
            "grad_cross_arm_comm_mean": average(grad_comm_values) or 0.0,
            "grad_cross_arm_comm_max": max(grad_comm_values) if grad_comm_values else 0.0,
            # Communication counts as active only when both attention mass and
            # the comm-path gradient are strictly positive somewhere.
            "active": bool(attn_values and max(attn_values) > 0.0 and max(grad_comm_values, default=0.0) > 0.0),
        }
    return summary
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
def load_prior_metrics(prior_metrics_root: pathlib.Path) -> dict[str, Any]:
    """Load step-2000 baseline/parallel metrics from a prior study's CSV tables.

    Returns an empty dict when ``prior_metrics_root`` does not exist.  Otherwise
    the result may contain a ``"teacher_2000"`` entry (mean_val_loss at
    checkpoint step 2000) and/or a ``"sample_2000_step4"`` entry
    (mean_masked_mae at checkpoint step 2000, 4 sample steps), each keyed by
    ``"baseline"`` and ``"parallel"``; missing values come back as None via
    parse_optional_float.
    """
    metrics: dict[str, Any] = {}
    if not prior_metrics_root.exists():
        return metrics

    def read_rows(csv_path: pathlib.Path) -> list[dict[str, str]]:
        # Read the whole CSV into per-row dicts keyed by the header line.
        with csv_path.open(encoding="utf-8", newline="") as handle:
            return list(csv.DictReader(handle))

    teacher_path = prior_metrics_root / "teacher_forced_eval_table.csv"
    if teacher_path.exists():
        by_model = {
            row["model"]: row
            for row in read_rows(teacher_path)
            if int(row["checkpoint_step"]) == 2000
        }
        metrics["teacher_2000"] = {
            model: parse_optional_float(by_model.get(model, {}).get("mean_val_loss"))
            for model in ("baseline", "parallel")
        }

    sample_path = prior_metrics_root / "sample_eval_table.csv"
    if sample_path.exists():
        by_model = {
            row["model"]: row
            for row in read_rows(sample_path)
            if int(row["checkpoint_step"]) == 2000 and int(row["num_steps"]) == 4
        }
        metrics["sample_2000_step4"] = {
            model: parse_optional_float(by_model.get(model, {}).get("mean_masked_mae"))
            for model in ("baseline", "parallel")
        }

    return metrics
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
def build_summary(
    artifact_root: pathlib.Path,
    teacher_rows: list[dict[str, Any]],
    sample_rows: list[dict[str, Any]],
    train_rows: list[dict[str, Any]],
    startup_summaries: dict[str, dict[str, str]],
    log_errors: dict[str, list[str]],
    sample_batch_size_used: str,
    prior_metrics: dict[str, Any],
) -> dict[str, Any]:
    """Assemble the full study summary dict (serialised to summary.json by main).

    Combines per-checkpoint teacher-forced and sample eval rows, training
    diagnostics, startup/bootstrap sanity info, and an optional regression
    comparison against a prior study's step-2000 metrics.

    Assumes every (variant, step) combination in MODEL_SPECS x STEP_ORDER and
    every (variant, step, sample_steps) combination exists in the row tables —
    missing keys raise KeyError rather than being skipped.
    """
    # Index eval rows by composite keys for O(1) lookups below.
    teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
    sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")

    # --- Step-0 gap analysis: each variant's delta vs the "shared" baseline. ---
    step0_teacher_gaps = {}
    step0_sample_gaps = {}
    shared_step0_teacher = teacher_by_key[("shared", 0)]["mean_val_loss"]
    for spec in MODEL_SPECS:
        variant = spec.model_variant
        teacher_value = teacher_by_key[(variant, 0)]["mean_val_loss"]
        step0_teacher_gaps[variant] = teacher_value - shared_step0_teacher
        sample_deltas = []
        for sample_steps in SAMPLE_STEPS:
            variant_row = sample_by_key[(variant, 0, sample_steps)]
            shared_row = sample_by_key[("shared", 0, sample_steps)]
            sample_deltas.append(variant_row["mean_masked_mae"] - shared_row["mean_masked_mae"])
        step0_sample_gaps[variant] = {
            "average_delta_vs_shared": average(sample_deltas),
            "per_steps_delta_vs_shared": {str(step): delta for step, delta in zip(SAMPLE_STEPS, sample_deltas, strict=True)},
        }

    # Rank the non-shared ("warm-started") variants by smallest absolute step-0 jump.
    warm_variants = [spec.model_variant for spec in MODEL_SPECS if spec.model_variant != "shared"]
    smallest_teacher_variant = min(warm_variants, key=lambda variant: abs(step0_teacher_gaps[variant]))
    smallest_sample_variant = min(
        warm_variants,
        # average() may return None (presumably for empty input) — treat as 0.0.
        key=lambda variant: abs(step0_sample_gaps[variant]["average_delta_vs_shared"] or 0.0),
    )

    # --- Loss/MAE improvements between consecutive checkpoints and 0 -> 2000. ---
    teacher_improvements: dict[str, dict[str, float]] = defaultdict(dict)
    sample_improvements: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict))
    for spec in MODEL_SPECS:
        variant = spec.model_variant
        for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
            teacher_improvements[variant][f"{start_step}_to_{end_step}"] = (
                teacher_by_key[(variant, start_step)]["mean_val_loss"] - teacher_by_key[(variant, end_step)]["mean_val_loss"]
            )
        teacher_improvements[variant]["0_to_2000"] = (
            teacher_by_key[(variant, 0)]["mean_val_loss"] - teacher_by_key[(variant, 2000)]["mean_val_loss"]
        )
        for sample_steps in SAMPLE_STEPS:
            for start_step, end_step in zip(STEP_ORDER[:-1], STEP_ORDER[1:], strict=True):
                sample_improvements[variant][str(sample_steps)][f"{start_step}_to_{end_step}"] = (
                    sample_by_key[(variant, start_step, sample_steps)]["mean_masked_mae"]
                    - sample_by_key[(variant, end_step, sample_steps)]["mean_masked_mae"]
                )
            sample_improvements[variant][str(sample_steps)]["0_to_2000"] = (
                sample_by_key[(variant, 0, sample_steps)]["mean_masked_mae"]
                - sample_by_key[(variant, 2000, sample_steps)]["mean_masked_mae"]
            )

    # --- Final (step 2000) rankings, best (lowest) first. ---
    teacher_2k_ranking = sorted(
        (
            {
                "model_variant": spec.model_variant,
                "mean_val_loss": teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
                "mean_left_right_imbalance": teacher_by_key[(spec.model_variant, 2000)]["mean_left_right_imbalance"],
                "improvement_0_to_2000": teacher_by_key[(spec.model_variant, 0)]["mean_val_loss"]
                - teacher_by_key[(spec.model_variant, 2000)]["mean_val_loss"],
            }
            for spec in MODEL_SPECS
        ),
        key=lambda row: row["mean_val_loss"],
    )

    sample_2k_ranking = sorted(
        (
            {
                "model_variant": spec.model_variant,
                "mean_masked_mae_step_4": sample_by_key[(spec.model_variant, 2000, 4)]["mean_masked_mae"],
                "mean_masked_mae_step_16": sample_by_key[(spec.model_variant, 2000, 16)]["mean_masked_mae"],
                # Average over all configured sample step counts, not just 4/16.
                "mean_masked_mae_average": statistics.fmean(
                    sample_by_key[(spec.model_variant, 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
                ),
            }
            for spec in MODEL_SPECS
        ),
        key=lambda row: row["mean_masked_mae_average"],
    )

    # --- Optimization stability and bootstrap sanity checks. ---
    stability = summarise_stability(train_rows)
    bootstrap_sanity = {
        spec.model_variant: {
            "step0_weight_loading_missing_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_missing_count"],
            "step0_weight_loading_unexpected_count": teacher_by_key[(spec.model_variant, 0)]["weight_loading_unexpected_count"],
        }
        for spec in MODEL_SPECS
    }
    # Presence-only check: were the invariant-check logs produced for the split runs?
    invariant_logs = {
        "split_independent": (artifact_root / "sanity_checks/check_split_independent_invariants.log").exists(),
        "split_communicating": (artifact_root / "sanity_checks/check_split_communicating_invariants.log").exists(),
    }

    # --- Regression check vs a prior study (if its metrics were loadable). ---
    prior_regression = {}
    teacher_prior = prior_metrics.get("teacher_2000", {})
    sample_prior = prior_metrics.get("sample_2000_step4", {})
    current_shared_teacher_2k = teacher_by_key[("shared", 2000)]["mean_val_loss"]
    current_head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]
    if teacher_prior:
        # "edge" = baseline minus parallel: positive means parallel was better.
        prior_delta = (teacher_prior.get("baseline") or 0.0) - (teacher_prior.get("parallel") or 0.0)
        current_delta = current_shared_teacher_2k - current_head_only_teacher_2k
        prior_regression["teacher_forced_2000"] = {
            "prior_baseline": teacher_prior.get("baseline"),
            "prior_parallel": teacher_prior.get("parallel"),
            "prior_parallel_edge": prior_delta,
            "current_shared": current_shared_teacher_2k,
            "current_head_only_parallel": current_head_only_teacher_2k,
            "current_head_only_edge": current_delta,
            # True only when both deltas are strictly the same sign (zero never matches).
            "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
        }
    if sample_prior:
        current_shared_sample_2k = sample_by_key[("shared", 2000, 4)]["mean_masked_mae"]
        current_head_only_sample_2k = sample_by_key[("head_only_parallel", 2000, 4)]["mean_masked_mae"]
        prior_delta = (sample_prior.get("baseline") or 0.0) - (sample_prior.get("parallel") or 0.0)
        current_delta = current_shared_sample_2k - current_head_only_sample_2k
        prior_regression["sample_step4_2000"] = {
            "prior_baseline": sample_prior.get("baseline"),
            "prior_parallel": sample_prior.get("parallel"),
            "prior_parallel_edge": prior_delta,
            "current_shared": current_shared_sample_2k,
            "current_head_only_parallel": current_head_only_sample_2k,
            "current_head_only_edge": current_delta,
            "direction_matches": (prior_delta > 0 and current_delta > 0) or (prior_delta < 0 and current_delta < 0),
        }

    # --- Headline split-vs-head-only comparisons at step 2000. ---
    split_ind_teacher_2k = teacher_by_key[("split_independent", 2000)]["mean_val_loss"]
    split_comm_teacher_2k = teacher_by_key[("split_communicating", 2000)]["mean_val_loss"]
    head_only_teacher_2k = teacher_by_key[("head_only_parallel", 2000)]["mean_val_loss"]

    split_ind_sample_avg_2k = statistics.fmean(
        sample_by_key[("split_independent", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
    )
    split_comm_sample_avg_2k = statistics.fmean(
        sample_by_key[("split_communicating", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
    )
    head_only_sample_avg_2k = statistics.fmean(
        sample_by_key[("head_only_parallel", 2000, sample_steps)]["mean_masked_mae"] for sample_steps in SAMPLE_STEPS
    )

    return {
        "study_name": artifact_root.name,
        "artifact_root": str(artifact_root),
        "hardware": "4x H100 80GB",
        "precision": "bfloat16",
        "train_repo_id": "lsnu/twin_dual_push_128_train",
        "val_repo_id": "lsnu/twin_dual_push_128_val",
        "packed_layout": "[L8, 0x8, R8, 0x8]",
        "sample_batch_size_used": sample_batch_size_used,
        "bootstrap_sanity": {
            "step0_weight_loading": bootstrap_sanity,
            "invariant_logs_present": invariant_logs,
        },
        "step0_gap_analysis": {
            "teacher_forced_delta_vs_shared": step0_teacher_gaps,
            "sample_avg_delta_vs_shared": {
                variant: payload["average_delta_vs_shared"] for variant, payload in step0_sample_gaps.items()
            },
            "sample_delta_vs_shared_by_steps": step0_sample_gaps,
            "smallest_teacher_forced_jump": smallest_teacher_variant,
            "smallest_sample_jump": smallest_sample_variant,
        },
        "teacher_improvements": teacher_improvements,
        "sample_improvements": sample_improvements,
        "teacher_2k_ranking": teacher_2k_ranking,
        "sample_2k_ranking": sample_2k_ranking,
        "optimization_stability": {
            "summary": stability,
            "log_errors": log_errors,
        },
        "head_only_vs_prior_5k_study": prior_regression,
        "answer_summary": {
            "teacher_2k_best": teacher_2k_ranking[0]["model_variant"],
            "sample_2k_best": sample_2k_ranking[0]["model_variant"],
            "split_models_beat_head_only_teacher_2k": {
                "split_independent": split_ind_teacher_2k < head_only_teacher_2k,
                "split_communicating": split_comm_teacher_2k < head_only_teacher_2k,
            },
            "split_models_beat_head_only_sample_2k_avg": {
                "split_independent": split_ind_sample_avg_2k < head_only_sample_avg_2k,
                "split_communicating": split_comm_sample_avg_2k < head_only_sample_avg_2k,
            },
            "split_comm_vs_split_ind_teacher_2k_delta": split_comm_teacher_2k - split_ind_teacher_2k,
            "split_comm_vs_split_ind_sample_2k_avg_delta": split_comm_sample_avg_2k - split_ind_sample_avg_2k,
        },
        "startup_summaries": startup_summaries,
    }
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
def write_startup_summaries(path: pathlib.Path, startup_summaries: dict[str, dict[str, str]]) -> None:
    """Write a human-readable per-variant startup report to ``path``.

    Each variant in MODEL_SPECS gets a ``[variant]`` section whose known
    startup keys are printed in a fixed canonical order; keys absent from a
    variant's summary are silently skipped.
    """
    # Canonical presentation order for startup keys.
    ordered_keys = [
        "weight_missing_count",
        "weight_missing_keys",
        "weight_unexpected_count",
        "weight_unexpected_keys",
        "config_name",
        "dataset_repo_id",
        "norm_stats_file",
        "norm_stats_summary",
        "checkpoint_source",
        "model_type",
        "packed_transforms",
        "world_size",
        "batch_size",
        "num_workers",
        "precision",
        "lr_schedule",
        "save_log_intervals",
        "action_loss_mask",
        "active_mask_dims",
        "masked_dims",
        "gradient_buckets",
    ]
    output_lines: list[str] = []
    for spec in MODEL_SPECS:
        output_lines.append(f"[{spec.model_variant}]")
        startup = startup_summaries.get(spec.model_variant, {})
        output_lines.extend(f"{key}: {startup[key]}" for key in ordered_keys if key in startup)
        # Blank separator line between sections (trailing one trimmed below).
        output_lines.append("")
    path.write_text("\n".join(output_lines).rstrip() + "\n", encoding="utf-8")
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def build_markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render a markdown table from header cells and body rows (all strings)."""

    def render_row(cells: list[str]) -> str:
        # One pipe-delimited markdown table row.
        return "| " + " | ".join(cells) + " |"

    separator_cells = ["---" for _ in headers]
    rendered_body = "\n".join(render_row(row) for row in rows)
    return "\n".join([render_row(headers), render_row(separator_cells), rendered_body])
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
def write_readme(
    path: pathlib.Path,
    summary: dict[str, Any],
    teacher_rows: list[dict[str, Any]],
    sample_rows: list[dict[str, Any]],
) -> None:
    """Render the study README.md from the summary dict and raw eval tables.

    Builds quick-answer bullets, step-0 and step-2000 comparison tables,
    stability notes, a regression-vs-prior-study section, and a file listing,
    then writes the whole document to ``path``.
    """
    teacher_by_key = row_index(teacher_rows, "model_variant", "checkpoint_step")
    sample_by_key = row_index(sample_rows, "model_variant", "checkpoint_step", "sample_num_steps")

    # Per-variant rows for the three markdown tables below.
    step0_table_rows = []
    final_teacher_rows = []
    final_sample_rows = []
    for spec in MODEL_SPECS:
        variant = spec.model_variant
        step0 = teacher_by_key[(variant, 0)]
        final_teacher = teacher_by_key[(variant, 2000)]
        step0_table_rows.append(
            [
                variant,
                format_float(step0["mean_val_loss"]),
                format_delta(summary["step0_gap_analysis"]["teacher_forced_delta_vs_shared"][variant]),
                format_float(step0["mean_left_right_imbalance"]),
            ]
        )
        final_teacher_rows.append(
            [
                variant,
                format_float(final_teacher["mean_val_loss"]),
                format_float(summary["teacher_improvements"][variant]["0_to_2000"]),
                format_float(final_teacher["mean_left_right_imbalance"]),
            ]
        )
        # Only sample step counts 1, 4, and 16 are shown in the README table.
        final_sample_rows.append(
            [
                variant,
                format_float(sample_by_key[(variant, 2000, 1)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 4)]["mean_masked_mae"]),
                format_float(sample_by_key[(variant, 2000, 16)]["mean_masked_mae"]),
            ]
        )

    stability = summary["optimization_stability"]["summary"]
    # One "variant=X.XXGB" fragment per model for the VRAM bullet.
    memory_note = ", ".join(
        f"{variant}={format_float(stability[variant]['max_cuda_memory_gb'], digits=2)}GB"
        for variant in [spec.model_variant for spec in MODEL_SPECS]
    )
    split_comm_comm = stability.get("split_communicating", {}).get("communication", {})
    # These are None/absent when the prior study's metrics were unavailable.
    prior_teacher = summary.get("head_only_vs_prior_5k_study", {}).get("teacher_forced_2000")
    prior_sample = summary.get("head_only_vs_prior_5k_study", {}).get("sample_step4_2000")

    readme = [
        f"# {summary['study_name']}",
        "",
        "Controlled 4-way early-training comparison on packed TWIN dual-push `128` with a shared step-0 bootstrap check, fresh `2K` training runs, and fixed validation settings at steps `0`, `100`, `500`, and `2000`.",
        "",
        "## Quick answers",
        f"- Smallest step-0 teacher-forced jump vs `shared`: `{summary['step0_gap_analysis']['smallest_teacher_forced_jump']}` (`{format_delta(summary['step0_gap_analysis']['teacher_forced_delta_vs_shared'][summary['step0_gap_analysis']['smallest_teacher_forced_jump']])}`).",
        f"- Smallest step-0 sample jump vs `shared` (average over sample steps `1,2,4,8,16`): `{summary['step0_gap_analysis']['smallest_sample_jump']}` (`{format_delta(summary['step0_gap_analysis']['sample_avg_delta_vs_shared'][summary['step0_gap_analysis']['smallest_sample_jump']])}`).",
        f"- Best teacher-forced result by step `2000`: `{summary['answer_summary']['teacher_2k_best']}`.",
        f"- Best sample result by step `2000` (average masked MAE over sample steps `1,2,4,8,16`): `{summary['answer_summary']['sample_2k_best']}`.",
        f"- Split vs head-only by step `2000`: teacher-forced beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_teacher_2k']['split_communicating']}`; sample beat flags `split_independent={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_independent']}`, `split_communicating={summary['answer_summary']['split_models_beat_head_only_sample_2k_avg']['split_communicating']}`.",
        f"- `split_communicating` vs `split_independent` at `2000`: teacher delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_teacher_2k_delta'])}`, sample-average delta `{format_delta(summary['answer_summary']['split_comm_vs_split_ind_sample_2k_avg_delta'])}`.",
        "",
        "## Step-0 teacher-forced comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "delta_vs_shared", "left_right_imbalance"],
            step0_table_rows,
        ),
        "",
        "## Step-2000 comparison",
        build_markdown_table(
            ["model", "mean_val_loss", "0_to_2000_improvement", "left_right_imbalance"],
            final_teacher_rows,
        ),
        "",
        build_markdown_table(
            ["model", "1-step_mae", "4-step_mae", "16-step_mae"],
            final_sample_rows,
        ),
        "",
        "## Stability notes",
        f"- Sample batch size used for all official evals: `{summary['sample_batch_size_used']}`.",
        f"- Step-0 weight loading was clean for all four variants: missing and unexpected key counts were zero in every step-0 eval log.",
        f"- Peak training VRAM by model: {memory_note}.",
        f"- `split_communicating` communication path: active=`{split_comm_comm.get('active', False)}`, `grad_cross_arm_comm_max={format_float(split_comm_comm.get('grad_cross_arm_comm_max'))}`, `attention_mass_mean={format_float(split_comm_comm.get('attention_mass_mean'))}`, `gate_abs_max={format_float(split_comm_comm.get('gate_abs_max'))}`.",
        "",
        "## Regression check vs prior dual-push screen",
    ]

    # Regression bullets are conditional on the prior study's metrics existing.
    if prior_teacher:
        readme.append(
            f"- Prior `5K` study at step `2000` had `baseline={format_float(prior_teacher['prior_baseline'])}` and `parallel={format_float(prior_teacher['prior_parallel'])}` with head-only edge `{format_delta(prior_teacher['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_teacher['current_shared'])}` and `head_only_parallel={format_float(prior_teacher['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_teacher['current_head_only_edge'])}`; direction match=`{prior_teacher['direction_matches']}`."
        )
    else:
        readme.append("- Prior teacher-forced comparison was unavailable.")

    if prior_sample:
        readme.append(
            f"- Prior `5K` study `4`-step MAE at step `2000` had `baseline={format_float(prior_sample['prior_baseline'])}` and `parallel={format_float(prior_sample['prior_parallel'])}` with head-only edge `{format_delta(prior_sample['prior_parallel_edge'])}`. This rerun has `shared={format_float(prior_sample['current_shared'])}` and `head_only_parallel={format_float(prior_sample['current_head_only_parallel'])}` with head-only edge `{format_delta(prior_sample['current_head_only_edge'])}`; direction match=`{prior_sample['direction_matches']}`."
        )
    else:
        readme.append("- Prior sample-based comparison was unavailable.")

    readme.extend(
        [
            "",
            "## Files",
            "- `metrics/teacher_forced_eval_table.csv`: all teacher-forced metrics at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/sample_eval_table.csv`: all sample-eval metrics for sample steps `1`, `2`, `4`, `8`, `16` at steps `0`, `100`, `500`, `2000`.",
            "- `metrics/training_summary.csv`: per-log-interval training diagnostics with model-specific gradient columns.",
            "- `metrics/startup_summaries.txt`: startup configuration and weight-loading summaries for each run.",
            "- `run_logs/`: full train/eval logs, including the first-five-step debug lines in each train log.",
        ]
    )

    path.write_text("\n".join(readme).rstrip() + "\n", encoding="utf-8")
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
def main() -> None:
    """Parse all run/eval logs for the study and emit CSV tables, a JSON
    summary, startup summaries, and the study README under the artifact root.
    """
    args = parse_args()
    artifact_root = pathlib.Path(args.artifact_root).resolve()
    run_logs_dir = artifact_root / "run_logs"
    metrics_dir = artifact_root / "metrics"
    metrics_dir.mkdir(parents=True, exist_ok=True)

    # Accumulators filled while walking the per-variant logs.
    teacher_rows: list[dict[str, Any]] = []
    sample_rows: list[dict[str, Any]] = []
    train_rows: list[dict[str, Any]] = []
    startup_summaries: dict[str, dict[str, str]] = {}
    log_errors: dict[str, list[str]] = {}
    sample_batch_size_used = "unknown"

    # Model-specific train-log columns beyond the shared base set.
    extra_train_columns: set[str] = set()

    for spec in MODEL_SPECS:
        train_log = run_logs_dir / f"{spec.exp_name}.log"
        startup, train_log_rows, train_errors = parse_train_log(train_log, spec)
        startup_summaries[spec.model_variant] = startup
        train_rows.extend(train_log_rows)
        extra_train_columns.update(column for row in train_log_rows for column in row if column not in BASE_TRAIN_COLUMNS)
        if train_errors:
            log_errors[f"{spec.model_variant}:train"] = train_errors

        for step in STEP_ORDER:
            eval_log = run_logs_dir / f"{spec.exp_name}_val_{step}.log"
            eval_metrics, eval_sample_batch_size, eval_errors = parse_eval_log(eval_log)
            # Last eval log that reports a batch size wins; all runs are
            # expected to use the same value.
            if eval_sample_batch_size is not None:
                sample_batch_size_used = str(eval_sample_batch_size)
            teacher_rows.append(build_teacher_row(spec, step, eval_metrics))
            sample_rows.extend(build_sample_rows(spec, step, eval_metrics))
            if eval_errors:
                log_errors[f"{spec.model_variant}:eval:{step}"] = eval_errors

    # Deterministic ordering for the output CSVs.
    teacher_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"]))
    sample_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["checkpoint_step"], row["sample_num_steps"]))
    train_rows.sort(key=lambda row: (natural_key(row["model_variant"]), row["step"]))

    # Fixed column orders for the two eval CSVs.
    teacher_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "num_batches",
        "mean_val_loss",
        "std_val_loss",
        "mean_left_arm_loss",
        "std_left_arm_loss",
        "mean_right_arm_loss",
        "std_right_arm_loss",
        "mean_left_joint_loss",
        "std_left_joint_loss",
        "mean_left_gripper_loss",
        "std_left_gripper_loss",
        "mean_right_joint_loss",
        "std_right_joint_loss",
        "mean_right_gripper_loss",
        "std_right_gripper_loss",
        "mean_left_right_imbalance",
        "std_left_right_imbalance",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
        "weight_loading_missing_count",
        "weight_loading_unexpected_count",
    ]
    sample_columns = [
        "model_variant",
        "config_name",
        "exp_name",
        "checkpoint_step",
        "checkpoint_path",
        "repo_id",
        "sample_num_steps",
        "sample_num_batches",
        "mean_masked_mae",
        "std_masked_mae",
        "mean_left_arm_mae",
        "std_left_arm_mae",
        "mean_right_arm_mae",
        "std_right_arm_mae",
        "mean_left_joint_mae",
        "std_left_joint_mae",
        "mean_left_gripper_mae",
        "std_left_gripper_mae",
        "mean_right_joint_mae",
        "std_right_joint_mae",
        "mean_right_gripper_mae",
        "std_right_gripper_mae",
        "mean_left_right_imbalance_mae",
        "std_left_right_imbalance_mae",
        "per_batch_time_mean_s",
        "per_batch_time_std_s",
        "per_batch_time_min_s",
        "per_batch_time_max_s",
    ]

    # Base columns first, then any model-specific extras in natural-sort order.
    ordered_extra_train_columns = sorted(extra_train_columns, key=natural_key)
    train_columns = BASE_TRAIN_COLUMNS + ordered_extra_train_columns

    write_csv(metrics_dir / "teacher_forced_eval_table.csv", teacher_rows, teacher_columns)
    write_csv(metrics_dir / "sample_eval_table.csv", sample_rows, sample_columns)
    write_csv(metrics_dir / "training_summary.csv", train_rows, train_columns)
    write_startup_summaries(metrics_dir / "startup_summaries.txt", startup_summaries)

    # Prior-study comparison is optional; empty dict disables that section.
    prior_metrics = load_prior_metrics(pathlib.Path(args.prior_metrics_root)) if args.prior_metrics_root else {}
    summary = build_summary(
        artifact_root,
        teacher_rows,
        sample_rows,
        train_rows,
        startup_summaries,
        log_errors,
        sample_batch_size_used,
        prior_metrics,
    )
    (metrics_dir / "summary.json").write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_readme(artifact_root / "README.md", summary, teacher_rows, sample_rows)


if __name__ == "__main__":
    main()
|
openpi/scripts/prune_stepcmp_checkpoints.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import pathlib
|
| 5 |
+
import shutil
|
| 6 |
+
import time
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def utc_ts() -> str:
    """Return the current UTC time as 'YYYY-MM-DD HH:MM:SS UTC'."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime("%Y-%m-%d %H:%M:%S UTC")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def prune_once(roots: list[pathlib.Path], keep_steps: set[str]) -> int:
    """Delete checkpoint step directories not listed in ``keep_steps``.

    Only immediate subdirectories of each root whose name is purely numeric
    are eligible; ``tmp_*`` staging directories, non-directories, and missing
    roots are always skipped.  Deletion errors are ignored (rmtree with
    ignore_errors=True), so the returned count is of attempted removals.
    """
    removed_count = 0
    for root in roots:
        if not root.is_dir():
            continue
        for entry in root.iterdir():
            # Skip anything that is not a prunable numeric step directory.
            should_skip = (
                not entry.is_dir()
                or entry.name.startswith("tmp_")
                or not entry.name.isdigit()
                or entry.name in keep_steps
            )
            if should_skip:
                continue
            shutil.rmtree(entry, ignore_errors=True)
            print(f"[{utc_ts()}] pruned {entry}", flush=True)
            removed_count += 1
    return removed_count
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def main() -> None:
    """Parse CLI arguments and run the checkpoint retention loop forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--interval-seconds", type=int, default=30)
    parser.add_argument("--keep-steps", nargs="+", default=["100", "500", "2000"])
    parser.add_argument("roots", nargs="+")
    args = parser.parse_args()

    root_paths = [pathlib.Path(item) for item in args.roots]
    retained_steps = set(args.keep_steps)
    banner = (
        f"[{utc_ts()}] retention pruner started "
        f"interval_s={args.interval_seconds} keep_steps={sorted(retained_steps)}"
    )
    print(banner, flush=True)
    # Poll indefinitely; the process is expected to be killed externally.
    while True:
        prune_once(root_paths, retained_steps)
        time.sleep(args.interval_seconds)


if __name__ == "__main__":
    main()
|
openpi/scripts/run_twin_dual_push_128_stepcmp_2k.sh
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runner for the packed dual-push 128 step-comparison experiment (2k steps):
# trains four model variants and evaluates fixed checkpoints, writing logs,
# metrics, repro manifests and sanity checks under $ARTIFACT_ROOT.
set -euo pipefail

# Repo root = parent directory of the directory containing this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venv/bin/activate"
PYTHON_BIN="$ROOT/.venv/bin/python"
# Artifact layout; both knobs are overridable from the environment.
ARTIFACT_DATE="${ARTIFACT_DATE:-$(date -u +%Y%m%d)}"
ARTIFACT_ROOT="${ARTIFACT_ROOT:-/workspace/pi05tests/artifacts/twin_dual_push_128_stepcmp_2k_${ARTIFACT_DATE}}"
RUN_LOG_DIR="$ARTIFACT_ROOT/run_logs"
METRICS_DIR="$ARTIFACT_ROOT/metrics"
REPRO_DIR="$ARTIFACT_ROOT/repro"
ENV_DIR="$ARTIFACT_ROOT/environment"
SANITY_DIR="$ARTIFACT_ROOT/sanity_checks"
mkdir -p "$RUN_LOG_DIR" "$METRICS_DIR" "$REPRO_DIR" "$ENV_DIR" "$SANITY_DIR" "$ROOT/run_logs"

# Hugging Face cache locations pinned to /workspace so downloads persist.
export HF_TOKEN="${HF_TOKEN:-}"
export HF_HOME=/workspace/.hf
export HF_HUB_CACHE=/workspace/.hf/hub
export HF_DATASETS_CACHE=/workspace/.hf/datasets
export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
export XDG_CACHE_HOME=/workspace/.cache
# OpenPI / training-runtime knobs.
export OPENPI_LEROBOT_HOME=/workspace/lerobot
export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT/src"
# Single-threaded BLAS/numexpr to avoid CPU oversubscription across workers.
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
# Dump Python tracebacks on fatal signals (hangs/segfaults).
export PYTHONFAULTHANDLER=1

cd "$ROOT"
source "$VENV"
|
| 34 |
+
|
| 35 |
+
# Dataset repos and evaluation sweep parameters.
TRAIN_REPO="lsnu/twin_dual_push_128_train"
VAL_REPO="lsnu/twin_dual_push_128_val"
TEACHER_VAL_BATCHES=100
SAMPLE_VAL_BATCHES=64
SAMPLE_NUM_STEPS="1,2,4,8,16"
# Earlier 5k-run metrics to compare against during collection.
PRIOR_METRICS_ROOT="/workspace/pi05tests/artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics"

# Per-variant lookup tables, all keyed by the short variant key
# (shared | head_only | split_ind | split_comm).

# Human-readable variant name used in logs and status files.
declare -A MODEL_VARIANT=(
  [shared]="shared"
  [head_only]="head_only_parallel"
  [split_ind]="split_independent"
  [split_comm]="split_communicating"
)

# Training config name passed to train/eval scripts.
declare -A CONFIG_NAME=(
  [shared]="pi05_twin_dual_push_128_packed_baseline_pytorch_5k"
  [head_only]="pi05_twin_dual_push_128_packed_parallel_pytorch_5k"
  [split_ind]="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k"
  [split_comm]="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k"
)

# Warm-start (step-0) checkpoint directory for each variant.
declare -A STEP0_CKPT=(
  [shared]="/workspace/checkpoints/pi05_base_single_pytorch"
  [head_only]="/workspace/checkpoints/pi05_base_parallel_packed_from_single"
  [split_ind]="/workspace/checkpoints/pi05_base_split_independent_packed_from_single"
  [split_comm]="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single"
)

# Experiment name for each training run (also names checkpoint/log paths).
declare -A EXP_NAME=(
  [shared]="dual_push_128_stepcmp_shared_2k"
  [head_only]="dual_push_128_stepcmp_head_only_2k"
  [split_ind]="dual_push_128_stepcmp_split_ind_2k"
  [split_comm]="dual_push_128_stepcmp_split_comm_2k"
)

# Background-job bookkeeping: PIDs/labels of in-flight eval jobs, plus the
# PID of the checkpoint-retention pruner loop (empty when not running).
eval_pids=()
eval_labels=()
checkpoint_pruner_pid=""
|
| 73 |
+
|
| 74 |
+
log() {
  # Print a UTC-timestamped message to stdout.
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*"
}
|
| 77 |
+
|
| 78 |
+
pruner_log() {
  # Append a UTC-timestamped line to the checkpoint pruner's log file.
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >>"$RUN_LOG_DIR/checkpoint_retention_pruner.log"
}
|
| 81 |
+
|
| 82 |
+
trim_env_snapshot() {
|
| 83 |
+
env | sort | grep -E '^(HF_|HUGGINGFACE_|OPENPI_|PYTORCH_|PYTHONPATH|TOKENIZERS_PARALLELISM|XDG_CACHE_HOME)=' >"$ENV_DIR/env_selected.txt"
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
save_environment_snapshot() {
  # Capture a reproducibility snapshot of the host/software environment
  # into $ENV_DIR: timestamp, kernel, Python/pip state, torch env report,
  # GPU inventory/topology, disk usage, and selected env vars.
  date -u '+%Y-%m-%d %H:%M:%S UTC' >"$ENV_DIR/date_utc.txt"
  uname -a >"$ENV_DIR/uname.txt"
  python --version >"$ENV_DIR/python_version.txt" 2>&1
  pip freeze >"$ENV_DIR/pip_freeze.txt"
  # collect_env can hang on some setups; cap it and record the timeout.
  if ! timeout 120s python -m torch.utils.collect_env >"$ENV_DIR/torch_env.txt" 2>&1; then
    echo "torch.utils.collect_env timed out after 120 seconds" >>"$ENV_DIR/torch_env.txt"
  fi
  # NOTE(review): these run unguarded under `set -e`, so a missing
  # nvidia-smi aborts the whole run — presumably intentional on GPU hosts.
  nvidia-smi >"$ENV_DIR/nvidia_smi.txt"
  nvidia-smi topo -m >"$ENV_DIR/nvidia_smi_topo.txt"
  df -h /workspace >"$ENV_DIR/df_workspace.txt"
  trim_env_snapshot
}
|
| 99 |
+
|
| 100 |
+
copy_repro_manifests() {
  # Copy this runner and the metrics collector into $REPRO_DIR, and record
  # where every step-0 and training checkpoint lives for later replay.
  cp "$0" "$REPRO_DIR/commands_stepcmp.sh"
  cp "$ROOT/scripts/collect_twin_dual_push_128_stepcmp_metrics.py" "$REPRO_DIR/collect_twin_dual_push_128_stepcmp_metrics.py"
  # Unquoted heredoc: variables below expand at write time.
  cat >"$REPRO_DIR/checkpoint_locations.txt" <<EOF
shared_step0=${STEP0_CKPT[shared]}
head_only_step0=${STEP0_CKPT[head_only]}
split_independent_step0=${STEP0_CKPT[split_ind]}
split_communicating_step0=${STEP0_CKPT[split_comm]}
shared_train_root=$ROOT/checkpoints/${CONFIG_NAME[shared]}/${EXP_NAME[shared]}
head_only_train_root=$ROOT/checkpoints/${CONFIG_NAME[head_only]}/${EXP_NAME[head_only]}
split_independent_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_ind]}/${EXP_NAME[split_ind]}
split_communicating_train_root=$ROOT/checkpoints/${CONFIG_NAME[split_comm]}/${EXP_NAME[split_comm]}
artifact_root=$ARTIFACT_ROOT
EOF
}
|
| 115 |
+
|
| 116 |
+
require_file() {
  # Abort the whole run unless $1 is an existing regular file.
  local path="$1"
  [[ -f "$path" ]] && return 0
  log "required file missing: $path"
  exit 1
}
|
| 123 |
+
|
| 124 |
+
require_dir() {
  # Abort the whole run unless $1 is an existing directory.
  local path="$1"
  [[ -d "$path" ]] && return 0
  log "required directory missing: $path"
  exit 1
}
|
| 131 |
+
|
| 132 |
+
norm_stats_path_for_key() {
  # Print the norm_stats.json path under the variant's asset directory.
  local key="$1"
  printf '%s\n' "$ROOT/assets/${CONFIG_NAME[$key]}/$TRAIN_REPO/norm_stats.json"
}
|
| 136 |
+
|
| 137 |
+
ensure_packed_dual_push_norm_stats() {
  # Verify that the two split-expert configs carry identical norm stats,
  # then propagate that canonical copy to any variant missing its own and
  # assert all four variants end up byte-identical (via sha256). Writes a
  # status report to $SANITY_DIR/norm_stats_status.txt and exits non-zero
  # on any mismatch.
  local split_ind_stats
  local split_comm_stats
  split_ind_stats="$(norm_stats_path_for_key split_ind)"
  split_comm_stats="$(norm_stats_path_for_key split_comm)"

  # Both split configs must already have their stats on disk.
  require_file "$split_ind_stats"
  require_file "$split_comm_stats"

  local split_ind_sha
  local split_comm_sha
  split_ind_sha="$(sha256sum "$split_ind_stats" | awk '{print $1}')"
  split_comm_sha="$(sha256sum "$split_comm_stats" | awk '{print $1}')"

  # The two split variants must agree before either is treated as canonical.
  if [[ "$split_ind_sha" != "$split_comm_sha" ]]; then
    log "packed dual-push split norm stats differ across split configs"
    echo "split_ind=$split_ind_stats sha256=$split_ind_sha" >"$SANITY_DIR/norm_stats_status.txt"
    echo "split_comm=$split_comm_stats sha256=$split_comm_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    exit 1
  fi

  # split_ind's copy is the canonical source from here on.
  local canonical_stats="$split_ind_stats"
  local key
  : >"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_source=$canonical_stats" >>"$SANITY_DIR/norm_stats_status.txt"
  echo "canonical_sha256=$split_ind_sha" >>"$SANITY_DIR/norm_stats_status.txt"

  for key in shared head_only split_ind split_comm; do
    local dst
    local dst_sha
    dst="$(norm_stats_path_for_key "$key")"
    # Restore missing copies from the canonical file.
    if [[ ! -f "$dst" ]]; then
      mkdir -p "$(dirname "$dst")"
      cp "$canonical_stats" "$dst"
      log "restored missing packed dual-push norm stats for ${MODEL_VARIANT[$key]} -> $dst"
    fi
    dst_sha="$(sha256sum "$dst" | awk '{print $1}')"
    echo "${MODEL_VARIANT[$key]}=$dst sha256=$dst_sha" >>"$SANITY_DIR/norm_stats_status.txt"
    # Any pre-existing copy that diverges from canonical is a hard error.
    if [[ "$dst_sha" != "$split_ind_sha" ]]; then
      log "packed dual-push norm stats mismatch for ${MODEL_VARIANT[$key]}: $dst"
      exit 1
    fi
  done
}
|
| 181 |
+
|
| 182 |
+
ensure_bootstrap_checkpoints() {
  # Ensure the three derived warm-start (step-0) checkpoints exist,
  # regenerating any missing one from the single-expert base checkpoint.
  # If either split checkpoint was regenerated, rerun the split invariant
  # checks. Records what happened in bootstrap_regeneration_status.txt.
  local regenerated_any=0
  local regenerated_split=0

  # The single-expert base is the source for all regenerations; hard requirement.
  require_file "/workspace/checkpoints/pi05_base_single_pytorch/model.safetensors"

  if [[ ! -f "${STEP0_CKPT[head_only]}/model.safetensors" ]]; then
    log "regenerating head-only packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[head_only]}" \
      --output_path "${STEP0_CKPT[head_only]}" \
      >"$SANITY_DIR/init_head_only.log" 2>&1
    regenerated_any=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_ind]}/model.safetensors" ]]; then
    log "regenerating split-independent packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --output_path "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/init_split_independent.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  if [[ ! -f "${STEP0_CKPT[split_comm]}/model.safetensors" ]]; then
    log "regenerating split-communicating packed warm-start checkpoint"
    python -u scripts/init_parallel_pi05_from_single_pytorch.py \
      --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --output_path "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/init_split_communicating.log" 2>&1
    regenerated_any=1
    regenerated_split=1
  fi

  # All three derived checkpoints must now exist, regenerated or not.
  require_file "${STEP0_CKPT[head_only]}/model.safetensors"
  require_file "${STEP0_CKPT[split_ind]}/model.safetensors"
  require_file "${STEP0_CKPT[split_comm]}/model.safetensors"

  # Invariant checks only rerun when a split checkpoint changed; checks
  # for pre-existing checkpoints are assumed to have run previously.
  if [[ "$regenerated_split" -eq 1 ]]; then
    log "rerunning split invariant checks after bootstrap regeneration"
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_ind]}" \
      --checkpoint_dir "${STEP0_CKPT[split_ind]}" \
      >"$SANITY_DIR/check_split_independent_invariants.log" 2>&1
    python -u scripts/check_split_expert_invariants.py \
      --config_name "${CONFIG_NAME[split_comm]}" \
      --checkpoint_dir "${STEP0_CKPT[split_comm]}" \
      >"$SANITY_DIR/check_split_communicating_invariants.log" 2>&1
  fi

  printf 'regenerated_any=%s\nregenerated_split=%s\n' "$regenerated_any" "$regenerated_split" >"$SANITY_DIR/bootstrap_regeneration_status.txt"
}
|
| 238 |
+
|
| 239 |
+
# State for the sample-eval batch-size decision, shared by the probe helpers
# and the eval launchers: extra CLI args (possibly empty) and the chosen value.
sample_batch_size_arg=()
sample_batch_size_value="default"

run_sample_batch_probe() {
  # Run a minimal one-batch sample eval on the split_communicating step-0
  # checkpoint to test whether a given batch size fits on GPU 0.
  # $1: batch size to try; empty string means the eval script's default.
  # Output goes to $SANITY_DIR/sample_batch_size_probe.log.
  local requested_size="$1"
  local probe_log="$SANITY_DIR/sample_batch_size_probe.log"
  local -a probe_args=(
    --config_name "${CONFIG_NAME[split_comm]}"
    --checkpoint_dir "${STEP0_CKPT[split_comm]}"
    --repo_id "$VAL_REPO"
    --num_batches 1
    --num_workers 0
    --eval_seed 123
    --sample_num_batches 1
    --sample_num_steps "16"
    --sample_seed 321
  )
  if [[ -n "$requested_size" ]]; then
    probe_args+=(--sample_batch_size "$requested_size")
  fi
  CUDA_VISIBLE_DEVICES=0 python -u scripts/eval_twin_val_loss_pytorch.py \
    "${probe_args[@]}" \
    >"$probe_log" 2>&1
}
|
| 262 |
+
|
| 263 |
+
determine_sample_batch_size() {
  # Decide which --sample_batch_size the eval sweeps should use:
  #   1. honor SAMPLE_BATCH_SIZE_OVERRIDE if set;
  #   2. otherwise probe the default batch size once;
  #   3. on CUDA OOM, halve downward from 8 until a size fits.
  # Results land in sample_batch_size_arg/value and are persisted to
  # $SANITY_DIR/sample_batch_size_used.txt. Non-OOM probe failures abort.
  local override="${SAMPLE_BATCH_SIZE_OVERRIDE:-}"
  if [[ -n "$override" ]]; then
    sample_batch_size_arg=(--sample_batch_size "$override")
    sample_batch_size_value="$override"
    echo "$sample_batch_size_value" >"$SANITY_DIR/sample_batch_size_used.txt"
    return
  fi

  log "probing sample-eval batch size on split_communicating step-0 checkpoint"
  if run_sample_batch_probe ""; then
    # Default batch size fits; pass no explicit flag to the eval script.
    sample_batch_size_arg=()
    sample_batch_size_value="default"
    echo "$sample_batch_size_value" >"$SANITY_DIR/sample_batch_size_used.txt"
    return
  fi

  # Only OOM failures trigger the back-off; anything else is a real bug.
  if ! grep -qi 'out of memory' "$SANITY_DIR/sample_batch_size_probe.log"; then
    log "sample batch size probe failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
    exit 1
  fi

  # Halving back-off: 8 -> 4 -> 2 -> 1.
  local candidate=8
  while [[ "$candidate" -ge 1 ]]; do
    log "retrying sample-eval probe with --sample_batch_size=$candidate"
    if run_sample_batch_probe "$candidate"; then
      sample_batch_size_arg=(--sample_batch_size "$candidate")
      sample_batch_size_value="$candidate"
      echo "$sample_batch_size_value" >"$SANITY_DIR/sample_batch_size_used.txt"
      return
    fi
    if ! grep -qi 'out of memory' "$SANITY_DIR/sample_batch_size_probe.log"; then
      log "sample batch size retry failed for a non-OOM reason; see $SANITY_DIR/sample_batch_size_probe.log"
      exit 1
    fi
    candidate=$((candidate / 2))
  done

  # Even batch size 1 OOMed — nothing more to try.
  log "unable to find a viable sample batch size; see $SANITY_DIR/sample_batch_size_probe.log"
  exit 1
}
|
| 304 |
+
|
| 305 |
+
load_saved_sample_batch_size() {
  # Restore the batch size recorded by a previous probe run; when no record
  # exists, fall back to "default" and write that record for next time.
  local state_file="$SANITY_DIR/sample_batch_size_used.txt"
  local saved_value="default"
  if [[ -f "$state_file" ]]; then
    saved_value="$(<"$state_file")"
  else
    echo "$saved_value" >"$state_file"
  fi

  sample_batch_size_value="$saved_value"
  sample_batch_size_arg=()
  # "default" means: let the eval script pick; otherwise pass the flag.
  if [[ "$saved_value" != "default" ]]; then
    sample_batch_size_arg=(--sample_batch_size "$saved_value")
  fi
}
|
| 319 |
+
|
| 320 |
+
launch_eval_async() {
  # Launch one validation eval in the background on a specific GPU and
  # register its PID/label so wait_for_eval_jobs can reap it later.
  # $1 gpu index  $2 variant key  $3 train step (label only)
  # $4 checkpoint dir  $5 log file path
  local gpu="$1"
  local key="$2"
  local step="$3"
  local ckpt_dir="$4"
  local log_path="$5"
  # Same invocation as run_eval_sync, but backgrounded (&).
  CUDA_VISIBLE_DEVICES="$gpu" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$key]}" \
    --checkpoint_dir "$ckpt_dir" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$log_path" 2>&1 &
  eval_pids+=("$!")
  eval_labels+=("${MODEL_VARIANT[$key]} step=${step} gpu=${gpu}")
}
|
| 341 |
+
|
| 342 |
+
run_eval_sync() {
  # Run one validation eval in the foreground on a specific GPU; used when
  # PARALLEL_EVALS is disabled. Arguments mirror launch_eval_async:
  # $1 gpu index  $2 variant key  $3 train step (unused here, kept for
  # signature parity)  $4 checkpoint dir  $5 log file path
  local gpu="$1"
  local key="$2"
  local step="$3"
  local ckpt_dir="$4"
  local log_path="$5"
  CUDA_VISIBLE_DEVICES="$gpu" python -u scripts/eval_twin_val_loss_pytorch.py \
    --config_name "${CONFIG_NAME[$key]}" \
    --checkpoint_dir "$ckpt_dir" \
    --repo_id "$VAL_REPO" \
    --num_batches "$TEACHER_VAL_BATCHES" \
    --num_workers 0 \
    --eval_seed 123 \
    --sample_num_batches "$SAMPLE_VAL_BATCHES" \
    --sample_num_steps "$SAMPLE_NUM_STEPS" \
    --sample_seed 321 \
    "${sample_batch_size_arg[@]}" \
    >"$log_path" 2>&1
}
|
| 361 |
+
|
| 362 |
+
wait_for_eval_jobs() {
  # Reap every queued background eval job, then reset the queues.
  # If any job exited non-zero, log its label and abort the run — but only
  # after every job has been waited on.
  local idx
  local any_failed=0
  for idx in "${!eval_pids[@]}"; do
    wait "${eval_pids[$idx]}" || {
      log "evaluation failed: ${eval_labels[$idx]}"
      any_failed=1
    }
  done
  eval_pids=()
  eval_labels=()
  if (( any_failed )); then
    exit 1
  fi
}
|
| 377 |
+
|
| 378 |
+
eval_log_complete() {
  # Succeed only when the eval log exists and contains the final
  # num_steps=16 timing line — the marker that the sweep ran to completion.
  local log_path="$1"
  [[ -f "$log_path" ]] || return 1
  grep -q '^sample_eval_num_steps_16_per_batch_timing_seconds:' "$log_path"
}
|
| 382 |
+
|
| 383 |
+
checkpoint_roots() {
  # Print the training checkpoint root directory of each variant,
  # one per line, in the canonical variant order.
  local key
  for key in shared head_only split_ind split_comm; do
    printf '%s\n' "$ROOT/checkpoints/${CONFIG_NAME[$key]}/${EXP_NAME[$key]}"
  done
}
|
| 390 |
+
|
| 391 |
+
prune_checkpoint_roots_once() {
  # One pruning pass: under every variant's checkpoint root, delete
  # purely-numeric step directories EXCEPT the retained steps 100/500/2000.
  # tmp_* (in-progress saves) and non-numeric entries are never touched.
  local root child base
  while read -r root; do
    [[ -d "$root" ]] || continue
    for child in "$root"/*; do
      # Guard against an unmatched glob (literal "$root/*") and non-dirs.
      [[ -e "$child" ]] || continue
      [[ -d "$child" ]] || continue
      base="$(basename "$child")"
      case "$base" in
        100|500|2000|tmp_*) continue ;;
      esac
      # Only delete names that are entirely digits (i.e. step dirs).
      [[ "$base" =~ ^[0-9]+$ ]] || continue
      rm -rf -- "$child"
      pruner_log "pruned $child"
    done
  done < <(checkpoint_roots)
}
|
| 408 |
+
|
| 409 |
+
start_checkpoint_pruner() {
  # Start a background loop that prunes checkpoint roots every 30 seconds
  # (keeping only steps 100/500/2000); records its PID for later shutdown
  # via stop_checkpoint_pruner.
  pruner_log "runner checkpoint pruner started interval_s=30 keep_steps=[100,500,2000]"
  (
    while true; do
      prune_checkpoint_roots_once
      sleep 30
    done
  ) &
  checkpoint_pruner_pid="$!"
}
|
| 419 |
+
|
| 420 |
+
stop_checkpoint_pruner() {
  # Terminate and reap the background pruner loop if one is running;
  # a no-op when checkpoint_pruner_pid is empty. Installed as EXIT trap.
  [[ -n "$checkpoint_pruner_pid" ]] || return 0
  kill "$checkpoint_pruner_pid" >/dev/null 2>&1 || true
  wait "$checkpoint_pruner_pid" 2>/dev/null || true
  checkpoint_pruner_pid=""
}
|
| 427 |
+
|
| 428 |
+
run_step0_evals() {
  # Evaluate all four warm-start (step-0) checkpoints, one GPU each.
  # Completed evals (per eval_log_complete) are skipped, so reruns resume.
  # PARALLEL_EVALS=1 (default) fans the jobs out; otherwise they run serially.
  log "starting step-0 evaluation sweep"
  local pending=0
  local gpu key ckpt log_path
  # The heredoc below drives the loop: "gpu key checkpoint log_path" rows.
  while read -r gpu key ckpt log_path; do
    if eval_log_complete "$log_path"; then
      log "step-0 eval already complete for ${MODEL_VARIANT[$key]}"
      continue
    fi
    if [[ "${PARALLEL_EVALS:-1}" == "1" ]]; then
      launch_eval_async "$gpu" "$key" 0 "$ckpt" "$log_path"
      pending=1
    else
      run_eval_sync "$gpu" "$key" 0 "$ckpt" "$log_path"
    fi
  done <<EOF
0 shared ${STEP0_CKPT[shared]} $RUN_LOG_DIR/${EXP_NAME[shared]}_val_0.log
1 head_only ${STEP0_CKPT[head_only]} $RUN_LOG_DIR/${EXP_NAME[head_only]}_val_0.log
2 split_ind ${STEP0_CKPT[split_ind]} $RUN_LOG_DIR/${EXP_NAME[split_ind]}_val_0.log
3 split_comm ${STEP0_CKPT[split_comm]} $RUN_LOG_DIR/${EXP_NAME[split_comm]}_val_0.log
EOF
  # Barrier: only needed when jobs were actually launched asynchronously.
  if [[ "${PARALLEL_EVALS:-1}" == "1" && "$pending" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished step-0 evaluation sweep"
}
|
| 454 |
+
|
| 455 |
+
train_variant() {
  # Train one variant for 2000 steps on 4 GPUs via torch.distributed.run,
  # saving checkpoints every 100 steps. With SKIP_COMPLETED_TRAIN=1, a run
  # whose final (step-2000) checkpoint exists is reused instead of retrained.
  # $1: variant key (shared | head_only | split_ind | split_comm).
  local key="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$key]}/${EXP_NAME[$key]}"
  local train_log="$RUN_LOG_DIR/${EXP_NAME[$key]}.log"
  if [[ "${SKIP_COMPLETED_TRAIN:-0}" == "1" && -d "$ckpt_root/2000" ]]; then
    log "training already complete for model_variant=${MODEL_VARIANT[$key]}; skipping train and reusing $ckpt_root"
    return
  fi
  log "training start model_variant=${MODEL_VARIANT[$key]} exp_name=${EXP_NAME[$key]}"
  # NOTE: --overwrite clears any partial prior run under this exp_name.
  "$PYTHON_BIN" -m torch.distributed.run --standalone --nproc_per_node=4 scripts/train_pytorch.py \
    "${CONFIG_NAME[$key]}" \
    --exp_name "${EXP_NAME[$key]}" \
    --overwrite \
    --num_train_steps 2000 \
    --save_interval 100 \
    --log_interval 10 \
    >"$train_log" 2>&1
  log "training finished model_variant=${MODEL_VARIANT[$key]}"
}
|
| 474 |
+
|
| 475 |
+
run_post_train_evals() {
  # Evaluate one trained variant at the retained checkpoints 100/500/2000,
  # one GPU per checkpoint. Completed evals are skipped (resume-friendly);
  # PARALLEL_EVALS=1 (default) fans them out, else they run serially.
  # $1: variant key.
  local key="$1"
  local ckpt_root="$ROOT/checkpoints/${CONFIG_NAME[$key]}/${EXP_NAME[$key]}"
  # All three retained checkpoints must exist before evaluating.
  require_dir "$ckpt_root/100"
  require_dir "$ckpt_root/500"
  require_dir "$ckpt_root/2000"

  log "starting post-train evaluation sweep for ${MODEL_VARIANT[$key]}"
  local pending=0
  local gpu step ckpt log_path
  # Heredoc rows: "gpu step checkpoint log_path".
  while read -r gpu step ckpt log_path; do
    if eval_log_complete "$log_path"; then
      log "post-train eval already complete for ${MODEL_VARIANT[$key]} step=$step"
      continue
    fi
    if [[ "${PARALLEL_EVALS:-1}" == "1" ]]; then
      launch_eval_async "$gpu" "$key" "$step" "$ckpt" "$log_path"
      pending=1
    else
      run_eval_sync "$gpu" "$key" "$step" "$ckpt" "$log_path"
    fi
  done <<EOF
0 100 $ckpt_root/100 $RUN_LOG_DIR/${EXP_NAME[$key]}_val_100.log
1 500 $ckpt_root/500 $RUN_LOG_DIR/${EXP_NAME[$key]}_val_500.log
2 2000 $ckpt_root/2000 $RUN_LOG_DIR/${EXP_NAME[$key]}_val_2000.log
EOF
  # Barrier for any async jobs launched above.
  if [[ "${PARALLEL_EVALS:-1}" == "1" && "$pending" -eq 1 ]]; then
    wait_for_eval_jobs
  fi
  log "finished post-train evaluation sweep for ${MODEL_VARIANT[$key]}"
}
|
| 506 |
+
|
| 507 |
+
collect_metrics() {
  # Aggregate all eval logs under $ARTIFACT_ROOT into metrics, comparing
  # against the prior 5k run's metrics; output goes to collect_metrics.log.
  log "collecting step-comparison metrics"
  python -u scripts/collect_twin_dual_push_128_stepcmp_metrics.py \
    --artifact_root "$ARTIFACT_ROOT" \
    --prior_metrics_root "$PRIOR_METRICS_ROOT" \
    >"$RUN_LOG_DIR/collect_metrics.log" 2>&1
  log "metrics collection finished"
}
|
| 515 |
+
|
| 516 |
+
main() {
  # Orchestrates the full run:
  #   snapshot env -> copy repro manifests -> ensure warm-start checkpoints
  #   and norm stats -> start pruner -> pick sample batch size -> step-0
  #   evals -> per-variant train + post-train evals -> collect metrics.
  # Env toggles: SKIP_ENV_SNAPSHOT, SKIP_SAMPLE_BATCH_PROBE,
  # SKIP_STEP0_EVALS, MODEL_KEYS (space-separated subset of variant keys).
  # The EXIT trap guarantees the background pruner is torn down on any exit.
  trap stop_checkpoint_pruner EXIT
  log "packed dual-push 128 step comparison runner started"
  if [[ "${SKIP_ENV_SNAPSHOT:-0}" == "1" ]]; then
    log "skipping environment snapshot (SKIP_ENV_SNAPSHOT=1)"
  else
    save_environment_snapshot
  fi
  copy_repro_manifests
  ensure_bootstrap_checkpoints
  ensure_packed_dual_push_norm_stats
  start_checkpoint_pruner
  if [[ "${SKIP_SAMPLE_BATCH_PROBE:-0}" == "1" ]]; then
    log "skipping sample-eval batch-size probe (SKIP_SAMPLE_BATCH_PROBE=1)"
    load_saved_sample_batch_size
  else
    determine_sample_batch_size
  fi
  if [[ "${SKIP_STEP0_EVALS:-0}" == "1" ]]; then
    log "skipping step-0 evaluation sweep (SKIP_STEP0_EVALS=1)"
  else
    run_step0_evals
  fi

  # Unquoted $selected_keys is intentional: word-splitting yields the keys.
  local selected_keys="${MODEL_KEYS:-shared head_only split_ind split_comm}"
  local key
  for key in $selected_keys; do
    # Validate each key before doing any expensive work with it.
    case "$key" in
      shared|head_only|split_ind|split_comm) ;;
      *)
        log "unknown model key in MODEL_KEYS: $key"
        exit 1
        ;;
    esac
    train_variant "$key"
    run_post_train_evals "$key"
  done

  collect_metrics
  log "packed dual-push 128 step comparison runner finished successfully"
}

main "$@"
|
run_logs/hf_upload_20260310.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|