lsnu committed on
Commit
ccf25b1
·
verified ·
1 Parent(s): da5e1bd

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +43 -0
  2. artifacts/twin_split_expert_bringup_20260310/README.md +59 -0
  3. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/config.json +15 -0
  4. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/init_parallel_metadata.json +654 -0
  5. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/model.safetensors +3 -0
  6. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/config.json +15 -0
  7. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/init_parallel_metadata.json +633 -0
  8. artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/model.safetensors +3 -0
  9. artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh +82 -0
  10. artifacts/twin_split_expert_bringup_20260310/run_logs/split_communicating_real_smoke3.log +104 -0
  11. artifacts/twin_split_expert_bringup_20260310/run_logs/split_communicating_real_train20.log +173 -0
  12. artifacts/twin_split_expert_bringup_20260310/run_logs/split_independent_real_smoke3_r2.log +104 -0
  13. artifacts/twin_split_expert_bringup_20260310/run_logs/split_independent_real_train20.log +173 -0
  14. artifacts/twin_split_expert_bringup_20260310/sanity_checks/split_communicating_invariants.txt +6 -0
  15. artifacts/twin_split_expert_bringup_20260310/sanity_checks/split_independent_invariants.txt +8 -0
  16. openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  17. openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  18. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/assets/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  19. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/metadata.pt +3 -0
  20. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/model.safetensors +3 -0
  21. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/optimizer.pt +3 -0
  22. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/assets/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  23. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/metadata.pt +3 -0
  24. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/model.safetensors +3 -0
  25. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/optimizer.pt +3 -0
  26. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/assets/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  27. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/metadata.pt +3 -0
  28. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/model.safetensors +3 -0
  29. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/optimizer.pt +3 -0
  30. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/assets/lsnu/twin_dual_push_128_train/norm_stats.json +152 -0
  31. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/metadata.pt +3 -0
  32. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/model.safetensors +3 -0
  33. openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/optimizer.pt +3 -0
  34. openpi/run_logs/split_communicating_real_smoke3.log +104 -0
  35. openpi/run_logs/split_communicating_real_train20.log +173 -0
  36. openpi/run_logs/split_independent_real_smoke3_r2.log +104 -0
  37. openpi/run_logs/split_independent_real_train20.log +173 -0
  38. openpi/scripts/check_parallel_warmstart_equivalence.py +7 -0
  39. openpi/scripts/check_split_expert_invariants.py +154 -0
  40. openpi/scripts/eval_twin_val_loss_pytorch.py +1 -1
  41. openpi/scripts/init_parallel_pi05_from_single_pytorch.py +151 -37
  42. openpi/scripts/run_twin_dual_push_128_packed_5k.sh +2 -2
  43. openpi/scripts/run_twin_handover_packed_10k.sh +2 -2
  44. openpi/scripts/train_pytorch.py +34 -2
  45. openpi/src/openpi/models/pi0_config.py +29 -2
  46. openpi/src/openpi/models/utils/fsq_tokenizer.py +17 -4
  47. openpi/src/openpi/models_pytorch/gemma_pytorch.py +236 -170
  48. openpi/src/openpi/models_pytorch/pi0_pytorch.py +238 -150
  49. openpi/src/openpi/training/config.py +233 -0
  50. openpi/src/openpi/training/data_loader.py +111 -1
README.md CHANGED
@@ -13,6 +13,13 @@ Three runs are included:
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
15
 
 
 
 
 
 
 
 
16
  ## Experiment setup
17
 
18
  - Handover train/val: `lsnu/twin_handover_256_train`, `lsnu/twin_handover_256_val`
@@ -60,6 +67,34 @@ The packed parallel warm-start uses the slice/fuse mapping implemented in `openp
60
 
61
  So this repo should be read as a matched warm-start study, not as a bitwise-identical step-0 control.
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  ## Repo layout
64
 
65
  - `openpi/`
@@ -72,6 +107,8 @@ So this repo should be read as a matched warm-start study, not as a bitwise-iden
72
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
73
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
74
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
 
 
75
  - `artifacts/pi05_base_params/`
76
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
77
 
@@ -85,6 +122,11 @@ So this repo should be read as a matched warm-start study, not as a bitwise-iden
85
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
86
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
87
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
 
 
 
 
 
88
  - `10K` repro commands: `artifacts/twin_handover_packed_parallelization_10k_20260309/repro/commands_reproduce.sh`
89
  - `10K` changed-file manifest: `artifacts/twin_handover_packed_parallelization_10k_20260309/repro/changed_files.txt`
90
  - `10K` environment snapshot: `artifacts/twin_handover_packed_parallelization_10k_20260309/environment/`
@@ -104,6 +146,7 @@ Initial `2K` + `10K` study logic lives primarily in:
104
  - `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`
105
  - `openpi/scripts/inspect_twin_packed_batch.py`
106
  - `openpi/scripts/check_parallel_warmstart_equivalence.py`
 
107
  - `openpi/scripts/run_twin_handover_packed_followup.sh`
108
  - `openpi/scripts/run_twin_handover_packed_10k.sh`
109
  - `openpi/scripts/run_twin_dual_push_128_packed_5k.sh`
 
13
  2. a longer `10K` follow-up on the same packed setup
14
  3. a `5K` dual-push `128` screening study on the same packed path
15
 
16
+ This update also adds a split-action-expert bring-up bundle for the packed TWIN path, covering:
17
+
18
+ - exact single-to-split warm-start checkpoints for `split_independent` and `split_communicating`
19
+ - invariant checks for the new split architecture
20
+ - detached real-data smoke and `20`-step training runs on `lsnu/twin_dual_push_128_train`
21
+ - the code changes that introduce the new split-expert action path
22
+
23
  ## Experiment setup
24
 
25
  - Handover train/val: `lsnu/twin_handover_256_train`, `lsnu/twin_handover_256_val`
 
67
 
68
  So this repo should be read as a matched warm-start study, not as a bitwise-identical step-0 control.
69
 
70
+ ## Split-Expert Bring-Up (`2026-03-10`)
71
+
72
+ The current repo now contains a true split-action-expert implementation in addition to the earlier packed head-only factorization. The new config flag is `action_expert_mode` with:
73
+
74
+ - `shared`
75
+ - `head_only_parallel`
76
+ - `split_independent`
77
+ - `split_communicating`
78
+
79
+ Key bring-up results:
80
+
81
+ - the split warm-start copies the original single `gemma_expert` into exact left/right expert branches for both split modes
82
+ - `split_independent` passes the branch-local invariants:
83
+ - identical left/right inputs produce identical suffix outputs
84
+ - perturbing right-arm inputs leaves left-arm outputs unchanged, and vice versa
85
+ - both split modes pass detached real-data training on packed TWIN dual-push:
86
+ - `3`-step real-data smoke run with checkpoint save
87
+ - `20`-step real-data training run with checkpoint save
88
+ - the communicating model emits nonzero cross-arm attention diagnostics and remains finite through the real-data `20`-step run
89
+
90
+ New bring-up artifact bundle:
91
+
92
+ - `artifacts/twin_split_expert_bringup_20260310/`
93
+ - split warm-start checkpoints
94
+ - invariant-check outputs
95
+ - reproducibility commands
96
+ - summary README for the split-expert bring-up
97
+
98
  ## Repo layout
99
 
100
  - `openpi/`
 
107
  - `10K` follow-up bundle with metrics, logs, repro manifests, and environment snapshot
108
  - `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/`
109
  - dual-push `128` screening bundle with metrics, logs, repro manifests, and environment snapshot
110
+ - `artifacts/twin_split_expert_bringup_20260310/`
111
+ - split-expert warm-start checkpoints, sanity checks, and bring-up repro commands
112
  - `artifacts/pi05_base_params/`
113
  - staged base parameter snapshot used during JAX-to-PyTorch conversion
114
 
 
122
  - dual-push `5K` teacher-forced table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/teacher_forced_eval_table.csv`
123
  - dual-push `5K` sample eval table: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/metrics/sample_eval_table.csv`
124
  - dual-push `5K` environment snapshot: `artifacts/twin_dual_push_128_packed_parallelization_5k_20260310/environment/`
125
+ - split-expert bring-up summary: `artifacts/twin_split_expert_bringup_20260310/README.md`
126
+ - split-expert repro commands: `artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh`
127
+ - split-expert invariant check outputs: `artifacts/twin_split_expert_bringup_20260310/sanity_checks/`
128
+ - split-expert real-data logs: `openpi/run_logs/split_independent_real_smoke3_r2.log`, `openpi/run_logs/split_communicating_real_smoke3.log`, `openpi/run_logs/split_independent_real_train20.log`, `openpi/run_logs/split_communicating_real_train20.log`
129
+ - split-expert real-data checkpoints: `openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/`, `openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/`
130
  - `10K` repro commands: `artifacts/twin_handover_packed_parallelization_10k_20260309/repro/commands_reproduce.sh`
131
  - `10K` changed-file manifest: `artifacts/twin_handover_packed_parallelization_10k_20260309/repro/changed_files.txt`
132
  - `10K` environment snapshot: `artifacts/twin_handover_packed_parallelization_10k_20260309/environment/`
 
146
  - `openpi/scripts/init_parallel_pi05_from_single_pytorch.py`
147
  - `openpi/scripts/inspect_twin_packed_batch.py`
148
  - `openpi/scripts/check_parallel_warmstart_equivalence.py`
149
+ - `openpi/scripts/check_split_expert_invariants.py`
150
  - `openpi/scripts/run_twin_handover_packed_followup.sh`
151
  - `openpi/scripts/run_twin_handover_packed_10k.sh`
152
  - `openpi/scripts/run_twin_dual_push_128_packed_5k.sh`
artifacts/twin_split_expert_bringup_20260310/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Split-Expert Bring-Up (`2026-03-10`)
2
+
3
+ This bundle captures the initial PyTorch bring-up for the new packed TWIN split-action-expert path on `pi0.5`.
4
+
5
+ Included here:
6
+
7
+ - exact split warm-start checkpoints created from the original single-head PyTorch base checkpoint
8
+ - invariant-check outputs for `split_independent` and `split_communicating`
9
+ - detached real-data smoke and `20`-step training logs on `lsnu/twin_dual_push_128_train`
10
+ - reproducibility commands used for the bring-up
11
+
12
+ ## Warm-start summary
13
+
14
+ Both split modes inherit the same base expert weights and per-arm input/output projections from the single-head checkpoint.
15
+
16
+ - `split_independent`
17
+ - `left_expert_max_abs_diff = 0.0`
18
+ - `right_expert_max_abs_diff = 0.0`
19
+ - `left_input_projection_max_abs_diff = 0.0`
20
+ - `right_input_projection_max_abs_diff = 0.0`
21
+ - `left_output_projection_max_abs_diff = 0.0`
22
+ - `right_output_projection_max_abs_diff = 0.0`
23
+ - `split_communicating`
24
+ - same exact inherited diffs as above
25
+ - added cross-arm communication parameters are zero-initialized at warm start
26
+
27
+ ## Real-data bring-up summary
28
+
29
+ Dataset used for real-data smoke and short training:
30
+
31
+ - `lsnu/twin_dual_push_128_train`
32
+
33
+ Successful detached runs:
34
+
35
+ - `split_independent_real_smoke3_r2`
36
+ - `3` train steps on real packed TWIN data
37
+ - checkpoint saved at step `3`
38
+ - `split_communicating_real_smoke3`
39
+ - `3` train steps on real packed TWIN data
40
+ - checkpoint saved at step `3`
41
+ - `split_independent_real_train20`
42
+ - `20` train steps on real packed TWIN data
43
+ - final logged train loss at step `20`: `0.6038`
44
+ - checkpoint saved at step `20`
45
+ - `split_communicating_real_train20`
46
+ - `20` train steps on real packed TWIN data
47
+ - final logged train loss at step `20`: `0.5943`
48
+ - checkpoint saved at step `20`
49
+
50
+ ## Layout
51
+
52
+ - `bootstrap_checkpoints/`
53
+ - exact split warm-start checkpoints
54
+ - `sanity_checks/`
55
+ - invariant-check outputs
56
+ - `run_logs/`
57
+ - detached real-data run logs
58
+ - `repro/commands_bringup.sh`
59
+ - reproduction commands used during the bring-up
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 32,
3
+ "action_expert_mode": "split_communicating",
4
+ "action_expert_variant": "gemma_300m",
5
+ "action_horizon": 16,
6
+ "arm_action_dims": [
7
+ 16,
8
+ 16
9
+ ],
10
+ "discrete_state_input": true,
11
+ "dtype": "bfloat16",
12
+ "max_token_len": 200,
13
+ "paligemma_variant": "gemma_2b",
14
+ "pi05": true
15
+ }
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/init_parallel_metadata.json ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_expert_mode": "split_communicating",
3
+ "config_name": "pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k",
4
+ "cross_arm_comm_init": [
5
+ 0.0,
6
+ 0.0,
7
+ 0.0,
8
+ 0.0,
9
+ 0.0,
10
+ 0.0,
11
+ 0.0,
12
+ 0.0,
13
+ 0.0,
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0,
19
+ 0.0,
20
+ 0.0,
21
+ 0.0,
22
+ 0.0
23
+ ],
24
+ "left_expert_max_abs_diff": 0.0,
25
+ "left_input_projection_max_abs_diff": 0.0,
26
+ "left_output_projection_max_abs_diff": 0.0,
27
+ "load_state_missing_keys": [
28
+ "paligemma_with_expert.cross_arm_comm",
29
+ "paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight",
30
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight",
31
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight",
32
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight",
33
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight",
34
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight",
35
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight",
36
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight",
37
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight",
38
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias",
39
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
40
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
41
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight",
42
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight",
43
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight",
44
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight",
45
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight",
46
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight",
47
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight",
48
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight",
49
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias",
50
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
51
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
52
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight",
53
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight",
54
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight",
55
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight",
56
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight",
57
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight",
58
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight",
59
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight",
60
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias",
61
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
62
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
63
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight",
64
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight",
65
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight",
66
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight",
67
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight",
68
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight",
69
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight",
70
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight",
71
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias",
72
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
73
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
74
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight",
75
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight",
76
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight",
77
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight",
78
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight",
79
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight",
80
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight",
81
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight",
82
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias",
83
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
84
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
85
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight",
86
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight",
87
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight",
88
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight",
89
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight",
90
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight",
91
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight",
92
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight",
93
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias",
94
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
95
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
96
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight",
97
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight",
98
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight",
99
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight",
100
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight",
101
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight",
102
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight",
103
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight",
104
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias",
105
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
106
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
107
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight",
108
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight",
109
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight",
110
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight",
111
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight",
112
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight",
113
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight",
114
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight",
115
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias",
116
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
117
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
118
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight",
119
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight",
120
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight",
121
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight",
122
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight",
123
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight",
124
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight",
125
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight",
126
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias",
127
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
128
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
129
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight",
130
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight",
131
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight",
132
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight",
133
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight",
134
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight",
135
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight",
136
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight",
137
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias",
138
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
139
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
140
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight",
141
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight",
142
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight",
143
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight",
144
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight",
145
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight",
146
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight",
147
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight",
148
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias",
149
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
150
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
151
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight",
152
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight",
153
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight",
154
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight",
155
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight",
156
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight",
157
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight",
158
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight",
159
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias",
160
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
161
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
162
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight",
163
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight",
164
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight",
165
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight",
166
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight",
167
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight",
168
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight",
169
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight",
170
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias",
171
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
172
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
173
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight",
174
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight",
175
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight",
176
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight",
177
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight",
178
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight",
179
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight",
180
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight",
181
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias",
182
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
183
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
184
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight",
185
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight",
186
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight",
187
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight",
188
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight",
189
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight",
190
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight",
191
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight",
192
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias",
193
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
194
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
195
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight",
196
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight",
197
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight",
198
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight",
199
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight",
200
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight",
201
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight",
202
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight",
203
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias",
204
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
205
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
206
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight",
207
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight",
208
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight",
209
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight",
210
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight",
211
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight",
212
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight",
213
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight",
214
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias",
215
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
216
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
217
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight",
218
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight",
219
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight",
220
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight",
221
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight",
222
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight",
223
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight",
224
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight",
225
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias",
226
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
227
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
228
+ "paligemma_with_expert.left_gemma_expert.model.norm.dense.weight",
229
+ "paligemma_with_expert.left_gemma_expert.model.norm.dense.bias",
230
+ "paligemma_with_expert.left_gemma_expert.lm_head.weight",
231
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight",
232
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight",
233
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight",
234
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight",
235
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight",
236
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight",
237
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight",
238
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight",
239
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias",
240
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
241
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
242
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight",
243
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight",
244
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight",
245
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight",
246
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight",
247
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight",
248
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight",
249
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight",
250
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias",
251
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
252
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
253
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight",
254
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight",
255
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight",
256
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight",
257
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight",
258
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight",
259
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight",
260
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight",
261
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias",
262
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
263
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
264
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight",
265
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight",
266
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight",
267
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight",
268
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight",
269
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight",
270
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight",
271
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight",
272
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias",
273
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
274
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
275
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight",
276
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight",
277
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight",
278
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight",
279
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight",
280
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight",
281
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight",
282
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight",
283
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias",
284
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
285
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
286
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight",
287
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight",
288
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight",
289
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight",
290
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight",
291
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight",
292
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight",
293
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight",
294
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias",
295
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
296
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
297
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight",
298
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight",
299
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight",
300
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight",
301
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight",
302
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight",
303
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight",
304
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight",
305
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias",
306
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
307
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
308
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight",
309
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight",
310
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight",
311
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight",
312
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight",
313
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight",
314
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight",
315
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight",
316
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias",
317
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
318
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
319
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight",
320
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight",
321
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight",
322
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight",
323
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight",
324
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight",
325
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight",
326
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight",
327
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias",
328
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
329
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
330
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight",
331
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight",
332
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight",
333
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight",
334
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight",
335
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight",
336
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight",
337
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight",
338
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias",
339
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
340
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
341
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight",
342
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight",
343
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight",
344
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight",
345
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight",
346
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight",
347
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight",
348
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight",
349
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias",
350
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
351
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
352
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight",
353
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight",
354
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight",
355
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight",
356
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight",
357
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight",
358
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight",
359
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight",
360
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias",
361
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
362
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
363
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight",
364
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight",
365
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight",
366
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight",
367
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight",
368
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight",
369
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight",
370
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight",
371
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias",
372
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
373
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
374
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight",
375
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight",
376
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight",
377
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight",
378
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight",
379
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight",
380
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight",
381
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight",
382
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias",
383
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
384
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
385
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight",
386
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight",
387
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight",
388
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight",
389
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight",
390
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight",
391
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight",
392
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight",
393
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias",
394
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
395
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
396
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight",
397
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight",
398
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight",
399
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight",
400
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight",
401
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight",
402
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight",
403
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight",
404
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias",
405
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
406
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
407
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight",
408
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight",
409
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight",
410
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight",
411
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight",
412
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight",
413
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight",
414
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight",
415
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias",
416
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
417
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
418
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight",
419
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight",
420
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight",
421
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight",
422
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight",
423
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight",
424
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight",
425
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight",
426
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias",
427
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
428
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
429
+ "paligemma_with_expert.right_gemma_expert.model.norm.dense.weight",
430
+ "paligemma_with_expert.right_gemma_expert.model.norm.dense.bias",
431
+ "paligemma_with_expert.right_gemma_expert.lm_head.weight",
432
+ "action_in_proj_arms.0.weight",
433
+ "action_in_proj_arms.0.bias",
434
+ "action_in_proj_arms.1.weight",
435
+ "action_in_proj_arms.1.bias",
436
+ "action_out_proj_arms.0.weight",
437
+ "action_out_proj_arms.0.bias",
438
+ "action_out_proj_arms.1.weight",
439
+ "action_out_proj_arms.1.bias"
440
+ ],
441
+ "load_state_unexpected_keys": [
442
+ "action_in_proj.bias",
443
+ "action_in_proj.weight",
444
+ "action_out_proj.bias",
445
+ "action_out_proj.weight",
446
+ "paligemma_with_expert.gemma_expert.lm_head.weight",
447
+ "paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias",
448
+ "paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight",
449
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight",
450
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight",
451
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight",
452
+ "paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
453
+ "paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
454
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight",
455
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight",
456
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight",
457
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight",
458
+ "paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias",
459
+ "paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight",
460
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight",
461
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight",
462
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight",
463
+ "paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
464
+ "paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
465
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight",
466
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight",
467
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight",
468
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight",
469
+ "paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias",
470
+ "paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight",
471
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight",
472
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight",
473
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight",
474
+ "paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
475
+ "paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
476
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight",
477
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight",
478
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight",
479
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight",
480
+ "paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias",
481
+ "paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight",
482
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight",
483
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight",
484
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight",
485
+ "paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
486
+ "paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
487
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight",
488
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight",
489
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight",
490
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight",
491
+ "paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias",
492
+ "paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight",
493
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight",
494
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight",
495
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight",
496
+ "paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
497
+ "paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
498
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight",
499
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight",
500
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight",
501
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight",
502
+ "paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias",
503
+ "paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight",
504
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight",
505
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight",
506
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight",
507
+ "paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
508
+ "paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
509
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight",
510
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight",
511
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight",
512
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight",
513
+ "paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias",
514
+ "paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight",
515
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight",
516
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight",
517
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight",
518
+ "paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
519
+ "paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
520
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight",
521
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight",
522
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight",
523
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight",
524
+ "paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias",
525
+ "paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight",
526
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight",
527
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight",
528
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight",
529
+ "paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
530
+ "paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
531
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight",
532
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight",
533
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight",
534
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight",
535
+ "paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias",
536
+ "paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight",
537
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight",
538
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight",
539
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight",
540
+ "paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
541
+ "paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
542
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight",
543
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight",
544
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight",
545
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight",
546
+ "paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias",
547
+ "paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight",
548
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight",
549
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight",
550
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight",
551
+ "paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
552
+ "paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
553
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight",
554
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight",
555
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight",
556
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight",
557
+ "paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias",
558
+ "paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight",
559
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight",
560
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight",
561
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight",
562
+ "paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
563
+ "paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
564
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight",
565
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight",
566
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight",
567
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight",
568
+ "paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias",
569
+ "paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight",
570
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight",
571
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight",
572
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight",
573
+ "paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
574
+ "paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
575
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight",
576
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight",
577
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight",
578
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight",
579
+ "paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias",
580
+ "paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight",
581
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight",
582
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight",
583
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight",
584
+ "paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
585
+ "paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
586
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight",
587
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight",
588
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight",
589
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight",
590
+ "paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias",
591
+ "paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight",
592
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight",
593
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight",
594
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight",
595
+ "paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
596
+ "paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
597
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight",
598
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight",
599
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight",
600
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight",
601
+ "paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias",
602
+ "paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight",
603
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight",
604
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight",
605
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight",
606
+ "paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
607
+ "paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
608
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight",
609
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight",
610
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight",
611
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight",
612
+ "paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias",
613
+ "paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight",
614
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight",
615
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight",
616
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight",
617
+ "paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
618
+ "paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
619
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight",
620
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight",
621
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight",
622
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight",
623
+ "paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias",
624
+ "paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight",
625
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight",
626
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight",
627
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight",
628
+ "paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
629
+ "paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
630
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight",
631
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight",
632
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight",
633
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight",
634
+ "paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias",
635
+ "paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight",
636
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight",
637
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight",
638
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight",
639
+ "paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
640
+ "paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
641
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight",
642
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight",
643
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight",
644
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight",
645
+ "paligemma_with_expert.gemma_expert.model.norm.dense.bias",
646
+ "paligemma_with_expert.gemma_expert.model.norm.dense.weight"
647
+ ],
648
+ "output_path": "/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
649
+ "right_expert_max_abs_diff": 0.0,
650
+ "right_input_projection_max_abs_diff": 0.0,
651
+ "right_output_projection_max_abs_diff": 0.0,
652
+ "single_ckpt": "/workspace/checkpoints/pi05_base_single_pytorch",
653
+ "warm_start_exact": true
654
+ }
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de1302c45e9d80cbfe5124e0b288ed4da27c18599f1c73fc84714d6c6f45d998
3
+ size 9088652708
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 32,
3
+ "action_expert_mode": "split_independent",
4
+ "action_expert_variant": "gemma_300m",
5
+ "action_horizon": 16,
6
+ "arm_action_dims": [
7
+ 16,
8
+ 16
9
+ ],
10
+ "discrete_state_input": true,
11
+ "dtype": "bfloat16",
12
+ "max_token_len": 200,
13
+ "paligemma_variant": "gemma_2b",
14
+ "pi05": true
15
+ }
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/init_parallel_metadata.json ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_expert_mode": "split_independent",
3
+ "config_name": "pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k",
4
+ "left_expert_max_abs_diff": 0.0,
5
+ "left_input_projection_max_abs_diff": 0.0,
6
+ "left_output_projection_max_abs_diff": 0.0,
7
+ "load_state_missing_keys": [
8
+ "paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight",
9
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.q_proj.weight",
10
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.k_proj.weight",
11
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.v_proj.weight",
12
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.self_attn.o_proj.weight",
13
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.gate_proj.weight",
14
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.up_proj.weight",
15
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.mlp.down_proj.weight",
16
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.weight",
17
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.input_layernorm.dense.bias",
18
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
19
+ "paligemma_with_expert.left_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
20
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.q_proj.weight",
21
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.k_proj.weight",
22
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.v_proj.weight",
23
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.self_attn.o_proj.weight",
24
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.gate_proj.weight",
25
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.up_proj.weight",
26
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.mlp.down_proj.weight",
27
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.weight",
28
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.input_layernorm.dense.bias",
29
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
30
+ "paligemma_with_expert.left_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
31
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.q_proj.weight",
32
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.k_proj.weight",
33
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.v_proj.weight",
34
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.self_attn.o_proj.weight",
35
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.gate_proj.weight",
36
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.up_proj.weight",
37
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.mlp.down_proj.weight",
38
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.weight",
39
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.input_layernorm.dense.bias",
40
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
41
+ "paligemma_with_expert.left_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
42
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.q_proj.weight",
43
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.k_proj.weight",
44
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.v_proj.weight",
45
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.self_attn.o_proj.weight",
46
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.gate_proj.weight",
47
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.up_proj.weight",
48
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.mlp.down_proj.weight",
49
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.weight",
50
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.input_layernorm.dense.bias",
51
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
52
+ "paligemma_with_expert.left_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
53
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.q_proj.weight",
54
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.k_proj.weight",
55
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.v_proj.weight",
56
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.self_attn.o_proj.weight",
57
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.gate_proj.weight",
58
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.up_proj.weight",
59
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.mlp.down_proj.weight",
60
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.weight",
61
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.input_layernorm.dense.bias",
62
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
63
+ "paligemma_with_expert.left_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
64
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.q_proj.weight",
65
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.k_proj.weight",
66
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.v_proj.weight",
67
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.self_attn.o_proj.weight",
68
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.gate_proj.weight",
69
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.up_proj.weight",
70
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.mlp.down_proj.weight",
71
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.weight",
72
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.input_layernorm.dense.bias",
73
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
74
+ "paligemma_with_expert.left_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
75
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.q_proj.weight",
76
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.k_proj.weight",
77
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.v_proj.weight",
78
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.self_attn.o_proj.weight",
79
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.gate_proj.weight",
80
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.up_proj.weight",
81
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.mlp.down_proj.weight",
82
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.weight",
83
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.input_layernorm.dense.bias",
84
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
85
+ "paligemma_with_expert.left_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
86
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.q_proj.weight",
87
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.k_proj.weight",
88
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.v_proj.weight",
89
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.self_attn.o_proj.weight",
90
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.gate_proj.weight",
91
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.up_proj.weight",
92
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.mlp.down_proj.weight",
93
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.weight",
94
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.input_layernorm.dense.bias",
95
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
96
+ "paligemma_with_expert.left_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
97
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.q_proj.weight",
98
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.k_proj.weight",
99
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.v_proj.weight",
100
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.self_attn.o_proj.weight",
101
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.gate_proj.weight",
102
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.up_proj.weight",
103
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.mlp.down_proj.weight",
104
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.weight",
105
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.input_layernorm.dense.bias",
106
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
107
+ "paligemma_with_expert.left_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
108
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.q_proj.weight",
109
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.k_proj.weight",
110
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.v_proj.weight",
111
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.self_attn.o_proj.weight",
112
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.gate_proj.weight",
113
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.up_proj.weight",
114
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.mlp.down_proj.weight",
115
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.weight",
116
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.input_layernorm.dense.bias",
117
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
118
+ "paligemma_with_expert.left_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
119
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.q_proj.weight",
120
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.k_proj.weight",
121
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.v_proj.weight",
122
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.self_attn.o_proj.weight",
123
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.gate_proj.weight",
124
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.up_proj.weight",
125
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.mlp.down_proj.weight",
126
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.weight",
127
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.input_layernorm.dense.bias",
128
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
129
+ "paligemma_with_expert.left_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
130
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.q_proj.weight",
131
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.k_proj.weight",
132
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.v_proj.weight",
133
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.self_attn.o_proj.weight",
134
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.gate_proj.weight",
135
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.up_proj.weight",
136
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.mlp.down_proj.weight",
137
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.weight",
138
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.input_layernorm.dense.bias",
139
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
140
+ "paligemma_with_expert.left_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
141
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.q_proj.weight",
142
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.k_proj.weight",
143
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.v_proj.weight",
144
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.self_attn.o_proj.weight",
145
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.gate_proj.weight",
146
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.up_proj.weight",
147
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.mlp.down_proj.weight",
148
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.weight",
149
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.input_layernorm.dense.bias",
150
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
151
+ "paligemma_with_expert.left_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
152
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.q_proj.weight",
153
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.k_proj.weight",
154
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.v_proj.weight",
155
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.self_attn.o_proj.weight",
156
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.gate_proj.weight",
157
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.up_proj.weight",
158
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.mlp.down_proj.weight",
159
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.weight",
160
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.input_layernorm.dense.bias",
161
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
162
+ "paligemma_with_expert.left_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
163
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.q_proj.weight",
164
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.k_proj.weight",
165
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.v_proj.weight",
166
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.self_attn.o_proj.weight",
167
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.gate_proj.weight",
168
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.up_proj.weight",
169
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.mlp.down_proj.weight",
170
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.weight",
171
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.input_layernorm.dense.bias",
172
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
173
+ "paligemma_with_expert.left_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
174
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.q_proj.weight",
175
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.k_proj.weight",
176
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.v_proj.weight",
177
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.self_attn.o_proj.weight",
178
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.gate_proj.weight",
179
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.up_proj.weight",
180
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.mlp.down_proj.weight",
181
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.weight",
182
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.input_layernorm.dense.bias",
183
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
184
+ "paligemma_with_expert.left_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
185
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.q_proj.weight",
186
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.k_proj.weight",
187
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.v_proj.weight",
188
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.self_attn.o_proj.weight",
189
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.gate_proj.weight",
190
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.up_proj.weight",
191
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.mlp.down_proj.weight",
192
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.weight",
193
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.input_layernorm.dense.bias",
194
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
195
+ "paligemma_with_expert.left_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
196
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.q_proj.weight",
197
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.k_proj.weight",
198
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.v_proj.weight",
199
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.self_attn.o_proj.weight",
200
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.gate_proj.weight",
201
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.up_proj.weight",
202
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.mlp.down_proj.weight",
203
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.weight",
204
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.input_layernorm.dense.bias",
205
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
206
+ "paligemma_with_expert.left_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
207
+ "paligemma_with_expert.left_gemma_expert.model.norm.dense.weight",
208
+ "paligemma_with_expert.left_gemma_expert.model.norm.dense.bias",
209
+ "paligemma_with_expert.left_gemma_expert.lm_head.weight",
210
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.q_proj.weight",
211
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.k_proj.weight",
212
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.v_proj.weight",
213
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.self_attn.o_proj.weight",
214
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.gate_proj.weight",
215
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.up_proj.weight",
216
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.mlp.down_proj.weight",
217
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.weight",
218
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.input_layernorm.dense.bias",
219
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
220
+ "paligemma_with_expert.right_gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
221
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.q_proj.weight",
222
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.k_proj.weight",
223
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.v_proj.weight",
224
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.self_attn.o_proj.weight",
225
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.gate_proj.weight",
226
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.up_proj.weight",
227
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.mlp.down_proj.weight",
228
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.weight",
229
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.input_layernorm.dense.bias",
230
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
231
+ "paligemma_with_expert.right_gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
232
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.q_proj.weight",
233
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.k_proj.weight",
234
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.v_proj.weight",
235
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.self_attn.o_proj.weight",
236
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.gate_proj.weight",
237
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.up_proj.weight",
238
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.mlp.down_proj.weight",
239
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.weight",
240
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.input_layernorm.dense.bias",
241
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
242
+ "paligemma_with_expert.right_gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
243
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.q_proj.weight",
244
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.k_proj.weight",
245
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.v_proj.weight",
246
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.self_attn.o_proj.weight",
247
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.gate_proj.weight",
248
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.up_proj.weight",
249
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.mlp.down_proj.weight",
250
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.weight",
251
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.input_layernorm.dense.bias",
252
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
253
+ "paligemma_with_expert.right_gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
254
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.q_proj.weight",
255
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.k_proj.weight",
256
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.v_proj.weight",
257
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.self_attn.o_proj.weight",
258
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.gate_proj.weight",
259
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.up_proj.weight",
260
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.mlp.down_proj.weight",
261
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.weight",
262
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.input_layernorm.dense.bias",
263
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
264
+ "paligemma_with_expert.right_gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
265
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.q_proj.weight",
266
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.k_proj.weight",
267
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.v_proj.weight",
268
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.self_attn.o_proj.weight",
269
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.gate_proj.weight",
270
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.up_proj.weight",
271
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.mlp.down_proj.weight",
272
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.weight",
273
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.input_layernorm.dense.bias",
274
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
275
+ "paligemma_with_expert.right_gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
276
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.q_proj.weight",
277
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.k_proj.weight",
278
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.v_proj.weight",
279
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.self_attn.o_proj.weight",
280
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.gate_proj.weight",
281
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.up_proj.weight",
282
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.mlp.down_proj.weight",
283
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.weight",
284
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.input_layernorm.dense.bias",
285
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
286
+ "paligemma_with_expert.right_gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
287
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.q_proj.weight",
288
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.k_proj.weight",
289
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.v_proj.weight",
290
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.self_attn.o_proj.weight",
291
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.gate_proj.weight",
292
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.up_proj.weight",
293
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.mlp.down_proj.weight",
294
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.weight",
295
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.input_layernorm.dense.bias",
296
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
297
+ "paligemma_with_expert.right_gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
298
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.q_proj.weight",
299
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.k_proj.weight",
300
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.v_proj.weight",
301
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.self_attn.o_proj.weight",
302
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.gate_proj.weight",
303
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.up_proj.weight",
304
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.mlp.down_proj.weight",
305
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.weight",
306
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.input_layernorm.dense.bias",
307
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
308
+ "paligemma_with_expert.right_gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
309
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.q_proj.weight",
310
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.k_proj.weight",
311
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.v_proj.weight",
312
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.self_attn.o_proj.weight",
313
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.gate_proj.weight",
314
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.up_proj.weight",
315
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.mlp.down_proj.weight",
316
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.weight",
317
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.input_layernorm.dense.bias",
318
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
319
+ "paligemma_with_expert.right_gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
320
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.q_proj.weight",
321
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.k_proj.weight",
322
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.v_proj.weight",
323
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.self_attn.o_proj.weight",
324
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.gate_proj.weight",
325
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.up_proj.weight",
326
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.mlp.down_proj.weight",
327
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.weight",
328
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.input_layernorm.dense.bias",
329
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
330
+ "paligemma_with_expert.right_gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
331
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.q_proj.weight",
332
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.k_proj.weight",
333
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.v_proj.weight",
334
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.self_attn.o_proj.weight",
335
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.gate_proj.weight",
336
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.up_proj.weight",
337
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.mlp.down_proj.weight",
338
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.weight",
339
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.input_layernorm.dense.bias",
340
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
341
+ "paligemma_with_expert.right_gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
342
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.q_proj.weight",
343
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.k_proj.weight",
344
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.v_proj.weight",
345
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.self_attn.o_proj.weight",
346
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.gate_proj.weight",
347
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.up_proj.weight",
348
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.mlp.down_proj.weight",
349
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.weight",
350
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.input_layernorm.dense.bias",
351
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
352
+ "paligemma_with_expert.right_gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
353
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.q_proj.weight",
354
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.k_proj.weight",
355
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.v_proj.weight",
356
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.self_attn.o_proj.weight",
357
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.gate_proj.weight",
358
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.up_proj.weight",
359
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.mlp.down_proj.weight",
360
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.weight",
361
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.input_layernorm.dense.bias",
362
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
363
+ "paligemma_with_expert.right_gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
364
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.q_proj.weight",
365
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.k_proj.weight",
366
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.v_proj.weight",
367
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.self_attn.o_proj.weight",
368
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.gate_proj.weight",
369
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.up_proj.weight",
370
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.mlp.down_proj.weight",
371
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.weight",
372
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.input_layernorm.dense.bias",
373
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
374
+ "paligemma_with_expert.right_gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
375
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.q_proj.weight",
376
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.k_proj.weight",
377
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.v_proj.weight",
378
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.self_attn.o_proj.weight",
379
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.gate_proj.weight",
380
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.up_proj.weight",
381
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.mlp.down_proj.weight",
382
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.weight",
383
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.input_layernorm.dense.bias",
384
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
385
+ "paligemma_with_expert.right_gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
386
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.q_proj.weight",
387
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.k_proj.weight",
388
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.v_proj.weight",
389
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.self_attn.o_proj.weight",
390
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.gate_proj.weight",
391
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.up_proj.weight",
392
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.mlp.down_proj.weight",
393
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.weight",
394
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.input_layernorm.dense.bias",
395
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
396
+ "paligemma_with_expert.right_gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
397
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.q_proj.weight",
398
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.k_proj.weight",
399
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.v_proj.weight",
400
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.self_attn.o_proj.weight",
401
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.gate_proj.weight",
402
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.up_proj.weight",
403
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.mlp.down_proj.weight",
404
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.weight",
405
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.input_layernorm.dense.bias",
406
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
407
+ "paligemma_with_expert.right_gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
408
+ "paligemma_with_expert.right_gemma_expert.model.norm.dense.weight",
409
+ "paligemma_with_expert.right_gemma_expert.model.norm.dense.bias",
410
+ "paligemma_with_expert.right_gemma_expert.lm_head.weight",
411
+ "action_in_proj_arms.0.weight",
412
+ "action_in_proj_arms.0.bias",
413
+ "action_in_proj_arms.1.weight",
414
+ "action_in_proj_arms.1.bias",
415
+ "action_out_proj_arms.0.weight",
416
+ "action_out_proj_arms.0.bias",
417
+ "action_out_proj_arms.1.weight",
418
+ "action_out_proj_arms.1.bias"
419
+ ],
420
+ "load_state_unexpected_keys": [
421
+ "action_in_proj.bias",
422
+ "action_in_proj.weight",
423
+ "action_out_proj.bias",
424
+ "action_out_proj.weight",
425
+ "paligemma_with_expert.gemma_expert.lm_head.weight",
426
+ "paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias",
427
+ "paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight",
428
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight",
429
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight",
430
+ "paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight",
431
+ "paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.bias",
432
+ "paligemma_with_expert.gemma_expert.model.layers.0.post_attention_layernorm.dense.weight",
433
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj.weight",
434
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.o_proj.weight",
435
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj.weight",
436
+ "paligemma_with_expert.gemma_expert.model.layers.0.self_attn.v_proj.weight",
437
+ "paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.bias",
438
+ "paligemma_with_expert.gemma_expert.model.layers.1.input_layernorm.dense.weight",
439
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.down_proj.weight",
440
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.gate_proj.weight",
441
+ "paligemma_with_expert.gemma_expert.model.layers.1.mlp.up_proj.weight",
442
+ "paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.bias",
443
+ "paligemma_with_expert.gemma_expert.model.layers.1.post_attention_layernorm.dense.weight",
444
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.k_proj.weight",
445
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.o_proj.weight",
446
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.q_proj.weight",
447
+ "paligemma_with_expert.gemma_expert.model.layers.1.self_attn.v_proj.weight",
448
+ "paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.bias",
449
+ "paligemma_with_expert.gemma_expert.model.layers.10.input_layernorm.dense.weight",
450
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.down_proj.weight",
451
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.gate_proj.weight",
452
+ "paligemma_with_expert.gemma_expert.model.layers.10.mlp.up_proj.weight",
453
+ "paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.bias",
454
+ "paligemma_with_expert.gemma_expert.model.layers.10.post_attention_layernorm.dense.weight",
455
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.k_proj.weight",
456
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.o_proj.weight",
457
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.q_proj.weight",
458
+ "paligemma_with_expert.gemma_expert.model.layers.10.self_attn.v_proj.weight",
459
+ "paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.bias",
460
+ "paligemma_with_expert.gemma_expert.model.layers.11.input_layernorm.dense.weight",
461
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.down_proj.weight",
462
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.gate_proj.weight",
463
+ "paligemma_with_expert.gemma_expert.model.layers.11.mlp.up_proj.weight",
464
+ "paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.bias",
465
+ "paligemma_with_expert.gemma_expert.model.layers.11.post_attention_layernorm.dense.weight",
466
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.k_proj.weight",
467
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.o_proj.weight",
468
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.q_proj.weight",
469
+ "paligemma_with_expert.gemma_expert.model.layers.11.self_attn.v_proj.weight",
470
+ "paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.bias",
471
+ "paligemma_with_expert.gemma_expert.model.layers.12.input_layernorm.dense.weight",
472
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.down_proj.weight",
473
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.gate_proj.weight",
474
+ "paligemma_with_expert.gemma_expert.model.layers.12.mlp.up_proj.weight",
475
+ "paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.bias",
476
+ "paligemma_with_expert.gemma_expert.model.layers.12.post_attention_layernorm.dense.weight",
477
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.k_proj.weight",
478
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.o_proj.weight",
479
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.q_proj.weight",
480
+ "paligemma_with_expert.gemma_expert.model.layers.12.self_attn.v_proj.weight",
481
+ "paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.bias",
482
+ "paligemma_with_expert.gemma_expert.model.layers.13.input_layernorm.dense.weight",
483
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.down_proj.weight",
484
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.gate_proj.weight",
485
+ "paligemma_with_expert.gemma_expert.model.layers.13.mlp.up_proj.weight",
486
+ "paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.bias",
487
+ "paligemma_with_expert.gemma_expert.model.layers.13.post_attention_layernorm.dense.weight",
488
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.k_proj.weight",
489
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.o_proj.weight",
490
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.q_proj.weight",
491
+ "paligemma_with_expert.gemma_expert.model.layers.13.self_attn.v_proj.weight",
492
+ "paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.bias",
493
+ "paligemma_with_expert.gemma_expert.model.layers.14.input_layernorm.dense.weight",
494
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.down_proj.weight",
495
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.gate_proj.weight",
496
+ "paligemma_with_expert.gemma_expert.model.layers.14.mlp.up_proj.weight",
497
+ "paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.bias",
498
+ "paligemma_with_expert.gemma_expert.model.layers.14.post_attention_layernorm.dense.weight",
499
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.k_proj.weight",
500
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.o_proj.weight",
501
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.q_proj.weight",
502
+ "paligemma_with_expert.gemma_expert.model.layers.14.self_attn.v_proj.weight",
503
+ "paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.bias",
504
+ "paligemma_with_expert.gemma_expert.model.layers.15.input_layernorm.dense.weight",
505
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.down_proj.weight",
506
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.gate_proj.weight",
507
+ "paligemma_with_expert.gemma_expert.model.layers.15.mlp.up_proj.weight",
508
+ "paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.bias",
509
+ "paligemma_with_expert.gemma_expert.model.layers.15.post_attention_layernorm.dense.weight",
510
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.k_proj.weight",
511
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.o_proj.weight",
512
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.q_proj.weight",
513
+ "paligemma_with_expert.gemma_expert.model.layers.15.self_attn.v_proj.weight",
514
+ "paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.bias",
515
+ "paligemma_with_expert.gemma_expert.model.layers.16.input_layernorm.dense.weight",
516
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.down_proj.weight",
517
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.gate_proj.weight",
518
+ "paligemma_with_expert.gemma_expert.model.layers.16.mlp.up_proj.weight",
519
+ "paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.bias",
520
+ "paligemma_with_expert.gemma_expert.model.layers.16.post_attention_layernorm.dense.weight",
521
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.k_proj.weight",
522
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.o_proj.weight",
523
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.q_proj.weight",
524
+ "paligemma_with_expert.gemma_expert.model.layers.16.self_attn.v_proj.weight",
525
+ "paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.bias",
526
+ "paligemma_with_expert.gemma_expert.model.layers.17.input_layernorm.dense.weight",
527
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.down_proj.weight",
528
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.gate_proj.weight",
529
+ "paligemma_with_expert.gemma_expert.model.layers.17.mlp.up_proj.weight",
530
+ "paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.bias",
531
+ "paligemma_with_expert.gemma_expert.model.layers.17.post_attention_layernorm.dense.weight",
532
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.k_proj.weight",
533
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.o_proj.weight",
534
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.q_proj.weight",
535
+ "paligemma_with_expert.gemma_expert.model.layers.17.self_attn.v_proj.weight",
536
+ "paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.bias",
537
+ "paligemma_with_expert.gemma_expert.model.layers.2.input_layernorm.dense.weight",
538
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.down_proj.weight",
539
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.gate_proj.weight",
540
+ "paligemma_with_expert.gemma_expert.model.layers.2.mlp.up_proj.weight",
541
+ "paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.bias",
542
+ "paligemma_with_expert.gemma_expert.model.layers.2.post_attention_layernorm.dense.weight",
543
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.k_proj.weight",
544
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.o_proj.weight",
545
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.q_proj.weight",
546
+ "paligemma_with_expert.gemma_expert.model.layers.2.self_attn.v_proj.weight",
547
+ "paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.bias",
548
+ "paligemma_with_expert.gemma_expert.model.layers.3.input_layernorm.dense.weight",
549
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.down_proj.weight",
550
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.gate_proj.weight",
551
+ "paligemma_with_expert.gemma_expert.model.layers.3.mlp.up_proj.weight",
552
+ "paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.bias",
553
+ "paligemma_with_expert.gemma_expert.model.layers.3.post_attention_layernorm.dense.weight",
554
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.k_proj.weight",
555
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.o_proj.weight",
556
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.q_proj.weight",
557
+ "paligemma_with_expert.gemma_expert.model.layers.3.self_attn.v_proj.weight",
558
+ "paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.bias",
559
+ "paligemma_with_expert.gemma_expert.model.layers.4.input_layernorm.dense.weight",
560
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.down_proj.weight",
561
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.gate_proj.weight",
562
+ "paligemma_with_expert.gemma_expert.model.layers.4.mlp.up_proj.weight",
563
+ "paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.bias",
564
+ "paligemma_with_expert.gemma_expert.model.layers.4.post_attention_layernorm.dense.weight",
565
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.k_proj.weight",
566
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.o_proj.weight",
567
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.q_proj.weight",
568
+ "paligemma_with_expert.gemma_expert.model.layers.4.self_attn.v_proj.weight",
569
+ "paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.bias",
570
+ "paligemma_with_expert.gemma_expert.model.layers.5.input_layernorm.dense.weight",
571
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.down_proj.weight",
572
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.gate_proj.weight",
573
+ "paligemma_with_expert.gemma_expert.model.layers.5.mlp.up_proj.weight",
574
+ "paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.bias",
575
+ "paligemma_with_expert.gemma_expert.model.layers.5.post_attention_layernorm.dense.weight",
576
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.k_proj.weight",
577
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.o_proj.weight",
578
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.q_proj.weight",
579
+ "paligemma_with_expert.gemma_expert.model.layers.5.self_attn.v_proj.weight",
580
+ "paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.bias",
581
+ "paligemma_with_expert.gemma_expert.model.layers.6.input_layernorm.dense.weight",
582
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.down_proj.weight",
583
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.gate_proj.weight",
584
+ "paligemma_with_expert.gemma_expert.model.layers.6.mlp.up_proj.weight",
585
+ "paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.bias",
586
+ "paligemma_with_expert.gemma_expert.model.layers.6.post_attention_layernorm.dense.weight",
587
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.k_proj.weight",
588
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.o_proj.weight",
589
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.q_proj.weight",
590
+ "paligemma_with_expert.gemma_expert.model.layers.6.self_attn.v_proj.weight",
591
+ "paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.bias",
592
+ "paligemma_with_expert.gemma_expert.model.layers.7.input_layernorm.dense.weight",
593
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.down_proj.weight",
594
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.gate_proj.weight",
595
+ "paligemma_with_expert.gemma_expert.model.layers.7.mlp.up_proj.weight",
596
+ "paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.bias",
597
+ "paligemma_with_expert.gemma_expert.model.layers.7.post_attention_layernorm.dense.weight",
598
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.k_proj.weight",
599
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.o_proj.weight",
600
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.q_proj.weight",
601
+ "paligemma_with_expert.gemma_expert.model.layers.7.self_attn.v_proj.weight",
602
+ "paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.bias",
603
+ "paligemma_with_expert.gemma_expert.model.layers.8.input_layernorm.dense.weight",
604
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.down_proj.weight",
605
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.gate_proj.weight",
606
+ "paligemma_with_expert.gemma_expert.model.layers.8.mlp.up_proj.weight",
607
+ "paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.bias",
608
+ "paligemma_with_expert.gemma_expert.model.layers.8.post_attention_layernorm.dense.weight",
609
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.k_proj.weight",
610
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.o_proj.weight",
611
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.q_proj.weight",
612
+ "paligemma_with_expert.gemma_expert.model.layers.8.self_attn.v_proj.weight",
613
+ "paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.bias",
614
+ "paligemma_with_expert.gemma_expert.model.layers.9.input_layernorm.dense.weight",
615
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.down_proj.weight",
616
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.gate_proj.weight",
617
+ "paligemma_with_expert.gemma_expert.model.layers.9.mlp.up_proj.weight",
618
+ "paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.bias",
619
+ "paligemma_with_expert.gemma_expert.model.layers.9.post_attention_layernorm.dense.weight",
620
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.k_proj.weight",
621
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.o_proj.weight",
622
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.q_proj.weight",
623
+ "paligemma_with_expert.gemma_expert.model.layers.9.self_attn.v_proj.weight",
624
+ "paligemma_with_expert.gemma_expert.model.norm.dense.bias",
625
+ "paligemma_with_expert.gemma_expert.model.norm.dense.weight"
626
+ ],
627
+ "output_path": "/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
628
+ "right_expert_max_abs_diff": 0.0,
629
+ "right_input_projection_max_abs_diff": 0.0,
630
+ "right_output_projection_max_abs_diff": 0.0,
631
+ "single_ckpt": "/workspace/checkpoints/pi05_base_single_pytorch",
632
+ "warm_start_exact": true
633
+ }
artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5164534199e5396320dfc44ac251f50e117d92cb82d90aa3f3f2fe8e82c620dc
3
+ size 9088652560
artifacts/twin_split_expert_bringup_20260310/repro/commands_bringup.sh ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ export HF_TOKEN="${HF_TOKEN:-}"
5
+ export HF_HOME=/workspace/.hf
6
+ export HF_HUB_CACHE=/workspace/.hf/hub
7
+ export HF_DATASETS_CACHE=/workspace/.hf/datasets
8
+ export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub
9
+ export XDG_CACHE_HOME=/workspace/.cache
10
+ export OPENPI_LEROBOT_HOME=/workspace/lerobot
11
+ export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0
12
+ export TOKENIZERS_PARALLELISM=false
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export PYTHONPATH=/workspace/pi05tests/openpi/src
15
+
16
+ cd /workspace/pi05tests/openpi
17
+
18
+ # Create exact split warm-start checkpoints from the single-head PyTorch base checkpoint.
19
+ ./.venv/bin/python -u scripts/init_parallel_pi05_from_single_pytorch.py \
20
+ --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
21
+ --config_name pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k \
22
+ --output_path /workspace/pi05tests/artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single
23
+
24
+ ./.venv/bin/python -u scripts/init_parallel_pi05_from_single_pytorch.py \
25
+ --single_ckpt /workspace/checkpoints/pi05_base_single_pytorch \
26
+ --config_name pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k \
27
+ --output_path /workspace/pi05tests/artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single
28
+
29
+ # Check split invariants.
30
+ ./.venv/bin/python -u scripts/check_split_expert_invariants.py \
31
+ --config_name pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k \
32
+ --checkpoint_dir /workspace/pi05tests/artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_independent_packed_from_single
33
+
34
+ ./.venv/bin/python -u scripts/check_split_expert_invariants.py \
35
+ --config_name pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k \
36
+ --checkpoint_dir /workspace/pi05tests/artifacts/twin_split_expert_bringup_20260310/bootstrap_checkpoints/pi05_base_split_communicating_packed_from_single
37
+
38
+ # Detached real-data smoke runs.
39
+ CUDA_VISIBLE_DEVICES=0 setsid -f ./.venv/bin/python -u scripts/train_pytorch.py \
40
+ pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k \
41
+ --exp_name split_independent_real_smoke3_r2 \
42
+ --num_train_steps 3 \
43
+ --save_interval 3 \
44
+ --log_interval 1 \
45
+ --batch_size 1 \
46
+ --num_workers 0 \
47
+ --pytorch_training_precision float32 \
48
+ > run_logs/split_independent_real_smoke3_r2.log 2>&1
49
+
50
+ CUDA_VISIBLE_DEVICES=1 setsid -f ./.venv/bin/python -u scripts/train_pytorch.py \
51
+ pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k \
52
+ --exp_name split_communicating_real_smoke3 \
53
+ --num_train_steps 3 \
54
+ --save_interval 3 \
55
+ --log_interval 1 \
56
+ --batch_size 1 \
57
+ --num_workers 0 \
58
+ --pytorch_training_precision float32 \
59
+ > run_logs/split_communicating_real_smoke3.log 2>&1
60
+
61
+ # Detached short real-data training runs.
62
+ CUDA_VISIBLE_DEVICES=0 setsid -f ./.venv/bin/python -u scripts/train_pytorch.py \
63
+ pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k \
64
+ --exp_name split_independent_real_train20 \
65
+ --num_train_steps 20 \
66
+ --save_interval 20 \
67
+ --log_interval 1 \
68
+ --batch_size 1 \
69
+ --num_workers 0 \
70
+ --pytorch_training_precision float32 \
71
+ > run_logs/split_independent_real_train20.log 2>&1
72
+
73
+ CUDA_VISIBLE_DEVICES=1 setsid -f ./.venv/bin/python -u scripts/train_pytorch.py \
74
+ pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k \
75
+ --exp_name split_communicating_real_train20 \
76
+ --num_train_steps 20 \
77
+ --save_interval 20 \
78
+ --log_interval 1 \
79
+ --batch_size 1 \
80
+ --num_workers 0 \
81
+ --pytorch_training_precision float32 \
82
+ > run_logs/split_communicating_real_train20.log 2>&1
artifacts/twin_split_expert_bringup_20260310/run_logs/split_communicating_real_smoke3.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:55:02.788 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3 (22110:train_pytorch.py:533)
2
+ 19:55:02.789 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22110:train_pytorch.py:552)
3
+ 19:55:02.865 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train (22110:config.py:234)
4
+ 19:55:02.867 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x7ec79fca8910>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22110:data_loader.py:284)
29
+ 19:55:09.225 [I] JAX version 0.5.3 available. (22110:config.py:125)
30
+ 19:55:34.099 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22110:data_loader.py:148)
31
+ 19:55:34.205 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22110:video_utils.py:36)
32
+ 19:56:38.376 [I] local_batch_size: 1 (22110:data_loader.py:365)
33
+ 19:58:25.969 [I] Enabled gradient checkpointing for PI0Pytorch model (22110:pi0_pytorch.py:138)
34
+ 19:58:25.971 [I] Enabled gradient checkpointing for memory optimization (22110:train_pytorch.py:624)
35
+ 19:58:25.972 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22110:train_pytorch.py:493)
36
+ 19:58:25.972 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:653)
37
+ 19:58:29.565 [I] Weight loading missing key count: 0 (22110:train_pytorch.py:657)
38
+ 19:58:29.566 [I] Weight loading missing keys: set() (22110:train_pytorch.py:658)
39
+ 19:58:29.566 [I] Weight loading unexpected key count: 0 (22110:train_pytorch.py:659)
40
+ 19:58:29.566 [I] Weight loading unexpected keys: [] (22110:train_pytorch.py:660)
41
+ 19:58:29.567 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:661)
42
+ 19:58:29.571 [I] Running on: 963c158043aa | world_size=1 (22110:train_pytorch.py:701)
43
+ 19:58:29.571 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (22110:train_pytorch.py:702)
44
+ 19:58:29.572 [I] Memory optimizations: gradient_checkpointing=True (22110:train_pytorch.py:705)
45
+ 19:58:29.572 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22110:train_pytorch.py:706)
46
+ 19:58:29.573 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22110:train_pytorch.py:707)
47
+ 19:58:29.573 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22110:train_pytorch.py:710)
48
+ 19:58:29.573 [I] EMA is not supported for PyTorch training (22110:train_pytorch.py:713)
49
+ 19:58:29.574 [I] Training precision: float32 (22110:train_pytorch.py:714)
50
+ 19:58:29.590 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k (22110:train_pytorch.py:308)
51
+ 19:58:29.590 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22110:train_pytorch.py:309)
52
+ 19:58:29.591 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22110:train_pytorch.py:310)
53
+ 19:58:29.592 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22110:train_pytorch.py:311)
54
+ 19:58:29.592 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:312)
55
+ 19:58:29.592 [I] Model type: split_communicating (22110:train_pytorch.py:313)
56
+ 19:58:29.593 [I] Packed transforms active: True (22110:train_pytorch.py:314)
57
+ 19:58:29.593 [I] World size: 1 (22110:train_pytorch.py:315)
58
+ 19:58:29.594 [I] Batch size: local=1, global=1 (22110:train_pytorch.py:316)
59
+ 19:58:29.594 [I] num_workers: 0 (22110:train_pytorch.py:317)
60
+ 19:58:29.595 [I] Precision: float32 (22110:train_pytorch.py:318)
61
+ 19:58:29.595 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22110:train_pytorch.py:319)
62
+ 19:58:29.595 [I] Save/log intervals: save_interval=3, log_interval=1 (22110:train_pytorch.py:326)
63
+ 19:58:29.596 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22110:train_pytorch.py:327)
64
+ 19:58:29.596 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22110:train_pytorch.py:328)
65
+ 19:58:29.597 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22110:train_pytorch.py:329)
66
+ 19:58:29.597 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22110:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 19:58:31.354 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
70
+ 19:58:31.355 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
71
+ 19:58:31.356 [I] debug_step=1 prompt_token_lengths=[75] (22110:train_pytorch.py:838)
72
+ 19:58:31.356 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22110:train_pytorch.py:839)
73
+ 19:58:31.357 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22110:train_pytorch.py:842)
74
+ 19:58:31.358 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
75
+ 19:58:31.372 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
76
+ 19:58:31.372 [I] debug_step=1 lr=9.96e-08 grad_norm=60.0472 data_time=0.3311s step_time=1.3966s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22110:train_pytorch.py:854)
77
+ 19:58:31.373 [I] debug_step=1 grad_shared_backbone=36.9945 grad_left_action_in=2.3769 grad_right_action_in=1.7630 grad_left_expert=31.1244 grad_right_expert=27.8917 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 (22110:train_pytorch.py:862)
78
+ 19:58:31.374 [I] step=1 loss=3.8411 smoothed_loss=3.8411 lr=9.96e-08 grad_norm=60.0472 step_time=1.3966s data_time=0.3311s it/s=0.555 eta_to_3=3.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 grad_left_action_in=2.3769 grad_left_expert=31.1244 grad_right_action_in=1.7630 grad_right_expert=27.8917 grad_shared_backbone=36.9945 (22110:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 19:58:32.164 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
82
+ 19:58:32.165 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
83
+ 19:58:32.166 [I] debug_step=2 prompt_token_lengths=[76] (22110:train_pytorch.py:838)
84
+ 19:58:32.166 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22110:train_pytorch.py:839)
85
+ 19:58:32.167 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22110:train_pytorch.py:842)
86
+ 19:58:32.168 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
87
+ 19:58:32.168 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
88
+ 19:58:32.169 [I] debug_step=2 lr=1.99e-07 grad_norm=10.7300 data_time=0.1812s step_time=0.6234s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22110:train_pytorch.py:854)
89
+ 19:58:32.169 [I] debug_step=2 grad_shared_backbone=9.2018 grad_left_action_in=0.1651 grad_right_action_in=0.1485 grad_left_expert=2.5032 grad_right_expert=2.3988 grad_action_out=4.0772 grad_cross_arm_comm=0.0166 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 (22110:train_pytorch.py:862)
90
+ 19:58:32.170 [I] step=2 loss=1.1389 smoothed_loss=3.5709 lr=1.99e-07 grad_norm=10.7300 step_time=0.6234s data_time=0.1812s it/s=1.257 eta_to_3=0.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0772 grad_cross_arm_comm=0.0166 grad_left_action_in=0.1651 grad_left_expert=2.5032 grad_right_action_in=0.1485 grad_right_expert=2.3988 grad_shared_backbone=9.2018 (22110:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 19:58:32.708 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
94
+ 19:58:32.709 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
95
+ 19:58:32.709 [I] debug_step=3 prompt_token_lengths=[75] (22110:train_pytorch.py:838)
96
+ 19:58:32.710 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22110:train_pytorch.py:839)
97
+ 19:58:32.711 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22110:train_pytorch.py:842)
98
+ 19:58:32.711 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
99
+ 19:58:32.712 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
100
+ 19:58:32.712 [I] debug_step=3 lr=2.99e-07 grad_norm=343.7256 data_time=0.1312s step_time=0.4126s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22110:train_pytorch.py:854)
101
+ 19:58:32.713 [I] debug_step=3 grad_shared_backbone=215.2880 grad_left_action_in=4.7981 grad_right_action_in=9.5346 grad_left_expert=72.6437 grad_right_expert=227.6029 grad_action_out=23.7709 grad_cross_arm_comm=3.3555 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 (22110:train_pytorch.py:862)
102
+ 19:58:32.713 [I] step=3 loss=5.0518 smoothed_loss=3.7190 lr=2.99e-07 grad_norm=343.7256 step_time=0.4126s data_time=0.1312s it/s=1.843 eta_to_3=0.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=23.7709 grad_cross_arm_comm=3.3555 grad_left_action_in=4.7981 grad_left_expert=72.6437 grad_right_action_in=9.5346 grad_right_expert=227.6029 grad_shared_backbone=215.2880 (22110:train_pytorch.py:882)
103
+ 20:01:38.475 [I] Saved checkpoint at step 3 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3 (22110:train_pytorch.py:378)
104
+
artifacts/twin_split_expert_bringup_20260310/run_logs/split_communicating_real_train20.log ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 20:03:03.480 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20 (22938:train_pytorch.py:533)
2
+ 20:03:03.486 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22938:train_pytorch.py:552)
3
+ 20:03:03.634 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train (22938:config.py:234)
4
+ 20:03:03.637 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x7303f4ce5b90>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22938:data_loader.py:393)
29
+ 20:03:15.223 [I] JAX version 0.5.3 available. (22938:config.py:125)
30
+ 20:04:19.283 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22938:data_loader.py:148)
31
+ 20:04:19.378 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22938:video_utils.py:36)
32
+ 20:09:10.375 [I] local_batch_size: 1 (22938:data_loader.py:474)
33
+ 20:11:59.735 [I] Enabled gradient checkpointing for PI0Pytorch model (22938:pi0_pytorch.py:138)
34
+ 20:11:59.737 [I] Enabled gradient checkpointing for memory optimization (22938:train_pytorch.py:624)
35
+ 20:11:59.738 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22938:train_pytorch.py:493)
36
+ 20:11:59.738 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:653)
37
+ 20:12:04.492 [I] Weight loading missing key count: 0 (22938:train_pytorch.py:657)
38
+ 20:12:04.492 [I] Weight loading missing keys: set() (22938:train_pytorch.py:658)
39
+ 20:12:04.492 [I] Weight loading unexpected key count: 0 (22938:train_pytorch.py:659)
40
+ 20:12:04.493 [I] Weight loading unexpected keys: [] (22938:train_pytorch.py:660)
41
+ 20:12:04.493 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:661)
42
+ 20:12:04.497 [I] Running on: 963c158043aa | world_size=1 (22938:train_pytorch.py:701)
43
+ 20:12:04.498 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=20 (22938:train_pytorch.py:702)
44
+ 20:12:04.498 [I] Memory optimizations: gradient_checkpointing=True (22938:train_pytorch.py:705)
45
+ 20:12:04.499 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22938:train_pytorch.py:706)
46
+ 20:12:04.499 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22938:train_pytorch.py:707)
47
+ 20:12:04.499 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22938:train_pytorch.py:710)
48
+ 20:12:04.500 [I] EMA is not supported for PyTorch training (22938:train_pytorch.py:713)
49
+ 20:12:04.500 [I] Training precision: float32 (22938:train_pytorch.py:714)
50
+ 20:12:04.509 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k (22938:train_pytorch.py:308)
51
+ 20:12:04.509 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22938:train_pytorch.py:309)
52
+ 20:12:04.510 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22938:train_pytorch.py:310)
53
+ 20:12:04.510 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22938:train_pytorch.py:311)
54
+ 20:12:04.511 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:312)
55
+ 20:12:04.511 [I] Model type: split_communicating (22938:train_pytorch.py:313)
56
+ 20:12:04.511 [I] Packed transforms active: True (22938:train_pytorch.py:314)
57
+ 20:12:04.512 [I] World size: 1 (22938:train_pytorch.py:315)
58
+ 20:12:04.512 [I] Batch size: local=1, global=1 (22938:train_pytorch.py:316)
59
+ 20:12:04.512 [I] num_workers: 0 (22938:train_pytorch.py:317)
60
+ 20:12:04.513 [I] Precision: float32 (22938:train_pytorch.py:318)
61
+ 20:12:04.513 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22938:train_pytorch.py:319)
62
+ 20:12:04.513 [I] Save/log intervals: save_interval=20, log_interval=1 (22938:train_pytorch.py:326)
63
+ 20:12:04.514 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22938:train_pytorch.py:327)
64
+ 20:12:04.514 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22938:train_pytorch.py:328)
65
+ 20:12:04.515 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22938:train_pytorch.py:329)
66
+ 20:12:04.515 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22938:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 20:12:06.079 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
70
+ 20:12:06.080 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
71
+ 20:12:06.080 [I] debug_step=1 prompt_token_lengths=[75] (22938:train_pytorch.py:838)
72
+ 20:12:06.081 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22938:train_pytorch.py:839)
73
+ 20:12:06.081 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22938:train_pytorch.py:842)
74
+ 20:12:06.082 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
75
+ 20:12:06.097 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
76
+ 20:12:06.097 [I] debug_step=1 lr=9.96e-08 grad_norm=60.0473 data_time=0.2034s step_time=1.3216s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22938:train_pytorch.py:854)
77
+ 20:12:06.098 [I] debug_step=1 grad_shared_backbone=36.9946 grad_left_action_in=2.3769 grad_right_action_in=1.7630 grad_left_expert=31.1244 grad_right_expert=27.8917 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 (22938:train_pytorch.py:862)
78
+ 20:12:06.099 [I] step=1 loss=3.8411 smoothed_loss=3.8411 lr=9.96e-08 grad_norm=60.0473 step_time=1.3216s data_time=0.2034s it/s=0.625 eta_to_20=30.4s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 grad_left_action_in=2.3769 grad_left_expert=31.1244 grad_right_action_in=1.7630 grad_right_expert=27.8917 grad_shared_backbone=36.9946 (22938:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 20:12:07.067 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
82
+ 20:12:07.067 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
83
+ 20:12:07.068 [I] debug_step=2 prompt_token_lengths=[76] (22938:train_pytorch.py:838)
84
+ 20:12:07.069 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22938:train_pytorch.py:839)
85
+ 20:12:07.069 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22938:train_pytorch.py:842)
86
+ 20:12:07.070 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
87
+ 20:12:07.070 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
88
+ 20:12:07.071 [I] debug_step=2 lr=1.99e-07 grad_norm=10.7247 data_time=0.2263s step_time=0.7585s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
89
+ 20:12:07.071 [I] debug_step=2 grad_shared_backbone=9.1973 grad_left_action_in=0.1651 grad_right_action_in=0.1484 grad_left_expert=2.5023 grad_right_expert=2.3935 grad_action_out=4.0770 grad_cross_arm_comm=0.0166 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 (22938:train_pytorch.py:862)
90
+ 20:12:07.072 [I] step=2 loss=1.1389 smoothed_loss=3.5709 lr=1.99e-07 grad_norm=10.7247 step_time=0.7585s data_time=0.2263s it/s=1.028 eta_to_20=17.5s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0770 grad_cross_arm_comm=0.0166 grad_left_action_in=0.1651 grad_left_expert=2.5023 grad_right_action_in=0.1484 grad_right_expert=2.3935 grad_shared_backbone=9.1973 (22938:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 20:12:07.689 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
94
+ 20:12:07.690 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
95
+ 20:12:07.690 [I] debug_step=3 prompt_token_lengths=[75] (22938:train_pytorch.py:838)
96
+ 20:12:07.691 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22938:train_pytorch.py:839)
97
+ 20:12:07.692 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22938:train_pytorch.py:842)
98
+ 20:12:07.692 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
99
+ 20:12:07.693 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
100
+ 20:12:07.693 [I] debug_step=3 lr=2.99e-07 grad_norm=343.6402 data_time=0.1557s step_time=0.4654s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
101
+ 20:12:07.694 [I] debug_step=3 grad_shared_backbone=215.2410 grad_left_action_in=4.7969 grad_right_action_in=9.5325 grad_left_expert=72.6238 grad_right_expert=227.5470 grad_action_out=23.7695 grad_cross_arm_comm=3.3548 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 (22938:train_pytorch.py:862)
102
+ 20:12:07.694 [I] step=3 loss=5.0512 smoothed_loss=3.7189 lr=2.99e-07 grad_norm=343.6402 step_time=0.4654s data_time=0.1557s it/s=1.609 eta_to_20=10.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=23.7695 grad_cross_arm_comm=3.3548 grad_left_action_in=4.7969 grad_left_expert=72.6238 grad_right_action_in=9.5325 grad_right_expert=227.5470 grad_shared_backbone=215.2410 (22938:train_pytorch.py:882)
103
+
104
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
105
+ 20:12:08.256 [I] debug_step=4 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
106
+ 20:12:08.257 [I] debug_step=4 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
107
+ 20:12:08.257 [I] debug_step=4 prompt_token_lengths=[78] (22938:train_pytorch.py:838)
108
+ 20:12:08.258 [I] debug_step=4 state_stats min=-0.7017 max=1.0004 mean=0.0553 std=0.3507 (22938:train_pytorch.py:839)
109
+ 20:12:08.258 [I] debug_step=4 action_stats min=-1.0014 max=1.0004 mean=-0.0683 std=0.4561 (22938:train_pytorch.py:842)
110
+ 20:12:08.259 [I] debug_step=4 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
111
+ 20:12:08.259 [I] debug_step=4 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
112
+ 20:12:08.260 [I] debug_step=4 lr=3.98e-07 grad_norm=8.7944 data_time=0.1312s step_time=0.4359s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
113
+ 20:12:08.260 [I] debug_step=4 grad_shared_backbone=7.5903 grad_left_action_in=0.1438 grad_right_action_in=0.1015 grad_left_expert=2.4058 grad_right_expert=1.2982 grad_action_out=3.3839 grad_cross_arm_comm=0.0147 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_2=0.0133 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0148 cross_arm_attention_mass_layer_5=0.0199 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0102 cross_arm_attention_mass_layer_9=0.0086 cross_arm_attention_mass_layer_10=0.0065 cross_arm_attention_mass_layer_11=0.0099 cross_arm_attention_mass_layer_12=0.0010 cross_arm_attention_mass_layer_13=0.0040 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0227 cross_arm_attention_mass_layer_16=0.0351 cross_arm_attention_mass_layer_17=0.0406 (22938:train_pytorch.py:862)
114
+ 20:12:08.261 [I] step=4 loss=1.1860 smoothed_loss=3.4656 lr=3.98e-07 grad_norm=8.7944 step_time=0.4359s data_time=0.1312s it/s=1.768 eta_to_20=9.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0065 cross_arm_attention_mass_layer_11=0.0099 cross_arm_attention_mass_layer_12=0.0010 cross_arm_attention_mass_layer_13=0.0040 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0227 cross_arm_attention_mass_layer_16=0.0351 cross_arm_attention_mass_layer_17=0.0406 cross_arm_attention_mass_layer_2=0.0133 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0148 cross_arm_attention_mass_layer_5=0.0199 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0102 cross_arm_attention_mass_layer_9=0.0086 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.3839 grad_cross_arm_comm=0.0147 grad_left_action_in=0.1438 grad_left_expert=2.4058 grad_right_action_in=0.1015 grad_right_expert=1.2982 grad_shared_backbone=7.5903 (22938:train_pytorch.py:882)
115
+
116
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
117
+ 20:12:08.933 [I] debug_step=5 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
118
+ 20:12:08.934 [I] debug_step=5 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
119
+ 20:12:08.934 [I] debug_step=5 prompt_token_lengths=[73] (22938:train_pytorch.py:838)
120
+ 20:12:08.935 [I] debug_step=5 state_stats min=-0.9599 max=1.0004 mean=0.0170 std=0.5364 (22938:train_pytorch.py:839)
121
+ 20:12:08.935 [I] debug_step=5 action_stats min=-1.0392 max=1.0004 mean=-0.0159 std=0.4488 (22938:train_pytorch.py:842)
122
+ 20:12:08.935 [I] debug_step=5 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
123
+ 20:12:08.936 [I] debug_step=5 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
124
+ 20:12:08.936 [I] debug_step=5 lr=4.98e-07 grad_norm=20.1429 data_time=0.2048s step_time=0.4721s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
125
+ 20:12:08.937 [I] debug_step=5 grad_shared_backbone=16.7899 grad_left_action_in=0.2534 grad_right_action_in=0.3335 grad_left_expert=7.9047 grad_right_expert=3.6853 grad_action_out=6.0934 grad_cross_arm_comm=0.0735 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0020 cross_arm_attention_mass_layer_2=0.0178 cross_arm_attention_mass_layer_3=0.0039 cross_arm_attention_mass_layer_4=0.0203 cross_arm_attention_mass_layer_5=0.0294 cross_arm_attention_mass_layer_6=0.0106 cross_arm_attention_mass_layer_7=0.0286 cross_arm_attention_mass_layer_8=0.0175 cross_arm_attention_mass_layer_9=0.0157 cross_arm_attention_mass_layer_10=0.0148 cross_arm_attention_mass_layer_11=0.0181 cross_arm_attention_mass_layer_12=0.0023 cross_arm_attention_mass_layer_13=0.0128 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0232 cross_arm_attention_mass_layer_16=0.0437 cross_arm_attention_mass_layer_17=0.0451 (22938:train_pytorch.py:862)
126
+ 20:12:08.937 [I] step=5 loss=1.8898 smoothed_loss=3.3081 lr=4.98e-07 grad_norm=20.1429 step_time=0.4721s data_time=0.2048s it/s=1.481 eta_to_20=10.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0020 cross_arm_attention_mass_layer_10=0.0148 cross_arm_attention_mass_layer_11=0.0181 cross_arm_attention_mass_layer_12=0.0023 cross_arm_attention_mass_layer_13=0.0128 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0232 cross_arm_attention_mass_layer_16=0.0437 cross_arm_attention_mass_layer_17=0.0451 cross_arm_attention_mass_layer_2=0.0178 cross_arm_attention_mass_layer_3=0.0039 cross_arm_attention_mass_layer_4=0.0203 cross_arm_attention_mass_layer_5=0.0294 cross_arm_attention_mass_layer_6=0.0106 cross_arm_attention_mass_layer_7=0.0286 cross_arm_attention_mass_layer_8=0.0175 cross_arm_attention_mass_layer_9=0.0157 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=6.0934 grad_cross_arm_comm=0.0735 grad_left_action_in=0.2534 grad_left_expert=7.9047 grad_right_action_in=0.3335 grad_right_expert=3.6853 grad_shared_backbone=16.7899 (22938:train_pytorch.py:882)
127
+
128
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
129
+ 20:12:09.727 [I] step=6 loss=2.2855 smoothed_loss=3.2058 lr=5.98e-07 grad_norm=22.2605 step_time=0.5043s data_time=0.2901s it/s=1.267 eta_to_20=11.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0030 cross_arm_attention_mass_layer_10=0.0179 cross_arm_attention_mass_layer_11=0.0219 cross_arm_attention_mass_layer_12=0.0017 cross_arm_attention_mass_layer_13=0.0164 cross_arm_attention_mass_layer_14=0.0065 cross_arm_attention_mass_layer_15=0.0300 cross_arm_attention_mass_layer_16=0.0448 cross_arm_attention_mass_layer_17=0.0482 cross_arm_attention_mass_layer_2=0.0201 cross_arm_attention_mass_layer_3=0.0064 cross_arm_attention_mass_layer_4=0.0234 cross_arm_attention_mass_layer_5=0.0308 cross_arm_attention_mass_layer_6=0.0131 cross_arm_attention_mass_layer_7=0.0312 cross_arm_attention_mass_layer_8=0.0206 cross_arm_attention_mass_layer_9=0.0180 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=7.8420 grad_cross_arm_comm=0.1508 grad_left_action_in=0.2907 grad_left_expert=7.9865 grad_right_action_in=0.5407 grad_right_expert=5.3887 grad_shared_backbone=18.0209 (22938:train_pytorch.py:882)
130
+
131
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
132
+ 20:12:10.423 [I] step=7 loss=1.0335 smoothed_loss=2.9886 lr=6.97e-07 grad_norm=8.7208 step_time=0.4962s data_time=0.1999s it/s=1.439 eta_to_20=9.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0014 cross_arm_attention_mass_layer_10=0.0066 cross_arm_attention_mass_layer_11=0.0060 cross_arm_attention_mass_layer_12=0.0024 cross_arm_attention_mass_layer_13=0.0015 cross_arm_attention_mass_layer_14=0.0062 cross_arm_attention_mass_layer_15=0.0146 cross_arm_attention_mass_layer_16=0.0319 cross_arm_attention_mass_layer_17=0.0417 cross_arm_attention_mass_layer_2=0.0105 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0130 cross_arm_attention_mass_layer_5=0.0188 cross_arm_attention_mass_layer_6=0.0045 cross_arm_attention_mass_layer_7=0.0127 cross_arm_attention_mass_layer_8=0.0097 cross_arm_attention_mass_layer_9=0.0097 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0753 grad_cross_arm_comm=0.0098 grad_left_action_in=0.1514 grad_left_expert=2.5886 grad_right_action_in=0.0879 grad_right_expert=1.9729 grad_shared_backbone=6.8576 (22938:train_pytorch.py:882)
133
+
134
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
135
+ 20:12:11.020 [I] step=8 loss=2.0034 smoothed_loss=2.8901 lr=7.97e-07 grad_norm=15.7969 step_time=0.4407s data_time=0.1564s it/s=1.677 eta_to_20=7.2s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0027 cross_arm_attention_mass_layer_10=0.0129 cross_arm_attention_mass_layer_11=0.0269 cross_arm_attention_mass_layer_12=0.0032 cross_arm_attention_mass_layer_13=0.0177 cross_arm_attention_mass_layer_14=0.0074 cross_arm_attention_mass_layer_15=0.0309 cross_arm_attention_mass_layer_16=0.0446 cross_arm_attention_mass_layer_17=0.0503 cross_arm_attention_mass_layer_2=0.0196 cross_arm_attention_mass_layer_3=0.0046 cross_arm_attention_mass_layer_4=0.0227 cross_arm_attention_mass_layer_5=0.0319 cross_arm_attention_mass_layer_6=0.0114 cross_arm_attention_mass_layer_7=0.0298 cross_arm_attention_mass_layer_8=0.0194 cross_arm_attention_mass_layer_9=0.0117 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=6.6005 grad_cross_arm_comm=0.1531 grad_left_action_in=0.1726 grad_left_expert=4.6426 grad_right_action_in=0.4530 grad_right_expert=3.8705 grad_shared_backbone=12.4324 (22938:train_pytorch.py:882)
136
+
137
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
138
+ 20:12:11.571 [I] step=9 loss=0.4132 smoothed_loss=2.6424 lr=8.96e-07 grad_norm=3.3497 step_time=0.4161s data_time=0.1347s it/s=1.820 eta_to_20=6.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0008 cross_arm_attention_mass_layer_10=0.0014 cross_arm_attention_mass_layer_11=0.0006 cross_arm_attention_mass_layer_12=0.0028 cross_arm_attention_mass_layer_13=0.0018 cross_arm_attention_mass_layer_14=0.0059 cross_arm_attention_mass_layer_15=0.0078 cross_arm_attention_mass_layer_16=0.0337 cross_arm_attention_mass_layer_17=0.0442 cross_arm_attention_mass_layer_2=0.0015 cross_arm_attention_mass_layer_3=0.0012 cross_arm_attention_mass_layer_4=0.0019 cross_arm_attention_mass_layer_5=0.0036 cross_arm_attention_mass_layer_6=0.0013 cross_arm_attention_mass_layer_7=0.0022 cross_arm_attention_mass_layer_8=0.0006 cross_arm_attention_mass_layer_9=0.0052 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=1.7915 grad_cross_arm_comm=0.0012 grad_left_action_in=0.0692 grad_left_expert=1.0033 grad_right_action_in=0.0554 grad_right_expert=0.7293 grad_shared_backbone=2.5249 (22938:train_pytorch.py:882)
139
+
140
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
141
+ 20:12:12.422 [I] step=10 loss=0.6162 smoothed_loss=2.4397 lr=9.96e-07 grad_norm=5.5674 step_time=0.6599s data_time=0.1905s it/s=1.178 eta_to_20=8.5s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0014 cross_arm_attention_mass_layer_10=0.0024 cross_arm_attention_mass_layer_11=0.0047 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0018 cross_arm_attention_mass_layer_14=0.0062 cross_arm_attention_mass_layer_15=0.0094 cross_arm_attention_mass_layer_16=0.0283 cross_arm_attention_mass_layer_17=0.0357 cross_arm_attention_mass_layer_2=0.0074 cross_arm_attention_mass_layer_3=0.0016 cross_arm_attention_mass_layer_4=0.0081 cross_arm_attention_mass_layer_5=0.0156 cross_arm_attention_mass_layer_6=0.0028 cross_arm_attention_mass_layer_7=0.0050 cross_arm_attention_mass_layer_8=0.0040 cross_arm_attention_mass_layer_9=0.0045 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.2079 grad_cross_arm_comm=0.0071 grad_left_action_in=0.0841 grad_left_expert=1.2018 grad_right_action_in=0.0868 grad_right_expert=1.2814 grad_shared_backbone=4.7763 (22938:train_pytorch.py:882)
142
+
143
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
144
+ 20:12:12.957 [I] step=11 loss=0.9030 smoothed_loss=2.2861 lr=1.10e-06 grad_norm=7.2282 step_time=0.4104s data_time=0.1251s it/s=1.872 eta_to_20=4.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0064 cross_arm_attention_mass_layer_11=0.0098 cross_arm_attention_mass_layer_12=0.0013 cross_arm_attention_mass_layer_13=0.0031 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0208 cross_arm_attention_mass_layer_16=0.0355 cross_arm_attention_mass_layer_17=0.0421 cross_arm_attention_mass_layer_2=0.0136 cross_arm_attention_mass_layer_3=0.0023 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0219 cross_arm_attention_mass_layer_6=0.0054 cross_arm_attention_mass_layer_7=0.0144 cross_arm_attention_mass_layer_8=0.0131 cross_arm_attention_mass_layer_9=0.0082 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.3357 grad_cross_arm_comm=0.0099 grad_left_action_in=0.1355 grad_left_expert=2.0379 grad_right_action_in=0.0836 grad_right_expert=1.1722 grad_shared_backbone=5.8293 (22938:train_pytorch.py:882)
145
+
146
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
147
+ 20:12:13.628 [I] step=12 loss=0.7531 smoothed_loss=2.1328 lr=1.20e-06 grad_norm=6.0473 step_time=0.4968s data_time=0.1739s it/s=1.493 eta_to_20=5.4s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0012 cross_arm_attention_mass_layer_10=0.0078 cross_arm_attention_mass_layer_11=0.0121 cross_arm_attention_mass_layer_12=0.0032 cross_arm_attention_mass_layer_13=0.0032 cross_arm_attention_mass_layer_14=0.0048 cross_arm_attention_mass_layer_15=0.0136 cross_arm_attention_mass_layer_16=0.0331 cross_arm_attention_mass_layer_17=0.0404 cross_arm_attention_mass_layer_2=0.0127 cross_arm_attention_mass_layer_3=0.0020 cross_arm_attention_mass_layer_4=0.0138 cross_arm_attention_mass_layer_5=0.0221 cross_arm_attention_mass_layer_6=0.0055 cross_arm_attention_mass_layer_7=0.0174 cross_arm_attention_mass_layer_8=0.0100 cross_arm_attention_mass_layer_9=0.0094 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.8673 grad_cross_arm_comm=0.0090 grad_left_action_in=0.1128 grad_left_expert=1.8561 grad_right_action_in=0.0739 grad_right_expert=1.0243 grad_shared_backbone=4.8443 (22938:train_pytorch.py:882)
148
+
149
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
150
+ 20:12:14.427 [I] step=13 loss=3.7746 smoothed_loss=2.2970 lr=1.29e-06 grad_norm=206.8044 step_time=0.5601s data_time=0.2394s it/s=1.252 eta_to_20=5.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0128 cross_arm_attention_mass_layer_10=0.0240 cross_arm_attention_mass_layer_11=0.0241 cross_arm_attention_mass_layer_12=0.0213 cross_arm_attention_mass_layer_13=0.0213 cross_arm_attention_mass_layer_14=0.0164 cross_arm_attention_mass_layer_15=0.0265 cross_arm_attention_mass_layer_16=0.0367 cross_arm_attention_mass_layer_17=0.0289 cross_arm_attention_mass_layer_2=0.0282 cross_arm_attention_mass_layer_3=0.0184 cross_arm_attention_mass_layer_4=0.0365 cross_arm_attention_mass_layer_5=0.0441 cross_arm_attention_mass_layer_6=0.0238 cross_arm_attention_mass_layer_7=0.0371 cross_arm_attention_mass_layer_8=0.0137 cross_arm_attention_mass_layer_9=0.0293 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=15.4957 grad_cross_arm_comm=2.1022 grad_left_action_in=2.3745 grad_left_expert=37.1536 grad_right_action_in=5.2568 grad_right_expert=138.8291 grad_shared_backbone=127.7336 (22938:train_pytorch.py:882)
151
+
152
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
153
+ 20:12:15.255 [I] step=14 loss=1.2933 smoothed_loss=2.1966 lr=1.39e-06 grad_norm=7.9182 step_time=0.5738s data_time=0.2541s it/s=1.210 eta_to_20=5.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0079 cross_arm_attention_mass_layer_11=0.0120 cross_arm_attention_mass_layer_12=0.0016 cross_arm_attention_mass_layer_13=0.0036 cross_arm_attention_mass_layer_14=0.0047 cross_arm_attention_mass_layer_15=0.0131 cross_arm_attention_mass_layer_16=0.0244 cross_arm_attention_mass_layer_17=0.0419 cross_arm_attention_mass_layer_2=0.0129 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0233 cross_arm_attention_mass_layer_6=0.0067 cross_arm_attention_mass_layer_7=0.0161 cross_arm_attention_mass_layer_8=0.0092 cross_arm_attention_mass_layer_9=0.0097 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.2052 grad_cross_arm_comm=0.0107 grad_left_action_in=0.1570 grad_left_expert=2.3411 grad_right_action_in=0.1025 grad_right_expert=1.1691 grad_shared_backbone=6.0836 (22938:train_pytorch.py:882)
154
+
155
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
156
+ 20:12:16.034 [I] step=15 loss=3.1068 smoothed_loss=2.2876 lr=1.49e-06 grad_norm=24.4182 step_time=0.5474s data_time=0.2314s it/s=1.286 eta_to_20=3.9s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0033 cross_arm_attention_mass_layer_10=0.0154 cross_arm_attention_mass_layer_11=0.0284 cross_arm_attention_mass_layer_12=0.0046 cross_arm_attention_mass_layer_13=0.0187 cross_arm_attention_mass_layer_14=0.0121 cross_arm_attention_mass_layer_15=0.0370 cross_arm_attention_mass_layer_16=0.0460 cross_arm_attention_mass_layer_17=0.0516 cross_arm_attention_mass_layer_2=0.0206 cross_arm_attention_mass_layer_3=0.0064 cross_arm_attention_mass_layer_4=0.0239 cross_arm_attention_mass_layer_5=0.0299 cross_arm_attention_mass_layer_6=0.0143 cross_arm_attention_mass_layer_7=0.0349 cross_arm_attention_mass_layer_8=0.0213 cross_arm_attention_mass_layer_9=0.0171 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=9.3484 grad_cross_arm_comm=0.3843 grad_left_action_in=0.3015 grad_left_expert=7.0086 grad_right_action_in=0.6660 grad_right_expert=6.4185 grad_shared_backbone=18.8039 (22938:train_pytorch.py:882)
157
+
158
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
159
+ 20:12:16.810 [I] step=16 loss=0.8710 smoothed_loss=2.1460 lr=1.59e-06 grad_norm=7.5162 step_time=0.5638s data_time=0.2117s it/s=1.292 eta_to_20=3.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0016 cross_arm_attention_mass_layer_10=0.0051 cross_arm_attention_mass_layer_11=0.0114 cross_arm_attention_mass_layer_12=0.0017 cross_arm_attention_mass_layer_13=0.0062 cross_arm_attention_mass_layer_14=0.0073 cross_arm_attention_mass_layer_15=0.0221 cross_arm_attention_mass_layer_16=0.0370 cross_arm_attention_mass_layer_17=0.0436 cross_arm_attention_mass_layer_2=0.0138 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0195 cross_arm_attention_mass_layer_6=0.0056 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0132 cross_arm_attention_mass_layer_9=0.0103 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.7344 grad_cross_arm_comm=0.0228 grad_left_action_in=0.1118 grad_left_expert=2.2761 grad_right_action_in=0.1234 grad_right_expert=1.1808 grad_shared_backbone=6.4124 (22938:train_pytorch.py:882)
160
+
161
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
162
+ 20:12:17.396 [I] step=17 loss=1.7002 smoothed_loss=2.1014 lr=1.69e-06 grad_norm=14.0785 step_time=0.4252s data_time=0.1614s it/s=1.708 eta_to_20=1.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0021 cross_arm_attention_mass_layer_10=0.0103 cross_arm_attention_mass_layer_11=0.0197 cross_arm_attention_mass_layer_12=0.0029 cross_arm_attention_mass_layer_13=0.0103 cross_arm_attention_mass_layer_14=0.0067 cross_arm_attention_mass_layer_15=0.0163 cross_arm_attention_mass_layer_16=0.0436 cross_arm_attention_mass_layer_17=0.0446 cross_arm_attention_mass_layer_2=0.0162 cross_arm_attention_mass_layer_3=0.0030 cross_arm_attention_mass_layer_4=0.0192 cross_arm_attention_mass_layer_5=0.0268 cross_arm_attention_mass_layer_6=0.0092 cross_arm_attention_mass_layer_7=0.0242 cross_arm_attention_mass_layer_8=0.0146 cross_arm_attention_mass_layer_9=0.0095 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.2605 grad_cross_arm_comm=0.0625 grad_left_action_in=0.1989 grad_left_expert=4.9518 grad_right_action_in=0.2156 grad_right_expert=2.1764 grad_shared_backbone=12.0796 (22938:train_pytorch.py:882)
163
+
164
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
165
+ 20:12:18.392 [I] step=18 loss=0.4844 smoothed_loss=1.9397 lr=1.79e-06 grad_norm=3.3459 step_time=0.6297s data_time=0.3660s it/s=1.005 eta_to_20=2.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0008 cross_arm_attention_mass_layer_10=0.0016 cross_arm_attention_mass_layer_11=0.0014 cross_arm_attention_mass_layer_12=0.0034 cross_arm_attention_mass_layer_13=0.0007 cross_arm_attention_mass_layer_14=0.0054 cross_arm_attention_mass_layer_15=0.0063 cross_arm_attention_mass_layer_16=0.0319 cross_arm_attention_mass_layer_17=0.0418 cross_arm_attention_mass_layer_2=0.0027 cross_arm_attention_mass_layer_3=0.0013 cross_arm_attention_mass_layer_4=0.0035 cross_arm_attention_mass_layer_5=0.0058 cross_arm_attention_mass_layer_6=0.0015 cross_arm_attention_mass_layer_7=0.0028 cross_arm_attention_mass_layer_8=0.0019 cross_arm_attention_mass_layer_9=0.0049 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=1.9561 grad_cross_arm_comm=0.0017 grad_left_action_in=0.0746 grad_left_expert=1.1140 grad_right_action_in=0.0388 grad_right_expert=0.5290 grad_shared_backbone=2.3985 (22938:train_pytorch.py:882)
166
+
167
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
168
+ 20:12:19.239 [I] step=19 loss=0.7633 smoothed_loss=1.8220 lr=1.89e-06 grad_norm=7.1468 step_time=0.5757s data_time=0.2714s it/s=1.182 eta_to_20=0.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0069 cross_arm_attention_mass_layer_11=0.0093 cross_arm_attention_mass_layer_12=0.0016 cross_arm_attention_mass_layer_13=0.0034 cross_arm_attention_mass_layer_14=0.0046 cross_arm_attention_mass_layer_15=0.0166 cross_arm_attention_mass_layer_16=0.0297 cross_arm_attention_mass_layer_17=0.0418 cross_arm_attention_mass_layer_2=0.0130 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0156 cross_arm_attention_mass_layer_5=0.0208 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0164 cross_arm_attention_mass_layer_8=0.0124 cross_arm_attention_mass_layer_9=0.0115 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.7548 grad_cross_arm_comm=0.0125 grad_left_action_in=0.1160 grad_left_expert=2.3520 grad_right_action_in=0.0799 grad_right_expert=1.3128 grad_shared_backbone=5.3838 (22938:train_pytorch.py:882)
169
+
170
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
171
+ 20:12:19.905 [I] step=20 loss=0.5943 smoothed_loss=1.6993 lr=1.99e-06 grad_norm=6.2792 step_time=0.4954s data_time=0.1707s it/s=1.504 eta_to_20=0.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0012 cross_arm_attention_mass_layer_10=0.0050 cross_arm_attention_mass_layer_11=0.0026 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0009 cross_arm_attention_mass_layer_14=0.0060 cross_arm_attention_mass_layer_15=0.0119 cross_arm_attention_mass_layer_16=0.0308 cross_arm_attention_mass_layer_17=0.0412 cross_arm_attention_mass_layer_2=0.0054 cross_arm_attention_mass_layer_3=0.0020 cross_arm_attention_mass_layer_4=0.0084 cross_arm_attention_mass_layer_5=0.0116 cross_arm_attention_mass_layer_6=0.0029 cross_arm_attention_mass_layer_7=0.0066 cross_arm_attention_mass_layer_8=0.0051 cross_arm_attention_mass_layer_9=0.0094 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.7963 grad_cross_arm_comm=0.0044 grad_left_action_in=0.0861 grad_left_expert=1.9694 grad_right_action_in=0.0578 grad_right_expert=1.3971 grad_shared_backbone=5.0478 (22938:train_pytorch.py:882)
172
+ 20:19:41.020 [I] Saved checkpoint at step 20 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20 (22938:train_pytorch.py:378)
173
+
artifacts/twin_split_expert_bringup_20260310/run_logs/split_independent_real_smoke3_r2.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:45:11.253 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2 (20567:train_pytorch.py:533)
2
+ 19:45:11.254 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (20567:train_pytorch.py:552)
3
+ 19:45:11.330 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (20567:config.py:234)
4
+ 19:45:11.331 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x79458ad85b50>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (20567:data_loader.py:284)
29
+ 19:45:16.791 [I] JAX version 0.5.3 available. (20567:config.py:125)
30
+ 19:45:40.542 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (20567:data_loader.py:148)
31
+ 19:45:40.654 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (20567:video_utils.py:36)
32
+ 19:46:47.372 [I] local_batch_size: 1 (20567:data_loader.py:365)
33
+ 19:50:09.799 [I] Enabled gradient checkpointing for PI0Pytorch model (20567:pi0_pytorch.py:138)
34
+ 19:50:09.802 [I] Enabled gradient checkpointing for memory optimization (20567:train_pytorch.py:624)
35
+ 19:50:09.803 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (20567:train_pytorch.py:493)
36
+ 19:50:09.804 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:653)
37
+ 19:50:13.559 [I] Weight loading missing key count: 0 (20567:train_pytorch.py:657)
38
+ 19:50:13.560 [I] Weight loading missing keys: set() (20567:train_pytorch.py:658)
39
+ 19:50:13.560 [I] Weight loading unexpected key count: 0 (20567:train_pytorch.py:659)
40
+ 19:50:13.560 [I] Weight loading unexpected keys: [] (20567:train_pytorch.py:660)
41
+ 19:50:13.560 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:661)
42
+ 19:50:13.565 [I] Running on: 963c158043aa | world_size=1 (20567:train_pytorch.py:701)
43
+ 19:50:13.565 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (20567:train_pytorch.py:702)
44
+ 19:50:13.565 [I] Memory optimizations: gradient_checkpointing=True (20567:train_pytorch.py:705)
45
+ 19:50:13.566 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (20567:train_pytorch.py:706)
46
+ 19:50:13.566 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (20567:train_pytorch.py:707)
47
+ 19:50:13.567 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (20567:train_pytorch.py:710)
48
+ 19:50:13.567 [I] EMA is not supported for PyTorch training (20567:train_pytorch.py:713)
49
+ 19:50:13.567 [I] Training precision: float32 (20567:train_pytorch.py:714)
50
+ 19:50:13.576 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (20567:train_pytorch.py:308)
51
+ 19:50:13.576 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (20567:train_pytorch.py:309)
52
+ 19:50:13.577 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (20567:train_pytorch.py:310)
53
+ 19:50:13.577 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (20567:train_pytorch.py:311)
54
+ 19:50:13.578 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:312)
55
+ 19:50:13.578 [I] Model type: split_independent (20567:train_pytorch.py:313)
56
+ 19:50:13.578 [I] Packed transforms active: True (20567:train_pytorch.py:314)
57
+ 19:50:13.579 [I] World size: 1 (20567:train_pytorch.py:315)
58
+ 19:50:13.579 [I] Batch size: local=1, global=1 (20567:train_pytorch.py:316)
59
+ 19:50:13.580 [I] num_workers: 0 (20567:train_pytorch.py:317)
60
+ 19:50:13.580 [I] Precision: float32 (20567:train_pytorch.py:318)
61
+ 19:50:13.580 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (20567:train_pytorch.py:319)
62
+ 19:50:13.581 [I] Save/log intervals: save_interval=3, log_interval=1 (20567:train_pytorch.py:326)
63
+ 19:50:13.581 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (20567:train_pytorch.py:327)
64
+ 19:50:13.581 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (20567:train_pytorch.py:328)
65
+ 19:50:13.582 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (20567:train_pytorch.py:329)
66
+ 19:50:13.582 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (20567:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 19:50:15.125 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
70
+ 19:50:15.126 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
71
+ 19:50:15.126 [I] debug_step=1 prompt_token_lengths=[75] (20567:train_pytorch.py:838)
72
+ 19:50:15.127 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (20567:train_pytorch.py:839)
73
+ 19:50:15.127 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (20567:train_pytorch.py:842)
74
+ 19:50:15.128 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
75
+ 19:50:15.143 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
76
+ 19:50:15.143 [I] debug_step=1 lr=9.96e-08 grad_norm=31.4779 data_time=0.2101s step_time=1.2943s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (20567:train_pytorch.py:854)
77
+ 19:50:15.144 [I] debug_step=1 grad_shared_backbone=25.5606 grad_left_action_in=0.2318 grad_right_action_in=0.9885 grad_left_expert=5.5978 grad_right_expert=12.3518 grad_action_out=9.6154 (20567:train_pytorch.py:862)
78
+ 19:50:15.144 [I] step=1 loss=2.6238 smoothed_loss=2.6238 lr=9.96e-08 grad_norm=31.4779 step_time=1.2943s data_time=0.2101s it/s=0.633 eta_to_3=3.2s max_cuda_memory=76.13GB grad_action_out=9.6154 grad_left_action_in=0.2318 grad_left_expert=5.5978 grad_right_action_in=0.9885 grad_right_expert=12.3518 grad_shared_backbone=25.5606 (20567:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 19:50:16.012 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
82
+ 19:50:16.013 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
83
+ 19:50:16.013 [I] debug_step=2 prompt_token_lengths=[76] (20567:train_pytorch.py:838)
84
+ 19:50:16.014 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (20567:train_pytorch.py:839)
85
+ 19:50:16.015 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (20567:train_pytorch.py:842)
86
+ 19:50:16.016 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
87
+ 19:50:16.016 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
88
+ 19:50:16.017 [I] debug_step=2 lr=1.99e-07 grad_norm=12.2770 data_time=0.2123s step_time=0.6695s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (20567:train_pytorch.py:854)
89
+ 19:50:16.017 [I] debug_step=2 grad_shared_backbone=10.3527 grad_left_action_in=0.1586 grad_right_action_in=0.1584 grad_left_expert=2.8415 grad_right_expert=4.0156 grad_action_out=4.1478 (20567:train_pytorch.py:862)
90
+ 19:50:16.018 [I] step=2 loss=1.1717 smoothed_loss=2.4786 lr=1.99e-07 grad_norm=12.2770 step_time=0.6695s data_time=0.2123s it/s=1.146 eta_to_3=0.9s max_cuda_memory=76.13GB grad_action_out=4.1478 grad_left_action_in=0.1586 grad_left_expert=2.8415 grad_right_action_in=0.1584 grad_right_expert=4.0156 grad_shared_backbone=10.3527 (20567:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 19:50:16.906 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
94
+ 19:50:16.907 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
95
+ 19:50:16.908 [I] debug_step=3 prompt_token_lengths=[75] (20567:train_pytorch.py:838)
96
+ 19:50:16.908 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (20567:train_pytorch.py:839)
97
+ 19:50:16.908 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (20567:train_pytorch.py:842)
98
+ 19:50:16.909 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
99
+ 19:50:16.910 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
100
+ 19:50:16.910 [I] debug_step=3 lr=2.99e-07 grad_norm=15.1079 data_time=0.2612s step_time=0.6330s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (20567:train_pytorch.py:854)
101
+ 19:50:16.911 [I] debug_step=3 grad_shared_backbone=8.6850 grad_left_action_in=0.2570 grad_right_action_in=0.3869 grad_left_expert=4.4422 grad_right_expert=10.5777 grad_action_out=3.5502 (20567:train_pytorch.py:862)
102
+ 19:50:16.911 [I] step=3 loss=0.9128 smoothed_loss=2.3220 lr=2.99e-07 grad_norm=15.1079 step_time=0.6330s data_time=0.2612s it/s=1.120 eta_to_3=0.0s max_cuda_memory=76.13GB grad_action_out=3.5502 grad_left_action_in=0.2570 grad_left_expert=4.4422 grad_right_action_in=0.3869 grad_right_expert=10.5777 grad_shared_backbone=8.6850 (20567:train_pytorch.py:882)
103
+ 19:53:54.052 [I] Saved checkpoint at step 3 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3 (20567:train_pytorch.py:378)
104
+
artifacts/twin_split_expert_bringup_20260310/run_logs/split_independent_real_train20.log ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 20:03:03.080 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20 (22934:train_pytorch.py:533)
2
+ 20:03:03.082 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22934:train_pytorch.py:552)
3
+ 20:03:03.183 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (22934:config.py:234)
4
+ 20:03:03.185 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x721cdf0dd610>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22934:data_loader.py:393)
29
+ 20:03:13.494 [I] JAX version 0.5.3 available. (22934:config.py:125)
30
+ 20:04:17.801 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22934:data_loader.py:148)
31
+ 20:04:17.904 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22934:video_utils.py:36)
32
+ 20:09:04.645 [I] local_batch_size: 1 (22934:data_loader.py:474)
33
+ 20:11:56.606 [I] Enabled gradient checkpointing for PI0Pytorch model (22934:pi0_pytorch.py:138)
34
+ 20:11:56.607 [I] Enabled gradient checkpointing for memory optimization (22934:train_pytorch.py:624)
35
+ 20:11:56.608 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22934:train_pytorch.py:493)
36
+ 20:11:56.609 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:653)
37
+ 20:12:01.374 [I] Weight loading missing key count: 0 (22934:train_pytorch.py:657)
38
+ 20:12:01.375 [I] Weight loading missing keys: set() (22934:train_pytorch.py:658)
39
+ 20:12:01.375 [I] Weight loading unexpected key count: 0 (22934:train_pytorch.py:659)
40
+ 20:12:01.375 [I] Weight loading unexpected keys: [] (22934:train_pytorch.py:660)
41
+ 20:12:01.376 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:661)
42
+ 20:12:01.380 [I] Running on: 963c158043aa | world_size=1 (22934:train_pytorch.py:701)
43
+ 20:12:01.381 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=20 (22934:train_pytorch.py:702)
44
+ 20:12:01.381 [I] Memory optimizations: gradient_checkpointing=True (22934:train_pytorch.py:705)
45
+ 20:12:01.381 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22934:train_pytorch.py:706)
46
+ 20:12:01.382 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22934:train_pytorch.py:707)
47
+ 20:12:01.382 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22934:train_pytorch.py:710)
48
+ 20:12:01.382 [I] EMA is not supported for PyTorch training (22934:train_pytorch.py:713)
49
+ 20:12:01.383 [I] Training precision: float32 (22934:train_pytorch.py:714)
50
+ 20:12:01.410 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (22934:train_pytorch.py:308)
51
+ 20:12:01.410 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22934:train_pytorch.py:309)
52
+ 20:12:01.411 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22934:train_pytorch.py:310)
53
+ 20:12:01.411 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22934:train_pytorch.py:311)
54
+ 20:12:01.412 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:312)
55
+ 20:12:01.412 [I] Model type: split_independent (22934:train_pytorch.py:313)
56
+ 20:12:01.412 [I] Packed transforms active: True (22934:train_pytorch.py:314)
57
+ 20:12:01.413 [I] World size: 1 (22934:train_pytorch.py:315)
58
+ 20:12:01.413 [I] Batch size: local=1, global=1 (22934:train_pytorch.py:316)
59
+ 20:12:01.414 [I] num_workers: 0 (22934:train_pytorch.py:317)
60
+ 20:12:01.414 [I] Precision: float32 (22934:train_pytorch.py:318)
61
+ 20:12:01.414 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22934:train_pytorch.py:319)
62
+ 20:12:01.415 [I] Save/log intervals: save_interval=20, log_interval=1 (22934:train_pytorch.py:326)
63
+ 20:12:01.415 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22934:train_pytorch.py:327)
64
+ 20:12:01.415 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22934:train_pytorch.py:328)
65
+ 20:12:01.416 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22934:train_pytorch.py:329)
66
+ 20:12:01.416 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22934:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 20:12:03.701 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
70
+ 20:12:03.702 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
71
+ 20:12:03.702 [I] debug_step=1 prompt_token_lengths=[75] (22934:train_pytorch.py:838)
72
+ 20:12:03.702 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22934:train_pytorch.py:839)
73
+ 20:12:03.702 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22934:train_pytorch.py:842)
74
+ 20:12:03.703 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
75
+ 20:12:03.729 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
76
+ 20:12:03.730 [I] debug_step=1 lr=9.96e-08 grad_norm=31.4779 data_time=0.5472s step_time=1.7166s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22934:train_pytorch.py:854)
77
+ 20:12:03.730 [I] debug_step=1 grad_shared_backbone=25.5606 grad_left_action_in=0.2318 grad_right_action_in=0.9885 grad_left_expert=5.5978 grad_right_expert=12.3518 grad_action_out=9.6154 (22934:train_pytorch.py:862)
78
+ 20:12:03.731 [I] step=1 loss=2.6238 smoothed_loss=2.6238 lr=9.96e-08 grad_norm=31.4779 step_time=1.7166s data_time=0.5472s it/s=0.425 eta_to_20=44.7s max_cuda_memory=76.13GB grad_action_out=9.6154 grad_left_action_in=0.2318 grad_left_expert=5.5978 grad_right_action_in=0.9885 grad_right_expert=12.3518 grad_shared_backbone=25.5606 (22934:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 20:12:05.012 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
82
+ 20:12:05.013 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
83
+ 20:12:05.014 [I] debug_step=2 prompt_token_lengths=[76] (22934:train_pytorch.py:838)
84
+ 20:12:05.014 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22934:train_pytorch.py:839)
85
+ 20:12:05.015 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22934:train_pytorch.py:842)
86
+ 20:12:05.016 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
87
+ 20:12:05.016 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
88
+ 20:12:05.017 [I] debug_step=2 lr=1.99e-07 grad_norm=12.2749 data_time=0.5381s step_time=0.7692s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
89
+ 20:12:05.017 [I] debug_step=2 grad_shared_backbone=10.3515 grad_left_action_in=0.1585 grad_right_action_in=0.1584 grad_left_expert=2.8412 grad_right_expert=4.0131 grad_action_out=4.1470 (22934:train_pytorch.py:862)
90
+ 20:12:05.018 [I] step=2 loss=1.1715 smoothed_loss=2.4786 lr=1.99e-07 grad_norm=12.2749 step_time=0.7692s data_time=0.5381s it/s=0.777 eta_to_20=23.2s max_cuda_memory=76.13GB grad_action_out=4.1470 grad_left_action_in=0.1585 grad_left_expert=2.8412 grad_right_action_in=0.1584 grad_right_expert=4.0131 grad_shared_backbone=10.3515 (22934:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 20:12:05.585 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
94
+ 20:12:05.586 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
95
+ 20:12:05.586 [I] debug_step=3 prompt_token_lengths=[75] (22934:train_pytorch.py:838)
96
+ 20:12:05.586 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22934:train_pytorch.py:839)
97
+ 20:12:05.587 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22934:train_pytorch.py:842)
98
+ 20:12:05.588 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
99
+ 20:12:05.588 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
100
+ 20:12:05.589 [I] debug_step=3 lr=2.99e-07 grad_norm=15.1205 data_time=0.1545s step_time=0.4182s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
101
+ 20:12:05.589 [I] debug_step=3 grad_shared_backbone=8.6946 grad_left_action_in=0.2568 grad_right_action_in=0.3873 grad_left_expert=4.4408 grad_right_expert=10.5877 grad_action_out=3.5507 (22934:train_pytorch.py:862)
102
+ 20:12:05.590 [I] step=3 loss=0.9126 smoothed_loss=2.3220 lr=2.99e-07 grad_norm=15.1205 step_time=0.4182s data_time=0.1545s it/s=1.751 eta_to_20=9.7s max_cuda_memory=76.13GB grad_action_out=3.5507 grad_left_action_in=0.2568 grad_left_expert=4.4408 grad_right_action_in=0.3873 grad_right_expert=10.5877 grad_shared_backbone=8.6946 (22934:train_pytorch.py:882)
103
+
104
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
105
+ 20:12:06.414 [I] debug_step=4 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
106
+ 20:12:06.415 [I] debug_step=4 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
107
+ 20:12:06.416 [I] debug_step=4 prompt_token_lengths=[78] (22934:train_pytorch.py:838)
108
+ 20:12:06.416 [I] debug_step=4 state_stats min=-0.7017 max=1.0004 mean=0.0553 std=0.3507 (22934:train_pytorch.py:839)
109
+ 20:12:06.417 [I] debug_step=4 action_stats min=-1.0014 max=1.0004 mean=-0.0683 std=0.4561 (22934:train_pytorch.py:842)
110
+ 20:12:06.417 [I] debug_step=4 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
111
+ 20:12:06.418 [I] debug_step=4 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
112
+ 20:12:06.419 [I] debug_step=4 lr=3.98e-07 grad_norm=9.2670 data_time=0.2679s step_time=0.5621s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
113
+ 20:12:06.419 [I] debug_step=4 grad_shared_backbone=7.8629 grad_left_action_in=0.1341 grad_right_action_in=0.0877 grad_left_expert=3.2369 grad_right_expert=1.0658 grad_action_out=3.4116 (22934:train_pytorch.py:862)
114
+ 20:12:06.420 [I] step=4 loss=1.1718 smoothed_loss=2.2070 lr=3.98e-07 grad_norm=9.2670 step_time=0.5621s data_time=0.2679s it/s=1.206 eta_to_20=13.3s max_cuda_memory=76.13GB grad_action_out=3.4116 grad_left_action_in=0.1341 grad_left_expert=3.2369 grad_right_action_in=0.0877 grad_right_expert=1.0658 grad_shared_backbone=7.8629 (22934:train_pytorch.py:882)
115
+
116
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
117
+ 20:12:07.218 [I] debug_step=5 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
118
+ 20:12:07.219 [I] debug_step=5 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
119
+ 20:12:07.219 [I] debug_step=5 prompt_token_lengths=[73] (22934:train_pytorch.py:838)
120
+ 20:12:07.219 [I] debug_step=5 state_stats min=-0.9599 max=1.0004 mean=0.0170 std=0.5364 (22934:train_pytorch.py:839)
121
+ 20:12:07.220 [I] debug_step=5 action_stats min=-1.0392 max=1.0004 mean=-0.0159 std=0.4488 (22934:train_pytorch.py:842)
122
+ 20:12:07.220 [I] debug_step=5 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
123
+ 20:12:07.221 [I] debug_step=5 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
124
+ 20:12:07.221 [I] debug_step=5 lr=4.98e-07 grad_norm=18.8576 data_time=0.2330s step_time=0.5704s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
125
+ 20:12:07.222 [I] debug_step=5 grad_shared_backbone=15.0420 grad_left_action_in=0.2664 grad_right_action_in=0.2257 grad_left_expert=7.9881 grad_right_expert=3.7966 grad_action_out=6.1884 (22934:train_pytorch.py:862)
126
+ 20:12:07.223 [I] step=5 loss=1.6473 smoothed_loss=2.1510 lr=4.98e-07 grad_norm=18.8576 step_time=0.5704s data_time=0.2330s it/s=1.246 eta_to_20=12.0s max_cuda_memory=76.13GB grad_action_out=6.1884 grad_left_action_in=0.2664 grad_left_expert=7.9881 grad_right_action_in=0.2257 grad_right_expert=3.7966 grad_shared_backbone=15.0420 (22934:train_pytorch.py:882)
127
+
128
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
129
+ 20:12:07.822 [I] step=6 loss=1.6098 smoothed_loss=2.0969 lr=5.98e-07 grad_norm=20.9772 step_time=0.4435s data_time=0.1600s it/s=1.671 eta_to_20=8.4s max_cuda_memory=76.13GB grad_action_out=6.0592 grad_left_action_in=0.2873 grad_left_expert=8.8574 grad_right_action_in=0.4264 grad_right_expert=6.3071 grad_shared_backbone=16.1173 (22934:train_pytorch.py:882)
130
+
131
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
132
+ 20:12:08.395 [I] step=7 loss=1.0401 smoothed_loss=1.9912 lr=6.97e-07 grad_norm=9.5173 step_time=0.4240s data_time=0.1495s it/s=1.747 eta_to_20=7.4s max_cuda_memory=76.13GB grad_action_out=4.1689 grad_left_action_in=0.1489 grad_left_expert=3.1996 grad_right_action_in=0.0904 grad_right_expert=2.4983 grad_shared_backbone=7.4213 (22934:train_pytorch.py:882)
133
+
134
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
135
+ 20:12:08.914 [I] step=8 loss=1.7539 smoothed_loss=1.9675 lr=7.97e-07 grad_norm=12.9701 step_time=0.3829s data_time=0.1362s it/s=1.931 eta_to_20=6.2s max_cuda_memory=76.13GB grad_action_out=5.3617 grad_left_action_in=0.1890 grad_left_expert=3.6536 grad_right_action_in=0.3790 grad_right_expert=2.7904 grad_shared_backbone=10.5667 (22934:train_pytorch.py:882)
136
+
137
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
138
+ 20:12:09.692 [I] step=9 loss=0.4114 smoothed_loss=1.8119 lr=8.96e-07 grad_norm=3.5873 step_time=0.5166s data_time=0.2609s it/s=1.288 eta_to_20=8.5s max_cuda_memory=76.13GB grad_action_out=1.8283 grad_left_action_in=0.0689 grad_left_expert=1.3656 grad_right_action_in=0.0549 grad_right_expert=0.7330 grad_shared_backbone=2.6507 (22934:train_pytorch.py:882)
139
+
140
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
141
+ 20:12:10.646 [I] step=10 loss=0.6228 smoothed_loss=1.6930 lr=9.96e-07 grad_norm=6.7396 step_time=0.7100s data_time=0.2450s it/s=1.049 eta_to_20=9.5s max_cuda_memory=76.13GB grad_action_out=2.2553 grad_left_action_in=0.0813 grad_left_expert=1.3495 grad_right_action_in=0.0919 grad_right_expert=2.0906 grad_shared_backbone=5.8179 (22934:train_pytorch.py:882)
142
+
143
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
144
+ 20:12:11.288 [I] step=11 loss=0.8688 smoothed_loss=1.6105 lr=1.10e-06 grad_norm=7.2182 step_time=0.4823s data_time=0.1593s it/s=1.561 eta_to_20=5.8s max_cuda_memory=76.13GB grad_action_out=3.3031 grad_left_action_in=0.1262 grad_left_expert=2.5456 grad_right_action_in=0.0809 grad_right_expert=0.9216 grad_shared_backbone=5.7177 (22934:train_pytorch.py:882)
145
+
146
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
147
+ 20:12:11.903 [I] step=12 loss=0.7319 smoothed_loss=1.5227 lr=1.20e-06 grad_norm=6.1848 step_time=0.4468s data_time=0.1681s it/s=1.629 eta_to_20=4.9s max_cuda_memory=76.13GB grad_action_out=2.7925 grad_left_action_in=0.1038 grad_left_expert=2.4508 grad_right_action_in=0.0680 grad_right_expert=0.8716 grad_shared_backbone=4.8333 (22934:train_pytorch.py:882)
148
+
149
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
150
+ 20:12:12.684 [I] step=13 loss=0.8788 smoothed_loss=1.4583 lr=1.29e-06 grad_norm=20.2227 step_time=0.5649s data_time=0.2162s it/s=1.282 eta_to_20=5.5s max_cuda_memory=76.13GB grad_action_out=3.0176 grad_left_action_in=0.1300 grad_left_expert=2.8276 grad_right_action_in=0.4691 grad_right_expert=12.9156 grad_shared_backbone=11.2157 (22934:train_pytorch.py:882)
151
+
152
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
153
+ 20:12:13.370 [I] step=14 loss=1.2741 smoothed_loss=1.4399 lr=1.39e-06 grad_norm=7.8620 step_time=0.5100s data_time=0.1755s it/s=1.461 eta_to_20=4.1s max_cuda_memory=76.13GB grad_action_out=4.2194 grad_left_action_in=0.1433 grad_left_expert=2.8949 grad_right_action_in=0.0958 grad_right_expert=1.0096 grad_shared_backbone=5.8070 (22934:train_pytorch.py:882)
154
+
155
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
156
+ 20:12:14.027 [I] step=15 loss=2.3729 smoothed_loss=1.5332 lr=1.49e-06 grad_norm=19.3589 step_time=0.4678s data_time=0.1899s it/s=1.523 eta_to_20=3.3s max_cuda_memory=76.13GB grad_action_out=7.2135 grad_left_action_in=0.2665 grad_left_expert=7.5354 grad_right_action_in=0.5496 grad_right_expert=4.5295 grad_shared_backbone=15.2257 (22934:train_pytorch.py:882)
157
+
158
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
159
+ 20:12:14.874 [I] step=16 loss=0.8147 smoothed_loss=1.4613 lr=1.59e-06 grad_norm=7.7365 step_time=0.5547s data_time=0.2919s it/s=1.183 eta_to_20=3.4s max_cuda_memory=76.13GB grad_action_out=2.7237 grad_left_action_in=0.1192 grad_left_expert=2.8822 grad_right_action_in=0.0900 grad_right_expert=0.8615 grad_shared_backbone=6.4500 (22934:train_pytorch.py:882)
160
+
161
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
162
+ 20:12:15.664 [I] step=17 loss=1.4318 smoothed_loss=1.4584 lr=1.69e-06 grad_norm=19.5452 step_time=0.5511s data_time=0.2382s it/s=1.268 eta_to_20=2.4s max_cuda_memory=76.13GB grad_action_out=3.9684 grad_left_action_in=0.3767 grad_left_expert=7.8636 grad_right_action_in=0.1317 grad_right_expert=1.6847 grad_shared_backbone=16.9059 (22934:train_pytorch.py:882)
163
+
164
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
165
+ 20:12:16.588 [I] step=18 loss=0.4858 smoothed_loss=1.3611 lr=1.79e-06 grad_norm=3.4382 step_time=0.6846s data_time=0.2403s it/s=1.082 eta_to_20=1.8s max_cuda_memory=76.13GB grad_action_out=1.9985 grad_left_action_in=0.0749 grad_left_expert=1.4156 grad_right_action_in=0.0390 grad_right_expert=0.5210 grad_shared_backbone=2.3369 (22934:train_pytorch.py:882)
166
+
167
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
168
+ 20:12:17.216 [I] step=19 loss=0.7492 smoothed_loss=1.2999 lr=1.89e-06 grad_norm=6.9377 step_time=0.4815s data_time=0.1459s it/s=1.596 eta_to_20=0.6s max_cuda_memory=76.13GB grad_action_out=3.7478 grad_left_action_in=0.1113 grad_left_expert=2.8716 grad_right_action_in=0.0729 grad_right_expert=1.0784 grad_shared_backbone=4.9024 (22934:train_pytorch.py:882)
169
+
170
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
171
+ 20:12:18.186 [I] step=20 loss=0.6038 smoothed_loss=1.2303 lr=1.99e-06 grad_norm=7.0090 step_time=0.7175s data_time=0.2525s it/s=1.032 eta_to_20=0.0s max_cuda_memory=76.13GB grad_action_out=2.8786 grad_left_action_in=0.0890 grad_left_expert=2.7778 grad_right_action_in=0.0549 grad_right_expert=1.4578 grad_shared_backbone=5.5395 (22934:train_pytorch.py:882)
172
+ 20:19:39.399 [I] Saved checkpoint at step 20 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20 (22934:train_pytorch.py:378)
173
+
artifacts/twin_split_expert_bringup_20260310/sanity_checks/split_communicating_invariants.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ weight_loading_missing_keys: []
2
+ weight_loading_unexpected_keys: []
3
+ identical_branch_suffix_max_abs_diff: 0.00000000
4
+ identical_branch_suffix_match: True
5
+ left_branch_invariance_check: skipped_for_split_communicating
6
+ right_branch_invariance_check: skipped_for_split_communicating
artifacts/twin_split_expert_bringup_20260310/sanity_checks/split_independent_invariants.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ weight_loading_missing_keys: []
2
+ weight_loading_unexpected_keys: []
3
+ identical_branch_suffix_max_abs_diff: 0.00000000
4
+ identical_branch_suffix_match: True
5
+ left_branch_invariance_max_abs_diff: 0.00000000
6
+ right_branch_invariance_max_abs_diff: 0.00000000
7
+ left_branch_invariant: True
8
+ right_branch_invariant: True
openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/assets/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e6cd3a4c82f532df9754b41acaa1702add6fed61c90bd1e302f1ee902b13cd
3
+ size 3044
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5729412bbd417c49d081aebe4be29d5dc462be0a5e46b00033e942bfaa6f82ba
3
+ size 17232229008
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e95638a814c7b6be696fa5f2fa5353a7d6c9679d5a7fc88a95569b35ff9bf5
3
+ size 29412931288
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/assets/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f437d3d865b5a6c0c8a907f6e68f92b55ee83c0dc09a12dcc47ab74a82dbf6f
3
+ size 3044
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39ffa219f42d07ad472b0c128d2dc4bad0b33bf8750f9d4d13a5afb39debe72b
3
+ size 17232229008
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72592549367cbce929b8af3f0bc47054126facb774bfd4aca067a9f579b29db1
3
+ size 29412931288
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/assets/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ca730216bba32605f1e94e967e0b4b88b237f6c234c668da521bd445bc33ff
3
+ size 3044
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2177e5fc2fdaa16dfa6711a21213c952b2913d36e290759003a58554ef7bd9f9
3
+ size 17232228840
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da191448fd206e9e012789f3a03faf31ff5694800dc621f163c4fa1af0295ee1
3
+ size 29412930337
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/assets/lsnu/twin_dual_push_128_train/norm_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "norm_stats": {
3
+ "state": {
4
+ "mean": [
5
+ 0.1060400903224945,
6
+ 0.20956481993198395,
7
+ 0.09184283018112183,
8
+ -1.9880156517028809,
9
+ -0.0493016354739666,
10
+ 2.200657844543457,
11
+ 1.0759528875350952,
12
+ 0.5274205207824707,
13
+ 0.015858052298426628,
14
+ 0.08288046717643738,
15
+ -0.06887393444776535,
16
+ -1.9063940048217773,
17
+ 0.048101384192705154,
18
+ 2.0108680725097656,
19
+ -0.9290279746055603,
20
+ 0.8440811038017273
21
+ ],
22
+ "std": [
23
+ 0.09207697212696075,
24
+ 0.31317394971847534,
25
+ 0.08127228915691376,
26
+ 0.5381271243095398,
27
+ 0.060932666063308716,
28
+ 0.5120578408241272,
29
+ 0.2252715528011322,
30
+ 0.49924755096435547,
31
+ 0.20230208337306976,
32
+ 0.3140813112258911,
33
+ 0.2166559249162674,
34
+ 0.5264315009117126,
35
+ 0.2017098367214203,
36
+ 0.47457119822502136,
37
+ 1.1786143779754639,
38
+ 0.3627784252166748
39
+ ],
40
+ "q01": [
41
+ -5.003212208976038e-6,
42
+ -0.3880260119378567,
43
+ -0.000022378295398084447,
44
+ -2.9896268159270285,
45
+ -0.23859235458523037,
46
+ 1.2214620113372803,
47
+ 0.7853838205337524,
48
+ 0.0,
49
+ -0.6156159267425537,
50
+ -0.4149419299602509,
51
+ -0.9436963497161865,
52
+ -2.8839772893309594,
53
+ -0.9050835555553436,
54
+ 1.2214889526367188,
55
+ -2.795644993972778,
56
+ 0.0
57
+ ],
58
+ "q99": [
59
+ 0.31251292623596166,
60
+ 0.8654691616654395,
61
+ 0.35174238551805614,
62
+ -0.8763489654541017,
63
+ 0.052121943226456635,
64
+ 2.9720811741352082,
65
+ 1.6446517068386077,
66
+ 0.9998,
67
+ 0.7670312994003294,
68
+ 0.9607345881462095,
69
+ 0.6871046730995181,
70
+ -0.874981226503849,
71
+ 0.35838486022949234,
72
+ 2.977322695541382,
73
+ 0.7847790859222412,
74
+ 0.9998
75
+ ]
76
+ },
77
+ "actions": {
78
+ "mean": [
79
+ 0.03630240634083748,
80
+ 0.09624441713094711,
81
+ 0.01367407850921154,
82
+ -0.2224988043308258,
83
+ -0.027621738612651825,
84
+ 0.27498844265937805,
85
+ 0.08921869844198227,
86
+ 0.4565052390098572,
87
+ -0.0037808618508279324,
88
+ 0.09113847464323044,
89
+ -0.0037622663658112288,
90
+ -0.22537092864513397,
91
+ 0.008262325078248978,
92
+ 0.2679949402809143,
93
+ -0.574528694152832,
94
+ 0.7731654047966003
95
+ ],
96
+ "std": [
97
+ 0.049951743334531784,
98
+ 0.29268014430999756,
99
+ 0.06852161139249802,
100
+ 0.3647724986076355,
101
+ 0.07012807577848434,
102
+ 0.27129024267196655,
103
+ 0.11329206824302673,
104
+ 0.49810460209846497,
105
+ 0.09174609929323196,
106
+ 0.22704003751277924,
107
+ 0.10693909972906113,
108
+ 0.2572591006755829,
109
+ 0.11801817268133163,
110
+ 0.12355879694223404,
111
+ 0.35835781693458557,
112
+ 0.4187847375869751
113
+ ],
114
+ "q01": [
115
+ -0.0005862064361572272,
116
+ -0.38811749875545504,
117
+ -0.255800724029541,
118
+ -0.8347694625854493,
119
+ -0.35145472717285153,
120
+ -0.0015478792190551753,
121
+ -0.0005817413330078125,
122
+ 0.0,
123
+ -0.2644369697570801,
124
+ -0.351582763671875,
125
+ -0.3696939945220947,
126
+ -0.7309195489883423,
127
+ -0.3354415845870973,
128
+ -0.000662303924560547,
129
+ -0.934731125831604,
130
+ 0.0
131
+ ],
132
+ "q99": [
133
+ 0.20790743064880374,
134
+ 0.811985669732094,
135
+ 0.19612836360931396,
136
+ 0.3395817384719848,
137
+ 0.05568643188476563,
138
+ 0.7526534500122071,
139
+ 0.4252559995651245,
140
+ 0.9998,
141
+ 0.2558236026763916,
142
+ 0.5890134544372558,
143
+ 0.35822071075439466,
144
+ 0.18567593073844912,
145
+ 0.44035053730010976,
146
+ 0.4996662902832031,
147
+ 0.1265523338317871,
148
+ 0.9998
149
+ ]
150
+ }
151
+ }
152
+ }
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44c886f451c1d306910d471cc73daf51ad51fd7acd092e2174a44318747ee02
3
+ size 3044
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c8bcb5f7990cf9d6623b6c55f4143be77d95fad7fb9e8edfcd3373f62d63ee8
3
+ size 17232228840
openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec2567d911c1956025db067b45ade3aebe0933c0db81550235a5190157aad6a0
3
+ size 29412930337
openpi/run_logs/split_communicating_real_smoke3.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:55:02.788 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3 (22110:train_pytorch.py:533)
2
+ 19:55:02.789 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22110:train_pytorch.py:552)
3
+ 19:55:02.865 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train (22110:config.py:234)
4
+ 19:55:02.867 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x7ec79fca8910>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22110:data_loader.py:284)
29
+ 19:55:09.225 [I] JAX version 0.5.3 available. (22110:config.py:125)
30
+ 19:55:34.099 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22110:data_loader.py:148)
31
+ 19:55:34.205 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22110:video_utils.py:36)
32
+ 19:56:38.376 [I] local_batch_size: 1 (22110:data_loader.py:365)
33
+ 19:58:25.969 [I] Enabled gradient checkpointing for PI0Pytorch model (22110:pi0_pytorch.py:138)
34
+ 19:58:25.971 [I] Enabled gradient checkpointing for memory optimization (22110:train_pytorch.py:624)
35
+ 19:58:25.972 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22110:train_pytorch.py:493)
36
+ 19:58:25.972 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:653)
37
+ 19:58:29.565 [I] Weight loading missing key count: 0 (22110:train_pytorch.py:657)
38
+ 19:58:29.566 [I] Weight loading missing keys: set() (22110:train_pytorch.py:658)
39
+ 19:58:29.566 [I] Weight loading unexpected key count: 0 (22110:train_pytorch.py:659)
40
+ 19:58:29.566 [I] Weight loading unexpected keys: [] (22110:train_pytorch.py:660)
41
+ 19:58:29.567 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:661)
42
+ 19:58:29.571 [I] Running on: 963c158043aa | world_size=1 (22110:train_pytorch.py:701)
43
+ 19:58:29.571 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (22110:train_pytorch.py:702)
44
+ 19:58:29.572 [I] Memory optimizations: gradient_checkpointing=True (22110:train_pytorch.py:705)
45
+ 19:58:29.572 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22110:train_pytorch.py:706)
46
+ 19:58:29.573 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22110:train_pytorch.py:707)
47
+ 19:58:29.573 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22110:train_pytorch.py:710)
48
+ 19:58:29.573 [I] EMA is not supported for PyTorch training (22110:train_pytorch.py:713)
49
+ 19:58:29.574 [I] Training precision: float32 (22110:train_pytorch.py:714)
50
+ 19:58:29.590 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k (22110:train_pytorch.py:308)
51
+ 19:58:29.590 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22110:train_pytorch.py:309)
52
+ 19:58:29.591 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22110:train_pytorch.py:310)
53
+ 19:58:29.592 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22110:train_pytorch.py:311)
54
+ 19:58:29.592 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22110:train_pytorch.py:312)
55
+ 19:58:29.592 [I] Model type: split_communicating (22110:train_pytorch.py:313)
56
+ 19:58:29.593 [I] Packed transforms active: True (22110:train_pytorch.py:314)
57
+ 19:58:29.593 [I] World size: 1 (22110:train_pytorch.py:315)
58
+ 19:58:29.594 [I] Batch size: local=1, global=1 (22110:train_pytorch.py:316)
59
+ 19:58:29.594 [I] num_workers: 0 (22110:train_pytorch.py:317)
60
+ 19:58:29.595 [I] Precision: float32 (22110:train_pytorch.py:318)
61
+ 19:58:29.595 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22110:train_pytorch.py:319)
62
+ 19:58:29.595 [I] Save/log intervals: save_interval=3, log_interval=1 (22110:train_pytorch.py:326)
63
+ 19:58:29.596 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22110:train_pytorch.py:327)
64
+ 19:58:29.596 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22110:train_pytorch.py:328)
65
+ 19:58:29.597 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22110:train_pytorch.py:329)
66
+ 19:58:29.597 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22110:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 19:58:31.354 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
70
+ 19:58:31.355 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
71
+ 19:58:31.356 [I] debug_step=1 prompt_token_lengths=[75] (22110:train_pytorch.py:838)
72
+ 19:58:31.356 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22110:train_pytorch.py:839)
73
+ 19:58:31.357 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22110:train_pytorch.py:842)
74
+ 19:58:31.358 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
75
+ 19:58:31.372 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
76
+ 19:58:31.372 [I] debug_step=1 lr=9.96e-08 grad_norm=60.0472 data_time=0.3311s step_time=1.3966s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22110:train_pytorch.py:854)
77
+ 19:58:31.373 [I] debug_step=1 grad_shared_backbone=36.9945 grad_left_action_in=2.3769 grad_right_action_in=1.7630 grad_left_expert=31.1244 grad_right_expert=27.8917 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 (22110:train_pytorch.py:862)
78
+ 19:58:31.374 [I] step=1 loss=3.8411 smoothed_loss=3.8411 lr=9.96e-08 grad_norm=60.0472 step_time=1.3966s data_time=0.3311s it/s=0.555 eta_to_3=3.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 grad_left_action_in=2.3769 grad_left_expert=31.1244 grad_right_action_in=1.7630 grad_right_expert=27.8917 grad_shared_backbone=36.9945 (22110:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 19:58:32.164 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
82
+ 19:58:32.165 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
83
+ 19:58:32.166 [I] debug_step=2 prompt_token_lengths=[76] (22110:train_pytorch.py:838)
84
+ 19:58:32.166 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22110:train_pytorch.py:839)
85
+ 19:58:32.167 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22110:train_pytorch.py:842)
86
+ 19:58:32.168 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
87
+ 19:58:32.168 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
88
+ 19:58:32.169 [I] debug_step=2 lr=1.99e-07 grad_norm=10.7300 data_time=0.1812s step_time=0.6234s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22110:train_pytorch.py:854)
89
+ 19:58:32.169 [I] debug_step=2 grad_shared_backbone=9.2018 grad_left_action_in=0.1651 grad_right_action_in=0.1485 grad_left_expert=2.5032 grad_right_expert=2.3988 grad_action_out=4.0772 grad_cross_arm_comm=0.0166 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 (22110:train_pytorch.py:862)
90
+ 19:58:32.170 [I] step=2 loss=1.1389 smoothed_loss=3.5709 lr=1.99e-07 grad_norm=10.7300 step_time=0.6234s data_time=0.1812s it/s=1.257 eta_to_3=0.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0772 grad_cross_arm_comm=0.0166 grad_left_action_in=0.1651 grad_left_expert=2.5032 grad_right_action_in=0.1485 grad_right_expert=2.3988 grad_shared_backbone=9.2018 (22110:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 19:58:32.708 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22110:train_pytorch.py:831)
94
+ 19:58:32.709 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22110:train_pytorch.py:835)
95
+ 19:58:32.709 [I] debug_step=3 prompt_token_lengths=[75] (22110:train_pytorch.py:838)
96
+ 19:58:32.710 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22110:train_pytorch.py:839)
97
+ 19:58:32.711 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22110:train_pytorch.py:842)
98
+ 19:58:32.711 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22110:train_pytorch.py:845)
99
+ 19:58:32.712 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22110:train_pytorch.py:849)
100
+ 19:58:32.712 [I] debug_step=3 lr=2.99e-07 grad_norm=343.7256 data_time=0.1312s step_time=0.4126s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22110:train_pytorch.py:854)
101
+ 19:58:32.713 [I] debug_step=3 grad_shared_backbone=215.2880 grad_left_action_in=4.7981 grad_right_action_in=9.5346 grad_left_expert=72.6437 grad_right_expert=227.6029 grad_action_out=23.7709 grad_cross_arm_comm=3.3555 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 (22110:train_pytorch.py:862)
102
+ 19:58:32.713 [I] step=3 loss=5.0518 smoothed_loss=3.7190 lr=2.99e-07 grad_norm=343.7256 step_time=0.4126s data_time=0.1312s it/s=1.843 eta_to_3=0.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=23.7709 grad_cross_arm_comm=3.3555 grad_left_action_in=4.7981 grad_left_expert=72.6437 grad_right_action_in=9.5346 grad_right_expert=227.6029 grad_shared_backbone=215.2880 (22110:train_pytorch.py:882)
103
+ 20:01:38.475 [I] Saved checkpoint at step 3 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_smoke3/3 (22110:train_pytorch.py:378)
104
+
openpi/run_logs/split_communicating_real_train20.log ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 20:03:03.480 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20 (22938:train_pytorch.py:533)
2
+ 20:03:03.486 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22938:train_pytorch.py:552)
3
+ 20:03:03.634 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train (22938:config.py:234)
4
+ 20:03:03.637 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x7303f4ce5b90>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22938:data_loader.py:393)
29
+ 20:03:15.223 [I] JAX version 0.5.3 available. (22938:config.py:125)
30
+ 20:04:19.283 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22938:data_loader.py:148)
31
+ 20:04:19.378 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22938:video_utils.py:36)
32
+ 20:09:10.375 [I] local_batch_size: 1 (22938:data_loader.py:474)
33
+ 20:11:59.735 [I] Enabled gradient checkpointing for PI0Pytorch model (22938:pi0_pytorch.py:138)
34
+ 20:11:59.737 [I] Enabled gradient checkpointing for memory optimization (22938:train_pytorch.py:624)
35
+ 20:11:59.738 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22938:train_pytorch.py:493)
36
+ 20:11:59.738 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:653)
37
+ 20:12:04.492 [I] Weight loading missing key count: 0 (22938:train_pytorch.py:657)
38
+ 20:12:04.492 [I] Weight loading missing keys: set() (22938:train_pytorch.py:658)
39
+ 20:12:04.492 [I] Weight loading unexpected key count: 0 (22938:train_pytorch.py:659)
40
+ 20:12:04.493 [I] Weight loading unexpected keys: [] (22938:train_pytorch.py:660)
41
+ 20:12:04.493 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:661)
42
+ 20:12:04.497 [I] Running on: 963c158043aa | world_size=1 (22938:train_pytorch.py:701)
43
+ 20:12:04.498 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=20 (22938:train_pytorch.py:702)
44
+ 20:12:04.498 [I] Memory optimizations: gradient_checkpointing=True (22938:train_pytorch.py:705)
45
+ 20:12:04.499 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22938:train_pytorch.py:706)
46
+ 20:12:04.499 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22938:train_pytorch.py:707)
47
+ 20:12:04.499 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22938:train_pytorch.py:710)
48
+ 20:12:04.500 [I] EMA is not supported for PyTorch training (22938:train_pytorch.py:713)
49
+ 20:12:04.500 [I] Training precision: float32 (22938:train_pytorch.py:714)
50
+ 20:12:04.509 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k (22938:train_pytorch.py:308)
51
+ 20:12:04.509 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22938:train_pytorch.py:309)
52
+ 20:12:04.510 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22938:train_pytorch.py:310)
53
+ 20:12:04.510 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22938:train_pytorch.py:311)
54
+ 20:12:04.511 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_communicating_packed_from_single (22938:train_pytorch.py:312)
55
+ 20:12:04.511 [I] Model type: split_communicating (22938:train_pytorch.py:313)
56
+ 20:12:04.511 [I] Packed transforms active: True (22938:train_pytorch.py:314)
57
+ 20:12:04.512 [I] World size: 1 (22938:train_pytorch.py:315)
58
+ 20:12:04.512 [I] Batch size: local=1, global=1 (22938:train_pytorch.py:316)
59
+ 20:12:04.512 [I] num_workers: 0 (22938:train_pytorch.py:317)
60
+ 20:12:04.513 [I] Precision: float32 (22938:train_pytorch.py:318)
61
+ 20:12:04.513 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22938:train_pytorch.py:319)
62
+ 20:12:04.513 [I] Save/log intervals: save_interval=20, log_interval=1 (22938:train_pytorch.py:326)
63
+ 20:12:04.514 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22938:train_pytorch.py:327)
64
+ 20:12:04.514 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22938:train_pytorch.py:328)
65
+ 20:12:04.515 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22938:train_pytorch.py:329)
66
+ 20:12:04.515 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22938:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 20:12:06.079 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
70
+ 20:12:06.080 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
71
+ 20:12:06.080 [I] debug_step=1 prompt_token_lengths=[75] (22938:train_pytorch.py:838)
72
+ 20:12:06.081 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22938:train_pytorch.py:839)
73
+ 20:12:06.081 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22938:train_pytorch.py:842)
74
+ 20:12:06.082 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
75
+ 20:12:06.097 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
76
+ 20:12:06.097 [I] debug_step=1 lr=9.96e-08 grad_norm=60.0473 data_time=0.2034s step_time=1.3216s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22938:train_pytorch.py:854)
77
+ 20:12:06.098 [I] debug_step=1 grad_shared_backbone=36.9946 grad_left_action_in=2.3769 grad_right_action_in=1.7630 grad_left_expert=31.1244 grad_right_expert=27.8917 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 (22938:train_pytorch.py:862)
78
+ 20:12:06.099 [I] step=1 loss=3.8411 smoothed_loss=3.8411 lr=9.96e-08 grad_norm=60.0473 step_time=1.3216s data_time=0.2034s it/s=0.625 eta_to_20=30.4s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0050 cross_arm_attention_mass_layer_10=0.0188 cross_arm_attention_mass_layer_11=0.0278 cross_arm_attention_mass_layer_12=0.0052 cross_arm_attention_mass_layer_13=0.0161 cross_arm_attention_mass_layer_14=0.0091 cross_arm_attention_mass_layer_15=0.0342 cross_arm_attention_mass_layer_16=0.0457 cross_arm_attention_mass_layer_17=0.0454 cross_arm_attention_mass_layer_2=0.0217 cross_arm_attention_mass_layer_3=0.0086 cross_arm_attention_mass_layer_4=0.0279 cross_arm_attention_mass_layer_5=0.0355 cross_arm_attention_mass_layer_6=0.0179 cross_arm_attention_mass_layer_7=0.0369 cross_arm_attention_mass_layer_8=0.0183 cross_arm_attention_mass_layer_9=0.0153 cross_arm_comm_gate_layer_0=0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=0.0000 cross_arm_comm_gate_layer_11=0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=0.0000 cross_arm_comm_gate_layer_2=0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=0.0000 cross_arm_comm_gate_layer_5=0.0000 cross_arm_comm_gate_layer_6=0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=0.0000 cross_arm_comm_gate_layer_9=0.0000 grad_action_out=13.0720 grad_cross_arm_comm=3.1067 grad_left_action_in=2.3769 grad_left_expert=31.1244 grad_right_action_in=1.7630 grad_right_expert=27.8917 grad_shared_backbone=36.9946 (22938:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 20:12:07.067 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
82
+ 20:12:07.067 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
83
+ 20:12:07.068 [I] debug_step=2 prompt_token_lengths=[76] (22938:train_pytorch.py:838)
84
+ 20:12:07.069 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22938:train_pytorch.py:839)
85
+ 20:12:07.069 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22938:train_pytorch.py:842)
86
+ 20:12:07.070 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
87
+ 20:12:07.070 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
88
+ 20:12:07.071 [I] debug_step=2 lr=1.99e-07 grad_norm=10.7247 data_time=0.2263s step_time=0.7585s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
89
+ 20:12:07.071 [I] debug_step=2 grad_shared_backbone=9.1973 grad_left_action_in=0.1651 grad_right_action_in=0.1484 grad_left_expert=2.5023 grad_right_expert=2.3935 grad_action_out=4.0770 grad_cross_arm_comm=0.0166 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 (22938:train_pytorch.py:862)
90
+ 20:12:07.072 [I] step=2 loss=1.1389 smoothed_loss=3.5709 lr=1.99e-07 grad_norm=10.7247 step_time=0.7585s data_time=0.2263s it/s=1.028 eta_to_20=17.5s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0019 cross_arm_attention_mass_layer_10=0.0094 cross_arm_attention_mass_layer_11=0.0151 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0053 cross_arm_attention_mass_layer_14=0.0056 cross_arm_attention_mass_layer_15=0.0250 cross_arm_attention_mass_layer_16=0.0356 cross_arm_attention_mass_layer_17=0.0413 cross_arm_attention_mass_layer_2=0.0161 cross_arm_attention_mass_layer_3=0.0029 cross_arm_attention_mass_layer_4=0.0175 cross_arm_attention_mass_layer_5=0.0243 cross_arm_attention_mass_layer_6=0.0074 cross_arm_attention_mass_layer_7=0.0232 cross_arm_attention_mass_layer_8=0.0155 cross_arm_attention_mass_layer_9=0.0135 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=-0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0770 grad_cross_arm_comm=0.0166 grad_left_action_in=0.1651 grad_left_expert=2.5023 grad_right_action_in=0.1484 grad_right_expert=2.3935 grad_shared_backbone=9.1973 (22938:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 20:12:07.689 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
94
+ 20:12:07.690 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
95
+ 20:12:07.690 [I] debug_step=3 prompt_token_lengths=[75] (22938:train_pytorch.py:838)
96
+ 20:12:07.691 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22938:train_pytorch.py:839)
97
+ 20:12:07.692 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22938:train_pytorch.py:842)
98
+ 20:12:07.692 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
99
+ 20:12:07.693 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
100
+ 20:12:07.693 [I] debug_step=3 lr=2.99e-07 grad_norm=343.6402 data_time=0.1557s step_time=0.4654s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
101
+ 20:12:07.694 [I] debug_step=3 grad_shared_backbone=215.2410 grad_left_action_in=4.7969 grad_right_action_in=9.5325 grad_left_expert=72.6238 grad_right_expert=227.5470 grad_action_out=23.7695 grad_cross_arm_comm=3.3548 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 (22938:train_pytorch.py:862)
102
+ 20:12:07.694 [I] step=3 loss=5.0512 smoothed_loss=3.7189 lr=2.99e-07 grad_norm=343.6402 step_time=0.4654s data_time=0.1557s it/s=1.609 eta_to_20=10.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0127 cross_arm_attention_mass_layer_10=0.0177 cross_arm_attention_mass_layer_11=0.0230 cross_arm_attention_mass_layer_12=0.0134 cross_arm_attention_mass_layer_13=0.0242 cross_arm_attention_mass_layer_14=0.0109 cross_arm_attention_mass_layer_15=0.0285 cross_arm_attention_mass_layer_16=0.0403 cross_arm_attention_mass_layer_17=0.0268 cross_arm_attention_mass_layer_2=0.0275 cross_arm_attention_mass_layer_3=0.0190 cross_arm_attention_mass_layer_4=0.0359 cross_arm_attention_mass_layer_5=0.0454 cross_arm_attention_mass_layer_6=0.0228 cross_arm_attention_mass_layer_7=0.0346 cross_arm_attention_mass_layer_8=0.0149 cross_arm_attention_mass_layer_9=0.0296 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=23.7695 grad_cross_arm_comm=3.3548 grad_left_action_in=4.7969 grad_left_expert=72.6238 grad_right_action_in=9.5325 grad_right_expert=227.5470 grad_shared_backbone=215.2410 (22938:train_pytorch.py:882)
103
+
104
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
105
+ 20:12:08.256 [I] debug_step=4 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
106
+ 20:12:08.257 [I] debug_step=4 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
107
+ 20:12:08.257 [I] debug_step=4 prompt_token_lengths=[78] (22938:train_pytorch.py:838)
108
+ 20:12:08.258 [I] debug_step=4 state_stats min=-0.7017 max=1.0004 mean=0.0553 std=0.3507 (22938:train_pytorch.py:839)
109
+ 20:12:08.258 [I] debug_step=4 action_stats min=-1.0014 max=1.0004 mean=-0.0683 std=0.4561 (22938:train_pytorch.py:842)
110
+ 20:12:08.259 [I] debug_step=4 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
111
+ 20:12:08.259 [I] debug_step=4 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
112
+ 20:12:08.260 [I] debug_step=4 lr=3.98e-07 grad_norm=8.7944 data_time=0.1312s step_time=0.4359s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
113
+ 20:12:08.260 [I] debug_step=4 grad_shared_backbone=7.5903 grad_left_action_in=0.1438 grad_right_action_in=0.1015 grad_left_expert=2.4058 grad_right_expert=1.2982 grad_action_out=3.3839 grad_cross_arm_comm=0.0147 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_2=0.0133 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0148 cross_arm_attention_mass_layer_5=0.0199 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0102 cross_arm_attention_mass_layer_9=0.0086 cross_arm_attention_mass_layer_10=0.0065 cross_arm_attention_mass_layer_11=0.0099 cross_arm_attention_mass_layer_12=0.0010 cross_arm_attention_mass_layer_13=0.0040 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0227 cross_arm_attention_mass_layer_16=0.0351 cross_arm_attention_mass_layer_17=0.0406 (22938:train_pytorch.py:862)
114
+ 20:12:08.261 [I] step=4 loss=1.1860 smoothed_loss=3.4656 lr=3.98e-07 grad_norm=8.7944 step_time=0.4359s data_time=0.1312s it/s=1.768 eta_to_20=9.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0065 cross_arm_attention_mass_layer_11=0.0099 cross_arm_attention_mass_layer_12=0.0010 cross_arm_attention_mass_layer_13=0.0040 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0227 cross_arm_attention_mass_layer_16=0.0351 cross_arm_attention_mass_layer_17=0.0406 cross_arm_attention_mass_layer_2=0.0133 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0148 cross_arm_attention_mass_layer_5=0.0199 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0102 cross_arm_attention_mass_layer_9=0.0086 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.3839 grad_cross_arm_comm=0.0147 grad_left_action_in=0.1438 grad_left_expert=2.4058 grad_right_action_in=0.1015 grad_right_expert=1.2982 grad_shared_backbone=7.5903 (22938:train_pytorch.py:882)
115
+
116
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
117
+ 20:12:08.933 [I] debug_step=5 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22938:train_pytorch.py:831)
118
+ 20:12:08.934 [I] debug_step=5 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22938:train_pytorch.py:835)
119
+ 20:12:08.934 [I] debug_step=5 prompt_token_lengths=[73] (22938:train_pytorch.py:838)
120
+ 20:12:08.935 [I] debug_step=5 state_stats min=-0.9599 max=1.0004 mean=0.0170 std=0.5364 (22938:train_pytorch.py:839)
121
+ 20:12:08.935 [I] debug_step=5 action_stats min=-1.0392 max=1.0004 mean=-0.0159 std=0.4488 (22938:train_pytorch.py:842)
122
+ 20:12:08.935 [I] debug_step=5 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22938:train_pytorch.py:845)
123
+ 20:12:08.936 [I] debug_step=5 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22938:train_pytorch.py:849)
124
+ 20:12:08.936 [I] debug_step=5 lr=4.98e-07 grad_norm=20.1429 data_time=0.2048s step_time=0.4721s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.30GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.30GB (22938:train_pytorch.py:854)
125
+ 20:12:08.937 [I] debug_step=5 grad_shared_backbone=16.7899 grad_left_action_in=0.2534 grad_right_action_in=0.3335 grad_left_expert=7.9047 grad_right_expert=3.6853 grad_action_out=6.0934 grad_cross_arm_comm=0.0735 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0020 cross_arm_attention_mass_layer_2=0.0178 cross_arm_attention_mass_layer_3=0.0039 cross_arm_attention_mass_layer_4=0.0203 cross_arm_attention_mass_layer_5=0.0294 cross_arm_attention_mass_layer_6=0.0106 cross_arm_attention_mass_layer_7=0.0286 cross_arm_attention_mass_layer_8=0.0175 cross_arm_attention_mass_layer_9=0.0157 cross_arm_attention_mass_layer_10=0.0148 cross_arm_attention_mass_layer_11=0.0181 cross_arm_attention_mass_layer_12=0.0023 cross_arm_attention_mass_layer_13=0.0128 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0232 cross_arm_attention_mass_layer_16=0.0437 cross_arm_attention_mass_layer_17=0.0451 (22938:train_pytorch.py:862)
126
+ 20:12:08.937 [I] step=5 loss=1.8898 smoothed_loss=3.3081 lr=4.98e-07 grad_norm=20.1429 step_time=0.4721s data_time=0.2048s it/s=1.481 eta_to_20=10.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0020 cross_arm_attention_mass_layer_10=0.0148 cross_arm_attention_mass_layer_11=0.0181 cross_arm_attention_mass_layer_12=0.0023 cross_arm_attention_mass_layer_13=0.0128 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0232 cross_arm_attention_mass_layer_16=0.0437 cross_arm_attention_mass_layer_17=0.0451 cross_arm_attention_mass_layer_2=0.0178 cross_arm_attention_mass_layer_3=0.0039 cross_arm_attention_mass_layer_4=0.0203 cross_arm_attention_mass_layer_5=0.0294 cross_arm_attention_mass_layer_6=0.0106 cross_arm_attention_mass_layer_7=0.0286 cross_arm_attention_mass_layer_8=0.0175 cross_arm_attention_mass_layer_9=0.0157 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=6.0934 grad_cross_arm_comm=0.0735 grad_left_action_in=0.2534 grad_left_expert=7.9047 grad_right_action_in=0.3335 grad_right_expert=3.6853 grad_shared_backbone=16.7899 (22938:train_pytorch.py:882)
127
+
128
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
129
+ 20:12:09.727 [I] step=6 loss=2.2855 smoothed_loss=3.2058 lr=5.98e-07 grad_norm=22.2605 step_time=0.5043s data_time=0.2901s it/s=1.267 eta_to_20=11.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0030 cross_arm_attention_mass_layer_10=0.0179 cross_arm_attention_mass_layer_11=0.0219 cross_arm_attention_mass_layer_12=0.0017 cross_arm_attention_mass_layer_13=0.0164 cross_arm_attention_mass_layer_14=0.0065 cross_arm_attention_mass_layer_15=0.0300 cross_arm_attention_mass_layer_16=0.0448 cross_arm_attention_mass_layer_17=0.0482 cross_arm_attention_mass_layer_2=0.0201 cross_arm_attention_mass_layer_3=0.0064 cross_arm_attention_mass_layer_4=0.0234 cross_arm_attention_mass_layer_5=0.0308 cross_arm_attention_mass_layer_6=0.0131 cross_arm_attention_mass_layer_7=0.0312 cross_arm_attention_mass_layer_8=0.0206 cross_arm_attention_mass_layer_9=0.0180 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=7.8420 grad_cross_arm_comm=0.1508 grad_left_action_in=0.2907 grad_left_expert=7.9865 grad_right_action_in=0.5407 grad_right_expert=5.3887 grad_shared_backbone=18.0209 (22938:train_pytorch.py:882)
130
+
131
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
132
+ 20:12:10.423 [I] step=7 loss=1.0335 smoothed_loss=2.9886 lr=6.97e-07 grad_norm=8.7208 step_time=0.4962s data_time=0.1999s it/s=1.439 eta_to_20=9.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0014 cross_arm_attention_mass_layer_10=0.0066 cross_arm_attention_mass_layer_11=0.0060 cross_arm_attention_mass_layer_12=0.0024 cross_arm_attention_mass_layer_13=0.0015 cross_arm_attention_mass_layer_14=0.0062 cross_arm_attention_mass_layer_15=0.0146 cross_arm_attention_mass_layer_16=0.0319 cross_arm_attention_mass_layer_17=0.0417 cross_arm_attention_mass_layer_2=0.0105 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0130 cross_arm_attention_mass_layer_5=0.0188 cross_arm_attention_mass_layer_6=0.0045 cross_arm_attention_mass_layer_7=0.0127 cross_arm_attention_mass_layer_8=0.0097 cross_arm_attention_mass_layer_9=0.0097 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.0753 grad_cross_arm_comm=0.0098 grad_left_action_in=0.1514 grad_left_expert=2.5886 grad_right_action_in=0.0879 grad_right_expert=1.9729 grad_shared_backbone=6.8576 (22938:train_pytorch.py:882)
133
+
134
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
135
+ 20:12:11.020 [I] step=8 loss=2.0034 smoothed_loss=2.8901 lr=7.97e-07 grad_norm=15.7969 step_time=0.4407s data_time=0.1564s it/s=1.677 eta_to_20=7.2s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0027 cross_arm_attention_mass_layer_10=0.0129 cross_arm_attention_mass_layer_11=0.0269 cross_arm_attention_mass_layer_12=0.0032 cross_arm_attention_mass_layer_13=0.0177 cross_arm_attention_mass_layer_14=0.0074 cross_arm_attention_mass_layer_15=0.0309 cross_arm_attention_mass_layer_16=0.0446 cross_arm_attention_mass_layer_17=0.0503 cross_arm_attention_mass_layer_2=0.0196 cross_arm_attention_mass_layer_3=0.0046 cross_arm_attention_mass_layer_4=0.0227 cross_arm_attention_mass_layer_5=0.0319 cross_arm_attention_mass_layer_6=0.0114 cross_arm_attention_mass_layer_7=0.0298 cross_arm_attention_mass_layer_8=0.0194 cross_arm_attention_mass_layer_9=0.0117 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=6.6005 grad_cross_arm_comm=0.1531 grad_left_action_in=0.1726 grad_left_expert=4.6426 grad_right_action_in=0.4530 grad_right_expert=3.8705 grad_shared_backbone=12.4324 (22938:train_pytorch.py:882)
136
+
137
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
138
+ 20:12:11.571 [I] step=9 loss=0.4132 smoothed_loss=2.6424 lr=8.96e-07 grad_norm=3.3497 step_time=0.4161s data_time=0.1347s it/s=1.820 eta_to_20=6.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0008 cross_arm_attention_mass_layer_10=0.0014 cross_arm_attention_mass_layer_11=0.0006 cross_arm_attention_mass_layer_12=0.0028 cross_arm_attention_mass_layer_13=0.0018 cross_arm_attention_mass_layer_14=0.0059 cross_arm_attention_mass_layer_15=0.0078 cross_arm_attention_mass_layer_16=0.0337 cross_arm_attention_mass_layer_17=0.0442 cross_arm_attention_mass_layer_2=0.0015 cross_arm_attention_mass_layer_3=0.0012 cross_arm_attention_mass_layer_4=0.0019 cross_arm_attention_mass_layer_5=0.0036 cross_arm_attention_mass_layer_6=0.0013 cross_arm_attention_mass_layer_7=0.0022 cross_arm_attention_mass_layer_8=0.0006 cross_arm_attention_mass_layer_9=0.0052 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=1.7915 grad_cross_arm_comm=0.0012 grad_left_action_in=0.0692 grad_left_expert=1.0033 grad_right_action_in=0.0554 grad_right_expert=0.7293 grad_shared_backbone=2.5249 (22938:train_pytorch.py:882)
139
+
140
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
141
+ 20:12:12.422 [I] step=10 loss=0.6162 smoothed_loss=2.4397 lr=9.96e-07 grad_norm=5.5674 step_time=0.6599s data_time=0.1905s it/s=1.178 eta_to_20=8.5s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0014 cross_arm_attention_mass_layer_10=0.0024 cross_arm_attention_mass_layer_11=0.0047 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0018 cross_arm_attention_mass_layer_14=0.0062 cross_arm_attention_mass_layer_15=0.0094 cross_arm_attention_mass_layer_16=0.0283 cross_arm_attention_mass_layer_17=0.0357 cross_arm_attention_mass_layer_2=0.0074 cross_arm_attention_mass_layer_3=0.0016 cross_arm_attention_mass_layer_4=0.0081 cross_arm_attention_mass_layer_5=0.0156 cross_arm_attention_mass_layer_6=0.0028 cross_arm_attention_mass_layer_7=0.0050 cross_arm_attention_mass_layer_8=0.0040 cross_arm_attention_mass_layer_9=0.0045 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=-0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.2079 grad_cross_arm_comm=0.0071 grad_left_action_in=0.0841 grad_left_expert=1.2018 grad_right_action_in=0.0868 grad_right_expert=1.2814 grad_shared_backbone=4.7763 (22938:train_pytorch.py:882)
142
+
143
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
144
+ 20:12:12.957 [I] step=11 loss=0.9030 smoothed_loss=2.2861 lr=1.10e-06 grad_norm=7.2282 step_time=0.4104s data_time=0.1251s it/s=1.872 eta_to_20=4.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0064 cross_arm_attention_mass_layer_11=0.0098 cross_arm_attention_mass_layer_12=0.0013 cross_arm_attention_mass_layer_13=0.0031 cross_arm_attention_mass_layer_14=0.0072 cross_arm_attention_mass_layer_15=0.0208 cross_arm_attention_mass_layer_16=0.0355 cross_arm_attention_mass_layer_17=0.0421 cross_arm_attention_mass_layer_2=0.0136 cross_arm_attention_mass_layer_3=0.0023 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0219 cross_arm_attention_mass_layer_6=0.0054 cross_arm_attention_mass_layer_7=0.0144 cross_arm_attention_mass_layer_8=0.0131 cross_arm_attention_mass_layer_9=0.0082 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=-0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.3357 grad_cross_arm_comm=0.0099 grad_left_action_in=0.1355 grad_left_expert=2.0379 grad_right_action_in=0.0836 grad_right_expert=1.1722 grad_shared_backbone=5.8293 (22938:train_pytorch.py:882)
145
+
146
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
147
+ 20:12:13.628 [I] step=12 loss=0.7531 smoothed_loss=2.1328 lr=1.20e-06 grad_norm=6.0473 step_time=0.4968s data_time=0.1739s it/s=1.493 eta_to_20=5.4s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0012 cross_arm_attention_mass_layer_10=0.0078 cross_arm_attention_mass_layer_11=0.0121 cross_arm_attention_mass_layer_12=0.0032 cross_arm_attention_mass_layer_13=0.0032 cross_arm_attention_mass_layer_14=0.0048 cross_arm_attention_mass_layer_15=0.0136 cross_arm_attention_mass_layer_16=0.0331 cross_arm_attention_mass_layer_17=0.0404 cross_arm_attention_mass_layer_2=0.0127 cross_arm_attention_mass_layer_3=0.0020 cross_arm_attention_mass_layer_4=0.0138 cross_arm_attention_mass_layer_5=0.0221 cross_arm_attention_mass_layer_6=0.0055 cross_arm_attention_mass_layer_7=0.0174 cross_arm_attention_mass_layer_8=0.0100 cross_arm_attention_mass_layer_9=0.0094 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.8673 grad_cross_arm_comm=0.0090 grad_left_action_in=0.1128 grad_left_expert=1.8561 grad_right_action_in=0.0739 grad_right_expert=1.0243 grad_shared_backbone=4.8443 (22938:train_pytorch.py:882)
148
+
149
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
150
+ 20:12:14.427 [I] step=13 loss=3.7746 smoothed_loss=2.2970 lr=1.29e-06 grad_norm=206.8044 step_time=0.5601s data_time=0.2394s it/s=1.252 eta_to_20=5.6s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0003 cross_arm_attention_mass_layer_1=0.0128 cross_arm_attention_mass_layer_10=0.0240 cross_arm_attention_mass_layer_11=0.0241 cross_arm_attention_mass_layer_12=0.0213 cross_arm_attention_mass_layer_13=0.0213 cross_arm_attention_mass_layer_14=0.0164 cross_arm_attention_mass_layer_15=0.0265 cross_arm_attention_mass_layer_16=0.0367 cross_arm_attention_mass_layer_17=0.0289 cross_arm_attention_mass_layer_2=0.0282 cross_arm_attention_mass_layer_3=0.0184 cross_arm_attention_mass_layer_4=0.0365 cross_arm_attention_mass_layer_5=0.0441 cross_arm_attention_mass_layer_6=0.0238 cross_arm_attention_mass_layer_7=0.0371 cross_arm_attention_mass_layer_8=0.0137 cross_arm_attention_mass_layer_9=0.0293 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=15.4957 grad_cross_arm_comm=2.1022 grad_left_action_in=2.3745 grad_left_expert=37.1536 grad_right_action_in=5.2568 grad_right_expert=138.8291 grad_shared_backbone=127.7336 (22938:train_pytorch.py:882)
151
+
152
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
153
+ 20:12:15.255 [I] step=14 loss=1.2933 smoothed_loss=2.1966 lr=1.39e-06 grad_norm=7.9182 step_time=0.5738s data_time=0.2541s it/s=1.210 eta_to_20=5.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0079 cross_arm_attention_mass_layer_11=0.0120 cross_arm_attention_mass_layer_12=0.0016 cross_arm_attention_mass_layer_13=0.0036 cross_arm_attention_mass_layer_14=0.0047 cross_arm_attention_mass_layer_15=0.0131 cross_arm_attention_mass_layer_16=0.0244 cross_arm_attention_mass_layer_17=0.0419 cross_arm_attention_mass_layer_2=0.0129 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0233 cross_arm_attention_mass_layer_6=0.0067 cross_arm_attention_mass_layer_7=0.0161 cross_arm_attention_mass_layer_8=0.0092 cross_arm_attention_mass_layer_9=0.0097 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.2052 grad_cross_arm_comm=0.0107 grad_left_action_in=0.1570 grad_left_expert=2.3411 grad_right_action_in=0.1025 grad_right_expert=1.1691 grad_shared_backbone=6.0836 (22938:train_pytorch.py:882)
154
+
155
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
156
+ 20:12:16.034 [I] step=15 loss=3.1068 smoothed_loss=2.2876 lr=1.49e-06 grad_norm=24.4182 step_time=0.5474s data_time=0.2314s it/s=1.286 eta_to_20=3.9s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0001 cross_arm_attention_mass_layer_1=0.0033 cross_arm_attention_mass_layer_10=0.0154 cross_arm_attention_mass_layer_11=0.0284 cross_arm_attention_mass_layer_12=0.0046 cross_arm_attention_mass_layer_13=0.0187 cross_arm_attention_mass_layer_14=0.0121 cross_arm_attention_mass_layer_15=0.0370 cross_arm_attention_mass_layer_16=0.0460 cross_arm_attention_mass_layer_17=0.0516 cross_arm_attention_mass_layer_2=0.0206 cross_arm_attention_mass_layer_3=0.0064 cross_arm_attention_mass_layer_4=0.0239 cross_arm_attention_mass_layer_5=0.0299 cross_arm_attention_mass_layer_6=0.0143 cross_arm_attention_mass_layer_7=0.0349 cross_arm_attention_mass_layer_8=0.0213 cross_arm_attention_mass_layer_9=0.0171 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=9.3484 grad_cross_arm_comm=0.3843 grad_left_action_in=0.3015 grad_left_expert=7.0086 grad_right_action_in=0.6660 grad_right_expert=6.4185 grad_shared_backbone=18.8039 (22938:train_pytorch.py:882)
157
+
158
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
159
+ 20:12:16.810 [I] step=16 loss=0.8710 smoothed_loss=2.1460 lr=1.59e-06 grad_norm=7.5162 step_time=0.5638s data_time=0.2117s it/s=1.292 eta_to_20=3.1s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0016 cross_arm_attention_mass_layer_10=0.0051 cross_arm_attention_mass_layer_11=0.0114 cross_arm_attention_mass_layer_12=0.0017 cross_arm_attention_mass_layer_13=0.0062 cross_arm_attention_mass_layer_14=0.0073 cross_arm_attention_mass_layer_15=0.0221 cross_arm_attention_mass_layer_16=0.0370 cross_arm_attention_mass_layer_17=0.0436 cross_arm_attention_mass_layer_2=0.0138 cross_arm_attention_mass_layer_3=0.0022 cross_arm_attention_mass_layer_4=0.0152 cross_arm_attention_mass_layer_5=0.0195 cross_arm_attention_mass_layer_6=0.0056 cross_arm_attention_mass_layer_7=0.0154 cross_arm_attention_mass_layer_8=0.0132 cross_arm_attention_mass_layer_9=0.0103 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.7344 grad_cross_arm_comm=0.0228 grad_left_action_in=0.1118 grad_left_expert=2.2761 grad_right_action_in=0.1234 grad_right_expert=1.1808 grad_shared_backbone=6.4124 (22938:train_pytorch.py:882)
160
+
161
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
162
+ 20:12:17.396 [I] step=17 loss=1.7002 smoothed_loss=2.1014 lr=1.69e-06 grad_norm=14.0785 step_time=0.4252s data_time=0.1614s it/s=1.708 eta_to_20=1.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0021 cross_arm_attention_mass_layer_10=0.0103 cross_arm_attention_mass_layer_11=0.0197 cross_arm_attention_mass_layer_12=0.0029 cross_arm_attention_mass_layer_13=0.0103 cross_arm_attention_mass_layer_14=0.0067 cross_arm_attention_mass_layer_15=0.0163 cross_arm_attention_mass_layer_16=0.0436 cross_arm_attention_mass_layer_17=0.0446 cross_arm_attention_mass_layer_2=0.0162 cross_arm_attention_mass_layer_3=0.0030 cross_arm_attention_mass_layer_4=0.0192 cross_arm_attention_mass_layer_5=0.0268 cross_arm_attention_mass_layer_6=0.0092 cross_arm_attention_mass_layer_7=0.0242 cross_arm_attention_mass_layer_8=0.0146 cross_arm_attention_mass_layer_9=0.0095 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=4.2605 grad_cross_arm_comm=0.0625 grad_left_action_in=0.1989 grad_left_expert=4.9518 grad_right_action_in=0.2156 grad_right_expert=2.1764 grad_shared_backbone=12.0796 (22938:train_pytorch.py:882)
163
+
164
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
165
+ 20:12:18.392 [I] step=18 loss=0.4844 smoothed_loss=1.9397 lr=1.79e-06 grad_norm=3.3459 step_time=0.6297s data_time=0.3660s it/s=1.005 eta_to_20=2.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0008 cross_arm_attention_mass_layer_10=0.0016 cross_arm_attention_mass_layer_11=0.0014 cross_arm_attention_mass_layer_12=0.0034 cross_arm_attention_mass_layer_13=0.0007 cross_arm_attention_mass_layer_14=0.0054 cross_arm_attention_mass_layer_15=0.0063 cross_arm_attention_mass_layer_16=0.0319 cross_arm_attention_mass_layer_17=0.0418 cross_arm_attention_mass_layer_2=0.0027 cross_arm_attention_mass_layer_3=0.0013 cross_arm_attention_mass_layer_4=0.0035 cross_arm_attention_mass_layer_5=0.0058 cross_arm_attention_mass_layer_6=0.0015 cross_arm_attention_mass_layer_7=0.0028 cross_arm_attention_mass_layer_8=0.0019 cross_arm_attention_mass_layer_9=0.0049 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=1.9561 grad_cross_arm_comm=0.0017 grad_left_action_in=0.0746 grad_left_expert=1.1140 grad_right_action_in=0.0388 grad_right_expert=0.5290 grad_shared_backbone=2.3985 (22938:train_pytorch.py:882)
166
+
167
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
168
+ 20:12:19.239 [I] step=19 loss=0.7633 smoothed_loss=1.8220 lr=1.89e-06 grad_norm=7.1468 step_time=0.5757s data_time=0.2714s it/s=1.182 eta_to_20=0.8s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0015 cross_arm_attention_mass_layer_10=0.0069 cross_arm_attention_mass_layer_11=0.0093 cross_arm_attention_mass_layer_12=0.0016 cross_arm_attention_mass_layer_13=0.0034 cross_arm_attention_mass_layer_14=0.0046 cross_arm_attention_mass_layer_15=0.0166 cross_arm_attention_mass_layer_16=0.0297 cross_arm_attention_mass_layer_17=0.0418 cross_arm_attention_mass_layer_2=0.0130 cross_arm_attention_mass_layer_3=0.0026 cross_arm_attention_mass_layer_4=0.0156 cross_arm_attention_mass_layer_5=0.0208 cross_arm_attention_mass_layer_6=0.0062 cross_arm_attention_mass_layer_7=0.0164 cross_arm_attention_mass_layer_8=0.0124 cross_arm_attention_mass_layer_9=0.0115 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=3.7548 grad_cross_arm_comm=0.0125 grad_left_action_in=0.1160 grad_left_expert=2.3520 grad_right_action_in=0.0799 grad_right_expert=1.3128 grad_shared_backbone=5.3838 (22938:train_pytorch.py:882)
169
+
170
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
171
+ 20:12:19.905 [I] step=20 loss=0.5943 smoothed_loss=1.6993 lr=1.99e-06 grad_norm=6.2792 step_time=0.4954s data_time=0.1707s it/s=1.504 eta_to_20=0.0s max_cuda_memory=76.13GB cross_arm_attention_mass_layer_0=0.0000 cross_arm_attention_mass_layer_1=0.0012 cross_arm_attention_mass_layer_10=0.0050 cross_arm_attention_mass_layer_11=0.0026 cross_arm_attention_mass_layer_12=0.0021 cross_arm_attention_mass_layer_13=0.0009 cross_arm_attention_mass_layer_14=0.0060 cross_arm_attention_mass_layer_15=0.0119 cross_arm_attention_mass_layer_16=0.0308 cross_arm_attention_mass_layer_17=0.0412 cross_arm_attention_mass_layer_2=0.0054 cross_arm_attention_mass_layer_3=0.0020 cross_arm_attention_mass_layer_4=0.0084 cross_arm_attention_mass_layer_5=0.0116 cross_arm_attention_mass_layer_6=0.0029 cross_arm_attention_mass_layer_7=0.0066 cross_arm_attention_mass_layer_8=0.0051 cross_arm_attention_mass_layer_9=0.0094 cross_arm_comm_gate_layer_0=-0.0000 cross_arm_comm_gate_layer_1=-0.0000 cross_arm_comm_gate_layer_10=-0.0000 cross_arm_comm_gate_layer_11=-0.0000 cross_arm_comm_gate_layer_12=0.0000 cross_arm_comm_gate_layer_13=0.0000 cross_arm_comm_gate_layer_14=0.0000 cross_arm_comm_gate_layer_15=0.0000 cross_arm_comm_gate_layer_16=0.0000 cross_arm_comm_gate_layer_17=-0.0000 cross_arm_comm_gate_layer_2=-0.0000 cross_arm_comm_gate_layer_3=-0.0000 cross_arm_comm_gate_layer_4=-0.0000 cross_arm_comm_gate_layer_5=-0.0000 cross_arm_comm_gate_layer_6=-0.0000 cross_arm_comm_gate_layer_7=-0.0000 cross_arm_comm_gate_layer_8=-0.0000 cross_arm_comm_gate_layer_9=-0.0000 grad_action_out=2.7963 grad_cross_arm_comm=0.0044 grad_left_action_in=0.0861 grad_left_expert=1.9694 grad_right_action_in=0.0578 grad_right_expert=1.3971 grad_shared_backbone=5.0478 (22938:train_pytorch.py:882)
172
+ 20:19:41.020 [I] Saved checkpoint at step 20 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k/split_communicating_real_train20/20 (22938:train_pytorch.py:378)
173
+
openpi/run_logs/split_independent_real_smoke3_r2.log ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19:45:11.253 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2 (20567:train_pytorch.py:533)
2
+ 19:45:11.254 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (20567:train_pytorch.py:552)
3
+ 19:45:11.330 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (20567:config.py:234)
4
+ 19:45:11.331 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x79458ad85b50>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (20567:data_loader.py:284)
29
+ 19:45:16.791 [I] JAX version 0.5.3 available. (20567:config.py:125)
30
+ 19:45:40.542 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (20567:data_loader.py:148)
31
+ 19:45:40.654 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (20567:video_utils.py:36)
32
+ 19:46:47.372 [I] local_batch_size: 1 (20567:data_loader.py:365)
33
+ 19:50:09.799 [I] Enabled gradient checkpointing for PI0Pytorch model (20567:pi0_pytorch.py:138)
34
+ 19:50:09.802 [I] Enabled gradient checkpointing for memory optimization (20567:train_pytorch.py:624)
35
+ 19:50:09.803 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (20567:train_pytorch.py:493)
36
+ 19:50:09.804 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:653)
37
+ 19:50:13.559 [I] Weight loading missing key count: 0 (20567:train_pytorch.py:657)
38
+ 19:50:13.560 [I] Weight loading missing keys: set() (20567:train_pytorch.py:658)
39
+ 19:50:13.560 [I] Weight loading unexpected key count: 0 (20567:train_pytorch.py:659)
40
+ 19:50:13.560 [I] Weight loading unexpected keys: [] (20567:train_pytorch.py:660)
41
+ 19:50:13.560 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:661)
42
+ 19:50:13.565 [I] Running on: 963c158043aa | world_size=1 (20567:train_pytorch.py:701)
43
+ 19:50:13.565 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=3 (20567:train_pytorch.py:702)
44
+ 19:50:13.565 [I] Memory optimizations: gradient_checkpointing=True (20567:train_pytorch.py:705)
45
+ 19:50:13.566 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (20567:train_pytorch.py:706)
46
+ 19:50:13.566 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (20567:train_pytorch.py:707)
47
+ 19:50:13.567 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (20567:train_pytorch.py:710)
48
+ 19:50:13.567 [I] EMA is not supported for PyTorch training (20567:train_pytorch.py:713)
49
+ 19:50:13.567 [I] Training precision: float32 (20567:train_pytorch.py:714)
50
+ 19:50:13.576 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (20567:train_pytorch.py:308)
51
+ 19:50:13.576 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (20567:train_pytorch.py:309)
52
+ 19:50:13.577 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (20567:train_pytorch.py:310)
53
+ 19:50:13.577 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (20567:train_pytorch.py:311)
54
+ 19:50:13.578 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (20567:train_pytorch.py:312)
55
+ 19:50:13.578 [I] Model type: split_independent (20567:train_pytorch.py:313)
56
+ 19:50:13.578 [I] Packed transforms active: True (20567:train_pytorch.py:314)
57
+ 19:50:13.579 [I] World size: 1 (20567:train_pytorch.py:315)
58
+ 19:50:13.579 [I] Batch size: local=1, global=1 (20567:train_pytorch.py:316)
59
+ 19:50:13.580 [I] num_workers: 0 (20567:train_pytorch.py:317)
60
+ 19:50:13.580 [I] Precision: float32 (20567:train_pytorch.py:318)
61
+ 19:50:13.580 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (20567:train_pytorch.py:319)
62
+ 19:50:13.581 [I] Save/log intervals: save_interval=3, log_interval=1 (20567:train_pytorch.py:326)
63
+ 19:50:13.581 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (20567:train_pytorch.py:327)
64
+ 19:50:13.581 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (20567:train_pytorch.py:328)
65
+ 19:50:13.582 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (20567:train_pytorch.py:329)
66
+ 19:50:13.582 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (20567:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 19:50:15.125 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
70
+ 19:50:15.126 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
71
+ 19:50:15.126 [I] debug_step=1 prompt_token_lengths=[75] (20567:train_pytorch.py:838)
72
+ 19:50:15.127 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (20567:train_pytorch.py:839)
73
+ 19:50:15.127 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (20567:train_pytorch.py:842)
74
+ 19:50:15.128 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
75
+ 19:50:15.143 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
76
+ 19:50:15.143 [I] debug_step=1 lr=9.96e-08 grad_norm=31.4779 data_time=0.2101s step_time=1.2943s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (20567:train_pytorch.py:854)
77
+ 19:50:15.144 [I] debug_step=1 grad_shared_backbone=25.5606 grad_left_action_in=0.2318 grad_right_action_in=0.9885 grad_left_expert=5.5978 grad_right_expert=12.3518 grad_action_out=9.6154 (20567:train_pytorch.py:862)
78
+ 19:50:15.144 [I] step=1 loss=2.6238 smoothed_loss=2.6238 lr=9.96e-08 grad_norm=31.4779 step_time=1.2943s data_time=0.2101s it/s=0.633 eta_to_3=3.2s max_cuda_memory=76.13GB grad_action_out=9.6154 grad_left_action_in=0.2318 grad_left_expert=5.5978 grad_right_action_in=0.9885 grad_right_expert=12.3518 grad_shared_backbone=25.5606 (20567:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 19:50:16.012 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
82
+ 19:50:16.013 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
83
+ 19:50:16.013 [I] debug_step=2 prompt_token_lengths=[76] (20567:train_pytorch.py:838)
84
+ 19:50:16.014 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (20567:train_pytorch.py:839)
85
+ 19:50:16.015 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (20567:train_pytorch.py:842)
86
+ 19:50:16.016 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
87
+ 19:50:16.016 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
88
+ 19:50:16.017 [I] debug_step=2 lr=1.99e-07 grad_norm=12.2770 data_time=0.2123s step_time=0.6695s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (20567:train_pytorch.py:854)
89
+ 19:50:16.017 [I] debug_step=2 grad_shared_backbone=10.3527 grad_left_action_in=0.1586 grad_right_action_in=0.1584 grad_left_expert=2.8415 grad_right_expert=4.0156 grad_action_out=4.1478 (20567:train_pytorch.py:862)
90
+ 19:50:16.018 [I] step=2 loss=1.1717 smoothed_loss=2.4786 lr=1.99e-07 grad_norm=12.2770 step_time=0.6695s data_time=0.2123s it/s=1.146 eta_to_3=0.9s max_cuda_memory=76.13GB grad_action_out=4.1478 grad_left_action_in=0.1586 grad_left_expert=2.8415 grad_right_action_in=0.1584 grad_right_expert=4.0156 grad_shared_backbone=10.3527 (20567:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 19:50:16.906 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (20567:train_pytorch.py:831)
94
+ 19:50:16.907 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (20567:train_pytorch.py:835)
95
+ 19:50:16.908 [I] debug_step=3 prompt_token_lengths=[75] (20567:train_pytorch.py:838)
96
+ 19:50:16.908 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (20567:train_pytorch.py:839)
97
+ 19:50:16.908 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (20567:train_pytorch.py:842)
98
+ 19:50:16.909 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (20567:train_pytorch.py:845)
99
+ 19:50:16.910 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (20567:train_pytorch.py:849)
100
+ 19:50:16.910 [I] debug_step=3 lr=2.99e-07 grad_norm=15.1079 data_time=0.2612s step_time=0.6330s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (20567:train_pytorch.py:854)
101
+ 19:50:16.911 [I] debug_step=3 grad_shared_backbone=8.6850 grad_left_action_in=0.2570 grad_right_action_in=0.3869 grad_left_expert=4.4422 grad_right_expert=10.5777 grad_action_out=3.5502 (20567:train_pytorch.py:862)
102
+ 19:50:16.911 [I] step=3 loss=0.9128 smoothed_loss=2.3220 lr=2.99e-07 grad_norm=15.1079 step_time=0.6330s data_time=0.2612s it/s=1.120 eta_to_3=0.0s max_cuda_memory=76.13GB grad_action_out=3.5502 grad_left_action_in=0.2570 grad_left_expert=4.4422 grad_right_action_in=0.3869 grad_right_expert=10.5777 grad_shared_backbone=8.6850 (20567:train_pytorch.py:882)
103
+ 19:53:54.052 [I] Saved checkpoint at step 3 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_smoke3_r2/3 (20567:train_pytorch.py:378)
104
+
openpi/run_logs/split_independent_real_train20.log ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 20:03:03.080 [I] Created experiment checkpoint directory: /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20 (22934:train_pytorch.py:533)
2
+ 20:03:03.082 [I] Using batch size per GPU: 1 (total batch size across 1 GPUs: 1) (22934:train_pytorch.py:552)
3
+ 20:03:03.183 [I] Loaded norm stats from /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train (22934:config.py:234)
4
+ 20:03:03.185 [I] data_config: DataConfig(repo_id='lsnu/twin_dual_push_128_train', asset_id='lsnu/twin_dual_push_128_train', norm_stats={'state': NormStats(mean=array([ 0.10604009, 0.20956482, 0.09184283, -1.98801565, -0.04930164,
5
+ 2.20065784, 1.07595289, 0.52742052, 0.01585805, 0.08288047,
6
+ -0.06887393, -1.906394 , 0.04810138, 2.01086807, -0.92902797,
7
+ 0.8440811 ]), std=array([0.09207697, 0.31317395, 0.08127229, 0.53812712, 0.06093267,
8
+ 0.51205784, 0.22527155, 0.49924755, 0.20230208, 0.31408131,
9
+ 0.21665592, 0.5264315 , 0.20170984, 0.4745712 , 1.17861438,
10
+ 0.36277843]), q01=array([-5.00321221e-06, -3.88026012e-01, -2.23782954e-05, -2.98962682e+00,
11
+ -2.38592355e-01, 1.22146201e+00, 7.85383821e-01, 0.00000000e+00,
12
+ -6.15615927e-01, -4.14941930e-01, -9.43696350e-01, -2.88397729e+00,
13
+ -9.05083556e-01, 1.22148895e+00, -2.79564499e+00, 0.00000000e+00]), q99=array([ 0.31251293, 0.86546916, 0.35174239, -0.87634897, 0.05212194,
14
+ 2.97208117, 1.64465171, 0.9998 , 0.7670313 , 0.96073459,
15
+ 0.68710467, -0.87498123, 0.35838486, 2.9773227 , 0.78477909,
16
+ 0.9998 ])), 'actions': NormStats(mean=array([ 0.03630241, 0.09624442, 0.01367408, -0.2224988 , -0.02762174,
17
+ 0.27498844, 0.0892187 , 0.45650524, -0.00378086, 0.09113847,
18
+ -0.00376227, -0.22537093, 0.00826233, 0.26799494, -0.57452869,
19
+ 0.7731654 ]), std=array([0.04995174, 0.29268014, 0.06852161, 0.3647725 , 0.07012808,
20
+ 0.27129024, 0.11329207, 0.4981046 , 0.0917461 , 0.22704004,
21
+ 0.1069391 , 0.2572591 , 0.11801817, 0.1235588 , 0.35835782,
22
+ 0.41878474]), q01=array([-5.86206436e-04, -3.88117499e-01, -2.55800724e-01, -8.34769463e-01,
23
+ -3.51454727e-01, -1.54787922e-03, -5.81741333e-04, 0.00000000e+00,
24
+ -2.64436970e-01, -3.51582764e-01, -3.69693995e-01, -7.30919549e-01,
25
+ -3.35441585e-01, -6.62303925e-04, -9.34731126e-01, 0.00000000e+00]), q99=array([0.20790743, 0.81198567, 0.19612836, 0.33958174, 0.05568643,
26
+ 0.75265345, 0.425256 , 0.9998 , 0.2558236 , 0.58901345,
27
+ 0.35822071, 0.18567593, 0.44035054, 0.49966629, 0.12655233,
28
+ 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=<openpi.models.tokenizer.PaligemmaTokenizer object at 0x721cdf0dd610>, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (22934:data_loader.py:393)
29
+ 20:03:13.494 [I] JAX version 0.5.3 available. (22934:config.py:125)
30
+ 20:04:17.801 [I] Using existing local LeRobot dataset mirror for lsnu/twin_dual_push_128_train: /workspace/lerobot/lsnu/twin_dual_push_128_train (22934:data_loader.py:148)
31
+ 20:04:17.904 [W] 'torchcodec' is not available in your platform, falling back to 'pyav' as a default decoder (22934:video_utils.py:36)
32
+ 20:09:04.645 [I] local_batch_size: 1 (22934:data_loader.py:474)
33
+ 20:11:56.606 [I] Enabled gradient checkpointing for PI0Pytorch model (22934:pi0_pytorch.py:138)
34
+ 20:11:56.607 [I] Enabled gradient checkpointing for memory optimization (22934:train_pytorch.py:624)
35
+ 20:11:56.608 [I] Step 0 (after_model_creation): GPU memory - allocated: 17.23GB, reserved: 17.23GB, free: 0.00GB, peak_allocated: 17.23GB, peak_reserved: 17.23GB (22934:train_pytorch.py:493)
36
+ 20:11:56.609 [I] Loading weights from: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:653)
37
+ 20:12:01.374 [I] Weight loading missing key count: 0 (22934:train_pytorch.py:657)
38
+ 20:12:01.375 [I] Weight loading missing keys: set() (22934:train_pytorch.py:658)
39
+ 20:12:01.375 [I] Weight loading unexpected key count: 0 (22934:train_pytorch.py:659)
40
+ 20:12:01.375 [I] Weight loading unexpected keys: [] (22934:train_pytorch.py:660)
41
+ 20:12:01.376 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:661)
42
+ 20:12:01.380 [I] Running on: 963c158043aa | world_size=1 (22934:train_pytorch.py:701)
43
+ 20:12:01.381 [I] Training config: batch_size=1, effective_batch_size=1, num_train_steps=20 (22934:train_pytorch.py:702)
44
+ 20:12:01.381 [I] Memory optimizations: gradient_checkpointing=True (22934:train_pytorch.py:705)
45
+ 20:12:01.381 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (22934:train_pytorch.py:706)
46
+ 20:12:01.382 [I] LR schedule: warmup=250, peak_lr=2.50e-05, decay_steps=5000, end_lr=2.50e-06 (22934:train_pytorch.py:707)
47
+ 20:12:01.382 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (22934:train_pytorch.py:710)
48
+ 20:12:01.382 [I] EMA is not supported for PyTorch training (22934:train_pytorch.py:713)
49
+ 20:12:01.383 [I] Training precision: float32 (22934:train_pytorch.py:714)
50
+ 20:12:01.410 [I] Resolved config name: pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k (22934:train_pytorch.py:308)
51
+ 20:12:01.410 [I] Dataset repo_id: lsnu/twin_dual_push_128_train (22934:train_pytorch.py:309)
52
+ 20:12:01.411 [I] Norm-stats file path: /workspace/pi05tests/openpi/assets/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/lsnu/twin_dual_push_128_train/norm_stats.json (22934:train_pytorch.py:310)
53
+ 20:12:01.411 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (22934:train_pytorch.py:311)
54
+ 20:12:01.412 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_split_independent_packed_from_single (22934:train_pytorch.py:312)
55
+ 20:12:01.412 [I] Model type: split_independent (22934:train_pytorch.py:313)
56
+ 20:12:01.412 [I] Packed transforms active: True (22934:train_pytorch.py:314)
57
+ 20:12:01.413 [I] World size: 1 (22934:train_pytorch.py:315)
58
+ 20:12:01.413 [I] Batch size: local=1, global=1 (22934:train_pytorch.py:316)
59
+ 20:12:01.414 [I] num_workers: 0 (22934:train_pytorch.py:317)
60
+ 20:12:01.414 [I] Precision: float32 (22934:train_pytorch.py:318)
61
+ 20:12:01.414 [I] LR schedule summary: warmup_steps=250, peak_lr=2.50e-05, decay_steps=5000, decay_lr=2.50e-06 (22934:train_pytorch.py:319)
62
+ 20:12:01.415 [I] Save/log intervals: save_interval=20, log_interval=1 (22934:train_pytorch.py:326)
63
+ 20:12:01.415 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (22934:train_pytorch.py:327)
64
+ 20:12:01.415 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (22934:train_pytorch.py:328)
65
+ 20:12:01.416 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (22934:train_pytorch.py:329)
66
+ 20:12:01.416 [I] Gradient bucket diagnostics: left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm (22934:train_pytorch.py:722)
67
+
68
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
69
+ 20:12:03.701 [I] debug_step=1 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
70
+ 20:12:03.702 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
71
+ 20:12:03.702 [I] debug_step=1 prompt_token_lengths=[75] (22934:train_pytorch.py:838)
72
+ 20:12:03.702 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0112 std=0.3876 (22934:train_pytorch.py:839)
73
+ 20:12:03.702 [I] debug_step=1 action_stats min=-1.0016 max=1.0004 mean=-0.0454 std=0.4716 (22934:train_pytorch.py:842)
74
+ 20:12:03.703 [I] debug_step=1 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
75
+ 20:12:03.729 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
76
+ 20:12:03.730 [I] debug_step=1 lr=9.96e-08 grad_norm=31.4779 data_time=0.5472s step_time=1.7166s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.25GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.25GB (22934:train_pytorch.py:854)
77
+ 20:12:03.730 [I] debug_step=1 grad_shared_backbone=25.5606 grad_left_action_in=0.2318 grad_right_action_in=0.9885 grad_left_expert=5.5978 grad_right_expert=12.3518 grad_action_out=9.6154 (22934:train_pytorch.py:862)
78
+ 20:12:03.731 [I] step=1 loss=2.6238 smoothed_loss=2.6238 lr=9.96e-08 grad_norm=31.4779 step_time=1.7166s data_time=0.5472s it/s=0.425 eta_to_20=44.7s max_cuda_memory=76.13GB grad_action_out=9.6154 grad_left_action_in=0.2318 grad_left_expert=5.5978 grad_right_action_in=0.9885 grad_right_expert=12.3518 grad_shared_backbone=25.5606 (22934:train_pytorch.py:882)
79
+
80
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
81
+ 20:12:05.012 [I] debug_step=2 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
82
+ 20:12:05.013 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
83
+ 20:12:05.014 [I] debug_step=2 prompt_token_lengths=[76] (22934:train_pytorch.py:838)
84
+ 20:12:05.014 [I] debug_step=2 state_stats min=-0.9415 max=1.0004 mean=-0.0010 std=0.4295 (22934:train_pytorch.py:839)
85
+ 20:12:05.015 [I] debug_step=2 action_stats min=-1.0000 max=1.1367 mean=0.0272 std=0.4576 (22934:train_pytorch.py:842)
86
+ 20:12:05.016 [I] debug_step=2 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
87
+ 20:12:05.016 [I] debug_step=2 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
88
+ 20:12:05.017 [I] debug_step=2 lr=1.99e-07 grad_norm=12.2749 data_time=0.5381s step_time=0.7692s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
89
+ 20:12:05.017 [I] debug_step=2 grad_shared_backbone=10.3515 grad_left_action_in=0.1585 grad_right_action_in=0.1584 grad_left_expert=2.8412 grad_right_expert=4.0131 grad_action_out=4.1470 (22934:train_pytorch.py:862)
90
+ 20:12:05.018 [I] step=2 loss=1.1715 smoothed_loss=2.4786 lr=1.99e-07 grad_norm=12.2749 step_time=0.7692s data_time=0.5381s it/s=0.777 eta_to_20=23.2s max_cuda_memory=76.13GB grad_action_out=4.1470 grad_left_action_in=0.1585 grad_left_expert=2.8412 grad_right_action_in=0.1584 grad_right_expert=4.0131 grad_shared_backbone=10.3515 (22934:train_pytorch.py:882)
91
+
92
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
93
+ 20:12:05.585 [I] debug_step=3 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
94
+ 20:12:05.586 [I] debug_step=3 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
95
+ 20:12:05.586 [I] debug_step=3 prompt_token_lengths=[75] (22934:train_pytorch.py:838)
96
+ 20:12:05.586 [I] debug_step=3 state_stats min=-1.0000 max=1.0004 mean=0.0558 std=0.4300 (22934:train_pytorch.py:839)
97
+ 20:12:05.587 [I] debug_step=3 action_stats min=-1.0033 max=1.0004 mean=-0.0658 std=0.4704 (22934:train_pytorch.py:842)
98
+ 20:12:05.588 [I] debug_step=3 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
99
+ 20:12:05.588 [I] debug_step=3 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
100
+ 20:12:05.589 [I] debug_step=3 lr=2.99e-07 grad_norm=15.1205 data_time=0.1545s step_time=0.4182s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
101
+ 20:12:05.589 [I] debug_step=3 grad_shared_backbone=8.6946 grad_left_action_in=0.2568 grad_right_action_in=0.3873 grad_left_expert=4.4408 grad_right_expert=10.5877 grad_action_out=3.5507 (22934:train_pytorch.py:862)
102
+ 20:12:05.590 [I] step=3 loss=0.9126 smoothed_loss=2.3220 lr=2.99e-07 grad_norm=15.1205 step_time=0.4182s data_time=0.1545s it/s=1.751 eta_to_20=9.7s max_cuda_memory=76.13GB grad_action_out=3.5507 grad_left_action_in=0.2568 grad_left_expert=4.4408 grad_right_action_in=0.3873 grad_right_expert=10.5877 grad_shared_backbone=8.6946 (22934:train_pytorch.py:882)
103
+
104
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
105
+ 20:12:06.414 [I] debug_step=4 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
106
+ 20:12:06.415 [I] debug_step=4 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
107
+ 20:12:06.416 [I] debug_step=4 prompt_token_lengths=[78] (22934:train_pytorch.py:838)
108
+ 20:12:06.416 [I] debug_step=4 state_stats min=-0.7017 max=1.0004 mean=0.0553 std=0.3507 (22934:train_pytorch.py:839)
109
+ 20:12:06.417 [I] debug_step=4 action_stats min=-1.0014 max=1.0004 mean=-0.0683 std=0.4561 (22934:train_pytorch.py:842)
110
+ 20:12:06.417 [I] debug_step=4 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
111
+ 20:12:06.418 [I] debug_step=4 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
112
+ 20:12:06.419 [I] debug_step=4 lr=3.98e-07 grad_norm=9.2670 data_time=0.2679s step_time=0.5621s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
113
+ 20:12:06.419 [I] debug_step=4 grad_shared_backbone=7.8629 grad_left_action_in=0.1341 grad_right_action_in=0.0877 grad_left_expert=3.2369 grad_right_expert=1.0658 grad_action_out=3.4116 (22934:train_pytorch.py:862)
114
+ 20:12:06.420 [I] step=4 loss=1.1718 smoothed_loss=2.2070 lr=3.98e-07 grad_norm=9.2670 step_time=0.5621s data_time=0.2679s it/s=1.206 eta_to_20=13.3s max_cuda_memory=76.13GB grad_action_out=3.4116 grad_left_action_in=0.1341 grad_left_expert=3.2369 grad_right_action_in=0.0877 grad_right_expert=1.0658 grad_shared_backbone=7.8629 (22934:train_pytorch.py:882)
115
+
116
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
117
+ 20:12:07.218 [I] debug_step=5 observation.state shape=(1, 32) dtype=torch.float64 actions shape=(1, 16, 32) dtype=torch.float32 (22934:train_pytorch.py:831)
118
+ 20:12:07.219 [I] debug_step=5 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (1, 3, 224, 224), 'left_wrist_0_rgb': (1, 3, 224, 224), 'right_wrist_0_rgb': (1, 3, 224, 224)} (22934:train_pytorch.py:835)
119
+ 20:12:07.219 [I] debug_step=5 prompt_token_lengths=[73] (22934:train_pytorch.py:838)
120
+ 20:12:07.219 [I] debug_step=5 state_stats min=-0.9599 max=1.0004 mean=0.0170 std=0.5364 (22934:train_pytorch.py:839)
121
+ 20:12:07.220 [I] debug_step=5 action_stats min=-1.0392 max=1.0004 mean=-0.0159 std=0.4488 (22934:train_pytorch.py:842)
122
+ 20:12:07.220 [I] debug_step=5 state_nonzero_counts_8d_blocks=[8, 0, 8, 0] action_nonzero_counts_8d_blocks=[128, 0, 128, 0] (22934:train_pytorch.py:845)
123
+ 20:12:07.221 [I] debug_step=5 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=16 actions=256 (22934:train_pytorch.py:849)
124
+ 20:12:07.221 [I] debug_step=5 lr=4.98e-07 grad_norm=18.8576 data_time=0.2330s step_time=0.5704s gpu_mem_allocated=46.71GB gpu_mem_reserved=76.34GB gpu_mem_max_allocated=76.13GB gpu_mem_max_reserved=76.34GB (22934:train_pytorch.py:854)
125
+ 20:12:07.222 [I] debug_step=5 grad_shared_backbone=15.0420 grad_left_action_in=0.2664 grad_right_action_in=0.2257 grad_left_expert=7.9881 grad_right_expert=3.7966 grad_action_out=6.1884 (22934:train_pytorch.py:862)
126
+ 20:12:07.223 [I] step=5 loss=1.6473 smoothed_loss=2.1510 lr=4.98e-07 grad_norm=18.8576 step_time=0.5704s data_time=0.2330s it/s=1.246 eta_to_20=12.0s max_cuda_memory=76.13GB grad_action_out=6.1884 grad_left_action_in=0.2664 grad_left_expert=7.9881 grad_right_action_in=0.2257 grad_right_expert=3.7966 grad_shared_backbone=15.0420 (22934:train_pytorch.py:882)
127
+
128
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
129
+ 20:12:07.822 [I] step=6 loss=1.6098 smoothed_loss=2.0969 lr=5.98e-07 grad_norm=20.9772 step_time=0.4435s data_time=0.1600s it/s=1.671 eta_to_20=8.4s max_cuda_memory=76.13GB grad_action_out=6.0592 grad_left_action_in=0.2873 grad_left_expert=8.8574 grad_right_action_in=0.4264 grad_right_expert=6.3071 grad_shared_backbone=16.1173 (22934:train_pytorch.py:882)
130
+
131
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
132
+ 20:12:08.395 [I] step=7 loss=1.0401 smoothed_loss=1.9912 lr=6.97e-07 grad_norm=9.5173 step_time=0.4240s data_time=0.1495s it/s=1.747 eta_to_20=7.4s max_cuda_memory=76.13GB grad_action_out=4.1689 grad_left_action_in=0.1489 grad_left_expert=3.1996 grad_right_action_in=0.0904 grad_right_expert=2.4983 grad_shared_backbone=7.4213 (22934:train_pytorch.py:882)
133
+
134
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
135
+ 20:12:08.914 [I] step=8 loss=1.7539 smoothed_loss=1.9675 lr=7.97e-07 grad_norm=12.9701 step_time=0.3829s data_time=0.1362s it/s=1.931 eta_to_20=6.2s max_cuda_memory=76.13GB grad_action_out=5.3617 grad_left_action_in=0.1890 grad_left_expert=3.6536 grad_right_action_in=0.3790 grad_right_expert=2.7904 grad_shared_backbone=10.5667 (22934:train_pytorch.py:882)
136
+
137
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
138
+ 20:12:09.692 [I] step=9 loss=0.4114 smoothed_loss=1.8119 lr=8.96e-07 grad_norm=3.5873 step_time=0.5166s data_time=0.2609s it/s=1.288 eta_to_20=8.5s max_cuda_memory=76.13GB grad_action_out=1.8283 grad_left_action_in=0.0689 grad_left_expert=1.3656 grad_right_action_in=0.0549 grad_right_expert=0.7330 grad_shared_backbone=2.6507 (22934:train_pytorch.py:882)
139
+
140
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
141
+ 20:12:10.646 [I] step=10 loss=0.6228 smoothed_loss=1.6930 lr=9.96e-07 grad_norm=6.7396 step_time=0.7100s data_time=0.2450s it/s=1.049 eta_to_20=9.5s max_cuda_memory=76.13GB grad_action_out=2.2553 grad_left_action_in=0.0813 grad_left_expert=1.3495 grad_right_action_in=0.0919 grad_right_expert=2.0906 grad_shared_backbone=5.8179 (22934:train_pytorch.py:882)
142
+
143
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
144
+ 20:12:11.288 [I] step=11 loss=0.8688 smoothed_loss=1.6105 lr=1.10e-06 grad_norm=7.2182 step_time=0.4823s data_time=0.1593s it/s=1.561 eta_to_20=5.8s max_cuda_memory=76.13GB grad_action_out=3.3031 grad_left_action_in=0.1262 grad_left_expert=2.5456 grad_right_action_in=0.0809 grad_right_expert=0.9216 grad_shared_backbone=5.7177 (22934:train_pytorch.py:882)
145
+
146
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
147
+ 20:12:11.903 [I] step=12 loss=0.7319 smoothed_loss=1.5227 lr=1.20e-06 grad_norm=6.1848 step_time=0.4468s data_time=0.1681s it/s=1.629 eta_to_20=4.9s max_cuda_memory=76.13GB grad_action_out=2.7925 grad_left_action_in=0.1038 grad_left_expert=2.4508 grad_right_action_in=0.0680 grad_right_expert=0.8716 grad_shared_backbone=4.8333 (22934:train_pytorch.py:882)
148
+
149
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
150
+ 20:12:12.684 [I] step=13 loss=0.8788 smoothed_loss=1.4583 lr=1.29e-06 grad_norm=20.2227 step_time=0.5649s data_time=0.2162s it/s=1.282 eta_to_20=5.5s max_cuda_memory=76.13GB grad_action_out=3.0176 grad_left_action_in=0.1300 grad_left_expert=2.8276 grad_right_action_in=0.4691 grad_right_expert=12.9156 grad_shared_backbone=11.2157 (22934:train_pytorch.py:882)
151
+
152
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
153
+ 20:12:13.370 [I] step=14 loss=1.2741 smoothed_loss=1.4399 lr=1.39e-06 grad_norm=7.8620 step_time=0.5100s data_time=0.1755s it/s=1.461 eta_to_20=4.1s max_cuda_memory=76.13GB grad_action_out=4.2194 grad_left_action_in=0.1433 grad_left_expert=2.8949 grad_right_action_in=0.0958 grad_right_expert=1.0096 grad_shared_backbone=5.8070 (22934:train_pytorch.py:882)
154
+
155
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
156
+ 20:12:14.027 [I] step=15 loss=2.3729 smoothed_loss=1.5332 lr=1.49e-06 grad_norm=19.3589 step_time=0.4678s data_time=0.1899s it/s=1.523 eta_to_20=3.3s max_cuda_memory=76.13GB grad_action_out=7.2135 grad_left_action_in=0.2665 grad_left_expert=7.5354 grad_right_action_in=0.5496 grad_right_expert=4.5295 grad_shared_backbone=15.2257 (22934:train_pytorch.py:882)
157
+
158
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
159
+ 20:12:14.874 [I] step=16 loss=0.8147 smoothed_loss=1.4613 lr=1.59e-06 grad_norm=7.7365 step_time=0.5547s data_time=0.2919s it/s=1.183 eta_to_20=3.4s max_cuda_memory=76.13GB grad_action_out=2.7237 grad_left_action_in=0.1192 grad_left_expert=2.8822 grad_right_action_in=0.0900 grad_right_expert=0.8615 grad_shared_backbone=6.4500 (22934:train_pytorch.py:882)
160
+
161
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
162
+ 20:12:15.664 [I] step=17 loss=1.4318 smoothed_loss=1.4584 lr=1.69e-06 grad_norm=19.5452 step_time=0.5511s data_time=0.2382s it/s=1.268 eta_to_20=2.4s max_cuda_memory=76.13GB grad_action_out=3.9684 grad_left_action_in=0.3767 grad_left_expert=7.8636 grad_right_action_in=0.1317 grad_right_expert=1.6847 grad_shared_backbone=16.9059 (22934:train_pytorch.py:882)
163
+
164
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
165
+ 20:12:16.588 [I] step=18 loss=0.4858 smoothed_loss=1.3611 lr=1.79e-06 grad_norm=3.4382 step_time=0.6846s data_time=0.2403s it/s=1.082 eta_to_20=1.8s max_cuda_memory=76.13GB grad_action_out=1.9985 grad_left_action_in=0.0749 grad_left_expert=1.4156 grad_right_action_in=0.0390 grad_right_expert=0.5210 grad_shared_backbone=2.3369 (22934:train_pytorch.py:882)
166
+
167
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
168
+ 20:12:17.216 [I] step=19 loss=0.7492 smoothed_loss=1.2999 lr=1.89e-06 grad_norm=6.9377 step_time=0.4815s data_time=0.1459s it/s=1.596 eta_to_20=0.6s max_cuda_memory=76.13GB grad_action_out=3.7478 grad_left_action_in=0.1113 grad_left_expert=2.8716 grad_right_action_in=0.0729 grad_right_expert=1.0784 grad_shared_backbone=4.9024 (22934:train_pytorch.py:882)
169
+
170
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
171
+ 20:12:18.186 [I] step=20 loss=0.6038 smoothed_loss=1.2303 lr=1.99e-06 grad_norm=7.0090 step_time=0.7175s data_time=0.2525s it/s=1.032 eta_to_20=0.0s max_cuda_memory=76.13GB grad_action_out=2.8786 grad_left_action_in=0.0890 grad_left_expert=2.7778 grad_right_action_in=0.0549 grad_right_expert=1.4578 grad_shared_backbone=5.5395 (22934:train_pytorch.py:882)
172
+ 20:19:39.399 [I] Saved checkpoint at step 20 -> /workspace/pi05tests/openpi/checkpoints/pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k/split_independent_real_train20/20 (22934:train_pytorch.py:378)
173
+
openpi/scripts/check_parallel_warmstart_equivalence.py CHANGED
@@ -76,6 +76,13 @@ def main() -> None:
76
  )
77
  baseline_config = _config.get_config(args.baseline_config_name)
78
  parallel_config = _config.get_config(args.parallel_config_name)
 
 
 
 
 
 
 
79
 
80
  data_config = baseline_config.data.create(baseline_config.assets_dirs, baseline_config.model)
81
  data_config = dataclasses.replace(data_config, repo_id=args.repo_id)
 
76
  )
77
  baseline_config = _config.get_config(args.baseline_config_name)
78
  parallel_config = _config.get_config(args.parallel_config_name)
79
+ parallel_model_cfg = build_model_config(parallel_config)
80
+ if parallel_model_cfg.use_split_action_expert:
81
+ raise ValueError(
82
+ "Exact end-to-end warm-start equivalence is not expected for split action experts. "
83
+ "Use init_parallel_pi05_from_single_pytorch.py for branch copy checks and "
84
+ "check_split_expert_invariants.py for branch-local invariants."
85
+ )
86
 
87
  data_config = baseline_config.data.create(baseline_config.assets_dirs, baseline_config.model)
88
  data_config = dataclasses.replace(data_config, repo_id=args.repo_id)
openpi/scripts/check_split_expert_invariants.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import dataclasses
4
+
5
+ import safetensors.torch
6
+ import torch
7
+ import tyro
8
+
9
+ import openpi.models.pi0_config
10
+ import openpi.training.config as _config
11
+
12
+
13
+ @dataclasses.dataclass
14
+ class Args:
15
+ config_name: str
16
+ checkpoint_dir: str
17
+ tolerance: float = 1e-6
18
+ batch_size: int = 2
19
+ prefix_len: int = 12
20
+ seed: int = 123
21
+
22
+
23
+ def _build_model_config(config: _config.TrainConfig) -> openpi.models.pi0_config.Pi0Config:
24
+ if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
25
+ return openpi.models.pi0_config.Pi0Config(
26
+ dtype="float32",
27
+ action_dim=config.model.action_dim,
28
+ action_horizon=config.model.action_horizon,
29
+ max_token_len=config.model.max_token_len,
30
+ paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
31
+ action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
32
+ pi05=getattr(config.model, "pi05", False),
33
+ arm_action_dims=getattr(config.model, "arm_action_dims", None),
34
+ action_expert_mode=getattr(config.model, "action_expert_mode", None),
35
+ )
36
+
37
+ model_cfg = dataclasses.replace(config.model)
38
+ object.__setattr__(model_cfg, "dtype", "float32")
39
+ return model_cfg
40
+
41
+
42
+ def _random_prefix_context(model, batch_size: int, prefix_len: int, seed: int):
43
+ generator = torch.Generator(device="cpu")
44
+ generator.manual_seed(seed)
45
+ prefix_width = model.paligemma_with_expert.paligemma.config.text_config.hidden_size
46
+ prefix_embs = torch.randn(batch_size, prefix_len, prefix_width, generator=generator, dtype=torch.float32)
47
+ prefix_pad_masks = torch.ones(batch_size, prefix_len, dtype=torch.bool)
48
+ prefix_att_masks = torch.zeros(batch_size, prefix_len, dtype=torch.bool)
49
+ return prefix_embs, prefix_pad_masks, prefix_att_masks
50
+
51
+
52
+ def _run_model(model, prefix_context, x_t, timestep):
53
+ prefix_embs, prefix_pad_masks, prefix_att_masks = prefix_context
54
+ state = torch.zeros(x_t.shape[0], model.config.action_dim, dtype=torch.float32)
55
+ suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = model.embed_suffix(state, x_t, timestep)
56
+ suffix_outputs = model._run_joint_action_expert( # noqa: SLF001
57
+ prefix_embs,
58
+ prefix_pad_masks,
59
+ prefix_att_masks,
60
+ suffix_embs,
61
+ suffix_pad_masks,
62
+ suffix_att_masks,
63
+ adarms_cond,
64
+ )
65
+ suffix_outputs = [output[:, -model.config.action_horizon :].to(dtype=torch.float32) for output in suffix_outputs]
66
+ projected_actions = model._project_action_outputs(suffix_outputs) # noqa: SLF001
67
+ return suffix_outputs, projected_actions
68
+
69
+
70
+ def _run_identical_branch_inputs(model, prefix_context, timestep, seed: int):
71
+ generator = torch.Generator(device="cpu")
72
+ generator.manual_seed(seed)
73
+ width = model.action_expert_width
74
+ horizon = model.config.action_horizon
75
+ batch_size = prefix_context[0].shape[0]
76
+
77
+ shared_suffix = torch.randn(batch_size, horizon, width, generator=generator, dtype=torch.float32)
78
+ shared_cond = torch.randn(batch_size, width, generator=generator, dtype=torch.float32)
79
+ suffix_pad_masks = [torch.ones(batch_size, horizon, dtype=torch.bool) for _ in range(2)]
80
+ suffix_att_masks = [model._action_att_mask(batch_size, torch.device("cpu"), torch.float32) for _ in range(2)] # noqa: SLF001
81
+
82
+ suffix_outputs = model._run_joint_action_expert( # noqa: SLF001
83
+ prefix_context[0],
84
+ prefix_context[1],
85
+ prefix_context[2],
86
+ [shared_suffix.clone(), shared_suffix.clone()],
87
+ suffix_pad_masks,
88
+ suffix_att_masks,
89
+ [shared_cond.clone(), shared_cond.clone()],
90
+ )
91
+ return suffix_outputs
92
+
93
+
94
+ def main() -> None:
95
+ args = tyro.cli(Args)
96
+ config = _config.get_config(args.config_name)
97
+ model_cfg = _build_model_config(config)
98
+ if not model_cfg.use_split_action_expert:
99
+ raise ValueError(f"Config {args.config_name} is not a split-expert config.")
100
+
101
+ import openpi.models_pytorch.pi0_pytorch as pi0_pytorch
102
+
103
+ torch.manual_seed(args.seed)
104
+ model = pi0_pytorch.PI0Pytorch(model_cfg)
105
+ missing, unexpected = safetensors.torch.load_model(model, f"{args.checkpoint_dir}/model.safetensors", strict=False)
106
+ model.eval()
107
+
108
+ prefix_context = _random_prefix_context(model, args.batch_size, args.prefix_len, args.seed + 1)
109
+ x_t = torch.randn(args.batch_size, model.config.action_horizon, model.config.action_dim, dtype=torch.float32)
110
+ timestep = torch.full((args.batch_size,), 0.5, dtype=torch.float32)
111
+
112
+ identical_suffix_outputs = _run_identical_branch_inputs(model, prefix_context, timestep, args.seed + 2)
113
+ identical_branch_suffix_max_abs_diff = float(
114
+ (identical_suffix_outputs[0] - identical_suffix_outputs[1]).abs().max().item()
115
+ )
116
+
117
+ left_suffix_outputs, left_projected_actions = _run_model(model, prefix_context, x_t, timestep)
118
+ x_t_right_perturbed = x_t.clone()
119
+ x_t_right_perturbed[:, :, 16:32] += 0.5 * torch.randn_like(x_t_right_perturbed[:, :, 16:32])
120
+ _, right_perturbed_actions = _run_model(model, prefix_context, x_t_right_perturbed, timestep)
121
+ left_branch_invariance_max_abs_diff = float(
122
+ (left_projected_actions[:, :, 0:16] - right_perturbed_actions[:, :, 0:16]).abs().max().item()
123
+ )
124
+
125
+ x_t_left_perturbed = x_t.clone()
126
+ x_t_left_perturbed[:, :, 0:16] += 0.5 * torch.randn_like(x_t_left_perturbed[:, :, 0:16])
127
+ _, left_perturbed_actions = _run_model(model, prefix_context, x_t_left_perturbed, timestep)
128
+ right_branch_invariance_max_abs_diff = float(
129
+ (left_projected_actions[:, :, 16:32] - left_perturbed_actions[:, :, 16:32]).abs().max().item()
130
+ )
131
+
132
+ print(f"config_name: {args.config_name}")
133
+ print(f"checkpoint_dir: {args.checkpoint_dir}")
134
+ print(f"action_expert_mode: {model_cfg.action_expert_mode}")
135
+ print(f"weight_loading_missing_keys: {list(missing)}")
136
+ print(f"weight_loading_unexpected_keys: {list(unexpected)}")
137
+ print(f"identical_branch_suffix_max_abs_diff: {identical_branch_suffix_max_abs_diff:.8f}")
138
+ print(
139
+ f"identical_branch_suffix_match: "
140
+ f"{identical_branch_suffix_max_abs_diff <= args.tolerance}"
141
+ )
142
+
143
+ if model_cfg.action_expert_mode == "split_independent":
144
+ print(f"left_branch_invariance_max_abs_diff: {left_branch_invariance_max_abs_diff:.8f}")
145
+ print(f"right_branch_invariance_max_abs_diff: {right_branch_invariance_max_abs_diff:.8f}")
146
+ print(f"left_branch_invariant: {left_branch_invariance_max_abs_diff <= args.tolerance}")
147
+ print(f"right_branch_invariant: {right_branch_invariance_max_abs_diff <= args.tolerance}")
148
+ else:
149
+ print("left_branch_invariance_max_abs_diff: skipped_for_split_communicating")
150
+ print("right_branch_invariance_max_abs_diff: skipped_for_split_communicating")
151
+
152
+
153
+ if __name__ == "__main__":
154
+ main()
openpi/scripts/eval_twin_val_loss_pytorch.py CHANGED
@@ -44,7 +44,7 @@ class Args:
44
  eval_seed: int = 123
45
  sample_num_batches: int = 0
46
  sample_batch_size: int | None = None
47
- sample_num_steps: str = "4,10"
48
  sample_seed: int = 321
49
 
50
 
 
44
  eval_seed: int = 123
45
  sample_num_batches: int = 0
46
  sample_batch_size: int | None = None
47
+ sample_num_steps: str = "1,2,4,8,16"
48
  sample_seed: int = 321
49
 
50
 
openpi/scripts/init_parallel_pi05_from_single_pytorch.py CHANGED
@@ -33,6 +33,7 @@ def _build_model_config(config: _config.TrainConfig) -> openpi.models.pi0_config
33
  action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
34
  pi05=getattr(config.model, "pi05", False),
35
  arm_action_dims=getattr(config.model, "arm_action_dims", None),
 
36
  )
37
 
38
  model_cfg = config.model
@@ -40,12 +41,60 @@ def _build_model_config(config: _config.TrainConfig) -> openpi.models.pi0_config
40
  return model_cfg
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def main() -> None:
44
  args = tyro.cli(Args)
45
  config = _config.get_config(args.config_name)
46
  model_cfg = _build_model_config(config)
47
  if not model_cfg.use_parallel_action_heads:
48
- raise ValueError(f"Config {args.config_name} is not a parallel-head config.")
49
  if tuple(model_cfg.arm_action_dims) != (16, 16):
50
  raise ValueError(f"Expected arm_action_dims=(16, 16), got {model_cfg.arm_action_dims}.")
51
 
@@ -65,74 +114,139 @@ def main() -> None:
65
  f"Expected single-head checkpoint with packed 32-dim actions, got in={tuple(weight_in.shape)} out={tuple(weight_out.shape)}."
66
  )
67
 
68
- with torch.no_grad():
69
- parallel_model.action_in_proj_arms[0].weight.copy_(weight_in[:, 0:16])
70
- parallel_model.action_in_proj_arms[0].bias.zero_()
71
- parallel_model.action_in_proj_arms[1].weight.copy_(weight_in[:, 16:32])
72
- parallel_model.action_in_proj_arms[1].bias.zero_()
73
-
74
- fuse_weight = torch.zeros_like(parallel_model.arm_token_fuse.weight)
75
- identity = torch.eye(hidden_width, dtype=fuse_weight.dtype)
76
- fuse_weight[:, 0:hidden_width] = identity
77
- fuse_weight[:, hidden_width : 2 * hidden_width] = identity
78
- parallel_model.arm_token_fuse.weight.copy_(fuse_weight)
79
- parallel_model.arm_token_fuse.bias.copy_(bias_in)
80
-
81
- parallel_model.action_out_proj_arms[0].weight.copy_(weight_out[0:16, :])
82
- parallel_model.action_out_proj_arms[0].bias.copy_(bias_out[0:16])
83
- parallel_model.action_out_proj_arms[1].weight.copy_(weight_out[16:32, :])
84
- parallel_model.action_out_proj_arms[1].bias.copy_(bias_out[16:32])
85
 
86
  proj_in_dtype = parallel_model.action_in_proj_arms[0].weight.dtype
87
  proj_out_dtype = parallel_model.action_out_proj_arms[0].weight.dtype
88
  x = torch.randn(2, model_cfg.action_horizon, model_cfg.action_dim, dtype=proj_in_dtype)
 
 
89
  suffix = torch.randn(2, model_cfg.action_horizon, hidden_width, dtype=proj_out_dtype)
 
 
 
 
 
 
 
 
 
 
90
  with torch.no_grad():
91
- input_max_abs_diff = float(
 
 
 
 
 
 
 
 
 
92
  (
93
- F.linear(x, weight_in.to(proj_in_dtype), bias_in.to(proj_in_dtype))
94
- - parallel_model._project_action_inputs(x)
95
  )
96
  .abs()
97
  .max()
98
  .item()
99
  )
100
- output_max_abs_diff = float(
101
  (
102
- F.linear(suffix, weight_out.to(proj_out_dtype), bias_out.to(proj_out_dtype))
103
- - parallel_model._project_action_outputs(suffix)
 
 
 
 
 
 
 
 
 
104
  )
105
  .abs()
106
  .max()
107
  .item()
108
  )
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  output_dir = Path(args.output_path)
111
  output_dir.mkdir(parents=True, exist_ok=True)
112
  safetensors.torch.save_model(parallel_model, output_dir / "model.safetensors")
113
  (output_dir / "config.json").write_text(json.dumps(dataclasses.asdict(model_cfg), indent=2, sort_keys=True))
114
- metadata = {
115
- "config_name": args.config_name,
116
- "single_ckpt": args.single_ckpt,
117
- "output_path": args.output_path,
118
- "load_state_missing_keys": list(missing),
119
- "load_state_unexpected_keys": list(unexpected),
120
- "input_projection_max_abs_diff": input_max_abs_diff,
121
- "output_projection_max_abs_diff": output_max_abs_diff,
122
- "warm_start_exact": input_max_abs_diff == 0.0 and output_max_abs_diff == 0.0,
123
- }
124
  (output_dir / "init_parallel_metadata.json").write_text(json.dumps(metadata, indent=2, sort_keys=True))
125
 
126
  print(f"config_name: {args.config_name}")
 
127
  print(f"single_ckpt: {args.single_ckpt}")
128
  print(f"output_path: {args.output_path}")
129
  print(f"load_state_missing_keys_count: {len(missing)}")
130
  print(f"load_state_missing_keys: {list(missing)}")
131
  print(f"load_state_unexpected_keys_count: {len(unexpected)}")
132
  print(f"load_state_unexpected_keys: {list(unexpected)}")
133
- print(f"input_projection_max_abs_diff: {input_max_abs_diff:.8f}")
134
- print(f"output_projection_max_abs_diff: {output_max_abs_diff:.8f}")
135
- print(f"warm_start_exact: {metadata['warm_start_exact']}")
 
136
 
137
 
138
  if __name__ == "__main__":
 
33
  action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
34
  pi05=getattr(config.model, "pi05", False),
35
  arm_action_dims=getattr(config.model, "arm_action_dims", None),
36
+ action_expert_mode=getattr(config.model, "action_expert_mode", None),
37
  )
38
 
39
  model_cfg = config.model
 
41
  return model_cfg
42
 
43
 
44
+ def _copy_factorized_heads(model, weight_in, bias_in, weight_out, bias_out) -> None:
45
+ hidden_width = weight_in.shape[0]
46
+ with torch.no_grad():
47
+ model.action_in_proj_arms[0].weight.copy_(weight_in[:, 0:16])
48
+ model.action_in_proj_arms[0].bias.zero_()
49
+ model.action_in_proj_arms[1].weight.copy_(weight_in[:, 16:32])
50
+ model.action_in_proj_arms[1].bias.zero_()
51
+
52
+ if hasattr(model, "arm_token_fuse"):
53
+ fuse_weight = torch.zeros_like(model.arm_token_fuse.weight)
54
+ identity = torch.eye(hidden_width, dtype=fuse_weight.dtype)
55
+ fuse_weight[:, 0:hidden_width] = identity
56
+ fuse_weight[:, hidden_width : 2 * hidden_width] = identity
57
+ model.arm_token_fuse.weight.copy_(fuse_weight)
58
+ model.arm_token_fuse.bias.copy_(bias_in)
59
+
60
+ model.action_out_proj_arms[0].weight.copy_(weight_out[0:16, :])
61
+ model.action_out_proj_arms[0].bias.copy_(bias_out[0:16])
62
+ model.action_out_proj_arms[1].weight.copy_(weight_out[16:32, :])
63
+ model.action_out_proj_arms[1].bias.copy_(bias_out[16:32])
64
+
65
+
66
+ def _copy_split_expert_weights(model, single_state) -> None:
67
+ model_state = model.state_dict()
68
+ with torch.no_grad():
69
+ for key, value in single_state.items():
70
+ if not key.startswith("paligemma_with_expert.gemma_expert."):
71
+ continue
72
+ suffix = key.removeprefix("paligemma_with_expert.gemma_expert.")
73
+ left_key = f"paligemma_with_expert.left_gemma_expert.{suffix}"
74
+ right_key = f"paligemma_with_expert.right_gemma_expert.{suffix}"
75
+ model_state[left_key].copy_(value.to(dtype=model_state[left_key].dtype))
76
+ model_state[right_key].copy_(value.to(dtype=model_state[right_key].dtype))
77
+
78
+
79
+ def _expert_copy_max_abs_diff(model, single_state, target_prefix: str) -> float:
80
+ model_state = model.state_dict()
81
+ max_abs_diff = 0.0
82
+ for key, value in single_state.items():
83
+ if not key.startswith("paligemma_with_expert.gemma_expert."):
84
+ continue
85
+ suffix = key.removeprefix("paligemma_with_expert.gemma_expert.")
86
+ target_key = f"{target_prefix}{suffix}"
87
+ diff = (model_state[target_key].to(torch.float32) - value.to(torch.float32)).abs().max().item()
88
+ max_abs_diff = max(max_abs_diff, float(diff))
89
+ return max_abs_diff
90
+
91
+
92
  def main() -> None:
93
  args = tyro.cli(Args)
94
  config = _config.get_config(args.config_name)
95
  model_cfg = _build_model_config(config)
96
  if not model_cfg.use_parallel_action_heads:
97
+ raise ValueError(f"Config {args.config_name} does not use factorized or split action heads.")
98
  if tuple(model_cfg.arm_action_dims) != (16, 16):
99
  raise ValueError(f"Expected arm_action_dims=(16, 16), got {model_cfg.arm_action_dims}.")
100
 
 
114
  f"Expected single-head checkpoint with packed 32-dim actions, got in={tuple(weight_in.shape)} out={tuple(weight_out.shape)}."
115
  )
116
 
117
+ _copy_factorized_heads(parallel_model, weight_in, bias_in, weight_out, bias_out)
118
+ if model_cfg.use_split_action_expert:
119
+ _copy_split_expert_weights(parallel_model, single_state)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  proj_in_dtype = parallel_model.action_in_proj_arms[0].weight.dtype
122
  proj_out_dtype = parallel_model.action_out_proj_arms[0].weight.dtype
123
  x = torch.randn(2, model_cfg.action_horizon, model_cfg.action_dim, dtype=proj_in_dtype)
124
+ x_left = x[:, :, 0:16]
125
+ x_right = x[:, :, 16:32]
126
  suffix = torch.randn(2, model_cfg.action_horizon, hidden_width, dtype=proj_out_dtype)
127
+
128
+ metadata = {
129
+ "config_name": args.config_name,
130
+ "action_expert_mode": model_cfg.action_expert_mode,
131
+ "single_ckpt": args.single_ckpt,
132
+ "output_path": args.output_path,
133
+ "load_state_missing_keys": list(missing),
134
+ "load_state_unexpected_keys": list(unexpected),
135
+ }
136
+
137
  with torch.no_grad():
138
+ left_input_projection_max_abs_diff = float(
139
+ (
140
+ F.linear(x_left, weight_in[:, 0:16].to(proj_in_dtype), None)
141
+ - parallel_model.action_in_proj_arms[0](x_left)
142
+ )
143
+ .abs()
144
+ .max()
145
+ .item()
146
+ )
147
+ right_input_projection_max_abs_diff = float(
148
  (
149
+ F.linear(x_right, weight_in[:, 16:32].to(proj_in_dtype), None)
150
+ - parallel_model.action_in_proj_arms[1](x_right)
151
  )
152
  .abs()
153
  .max()
154
  .item()
155
  )
156
+ left_output_projection_max_abs_diff = float(
157
  (
158
+ F.linear(suffix, weight_out[0:16, :].to(proj_out_dtype), bias_out[0:16].to(proj_out_dtype))
159
+ - parallel_model.action_out_proj_arms[0](suffix)
160
+ )
161
+ .abs()
162
+ .max()
163
+ .item()
164
+ )
165
+ right_output_projection_max_abs_diff = float(
166
+ (
167
+ F.linear(suffix, weight_out[16:32, :].to(proj_out_dtype), bias_out[16:32].to(proj_out_dtype))
168
+ - parallel_model.action_out_proj_arms[1](suffix)
169
  )
170
  .abs()
171
  .max()
172
  .item()
173
  )
174
 
175
+ metadata.update(
176
+ {
177
+ "left_input_projection_max_abs_diff": left_input_projection_max_abs_diff,
178
+ "right_input_projection_max_abs_diff": right_input_projection_max_abs_diff,
179
+ "left_output_projection_max_abs_diff": left_output_projection_max_abs_diff,
180
+ "right_output_projection_max_abs_diff": right_output_projection_max_abs_diff,
181
+ }
182
+ )
183
+
184
+ if model_cfg.action_expert_mode == "head_only_parallel":
185
+ input_max_abs_diff = float(
186
+ (
187
+ F.linear(x, weight_in.to(proj_in_dtype), bias_in.to(proj_in_dtype))
188
+ - parallel_model._project_action_inputs(x)
189
+ )
190
+ .abs()
191
+ .max()
192
+ .item()
193
+ )
194
+ output_max_abs_diff = float(
195
+ (
196
+ F.linear(suffix, weight_out.to(proj_out_dtype), bias_out.to(proj_out_dtype))
197
+ - parallel_model._project_action_outputs(suffix)
198
+ )
199
+ .abs()
200
+ .max()
201
+ .item()
202
+ )
203
+ metadata["input_projection_max_abs_diff"] = input_max_abs_diff
204
+ metadata["output_projection_max_abs_diff"] = output_max_abs_diff
205
+ metadata["warm_start_exact"] = input_max_abs_diff == 0.0 and output_max_abs_diff == 0.0
206
+ else:
207
+ left_expert_max_abs_diff = _expert_copy_max_abs_diff(
208
+ parallel_model,
209
+ single_state,
210
+ "paligemma_with_expert.left_gemma_expert.",
211
+ )
212
+ right_expert_max_abs_diff = _expert_copy_max_abs_diff(
213
+ parallel_model,
214
+ single_state,
215
+ "paligemma_with_expert.right_gemma_expert.",
216
+ )
217
+ metadata["left_expert_max_abs_diff"] = left_expert_max_abs_diff
218
+ metadata["right_expert_max_abs_diff"] = right_expert_max_abs_diff
219
+ if parallel_model.paligemma_with_expert.cross_arm_comm is not None:
220
+ metadata["cross_arm_comm_init"] = [
221
+ float(value) for value in parallel_model.paligemma_with_expert.cross_arm_comm.detach().cpu().tolist()
222
+ ]
223
+ metadata["warm_start_exact"] = (
224
+ left_input_projection_max_abs_diff == 0.0
225
+ and right_input_projection_max_abs_diff == 0.0
226
+ and left_output_projection_max_abs_diff == 0.0
227
+ and right_output_projection_max_abs_diff == 0.0
228
+ and left_expert_max_abs_diff == 0.0
229
+ and right_expert_max_abs_diff == 0.0
230
+ )
231
+
232
  output_dir = Path(args.output_path)
233
  output_dir.mkdir(parents=True, exist_ok=True)
234
  safetensors.torch.save_model(parallel_model, output_dir / "model.safetensors")
235
  (output_dir / "config.json").write_text(json.dumps(dataclasses.asdict(model_cfg), indent=2, sort_keys=True))
 
 
 
 
 
 
 
 
 
 
236
  (output_dir / "init_parallel_metadata.json").write_text(json.dumps(metadata, indent=2, sort_keys=True))
237
 
238
  print(f"config_name: {args.config_name}")
239
+ print(f"action_expert_mode: {model_cfg.action_expert_mode}")
240
  print(f"single_ckpt: {args.single_ckpt}")
241
  print(f"output_path: {args.output_path}")
242
  print(f"load_state_missing_keys_count: {len(missing)}")
243
  print(f"load_state_missing_keys: {list(missing)}")
244
  print(f"load_state_unexpected_keys_count: {len(unexpected)}")
245
  print(f"load_state_unexpected_keys: {list(unexpected)}")
246
+ for key in sorted(metadata):
247
+ if key in {"config_name", "action_expert_mode", "single_ckpt", "output_path", "load_state_missing_keys", "load_state_unexpected_keys"}:
248
+ continue
249
+ print(f"{key}: {metadata[key]}")
250
 
251
 
252
  if __name__ == "__main__":
openpi/scripts/run_twin_dual_push_128_packed_5k.sh CHANGED
@@ -26,8 +26,8 @@ PARALLEL_EXP="dual_push_128_packed_parallel_5k"
26
  VAL_REPO="lsnu/twin_dual_push_128_val"
27
  INTERMEDIATE_VAL_BATCHES=50
28
  FINAL_VAL_BATCHES=100
29
- SAMPLE_VAL_BATCHES=16
30
- SAMPLE_NUM_STEPS="4,10"
31
  RUN_WARMSTART_CHECK="${RUN_WARMSTART_CHECK:-0}"
32
 
33
  BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
 
26
  VAL_REPO="lsnu/twin_dual_push_128_val"
27
  INTERMEDIATE_VAL_BATCHES=50
28
  FINAL_VAL_BATCHES=100
29
+ SAMPLE_VAL_BATCHES=64
30
+ SAMPLE_NUM_STEPS="1,2,4,8,16"
31
  RUN_WARMSTART_CHECK="${RUN_WARMSTART_CHECK:-0}"
32
 
33
  BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
openpi/scripts/run_twin_handover_packed_10k.sh CHANGED
@@ -26,8 +26,8 @@ PARALLEL_EXP="handover_packed_parallel_10k"
26
  VAL_REPO="lsnu/twin_handover_256_val"
27
  INTERMEDIATE_VAL_BATCHES=50
28
  FINAL_VAL_BATCHES=100
29
- SAMPLE_VAL_BATCHES=16
30
- SAMPLE_NUM_STEPS="4,10"
31
 
32
  BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
33
  PARALLEL_CKPT_ROOT="$ROOT/checkpoints/$PARALLEL_CONFIG/$PARALLEL_EXP"
 
26
  VAL_REPO="lsnu/twin_handover_256_val"
27
  INTERMEDIATE_VAL_BATCHES=50
28
  FINAL_VAL_BATCHES=100
29
+ SAMPLE_VAL_BATCHES=64
30
+ SAMPLE_NUM_STEPS="1,2,4,8,16"
31
 
32
  BASELINE_CKPT_ROOT="$ROOT/checkpoints/$BASELINE_CONFIG/$BASELINE_EXP"
33
  PARALLEL_CKPT_ROOT="$ROOT/checkpoints/$PARALLEL_CONFIG/$PARALLEL_EXP"
openpi/scripts/train_pytorch.py CHANGED
@@ -216,6 +216,34 @@ def grad_norm_for_parameters(parameters) -> float:
216
 
217
  def collect_gradient_bucket_norms(model: torch.nn.Module) -> dict[str, float]:
218
  model_for_logging = unwrap_model(model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  metrics = {"grad_shared_expert": grad_norm_for_parameters(model_for_logging.paligemma_with_expert.parameters())}
220
  if model_for_logging.use_parallel_action_heads:
221
  metrics["grad_action_in_proj_arms"] = grad_norm_for_parameters(model_for_logging.action_in_proj_arms.parameters())
@@ -669,7 +697,7 @@ def train_loop(config: _config.TrainConfig):
669
  last_step_end = time.perf_counter()
670
  smoothed_loss = None
671
  if is_main:
672
- model_kind = "parallel" if model_cfg.use_parallel_action_heads else "baseline"
673
  logging.info(f"Running on: {platform.node()} | world_size={world_size}")
674
  logging.info(
675
  f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
@@ -696,7 +724,11 @@ def train_loop(config: _config.TrainConfig):
696
  + (
697
  "action_in_proj, action_out_proj, shared_expert"
698
  if not model_cfg.use_parallel_action_heads
699
- else "action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert"
 
 
 
 
700
  )
701
  )
702
 
 
216
 
217
  def collect_gradient_bucket_norms(model: torch.nn.Module) -> dict[str, float]:
218
  model_for_logging = unwrap_model(model)
219
+ if model_for_logging.use_split_action_expert:
220
+ metrics = {
221
+ "grad_shared_backbone": grad_norm_for_parameters(
222
+ model_for_logging.paligemma_with_expert.paligemma.parameters()
223
+ ),
224
+ "grad_left_action_in": grad_norm_for_parameters(model_for_logging.action_in_proj_arms[0].parameters()),
225
+ "grad_right_action_in": grad_norm_for_parameters(model_for_logging.action_in_proj_arms[1].parameters()),
226
+ "grad_left_expert": grad_norm_for_parameters(
227
+ model_for_logging.paligemma_with_expert.left_gemma_expert.parameters()
228
+ ),
229
+ "grad_right_expert": grad_norm_for_parameters(
230
+ model_for_logging.paligemma_with_expert.right_gemma_expert.parameters()
231
+ ),
232
+ "grad_action_out": grad_norm_for_parameters(model_for_logging.action_out_proj_arms.parameters()),
233
+ }
234
+ if model_for_logging.use_communicating_action_expert:
235
+ metrics["grad_cross_arm_comm"] = grad_norm_for_parameters(
236
+ [model_for_logging.paligemma_with_expert.cross_arm_comm]
237
+ )
238
+ for layer_idx, gate_value in enumerate(model_for_logging.paligemma_with_expert.cross_arm_comm.detach().cpu()):
239
+ metrics[f"cross_arm_comm_gate_layer_{layer_idx}"] = float(gate_value.item())
240
+ if model_for_logging.paligemma_with_expert.latest_cross_arm_attention_mass is not None:
241
+ for layer_idx, attn_mass in enumerate(
242
+ model_for_logging.paligemma_with_expert.latest_cross_arm_attention_mass.detach().cpu()
243
+ ):
244
+ metrics[f"cross_arm_attention_mass_layer_{layer_idx}"] = float(attn_mass.item())
245
+ return metrics
246
+
247
  metrics = {"grad_shared_expert": grad_norm_for_parameters(model_for_logging.paligemma_with_expert.parameters())}
248
  if model_for_logging.use_parallel_action_heads:
249
  metrics["grad_action_in_proj_arms"] = grad_norm_for_parameters(model_for_logging.action_in_proj_arms.parameters())
 
697
  last_step_end = time.perf_counter()
698
  smoothed_loss = None
699
  if is_main:
700
+ model_kind = model_cfg.action_expert_mode
701
  logging.info(f"Running on: {platform.node()} | world_size={world_size}")
702
  logging.info(
703
  f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
 
724
  + (
725
  "action_in_proj, action_out_proj, shared_expert"
726
  if not model_cfg.use_parallel_action_heads
727
+ else (
728
+ "left_action_in, right_action_in, left_expert, right_expert, action_out, cross_arm_comm"
729
+ if model_cfg.use_split_action_expert
730
+ else "action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert"
731
+ )
732
  )
733
  )
734
 
openpi/src/openpi/models/pi0_config.py CHANGED
@@ -1,5 +1,5 @@
1
  import dataclasses
2
- from typing import TYPE_CHECKING
3
 
4
  import flax.nnx as nnx
5
  import jax
@@ -15,6 +15,9 @@ if TYPE_CHECKING:
15
  from openpi.models.pi0 import Pi0
16
 
17
 
 
 
 
18
  @dataclasses.dataclass(frozen=True)
19
  class Pi0Config(_model.BaseModelConfig):
20
  dtype: str = "bfloat16"
@@ -32,6 +35,8 @@ class Pi0Config(_model.BaseModelConfig):
32
  # Per-arm action dimensions for parallel arm heads. For a dual-arm setup with 32-dim actions,
33
  # this could be `(16, 16)`. The sum must equal `action_dim`.
34
  arm_action_dims: tuple[int, ...] | None = None
 
 
35
  # This config option is not used directly by the model, but it is read by the ModelTransformFactory.
36
  discrete_state_input: bool = None # type: ignore
37
 
@@ -44,6 +49,9 @@ class Pi0Config(_model.BaseModelConfig):
44
  object.__setattr__(self, "arm_action_dims", (self.action_dim,))
45
  else:
46
  object.__setattr__(self, "arm_action_dims", tuple(self.arm_action_dims))
 
 
 
47
 
48
  if not self.arm_action_dims:
49
  raise ValueError("arm_action_dims must contain at least one arm.")
@@ -55,10 +63,27 @@ class Pi0Config(_model.BaseModelConfig):
55
  )
56
  if len(self.arm_action_dims) > 1 and not self.pi05:
57
  raise ValueError("Parallel arm heads are only supported for pi05 models.")
 
 
 
 
 
 
 
 
 
58
 
59
  @property
60
  def use_parallel_action_heads(self) -> bool:
61
- return len(self.arm_action_dims) > 1
 
 
 
 
 
 
 
 
62
 
63
  @property
64
  @override
@@ -71,6 +96,8 @@ class Pi0Config(_model.BaseModelConfig):
71
  def create(self, rng: at.KeyArrayLike) -> "Pi0":
72
  from openpi.models.pi0 import Pi0
73
 
 
 
74
  return Pi0(self, rngs=nnx.Rngs(rng))
75
 
76
  @override
 
1
  import dataclasses
2
+ from typing import TYPE_CHECKING, Literal
3
 
4
  import flax.nnx as nnx
5
  import jax
 
15
  from openpi.models.pi0 import Pi0
16
 
17
 
18
+ ActionExpertMode = Literal["shared", "head_only_parallel", "split_independent", "split_communicating"]
19
+
20
+
21
  @dataclasses.dataclass(frozen=True)
22
  class Pi0Config(_model.BaseModelConfig):
23
  dtype: str = "bfloat16"
 
35
  # Per-arm action dimensions for parallel arm heads. For a dual-arm setup with 32-dim actions,
36
  # this could be `(16, 16)`. The sum must equal `action_dim`.
37
  arm_action_dims: tuple[int, ...] | None = None
38
+ # Defines whether the action expert is shared, split only at the factorized heads, or duplicated per arm.
39
+ action_expert_mode: ActionExpertMode | None = None
40
  # This config option is not used directly by the model, but it is read by the ModelTransformFactory.
41
  discrete_state_input: bool = None # type: ignore
42
 
 
49
  object.__setattr__(self, "arm_action_dims", (self.action_dim,))
50
  else:
51
  object.__setattr__(self, "arm_action_dims", tuple(self.arm_action_dims))
52
+ if self.action_expert_mode is None:
53
+ default_mode: ActionExpertMode = "head_only_parallel" if len(self.arm_action_dims) > 1 else "shared"
54
+ object.__setattr__(self, "action_expert_mode", default_mode)
55
 
56
  if not self.arm_action_dims:
57
  raise ValueError("arm_action_dims must contain at least one arm.")
 
63
  )
64
  if len(self.arm_action_dims) > 1 and not self.pi05:
65
  raise ValueError("Parallel arm heads are only supported for pi05 models.")
66
+ if self.action_expert_mode != "shared" and len(self.arm_action_dims) < 2:
67
+ raise ValueError(
68
+ f"action_expert_mode={self.action_expert_mode!r} requires at least two arm_action_dims, got {self.arm_action_dims}."
69
+ )
70
+ if self.action_expert_mode in ("split_independent", "split_communicating") and len(self.arm_action_dims) != 2:
71
+ raise ValueError(
72
+ "split action expert modes currently require exactly two arm_action_dims "
73
+ f"(left/right), got {self.arm_action_dims}."
74
+ )
75
 
76
  @property
77
  def use_parallel_action_heads(self) -> bool:
78
+ return self.action_expert_mode != "shared"
79
+
80
+ @property
81
+ def use_split_action_expert(self) -> bool:
82
+ return self.action_expert_mode in ("split_independent", "split_communicating")
83
+
84
+ @property
85
+ def use_communicating_action_expert(self) -> bool:
86
+ return self.action_expert_mode == "split_communicating"
87
 
88
  @property
89
  @override
 
96
  def create(self, rng: at.KeyArrayLike) -> "Pi0":
97
  from openpi.models.pi0 import Pi0
98
 
99
+ if self.use_split_action_expert:
100
+ raise NotImplementedError("Split action expert modes are currently supported only in the PyTorch model.")
101
  return Pi0(self, rngs=nnx.Rngs(rng))
102
 
103
  @override
openpi/src/openpi/models/utils/fsq_tokenizer.py CHANGED
@@ -1,7 +1,6 @@
1
  import math
2
  from typing import Any, Literal
3
 
4
- import chex
5
  from einops import einops
6
  from flax import linen as nn
7
  from flax.linen.module import Module
@@ -12,6 +11,20 @@ import jax
12
  import jax.numpy as jnp
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  class FsqCodebook(nn.Module):
16
  input_dim: int
17
  target_codebook_size: int
@@ -109,7 +122,7 @@ class FsqCodebook(nn.Module):
109
  z_q = digits / (bases - 1) * 2 - 1
110
 
111
  if z_grad is not None:
112
- chex.assert_equal_shape([z_q, z_grad])
113
  z_q = jax.lax.stop_gradient(z_q - z_grad) + z_grad
114
 
115
  return self.proj_up(z_q)
@@ -216,7 +229,7 @@ class LookupFreeQuantization(nn.Module):
216
  + token_bit_log_probs[..., 1] @ token_bit_expansions
217
  ) # (batch_size, num_tokens, 2 ** num_dims)
218
  token_log_probs = jax.lax.stop_gradient(jax.nn.log_softmax(token_log_probs, axis=-1))
219
- chex.assert_shape(token_log_probs, (*x.shape[:-1], 2**self.num_dims))
220
 
221
  z_q = self.codebook[tokens]
222
  commit_loss = jnp.square(z - z_q).mean()
@@ -361,7 +374,7 @@ class TokenizerEncoderDecoder(nn.Module):
361
 
362
  if mask is not None:
363
  # mask is (batch_dims..., num_cross_tokens)
364
- chex.assert_equal_shape([y[..., 0], mask])
365
  attn_mask = einops.repeat(mask, "... kv -> ... 1 q kv", q=self.num_tokens)
366
  else:
367
  attn_mask = jnp.ones((*y.shape[:-2], 1, self.num_tokens, self.num_cross_tokens))
 
1
  import math
2
  from typing import Any, Literal
3
 
 
4
  from einops import einops
5
  from flax import linen as nn
6
  from flax.linen.module import Module
 
11
  import jax.numpy as jnp
12
 
13
 
14
+ def _assert_equal_shape(arrays: list[jnp.ndarray]) -> None:
15
+ if not arrays:
16
+ return
17
+ expected_shape = tuple(arrays[0].shape)
18
+ for array in arrays[1:]:
19
+ if tuple(array.shape) != expected_shape:
20
+ raise ValueError(f"Expected equal shapes, got {expected_shape} and {tuple(array.shape)}.")
21
+
22
+
23
+ def _assert_shape(array: jnp.ndarray, expected_shape: tuple[int, ...]) -> None:
24
+ if tuple(array.shape) != expected_shape:
25
+ raise ValueError(f"Expected shape {expected_shape}, got {tuple(array.shape)}.")
26
+
27
+
28
  class FsqCodebook(nn.Module):
29
  input_dim: int
30
  target_codebook_size: int
 
122
  z_q = digits / (bases - 1) * 2 - 1
123
 
124
  if z_grad is not None:
125
+ _assert_equal_shape([z_q, z_grad])
126
  z_q = jax.lax.stop_gradient(z_q - z_grad) + z_grad
127
 
128
  return self.proj_up(z_q)
 
229
  + token_bit_log_probs[..., 1] @ token_bit_expansions
230
  ) # (batch_size, num_tokens, 2 ** num_dims)
231
  token_log_probs = jax.lax.stop_gradient(jax.nn.log_softmax(token_log_probs, axis=-1))
232
+ _assert_shape(token_log_probs, (*x.shape[:-1], 2**self.num_dims))
233
 
234
  z_q = self.codebook[tokens]
235
  commit_loss = jnp.square(z - z_q).mean()
 
374
 
375
  if mask is not None:
376
  # mask is (batch_dims..., num_cross_tokens)
377
+ _assert_equal_shape([y[..., 0], mask])
378
  attn_mask = einops.repeat(mask, "... kv -> ... 1 q kv", q=self.num_tokens)
379
  else:
380
  attn_mask = jnp.ones((*y.shape[:-2], 1, self.num_tokens, self.num_cross_tokens))
openpi/src/openpi/models_pytorch/gemma_pytorch.py CHANGED
@@ -1,6 +1,6 @@
 
1
  from typing import Literal
2
 
3
- import pytest
4
  import torch
5
  from torch import nn
6
  from transformers import GemmaForCausalLM
@@ -16,11 +16,33 @@ class PaliGemmaWithExpertModel(nn.Module):
16
  action_expert_config,
17
  use_adarms=None,
18
  precision: Literal["bfloat16", "float32"] = "bfloat16",
 
 
 
19
  ):
20
  if use_adarms is None:
21
  use_adarms = [False, False]
22
  super().__init__()
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  vlm_config_hf = CONFIG_MAPPING["paligemma"]()
25
  vlm_config_hf._vocab_size = 257152 # noqa: SLF001
26
  vlm_config_hf.image_token_index = 257152
@@ -36,7 +58,6 @@ class PaliGemmaWithExpertModel(nn.Module):
36
  vlm_config_hf.text_config.use_adarms = use_adarms[0]
37
  vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
38
  vlm_config_hf.vision_config.intermediate_size = 4304
39
- # Keep image and language embedding dimensions aligned for all variants, including dummy.
40
  vlm_config_hf.vision_config.projection_dim = vlm_config.width
41
  vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
42
  vlm_config_hf.vision_config.torch_dtype = "float32"
@@ -51,16 +72,41 @@ class PaliGemmaWithExpertModel(nn.Module):
51
  vocab_size=257152,
52
  hidden_activation="gelu_pytorch_tanh",
53
  torch_dtype="float32",
54
- use_adarms=use_adarms[1],
55
- adarms_cond_dim=action_expert_config.width if use_adarms[1] else None,
56
  )
57
 
58
  self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
59
- self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
60
- self.gemma_expert.model.embed_tokens = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  self.to_bfloat16_for_selected_params(precision)
63
 
 
 
 
 
 
 
64
  def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"):
65
  if precision == "bfloat16":
66
  self.to(dtype=torch.bfloat16)
@@ -89,194 +135,214 @@ class PaliGemmaWithExpertModel(nn.Module):
89
  def embed_language_tokens(self, tokens: torch.Tensor):
90
  return self.paligemma.language_model.embed_tokens(tokens)
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def forward(
93
  self,
94
  attention_mask: torch.Tensor | None = None,
95
  position_ids: torch.LongTensor | None = None,
96
- past_key_values: list[torch.FloatTensor] | pytest.Cache | None = None,
97
- inputs_embeds: list[torch.FloatTensor] | None = None,
98
  use_cache: bool | None = None,
99
- adarms_cond: list[torch.Tensor] | None = None,
 
100
  ):
 
 
 
 
 
 
101
  if adarms_cond is None:
102
- adarms_cond = [None, None]
103
- if inputs_embeds[1] is None:
 
 
 
 
104
  prefix_output = self.paligemma.language_model.forward(
105
  inputs_embeds=inputs_embeds[0],
106
  attention_mask=attention_mask,
107
  position_ids=position_ids,
108
  past_key_values=past_key_values,
109
  use_cache=use_cache,
110
- adarms_cond=adarms_cond[0] if adarms_cond is not None else None,
111
  )
112
- prefix_past_key_values = prefix_output.past_key_values
113
- prefix_output = prefix_output.last_hidden_state
114
- suffix_output = None
115
- elif inputs_embeds[0] is None:
116
- suffix_output = self.gemma_expert.model.forward(
117
- inputs_embeds=inputs_embeds[1],
 
 
118
  attention_mask=attention_mask,
119
  position_ids=position_ids,
120
  past_key_values=past_key_values,
121
  use_cache=use_cache,
122
- adarms_cond=adarms_cond[1] if adarms_cond is not None else None,
123
  )
124
- suffix_output = suffix_output.last_hidden_state
125
- prefix_output = None
126
- prefix_past_key_values = None
127
- else:
128
- models = [self.paligemma.language_model, self.gemma_expert.model]
129
- num_layers = self.paligemma.config.text_config.num_hidden_layers
130
-
131
- # Check if gradient checkpointing is enabled for any of the models
132
- use_gradient_checkpointing = (
133
- hasattr(self.gemma_expert.model, "gradient_checkpointing")
134
- and self.gemma_expert.model.gradient_checkpointing
135
- and self.training
136
- ) or (hasattr(self, "gradient_checkpointing") and self.gradient_checkpointing and self.training)
137
-
138
- # Force enable gradient checkpointing if we're in training mode and the model supports it
139
- if self.training and hasattr(self.gemma_expert.model, "gradient_checkpointing"):
140
- if not self.gemma_expert.model.gradient_checkpointing:
141
- print("Forcing gradient checkpointing to be enabled for Gemma expert model")
142
- self.gemma_expert.model.gradient_checkpointing = True
143
- use_gradient_checkpointing = True
144
-
145
- # Debug gradient checkpointing status
146
- if hasattr(self, "_debug_gc_printed") and not self._debug_gc_printed:
147
- print(f"Gemma expert model gradient checkpointing: {use_gradient_checkpointing}")
148
- print(f"Model training mode: {self.training}")
149
- print(
150
- f"Gemma expert model has gradient_checkpointing attr: {hasattr(self.gemma_expert.model, 'gradient_checkpointing')}"
151
- )
152
- if hasattr(self.gemma_expert.model, "gradient_checkpointing"):
153
- print(
154
- f"Gemma expert model gradient_checkpointing value: {self.gemma_expert.model.gradient_checkpointing}"
155
- )
156
- self._debug_gc_printed = True
157
-
158
- # Define the complete layer computation function for gradient checkpointing
159
- def compute_layer_complete(layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond):
160
- models = [self.paligemma.language_model, self.gemma_expert.model]
161
-
162
- query_states = []
163
- key_states = []
164
- value_states = []
165
- gates = []
166
- for i, hidden_states in enumerate(inputs_embeds):
167
- layer = models[i].layers[layer_idx]
168
- hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i]) # noqa: PLW2901
169
- gates.append(gate)
170
-
171
- input_shape = hidden_states.shape[:-1]
172
- hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
173
- query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
174
- key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
175
- value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
176
-
177
- query_states.append(query_state)
178
- key_states.append(key_state)
179
- value_states.append(value_state)
180
-
181
- # Concatenate and process attention
182
- query_states = torch.cat(query_states, dim=2)
183
- key_states = torch.cat(key_states, dim=2)
184
- value_states = torch.cat(value_states, dim=2)
185
-
186
- dummy_tensor = torch.zeros(
187
- query_states.shape[0],
188
- query_states.shape[2],
189
- query_states.shape[-1],
190
- device=query_states.device,
191
- dtype=query_states.dtype,
192
- )
193
- cos, sin = self.paligemma.model.language_model.rotary_emb(dummy_tensor, position_ids)
194
- query_states, key_states = modeling_gemma.apply_rotary_pos_emb(
195
- query_states, key_states, cos, sin, unsqueeze_dim=1
196
- )
197
 
198
- batch_size = query_states.shape[0]
199
- scaling = self.paligemma.language_model.layers[layer_idx].self_attn.scaling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- # Attention computation
202
- att_output, _ = modeling_gemma.eager_attention_forward(
203
- self.paligemma.language_model.layers[layer_idx].self_attn,
204
- query_states,
205
- key_states,
206
- value_states,
207
- attention_mask,
208
- scaling,
209
  )
210
- # Get head_dim from the current layer, not from the model
211
- head_dim = self.paligemma.language_model.layers[layer_idx].self_attn.head_dim
212
- att_output = att_output.reshape(batch_size, -1, 1 * 8 * head_dim)
213
-
214
- # Process layer outputs
215
- outputs_embeds = []
216
- start_pos = 0
217
- for i, hidden_states in enumerate(inputs_embeds):
218
- layer = models[i].layers[layer_idx]
219
- end_pos = start_pos + hidden_states.shape[1]
220
-
221
- if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
222
- att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
223
- out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
224
-
225
- # first residual
226
- out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i]) # noqa: SLF001
227
- after_first_residual = out_emb.clone()
228
- out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
229
- # Convert to bfloat16 if the next layer (mlp) uses bfloat16
230
- if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
231
- out_emb = out_emb.to(dtype=torch.bfloat16)
232
-
233
- out_emb = layer.mlp(out_emb)
234
- # second residual
235
- out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate) # noqa: SLF001
236
- outputs_embeds.append(out_emb)
237
- start_pos = end_pos
238
-
239
- return outputs_embeds
240
-
241
- # Process all layers with gradient checkpointing if enabled
242
- for layer_idx in range(num_layers):
243
- if use_gradient_checkpointing:
244
- inputs_embeds = torch.utils.checkpoint.checkpoint(
245
- compute_layer_complete,
246
- layer_idx,
247
- inputs_embeds,
248
- attention_mask,
249
- position_ids,
250
- adarms_cond,
251
- use_reentrant=False,
252
- preserve_rng_state=False,
253
- )
254
- else:
255
- inputs_embeds = compute_layer_complete(
256
- layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond
257
- )
258
-
259
- # Old code removed - now using compute_layer_complete function above
260
-
261
- # final norm
262
- # Define final norm computation function for gradient checkpointing
263
- def compute_final_norms(inputs_embeds, adarms_cond):
264
- outputs_embeds = []
265
- for i, hidden_states in enumerate(inputs_embeds):
266
- out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
267
- outputs_embeds.append(out_emb)
268
- return outputs_embeds
269
-
270
- # Apply gradient checkpointing to final norm if enabled
271
  if use_gradient_checkpointing:
272
- outputs_embeds = torch.utils.checkpoint.checkpoint(
273
- compute_final_norms, inputs_embeds, adarms_cond, use_reentrant=False, preserve_rng_state=False
 
 
 
 
 
 
 
 
274
  )
275
  else:
276
- outputs_embeds = compute_final_norms(inputs_embeds, adarms_cond)
 
 
 
 
 
 
 
277
 
278
- prefix_output = outputs_embeds[0]
279
- suffix_output = outputs_embeds[1]
280
- prefix_past_key_values = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- return [prefix_output, suffix_output], prefix_past_key_values
 
1
+ from typing import Any
2
  from typing import Literal
3
 
 
4
  import torch
5
  from torch import nn
6
  from transformers import GemmaForCausalLM
 
16
  action_expert_config,
17
  use_adarms=None,
18
  precision: Literal["bfloat16", "float32"] = "bfloat16",
19
+ *,
20
+ num_action_experts: int = 1,
21
+ enable_cross_arm_communication: bool = False,
22
  ):
23
  if use_adarms is None:
24
  use_adarms = [False, False]
25
  super().__init__()
26
 
27
+ if num_action_experts < 1:
28
+ raise ValueError(f"num_action_experts must be positive, got {num_action_experts}.")
29
+ if enable_cross_arm_communication and num_action_experts < 2:
30
+ raise ValueError("Cross-arm communication requires at least two action experts.")
31
+
32
+ if len(use_adarms) == 2 and num_action_experts > 1:
33
+ use_adarms = [use_adarms[0], *([use_adarms[1]] * num_action_experts)]
34
+ if len(use_adarms) != num_action_experts + 1:
35
+ raise ValueError(
36
+ f"use_adarms must have one entry per stream, got {len(use_adarms)} for {num_action_experts + 1} streams."
37
+ )
38
+
39
+ expert_use_adarms = use_adarms[1]
40
+ if any(expert_flag != expert_use_adarms for expert_flag in use_adarms[1:]):
41
+ raise ValueError(f"All action expert streams must agree on use_adarms, got {use_adarms[1:]}.")
42
+
43
+ self.num_action_experts = num_action_experts
44
+ self.enable_cross_arm_communication = enable_cross_arm_communication
45
+
46
  vlm_config_hf = CONFIG_MAPPING["paligemma"]()
47
  vlm_config_hf._vocab_size = 257152 # noqa: SLF001
48
  vlm_config_hf.image_token_index = 257152
 
58
  vlm_config_hf.text_config.use_adarms = use_adarms[0]
59
  vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
60
  vlm_config_hf.vision_config.intermediate_size = 4304
 
61
  vlm_config_hf.vision_config.projection_dim = vlm_config.width
62
  vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
63
  vlm_config_hf.vision_config.torch_dtype = "float32"
 
72
  vocab_size=257152,
73
  hidden_activation="gelu_pytorch_tanh",
74
  torch_dtype="float32",
75
+ use_adarms=expert_use_adarms,
76
+ adarms_cond_dim=action_expert_config.width if expert_use_adarms else None,
77
  )
78
 
79
  self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
80
+ if num_action_experts == 1:
81
+ self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
82
+ self.gemma_expert.model.embed_tokens = None
83
+ self._action_expert_names = ("gemma_expert",)
84
+ else:
85
+ self.left_gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
86
+ self.right_gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
87
+ self.left_gemma_expert.model.embed_tokens = None
88
+ self.right_gemma_expert.model.embed_tokens = None
89
+ self._action_expert_names = ("left_gemma_expert", "right_gemma_expert")
90
+
91
+ if self.enable_cross_arm_communication:
92
+ self.cross_arm_comm = nn.Parameter(torch.zeros(action_expert_config.depth, dtype=torch.float32))
93
+ self.register_buffer(
94
+ "latest_cross_arm_attention_mass",
95
+ torch.zeros(action_expert_config.depth, dtype=torch.float32),
96
+ persistent=False,
97
+ )
98
+ else:
99
+ self.cross_arm_comm = None
100
+ self.latest_cross_arm_attention_mass = None
101
 
102
  self.to_bfloat16_for_selected_params(precision)
103
 
104
+ def _get_action_experts(self) -> list[GemmaForCausalLM]:
105
+ return [getattr(self, name) for name in self._action_expert_names]
106
+
107
+ def _get_action_expert_models(self) -> list[nn.Module]:
108
+ return [expert.model for expert in self._get_action_experts()]
109
+
110
  def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"):
111
  if precision == "bfloat16":
112
  self.to(dtype=torch.bfloat16)
 
135
  def embed_language_tokens(self, tokens: torch.Tensor):
136
  return self.paligemma.language_model.embed_tokens(tokens)
137
 
138
+ def _make_outputs(self, prefix_output, suffix_outputs: list[torch.Tensor | None]) -> list[torch.Tensor | None]:
139
+ return [prefix_output, *suffix_outputs]
140
+
141
+ def _compute_cross_arm_attention_mass(
142
+ self,
143
+ layer_idx: int,
144
+ att_weights: torch.Tensor,
145
+ cross_attention_selector: torch.Tensor | None,
146
+ ) -> None:
147
+ if self.latest_cross_arm_attention_mass is None or cross_attention_selector is None:
148
+ return
149
+ selector = cross_attention_selector.to(device=att_weights.device, dtype=att_weights.dtype)
150
+ denom = selector.sum() * att_weights.shape[0] * att_weights.shape[1]
151
+ if float(denom.item()) <= 0:
152
+ self.latest_cross_arm_attention_mass[layer_idx].zero_()
153
+ return
154
+ mass = (att_weights * selector).sum() / denom
155
+ self.latest_cross_arm_attention_mass[layer_idx].copy_(mass.detach().to(torch.float32))
156
+
157
  def forward(
158
  self,
159
  attention_mask: torch.Tensor | None = None,
160
  position_ids: torch.LongTensor | None = None,
161
+ past_key_values: list[torch.FloatTensor] | Any | None = None,
162
+ inputs_embeds: list[torch.FloatTensor | None] | None = None,
163
  use_cache: bool | None = None,
164
+ adarms_cond: list[torch.Tensor | None] | None = None,
165
+ cross_attention_selector: torch.Tensor | None = None,
166
  ):
167
+ if inputs_embeds is None:
168
+ raise ValueError("inputs_embeds is required.")
169
+ if len(inputs_embeds) != self.num_action_experts + 1:
170
+ raise ValueError(
171
+ f"Expected {self.num_action_experts + 1} input streams, got {len(inputs_embeds)}."
172
+ )
173
  if adarms_cond is None:
174
+ adarms_cond = [None] * len(inputs_embeds)
175
+ if len(adarms_cond) != len(inputs_embeds):
176
+ raise ValueError(f"Expected {len(inputs_embeds)} adarms_cond entries, got {len(adarms_cond)}.")
177
+
178
+ suffix_inputs = inputs_embeds[1:]
179
+ if inputs_embeds[0] is not None and all(suffix is None for suffix in suffix_inputs):
180
  prefix_output = self.paligemma.language_model.forward(
181
  inputs_embeds=inputs_embeds[0],
182
  attention_mask=attention_mask,
183
  position_ids=position_ids,
184
  past_key_values=past_key_values,
185
  use_cache=use_cache,
186
+ adarms_cond=adarms_cond[0],
187
  )
188
+ outputs = self._make_outputs(prefix_output.last_hidden_state, [None] * self.num_action_experts)
189
+ return outputs, prefix_output.past_key_values
190
+
191
+ active_suffix_indices = [i for i, suffix in enumerate(suffix_inputs) if suffix is not None]
192
+ if inputs_embeds[0] is None and len(active_suffix_indices) == 1:
193
+ expert_idx = active_suffix_indices[0]
194
+ suffix_output = self._get_action_expert_models()[expert_idx].forward(
195
+ inputs_embeds=suffix_inputs[expert_idx],
196
  attention_mask=attention_mask,
197
  position_ids=position_ids,
198
  past_key_values=past_key_values,
199
  use_cache=use_cache,
200
+ adarms_cond=adarms_cond[expert_idx + 1],
201
  )
202
+ outputs = [None] * len(inputs_embeds)
203
+ outputs[expert_idx + 1] = suffix_output.last_hidden_state
204
+ return outputs, None
205
+
206
+ if inputs_embeds[0] is None:
207
+ raise NotImplementedError("Multi-stream suffix-only forward is not implemented.")
208
+ if any(suffix is None for suffix in suffix_inputs):
209
+ raise ValueError("Joint forward requires all suffix streams to be present.")
210
+
211
+ models = [self.paligemma.language_model, *self._get_action_expert_models()]
212
+ num_layers = self.paligemma.config.text_config.num_hidden_layers
213
+ use_gradient_checkpointing = self.training and any(
214
+ getattr(model, "gradient_checkpointing", False) for model in models
215
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
+ def compute_layer_complete(
218
+ layer_idx,
219
+ stream_inputs,
220
+ attention_mask,
221
+ position_ids,
222
+ adarms_cond,
223
+ cross_attention_selector,
224
+ ):
225
+ query_states = []
226
+ key_states = []
227
+ value_states = []
228
+ gates = []
229
+ for stream_idx, hidden_states in enumerate(stream_inputs):
230
+ layer = models[stream_idx].layers[layer_idx]
231
+ hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[stream_idx]) # noqa: PLW2901
232
+ gates.append(gate)
233
+
234
+ input_shape = hidden_states.shape[:-1]
235
+ hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
236
+ query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
237
+ key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
238
+ value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
239
+
240
+ query_states.append(query_state)
241
+ key_states.append(key_state)
242
+ value_states.append(value_state)
243
+
244
+ query_states = torch.cat(query_states, dim=2)
245
+ key_states = torch.cat(key_states, dim=2)
246
+ value_states = torch.cat(value_states, dim=2)
247
+
248
+ dummy_tensor = torch.zeros(
249
+ query_states.shape[0],
250
+ query_states.shape[2],
251
+ query_states.shape[-1],
252
+ device=query_states.device,
253
+ dtype=query_states.dtype,
254
+ )
255
+ cos, sin = self.paligemma.model.language_model.rotary_emb(dummy_tensor, position_ids)
256
+ query_states, key_states = modeling_gemma.apply_rotary_pos_emb(
257
+ query_states, key_states, cos, sin, unsqueeze_dim=1
258
+ )
259
 
260
+ layer_attention_mask = attention_mask
261
+ if self.cross_arm_comm is not None and cross_attention_selector is not None:
262
+ cross_bias = self.cross_arm_comm[layer_idx].to(device=attention_mask.device, dtype=attention_mask.dtype)
263
+ layer_attention_mask = attention_mask + cross_bias * cross_attention_selector.to(
264
+ device=attention_mask.device,
265
+ dtype=attention_mask.dtype,
 
 
266
  )
267
+
268
+ batch_size = query_states.shape[0]
269
+ scaling = self.paligemma.language_model.layers[layer_idx].self_attn.scaling
270
+ att_output, att_weights = modeling_gemma.eager_attention_forward(
271
+ self.paligemma.language_model.layers[layer_idx].self_attn,
272
+ query_states,
273
+ key_states,
274
+ value_states,
275
+ layer_attention_mask,
276
+ scaling,
277
+ )
278
+ self._compute_cross_arm_attention_mass(layer_idx, att_weights, cross_attention_selector)
279
+
280
+ proj_dim = self.paligemma.language_model.layers[layer_idx].self_attn.o_proj.in_features
281
+ att_output = att_output.reshape(batch_size, -1, proj_dim)
282
+
283
+ outputs_embeds = []
284
+ start_pos = 0
285
+ for stream_idx, hidden_states in enumerate(stream_inputs):
286
+ layer = models[stream_idx].layers[layer_idx]
287
+ end_pos = start_pos + hidden_states.shape[1]
288
+
289
+ stream_att_output = att_output[:, start_pos:end_pos]
290
+ if stream_att_output.dtype != layer.self_attn.o_proj.weight.dtype:
291
+ stream_att_output = stream_att_output.to(layer.self_attn.o_proj.weight.dtype)
292
+ out_emb = layer.self_attn.o_proj(stream_att_output)
293
+
294
+ out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[stream_idx]) # noqa: SLF001
295
+ after_first_residual = out_emb.clone()
296
+ out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[stream_idx])
297
+ if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
298
+ out_emb = out_emb.to(dtype=torch.bfloat16)
299
+
300
+ out_emb = layer.mlp(out_emb)
301
+ out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate) # noqa: SLF001
302
+ outputs_embeds.append(out_emb)
303
+ start_pos = end_pos
304
+
305
+ return outputs_embeds
306
+
307
+ for layer_idx in range(num_layers):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  if use_gradient_checkpointing:
309
+ inputs_embeds = torch.utils.checkpoint.checkpoint(
310
+ compute_layer_complete,
311
+ layer_idx,
312
+ inputs_embeds,
313
+ attention_mask,
314
+ position_ids,
315
+ adarms_cond,
316
+ cross_attention_selector,
317
+ use_reentrant=False,
318
+ preserve_rng_state=False,
319
  )
320
  else:
321
+ inputs_embeds = compute_layer_complete(
322
+ layer_idx,
323
+ inputs_embeds,
324
+ attention_mask,
325
+ position_ids,
326
+ adarms_cond,
327
+ cross_attention_selector,
328
+ )
329
 
330
+ def compute_final_norms(stream_inputs, adarms_cond):
331
+ outputs_embeds = []
332
+ for stream_idx, hidden_states in enumerate(stream_inputs):
333
+ out_emb, _ = models[stream_idx].norm(hidden_states, cond=adarms_cond[stream_idx])
334
+ outputs_embeds.append(out_emb)
335
+ return outputs_embeds
336
+
337
+ if use_gradient_checkpointing:
338
+ outputs_embeds = torch.utils.checkpoint.checkpoint(
339
+ compute_final_norms,
340
+ inputs_embeds,
341
+ adarms_cond,
342
+ use_reentrant=False,
343
+ preserve_rng_state=False,
344
+ )
345
+ else:
346
+ outputs_embeds = compute_final_norms(inputs_embeds, adarms_cond)
347
 
348
+ return outputs_embeds, None
openpi/src/openpi/models_pytorch/pi0_pytorch.py CHANGED
@@ -15,7 +15,6 @@ import openpi.models_pytorch.preprocessing_pytorch as _preprocessing
15
  def get_safe_dtype(target_dtype, device_type):
16
  """Get a safe dtype for the given device type."""
17
  if device_type == "cpu":
18
- # CPU doesn't support bfloat16, use float32 instead
19
  if target_dtype == torch.bfloat16:
20
  return torch.float32
21
  if target_dtype == torch.float64:
@@ -29,15 +28,12 @@ def create_sinusoidal_pos_embedding(
29
  """Computes sine-cosine positional embedding vectors for scalar positions."""
30
  if dimension % 2 != 0:
31
  raise ValueError(f"dimension ({dimension}) must be divisible by 2")
32
-
33
  if time.ndim != 1:
34
  raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
35
 
36
  dtype = get_safe_dtype(torch.float64, device.type)
37
  fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
38
  period = min_period * (max_period / min_period) ** fraction
39
-
40
- # Compute the outer product
41
  scaling_factor = 1.0 / period * 2 * math.pi
42
  sin_input = scaling_factor[None, :] * time[:, None]
43
  return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
@@ -51,26 +47,7 @@ def sample_beta(alpha, beta, bsize, device):
51
 
52
 
53
  def make_att_2d_masks(pad_masks, att_masks):
54
- """Copied from big_vision.
55
-
56
- Tokens can attend to valid inputs tokens which have a cumulative mask_ar
57
- smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
58
- setup several types of attention, for example:
59
-
60
- [[1 1 1 1 1 1]]: pure causal attention.
61
-
62
- [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
63
- themselves and the last 3 tokens have a causal attention. The first
64
- entry could also be a 1 without changing behaviour.
65
-
66
- [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
67
- block can attend all previous blocks and all tokens on the same block.
68
-
69
- Args:
70
- input_mask: bool[B, N] true if its part of the input, false if padding.
71
- mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
72
- it and 0 where it shares the same attention mask as the previous token.
73
- """
74
  if att_masks.ndim != 2:
75
  raise ValueError(att_masks.ndim)
76
  if pad_masks.ndim != 2:
@@ -89,26 +66,36 @@ class PI0Pytorch(nn.Module):
89
  self.pi05 = config.pi05
90
  self.arm_action_dims = tuple(config.arm_action_dims)
91
  self.use_parallel_action_heads = config.use_parallel_action_heads
 
 
 
92
  self.action_split_dims = list(self.arm_action_dims)
93
 
 
 
 
94
  paligemma_config = _gemma.get_config(config.paligemma_variant)
95
  action_expert_config = _gemma.get_config(config.action_expert_variant)
96
  self.action_expert_width = action_expert_config.width
97
 
 
98
  self.paligemma_with_expert = PaliGemmaWithExpertModel(
99
  paligemma_config,
100
  action_expert_config,
101
  use_adarms=[False, True] if self.pi05 else [False, False],
102
  precision=config.dtype,
 
 
103
  )
104
 
105
  if self.use_parallel_action_heads:
106
  self.action_in_proj_arms = nn.ModuleList(
107
  [nn.Linear(arm_dim, action_expert_config.width) for arm_dim in self.arm_action_dims]
108
  )
109
- self.arm_token_fuse = nn.Linear(
110
- len(self.arm_action_dims) * action_expert_config.width, action_expert_config.width
111
- )
 
112
  self.action_out_proj_arms = nn.ModuleList(
113
  [nn.Linear(action_expert_config.width, arm_dim) for arm_dim in self.arm_action_dims]
114
  )
@@ -128,7 +115,6 @@ class PI0Pytorch(nn.Module):
128
  if os.environ.get("OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS", "0") == "1":
129
  self.sample_actions = torch.compile(self.sample_actions, mode="max-autotune")
130
 
131
- # Initialize gradient checkpointing flag
132
  self.gradient_checkpointing_enabled = False
133
 
134
  msg = "transformers_replace is not installed correctly. Please install it with `uv pip install transformers==4.53.2` and `cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/`."
@@ -140,30 +126,29 @@ class PI0Pytorch(nn.Module):
140
  except ImportError:
141
  raise ValueError(msg) from None
142
 
 
 
 
143
  def gradient_checkpointing_enable(self):
144
- """Enable gradient checkpointing for memory optimization."""
145
  self.gradient_checkpointing_enabled = True
146
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
147
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
148
- self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
149
-
150
  logging.info("Enabled gradient checkpointing for PI0Pytorch model")
151
 
152
  def gradient_checkpointing_disable(self):
153
- """Disable gradient checkpointing."""
154
  self.gradient_checkpointing_enabled = False
155
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
156
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
157
- self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
158
-
159
  logging.info("Disabled gradient checkpointing for PI0Pytorch model")
160
 
161
  def is_gradient_checkpointing_enabled(self):
162
- """Check if gradient checkpointing is enabled."""
163
  return self.gradient_checkpointing_enabled
164
 
165
  def _apply_checkpoint(self, func, *args, **kwargs):
166
- """Helper method to apply gradient checkpointing if enabled."""
167
  if self.gradient_checkpointing_enabled and self.training:
168
  return torch.utils.checkpoint.checkpoint(
169
  func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
@@ -171,12 +156,10 @@ class PI0Pytorch(nn.Module):
171
  return func(*args, **kwargs)
172
 
173
  def _prepare_attention_masks_4d(self, att_2d_masks):
174
- """Helper method to prepare 4D attention masks for transformer."""
175
  att_2d_masks_4d = att_2d_masks[:, None, :, :]
176
  return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
177
 
178
  def _preprocess_observation(self, observation, *, train=True):
179
- """Helper method to preprocess observation."""
180
  observation = _preprocessing.preprocess_observation_pytorch(observation, train=train)
181
  return (
182
  list(observation.images.values()),
@@ -187,13 +170,7 @@ class PI0Pytorch(nn.Module):
187
  )
188
 
189
  def sample_noise(self, shape, device):
190
- return torch.normal(
191
- mean=0.0,
192
- std=1.0,
193
- size=shape,
194
- dtype=torch.float32,
195
- device=device,
196
- )
197
 
198
  def sample_time(self, bsize, device):
199
  time_beta = sample_beta(1.5, 1.0, bsize, device)
@@ -217,6 +194,9 @@ class PI0Pytorch(nn.Module):
217
 
218
  per_arm_embeddings.append(self._apply_checkpoint(arm_proj_func, arm_actions))
219
 
 
 
 
220
  fused_inputs = torch.cat(per_arm_embeddings, dim=-1)
221
 
222
  def fuse_func(fused_inputs):
@@ -232,68 +212,69 @@ class PI0Pytorch(nn.Module):
232
 
233
  return self._apply_checkpoint(action_out_proj_func, suffix_out)
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  per_arm_outputs = []
236
- for arm_head in self.action_out_proj_arms:
237
 
238
- def arm_out_func(suffix_out, arm_head=arm_head):
239
- return arm_head(suffix_out)
240
 
241
- per_arm_outputs.append(self._apply_checkpoint(arm_out_func, suffix_out))
242
  return torch.cat(per_arm_outputs, dim=-1)
243
 
244
  def embed_prefix(
245
  self, images, img_masks, lang_tokens, lang_masks
246
  ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
247
- """Embed images with SigLIP and language tokens with embedding layer to prepare
248
- for PaliGemma transformer processing.
249
- """
250
  embs = []
251
  pad_masks = []
252
  att_masks = []
253
 
254
- # Process images
255
  for img, img_mask in zip(images, img_masks, strict=True):
256
 
257
  def image_embed_func(img):
258
  return self.paligemma_with_expert.embed_image(img)
259
 
260
  img_emb = self._apply_checkpoint(image_embed_func, img)
261
-
262
  bsize, num_img_embs = img_emb.shape[:2]
263
-
264
  embs.append(img_emb)
265
  pad_masks.append(img_mask[:, None].expand(bsize, num_img_embs))
266
-
267
- # Create attention masks so that image tokens attend to each other
268
  att_masks += [0] * num_img_embs
269
 
270
- # Process language tokens
271
  def lang_embed_func(lang_tokens):
272
  lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
273
- lang_emb_dim = lang_emb.shape[-1]
274
- return lang_emb * math.sqrt(lang_emb_dim)
275
 
276
  lang_emb = self._apply_checkpoint(lang_embed_func, lang_tokens)
277
-
278
  embs.append(lang_emb)
279
  pad_masks.append(lang_masks)
280
-
281
- # full attention between image and language inputs
282
- num_lang_embs = lang_emb.shape[1]
283
- att_masks += [0] * num_lang_embs
284
 
285
  embs = torch.cat(embs, dim=1)
286
  pad_masks = torch.cat(pad_masks, dim=1)
287
  att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
288
-
289
- # Get batch size from the first dimension of the concatenated tensors
290
- bsize = pad_masks.shape[0]
291
- att_masks = att_masks[None, :].expand(bsize, len(att_masks))
292
-
293
  return embs, pad_masks, att_masks
294
 
 
 
 
 
295
  def embed_suffix(self, state, noisy_actions, timestep):
296
- """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
 
 
297
  embs = []
298
  pad_masks = []
299
  att_masks = []
@@ -302,79 +283,186 @@ class PI0Pytorch(nn.Module):
302
  if self.state_proj.weight.dtype == torch.float32:
303
  state = state.to(torch.float32)
304
 
305
- # Embed state
306
  def state_proj_func(state):
307
  return self.state_proj(state)
308
 
309
  state_emb = self._apply_checkpoint(state_proj_func, state)
310
-
311
  embs.append(state_emb[:, None, :])
312
  bsize = state_emb.shape[0]
313
  device = state_emb.device
314
-
315
- state_mask = torch.ones(bsize, 1, dtype=torch.bool, device=device)
316
- pad_masks.append(state_mask)
317
-
318
- # Set attention masks so that image and language inputs do not attend to state or actions
319
  att_masks += [1]
320
 
321
- # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
322
  time_emb = create_sinusoidal_pos_embedding(
323
  timestep, self.action_expert_width, min_period=4e-3, max_period=4.0, device=timestep.device
324
  )
325
  time_emb = time_emb.type(dtype=timestep.dtype)
326
-
327
- # Fuse timestep + action information using an MLP
328
  action_emb = self._project_action_inputs(noisy_actions)
329
 
330
  if not self.pi05:
331
  time_emb = time_emb[:, None, :].expand_as(action_emb)
332
  action_time_emb = torch.cat([action_emb, time_emb], dim=2)
333
 
334
- # Apply MLP layers
335
  def mlp_func(action_time_emb):
336
  x = self.action_time_mlp_in(action_time_emb)
337
- x = F.silu(x) # swish == silu
338
  return self.action_time_mlp_out(x)
339
 
340
  action_time_emb = self._apply_checkpoint(mlp_func, action_time_emb)
341
  adarms_cond = None
342
  else:
343
- # time MLP (for adaRMS)
344
  def time_mlp_func(time_emb):
345
  x = self.time_mlp_in(time_emb)
346
- x = F.silu(x) # swish == silu
347
  x = self.time_mlp_out(x)
348
  return F.silu(x)
349
 
350
  time_emb = self._apply_checkpoint(time_mlp_func, time_emb)
351
  action_time_emb = action_emb
352
- adarms_cond = time_emb
 
 
 
 
 
 
 
 
 
 
 
353
 
354
- # Add to input tokens
355
  embs.append(action_time_emb)
356
-
357
  bsize, action_time_dim = action_time_emb.shape[:2]
358
- action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
359
- pad_masks.append(action_time_mask)
360
-
361
- # Set attention masks so that image, language and state inputs do not attend to action tokens
362
  att_masks += [1] + ([0] * (self.config.action_horizon - 1))
363
 
364
  embs = torch.cat(embs, dim=1)
365
  pad_masks = torch.cat(pad_masks, dim=1)
366
  att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
367
  att_masks = att_masks[None, :].expand(bsize, len(att_masks))
368
-
369
  return embs, pad_masks, att_masks, adarms_cond
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  def forward(self, observation, actions, noise=None, time=None) -> Tensor:
372
- """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
373
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=True)
374
 
375
  if noise is None:
376
  noise = self.sample_noise(actions.shape, actions.device)
377
-
378
  if time is None:
379
  time = self.sample_time(actions.shape[0], actions.device)
380
 
@@ -384,60 +472,55 @@ class PI0Pytorch(nn.Module):
384
 
385
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
386
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)
387
- if (
388
- self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
389
- == torch.bfloat16
390
- ):
391
- suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
392
- prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
393
-
394
- pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
395
- att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
396
-
397
- att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
398
- position_ids = torch.cumsum(pad_masks, dim=1) - 1
399
-
400
- # Prepare attention masks
401
- att_2d_masks_4d = self._prepare_attention_masks_4d(att_2d_masks)
402
-
403
- # Apply gradient checkpointing if enabled
404
- def forward_func(prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond):
405
- (_, suffix_out), _ = self.paligemma_with_expert.forward(
406
- attention_mask=att_2d_masks_4d,
407
- position_ids=position_ids,
408
- past_key_values=None,
409
- inputs_embeds=[prefix_embs, suffix_embs],
410
- use_cache=False,
411
- adarms_cond=[None, adarms_cond],
412
- )
413
- return suffix_out
414
-
415
- suffix_out = self._apply_checkpoint(
416
- forward_func, prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond
417
  )
418
 
419
- suffix_out = suffix_out[:, -self.config.action_horizon :]
420
- suffix_out = suffix_out.to(dtype=torch.float32)
 
 
421
 
422
  v_t = self._project_action_outputs(suffix_out)
423
-
424
  return F.mse_loss(u_t, v_t, reduction="none")
425
 
426
  @torch.no_grad()
427
  def sample_actions(self, device, observation, noise=None, num_steps=10) -> Tensor:
428
- """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
429
  bsize = observation.state.shape[0]
430
  if noise is None:
431
  actions_shape = (bsize, self.config.action_horizon, self.config.action_dim)
432
  noise = self.sample_noise(actions_shape, device)
433
 
434
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=False)
435
-
436
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
438
  prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
439
-
440
- # Compute image and language key value cache
441
  prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
442
  self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager" # noqa: SLF001
443
 
@@ -449,26 +532,35 @@ class PI0Pytorch(nn.Module):
449
  use_cache=True,
450
  )
451
 
452
- dt = -1.0 / num_steps
453
- dt = torch.tensor(dt, dtype=torch.float32, device=device)
454
-
455
- x_t = noise
456
- time = torch.tensor(1.0, dtype=torch.float32, device=device)
457
  while time >= -dt / 2:
458
  expanded_time = time.expand(bsize)
459
- v_t = self.denoise_step(
460
- state,
461
- prefix_pad_masks,
462
- past_key_values,
463
- x_t,
464
- expanded_time,
465
- )
466
-
467
- # Euler step - use new tensor assignment instead of in-place operation
468
  x_t = x_t + dt * v_t
469
  time += dt
470
  return x_t
471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  def denoise_step(
473
  self,
474
  state,
@@ -477,7 +569,6 @@ class PI0Pytorch(nn.Module):
477
  x_t,
478
  timestep,
479
  ):
480
- """Apply one denoising step of the noise `x_t` at a given timestep."""
481
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, timestep)
482
 
483
  suffix_len = suffix_pad_masks.shape[1]
@@ -485,15 +576,12 @@ class PI0Pytorch(nn.Module):
485
  prefix_len = prefix_pad_masks.shape[1]
486
 
487
  prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
488
-
489
  suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
490
-
491
  full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
492
 
493
  prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
494
  position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
495
 
496
- # Prepare attention masks
497
  full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
498
  self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager" # noqa: SLF001
499
 
 
15
  def get_safe_dtype(target_dtype, device_type):
16
  """Get a safe dtype for the given device type."""
17
  if device_type == "cpu":
 
18
  if target_dtype == torch.bfloat16:
19
  return torch.float32
20
  if target_dtype == torch.float64:
 
28
  """Computes sine-cosine positional embedding vectors for scalar positions."""
29
  if dimension % 2 != 0:
30
  raise ValueError(f"dimension ({dimension}) must be divisible by 2")
 
31
  if time.ndim != 1:
32
  raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
33
 
34
  dtype = get_safe_dtype(torch.float64, device.type)
35
  fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
36
  period = min_period * (max_period / min_period) ** fraction
 
 
37
  scaling_factor = 1.0 / period * 2 * math.pi
38
  sin_input = scaling_factor[None, :] * time[:, None]
39
  return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
 
47
 
48
 
49
  def make_att_2d_masks(pad_masks, att_masks):
50
+ """Copied from big_vision."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if att_masks.ndim != 2:
52
  raise ValueError(att_masks.ndim)
53
  if pad_masks.ndim != 2:
 
66
  self.pi05 = config.pi05
67
  self.arm_action_dims = tuple(config.arm_action_dims)
68
  self.use_parallel_action_heads = config.use_parallel_action_heads
69
+ self.use_split_action_expert = config.use_split_action_expert
70
+ self.use_communicating_action_expert = config.use_communicating_action_expert
71
+ self.action_expert_mode = config.action_expert_mode
72
  self.action_split_dims = list(self.arm_action_dims)
73
 
74
+ if self.use_split_action_expert and not self.pi05:
75
+ raise NotImplementedError("Split action experts are currently implemented only for pi0.5 models.")
76
+
77
  paligemma_config = _gemma.get_config(config.paligemma_variant)
78
  action_expert_config = _gemma.get_config(config.action_expert_variant)
79
  self.action_expert_width = action_expert_config.width
80
 
81
+ num_action_experts = len(self.arm_action_dims) if self.use_split_action_expert else 1
82
  self.paligemma_with_expert = PaliGemmaWithExpertModel(
83
  paligemma_config,
84
  action_expert_config,
85
  use_adarms=[False, True] if self.pi05 else [False, False],
86
  precision=config.dtype,
87
+ num_action_experts=num_action_experts,
88
+ enable_cross_arm_communication=self.use_communicating_action_expert,
89
  )
90
 
91
  if self.use_parallel_action_heads:
92
  self.action_in_proj_arms = nn.ModuleList(
93
  [nn.Linear(arm_dim, action_expert_config.width) for arm_dim in self.arm_action_dims]
94
  )
95
+ if self.action_expert_mode == "head_only_parallel":
96
+ self.arm_token_fuse = nn.Linear(
97
+ len(self.arm_action_dims) * action_expert_config.width, action_expert_config.width
98
+ )
99
  self.action_out_proj_arms = nn.ModuleList(
100
  [nn.Linear(action_expert_config.width, arm_dim) for arm_dim in self.arm_action_dims]
101
  )
 
115
  if os.environ.get("OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS", "0") == "1":
116
  self.sample_actions = torch.compile(self.sample_actions, mode="max-autotune")
117
 
 
118
  self.gradient_checkpointing_enabled = False
119
 
120
  msg = "transformers_replace is not installed correctly. Please install it with `uv pip install transformers==4.53.2` and `cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/`."
 
126
  except ImportError:
127
  raise ValueError(msg) from None
128
 
129
+ def _expert_models(self) -> list[nn.Module]:
130
+ return self.paligemma_with_expert._get_action_expert_models()
131
+
132
  def gradient_checkpointing_enable(self):
 
133
  self.gradient_checkpointing_enabled = True
134
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
135
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
136
+ for expert_model in self._expert_models():
137
+ expert_model.gradient_checkpointing = True
138
  logging.info("Enabled gradient checkpointing for PI0Pytorch model")
139
 
140
  def gradient_checkpointing_disable(self):
 
141
  self.gradient_checkpointing_enabled = False
142
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
143
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
144
+ for expert_model in self._expert_models():
145
+ expert_model.gradient_checkpointing = False
146
  logging.info("Disabled gradient checkpointing for PI0Pytorch model")
147
 
148
  def is_gradient_checkpointing_enabled(self):
 
149
  return self.gradient_checkpointing_enabled
150
 
151
  def _apply_checkpoint(self, func, *args, **kwargs):
 
152
  if self.gradient_checkpointing_enabled and self.training:
153
  return torch.utils.checkpoint.checkpoint(
154
  func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
 
156
  return func(*args, **kwargs)
157
 
158
  def _prepare_attention_masks_4d(self, att_2d_masks):
 
159
  att_2d_masks_4d = att_2d_masks[:, None, :, :]
160
  return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
161
 
162
  def _preprocess_observation(self, observation, *, train=True):
 
163
  observation = _preprocessing.preprocess_observation_pytorch(observation, train=train)
164
  return (
165
  list(observation.images.values()),
 
170
  )
171
 
172
  def sample_noise(self, shape, device):
173
+ return torch.normal(mean=0.0, std=1.0, size=shape, dtype=torch.float32, device=device)
 
 
 
 
 
 
174
 
175
  def sample_time(self, bsize, device):
176
  time_beta = sample_beta(1.5, 1.0, bsize, device)
 
194
 
195
  per_arm_embeddings.append(self._apply_checkpoint(arm_proj_func, arm_actions))
196
 
197
+ if self.use_split_action_expert:
198
+ return per_arm_embeddings
199
+
200
  fused_inputs = torch.cat(per_arm_embeddings, dim=-1)
201
 
202
  def fuse_func(fused_inputs):
 
212
 
213
  return self._apply_checkpoint(action_out_proj_func, suffix_out)
214
 
215
+ if not self.use_split_action_expert:
216
+ per_arm_outputs = []
217
+ for arm_head in self.action_out_proj_arms:
218
+
219
+ def arm_out_func(suffix_out, arm_head=arm_head):
220
+ return arm_head(suffix_out)
221
+
222
+ per_arm_outputs.append(self._apply_checkpoint(arm_out_func, suffix_out))
223
+ return torch.cat(per_arm_outputs, dim=-1)
224
+
225
+ if len(suffix_out) != len(self.action_out_proj_arms):
226
+ raise ValueError(f"Expected {len(self.action_out_proj_arms)} arm outputs, got {len(suffix_out)}.")
227
+
228
  per_arm_outputs = []
229
+ for arm_head, arm_suffix_out in zip(self.action_out_proj_arms, suffix_out, strict=True):
230
 
231
+ def arm_out_func(arm_suffix_out, arm_head=arm_head):
232
+ return arm_head(arm_suffix_out)
233
 
234
+ per_arm_outputs.append(self._apply_checkpoint(arm_out_func, arm_suffix_out))
235
  return torch.cat(per_arm_outputs, dim=-1)
236
 
237
  def embed_prefix(
238
  self, images, img_masks, lang_tokens, lang_masks
239
  ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
240
  embs = []
241
  pad_masks = []
242
  att_masks = []
243
 
 
244
  for img, img_mask in zip(images, img_masks, strict=True):
245
 
246
  def image_embed_func(img):
247
  return self.paligemma_with_expert.embed_image(img)
248
 
249
  img_emb = self._apply_checkpoint(image_embed_func, img)
 
250
  bsize, num_img_embs = img_emb.shape[:2]
 
251
  embs.append(img_emb)
252
  pad_masks.append(img_mask[:, None].expand(bsize, num_img_embs))
 
 
253
  att_masks += [0] * num_img_embs
254
 
 
255
  def lang_embed_func(lang_tokens):
256
  lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
257
+ return lang_emb * math.sqrt(lang_emb.shape[-1])
 
258
 
259
  lang_emb = self._apply_checkpoint(lang_embed_func, lang_tokens)
 
260
  embs.append(lang_emb)
261
  pad_masks.append(lang_masks)
262
+ att_masks += [0] * lang_emb.shape[1]
 
 
 
263
 
264
  embs = torch.cat(embs, dim=1)
265
  pad_masks = torch.cat(pad_masks, dim=1)
266
  att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
267
+ att_masks = att_masks[None, :].expand(pad_masks.shape[0], len(att_masks))
 
 
 
 
268
  return embs, pad_masks, att_masks
269
 
270
+ def _action_att_mask(self, batch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
271
+ att_mask = torch.tensor([1] + ([0] * (self.config.action_horizon - 1)), dtype=dtype, device=device)
272
+ return att_mask[None, :].expand(batch_size, -1)
273
+
274
  def embed_suffix(self, state, noisy_actions, timestep):
275
+ if self.use_split_action_expert and not self.pi05:
276
+ raise NotImplementedError("Split action experts are currently implemented only for pi0.5 models.")
277
+
278
  embs = []
279
  pad_masks = []
280
  att_masks = []
 
283
  if self.state_proj.weight.dtype == torch.float32:
284
  state = state.to(torch.float32)
285
 
 
286
  def state_proj_func(state):
287
  return self.state_proj(state)
288
 
289
  state_emb = self._apply_checkpoint(state_proj_func, state)
 
290
  embs.append(state_emb[:, None, :])
291
  bsize = state_emb.shape[0]
292
  device = state_emb.device
293
+ pad_masks.append(torch.ones(bsize, 1, dtype=torch.bool, device=device))
 
 
 
 
294
  att_masks += [1]
295
 
 
296
  time_emb = create_sinusoidal_pos_embedding(
297
  timestep, self.action_expert_width, min_period=4e-3, max_period=4.0, device=timestep.device
298
  )
299
  time_emb = time_emb.type(dtype=timestep.dtype)
 
 
300
  action_emb = self._project_action_inputs(noisy_actions)
301
 
302
  if not self.pi05:
303
  time_emb = time_emb[:, None, :].expand_as(action_emb)
304
  action_time_emb = torch.cat([action_emb, time_emb], dim=2)
305
 
 
306
  def mlp_func(action_time_emb):
307
  x = self.action_time_mlp_in(action_time_emb)
308
+ x = F.silu(x)
309
  return self.action_time_mlp_out(x)
310
 
311
  action_time_emb = self._apply_checkpoint(mlp_func, action_time_emb)
312
  adarms_cond = None
313
  else:
314
+
315
  def time_mlp_func(time_emb):
316
  x = self.time_mlp_in(time_emb)
317
+ x = F.silu(x)
318
  x = self.time_mlp_out(x)
319
  return F.silu(x)
320
 
321
  time_emb = self._apply_checkpoint(time_mlp_func, time_emb)
322
  action_time_emb = action_emb
323
+ adarms_cond = [time_emb] * len(action_time_emb) if self.use_split_action_expert else time_emb
324
+
325
+ if self.use_split_action_expert:
326
+ suffix_embs = []
327
+ suffix_pad_masks = []
328
+ suffix_att_masks = []
329
+ for arm_emb in action_time_emb:
330
+ bsize, action_time_dim = arm_emb.shape[:2]
331
+ suffix_embs.append(arm_emb)
332
+ suffix_pad_masks.append(torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device))
333
+ suffix_att_masks.append(self._action_att_mask(bsize, timestep.device, arm_emb.dtype))
334
+ return suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond
335
 
 
336
  embs.append(action_time_emb)
 
337
  bsize, action_time_dim = action_time_emb.shape[:2]
338
+ pad_masks.append(torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device))
 
 
 
339
  att_masks += [1] + ([0] * (self.config.action_horizon - 1))
340
 
341
  embs = torch.cat(embs, dim=1)
342
  pad_masks = torch.cat(pad_masks, dim=1)
343
  att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
344
  att_masks = att_masks[None, :].expand(bsize, len(att_masks))
 
345
  return embs, pad_masks, att_masks, adarms_cond
346
 
347
+ def _cast_joint_embs(self, prefix_embs, suffix_embs):
348
+ if (
349
+ self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
350
+ == torch.bfloat16
351
+ ):
352
+ prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
353
+ if isinstance(suffix_embs, (list, tuple)):
354
+ suffix_embs = [suffix_emb.to(dtype=torch.bfloat16) for suffix_emb in suffix_embs]
355
+ else:
356
+ suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
357
+ return prefix_embs, suffix_embs
358
+
359
+ def _build_split_joint_attention(self, prefix_pad_masks, prefix_att_masks, suffix_pad_masks):
360
+ batch_size = prefix_pad_masks.shape[0]
361
+ prefix_len = prefix_pad_masks.shape[1]
362
+ branch_lengths = [branch_pad.shape[1] for branch_pad in suffix_pad_masks]
363
+ total_len = prefix_len + sum(branch_lengths)
364
+ device = prefix_pad_masks.device
365
+
366
+ prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
367
+ full_att_2d_masks = torch.zeros((batch_size, total_len, total_len), dtype=torch.bool, device=device)
368
+ full_att_2d_masks[:, :prefix_len, :prefix_len] = prefix_att_2d_masks
369
+
370
+ cross_attention_selector = None
371
+ if self.use_communicating_action_expert:
372
+ cross_attention_selector = torch.zeros((1, 1, total_len, total_len), dtype=torch.float32, device=device)
373
+
374
+ prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
375
+ prefix_offsets = torch.sum(prefix_pad_masks, dim=-1, keepdim=True)
376
+ branch_position_ids = []
377
+
378
+ branch_starts = []
379
+ start = prefix_len
380
+ for branch_len in branch_lengths:
381
+ branch_starts.append(start)
382
+ start += branch_len
383
+
384
+ for branch_idx, branch_pad_masks in enumerate(suffix_pad_masks):
385
+ branch_start = branch_starts[branch_idx]
386
+ branch_len = branch_pad_masks.shape[1]
387
+ branch_position_ids.append(prefix_offsets + torch.cumsum(branch_pad_masks, dim=1) - 1)
388
+ full_att_2d_masks[:, branch_start : branch_start + branch_len, :prefix_len] = (
389
+ branch_pad_masks[:, :, None] & prefix_pad_masks[:, None, :]
390
+ )
391
+
392
+ q_positions = torch.arange(branch_len, device=device)[:, None]
393
+ k_positions = torch.arange(branch_len, device=device)[None, :]
394
+ same_branch_causal = k_positions <= q_positions
395
+ full_att_2d_masks[:, branch_start : branch_start + branch_len, branch_start : branch_start + branch_len] = (
396
+ branch_pad_masks[:, :, None] & branch_pad_masks[:, None, :] & same_branch_causal[None, :, :]
397
+ )
398
+
399
+ if self.use_communicating_action_expert:
400
+ for query_idx, query_pad_masks in enumerate(suffix_pad_masks):
401
+ query_start = branch_starts[query_idx]
402
+ query_len = query_pad_masks.shape[1]
403
+ query_positions = torch.arange(query_len, device=device)[:, None]
404
+ for key_idx, key_pad_masks in enumerate(suffix_pad_masks):
405
+ if query_idx == key_idx:
406
+ continue
407
+ key_start = branch_starts[key_idx]
408
+ key_len = key_pad_masks.shape[1]
409
+ key_positions = torch.arange(key_len, device=device)[None, :]
410
+ cross_causal = key_positions <= query_positions
411
+ cross_block = query_pad_masks[:, :, None] & key_pad_masks[:, None, :] & cross_causal[None, :, :]
412
+ full_att_2d_masks[:, query_start : query_start + query_len, key_start : key_start + key_len] = (
413
+ cross_block
414
+ )
415
+ cross_attention_selector[
416
+ :, :, query_start : query_start + query_len, key_start : key_start + key_len
417
+ ] = cross_causal[None, None, :, :].to(dtype=torch.float32)
418
+
419
+ position_ids = torch.cat([prefix_position_ids, *branch_position_ids], dim=1)
420
+ return full_att_2d_masks, position_ids, cross_attention_selector
421
+
422
+ def _run_joint_action_expert(
423
+ self,
424
+ prefix_embs,
425
+ prefix_pad_masks,
426
+ prefix_att_masks,
427
+ suffix_embs,
428
+ suffix_pad_masks,
429
+ suffix_att_masks,
430
+ adarms_cond,
431
+ ):
432
+ prefix_embs, suffix_embs = self._cast_joint_embs(prefix_embs, suffix_embs)
433
+
434
+ if self.use_split_action_expert:
435
+ att_2d_masks, position_ids, cross_attention_selector = self._build_split_joint_attention(
436
+ prefix_pad_masks, prefix_att_masks, suffix_pad_masks
437
+ )
438
+ inputs_embeds = [prefix_embs, *suffix_embs]
439
+ adarms_cond = [None, *adarms_cond]
440
+ else:
441
+ pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
442
+ att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
443
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
444
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
445
+ cross_attention_selector = None
446
+ inputs_embeds = [prefix_embs, suffix_embs]
447
+ adarms_cond = [None, adarms_cond]
448
+
449
+ att_2d_masks_4d = self._prepare_attention_masks_4d(att_2d_masks)
450
+ outputs_embeds, _ = self.paligemma_with_expert.forward(
451
+ attention_mask=att_2d_masks_4d,
452
+ position_ids=position_ids,
453
+ past_key_values=None,
454
+ inputs_embeds=inputs_embeds,
455
+ use_cache=False,
456
+ adarms_cond=adarms_cond,
457
+ cross_attention_selector=cross_attention_selector,
458
+ )
459
+ return outputs_embeds[1:]
460
+
461
  def forward(self, observation, actions, noise=None, time=None) -> Tensor:
 
462
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=True)
463
 
464
  if noise is None:
465
  noise = self.sample_noise(actions.shape, actions.device)
 
466
  if time is None:
467
  time = self.sample_time(actions.shape[0], actions.device)
468
 
 
472
 
473
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
474
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)
475
+ suffix_out = self._run_joint_action_expert(
476
+ prefix_embs,
477
+ prefix_pad_masks,
478
+ prefix_att_masks,
479
+ suffix_embs,
480
+ suffix_pad_masks,
481
+ suffix_att_masks,
482
+ adarms_cond,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  )
484
 
485
+ if self.use_split_action_expert:
486
+ suffix_out = [output[:, -self.config.action_horizon :].to(dtype=torch.float32) for output in suffix_out]
487
+ else:
488
+ suffix_out = suffix_out[0][:, -self.config.action_horizon :].to(dtype=torch.float32)
489
 
490
  v_t = self._project_action_outputs(suffix_out)
 
491
  return F.mse_loss(u_t, v_t, reduction="none")
492
 
493
  @torch.no_grad()
494
  def sample_actions(self, device, observation, noise=None, num_steps=10) -> Tensor:
 
495
  bsize = observation.state.shape[0]
496
  if noise is None:
497
  actions_shape = (bsize, self.config.action_horizon, self.config.action_dim)
498
  noise = self.sample_noise(actions_shape, device)
499
 
500
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=False)
 
501
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
502
+
503
+ dt = torch.tensor(-1.0 / num_steps, dtype=torch.float32, device=device)
504
+ x_t = noise
505
+ time = torch.tensor(1.0, dtype=torch.float32, device=device)
506
+
507
+ if self.use_split_action_expert:
508
+ while time >= -dt / 2:
509
+ expanded_time = time.expand(bsize)
510
+ v_t = self._denoise_step_split(
511
+ prefix_embs,
512
+ prefix_pad_masks,
513
+ prefix_att_masks,
514
+ state,
515
+ x_t,
516
+ expanded_time,
517
+ )
518
+ x_t = x_t + dt * v_t
519
+ time += dt
520
+ return x_t
521
+
522
  prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
523
  prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
 
 
524
  prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
525
  self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager" # noqa: SLF001
526
 
 
532
  use_cache=True,
533
  )
534
 
 
 
 
 
 
535
  while time >= -dt / 2:
536
  expanded_time = time.expand(bsize)
537
+ v_t = self.denoise_step(state, prefix_pad_masks, past_key_values, x_t, expanded_time)
 
 
 
 
 
 
 
 
538
  x_t = x_t + dt * v_t
539
  time += dt
540
  return x_t
541
 
542
+ def _denoise_step_split(
543
+ self,
544
+ prefix_embs,
545
+ prefix_pad_masks,
546
+ prefix_att_masks,
547
+ state,
548
+ x_t,
549
+ timestep,
550
+ ):
551
+ suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, timestep)
552
+ outputs = self._run_joint_action_expert(
553
+ prefix_embs,
554
+ prefix_pad_masks,
555
+ prefix_att_masks,
556
+ suffix_embs,
557
+ suffix_pad_masks,
558
+ suffix_att_masks,
559
+ adarms_cond,
560
+ )
561
+ outputs = [output[:, -self.config.action_horizon :].to(dtype=torch.float32) for output in outputs]
562
+ return self._project_action_outputs(outputs)
563
+
564
  def denoise_step(
565
  self,
566
  state,
 
569
  x_t,
570
  timestep,
571
  ):
 
572
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, timestep)
573
 
574
  suffix_len = suffix_pad_masks.shape[1]
 
576
  prefix_len = prefix_pad_masks.shape[1]
577
 
578
  prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
 
579
  suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
 
580
  full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
581
 
582
  prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
583
  position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
584
 
 
585
  full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
586
  self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager" # noqa: SLF001
587
 
openpi/src/openpi/training/config.py CHANGED
@@ -1050,6 +1050,7 @@ _CONFIGS = [
1050
  action_dim=32,
1051
  action_horizon=16,
1052
  arm_action_dims=(16, 16),
 
1053
  ),
1054
  data=LeRobotDROIDDataConfig(
1055
  repo_id="your_hf_username/my_multiarm_droid_dataset",
@@ -1087,6 +1088,7 @@ _CONFIGS = [
1087
  action_dim=32,
1088
  action_horizon=16,
1089
  arm_action_dims=(16, 16),
 
1090
  ),
1091
  data=LeRobotTWINBimanualDataConfig(
1092
  repo_id="your_hf_username/twin_bimanual_lerobot_train",
@@ -1131,6 +1133,7 @@ _CONFIGS = [
1131
  action_dim=32,
1132
  action_horizon=16,
1133
  arm_action_dims=(16, 16),
 
1134
  ),
1135
  data=LeRobotTWINBimanualPackedDataConfig(
1136
  repo_id="lsnu/twin_handover_256_train",
@@ -1153,6 +1156,66 @@ _CONFIGS = [
1153
  overwrite=True,
1154
  wandb_enabled=False,
1155
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
  TrainConfig(
1157
  name="pi05_twin_handover_256_packed_baseline_pytorch_10k",
1158
  model=pi0_config.Pi0Config(
@@ -1188,6 +1251,7 @@ _CONFIGS = [
1188
  action_dim=32,
1189
  action_horizon=16,
1190
  arm_action_dims=(16, 16),
 
1191
  ),
1192
  data=LeRobotTWINBimanualPackedDataConfig(
1193
  repo_id="lsnu/twin_handover_256_train",
@@ -1210,6 +1274,66 @@ _CONFIGS = [
1210
  overwrite=True,
1211
  wandb_enabled=False,
1212
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1213
  TrainConfig(
1214
  name="pi05_twin_dual_push_128_packed_baseline_pytorch_5k",
1215
  model=pi0_config.Pi0Config(
@@ -1245,6 +1369,7 @@ _CONFIGS = [
1245
  action_dim=32,
1246
  action_horizon=16,
1247
  arm_action_dims=(16, 16),
 
1248
  ),
1249
  data=LeRobotTWINBimanualPackedDataConfig(
1250
  repo_id="lsnu/twin_dual_push_128_train",
@@ -1267,6 +1392,66 @@ _CONFIGS = [
1267
  overwrite=True,
1268
  wandb_enabled=False,
1269
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1270
  #
1271
  # ALOHA Sim configs. This config is used to demonstrate how to train on a simple simulated environment.
1272
  #
@@ -1327,6 +1512,7 @@ _CONFIGS = [
1327
  action_horizon=8,
1328
  max_token_len=32,
1329
  arm_action_dims=(16, 16),
 
1330
  ),
1331
  data=FakeDataConfig(),
1332
  batch_size=1,
@@ -1339,6 +1525,52 @@ _CONFIGS = [
1339
  wandb_enabled=False,
1340
  pytorch_training_precision="float32",
1341
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1342
  TrainConfig(
1343
  # Local smoke-test for converted TWIN LeRobot data.
1344
  name="debug_pi05_twin_bimanual_parallel_local_smoke",
@@ -1350,6 +1582,7 @@ _CONFIGS = [
1350
  action_horizon=8,
1351
  max_token_len=64,
1352
  arm_action_dims=(16, 16),
 
1353
  ),
1354
  data=LeRobotTWINBimanualDataConfig(
1355
  # This repo id is produced by scripts/convert_twin_squashfs_to_lerobot.py in local smoke mode.
 
1050
  action_dim=32,
1051
  action_horizon=16,
1052
  arm_action_dims=(16, 16),
1053
+ action_expert_mode="head_only_parallel",
1054
  ),
1055
  data=LeRobotDROIDDataConfig(
1056
  repo_id="your_hf_username/my_multiarm_droid_dataset",
 
1088
  action_dim=32,
1089
  action_horizon=16,
1090
  arm_action_dims=(16, 16),
1091
+ action_expert_mode="head_only_parallel",
1092
  ),
1093
  data=LeRobotTWINBimanualDataConfig(
1094
  repo_id="your_hf_username/twin_bimanual_lerobot_train",
 
1133
  action_dim=32,
1134
  action_horizon=16,
1135
  arm_action_dims=(16, 16),
1136
+ action_expert_mode="head_only_parallel",
1137
  ),
1138
  data=LeRobotTWINBimanualPackedDataConfig(
1139
  repo_id="lsnu/twin_handover_256_train",
 
1156
  overwrite=True,
1157
  wandb_enabled=False,
1158
  ),
1159
+ TrainConfig(
1160
+ name="pi05_twin_handover_256_packed_split_expert_independent_pytorch_2k",
1161
+ model=pi0_config.Pi0Config(
1162
+ pi05=True,
1163
+ action_dim=32,
1164
+ action_horizon=16,
1165
+ arm_action_dims=(16, 16),
1166
+ action_expert_mode="split_independent",
1167
+ ),
1168
+ data=LeRobotTWINBimanualPackedDataConfig(
1169
+ repo_id="lsnu/twin_handover_256_train",
1170
+ base_config=DataConfig(prompt_from_task=False),
1171
+ ),
1172
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
1173
+ pytorch_training_precision="bfloat16",
1174
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1175
+ lr_schedule=_optimizer.CosineDecaySchedule(
1176
+ warmup_steps=200,
1177
+ peak_lr=2.5e-5,
1178
+ decay_steps=2_000,
1179
+ decay_lr=2.5e-6,
1180
+ ),
1181
+ batch_size=16,
1182
+ num_workers=8,
1183
+ num_train_steps=2_000,
1184
+ log_interval=10,
1185
+ save_interval=250,
1186
+ overwrite=True,
1187
+ wandb_enabled=False,
1188
+ ),
1189
+ TrainConfig(
1190
+ name="pi05_twin_handover_256_packed_split_expert_communicating_pytorch_2k",
1191
+ model=pi0_config.Pi0Config(
1192
+ pi05=True,
1193
+ action_dim=32,
1194
+ action_horizon=16,
1195
+ arm_action_dims=(16, 16),
1196
+ action_expert_mode="split_communicating",
1197
+ ),
1198
+ data=LeRobotTWINBimanualPackedDataConfig(
1199
+ repo_id="lsnu/twin_handover_256_train",
1200
+ base_config=DataConfig(prompt_from_task=False),
1201
+ ),
1202
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
1203
+ pytorch_training_precision="bfloat16",
1204
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1205
+ lr_schedule=_optimizer.CosineDecaySchedule(
1206
+ warmup_steps=200,
1207
+ peak_lr=2.5e-5,
1208
+ decay_steps=2_000,
1209
+ decay_lr=2.5e-6,
1210
+ ),
1211
+ batch_size=16,
1212
+ num_workers=8,
1213
+ num_train_steps=2_000,
1214
+ log_interval=10,
1215
+ save_interval=250,
1216
+ overwrite=True,
1217
+ wandb_enabled=False,
1218
+ ),
1219
  TrainConfig(
1220
  name="pi05_twin_handover_256_packed_baseline_pytorch_10k",
1221
  model=pi0_config.Pi0Config(
 
1251
  action_dim=32,
1252
  action_horizon=16,
1253
  arm_action_dims=(16, 16),
1254
+ action_expert_mode="head_only_parallel",
1255
  ),
1256
  data=LeRobotTWINBimanualPackedDataConfig(
1257
  repo_id="lsnu/twin_handover_256_train",
 
1274
  overwrite=True,
1275
  wandb_enabled=False,
1276
  ),
1277
+ TrainConfig(
1278
+ name="pi05_twin_handover_256_packed_split_expert_independent_pytorch_10k",
1279
+ model=pi0_config.Pi0Config(
1280
+ pi05=True,
1281
+ action_dim=32,
1282
+ action_horizon=16,
1283
+ arm_action_dims=(16, 16),
1284
+ action_expert_mode="split_independent",
1285
+ ),
1286
+ data=LeRobotTWINBimanualPackedDataConfig(
1287
+ repo_id="lsnu/twin_handover_256_train",
1288
+ base_config=DataConfig(prompt_from_task=False),
1289
+ ),
1290
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
1291
+ pytorch_training_precision="bfloat16",
1292
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1293
+ lr_schedule=_optimizer.CosineDecaySchedule(
1294
+ warmup_steps=500,
1295
+ peak_lr=2.5e-5,
1296
+ decay_steps=10_000,
1297
+ decay_lr=2.5e-6,
1298
+ ),
1299
+ batch_size=16,
1300
+ num_workers=8,
1301
+ num_train_steps=10_000,
1302
+ log_interval=10,
1303
+ save_interval=1_000,
1304
+ overwrite=True,
1305
+ wandb_enabled=False,
1306
+ ),
1307
+ TrainConfig(
1308
+ name="pi05_twin_handover_256_packed_split_expert_communicating_pytorch_10k",
1309
+ model=pi0_config.Pi0Config(
1310
+ pi05=True,
1311
+ action_dim=32,
1312
+ action_horizon=16,
1313
+ arm_action_dims=(16, 16),
1314
+ action_expert_mode="split_communicating",
1315
+ ),
1316
+ data=LeRobotTWINBimanualPackedDataConfig(
1317
+ repo_id="lsnu/twin_handover_256_train",
1318
+ base_config=DataConfig(prompt_from_task=False),
1319
+ ),
1320
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
1321
+ pytorch_training_precision="bfloat16",
1322
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1323
+ lr_schedule=_optimizer.CosineDecaySchedule(
1324
+ warmup_steps=500,
1325
+ peak_lr=2.5e-5,
1326
+ decay_steps=10_000,
1327
+ decay_lr=2.5e-6,
1328
+ ),
1329
+ batch_size=16,
1330
+ num_workers=8,
1331
+ num_train_steps=10_000,
1332
+ log_interval=10,
1333
+ save_interval=1_000,
1334
+ overwrite=True,
1335
+ wandb_enabled=False,
1336
+ ),
1337
  TrainConfig(
1338
  name="pi05_twin_dual_push_128_packed_baseline_pytorch_5k",
1339
  model=pi0_config.Pi0Config(
 
1369
  action_dim=32,
1370
  action_horizon=16,
1371
  arm_action_dims=(16, 16),
1372
+ action_expert_mode="head_only_parallel",
1373
  ),
1374
  data=LeRobotTWINBimanualPackedDataConfig(
1375
  repo_id="lsnu/twin_dual_push_128_train",
 
1392
  overwrite=True,
1393
  wandb_enabled=False,
1394
  ),
1395
+ TrainConfig(
1396
+ name="pi05_twin_dual_push_128_packed_split_expert_independent_pytorch_5k",
1397
+ model=pi0_config.Pi0Config(
1398
+ pi05=True,
1399
+ action_dim=32,
1400
+ action_horizon=16,
1401
+ arm_action_dims=(16, 16),
1402
+ action_expert_mode="split_independent",
1403
+ ),
1404
+ data=LeRobotTWINBimanualPackedDataConfig(
1405
+ repo_id="lsnu/twin_dual_push_128_train",
1406
+ base_config=DataConfig(prompt_from_task=False),
1407
+ ),
1408
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_independent_packed_from_single",
1409
+ pytorch_training_precision="bfloat16",
1410
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1411
+ lr_schedule=_optimizer.CosineDecaySchedule(
1412
+ warmup_steps=250,
1413
+ peak_lr=2.5e-5,
1414
+ decay_steps=5_000,
1415
+ decay_lr=2.5e-6,
1416
+ ),
1417
+ batch_size=16,
1418
+ num_workers=8,
1419
+ num_train_steps=5_000,
1420
+ log_interval=10,
1421
+ save_interval=1_000,
1422
+ overwrite=True,
1423
+ wandb_enabled=False,
1424
+ ),
1425
+ TrainConfig(
1426
+ name="pi05_twin_dual_push_128_packed_split_expert_communicating_pytorch_5k",
1427
+ model=pi0_config.Pi0Config(
1428
+ pi05=True,
1429
+ action_dim=32,
1430
+ action_horizon=16,
1431
+ arm_action_dims=(16, 16),
1432
+ action_expert_mode="split_communicating",
1433
+ ),
1434
+ data=LeRobotTWINBimanualPackedDataConfig(
1435
+ repo_id="lsnu/twin_dual_push_128_train",
1436
+ base_config=DataConfig(prompt_from_task=False),
1437
+ ),
1438
+ pytorch_weight_path="/workspace/checkpoints/pi05_base_split_communicating_packed_from_single",
1439
+ pytorch_training_precision="bfloat16",
1440
+ action_loss_mask=(1.0,) * 8 + (0.0,) * 8 + (1.0,) * 8 + (0.0,) * 8,
1441
+ lr_schedule=_optimizer.CosineDecaySchedule(
1442
+ warmup_steps=250,
1443
+ peak_lr=2.5e-5,
1444
+ decay_steps=5_000,
1445
+ decay_lr=2.5e-6,
1446
+ ),
1447
+ batch_size=16,
1448
+ num_workers=8,
1449
+ num_train_steps=5_000,
1450
+ log_interval=10,
1451
+ save_interval=1_000,
1452
+ overwrite=True,
1453
+ wandb_enabled=False,
1454
+ ),
1455
  #
1456
  # ALOHA Sim configs. This config is used to demonstrate how to train on a simple simulated environment.
1457
  #
 
1512
  action_horizon=8,
1513
  max_token_len=32,
1514
  arm_action_dims=(16, 16),
1515
+ action_expert_mode="head_only_parallel",
1516
  ),
1517
  data=FakeDataConfig(),
1518
  batch_size=1,
 
1525
  wandb_enabled=False,
1526
  pytorch_training_precision="float32",
1527
  ),
1528
+ TrainConfig(
1529
+ name="debug_pi05_split_independent_pytorch_smoke",
1530
+ model=pi0_config.Pi0Config(
1531
+ pi05=True,
1532
+ paligemma_variant="dummy",
1533
+ action_expert_variant="dummy",
1534
+ action_dim=32,
1535
+ action_horizon=8,
1536
+ max_token_len=32,
1537
+ arm_action_dims=(16, 16),
1538
+ action_expert_mode="split_independent",
1539
+ ),
1540
+ data=FakeDataConfig(),
1541
+ batch_size=1,
1542
+ num_workers=0,
1543
+ num_train_steps=2,
1544
+ log_interval=1,
1545
+ save_interval=1,
1546
+ overwrite=True,
1547
+ exp_name="debug_pi05_split_independent_pytorch_smoke",
1548
+ wandb_enabled=False,
1549
+ pytorch_training_precision="float32",
1550
+ ),
1551
+ TrainConfig(
1552
+ name="debug_pi05_split_communicating_pytorch_smoke",
1553
+ model=pi0_config.Pi0Config(
1554
+ pi05=True,
1555
+ paligemma_variant="dummy",
1556
+ action_expert_variant="dummy",
1557
+ action_dim=32,
1558
+ action_horizon=8,
1559
+ max_token_len=32,
1560
+ arm_action_dims=(16, 16),
1561
+ action_expert_mode="split_communicating",
1562
+ ),
1563
+ data=FakeDataConfig(),
1564
+ batch_size=1,
1565
+ num_workers=0,
1566
+ num_train_steps=2,
1567
+ log_interval=1,
1568
+ save_interval=1,
1569
+ overwrite=True,
1570
+ exp_name="debug_pi05_split_communicating_pytorch_smoke",
1571
+ wandb_enabled=False,
1572
+ pytorch_training_precision="float32",
1573
+ ),
1574
  TrainConfig(
1575
  # Local smoke-test for converted TWIN LeRobot data.
1576
  name="debug_pi05_twin_bimanual_parallel_local_smoke",
 
1582
  action_horizon=8,
1583
  max_token_len=64,
1584
  arm_action_dims=(16, 16),
1585
+ action_expert_mode="head_only_parallel",
1586
  ),
1587
  data=LeRobotTWINBimanualDataConfig(
1588
  # This repo id is produced by scripts/convert_twin_squashfs_to_lerobot.py in local smoke mode.
openpi/src/openpi/training/data_loader.py CHANGED
@@ -10,7 +10,6 @@ from typing import Literal, Protocol, SupportsIndex, TypeVar
10
  from huggingface_hub import snapshot_download
11
  import jax
12
  import jax.numpy as jnp
13
- import lerobot.common.datasets.lerobot_dataset as lerobot_dataset
14
  import numpy as np
15
  import torch
16
 
@@ -164,6 +163,114 @@ def _ensure_local_lerobot_dataset(repo_id: str) -> Path:
164
  return root
165
 
166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  def create_torch_dataset(
168
  data_config: _config.DataConfig, action_horizon: int, model_config: _model.BaseModelConfig
169
  ) -> Dataset:
@@ -174,6 +281,9 @@ def create_torch_dataset(
174
  if repo_id == "fake":
175
  return FakeDataset(model_config, num_samples=1024)
176
 
 
 
 
177
  dataset_root = _ensure_local_lerobot_dataset(repo_id)
178
  dataset_meta = lerobot_dataset.LeRobotDatasetMetadata(repo_id, root=dataset_root, revision="main")
179
  dataset = lerobot_dataset.LeRobotDataset(
 
10
  from huggingface_hub import snapshot_download
11
  import jax
12
  import jax.numpy as jnp
 
13
  import numpy as np
14
  import torch
15
 
 
163
  return root
164
 
165
 
166
+ def _patch_lerobot_column_compat(lerobot_dataset) -> None:
167
+ if getattr(lerobot_dataset, "_openpi_column_compat_patched", False):
168
+ return
169
+
170
+ def _hf_column_to_numpy(column) -> np.ndarray:
171
+ if isinstance(column, torch.Tensor):
172
+ return column.detach().cpu().numpy()
173
+ if hasattr(column, "to_pylist"):
174
+ return np.asarray(column.to_pylist())
175
+ values = list(column)
176
+ if values and isinstance(values[0], torch.Tensor):
177
+ return torch.stack(values).detach().cpu().numpy()
178
+ return np.asarray(values)
179
+
180
+ def _hf_column_to_tensor(column) -> torch.Tensor:
181
+ if isinstance(column, torch.Tensor):
182
+ return column
183
+ values = column.to_pylist() if hasattr(column, "to_pylist") else list(column)
184
+ if values and isinstance(values[0], torch.Tensor):
185
+ return torch.stack(values)
186
+ return torch.as_tensor(values)
187
+
188
+ def _patched_init(
189
+ self,
190
+ repo_id: str,
191
+ root: str | Path | None = None,
192
+ episodes: list[int] | None = None,
193
+ image_transforms=None,
194
+ delta_timestamps: dict[list[float]] | None = None,
195
+ tolerance_s: float = 1e-4,
196
+ revision: str | None = None,
197
+ force_cache_sync: bool = False,
198
+ download_videos: bool = True,
199
+ video_backend: str | None = None,
200
+ ):
201
+ self.repo_id = repo_id
202
+ self.root = Path(root) if root else lerobot_dataset.HF_LEROBOT_HOME / repo_id
203
+ self.image_transforms = image_transforms
204
+ self.delta_timestamps = delta_timestamps
205
+ self.episodes = episodes
206
+ self.tolerance_s = tolerance_s
207
+ self.revision = revision if revision else lerobot_dataset.CODEBASE_VERSION
208
+ self.video_backend = video_backend if video_backend else lerobot_dataset.get_safe_default_codec()
209
+ self.delta_indices = None
210
+ self.image_writer = None
211
+ self.episode_buffer = None
212
+
213
+ self.root.mkdir(exist_ok=True, parents=True)
214
+
215
+ self.meta = lerobot_dataset.LeRobotDatasetMetadata(
216
+ self.repo_id, self.root, self.revision, force_cache_sync=force_cache_sync
217
+ )
218
+ if self.episodes is not None and self.meta._version >= lerobot_dataset.packaging.version.parse("v2.1"):
219
+ episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes]
220
+ self.stats = lerobot_dataset.aggregate_stats(episodes_stats)
221
+
222
+ try:
223
+ if force_cache_sync:
224
+ raise FileNotFoundError
225
+ assert all((self.root / fpath).is_file() for fpath in self.get_episodes_file_paths())
226
+ self.hf_dataset = self.load_hf_dataset()
227
+ except (AssertionError, FileNotFoundError, NotADirectoryError):
228
+ self.revision = lerobot_dataset.get_safe_version(self.repo_id, self.revision)
229
+ self.download_episodes(download_videos)
230
+ self.hf_dataset = self.load_hf_dataset()
231
+
232
+ self.episode_data_index = lerobot_dataset.get_episode_data_index(self.meta.episodes, self.episodes)
233
+
234
+ timestamps = _hf_column_to_numpy(self.hf_dataset["timestamp"])
235
+ episode_indices = _hf_column_to_numpy(self.hf_dataset["episode_index"])
236
+ ep_data_index_np = {k: t.numpy() for k, t in self.episode_data_index.items()}
237
+ lerobot_dataset.check_timestamps_sync(
238
+ timestamps, episode_indices, ep_data_index_np, self.fps, self.tolerance_s
239
+ )
240
+
241
+ if self.delta_timestamps is not None:
242
+ lerobot_dataset.check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s)
243
+ self.delta_indices = lerobot_dataset.get_delta_indices(self.delta_timestamps, self.fps)
244
+
245
+ def _patched_get_query_timestamps(
246
+ self,
247
+ current_ts: float,
248
+ query_indices: dict[str, list[int]] | None = None,
249
+ ) -> dict[str, list[float]]:
250
+ query_timestamps = {}
251
+ for key in self.meta.video_keys:
252
+ if query_indices is not None and key in query_indices:
253
+ timestamps = self.hf_dataset.select(query_indices[key])["timestamp"]
254
+ query_timestamps[key] = _hf_column_to_tensor(timestamps).tolist()
255
+ else:
256
+ query_timestamps[key] = [current_ts]
257
+ return query_timestamps
258
+
259
+ def _patched_query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
260
+ return {
261
+ key: _hf_column_to_tensor(self.hf_dataset.select(q_idx)[key])
262
+ for key, q_idx in query_indices.items()
263
+ if key not in self.meta.video_keys
264
+ }
265
+
266
+ lerobot_dataset._hf_column_to_numpy = _hf_column_to_numpy
267
+ lerobot_dataset._hf_column_to_tensor = _hf_column_to_tensor
268
+ lerobot_dataset.LeRobotDataset.__init__ = _patched_init
269
+ lerobot_dataset.LeRobotDataset._get_query_timestamps = _patched_get_query_timestamps
270
+ lerobot_dataset.LeRobotDataset._query_hf_dataset = _patched_query_hf_dataset
271
+ lerobot_dataset._openpi_column_compat_patched = True
272
+
273
+
274
  def create_torch_dataset(
275
  data_config: _config.DataConfig, action_horizon: int, model_config: _model.BaseModelConfig
276
  ) -> Dataset:
 
281
  if repo_id == "fake":
282
  return FakeDataset(model_config, num_samples=1024)
283
 
284
+ import lerobot.common.datasets.lerobot_dataset as lerobot_dataset
285
+
286
+ _patch_lerobot_column_compat(lerobot_dataset)
287
  dataset_root = _ensure_local_lerobot_dataset(repo_id)
288
  dataset_meta = lerobot_dataset.LeRobotDatasetMetadata(repo_id, root=dataset_root, revision="main")
289
  dataset = lerobot_dataset.LeRobotDataset(