Kevin Black committed on
Commit
8999e55
·
1 Parent(s): aa7853c

Fix lint errors

Browse files
examples/convert_jax_model_to_pytorch.py CHANGED
@@ -10,13 +10,13 @@ Usage:
10
  # Just inspect keys:
11
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --inspect_only
12
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --inspect_only
13
-
14
  # Convert to PyTorch:
15
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --output_path /path/to/output
16
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --output_path /path/to/output
17
 
18
- Example:
19
- # pi0_droid
20
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid/params --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid_pytorch
21
 
22
  # pi0_aloha_sim
@@ -33,44 +33,45 @@ import pathlib
33
  import shutil
34
  import traceback
35
 
 
36
  import jax
37
  import jax.numpy as jnp
38
  import jax.sharding
39
  import numpy as np
40
  import orbax.checkpoint as ocp
41
- import torch
42
  import safetensors
43
- from flax.nnx.traversals import flatten_mapping
 
 
 
 
44
 
45
  # Import our modules
46
  import openpi.models_pytorch.pi0_pytorch
47
- import openpi.models.pi0_config
48
- import openpi.models.gemma
49
  import openpi.shared.download
50
- import openpi.models.model
51
 
52
 
53
def flatten_for_inspection(tree, separator="/"):
    """
    Flatten a nested dictionary for easy inspection of keys using flax.nnx.traversals.flatten_mapping.

    Args:
        tree: The nested dictionary (JAX pytree)
        separator: Separator to use between key levels

    Returns:
        Dictionary with flattened keys and array shapes as values
    """
    flat = flatten_mapping(tree, separator=separator)

    # Describe each leaf for inspection: array-like values (anything exposing
    # both .shape and .dtype) are summarized by shape/dtype; everything else
    # falls back to its Python type.
    def _describe(leaf):
        if hasattr(leaf, "shape") and hasattr(leaf, "dtype"):
            return f"shape: {leaf.shape}, dtype: {leaf.dtype}"
        return f"type: {type(leaf)}"

    return {key: _describe(value) for key, value in flat.items()}
75
 
76
 
@@ -90,19 +91,15 @@ def slice_paligemma_state_dict(state_dict, config):
90
  """Convert PaliGemma JAX parameters to PyTorch format."""
91
  suffix = "/value" if "img/embedding/kernel/value" in state_dict else ""
92
 
93
-
94
  # patch embeddings
95
  jax_key = f"img/embedding/kernel{suffix}"
96
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight"
97
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose(3, 2, 0, 1)
98
-
99
-
100
  jax_key = f"img/embedding/bias{suffix}"
101
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias"
102
  state_dict[pytorch_key] = state_dict.pop(jax_key)
103
 
104
-
105
-
106
  # positional embeddings
107
  jax_key = f"img/pos_embedding{suffix}"
108
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.position_embedding.weight"
@@ -114,54 +111,101 @@ def slice_paligemma_state_dict(state_dict, config):
114
  encoderblock_layernorm1_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/scale{suffix}")
115
  encoderblock_layernorm1_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/bias{suffix}")
116
 
117
- encoderblock_mlp_dense0_kernel= state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/kernel{suffix}")
118
- encoderblock_mlp_dense0_bias= state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/bias{suffix}")
119
- encoderblock_mlp_dense1_kernel= state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/kernel{suffix}")
120
- encoderblock_mlp_dense1_bias= state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/bias{suffix}")
121
 
122
- encoderblock_attention_0_key_kernel = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/kernel{suffix}")
123
- encoderblock_attention_0_key_bias = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/bias{suffix}")
124
- encoderblock_attention_0_value_kernel = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/kernel{suffix}")
125
- encoderblock_attention_0_value_bias = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/bias{suffix}")
126
- encoderblock_attention_0_query_kernel = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/kernel{suffix}")
127
- encoderblock_attention_0_query_bias = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/bias{suffix}")
128
- encoderblock_attention_0_out_kernel = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/kernel{suffix}")
129
- encoderblock_attention_0_out_bias = state_dict.pop(f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/bias{suffix}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  for i in range(config.vision_config.num_hidden_layers):
132
-
133
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.weight"] = encoderblock_layernorm0_scale[i].transpose()
134
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.bias"] = encoderblock_layernorm0_bias[i]
135
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.weight"] = encoderblock_layernorm1_scale[i].transpose()
136
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.bias"] = encoderblock_layernorm1_bias[i]
137
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.weight"] = encoderblock_mlp_dense0_kernel[i].transpose()
138
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.bias"] = encoderblock_mlp_dense0_bias[i]
139
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.weight"] = encoderblock_mlp_dense1_kernel[i].transpose()
140
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.bias"] = encoderblock_mlp_dense1_bias[i]
141
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = encoderblock_attention_0_key_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
142
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = encoderblock_attention_0_key_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
143
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = encoderblock_attention_0_value_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
144
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = encoderblock_attention_0_value_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
145
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = encoderblock_attention_0_query_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
146
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = encoderblock_attention_0_query_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
147
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.weight"] = encoderblock_attention_0_out_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
148
- state_dict[f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.bias"] = encoderblock_attention_0_out_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  jax_key = f"img/Transformer/encoder_norm/scale{suffix}"
151
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.weight"
152
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
153
-
154
  jax_key = f"img/Transformer/encoder_norm/bias{suffix}"
155
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.bias"
156
  state_dict[pytorch_key] = state_dict.pop(jax_key)
157
 
158
  # multimodal projector
159
  jax_key = f"img/head/kernel{suffix}"
160
- pytorch_key = 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear.weight'
161
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
162
-
163
  jax_key = f"img/head/bias{suffix}"
164
- pytorch_key = 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear.bias'
165
  state_dict[pytorch_key] = state_dict.pop(jax_key)
166
 
167
  # text decoder (gemma)
@@ -181,24 +225,54 @@ def slice_paligemma_state_dict(state_dict, config):
181
  llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm/scale{suffix}")
182
 
183
  for i in range(config.text_config.num_hidden_layers):
184
- q_proj_weight_reshaped = llm_attention_q_einsum[i].transpose(0, 2, 1).reshape(config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size)
185
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.q_proj.weight"] = q_proj_weight_reshaped
 
 
 
 
 
 
 
 
186
 
187
  k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
188
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.k_proj.weight"] = k_proj_weight_reshaped
 
 
189
  v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
190
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.v_proj.weight"] = v_proj_weight_reshaped
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- o_proj_weight_reshaped = llm_attention_attn_vec_einsum[i].transpose(2, 0, 1).reshape(config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size)
193
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.o_proj.weight"] = o_proj_weight_reshaped
194
-
195
  gate_proj_weight = llm_mlp_gating_einsum[i, 0]
196
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.gate_proj.weight"] = gate_proj_weight.transpose()
 
 
197
  up_proj_weight = llm_mlp_gating_einsum[i, 1]
198
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.up_proj.weight"] = up_proj_weight.transpose()
199
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.down_proj.weight"] = llm_mlp_linear[i].transpose()
200
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.input_layernorm.weight"] = llm_input_layernorm[i]
201
- state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.post_attention_layernorm.weight"] = llm_post_attention_layernorm[i]
 
 
 
 
 
 
 
 
202
 
203
  jax_key = f"llm/final_norm/scale{suffix}"
204
  pytorch_key = "paligemma_with_expert.paligemma.model.language_model.norm.weight"
@@ -206,7 +280,7 @@ def slice_paligemma_state_dict(state_dict, config):
206
 
207
  expert_dict = {}
208
  final_state_dict = {}
209
-
210
  # Expert-related keys to extract (including pi05 Dense layer parameters)
211
  expert_keys = [
212
  f"llm/final_norm_1/scale{suffix}",
@@ -224,7 +298,7 @@ def slice_paligemma_state_dict(state_dict, config):
224
  f"llm/layers/pre_ffw_norm_1/Dense_0/bias{suffix}",
225
  f"llm/layers/pre_ffw_norm_1/Dense_0/kernel{suffix}",
226
  ]
227
-
228
  for key, value in state_dict.items():
229
  if key not in expert_keys:
230
  final_state_dict[key] = torch.from_numpy(value)
@@ -237,13 +311,13 @@ def slice_paligemma_state_dict(state_dict, config):
237
  def slice_gemma_state_dict(state_dict, config, num_expert=1, checkpoint_dir=None):
238
  """Convert Gemma JAX parameters to PyTorch format."""
239
  # Add missing attributes to config if they don't exist
240
- if not hasattr(config, 'vocab_size'):
241
  config.vocab_size = 257152 # PALIGEMMA_VOCAB_SIZE
242
- if not hasattr(config, 'hidden_size'):
243
  config.hidden_size = config.width
244
- if not hasattr(config, 'num_hidden_layers'):
245
  config.num_hidden_layers = config.depth
246
- if not hasattr(config, 'num_attention_heads'):
247
  config.num_attention_heads = config.num_heads
248
 
249
  suffix = "/value" if f"llm/layers/attn/attn_vec_einsum_{num_expert}/w/value" in state_dict else ""
@@ -260,42 +334,79 @@ def slice_gemma_state_dict(state_dict, config, num_expert=1, checkpoint_dir=None
260
  # Pi05 with adaptive normalization
261
  llm_input_layernorm_bias = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/bias{suffix}")
262
  llm_post_attention_layernorm_bias = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/bias{suffix}")
263
- llm_input_layernorm_kernel = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/kernel{suffix}")
264
- llm_post_attention_layernorm_kernel = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/kernel{suffix}")
 
 
 
 
265
  else:
266
  # Regular pi0 with standard RMSNorm
267
  llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/scale{suffix}")
268
  llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/scale{suffix}")
269
 
270
-
271
  for i in range(config.num_hidden_layers):
272
- q_proj_weight_reshaped = llm_attention_q_einsum[i].transpose(0, 2, 1).reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
273
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.q_proj.weight"] = q_proj_weight_reshaped
 
 
 
 
 
 
274
 
275
  k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
276
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.k_proj.weight"] = k_proj_weight_reshaped
 
 
277
  v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
278
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.v_proj.weight"] = v_proj_weight_reshaped
 
 
 
 
 
 
 
 
 
 
 
279
 
280
- o_proj_weight_reshaped = llm_attention_attn_vec_einsum[i].reshape(config.num_attention_heads * config.head_dim, config.hidden_size).transpose(1,0)
281
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.o_proj.weight"] = o_proj_weight_reshaped
282
-
283
  gate_proj_weight = llm_mlp_gating_einsum[i, 0]
284
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.gate_proj.weight"] = gate_proj_weight.transpose()
 
 
285
  up_proj_weight = llm_mlp_gating_einsum[i, 1]
286
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.up_proj.weight"] = up_proj_weight.transpose()
287
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.down_proj.weight"] = llm_mlp_linear[i].transpose()
 
 
 
 
288
 
289
  if "pi05" in checkpoint_dir:
290
  # Pi05 with adaptive normalization - use Dense layer parameters directly
291
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.bias"] = llm_input_layernorm_bias[i]
292
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.bias"] = llm_post_attention_layernorm_bias[i]
293
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.weight"] = llm_input_layernorm_kernel[i].transpose()
294
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.weight"] = llm_post_attention_layernorm_kernel[i].transpose()
 
 
 
 
 
 
 
 
295
  else:
296
  # Regular pi0 with standard RMSNorm
297
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.weight"] = llm_input_layernorm[i]
298
- state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.weight"] = llm_post_attention_layernorm[i]
 
 
 
 
299
 
300
  # Handle final norm layer
301
  if "pi05" in checkpoint_dir:
@@ -306,9 +417,11 @@ def slice_gemma_state_dict(state_dict, config, num_expert=1, checkpoint_dir=None
306
  state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.weight"] = final_norm_kernel.transpose()
307
  else:
308
  # Regular pi0 with standard RMSNorm
309
- state_dict["paligemma_with_expert.gemma_expert.model.norm.weight"] = state_dict.pop(f"llm/final_norm_{num_expert}/scale{suffix}")
310
-
311
- #state_dict["paligemma_with_expert.gemma_expert.lm_head.weight"] = embedding_vector # weights are tied.
 
 
312
 
313
  final_state_dict = {}
314
  for key, value in state_dict.items():
@@ -316,7 +429,6 @@ def slice_gemma_state_dict(state_dict, config, num_expert=1, checkpoint_dir=None
316
  final_state_dict[key] = torch.from_numpy(value)
317
  else:
318
  final_state_dict[key] = value
319
-
320
 
321
  return final_state_dict
322
 
@@ -339,11 +451,13 @@ def slice_initial_orbax_checkpoint(checkpoint_dir: str, restore_precision: str |
339
  restore_dtype = dtype_map.get(restore_precision) if restore_precision else None
340
 
341
  # Use CPU sharding to avoid GPU memory issues during checkpoint loading
342
- cpu_device = jax.devices('cpu')[0]
343
  cpu_sharding = jax.sharding.SingleDeviceSharding(cpu_device)
344
-
345
  # Use repository restore utility to load a pure dict of params (value suffix removed)
346
- params = openpi.models.model.restore_params(params_dir, restore_type=jax.Array, dtype=restore_dtype, sharding=cpu_sharding)
 
 
347
 
348
  # get params for PaliGemma
349
  pali_params = params["PaliGemma"]
@@ -355,43 +469,43 @@ def slice_initial_orbax_checkpoint(checkpoint_dir: str, restore_precision: str |
355
  def load_jax_model_and_print_keys(checkpoint_dir: str):
356
  """
357
  Load JAX model from checkpoint and print all parameter keys.
358
-
359
  Args:
360
  checkpoint_dir: Path to the checkpoint directory
361
  """
362
  params_path = pathlib.Path(checkpoint_dir).resolve()
363
-
364
  if not params_path.exists():
365
  print(f"Error: Checkpoint directory does not exist: {params_path}")
366
  return
367
-
368
  try:
369
  # Initialize checkpointer
370
  checkpointer = ocp.PyTreeCheckpointer()
371
-
372
  # Load metadata to see available keys
373
  metadata = checkpointer.metadata(params_path)
374
  print("Available top-level keys in checkpoint:")
375
- for key in metadata.keys():
376
  print(f" - {key}")
377
  print()
378
-
379
  # Restore the parameters
380
  params_name = "params"
381
  if params_name not in metadata:
382
  print(f"Warning: '{params_name}' not found in metadata. Available keys: {list(metadata.keys())}")
383
  if metadata.keys():
384
- params_name = list(metadata.keys())[0]
385
  print(f"Using '{params_name}' instead.")
386
  else:
387
  print("No keys found in metadata!")
388
  return
389
-
390
  item = {params_name: metadata[params_name]}
391
  # Use CPU device to avoid GPU memory issues
392
- device = jax.devices('cpu')[0]
393
  sharding = jax.sharding.SingleDeviceSharding(device)
394
-
395
  restored = checkpointer.restore(
396
  params_path,
397
  ocp.args.PyTreeRestore(
@@ -406,33 +520,33 @@ def load_jax_model_and_print_keys(checkpoint_dir: str):
406
  transforms={},
407
  ),
408
  )
409
-
410
  params = restored[params_name]
411
-
412
  # Flatten and print all keys
413
  flat_params = flatten_for_inspection(params)
414
-
415
  print(f"All parameter keys with shapes and dtypes ({len(flat_params)} total):")
416
  print("=" * 80)
417
-
418
  # Sort keys for better readability
419
  sorted_keys = sorted(flat_params.keys())
420
-
421
  for key in sorted_keys:
422
  print(f"{key:<60} -> {flat_params[key]}")
423
-
424
  print()
425
  print("=" * 80)
426
  print(f"Summary: Found {len(flat_params)} parameters")
427
-
428
  # Print some high-level structure information
429
  top_level_keys = set()
430
  for key in sorted_keys:
431
- top_level_key = key.split('/')[0]
432
  top_level_keys.add(top_level_key)
433
-
434
- print(f"Top-level parameter groups: {sorted(list(top_level_keys))}")
435
-
436
  except Exception as e:
437
  print(f"Error loading checkpoint: {e}")
438
  traceback.print_exc()
@@ -441,29 +555,29 @@ def load_jax_model_and_print_keys(checkpoint_dir: str):
441
  def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str):
442
  """
443
  Convert PI0 JAX checkpoint to PyTorch format.
444
-
445
  Args:
446
  checkpoint_dir: Path to the JAX checkpoint
447
  precision: Model precision (float32, bfloat16, float16)
448
  output_path: Path to save the converted PyTorch model
449
  """
450
  print(f"Converting PI0 checkpoint from {checkpoint_dir} to {output_path}")
451
-
452
  # Break down orbax ckpts by restoring via JAX to respect dtype
453
- initial_params = slice_initial_orbax_checkpoint(checkpoint_dir=checkpoint_dir, restore_precision='float32')
454
-
455
  # Process projection params
456
  if "pi05" in checkpoint_dir:
457
  keys = [
458
- "action_in_proj",
459
  "action_out_proj",
460
- "time_mlp_in",
461
  "time_mlp_out",
462
  ]
463
  else:
464
  keys = [
465
  "state_proj",
466
- "action_in_proj",
467
  "action_out_proj",
468
  "action_time_mlp_in",
469
  "action_time_mlp_out",
@@ -479,10 +593,10 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
479
  else:
480
  weight = kernel_params
481
  bias = bias_params
482
-
483
  pytorch_weight_key = f"{key}.weight"
484
  pytorch_bias_key = f"{key}.bias"
485
-
486
  projection_params[pytorch_weight_key] = torch.from_numpy(np.array(weight)).T
487
  projection_params[pytorch_bias_key] = torch.from_numpy(np.array(bias))
488
 
@@ -490,22 +604,30 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
490
  # All models use the same PaliGemma config structure
491
  class PaliGemmaConfig:
492
  def __init__(self):
493
- self.vision_config = type('obj', (object,), {
494
- 'hidden_size': 1152,
495
- 'num_hidden_layers': 27,
496
- 'num_attention_heads': 16,
497
- 'intermediate_size': 4304,
498
- 'patch_size': 14,
499
- 'projection_dim': 2048
500
- })()
501
- self.text_config = type('obj', (object,), {
502
- 'hidden_size': 2048,
503
- 'num_hidden_layers': 18,
504
- 'num_attention_heads': 8,
505
- 'head_dim': 256,
506
- 'intermediate_size': 16384
507
- })()
508
-
 
 
 
 
 
 
 
 
509
  paligemma_config = PaliGemmaConfig()
510
  action_expert_config = openpi.models.gemma.get_config("gemma_300m")
511
 
@@ -513,27 +635,24 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
513
  paligemma_params, expert_params = slice_paligemma_state_dict(initial_params["paligemma_params"], paligemma_config)
514
 
515
  # Process Gemma weights from expert_params
516
- gemma_params = slice_gemma_state_dict(expert_params, action_expert_config, num_expert=1, checkpoint_dir=checkpoint_dir)
 
 
517
 
518
  # Create Pi0Config based on checkpoint path
519
- if "pi0_aloha_sim" in checkpoint_dir:
520
- pi0_config = openpi.models.pi0_config.Pi0Config(
521
- action_dim=14, # ALOHA has 14 action dimensions
522
- action_horizon=50,
523
- )
524
- elif "pi0_aloha_towel" in checkpoint_dir:
525
  pi0_config = openpi.models.pi0_config.Pi0Config(
526
  action_dim=14, # ALOHA has 14 action dimensions
527
  action_horizon=50,
528
  )
529
  elif "pi0_base" in checkpoint_dir:
530
  pi0_config = openpi.models.pi0_config.Pi0Config(
531
- action_dim=8, # Base droid has 8 action dimensions
532
  action_horizon=10,
533
  )
534
  elif "pi05_droid" in checkpoint_dir:
535
  pi0_config = openpi.models.pi0_config.Pi0Config(
536
- action_dim=8, # Base droid has 8 action dimensions
537
  action_horizon=10,
538
  pi05=True,
539
  )
@@ -560,10 +679,10 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
560
 
561
  # Combine all parameters (no prefix needed for our model structure)
562
  all_params = {**paligemma_params, **gemma_params, **projection_params}
563
-
564
  # Load state dict
565
  pi0_model.load_state_dict(all_params, strict=False)
566
-
567
  if precision == "float32":
568
  pi0_model = pi0_model.to(torch.float32)
569
  elif precision == "bfloat16":
@@ -573,10 +692,10 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
573
 
574
  # Save the converted model using safetensors
575
  os.makedirs(output_path, exist_ok=True)
576
-
577
  # Save model weights as SafeTensors using save_model to handle tied weights
578
  safetensors.torch.save_model(pi0_model, os.path.join(output_path, "model.safetensors"))
579
-
580
  # Copy assets folder if it exists
581
  assets_source = pathlib.Path(checkpoint_dir).parent / "assets"
582
  if assets_source.exists():
@@ -584,7 +703,7 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
584
  if assets_dest.exists():
585
  shutil.rmtree(assets_dest)
586
  shutil.copytree(assets_source, assets_dest)
587
-
588
  # Save config as JSON for reference
589
  config_dict = {
590
  "action_dim": pi0_config.action_dim,
@@ -595,37 +714,26 @@ def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str
595
  }
596
  with open(os.path.join(output_path, "config.json"), "w") as f:
597
  json.dump(config_dict, f, indent=2)
598
-
599
- print(f"Model conversion completed successfully!")
600
  print(f"Model saved to {output_path}")
601
 
602
 
603
  def main():
604
  parser = argparse.ArgumentParser(description="Load JAX model and optionally convert to PyTorch")
 
605
  parser.add_argument(
606
- "--checkpoint_dir",
607
- type=str,
608
- required=True,
609
- help="Path to the JAX checkpoint directory"
610
- )
611
- parser.add_argument(
612
- "--output_path",
613
- type=str,
614
- help="Path to save converted PyTorch model (required for conversion)"
615
  )
616
  parser.add_argument(
617
  "--precision",
618
  choices=["float32", "bfloat16", "float16"],
619
  default="bfloat16",
620
  type=str,
621
- help="Precision for model conversion"
622
- )
623
- parser.add_argument(
624
- "--inspect_only",
625
- action="store_true",
626
- help="Only inspect parameter keys, don't convert"
627
  )
628
-
 
629
  args = parser.parse_args()
630
 
631
  if not os.path.exists(args.checkpoint_dir):
@@ -633,7 +741,7 @@ def main():
633
  checkpoint_dir = openpi.shared.download.maybe_download(f"gs://openpi-assets/checkpoints/{model_name}")
634
  else:
635
  checkpoint_dir = args.checkpoint_dir
636
-
637
  if args.inspect_only:
638
  load_jax_model_and_print_keys(args.checkpoint_dir)
639
  else:
 
10
  # Just inspect keys:
11
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --inspect_only
12
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --inspect_only
13
+
14
  # Convert to PyTorch:
15
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --output_path /path/to/output
16
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --output_path /path/to/output
17
 
18
+ Example:
19
+ # pi0_droid
20
  python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid/params --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid_pytorch
21
 
22
  # pi0_aloha_sim
 
33
  import shutil
34
  import traceback
35
 
36
+ from flax.nnx.traversals import flatten_mapping
37
  import jax
38
  import jax.numpy as jnp
39
  import jax.sharding
40
  import numpy as np
41
  import orbax.checkpoint as ocp
 
42
  import safetensors
43
+ import torch
44
+
45
+ import openpi.models.gemma
46
+ import openpi.models.model
47
+ import openpi.models.pi0_config
48
 
49
  # Import our modules
50
  import openpi.models_pytorch.pi0_pytorch
 
 
51
  import openpi.shared.download
 
52
 
53
 
54
  def flatten_for_inspection(tree, separator="/"):
55
  """
56
  Flatten a nested dictionary for easy inspection of keys using flax.nnx.traversals.flatten_mapping.
57
+
58
  Args:
59
  tree: The nested dictionary (JAX pytree)
60
  separator: Separator to use between key levels
61
+
62
  Returns:
63
  Dictionary with flattened keys and array shapes as values
64
  """
65
  flattened = flatten_mapping(tree, separator=separator)
66
+
67
  # Convert values to shape/dtype information for inspection
68
  result = {}
69
  for key, value in flattened.items():
70
+ if hasattr(value, "shape") and hasattr(value, "dtype"):
71
  result[key] = f"shape: {value.shape}, dtype: {value.dtype}"
72
  else:
73
  result[key] = f"type: {type(value)}"
74
+
75
  return result
76
 
77
 
 
91
  """Convert PaliGemma JAX parameters to PyTorch format."""
92
  suffix = "/value" if "img/embedding/kernel/value" in state_dict else ""
93
 
 
94
  # patch embeddings
95
  jax_key = f"img/embedding/kernel{suffix}"
96
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight"
97
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose(3, 2, 0, 1)
98
+
 
99
  jax_key = f"img/embedding/bias{suffix}"
100
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias"
101
  state_dict[pytorch_key] = state_dict.pop(jax_key)
102
 
 
 
103
  # positional embeddings
104
  jax_key = f"img/pos_embedding{suffix}"
105
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.position_embedding.weight"
 
111
  encoderblock_layernorm1_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/scale{suffix}")
112
  encoderblock_layernorm1_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/bias{suffix}")
113
 
114
+ encoderblock_mlp_dense0_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/kernel{suffix}")
115
+ encoderblock_mlp_dense0_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/bias{suffix}")
116
+ encoderblock_mlp_dense1_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/kernel{suffix}")
117
+ encoderblock_mlp_dense1_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/bias{suffix}")
118
 
119
+ encoderblock_attention_0_key_kernel = state_dict.pop(
120
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/kernel{suffix}"
121
+ )
122
+ encoderblock_attention_0_key_bias = state_dict.pop(
123
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/bias{suffix}"
124
+ )
125
+ encoderblock_attention_0_value_kernel = state_dict.pop(
126
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/kernel{suffix}"
127
+ )
128
+ encoderblock_attention_0_value_bias = state_dict.pop(
129
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/bias{suffix}"
130
+ )
131
+ encoderblock_attention_0_query_kernel = state_dict.pop(
132
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/kernel{suffix}"
133
+ )
134
+ encoderblock_attention_0_query_bias = state_dict.pop(
135
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/bias{suffix}"
136
+ )
137
+ encoderblock_attention_0_out_kernel = state_dict.pop(
138
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/kernel{suffix}"
139
+ )
140
+ encoderblock_attention_0_out_bias = state_dict.pop(
141
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/bias{suffix}"
142
+ )
143
 
144
  for i in range(config.vision_config.num_hidden_layers):
145
+ state_dict[
146
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.weight"
147
+ ] = encoderblock_layernorm0_scale[i].transpose()
148
+ state_dict[
149
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.bias"
150
+ ] = encoderblock_layernorm0_bias[i]
151
+ state_dict[
152
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.weight"
153
+ ] = encoderblock_layernorm1_scale[i].transpose()
154
+ state_dict[
155
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.bias"
156
+ ] = encoderblock_layernorm1_bias[i]
157
+ state_dict[
158
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.weight"
159
+ ] = encoderblock_mlp_dense0_kernel[i].transpose()
160
+ state_dict[
161
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.bias"
162
+ ] = encoderblock_mlp_dense0_bias[i]
163
+ state_dict[
164
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.weight"
165
+ ] = encoderblock_mlp_dense1_kernel[i].transpose()
166
+ state_dict[
167
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.bias"
168
+ ] = encoderblock_mlp_dense1_bias[i]
169
+ state_dict[
170
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"
171
+ ] = encoderblock_attention_0_key_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
172
+ state_dict[
173
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"
174
+ ] = encoderblock_attention_0_key_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
175
+ state_dict[
176
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"
177
+ ] = encoderblock_attention_0_value_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
178
+ state_dict[
179
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"
180
+ ] = encoderblock_attention_0_value_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
181
+ state_dict[
182
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"
183
+ ] = encoderblock_attention_0_query_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
184
+ state_dict[
185
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"
186
+ ] = encoderblock_attention_0_query_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
187
+ state_dict[
188
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.weight"
189
+ ] = encoderblock_attention_0_out_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
190
+ state_dict[
191
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.bias"
192
+ ] = encoderblock_attention_0_out_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
193
 
194
  jax_key = f"img/Transformer/encoder_norm/scale{suffix}"
195
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.weight"
196
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
197
+
198
  jax_key = f"img/Transformer/encoder_norm/bias{suffix}"
199
  pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.bias"
200
  state_dict[pytorch_key] = state_dict.pop(jax_key)
201
 
202
  # multimodal projector
203
  jax_key = f"img/head/kernel{suffix}"
204
+ pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.weight"
205
  state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
206
+
207
  jax_key = f"img/head/bias{suffix}"
208
+ pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.bias"
209
  state_dict[pytorch_key] = state_dict.pop(jax_key)
210
 
211
  # text decoder (gemma)
 
225
  llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm/scale{suffix}")
226
 
227
  for i in range(config.text_config.num_hidden_layers):
228
+ q_proj_weight_reshaped = (
229
+ llm_attention_q_einsum[i]
230
+ .transpose(0, 2, 1)
231
+ .reshape(
232
+ config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
233
+ )
234
+ )
235
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.q_proj.weight"] = (
236
+ q_proj_weight_reshaped
237
+ )
238
 
239
  k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
240
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.k_proj.weight"] = (
241
+ k_proj_weight_reshaped
242
+ )
243
  v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
244
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.v_proj.weight"] = (
245
+ v_proj_weight_reshaped
246
+ )
247
+
248
+ o_proj_weight_reshaped = (
249
+ llm_attention_attn_vec_einsum[i]
250
+ .transpose(2, 0, 1)
251
+ .reshape(
252
+ config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
253
+ )
254
+ )
255
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.o_proj.weight"] = (
256
+ o_proj_weight_reshaped
257
+ )
258
 
 
 
 
259
  gate_proj_weight = llm_mlp_gating_einsum[i, 0]
260
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.gate_proj.weight"] = (
261
+ gate_proj_weight.transpose()
262
+ )
263
  up_proj_weight = llm_mlp_gating_einsum[i, 1]
264
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.up_proj.weight"] = (
265
+ up_proj_weight.transpose()
266
+ )
267
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.down_proj.weight"] = (
268
+ llm_mlp_linear[i].transpose()
269
+ )
270
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.input_layernorm.weight"] = (
271
+ llm_input_layernorm[i]
272
+ )
273
+ state_dict[
274
+ f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.post_attention_layernorm.weight"
275
+ ] = llm_post_attention_layernorm[i]
276
 
277
  jax_key = f"llm/final_norm/scale{suffix}"
278
  pytorch_key = "paligemma_with_expert.paligemma.model.language_model.norm.weight"
 
280
 
281
  expert_dict = {}
282
  final_state_dict = {}
283
+
284
  # Expert-related keys to extract (including pi05 Dense layer parameters)
285
  expert_keys = [
286
  f"llm/final_norm_1/scale{suffix}",
 
298
  f"llm/layers/pre_ffw_norm_1/Dense_0/bias{suffix}",
299
  f"llm/layers/pre_ffw_norm_1/Dense_0/kernel{suffix}",
300
  ]
301
+
302
  for key, value in state_dict.items():
303
  if key not in expert_keys:
304
  final_state_dict[key] = torch.from_numpy(value)
 
311
  def slice_gemma_state_dict(state_dict, config, num_expert=1, checkpoint_dir=None):
312
  """Convert Gemma JAX parameters to PyTorch format."""
313
  # Add missing attributes to config if they don't exist
314
+ if not hasattr(config, "vocab_size"):
315
  config.vocab_size = 257152 # PALIGEMMA_VOCAB_SIZE
316
+ if not hasattr(config, "hidden_size"):
317
  config.hidden_size = config.width
318
+ if not hasattr(config, "num_hidden_layers"):
319
  config.num_hidden_layers = config.depth
320
+ if not hasattr(config, "num_attention_heads"):
321
  config.num_attention_heads = config.num_heads
322
 
323
  suffix = "/value" if f"llm/layers/attn/attn_vec_einsum_{num_expert}/w/value" in state_dict else ""
 
334
  # Pi05 with adaptive normalization
335
  llm_input_layernorm_bias = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/bias{suffix}")
336
  llm_post_attention_layernorm_bias = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/bias{suffix}")
337
+ llm_input_layernorm_kernel = state_dict.pop(
338
+ f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/kernel{suffix}"
339
+ )
340
+ llm_post_attention_layernorm_kernel = state_dict.pop(
341
+ f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/kernel{suffix}"
342
+ )
343
  else:
344
  # Regular pi0 with standard RMSNorm
345
  llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/scale{suffix}")
346
  llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/scale{suffix}")
347
 
 
348
  for i in range(config.num_hidden_layers):
349
+ q_proj_weight_reshaped = (
350
+ llm_attention_q_einsum[i]
351
+ .transpose(0, 2, 1)
352
+ .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
353
+ )
354
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.q_proj.weight"] = (
355
+ q_proj_weight_reshaped
356
+ )
357
 
358
  k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
359
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.k_proj.weight"] = (
360
+ k_proj_weight_reshaped
361
+ )
362
  v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
363
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.v_proj.weight"] = (
364
+ v_proj_weight_reshaped
365
+ )
366
+
367
+ o_proj_weight_reshaped = (
368
+ llm_attention_attn_vec_einsum[i]
369
+ .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
370
+ .transpose(1, 0)
371
+ )
372
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.o_proj.weight"] = (
373
+ o_proj_weight_reshaped
374
+ )
375
 
 
 
 
376
  gate_proj_weight = llm_mlp_gating_einsum[i, 0]
377
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.gate_proj.weight"] = (
378
+ gate_proj_weight.transpose()
379
+ )
380
  up_proj_weight = llm_mlp_gating_einsum[i, 1]
381
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.up_proj.weight"] = (
382
+ up_proj_weight.transpose()
383
+ )
384
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.down_proj.weight"] = llm_mlp_linear[
385
+ i
386
+ ].transpose()
387
 
388
  if "pi05" in checkpoint_dir:
389
  # Pi05 with adaptive normalization - use Dense layer parameters directly
390
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.bias"] = (
391
+ llm_input_layernorm_bias[i]
392
+ )
393
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.bias"] = (
394
+ llm_post_attention_layernorm_bias[i]
395
+ )
396
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.weight"] = (
397
+ llm_input_layernorm_kernel[i].transpose()
398
+ )
399
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.weight"] = (
400
+ llm_post_attention_layernorm_kernel[i].transpose()
401
+ )
402
  else:
403
  # Regular pi0 with standard RMSNorm
404
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.weight"] = (
405
+ llm_input_layernorm[i]
406
+ )
407
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.weight"] = (
408
+ llm_post_attention_layernorm[i]
409
+ )
410
 
411
  # Handle final norm layer
412
  if "pi05" in checkpoint_dir:
 
417
  state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.weight"] = final_norm_kernel.transpose()
418
  else:
419
  # Regular pi0 with standard RMSNorm
420
+ state_dict["paligemma_with_expert.gemma_expert.model.norm.weight"] = state_dict.pop(
421
+ f"llm/final_norm_{num_expert}/scale{suffix}"
422
+ )
423
+
424
+ # state_dict["paligemma_with_expert.gemma_expert.lm_head.weight"] = embedding_vector # weights are tied.
425
 
426
  final_state_dict = {}
427
  for key, value in state_dict.items():
 
429
  final_state_dict[key] = torch.from_numpy(value)
430
  else:
431
  final_state_dict[key] = value
 
432
 
433
  return final_state_dict
434
 
 
451
  restore_dtype = dtype_map.get(restore_precision) if restore_precision else None
452
 
453
  # Use CPU sharding to avoid GPU memory issues during checkpoint loading
454
+ cpu_device = jax.devices("cpu")[0]
455
  cpu_sharding = jax.sharding.SingleDeviceSharding(cpu_device)
456
+
457
  # Use repository restore utility to load a pure dict of params (value suffix removed)
458
+ params = openpi.models.model.restore_params(
459
+ params_dir, restore_type=jax.Array, dtype=restore_dtype, sharding=cpu_sharding
460
+ )
461
 
462
  # get params for PaliGemma
463
  pali_params = params["PaliGemma"]
 
469
  def load_jax_model_and_print_keys(checkpoint_dir: str):
470
  """
471
  Load JAX model from checkpoint and print all parameter keys.
472
+
473
  Args:
474
  checkpoint_dir: Path to the checkpoint directory
475
  """
476
  params_path = pathlib.Path(checkpoint_dir).resolve()
477
+
478
  if not params_path.exists():
479
  print(f"Error: Checkpoint directory does not exist: {params_path}")
480
  return
481
+
482
  try:
483
  # Initialize checkpointer
484
  checkpointer = ocp.PyTreeCheckpointer()
485
+
486
  # Load metadata to see available keys
487
  metadata = checkpointer.metadata(params_path)
488
  print("Available top-level keys in checkpoint:")
489
+ for key in metadata:
490
  print(f" - {key}")
491
  print()
492
+
493
  # Restore the parameters
494
  params_name = "params"
495
  if params_name not in metadata:
496
  print(f"Warning: '{params_name}' not found in metadata. Available keys: {list(metadata.keys())}")
497
  if metadata.keys():
498
+ params_name = next(iter(metadata.keys()))
499
  print(f"Using '{params_name}' instead.")
500
  else:
501
  print("No keys found in metadata!")
502
  return
503
+
504
  item = {params_name: metadata[params_name]}
505
  # Use CPU device to avoid GPU memory issues
506
+ device = jax.devices("cpu")[0]
507
  sharding = jax.sharding.SingleDeviceSharding(device)
508
+
509
  restored = checkpointer.restore(
510
  params_path,
511
  ocp.args.PyTreeRestore(
 
520
  transforms={},
521
  ),
522
  )
523
+
524
  params = restored[params_name]
525
+
526
  # Flatten and print all keys
527
  flat_params = flatten_for_inspection(params)
528
+
529
  print(f"All parameter keys with shapes and dtypes ({len(flat_params)} total):")
530
  print("=" * 80)
531
+
532
  # Sort keys for better readability
533
  sorted_keys = sorted(flat_params.keys())
534
+
535
  for key in sorted_keys:
536
  print(f"{key:<60} -> {flat_params[key]}")
537
+
538
  print()
539
  print("=" * 80)
540
  print(f"Summary: Found {len(flat_params)} parameters")
541
+
542
  # Print some high-level structure information
543
  top_level_keys = set()
544
  for key in sorted_keys:
545
+ top_level_key = key.split("/")[0]
546
  top_level_keys.add(top_level_key)
547
+
548
+ print(f"Top-level parameter groups: {sorted(top_level_keys)}")
549
+
550
  except Exception as e:
551
  print(f"Error loading checkpoint: {e}")
552
  traceback.print_exc()
 
555
  def convert_pi0_checkpoint(checkpoint_dir: str, precision: str, output_path: str):
556
  """
557
  Convert PI0 JAX checkpoint to PyTorch format.
558
+
559
  Args:
560
  checkpoint_dir: Path to the JAX checkpoint
561
  precision: Model precision (float32, bfloat16, float16)
562
  output_path: Path to save the converted PyTorch model
563
  """
564
  print(f"Converting PI0 checkpoint from {checkpoint_dir} to {output_path}")
565
+
566
  # Break down orbax ckpts by restoring via JAX to respect dtype
567
+ initial_params = slice_initial_orbax_checkpoint(checkpoint_dir=checkpoint_dir, restore_precision="float32")
568
+
569
  # Process projection params
570
  if "pi05" in checkpoint_dir:
571
  keys = [
572
+ "action_in_proj",
573
  "action_out_proj",
574
+ "time_mlp_in",
575
  "time_mlp_out",
576
  ]
577
  else:
578
  keys = [
579
  "state_proj",
580
+ "action_in_proj",
581
  "action_out_proj",
582
  "action_time_mlp_in",
583
  "action_time_mlp_out",
 
593
  else:
594
  weight = kernel_params
595
  bias = bias_params
596
+
597
  pytorch_weight_key = f"{key}.weight"
598
  pytorch_bias_key = f"{key}.bias"
599
+
600
  projection_params[pytorch_weight_key] = torch.from_numpy(np.array(weight)).T
601
  projection_params[pytorch_bias_key] = torch.from_numpy(np.array(bias))
602
 
 
604
  # All models use the same PaliGemma config structure
605
  class PaliGemmaConfig:
606
  def __init__(self):
607
+ self.vision_config = type(
608
+ "obj",
609
+ (object,),
610
+ {
611
+ "hidden_size": 1152,
612
+ "num_hidden_layers": 27,
613
+ "num_attention_heads": 16,
614
+ "intermediate_size": 4304,
615
+ "patch_size": 14,
616
+ "projection_dim": 2048,
617
+ },
618
+ )()
619
+ self.text_config = type(
620
+ "obj",
621
+ (object,),
622
+ {
623
+ "hidden_size": 2048,
624
+ "num_hidden_layers": 18,
625
+ "num_attention_heads": 8,
626
+ "head_dim": 256,
627
+ "intermediate_size": 16384,
628
+ },
629
+ )()
630
+
631
  paligemma_config = PaliGemmaConfig()
632
  action_expert_config = openpi.models.gemma.get_config("gemma_300m")
633
 
 
635
  paligemma_params, expert_params = slice_paligemma_state_dict(initial_params["paligemma_params"], paligemma_config)
636
 
637
  # Process Gemma weights from expert_params
638
+ gemma_params = slice_gemma_state_dict(
639
+ expert_params, action_expert_config, num_expert=1, checkpoint_dir=checkpoint_dir
640
+ )
641
 
642
  # Create Pi0Config based on checkpoint path
643
+ if "pi0_aloha_sim" in checkpoint_dir or "pi0_aloha_towel" in checkpoint_dir:
 
 
 
 
 
644
  pi0_config = openpi.models.pi0_config.Pi0Config(
645
  action_dim=14, # ALOHA has 14 action dimensions
646
  action_horizon=50,
647
  )
648
  elif "pi0_base" in checkpoint_dir:
649
  pi0_config = openpi.models.pi0_config.Pi0Config(
650
+ action_dim=8, # Base droid has 8 action dimensions
651
  action_horizon=10,
652
  )
653
  elif "pi05_droid" in checkpoint_dir:
654
  pi0_config = openpi.models.pi0_config.Pi0Config(
655
+ action_dim=8, # Base droid has 8 action dimensions
656
  action_horizon=10,
657
  pi05=True,
658
  )
 
679
 
680
  # Combine all parameters (no prefix needed for our model structure)
681
  all_params = {**paligemma_params, **gemma_params, **projection_params}
682
+
683
  # Load state dict
684
  pi0_model.load_state_dict(all_params, strict=False)
685
+
686
  if precision == "float32":
687
  pi0_model = pi0_model.to(torch.float32)
688
  elif precision == "bfloat16":
 
692
 
693
  # Save the converted model using safetensors
694
  os.makedirs(output_path, exist_ok=True)
695
+
696
  # Save model weights as SafeTensors using save_model to handle tied weights
697
  safetensors.torch.save_model(pi0_model, os.path.join(output_path, "model.safetensors"))
698
+
699
  # Copy assets folder if it exists
700
  assets_source = pathlib.Path(checkpoint_dir).parent / "assets"
701
  if assets_source.exists():
 
703
  if assets_dest.exists():
704
  shutil.rmtree(assets_dest)
705
  shutil.copytree(assets_source, assets_dest)
706
+
707
  # Save config as JSON for reference
708
  config_dict = {
709
  "action_dim": pi0_config.action_dim,
 
714
  }
715
  with open(os.path.join(output_path, "config.json"), "w") as f:
716
  json.dump(config_dict, f, indent=2)
717
+
718
+ print("Model conversion completed successfully!")
719
  print(f"Model saved to {output_path}")
720
 
721
 
722
  def main():
723
  parser = argparse.ArgumentParser(description="Load JAX model and optionally convert to PyTorch")
724
+ parser.add_argument("--checkpoint_dir", type=str, required=True, help="Path to the JAX checkpoint directory")
725
  parser.add_argument(
726
+ "--output_path", type=str, help="Path to save converted PyTorch model (required for conversion)"
 
 
 
 
 
 
 
 
727
  )
728
  parser.add_argument(
729
  "--precision",
730
  choices=["float32", "bfloat16", "float16"],
731
  default="bfloat16",
732
  type=str,
733
+ help="Precision for model conversion",
 
 
 
 
 
734
  )
735
+ parser.add_argument("--inspect_only", action="store_true", help="Only inspect parameter keys, don't convert")
736
+
737
  args = parser.parse_args()
738
 
739
  if not os.path.exists(args.checkpoint_dir):
 
741
  checkpoint_dir = openpi.shared.download.maybe_download(f"gs://openpi-assets/checkpoints/{model_name}")
742
  else:
743
  checkpoint_dir = args.checkpoint_dir
744
+
745
  if args.inspect_only:
746
  load_jax_model_and_print_keys(args.checkpoint_dir)
747
  else:
examples/droid/convert_droid_data_to_lerobot.py CHANGED
@@ -277,7 +277,7 @@ class RecordedMultiCameraWrapper:
277
  self.camera_kwargs = camera_kwargs
278
 
279
  # Open Camera Readers #
280
- mp4_filepaths = glob.glob(recording_folderpath + "/*.mp4") # noqa: PTH207
281
  all_filepaths = mp4_filepaths
282
 
283
  self.camera_dict = {}
 
277
  self.camera_kwargs = camera_kwargs
278
 
279
  # Open Camera Readers #
280
+ mp4_filepaths = glob.glob(recording_folderpath + "/*.mp4")
281
  all_filepaths = mp4_filepaths
282
 
283
  self.camera_dict = {}
pyproject.toml CHANGED
@@ -73,7 +73,7 @@ members = ["packages/*"]
73
  [tool.ruff]
74
  line-length = 120
75
  target-version = "py311"
76
- extend-exclude = ["docker", "third_party"]
77
 
78
  [tool.ruff.lint]
79
  # https://docs.astral.sh/ruff/rules/
@@ -101,7 +101,6 @@ select = [
101
  "PLR5",
102
  "PLW",
103
  "PT",
104
- "PTH",
105
  "Q",
106
  "RET",
107
  "RUF",
 
73
  [tool.ruff]
74
  line-length = 120
75
  target-version = "py311"
76
+ extend-exclude = ["docker", "third_party", "src/openpi/models_pytorch/transformers_replace/*"]
77
 
78
  [tool.ruff.lint]
79
  # https://docs.astral.sh/ruff/rules/
 
101
  "PLR5",
102
  "PLW",
103
  "PT",
 
104
  "Q",
105
  "RET",
106
  "RUF",
scripts/train_pytorch.py CHANGED
@@ -23,7 +23,6 @@ Multi-Node Training:
23
 
24
  """
25
 
26
- import argparse
27
  import dataclasses
28
  import gc
29
  import logging
@@ -31,10 +30,10 @@ import os
31
  import platform
32
  import shutil
33
  import time
34
- from typing import Any, Dict
35
 
36
  import jax
37
  import numpy as np
 
38
  import torch
39
  import torch.distributed as dist
40
  import torch.nn.parallel
@@ -42,162 +41,169 @@ import torch.utils.data
42
  import torch.utils.data.distributed
43
  import tqdm
44
  import wandb
45
- import safetensors.torch
46
 
 
 
47
  import openpi.training.config as _config
48
  import openpi.training.data_loader as _data
49
- import openpi.models.model as _model
50
- import openpi.models_pytorch.pi0_pytorch
51
- import openpi.models.pi0_config
52
 
53
 
54
  def init_logging():
55
- level_mapping = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}
56
-
57
- class CustomFormatter(logging.Formatter):
58
- def format(self, record):
59
- record.levelname = level_mapping.get(record.levelname, record.levelname)
60
- return super().format(record)
61
-
62
- formatter = CustomFormatter(
63
- fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
64
- datefmt="%H:%M:%S",
65
- )
66
- logger = logging.getLogger()
67
- logger.setLevel(logging.INFO)
68
- if not logger.handlers:
69
- ch = logging.StreamHandler()
70
- ch.setFormatter(formatter)
71
- logger.addHandler(ch)
72
- else:
73
- logger.handlers[0].setFormatter(formatter)
74
 
75
 
76
  def init_wandb(config: _config.TrainConfig, *, resuming: bool, enabled: bool = True):
77
- """Initialize wandb logging."""
78
- if not enabled:
79
- wandb.init(mode="disabled")
80
- return
81
-
82
- ckpt_dir = config.checkpoint_dir
83
- if not ckpt_dir.exists():
84
- raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
85
-
86
- if resuming:
87
- run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
88
- wandb.init(id=run_id, resume="must", project=config.project_name)
89
- else:
90
- wandb.init(
91
- name=config.exp_name,
92
- config=dataclasses.asdict(config),
93
- project=config.project_name,
94
- )
95
- (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
96
 
97
 
98
  def setup_ddp():
99
- world_size = int(os.environ.get("WORLD_SIZE", "1"))
100
- use_ddp = world_size > 1
101
- if use_ddp and not torch.distributed.is_initialized():
102
- backend = "nccl" if torch.cuda.is_available() else "gloo"
103
- torch.distributed.init_process_group(backend=backend, init_method="env://")
104
-
105
- # Set up debugging environment variables for DDP issues
106
- if os.environ.get("TORCH_DISTRIBUTED_DEBUG") is None:
107
- os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
108
-
109
- local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", "0")))
110
- device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
111
- if torch.cuda.is_available():
112
- torch.cuda.set_device(device)
113
- return use_ddp, local_rank, device
114
 
115
 
116
  def cleanup_ddp():
117
- if torch.distributed.is_initialized():
118
- torch.distributed.barrier()
119
- torch.distributed.destroy_process_group()
120
 
121
 
122
  def set_seed(seed: int, local_rank: int):
123
- torch.manual_seed(seed + local_rank)
124
- np.random.seed(seed + local_rank)
125
- if torch.cuda.is_available():
126
- torch.cuda.manual_seed_all(seed + local_rank)
127
 
128
 
129
  def build_datasets(config: _config.TrainConfig):
130
- # Use the unified data loader with PyTorch framework
131
- data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=True)
132
- return data_loader, data_loader.data_config()
133
 
134
 
135
  def get_model_state_dict(model):
136
- """Get state dict from model, handling DDP wrapper."""
137
- return model.module.state_dict() if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model.state_dict()
 
 
 
 
138
 
139
 
140
  def get_model_parameters(model):
141
- """Get parameters from model, handling DDP wrapper."""
142
- return model.module.parameters() if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model.parameters()
 
 
 
 
143
 
144
 
145
  def save_checkpoint(model, optimizer, global_step, config, is_main):
146
- """Save a checkpoint with model state, optimizer state, and metadata."""
147
- if not is_main:
148
- return
149
-
150
- # Only save if it's time to save or if it's the final step
151
- if (global_step % config.save_interval == 0 and global_step > 0) or global_step == config.num_train_steps - 1:
152
- # Create temporary directory for atomic checkpoint saving
153
- final_ckpt_dir = config.checkpoint_dir / f"{global_step}"
154
- tmp_ckpt_dir = config.checkpoint_dir / f"tmp_{global_step}"
155
-
156
- # Remove any existing temp directory and create new one
157
- if tmp_ckpt_dir.exists():
158
- shutil.rmtree(tmp_ckpt_dir)
159
- tmp_ckpt_dir.mkdir(parents=True, exist_ok=True)
160
-
161
- # Save model state using safetensors (handle shared tensors)
162
- model_to_save = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
163
- safetensors.torch.save_model(model_to_save, tmp_ckpt_dir / "pytorch_model.safetensors")
164
-
165
- # Save optimizer state using PyTorch format
166
- torch.save(optimizer.state_dict(), tmp_ckpt_dir / "optimizer.pt")
167
-
168
- # Save training metadata (avoid saving full config to prevent JAX/Flax compatibility issues)
169
- metadata = {
170
- "global_step": global_step,
171
- "config": dataclasses.asdict(config),
172
- "timestamp": time.time(),
173
- }
174
- torch.save(metadata, tmp_ckpt_dir / "metadata.pt")
175
-
176
- # Atomically move temp directory to final location
177
- if final_ckpt_dir.exists():
178
- shutil.rmtree(final_ckpt_dir)
179
- tmp_ckpt_dir.rename(final_ckpt_dir)
180
-
181
- logging.info(f"Saved checkpoint at step {global_step} -> {final_ckpt_dir}")
182
-
183
- # Log checkpoint to wandb
184
- if config.wandb_enabled:
185
- wandb.log({"checkpoint_step": global_step}, step=global_step)
186
 
187
 
188
  def load_checkpoint(model, optimizer, checkpoint_dir, device):
189
  """Load the latest checkpoint and return the global step."""
190
- checkpoint_steps = []
191
- for d in checkpoint_dir.iterdir():
192
- if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_"):
193
- checkpoint_steps.append(int(d.name))
194
-
 
195
  if not checkpoint_steps:
196
  raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
197
-
198
  latest_step = max(checkpoint_steps)
199
  ckpt_dir = checkpoint_dir / f"{latest_step}"
200
-
201
  # Clear memory before loading checkpoints
202
  if torch.cuda.is_available():
203
  torch.cuda.empty_cache()
@@ -208,35 +214,34 @@ def load_checkpoint(model, optimizer, checkpoint_dir, device):
208
  # Load model state with error handling
209
  logging.info("Loading model state...")
210
  safetensors_path = ckpt_dir / "pytorch_model.safetensors"
211
-
212
  if safetensors_path.exists():
213
  model_to_load = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
214
  safetensors.torch.load_model(model_to_load, safetensors_path, device=str(device))
215
  logging.info("Loaded model state from safetensors format")
216
  else:
217
  raise FileNotFoundError(f"No model checkpoint found at {ckpt_dir}")
218
-
219
  torch.cuda.empty_cache()
220
  gc.collect()
221
  log_memory_usage(device, latest_step, "after_loading_model")
222
-
223
  # Load optimizer state with error handling
224
  logging.info("Loading optimizer state...")
225
  optimizer_path = ckpt_dir / "optimizer.pt"
226
-
227
  if optimizer_path.exists():
228
  optimizer_state_dict = torch.load(optimizer_path, map_location=device, weights_only=False)
229
  logging.info("Loaded optimizer state from pt format")
230
  else:
231
  raise FileNotFoundError(f"No optimizer checkpoint found at {ckpt_dir}")
232
-
233
  optimizer.load_state_dict(optimizer_state_dict)
234
  del optimizer_state_dict
235
  torch.cuda.empty_cache()
236
  gc.collect()
237
  log_memory_usage(device, latest_step, "after_loading_optimizer")
238
-
239
-
240
  # Load metadata
241
  logging.info("Loading metadata...")
242
  metadata = torch.load(ckpt_dir / "metadata.pt", map_location=device, weights_only=False)
@@ -245,355 +250,379 @@ def load_checkpoint(model, optimizer, checkpoint_dir, device):
245
  torch.cuda.empty_cache()
246
  gc.collect()
247
  log_memory_usage(device, latest_step, "after_loading_metadata")
248
-
249
  logging.info(f"Successfully loaded all checkpoint components from step {latest_step}")
250
  return global_step
251
-
252
  except RuntimeError as e:
253
  if "out of memory" in str(e):
254
  # Clear memory and provide detailed error message
255
  torch.cuda.empty_cache()
256
  gc.collect()
257
- logging.error(f"Out of memory error while loading checkpoint: {str(e)}")
258
  log_memory_usage(device, latest_step, "after_oom_error")
259
- raise RuntimeError(f"Out of memory while loading checkpoint. Try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True") from e
 
 
260
  raise
261
 
262
 
263
  def get_latest_checkpoint_step(checkpoint_dir):
264
- """Get the latest checkpoint step number from a checkpoint directory."""
265
- checkpoint_steps = []
266
- for d in checkpoint_dir.iterdir():
267
- if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_"):
268
- checkpoint_steps.append(int(d.name))
269
-
270
- return max(checkpoint_steps) if checkpoint_steps else None
271
 
272
 
273
  def log_memory_usage(device, step, phase="unknown"):
274
- """Log detailed memory usage information."""
275
- if not torch.cuda.is_available():
276
- return
277
-
278
- memory_allocated = torch.cuda.memory_allocated(device) / 1e9
279
- memory_reserved = torch.cuda.memory_reserved(device) / 1e9
280
- memory_free = torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)
281
- memory_free = memory_free / 1e9
282
-
283
- # Get more detailed memory info
284
- memory_stats = torch.cuda.memory_stats(device)
285
- max_memory_allocated = memory_stats.get('allocated_bytes.all.peak', 0) / 1e9
286
- max_memory_reserved = memory_stats.get('reserved_bytes.all.peak', 0) / 1e9
287
-
288
- # Get DDP info if available
289
- ddp_info = ""
290
- if dist.is_initialized():
291
- ddp_info = f" | DDP: rank={dist.get_rank()}, world_size={dist.get_world_size()}"
292
-
293
- logging.info(f"Step {step} ({phase}): GPU memory - allocated: {memory_allocated:.2f}GB, reserved: {memory_reserved:.2f}GB, free: {memory_free:.2f}GB, peak_allocated: {max_memory_allocated:.2f}GB, peak_reserved: {max_memory_reserved:.2f}GB{ddp_info}")
 
 
294
 
295
 
296
  def train_loop(config: _config.TrainConfig):
297
- use_ddp, local_rank, device = setup_ddp()
298
- is_main = (not use_ddp) or (dist.get_rank() == 0)
299
- set_seed(config.seed, local_rank)
300
-
301
- # Initialize checkpoint directory and wandb
302
- resuming = False
303
- if config.resume:
304
- # Find checkpoint directory based on experiment name
305
- exp_checkpoint_dir = config.checkpoint_dir
306
- if exp_checkpoint_dir.exists():
307
- # Use validation to find the latest working checkpoint
308
- latest_step = get_latest_checkpoint_step(exp_checkpoint_dir)
309
- if latest_step is not None:
310
- resuming = True
311
- logging.info(f"Resuming from experiment checkpoint directory: {exp_checkpoint_dir} at step {latest_step}")
312
- else:
313
- raise FileNotFoundError(f"No valid checkpoints found in {exp_checkpoint_dir} for resume")
314
- else:
315
- raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
316
- elif config.overwrite and config.checkpoint_dir.exists():
317
- shutil.rmtree(config.checkpoint_dir)
318
- logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
319
-
320
- # Create checkpoint directory with experiment name
321
- if not resuming:
322
- # For new runs, create experiment-specific checkpoint directory
323
- exp_checkpoint_dir = config.checkpoint_dir
324
- exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
325
- logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
326
- else:
327
- # For resume, checkpoint_dir is already set to the experiment directory
328
- logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
329
-
330
- # Initialize wandb (only on main process)
331
- if is_main:
332
- init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
333
-
334
- # Build data loader using the unified data loader
335
- # Calculate effective batch size per GPU for DDP
336
- # For N GPUs, each GPU should get batch_size/N samples, so total across all GPUs is batch_size
337
- world_size = torch.distributed.get_world_size() if use_ddp else 1
338
- effective_batch_size = config.batch_size // world_size
339
- logging.info(f"Using batch size per GPU: {effective_batch_size} (total batch size across {world_size} GPUs: {config.batch_size})")
340
-
341
- # Pass the original batch size to data loader - it will handle DDP splitting internally
342
- loader, _ = build_datasets(config)
343
-
344
- # Log sample images to wandb on first batch
345
- if is_main and config.wandb_enabled and not resuming:
346
- # Create a separate data loader for sample batch to avoid consuming the main loader
347
- sample_data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=False)
348
- sample_batch = next(iter(sample_data_loader))
349
- # Convert observation and actions to torch tensors
350
- observation, actions = sample_batch
351
- sample_batch = observation.to_dict()
352
- sample_batch["actions"] = actions
353
-
354
- # Create sample images for wandb
355
- images_to_log = []
356
- # Get batch size from the first image tensor
357
- batch_size = next(iter(sample_batch['image'].values())).shape[0]
358
- for i in range(min(5, batch_size)):
359
- # Concatenate all camera views horizontally for this batch item
360
- # Convert from NCHW to NHWC format for wandb
361
- img_concatenated = torch.cat([img[i].permute(1, 2, 0) for img in sample_batch['image'].values()], axis=1)
362
- img_concatenated = img_concatenated.cpu().numpy()
363
- images_to_log.append(wandb.Image(img_concatenated))
364
-
365
- wandb.log({"camera_views": images_to_log}, step=0)
366
-
367
- # Clear sample batch from memory aggressively
368
- del sample_batch, observation, actions, images_to_log, img_concatenated
369
- del sample_data_loader # Also delete the sample data loader
370
- gc.collect()
371
- if torch.cuda.is_available():
372
- torch.cuda.empty_cache()
373
- logging.info("Cleared sample batch and data loader from memory")
374
-
375
- # Build model
376
- if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
377
- # Convert dataclass to Pi0Config if needed
378
- model_cfg = openpi.models.pi0_config.Pi0Config(
379
- dtype=config.pytorch_training_precision,
380
- action_dim=config.model.action_dim,
381
- action_horizon=config.model.action_horizon,
382
- max_token_len=config.model.max_token_len,
383
- paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
384
- action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
385
- pi05=getattr(config.model, "pi05", False),
386
- )
387
- else:
388
- model_cfg = config.model
389
- # Update dtype to match pytorch_training_precision
390
- object.__setattr__(model_cfg, "dtype", config.pytorch_training_precision)
391
-
392
- model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_cfg).to(device)
393
-
394
-
395
- if hasattr(model, 'gradient_checkpointing_enable'):
396
- enable_gradient_checkpointing = True
397
- model.gradient_checkpointing_enable()
398
- logging.info("Enabled gradient checkpointing for memory optimization")
399
- else:
400
- enable_gradient_checkpointing = False
401
- logging.info("Gradient checkpointing is not supported for this model")
402
-
403
- # Log initial memory usage after model creation
404
- if is_main and torch.cuda.is_available():
405
- log_memory_usage(device, 0, "after_model_creation")
406
-
407
- # Enable memory optimizations for large-scale training
408
- if world_size >= 8:
409
- torch.backends.cudnn.benchmark = True
410
- torch.backends.cuda.matmul.allow_tf32 = True
411
- torch.backends.cudnn.allow_tf32 = True
412
- # Set memory allocation configuration
413
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
414
- logging.info("Enabled memory optimizations for 8+ GPU training")
415
-
416
- if use_ddp:
417
- model = torch.nn.parallel.DistributedDataParallel(
418
- model,
419
- device_ids=[device.index] if device.type == "cuda" else None,
420
- find_unused_parameters=True, # Disable for memory efficiency
421
- gradient_as_bucket_view=True, # Enable for memory efficiency
422
- static_graph=True if world_size >= 8 else False, # Enable for 8+ GPUs
423
- )
424
-
425
- # Load weights from weight_loader if specified (for fine-tuning)
426
- if config.pytorch_weight_path is not None:
427
- logging.info(f"Loading weights from: {config.pytorch_weight_path}")
428
-
429
- model_path = os.path.join(config.pytorch_weight_path, "model.safetensors")
430
- safetensors.torch.load_model((model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model), model_path)
431
- logging.info(f"Loaded PyTorch weights from {config.pytorch_weight_path}")
432
-
433
- # Optimizer + learning rate schedule from config
434
- warmup_steps = config.lr_schedule.warmup_steps
435
- peak_lr = config.lr_schedule.peak_lr
436
- decay_steps = config.lr_schedule.decay_steps
437
- end_lr = config.lr_schedule.decay_lr
438
-
439
- # Create optimizer with config parameters
440
- optim = torch.optim.AdamW(
441
- model.parameters(),
442
- lr=peak_lr,
443
- betas=(config.optimizer.b1, config.optimizer.b2),
444
- eps=config.optimizer.eps,
445
- weight_decay=config.optimizer.weight_decay
446
- )
447
-
448
-
449
- # Load checkpoint if resuming
450
- global_step = 0
451
- if resuming:
452
- global_step = load_checkpoint(model, optim, config.checkpoint_dir, device)
453
- logging.info(f"Resumed training from step {global_step}")
454
-
455
- def lr_schedule(step: int):
456
- if step < warmup_steps:
457
- # Match JAX behavior: start from peak_lr / (warmup_steps + 1)
458
- init_lr = peak_lr / (warmup_steps + 1)
459
- return init_lr + (peak_lr - init_lr) * step / warmup_steps
460
- # cosine decay
461
- progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
462
- cos = 0.5 * (1 + np.cos(np.pi * progress))
463
- return end_lr + (peak_lr - end_lr) * cos
464
-
465
- model.train()
466
- start_time = time.time()
467
- infos = [] # Collect stats over log interval
468
- if is_main:
469
- logging.info(f"Running on: {platform.node()} | world_size={torch.distributed.get_world_size() if use_ddp else 1}")
470
- logging.info(f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}")
471
- logging.info(f"Memory optimizations: gradient_checkpointing={enable_gradient_checkpointing}")
472
- logging.info(f"LR schedule: warmup={warmup_steps}, peak_lr={peak_lr:.2e}, decay_steps={decay_steps}, end_lr={end_lr:.2e}")
473
- logging.info(f"Optimizer: {type(config.optimizer).__name__}, weight_decay={config.optimizer.weight_decay}, clip_norm={config.optimizer.clip_gradient_norm}")
474
- logging.info(f"EMA is not supported for PyTorch training")
475
- logging.info(f"Training precision: {model_cfg.dtype}")
476
-
477
- # Training loop - iterate until we reach num_train_steps
478
- pbar = tqdm.tqdm(total=config.num_train_steps, initial=global_step, desc="Training", disable=not is_main) if is_main else None
479
-
480
- while global_step < config.num_train_steps:
481
- # Set epoch for distributed training
482
- if use_ddp and hasattr(loader, 'set_epoch'):
483
- loader.set_epoch(global_step // len(loader))
484
-
485
- for observation, actions in loader:
486
- # Check if we've reached the target number of steps
487
- if global_step >= config.num_train_steps:
488
- break
489
-
490
- # The unified data loader returns (observation, actions) tuple
491
- observation = jax.tree.map(lambda x: x.to(device), observation)
492
- actions = actions.to(torch.float32)
493
- actions = actions.to(device)
494
-
495
- # Update LR
496
- for pg in optim.param_groups:
497
- pg["lr"] = lr_schedule(global_step)
498
-
499
- # Forward pass
500
- losses = model(observation, actions)
501
- # Ensure losses is a tensor and handle different return types
502
- if isinstance(losses, (list, tuple)):
503
- losses = torch.stack(losses)
504
- elif not isinstance(losses, torch.Tensor):
505
- losses = torch.tensor(losses, device=device, dtype=torch.float32)
506
-
507
- loss = losses.mean()
508
-
509
- # Backward pass
510
- loss.backward()
511
-
512
- # Log memory usage after backward pass
513
- if global_step < 5 and is_main:
514
- if torch.cuda.is_available():
515
- log_memory_usage(device, global_step, "after_backward")
516
-
517
- # Gradient clipping
518
- grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.optimizer.clip_gradient_norm)
519
-
520
- # Optimizer step
521
- optim.step()
522
- optim.zero_grad(set_to_none=True)
523
-
524
- # Clear gradients more aggressively
525
- for param in model.parameters():
526
- if param.grad is not None:
527
- param.grad.detach_()
528
- param.grad = None
529
-
530
-
531
- # Collect stats
532
- if is_main:
533
- infos.append({
534
- "loss": loss.item(),
535
- "learning_rate": optim.param_groups[0]['lr'],
536
- "grad_norm": float(grad_norm) if isinstance(grad_norm, torch.Tensor) else grad_norm,
537
- })
538
-
539
- if is_main and (global_step % config.log_interval == 0):
540
- elapsed = time.time() - start_time
541
-
542
- # Average stats over log interval
543
- avg_loss = sum(info["loss"] for info in infos) / len(infos)
544
- avg_lr = sum(info["learning_rate"] for info in infos) / len(infos)
545
-
546
- avg_grad_norm = None
547
- if any('grad_norm' in info for info in infos):
548
- vals = [info['grad_norm'] for info in infos if 'grad_norm' in info and info['grad_norm'] is not None]
549
- if len(vals) > 0:
550
- avg_grad_norm = sum(vals) / len(vals)
551
- logging.info(f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} grad_norm={avg_grad_norm:.2f} time={elapsed:.1f}s" if avg_grad_norm is not None else f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} time={elapsed:.1f}s")
552
-
553
- # Log to wandb
554
- if config.wandb_enabled and len(infos) > 0:
555
- log_payload = {
556
- "loss": avg_loss,
557
- "learning_rate": avg_lr,
558
- "step": global_step,
559
- "time_per_step": elapsed / config.log_interval,
560
- }
561
- if avg_grad_norm is not None:
562
- log_payload["grad_norm"] = avg_grad_norm
563
- wandb.log(log_payload, step=global_step)
564
-
565
- start_time = time.time()
566
- infos = [] # Reset stats collection
567
-
568
- global_step += 1
569
- # Save checkpoint using the new mechanism
570
- save_checkpoint(model, optim, global_step, config, is_main)
571
-
572
- # Update progress bar
573
- if pbar is not None:
574
- pbar.update(1)
575
- pbar.set_postfix({
576
- 'loss': f'{loss.item():.4f}',
577
- 'lr': f'{optim.param_groups[0]["lr"]:.2e}',
578
- 'step': global_step
579
- })
580
-
581
- # Close progress bar
582
- if pbar is not None:
583
- pbar.close()
584
-
585
- # Finish wandb run
586
- if is_main and config.wandb_enabled:
587
- wandb.finish()
588
-
589
- cleanup_ddp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
 
591
 
592
  def main():
593
- init_logging()
594
- config = _config.cli()
595
- train_loop(config)
596
 
597
 
598
  if __name__ == "__main__":
599
- main()
 
23
 
24
  """
25
 
 
26
  import dataclasses
27
  import gc
28
  import logging
 
30
  import platform
31
  import shutil
32
  import time
 
33
 
34
  import jax
35
  import numpy as np
36
+ import safetensors.torch
37
  import torch
38
  import torch.distributed as dist
39
  import torch.nn.parallel
 
41
  import torch.utils.data.distributed
42
  import tqdm
43
  import wandb
 
44
 
45
+ import openpi.models.pi0_config
46
+ import openpi.models_pytorch.pi0_pytorch
47
  import openpi.training.config as _config
48
  import openpi.training.data_loader as _data
 
 
 
49
 
50
 
51
def init_logging():
    """Configure the root logger with a compact, single-letter level format."""
    short_levels = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}

    class _ShortLevelFormatter(logging.Formatter):
        # Rewrite the level name to its one-letter alias before formatting.
        def format(self, record):
            record.levelname = short_levels.get(record.levelname, record.levelname)
            return super().format(record)

    formatter = _ShortLevelFormatter(
        fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
        datefmt="%H:%M:%S",
    )
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    if root.handlers:
        # Reuse the existing handler so repeated calls don't duplicate output.
        root.handlers[0].setFormatter(formatter)
    else:
        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        root.addHandler(handler)
71
 
72
 
73
def init_wandb(config: _config.TrainConfig, *, resuming: bool, enabled: bool = True):
    """Initialize wandb logging."""
    if not enabled:
        wandb.init(mode="disabled")
        return

    ckpt_dir = config.checkpoint_dir
    if not ckpt_dir.exists():
        raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")

    wandb_id_file = ckpt_dir / "wandb_id.txt"
    if resuming:
        # Reattach to the run that originally wrote this checkpoint directory.
        wandb.init(id=wandb_id_file.read_text().strip(), resume="must", project=config.project_name)
    else:
        wandb.init(
            name=config.exp_name,
            config=dataclasses.asdict(config),
            project=config.project_name,
        )
        # Persist the run id so a later resume can reattach to the same run.
        wandb_id_file.write_text(wandb.run.id)
93
 
94
 
95
def setup_ddp():
    """Initialize torch.distributed when launched with multiple processes.

    Returns:
        Tuple of (use_ddp, local_rank, device); use_ddp is True iff WORLD_SIZE > 1.
    """
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    use_ddp = world_size > 1
    if use_ddp and not torch.distributed.is_initialized():
        torch.distributed.init_process_group(
            backend="nccl" if torch.cuda.is_available() else "gloo",
            init_method="env://",
        )
        # Surface DDP diagnostics unless the caller already configured them.
        os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "INFO")

    # torchrun sets LOCAL_RANK; fall back to RANK, then to rank 0.
    local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", "0")))
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")
    return use_ddp, local_rank, device
111
 
112
 
113
def cleanup_ddp():
    """Tear down the distributed process group; no-op outside of DDP."""
    if not torch.distributed.is_initialized():
        return
    # Synchronize all ranks before destroying the group.
    torch.distributed.barrier()
    torch.distributed.destroy_process_group()
117
 
118
 
119
def set_seed(seed: int, local_rank: int):
    """Seed the torch and numpy RNGs, offset by rank so DDP workers differ."""
    effective_seed = seed + local_rank
    torch.manual_seed(effective_seed)
    np.random.seed(effective_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(effective_seed)
124
 
125
 
126
def build_datasets(config: _config.TrainConfig):
    """Create the PyTorch training data loader for this training config.

    Returns:
        Tuple of (data_loader, data_config) — the loader plus its resolved data config.
    """
    # Use the unified data loader with PyTorch framework
    data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=True)
    return data_loader, data_loader.data_config()
130
 
131
 
132
def get_model_state_dict(model):
    """Get state dict from model, handling DDP wrapper."""
    # Unwrap DDP so checkpoint keys are not prefixed with "module.".
    target = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
    return target.state_dict()
139
 
140
 
141
def get_model_parameters(model):
    """Get parameters from model, handling DDP wrapper."""
    # Unwrap DDP so callers iterate the underlying module's parameters.
    target = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
    return target.parameters()
148
 
149
 
150
def save_checkpoint(model, optimizer, global_step, config, is_main):
    """Save a checkpoint with model state, optimizer state, and metadata.

    Only the main process writes; saving happens at every `config.save_interval`
    steps (skipping step 0) and at the final training step. The checkpoint is
    written to a temp directory first and then renamed into place so a partial
    write never appears under the final step directory.
    """
    if not is_main:
        return

    # Only save if it's time to save or if it's the final step
    if (global_step % config.save_interval == 0 and global_step > 0) or global_step == config.num_train_steps - 1:
        # Create temporary directory for atomic checkpoint saving
        # NOTE(review): rename() is only atomic when tmp and final live on the
        # same filesystem — true here since both are under checkpoint_dir.
        final_ckpt_dir = config.checkpoint_dir / f"{global_step}"
        tmp_ckpt_dir = config.checkpoint_dir / f"tmp_{global_step}"

        # Remove any existing temp directory and create new one
        if tmp_ckpt_dir.exists():
            shutil.rmtree(tmp_ckpt_dir)
        tmp_ckpt_dir.mkdir(parents=True, exist_ok=True)

        # Save model state using safetensors (handle shared tensors)
        model_to_save = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
        safetensors.torch.save_model(model_to_save, tmp_ckpt_dir / "pytorch_model.safetensors")

        # Save optimizer state using PyTorch format
        torch.save(optimizer.state_dict(), tmp_ckpt_dir / "optimizer.pt")

        # Save training metadata.
        # NOTE(review): despite the original "avoid saving full config" intent,
        # this still embeds dataclasses.asdict(config) — confirm asdict() output
        # is free of JAX/Flax objects that torch.save could fail to pickle.
        metadata = {
            "global_step": global_step,
            "config": dataclasses.asdict(config),
            "timestamp": time.time(),
        }
        torch.save(metadata, tmp_ckpt_dir / "metadata.pt")

        # Atomically move temp directory to final location
        if final_ckpt_dir.exists():
            shutil.rmtree(final_ckpt_dir)
        tmp_ckpt_dir.rename(final_ckpt_dir)

        logging.info(f"Saved checkpoint at step {global_step} -> {final_ckpt_dir}")

        # Log checkpoint to wandb
        if config.wandb_enabled:
            wandb.log({"checkpoint_step": global_step}, step=global_step)
191
 
192
 
193
  def load_checkpoint(model, optimizer, checkpoint_dir, device):
194
  """Load the latest checkpoint and return the global step."""
195
+ checkpoint_steps = [
196
+ int(d.name)
197
+ for d in checkpoint_dir.iterdir()
198
+ if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
199
+ ]
200
+
201
  if not checkpoint_steps:
202
  raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
203
+
204
  latest_step = max(checkpoint_steps)
205
  ckpt_dir = checkpoint_dir / f"{latest_step}"
206
+
207
  # Clear memory before loading checkpoints
208
  if torch.cuda.is_available():
209
  torch.cuda.empty_cache()
 
214
  # Load model state with error handling
215
  logging.info("Loading model state...")
216
  safetensors_path = ckpt_dir / "pytorch_model.safetensors"
217
+
218
  if safetensors_path.exists():
219
  model_to_load = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
220
  safetensors.torch.load_model(model_to_load, safetensors_path, device=str(device))
221
  logging.info("Loaded model state from safetensors format")
222
  else:
223
  raise FileNotFoundError(f"No model checkpoint found at {ckpt_dir}")
224
+
225
  torch.cuda.empty_cache()
226
  gc.collect()
227
  log_memory_usage(device, latest_step, "after_loading_model")
228
+
229
  # Load optimizer state with error handling
230
  logging.info("Loading optimizer state...")
231
  optimizer_path = ckpt_dir / "optimizer.pt"
232
+
233
  if optimizer_path.exists():
234
  optimizer_state_dict = torch.load(optimizer_path, map_location=device, weights_only=False)
235
  logging.info("Loaded optimizer state from pt format")
236
  else:
237
  raise FileNotFoundError(f"No optimizer checkpoint found at {ckpt_dir}")
238
+
239
  optimizer.load_state_dict(optimizer_state_dict)
240
  del optimizer_state_dict
241
  torch.cuda.empty_cache()
242
  gc.collect()
243
  log_memory_usage(device, latest_step, "after_loading_optimizer")
244
+
 
245
  # Load metadata
246
  logging.info("Loading metadata...")
247
  metadata = torch.load(ckpt_dir / "metadata.pt", map_location=device, weights_only=False)
 
250
  torch.cuda.empty_cache()
251
  gc.collect()
252
  log_memory_usage(device, latest_step, "after_loading_metadata")
253
+
254
  logging.info(f"Successfully loaded all checkpoint components from step {latest_step}")
255
  return global_step
256
+
257
  except RuntimeError as e:
258
  if "out of memory" in str(e):
259
  # Clear memory and provide detailed error message
260
  torch.cuda.empty_cache()
261
  gc.collect()
262
+ logging.error(f"Out of memory error while loading checkpoint: {e!s}")
263
  log_memory_usage(device, latest_step, "after_oom_error")
264
+ raise RuntimeError(
265
+ "Out of memory while loading checkpoint. Try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
266
+ ) from e
267
  raise
268
 
269
 
270
def get_latest_checkpoint_step(checkpoint_dir):
    """Return the highest numeric step-directory name under *checkpoint_dir*, or None if there is none."""
    steps = (
        int(entry.name)
        for entry in checkpoint_dir.iterdir()
        if entry.is_dir() and entry.name.isdigit() and not entry.name.startswith("tmp_")
    )
    return max(steps, default=None)
278
 
279
 
280
def log_memory_usage(device, step, phase="unknown"):
    """Log detailed memory usage information; no-op when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1e9  # reported in decimal GB
    allocated = torch.cuda.memory_allocated(device) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(device) / bytes_per_gb
    # "free" here means reserved-but-unallocated within the caching allocator.
    free = (torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)) / bytes_per_gb

    # Peak statistics from the caching allocator.
    stats = torch.cuda.memory_stats(device)
    peak_allocated = stats.get("allocated_bytes.all.peak", 0) / bytes_per_gb
    peak_reserved = stats.get("reserved_bytes.all.peak", 0) / bytes_per_gb

    # Append rank/world-size only when running under torch.distributed.
    ddp_info = ""
    if dist.is_initialized():
        ddp_info = f" | DDP: rank={dist.get_rank()}, world_size={dist.get_world_size()}"

    logging.info(
        f"Step {step} ({phase}): GPU memory - allocated: {allocated:.2f}GB, reserved: {reserved:.2f}GB, free: {free:.2f}GB, peak_allocated: {peak_allocated:.2f}GB, peak_reserved: {peak_reserved:.2f}GB{ddp_info}"
    )
303
 
304
 
305
  def train_loop(config: _config.TrainConfig):
306
+ use_ddp, local_rank, device = setup_ddp()
307
+ is_main = (not use_ddp) or (dist.get_rank() == 0)
308
+ set_seed(config.seed, local_rank)
309
+
310
+ # Initialize checkpoint directory and wandb
311
+ resuming = False
312
+ if config.resume:
313
+ # Find checkpoint directory based on experiment name
314
+ exp_checkpoint_dir = config.checkpoint_dir
315
+ if exp_checkpoint_dir.exists():
316
+ # Use validation to find the latest working checkpoint
317
+ latest_step = get_latest_checkpoint_step(exp_checkpoint_dir)
318
+ if latest_step is not None:
319
+ resuming = True
320
+ logging.info(
321
+ f"Resuming from experiment checkpoint directory: {exp_checkpoint_dir} at step {latest_step}"
322
+ )
323
+ else:
324
+ raise FileNotFoundError(f"No valid checkpoints found in {exp_checkpoint_dir} for resume")
325
+ else:
326
+ raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
327
+ elif config.overwrite and config.checkpoint_dir.exists():
328
+ shutil.rmtree(config.checkpoint_dir)
329
+ logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
330
+
331
+ # Create checkpoint directory with experiment name
332
+ if not resuming:
333
+ # For new runs, create experiment-specific checkpoint directory
334
+ exp_checkpoint_dir = config.checkpoint_dir
335
+ exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
336
+ logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
337
+ else:
338
+ # For resume, checkpoint_dir is already set to the experiment directory
339
+ logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
340
+
341
+ # Initialize wandb (only on main process)
342
+ if is_main:
343
+ init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
344
+
345
+ # Build data loader using the unified data loader
346
+ # Calculate effective batch size per GPU for DDP
347
+ # For N GPUs, each GPU should get batch_size/N samples, so total across all GPUs is batch_size
348
+ world_size = torch.distributed.get_world_size() if use_ddp else 1
349
+ effective_batch_size = config.batch_size // world_size
350
+ logging.info(
351
+ f"Using batch size per GPU: {effective_batch_size} (total batch size across {world_size} GPUs: {config.batch_size})"
352
+ )
353
+
354
+ # Pass the original batch size to data loader - it will handle DDP splitting internally
355
+ loader, _ = build_datasets(config)
356
+
357
+ # Log sample images to wandb on first batch
358
+ if is_main and config.wandb_enabled and not resuming:
359
+ # Create a separate data loader for sample batch to avoid consuming the main loader
360
+ sample_data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=False)
361
+ sample_batch = next(iter(sample_data_loader))
362
+ # Convert observation and actions to torch tensors
363
+ observation, actions = sample_batch
364
+ sample_batch = observation.to_dict()
365
+ sample_batch["actions"] = actions
366
+
367
+ # Create sample images for wandb
368
+ images_to_log = []
369
+ # Get batch size from the first image tensor
370
+ batch_size = next(iter(sample_batch["image"].values())).shape[0]
371
+ for i in range(min(5, batch_size)):
372
+ # Concatenate all camera views horizontally for this batch item
373
+ # Convert from NCHW to NHWC format for wandb
374
+ img_concatenated = torch.cat([img[i].permute(1, 2, 0) for img in sample_batch["image"].values()], axis=1)
375
+ img_concatenated = img_concatenated.cpu().numpy()
376
+ images_to_log.append(wandb.Image(img_concatenated))
377
+
378
+ wandb.log({"camera_views": images_to_log}, step=0)
379
+
380
+ # Clear sample batch from memory aggressively
381
+ del sample_batch, observation, actions, images_to_log, img_concatenated
382
+ del sample_data_loader # Also delete the sample data loader
383
+ gc.collect()
384
+ if torch.cuda.is_available():
385
+ torch.cuda.empty_cache()
386
+ logging.info("Cleared sample batch and data loader from memory")
387
+
388
+ # Build model
389
+ if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
390
+ # Convert dataclass to Pi0Config if needed
391
+ model_cfg = openpi.models.pi0_config.Pi0Config(
392
+ dtype=config.pytorch_training_precision,
393
+ action_dim=config.model.action_dim,
394
+ action_horizon=config.model.action_horizon,
395
+ max_token_len=config.model.max_token_len,
396
+ paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
397
+ action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
398
+ pi05=getattr(config.model, "pi05", False),
399
+ )
400
+ else:
401
+ model_cfg = config.model
402
+ # Update dtype to match pytorch_training_precision
403
+ object.__setattr__(model_cfg, "dtype", config.pytorch_training_precision)
404
+
405
+ model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_cfg).to(device)
406
+
407
+ if hasattr(model, "gradient_checkpointing_enable"):
408
+ enable_gradient_checkpointing = True
409
+ model.gradient_checkpointing_enable()
410
+ logging.info("Enabled gradient checkpointing for memory optimization")
411
+ else:
412
+ enable_gradient_checkpointing = False
413
+ logging.info("Gradient checkpointing is not supported for this model")
414
+
415
+ # Log initial memory usage after model creation
416
+ if is_main and torch.cuda.is_available():
417
+ log_memory_usage(device, 0, "after_model_creation")
418
+
419
+ # Enable memory optimizations for large-scale training
420
+ if world_size >= 8:
421
+ torch.backends.cudnn.benchmark = True
422
+ torch.backends.cuda.matmul.allow_tf32 = True
423
+ torch.backends.cudnn.allow_tf32 = True
424
+ # Set memory allocation configuration
425
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
426
+ logging.info("Enabled memory optimizations for 8+ GPU training")
427
+
428
+ if use_ddp:
429
+ model = torch.nn.parallel.DistributedDataParallel(
430
+ model,
431
+ device_ids=[device.index] if device.type == "cuda" else None,
432
+ find_unused_parameters=True, # Disable for memory efficiency
433
+ gradient_as_bucket_view=True, # Enable for memory efficiency
434
+ static_graph=world_size >= 8, # Enable for 8+ GPUs
435
+ )
436
+
437
+ # Load weights from weight_loader if specified (for fine-tuning)
438
+ if config.pytorch_weight_path is not None:
439
+ logging.info(f"Loading weights from: {config.pytorch_weight_path}")
440
+
441
+ model_path = os.path.join(config.pytorch_weight_path, "model.safetensors")
442
+ safetensors.torch.load_model(
443
+ (model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model), model_path
444
+ )
445
+ logging.info(f"Loaded PyTorch weights from {config.pytorch_weight_path}")
446
+
447
+ # Optimizer + learning rate schedule from config
448
+ warmup_steps = config.lr_schedule.warmup_steps
449
+ peak_lr = config.lr_schedule.peak_lr
450
+ decay_steps = config.lr_schedule.decay_steps
451
+ end_lr = config.lr_schedule.decay_lr
452
+
453
+ # Create optimizer with config parameters
454
+ optim = torch.optim.AdamW(
455
+ model.parameters(),
456
+ lr=peak_lr,
457
+ betas=(config.optimizer.b1, config.optimizer.b2),
458
+ eps=config.optimizer.eps,
459
+ weight_decay=config.optimizer.weight_decay,
460
+ )
461
+
462
+ # Load checkpoint if resuming
463
+ global_step = 0
464
+ if resuming:
465
+ global_step = load_checkpoint(model, optim, config.checkpoint_dir, device)
466
+ logging.info(f"Resumed training from step {global_step}")
467
+
468
+ def lr_schedule(step: int):
469
+ if step < warmup_steps:
470
+ # Match JAX behavior: start from peak_lr / (warmup_steps + 1)
471
+ init_lr = peak_lr / (warmup_steps + 1)
472
+ return init_lr + (peak_lr - init_lr) * step / warmup_steps
473
+ # cosine decay
474
+ progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
475
+ cos = 0.5 * (1 + np.cos(np.pi * progress))
476
+ return end_lr + (peak_lr - end_lr) * cos
477
+
478
+ model.train()
479
+ start_time = time.time()
480
+ infos = [] # Collect stats over log interval
481
+ if is_main:
482
+ logging.info(
483
+ f"Running on: {platform.node()} | world_size={torch.distributed.get_world_size() if use_ddp else 1}"
484
+ )
485
+ logging.info(
486
+ f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
487
+ )
488
+ logging.info(f"Memory optimizations: gradient_checkpointing={enable_gradient_checkpointing}")
489
+ logging.info(
490
+ f"LR schedule: warmup={warmup_steps}, peak_lr={peak_lr:.2e}, decay_steps={decay_steps}, end_lr={end_lr:.2e}"
491
+ )
492
+ logging.info(
493
+ f"Optimizer: {type(config.optimizer).__name__}, weight_decay={config.optimizer.weight_decay}, clip_norm={config.optimizer.clip_gradient_norm}"
494
+ )
495
+ logging.info("EMA is not supported for PyTorch training")
496
+ logging.info(f"Training precision: {model_cfg.dtype}")
497
+
498
+ # Training loop - iterate until we reach num_train_steps
499
+ pbar = (
500
+ tqdm.tqdm(total=config.num_train_steps, initial=global_step, desc="Training", disable=not is_main)
501
+ if is_main
502
+ else None
503
+ )
504
+
505
+ while global_step < config.num_train_steps:
506
+ # Set epoch for distributed training
507
+ if use_ddp and hasattr(loader, "set_epoch"):
508
+ loader.set_epoch(global_step // len(loader))
509
+
510
+ for observation, actions in loader:
511
+ # Check if we've reached the target number of steps
512
+ if global_step >= config.num_train_steps:
513
+ break
514
+
515
+ # The unified data loader returns (observation, actions) tuple
516
+ observation = jax.tree.map(lambda x: x.to(device), observation) # noqa: PLW2901
517
+ actions = actions.to(torch.float32) # noqa: PLW2901
518
+ actions = actions.to(device) # noqa: PLW2901
519
+
520
+ # Update LR
521
+ for pg in optim.param_groups:
522
+ pg["lr"] = lr_schedule(global_step)
523
+
524
+ # Forward pass
525
+ losses = model(observation, actions)
526
+ # Ensure losses is a tensor and handle different return types
527
+ if isinstance(losses, list | tuple):
528
+ losses = torch.stack(losses)
529
+ elif not isinstance(losses, torch.Tensor):
530
+ losses = torch.tensor(losses, device=device, dtype=torch.float32)
531
+
532
+ loss = losses.mean()
533
+
534
+ # Backward pass
535
+ loss.backward()
536
+
537
+ # Log memory usage after backward pass
538
+ if global_step < 5 and is_main and torch.cuda.is_available():
539
+ log_memory_usage(device, global_step, "after_backward")
540
+
541
+ # Gradient clipping
542
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.optimizer.clip_gradient_norm)
543
+
544
+ # Optimizer step
545
+ optim.step()
546
+ optim.zero_grad(set_to_none=True)
547
+
548
+ # Clear gradients more aggressively
549
+ for param in model.parameters():
550
+ if param.grad is not None:
551
+ param.grad.detach_()
552
+ param.grad = None
553
+
554
+ # Collect stats
555
+ if is_main:
556
+ infos.append(
557
+ {
558
+ "loss": loss.item(),
559
+ "learning_rate": optim.param_groups[0]["lr"],
560
+ "grad_norm": float(grad_norm) if isinstance(grad_norm, torch.Tensor) else grad_norm,
561
+ }
562
+ )
563
+
564
+ if is_main and (global_step % config.log_interval == 0):
565
+ elapsed = time.time() - start_time
566
+
567
+ # Average stats over log interval
568
+ avg_loss = sum(info["loss"] for info in infos) / len(infos)
569
+ avg_lr = sum(info["learning_rate"] for info in infos) / len(infos)
570
+
571
+ avg_grad_norm = None
572
+ if any("grad_norm" in info for info in infos):
573
+ vals = [
574
+ info["grad_norm"] for info in infos if "grad_norm" in info and info["grad_norm"] is not None
575
+ ]
576
+ if len(vals) > 0:
577
+ avg_grad_norm = sum(vals) / len(vals)
578
+ logging.info(
579
+ f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} grad_norm={avg_grad_norm:.2f} time={elapsed:.1f}s"
580
+ if avg_grad_norm is not None
581
+ else f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} time={elapsed:.1f}s"
582
+ )
583
+
584
+ # Log to wandb
585
+ if config.wandb_enabled and len(infos) > 0:
586
+ log_payload = {
587
+ "loss": avg_loss,
588
+ "learning_rate": avg_lr,
589
+ "step": global_step,
590
+ "time_per_step": elapsed / config.log_interval,
591
+ }
592
+ if avg_grad_norm is not None:
593
+ log_payload["grad_norm"] = avg_grad_norm
594
+ wandb.log(log_payload, step=global_step)
595
+
596
+ start_time = time.time()
597
+ infos = [] # Reset stats collection
598
+
599
+ global_step += 1
600
+ # Save checkpoint using the new mechanism
601
+ save_checkpoint(model, optim, global_step, config, is_main)
602
+
603
+ # Update progress bar
604
+ if pbar is not None:
605
+ pbar.update(1)
606
+ pbar.set_postfix(
607
+ {"loss": f"{loss.item():.4f}", "lr": f"{optim.param_groups[0]['lr']:.2e}", "step": global_step}
608
+ )
609
+
610
+ # Close progress bar
611
+ if pbar is not None:
612
+ pbar.close()
613
+
614
+ # Finish wandb run
615
+ if is_main and config.wandb_enabled:
616
+ wandb.finish()
617
+
618
+ cleanup_ddp()
619
 
620
 
621
  def main():
622
+ init_logging()
623
+ config = _config.cli()
624
+ train_loop(config)
625
 
626
 
627
  if __name__ == "__main__":
628
+ main()
src/openpi/models/model.py CHANGED
@@ -4,7 +4,7 @@ import dataclasses
4
  import enum
5
  import logging
6
  import pathlib
7
- from typing import Generic, TypeVar, Union
8
 
9
  import augmax
10
  from flax import nnx
@@ -12,7 +12,6 @@ from flax import struct
12
  from flax import traverse_util
13
  import jax
14
  import jax.numpy as jnp
15
- import logging
16
  import numpy as np
17
  import orbax.checkpoint as ocp
18
  import safetensors
@@ -25,7 +24,7 @@ import openpi.shared.array_typing as at
25
  logger = logging.getLogger("openpi")
26
 
27
  # Type variable for array types (JAX arrays, PyTorch tensors, or numpy arrays)
28
- ArrayT = TypeVar("ArrayT", bound=Union[jax.Array, torch.Tensor, np.ndarray])
29
 
30
 
31
  class ModelType(enum.Enum):
@@ -117,7 +116,7 @@ class Observation(Generic[ArrayT]):
117
  for key in data["image"]:
118
  if data["image"][key].dtype == np.uint8:
119
  data["image"][key] = data["image"][key].astype(np.float32) / 255.0 * 2.0 - 1.0
120
- elif hasattr(data["image"][key], 'dtype') and data["image"][key].dtype == torch.uint8:
121
  data["image"][key] = data["image"][key].to(torch.float32).permute(0, 3, 1, 2) / 255.0 * 2.0 - 1.0
122
  return cls(
123
  images=data["image"],
 
4
  import enum
5
  import logging
6
  import pathlib
7
+ from typing import Generic, TypeVar
8
 
9
  import augmax
10
  from flax import nnx
 
12
  from flax import traverse_util
13
  import jax
14
  import jax.numpy as jnp
 
15
  import numpy as np
16
  import orbax.checkpoint as ocp
17
  import safetensors
 
24
  logger = logging.getLogger("openpi")
25
 
26
  # Type variable for array types (JAX arrays, PyTorch tensors, or numpy arrays)
27
+ ArrayT = TypeVar("ArrayT", bound=jax.Array | torch.Tensor | np.ndarray)
28
 
29
 
30
  class ModelType(enum.Enum):
 
116
  for key in data["image"]:
117
  if data["image"][key].dtype == np.uint8:
118
  data["image"][key] = data["image"][key].astype(np.float32) / 255.0 * 2.0 - 1.0
119
+ elif hasattr(data["image"][key], "dtype") and data["image"][key].dtype == torch.uint8:
120
  data["image"][key] = data["image"][key].to(torch.float32).permute(0, 3, 1, 2) / 255.0 * 2.0 - 1.0
121
  return cls(
122
  images=data["image"],
src/openpi/models/pi0_config.py CHANGED
@@ -48,6 +48,7 @@ class Pi0Config(_model.BaseModelConfig):
48
  @override
49
  def create(self, rng: at.KeyArrayLike) -> "Pi0":
50
  from openpi.models.pi0 import Pi0
 
51
  return Pi0(self, rngs=nnx.Rngs(rng))
52
 
53
  @override
@@ -104,4 +105,4 @@ class Pi0Config(_model.BaseModelConfig):
104
  )
105
  if not filters:
106
  return nnx.Nothing
107
- return nnx.All(*filters)
 
48
  @override
49
  def create(self, rng: at.KeyArrayLike) -> "Pi0":
50
  from openpi.models.pi0 import Pi0
51
+
52
  return Pi0(self, rngs=nnx.Rngs(rng))
53
 
54
  @override
 
105
  )
106
  if not filters:
107
  return nnx.Nothing
108
+ return nnx.All(*filters)
src/openpi/models/tokenizer.py CHANGED
@@ -254,7 +254,7 @@ class FSQTokenizer:
254
  assert fsq_tokenizer_path is not None, "fsq_tokenizer_path must be provided"
255
  # Download tokenizer
256
  path = download.maybe_download(fsq_tokenizer_path)
257
- tok_path = os.path.join(path, os.listdir(path)[0]) # noqa: PTH118
258
 
259
  # Split step from path
260
  step = int(tok_path.split("/")[-1])
 
254
  assert fsq_tokenizer_path is not None, "fsq_tokenizer_path must be provided"
255
  # Download tokenizer
256
  path = download.maybe_download(fsq_tokenizer_path)
257
+ tok_path = os.path.join(path, os.listdir(path)[0])
258
 
259
  # Split step from path
260
  step = int(tok_path.split("/")[-1])
src/openpi/models_pytorch/gemma_pytorch.py CHANGED
@@ -1,19 +1,28 @@
1
- from pytest import Cache
 
 
2
  import torch
3
  from torch import nn
4
- from transformers import GemmaForCausalLM, PaliGemmaForConditionalGeneration
5
- from transformers.models.gemma import modeling_gemma
6
-
7
  from transformers.models.auto import CONFIG_MAPPING
8
- from typing import Literal
9
 
10
 
11
  class PaliGemmaWithExpertModel(nn.Module):
12
- def __init__(self, vlm_config, action_expert_config, use_adarms=[False, False], precision: Literal["bfloat16", "float32"] = "bfloat16"):
 
 
 
 
 
 
 
 
13
  super().__init__()
14
 
15
  vlm_config_hf = CONFIG_MAPPING["paligemma"]()
16
- vlm_config_hf._vocab_size = 257152
17
  vlm_config_hf.image_token_index = 257152
18
  vlm_config_hf.text_config.hidden_size = vlm_config.width
19
  vlm_config_hf.text_config.intermediate_size = vlm_config.mlp_dim
@@ -53,9 +62,9 @@ class PaliGemmaWithExpertModel(nn.Module):
53
 
54
  def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"):
55
  if precision == "bfloat16":
56
- self = self.to(dtype=torch.bfloat16)
57
  elif precision == "float32":
58
- self = self.to(dtype=torch.float32)
59
  return
60
  else:
61
  raise ValueError(f"Invalid precision: {precision}")
@@ -83,11 +92,13 @@ class PaliGemmaWithExpertModel(nn.Module):
83
  self,
84
  attention_mask: torch.Tensor | None = None,
85
  position_ids: torch.LongTensor | None = None,
86
- past_key_values: list[torch.FloatTensor] | Cache | None = None,
87
- inputs_embeds: list[torch.FloatTensor] = None,
88
  use_cache: bool | None = None,
89
- adarms_cond: list[torch.Tensor] = [None, None],
90
  ):
 
 
91
  if inputs_embeds[1] is None:
92
  prefix_output = self.paligemma.language_model.forward(
93
  inputs_embeds=inputs_embeds[0],
@@ -115,45 +126,45 @@ class PaliGemmaWithExpertModel(nn.Module):
115
  else:
116
  models = [self.paligemma.language_model, self.gemma_expert.model]
117
  num_layers = self.paligemma.config.text_config.num_hidden_layers
118
-
119
  # Check if gradient checkpointing is enabled for any of the models
120
  use_gradient_checkpointing = (
121
- hasattr(self.gemma_expert.model, 'gradient_checkpointing') and
122
- self.gemma_expert.model.gradient_checkpointing and
123
- self.training
124
- ) or (
125
- hasattr(self, 'gradient_checkpointing') and
126
- self.gradient_checkpointing and
127
- self.training
128
- )
129
-
130
  # Force enable gradient checkpointing if we're in training mode and the model supports it
131
- if self.training and hasattr(self.gemma_expert.model, 'gradient_checkpointing'):
132
  if not self.gemma_expert.model.gradient_checkpointing:
133
  print("Forcing gradient checkpointing to be enabled for Gemma expert model")
134
  self.gemma_expert.model.gradient_checkpointing = True
135
  use_gradient_checkpointing = True
136
-
137
  # Debug gradient checkpointing status
138
- if hasattr(self, '_debug_gc_printed') and not self._debug_gc_printed:
139
  print(f"Gemma expert model gradient checkpointing: {use_gradient_checkpointing}")
140
  print(f"Model training mode: {self.training}")
141
- print(f"Gemma expert model has gradient_checkpointing attr: {hasattr(self.gemma_expert.model, 'gradient_checkpointing')}")
142
- if hasattr(self.gemma_expert.model, 'gradient_checkpointing'):
143
- print(f"Gemma expert model gradient_checkpointing value: {self.gemma_expert.model.gradient_checkpointing}")
 
 
 
 
144
  self._debug_gc_printed = True
145
-
146
  # Define the complete layer computation function for gradient checkpointing
147
  def compute_layer_complete(layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond):
148
  models = [self.paligemma.language_model, self.gemma_expert.model]
149
-
150
  query_states = []
151
  key_states = []
152
  value_states = []
153
  gates = []
154
  for i, hidden_states in enumerate(inputs_embeds):
155
  layer = models[i].layers[layer_idx]
156
- hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i])
157
  gates.append(gate)
158
 
159
  input_shape = hidden_states.shape[:-1]
@@ -171,16 +182,29 @@ class PaliGemmaWithExpertModel(nn.Module):
171
  key_states = torch.cat(key_states, dim=2)
172
  value_states = torch.cat(value_states, dim=2)
173
 
174
- dummy_tensor = torch.zeros(query_states.shape[0], query_states.shape[2], query_states.shape[-1], device=query_states.device, dtype=query_states.dtype)
 
 
 
 
 
 
175
  cos, sin = self.paligemma.model.language_model.rotary_emb(dummy_tensor, position_ids)
176
- query_states, key_states = modeling_gemma.apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=1)
 
 
177
 
178
  batch_size = query_states.shape[0]
179
  scaling = self.paligemma.language_model.layers[layer_idx].self_attn.scaling
180
-
181
  # Attention computation
182
  att_output, _ = modeling_gemma.eager_attention_forward(
183
- self.paligemma.language_model.layers[layer_idx].self_attn, query_states, key_states, value_states, attention_mask, scaling
 
 
 
 
 
184
  )
185
  # Get head_dim from the current layer, not from the model
186
  head_dim = self.paligemma.language_model.layers[layer_idx].self_attn.head_dim
@@ -195,10 +219,10 @@ class PaliGemmaWithExpertModel(nn.Module):
195
 
196
  if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
197
  att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
198
- out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
199
 
200
  # first residual
201
- out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i])
202
  after_first_residual = out_emb.clone()
203
  out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
204
  # Convert to bfloat16 if the next layer (mlp) uses bfloat16
@@ -207,10 +231,10 @@ class PaliGemmaWithExpertModel(nn.Module):
207
 
208
  out_emb = layer.mlp(out_emb)
209
  # second residual
210
- out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate)
211
  outputs_embeds.append(out_emb)
212
  start_pos = end_pos
213
-
214
  return outputs_embeds
215
 
216
  # Process all layers with gradient checkpointing if enabled
@@ -218,12 +242,18 @@ class PaliGemmaWithExpertModel(nn.Module):
218
  if use_gradient_checkpointing:
219
  inputs_embeds = torch.utils.checkpoint.checkpoint(
220
  compute_layer_complete,
221
- layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond,
 
 
 
 
222
  use_reentrant=False,
223
- preserve_rng_state=False
224
  )
225
  else:
226
- inputs_embeds = compute_layer_complete(layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond)
 
 
227
 
228
  # Old code removed - now using compute_layer_complete function above
229
 
@@ -235,14 +265,11 @@ class PaliGemmaWithExpertModel(nn.Module):
235
  out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
236
  outputs_embeds.append(out_emb)
237
  return outputs_embeds
238
-
239
  # Apply gradient checkpointing to final norm if enabled
240
  if use_gradient_checkpointing:
241
  outputs_embeds = torch.utils.checkpoint.checkpoint(
242
- compute_final_norms,
243
- inputs_embeds, adarms_cond,
244
- use_reentrant=False,
245
- preserve_rng_state=False
246
  )
247
  else:
248
  outputs_embeds = compute_final_norms(inputs_embeds, adarms_cond)
@@ -251,4 +278,4 @@ class PaliGemmaWithExpertModel(nn.Module):
251
  suffix_output = outputs_embeds[1]
252
  prefix_past_key_values = None
253
 
254
- return [prefix_output, suffix_output], prefix_past_key_values
 
1
+ from typing import Literal
2
+
3
+ import pytest
4
  import torch
5
  from torch import nn
6
+ from transformers import GemmaForCausalLM
7
+ from transformers import PaliGemmaForConditionalGeneration
 
8
  from transformers.models.auto import CONFIG_MAPPING
9
+ from transformers.models.gemma import modeling_gemma
10
 
11
 
12
  class PaliGemmaWithExpertModel(nn.Module):
13
+ def __init__(
14
+ self,
15
+ vlm_config,
16
+ action_expert_config,
17
+ use_adarms=None,
18
+ precision: Literal["bfloat16", "float32"] = "bfloat16",
19
+ ):
20
+ if use_adarms is None:
21
+ use_adarms = [False, False]
22
  super().__init__()
23
 
24
  vlm_config_hf = CONFIG_MAPPING["paligemma"]()
25
+ vlm_config_hf._vocab_size = 257152 # noqa: SLF001
26
  vlm_config_hf.image_token_index = 257152
27
  vlm_config_hf.text_config.hidden_size = vlm_config.width
28
  vlm_config_hf.text_config.intermediate_size = vlm_config.mlp_dim
 
62
 
63
  def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"):
64
  if precision == "bfloat16":
65
+ self.to(dtype=torch.bfloat16)
66
  elif precision == "float32":
67
+ self.to(dtype=torch.float32)
68
  return
69
  else:
70
  raise ValueError(f"Invalid precision: {precision}")
 
92
  self,
93
  attention_mask: torch.Tensor | None = None,
94
  position_ids: torch.LongTensor | None = None,
95
+ past_key_values: list[torch.FloatTensor] | pytest.Cache | None = None,
96
+ inputs_embeds: list[torch.FloatTensor] | None = None,
97
  use_cache: bool | None = None,
98
+ adarms_cond: list[torch.Tensor] | None = None,
99
  ):
100
+ if adarms_cond is None:
101
+ adarms_cond = [None, None]
102
  if inputs_embeds[1] is None:
103
  prefix_output = self.paligemma.language_model.forward(
104
  inputs_embeds=inputs_embeds[0],
 
126
  else:
127
  models = [self.paligemma.language_model, self.gemma_expert.model]
128
  num_layers = self.paligemma.config.text_config.num_hidden_layers
129
+
130
  # Check if gradient checkpointing is enabled for any of the models
131
  use_gradient_checkpointing = (
132
+ hasattr(self.gemma_expert.model, "gradient_checkpointing")
133
+ and self.gemma_expert.model.gradient_checkpointing
134
+ and self.training
135
+ ) or (hasattr(self, "gradient_checkpointing") and self.gradient_checkpointing and self.training)
136
+
 
 
 
 
137
  # Force enable gradient checkpointing if we're in training mode and the model supports it
138
+ if self.training and hasattr(self.gemma_expert.model, "gradient_checkpointing"):
139
  if not self.gemma_expert.model.gradient_checkpointing:
140
  print("Forcing gradient checkpointing to be enabled for Gemma expert model")
141
  self.gemma_expert.model.gradient_checkpointing = True
142
  use_gradient_checkpointing = True
143
+
144
  # Debug gradient checkpointing status
145
+ if hasattr(self, "_debug_gc_printed") and not self._debug_gc_printed:
146
  print(f"Gemma expert model gradient checkpointing: {use_gradient_checkpointing}")
147
  print(f"Model training mode: {self.training}")
148
+ print(
149
+ f"Gemma expert model has gradient_checkpointing attr: {hasattr(self.gemma_expert.model, 'gradient_checkpointing')}"
150
+ )
151
+ if hasattr(self.gemma_expert.model, "gradient_checkpointing"):
152
+ print(
153
+ f"Gemma expert model gradient_checkpointing value: {self.gemma_expert.model.gradient_checkpointing}"
154
+ )
155
  self._debug_gc_printed = True
156
+
157
  # Define the complete layer computation function for gradient checkpointing
158
  def compute_layer_complete(layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond):
159
  models = [self.paligemma.language_model, self.gemma_expert.model]
160
+
161
  query_states = []
162
  key_states = []
163
  value_states = []
164
  gates = []
165
  for i, hidden_states in enumerate(inputs_embeds):
166
  layer = models[i].layers[layer_idx]
167
+ hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i]) # noqa: PLW2901
168
  gates.append(gate)
169
 
170
  input_shape = hidden_states.shape[:-1]
 
182
  key_states = torch.cat(key_states, dim=2)
183
  value_states = torch.cat(value_states, dim=2)
184
 
185
+ dummy_tensor = torch.zeros(
186
+ query_states.shape[0],
187
+ query_states.shape[2],
188
+ query_states.shape[-1],
189
+ device=query_states.device,
190
+ dtype=query_states.dtype,
191
+ )
192
  cos, sin = self.paligemma.model.language_model.rotary_emb(dummy_tensor, position_ids)
193
+ query_states, key_states = modeling_gemma.apply_rotary_pos_emb(
194
+ query_states, key_states, cos, sin, unsqueeze_dim=1
195
+ )
196
 
197
  batch_size = query_states.shape[0]
198
  scaling = self.paligemma.language_model.layers[layer_idx].self_attn.scaling
199
+
200
  # Attention computation
201
  att_output, _ = modeling_gemma.eager_attention_forward(
202
+ self.paligemma.language_model.layers[layer_idx].self_attn,
203
+ query_states,
204
+ key_states,
205
+ value_states,
206
+ attention_mask,
207
+ scaling,
208
  )
209
  # Get head_dim from the current layer, not from the model
210
  head_dim = self.paligemma.language_model.layers[layer_idx].self_attn.head_dim
 
219
 
220
  if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
221
  att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
222
+ out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
223
 
224
  # first residual
225
+ out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i]) # noqa: SLF001
226
  after_first_residual = out_emb.clone()
227
  out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
228
  # Convert to bfloat16 if the next layer (mlp) uses bfloat16
 
231
 
232
  out_emb = layer.mlp(out_emb)
233
  # second residual
234
+ out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate) # noqa: SLF001
235
  outputs_embeds.append(out_emb)
236
  start_pos = end_pos
237
+
238
  return outputs_embeds
239
 
240
  # Process all layers with gradient checkpointing if enabled
 
242
  if use_gradient_checkpointing:
243
  inputs_embeds = torch.utils.checkpoint.checkpoint(
244
  compute_layer_complete,
245
+ layer_idx,
246
+ inputs_embeds,
247
+ attention_mask,
248
+ position_ids,
249
+ adarms_cond,
250
  use_reentrant=False,
251
+ preserve_rng_state=False,
252
  )
253
  else:
254
+ inputs_embeds = compute_layer_complete(
255
+ layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond
256
+ )
257
 
258
  # Old code removed - now using compute_layer_complete function above
259
 
 
265
  out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
266
  outputs_embeds.append(out_emb)
267
  return outputs_embeds
268
+
269
  # Apply gradient checkpointing to final norm if enabled
270
  if use_gradient_checkpointing:
271
  outputs_embeds = torch.utils.checkpoint.checkpoint(
272
+ compute_final_norms, inputs_embeds, adarms_cond, use_reentrant=False, preserve_rng_state=False
 
 
 
273
  )
274
  else:
275
  outputs_embeds = compute_final_norms(inputs_embeds, adarms_cond)
 
278
  suffix_output = outputs_embeds[1]
279
  prefix_past_key_values = None
280
 
281
+ return [prefix_output, suffix_output], prefix_past_key_values
src/openpi/models_pytorch/pi0_pytorch.py CHANGED
@@ -1,10 +1,10 @@
1
- import math
2
  import logging
 
3
 
4
  import torch
5
  from torch import Tensor
6
  from torch import nn
7
- import torch.nn.functional as F
8
 
9
  import openpi.models.gemma as _gemma
10
  from openpi.models_pytorch.gemma_pytorch import PaliGemmaWithExpertModel
@@ -17,7 +17,7 @@ def get_safe_dtype(target_dtype, device_type):
17
  # CPU doesn't support bfloat16, use float32 instead
18
  if target_dtype == torch.bfloat16:
19
  return torch.float32
20
- elif target_dtype == torch.float64:
21
  return torch.float64
22
  return target_dtype
23
 
@@ -39,16 +39,14 @@ def create_sinusoidal_pos_embedding(
39
  # Compute the outer product
40
  scaling_factor = 1.0 / period * 2 * math.pi
41
  sin_input = scaling_factor[None, :] * time[:, None]
42
- pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
43
- return pos_emb
44
 
45
 
46
  def sample_beta(alpha, beta, bsize, device):
47
  alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
48
  beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
49
  dist = torch.distributions.Beta(alpha_t, beta_t)
50
- samples = dist.sample((bsize,))
51
- return samples
52
 
53
 
54
  def make_att_2d_masks(pad_masks, att_masks):
@@ -80,8 +78,7 @@ def make_att_2d_masks(pad_masks, att_masks):
80
  cumsum = torch.cumsum(att_masks, dim=1)
81
  att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
82
  pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
83
- att_2d_masks = att_2d_masks & pad_2d_masks
84
- return att_2d_masks
85
 
86
 
87
  class PI0Pytorch(nn.Module):
@@ -93,7 +90,12 @@ class PI0Pytorch(nn.Module):
93
  paligemma_config = _gemma.get_config(config.paligemma_variant)
94
  action_expert_config = _gemma.get_config(config.action_expert_variant)
95
 
96
- self.paligemma_with_expert = PaliGemmaWithExpertModel(paligemma_config, action_expert_config, use_adarms=[False, True] if self.pi05 else [False, False], precision=config.dtype)
 
 
 
 
 
97
 
98
  self.action_in_proj = nn.Linear(32, action_expert_config.width)
99
  self.action_out_proj = nn.Linear(action_expert_config.width, 32)
@@ -106,17 +108,20 @@ class PI0Pytorch(nn.Module):
106
  self.action_time_mlp_in = nn.Linear(2 * action_expert_config.width, action_expert_config.width)
107
  self.action_time_mlp_out = nn.Linear(action_expert_config.width, action_expert_config.width)
108
 
109
- torch.set_float32_matmul_precision('high')
110
  self.sample_actions = torch.compile(self.sample_actions, mode="max-autotune")
111
-
112
  # Initialize gradient checkpointing flag
113
  self.gradient_checkpointing_enabled = False
 
 
114
  try:
115
  from transformers.models.siglip import check
 
116
  if not check.check_whether_transformers_replace_is_installed_correctly():
117
- raise ValueError("TransformersReplace is not installed correctly. Please install it with `uv pip install transformers==4.53.2` and `cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/`.")
118
  except ImportError:
119
- raise ValueError("TransformersReplace is not installed correctly. Please install it with `uv pip install transformers==4.53.2` and `cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/`.")
120
 
121
  def gradient_checkpointing_enable(self):
122
  """Enable gradient checkpointing for memory optimization."""
@@ -124,7 +129,7 @@ class PI0Pytorch(nn.Module):
124
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
125
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
126
  self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
127
-
128
  logging.info("Enabled gradient checkpointing for PI0Pytorch model")
129
 
130
  def gradient_checkpointing_disable(self):
@@ -133,7 +138,7 @@ class PI0Pytorch(nn.Module):
133
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
134
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
135
  self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
136
-
137
  logging.info("Disabled gradient checkpointing for PI0Pytorch model")
138
 
139
  def is_gradient_checkpointing_enabled(self):
@@ -146,15 +151,14 @@ class PI0Pytorch(nn.Module):
146
  return torch.utils.checkpoint.checkpoint(
147
  func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
148
  )
149
- else:
150
- return func(*args, **kwargs)
151
 
152
  def _prepare_attention_masks_4d(self, att_2d_masks):
153
  """Helper method to prepare 4D attention masks for transformer."""
154
  att_2d_masks_4d = att_2d_masks[:, None, :, :]
155
  return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
156
 
157
- def _preprocess_observation(self, observation, train=True):
158
  """Helper method to preprocess observation."""
159
  observation = _preprocessing.preprocess_observation_pytorch(observation, train=train)
160
  return (
@@ -162,18 +166,17 @@ class PI0Pytorch(nn.Module):
162
  list(observation.image_masks.values()),
163
  observation.tokenized_prompt,
164
  observation.tokenized_prompt_mask,
165
- observation.state
166
  )
167
 
168
  def sample_noise(self, shape, device):
169
- noise = torch.normal(
170
  mean=0.0,
171
  std=1.0,
172
  size=shape,
173
  dtype=torch.float32,
174
  device=device,
175
  )
176
- return noise
177
 
178
  def sample_time(self, bsize, device):
179
  time_beta = sample_beta(1.5, 1.0, bsize, device)
@@ -189,19 +192,19 @@ class PI0Pytorch(nn.Module):
189
  embs = []
190
  pad_masks = []
191
  att_masks = []
192
-
193
  # Process images
194
- for img, img_mask in zip(images, img_masks):
 
195
  def image_embed_func(img):
196
  return self.paligemma_with_expert.embed_image(img)
197
-
198
  img_emb = self._apply_checkpoint(image_embed_func, img)
199
 
200
  bsize, num_img_embs = img_emb.shape[:2]
201
- img_mask = img_mask[:, None].expand(bsize, num_img_embs)
202
 
203
  embs.append(img_emb)
204
- pad_masks.append(img_mask)
205
 
206
  # Create attention masks so that image tokens attend to each other
207
  att_masks += [0] * num_img_embs
@@ -211,7 +214,7 @@ class PI0Pytorch(nn.Module):
211
  lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
212
  lang_emb_dim = lang_emb.shape[-1]
213
  return lang_emb * math.sqrt(lang_emb_dim)
214
-
215
  lang_emb = self._apply_checkpoint(lang_embed_func, lang_tokens)
216
 
217
  embs.append(lang_emb)
@@ -239,16 +242,16 @@ class PI0Pytorch(nn.Module):
239
 
240
  if not self.pi05:
241
  if self.state_proj.weight.dtype == torch.float32:
242
- state = state.to(torch.float32)
 
243
  # Embed state
244
  def state_proj_func(state):
245
  return self.state_proj(state)
246
-
247
  state_emb = self._apply_checkpoint(state_proj_func, state)
248
-
249
  embs.append(state_emb[:, None, :])
250
  bsize = state_emb.shape[0]
251
- dtype = state_emb.dtype
252
  device = state_emb.device
253
 
254
  state_mask = torch.ones(bsize, 1, dtype=torch.bool, device=device)
@@ -266,20 +269,19 @@ class PI0Pytorch(nn.Module):
266
  # Fuse timestep + action information using an MLP
267
  def action_proj_func(noisy_actions):
268
  return self.action_in_proj(noisy_actions)
269
-
270
  action_emb = self._apply_checkpoint(action_proj_func, noisy_actions)
271
 
272
  if not self.pi05:
273
  time_emb = time_emb[:, None, :].expand_as(action_emb)
274
  action_time_emb = torch.cat([action_emb, time_emb], dim=2)
275
-
276
  # Apply MLP layers
277
  def mlp_func(action_time_emb):
278
  x = self.action_time_mlp_in(action_time_emb)
279
  x = F.silu(x) # swish == silu
280
- x = self.action_time_mlp_out(x)
281
- return x
282
-
283
  action_time_emb = self._apply_checkpoint(mlp_func, action_time_emb)
284
  adarms_cond = None
285
  else:
@@ -288,9 +290,8 @@ class PI0Pytorch(nn.Module):
288
  x = self.time_mlp_in(time_emb)
289
  x = F.silu(x) # swish == silu
290
  x = self.time_mlp_out(x)
291
- x = F.silu(x)
292
- return x
293
-
294
  time_emb = self._apply_checkpoint(time_mlp_func, time_emb)
295
  action_time_emb = action_emb
296
  adarms_cond = time_emb
@@ -328,7 +329,10 @@ class PI0Pytorch(nn.Module):
328
 
329
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
330
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)
331
- if self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype == torch.bfloat16:
 
 
 
332
  suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
333
  prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
334
 
@@ -349,25 +353,24 @@ class PI0Pytorch(nn.Module):
349
  past_key_values=None,
350
  inputs_embeds=[prefix_embs, suffix_embs],
351
  use_cache=False,
352
- adarms_cond=[None, adarms_cond]
353
  )
354
  return suffix_out
355
-
356
  suffix_out = self._apply_checkpoint(
357
  forward_func, prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond
358
  )
359
-
360
  suffix_out = suffix_out[:, -self.config.action_horizon :]
361
  suffix_out = suffix_out.to(dtype=torch.float32)
362
 
363
  # Apply gradient checkpointing to final action projection if enabled
364
  def action_out_proj_func(suffix_out):
365
  return self.action_out_proj(suffix_out)
366
-
367
  v_t = self._apply_checkpoint(action_out_proj_func, suffix_out)
368
 
369
- losses = F.mse_loss(u_t, v_t, reduction="none")
370
- return losses
371
 
372
  @torch.no_grad()
373
  def sample_actions(self, device, observation, noise=None, num_steps=10) -> Tensor:
@@ -376,7 +379,7 @@ class PI0Pytorch(nn.Module):
376
  if noise is None:
377
  actions_shape = (bsize, self.config.action_horizon, self.config.action_dim)
378
  noise = self.sample_noise(actions_shape, device)
379
-
380
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=False)
381
 
382
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
@@ -385,7 +388,7 @@ class PI0Pytorch(nn.Module):
385
 
386
  # Compute image and language key value cache
387
  prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
388
- self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager"
389
 
390
  _, past_key_values = self.paligemma_with_expert.forward(
391
  attention_mask=prefix_att_2d_masks_4d,
@@ -441,7 +444,7 @@ class PI0Pytorch(nn.Module):
441
 
442
  # Prepare attention masks
443
  full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
444
- self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager"
445
 
446
  outputs_embeds, _ = self.paligemma_with_expert.forward(
447
  attention_mask=full_att_2d_masks_4d,
@@ -449,12 +452,10 @@ class PI0Pytorch(nn.Module):
449
  past_key_values=past_key_values,
450
  inputs_embeds=[None, suffix_embs],
451
  use_cache=False,
452
- adarms_cond=[None, adarms_cond]
453
  )
454
 
455
  suffix_out = outputs_embeds[1]
456
  suffix_out = suffix_out[:, -self.config.action_horizon :]
457
  suffix_out = suffix_out.to(dtype=torch.float32)
458
- v_t = self.action_out_proj(suffix_out)
459
-
460
- return v_t
 
 
1
  import logging
2
+ import math
3
 
4
  import torch
5
  from torch import Tensor
6
  from torch import nn
7
+ import torch.nn.functional as F # noqa: N812
8
 
9
  import openpi.models.gemma as _gemma
10
  from openpi.models_pytorch.gemma_pytorch import PaliGemmaWithExpertModel
 
17
  # CPU doesn't support bfloat16, use float32 instead
18
  if target_dtype == torch.bfloat16:
19
  return torch.float32
20
+ if target_dtype == torch.float64:
21
  return torch.float64
22
  return target_dtype
23
 
 
39
  # Compute the outer product
40
  scaling_factor = 1.0 / period * 2 * math.pi
41
  sin_input = scaling_factor[None, :] * time[:, None]
42
+ return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
 
43
 
44
 
45
  def sample_beta(alpha, beta, bsize, device):
46
  alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
47
  beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
48
  dist = torch.distributions.Beta(alpha_t, beta_t)
49
+ return dist.sample((bsize,))
 
50
 
51
 
52
  def make_att_2d_masks(pad_masks, att_masks):
 
78
  cumsum = torch.cumsum(att_masks, dim=1)
79
  att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
80
  pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
81
+ return att_2d_masks & pad_2d_masks
 
82
 
83
 
84
  class PI0Pytorch(nn.Module):
 
90
  paligemma_config = _gemma.get_config(config.paligemma_variant)
91
  action_expert_config = _gemma.get_config(config.action_expert_variant)
92
 
93
+ self.paligemma_with_expert = PaliGemmaWithExpertModel(
94
+ paligemma_config,
95
+ action_expert_config,
96
+ use_adarms=[False, True] if self.pi05 else [False, False],
97
+ precision=config.dtype,
98
+ )
99
 
100
  self.action_in_proj = nn.Linear(32, action_expert_config.width)
101
  self.action_out_proj = nn.Linear(action_expert_config.width, 32)
 
108
  self.action_time_mlp_in = nn.Linear(2 * action_expert_config.width, action_expert_config.width)
109
  self.action_time_mlp_out = nn.Linear(action_expert_config.width, action_expert_config.width)
110
 
111
+ torch.set_float32_matmul_precision("high")
112
  self.sample_actions = torch.compile(self.sample_actions, mode="max-autotune")
113
+
114
  # Initialize gradient checkpointing flag
115
  self.gradient_checkpointing_enabled = False
116
+
117
+ msg = "transformers_replace is not installed correctly. Please install it with `uv pip install transformers==4.53.2` and `cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/`."
118
  try:
119
  from transformers.models.siglip import check
120
+
121
  if not check.check_whether_transformers_replace_is_installed_correctly():
122
+ raise ValueError(msg)
123
  except ImportError:
124
+ raise ValueError(msg) from None
125
 
126
  def gradient_checkpointing_enable(self):
127
  """Enable gradient checkpointing for memory optimization."""
 
129
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
130
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
131
  self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
132
+
133
  logging.info("Enabled gradient checkpointing for PI0Pytorch model")
134
 
135
  def gradient_checkpointing_disable(self):
 
138
  self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
139
  self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
140
  self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
141
+
142
  logging.info("Disabled gradient checkpointing for PI0Pytorch model")
143
 
144
  def is_gradient_checkpointing_enabled(self):
 
151
  return torch.utils.checkpoint.checkpoint(
152
  func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
153
  )
154
+ return func(*args, **kwargs)
 
155
 
156
  def _prepare_attention_masks_4d(self, att_2d_masks):
157
  """Helper method to prepare 4D attention masks for transformer."""
158
  att_2d_masks_4d = att_2d_masks[:, None, :, :]
159
  return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
160
 
161
+ def _preprocess_observation(self, observation, *, train=True):
162
  """Helper method to preprocess observation."""
163
  observation = _preprocessing.preprocess_observation_pytorch(observation, train=train)
164
  return (
 
166
  list(observation.image_masks.values()),
167
  observation.tokenized_prompt,
168
  observation.tokenized_prompt_mask,
169
+ observation.state,
170
  )
171
 
172
  def sample_noise(self, shape, device):
173
+ return torch.normal(
174
  mean=0.0,
175
  std=1.0,
176
  size=shape,
177
  dtype=torch.float32,
178
  device=device,
179
  )
 
180
 
181
  def sample_time(self, bsize, device):
182
  time_beta = sample_beta(1.5, 1.0, bsize, device)
 
192
  embs = []
193
  pad_masks = []
194
  att_masks = []
195
+
196
  # Process images
197
+ for img, img_mask in zip(images, img_masks, strict=True):
198
+
199
  def image_embed_func(img):
200
  return self.paligemma_with_expert.embed_image(img)
201
+
202
  img_emb = self._apply_checkpoint(image_embed_func, img)
203
 
204
  bsize, num_img_embs = img_emb.shape[:2]
 
205
 
206
  embs.append(img_emb)
207
+ pad_masks.append(img_mask[:, None].expand(bsize, num_img_embs))
208
 
209
  # Create attention masks so that image tokens attend to each other
210
  att_masks += [0] * num_img_embs
 
214
  lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
215
  lang_emb_dim = lang_emb.shape[-1]
216
  return lang_emb * math.sqrt(lang_emb_dim)
217
+
218
  lang_emb = self._apply_checkpoint(lang_embed_func, lang_tokens)
219
 
220
  embs.append(lang_emb)
 
242
 
243
  if not self.pi05:
244
  if self.state_proj.weight.dtype == torch.float32:
245
+ state = state.to(torch.float32)
246
+
247
  # Embed state
248
  def state_proj_func(state):
249
  return self.state_proj(state)
250
+
251
  state_emb = self._apply_checkpoint(state_proj_func, state)
252
+
253
  embs.append(state_emb[:, None, :])
254
  bsize = state_emb.shape[0]
 
255
  device = state_emb.device
256
 
257
  state_mask = torch.ones(bsize, 1, dtype=torch.bool, device=device)
 
269
  # Fuse timestep + action information using an MLP
270
  def action_proj_func(noisy_actions):
271
  return self.action_in_proj(noisy_actions)
272
+
273
  action_emb = self._apply_checkpoint(action_proj_func, noisy_actions)
274
 
275
  if not self.pi05:
276
  time_emb = time_emb[:, None, :].expand_as(action_emb)
277
  action_time_emb = torch.cat([action_emb, time_emb], dim=2)
278
+
279
  # Apply MLP layers
280
  def mlp_func(action_time_emb):
281
  x = self.action_time_mlp_in(action_time_emb)
282
  x = F.silu(x) # swish == silu
283
+ return self.action_time_mlp_out(x)
284
+
 
285
  action_time_emb = self._apply_checkpoint(mlp_func, action_time_emb)
286
  adarms_cond = None
287
  else:
 
290
  x = self.time_mlp_in(time_emb)
291
  x = F.silu(x) # swish == silu
292
  x = self.time_mlp_out(x)
293
+ return F.silu(x)
294
+
 
295
  time_emb = self._apply_checkpoint(time_mlp_func, time_emb)
296
  action_time_emb = action_emb
297
  adarms_cond = time_emb
 
329
 
330
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
331
  suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)
332
+ if (
333
+ self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
334
+ == torch.bfloat16
335
+ ):
336
  suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
337
  prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
338
 
 
353
  past_key_values=None,
354
  inputs_embeds=[prefix_embs, suffix_embs],
355
  use_cache=False,
356
+ adarms_cond=[None, adarms_cond],
357
  )
358
  return suffix_out
359
+
360
  suffix_out = self._apply_checkpoint(
361
  forward_func, prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond
362
  )
363
+
364
  suffix_out = suffix_out[:, -self.config.action_horizon :]
365
  suffix_out = suffix_out.to(dtype=torch.float32)
366
 
367
  # Apply gradient checkpointing to final action projection if enabled
368
  def action_out_proj_func(suffix_out):
369
  return self.action_out_proj(suffix_out)
370
+
371
  v_t = self._apply_checkpoint(action_out_proj_func, suffix_out)
372
 
373
+ return F.mse_loss(u_t, v_t, reduction="none")
 
374
 
375
  @torch.no_grad()
376
  def sample_actions(self, device, observation, noise=None, num_steps=10) -> Tensor:
 
379
  if noise is None:
380
  actions_shape = (bsize, self.config.action_horizon, self.config.action_dim)
381
  noise = self.sample_noise(actions_shape, device)
382
+
383
  images, img_masks, lang_tokens, lang_masks, state = self._preprocess_observation(observation, train=False)
384
 
385
  prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, lang_tokens, lang_masks)
 
388
 
389
  # Compute image and language key value cache
390
  prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
391
+ self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager" # noqa: SLF001
392
 
393
  _, past_key_values = self.paligemma_with_expert.forward(
394
  attention_mask=prefix_att_2d_masks_4d,
 
444
 
445
  # Prepare attention masks
446
  full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
447
+ self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager" # noqa: SLF001
448
 
449
  outputs_embeds, _ = self.paligemma_with_expert.forward(
450
  attention_mask=full_att_2d_masks_4d,
 
452
  past_key_values=past_key_values,
453
  inputs_embeds=[None, suffix_embs],
454
  use_cache=False,
455
+ adarms_cond=[None, adarms_cond],
456
  )
457
 
458
  suffix_out = outputs_embeds[1]
459
  suffix_out = suffix_out[:, -self.config.action_horizon :]
460
  suffix_out = suffix_out.to(dtype=torch.float32)
461
+ return self.action_out_proj(suffix_out)
 
 
src/openpi/models_pytorch/preprocessing_pytorch.py CHANGED
@@ -1,5 +1,6 @@
1
- import logging
2
  from collections.abc import Sequence
 
 
3
  import torch
4
 
5
  from openpi.shared import image_tools
@@ -15,6 +16,7 @@ IMAGE_KEYS = (
15
 
16
  IMAGE_RESOLUTION = (224, 224)
17
 
 
18
  def preprocess_observation_pytorch(
19
  observation,
20
  *,
@@ -23,7 +25,7 @@ def preprocess_observation_pytorch(
23
  image_resolution: tuple[int, int] = IMAGE_RESOLUTION,
24
  ):
25
  """Torch.compile-compatible version of preprocess_observation_pytorch with simplified type annotations.
26
-
27
  This function avoids complex type annotations that can cause torch.compile issues.
28
  """
29
  if not set(image_keys).issubset(observation.images):
@@ -67,14 +69,14 @@ def preprocess_observation_pytorch(
67
  # Use tensor operations instead of .item() for torch.compile compatibility
68
  start_h = torch.randint(0, max_h + 1, (1,), device=image.device)
69
  start_w = torch.randint(0, max_w + 1, (1,), device=image.device)
70
- image = image[:, start_h:start_h + crop_height, start_w:start_w + crop_width, :]
71
 
72
  # Resize back to original size
73
  image = torch.nn.functional.interpolate(
74
  image.permute(0, 3, 1, 2), # [b, h, w, c] -> [b, c, h, w]
75
  size=(height, width),
76
- mode='bilinear',
77
- align_corners=False
78
  ).permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c]
79
 
80
  # Random rotation (small angles)
@@ -93,7 +95,7 @@ def preprocess_observation_pytorch(
93
  grid_y = torch.linspace(-1, 1, height, device=image.device)
94
 
95
  # Create meshgrid
96
- grid_y, grid_x = torch.meshgrid(grid_y, grid_x, indexing='ij')
97
 
98
  # Expand to batch dimension
99
  grid_x = grid_x.unsqueeze(0).expand(image.shape[0], -1, -1)
@@ -109,9 +111,9 @@ def preprocess_observation_pytorch(
109
  image = torch.nn.functional.grid_sample(
110
  image.permute(0, 3, 1, 2), # [b, h, w, c] -> [b, c, h, w]
111
  grid,
112
- mode='bilinear',
113
- padding_mode='zeros',
114
- align_corners=False
115
  ).permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c]
116
 
117
  # Color augmentations for all cameras
@@ -159,7 +161,7 @@ def preprocess_observation_pytorch(
159
  def __init__(self, **kwargs):
160
  for key, value in kwargs.items():
161
  setattr(self, key, value)
162
-
163
  return SimpleProcessedObservation(
164
  images=out_images,
165
  image_masks=out_masks,
 
 
1
  from collections.abc import Sequence
2
+ import logging
3
+
4
  import torch
5
 
6
  from openpi.shared import image_tools
 
16
 
17
  IMAGE_RESOLUTION = (224, 224)
18
 
19
+
20
  def preprocess_observation_pytorch(
21
  observation,
22
  *,
 
25
  image_resolution: tuple[int, int] = IMAGE_RESOLUTION,
26
  ):
27
  """Torch.compile-compatible version of preprocess_observation_pytorch with simplified type annotations.
28
+
29
  This function avoids complex type annotations that can cause torch.compile issues.
30
  """
31
  if not set(image_keys).issubset(observation.images):
 
69
  # Use tensor operations instead of .item() for torch.compile compatibility
70
  start_h = torch.randint(0, max_h + 1, (1,), device=image.device)
71
  start_w = torch.randint(0, max_w + 1, (1,), device=image.device)
72
+ image = image[:, start_h : start_h + crop_height, start_w : start_w + crop_width, :]
73
 
74
  # Resize back to original size
75
  image = torch.nn.functional.interpolate(
76
  image.permute(0, 3, 1, 2), # [b, h, w, c] -> [b, c, h, w]
77
  size=(height, width),
78
+ mode="bilinear",
79
+ align_corners=False,
80
  ).permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c]
81
 
82
  # Random rotation (small angles)
 
95
  grid_y = torch.linspace(-1, 1, height, device=image.device)
96
 
97
  # Create meshgrid
98
+ grid_y, grid_x = torch.meshgrid(grid_y, grid_x, indexing="ij")
99
 
100
  # Expand to batch dimension
101
  grid_x = grid_x.unsqueeze(0).expand(image.shape[0], -1, -1)
 
111
  image = torch.nn.functional.grid_sample(
112
  image.permute(0, 3, 1, 2), # [b, h, w, c] -> [b, c, h, w]
113
  grid,
114
+ mode="bilinear",
115
+ padding_mode="zeros",
116
+ align_corners=False,
117
  ).permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c]
118
 
119
  # Color augmentations for all cameras
 
161
  def __init__(self, **kwargs):
162
  for key, value in kwargs.items():
163
  setattr(self, key, value)
164
+
165
  return SimpleProcessedObservation(
166
  images=out_images,
167
  image_masks=out_masks,
src/openpi/policies/policy.py CHANGED
@@ -35,7 +35,7 @@ class Policy(BasePolicy):
35
  is_pytorch: bool = False,
36
  ):
37
  """Initialize the Policy.
38
-
39
  Args:
40
  model: The model to use for action sampling.
41
  rng: Random number generator key for JAX models. Ignored for PyTorch models.
@@ -43,7 +43,7 @@ class Policy(BasePolicy):
43
  output_transforms: Output data transformations to apply after inference.
44
  sample_kwargs: Additional keyword arguments to pass to model.sample_actions.
45
  metadata: Additional metadata to store with the policy.
46
- pytorch_device: Device to use for PyTorch models (e.g., "cpu", "cuda:0").
47
  Only relevant when is_pytorch=True.
48
  is_pytorch: Whether the model is a PyTorch model. If False, assumes JAX model.
49
  """
@@ -81,10 +81,7 @@ class Policy(BasePolicy):
81
  # Prepare kwargs for sample_actions
82
  sample_kwargs = dict(self._sample_kwargs)
83
  if noise is not None:
84
- if self._is_pytorch_model:
85
- noise = torch.from_numpy(noise).to(self._pytorch_device)
86
- else:
87
- noise = jnp.asarray(noise)
88
 
89
  if noise.ndim == 2: # If noise is (action_horizon, action_dim), add batch dimension
90
  noise = noise[None, ...] # Make it (1, action_horizon, action_dim)
 
35
  is_pytorch: bool = False,
36
  ):
37
  """Initialize the Policy.
38
+
39
  Args:
40
  model: The model to use for action sampling.
41
  rng: Random number generator key for JAX models. Ignored for PyTorch models.
 
43
  output_transforms: Output data transformations to apply after inference.
44
  sample_kwargs: Additional keyword arguments to pass to model.sample_actions.
45
  metadata: Additional metadata to store with the policy.
46
+ pytorch_device: Device to use for PyTorch models (e.g., "cpu", "cuda:0").
47
  Only relevant when is_pytorch=True.
48
  is_pytorch: Whether the model is a PyTorch model. If False, assumes JAX model.
49
  """
 
81
  # Prepare kwargs for sample_actions
82
  sample_kwargs = dict(self._sample_kwargs)
83
  if noise is not None:
84
+ noise = torch.from_numpy(noise).to(self._pytorch_device) if self._is_pytorch_model else jnp.asarray(noise)
 
 
 
85
 
86
  if noise.ndim == 2: # If noise is (action_horizon, action_dim), add batch dimension
87
  noise = noise[None, ...] # Make it (1, action_horizon, action_dim)
src/openpi/policies/policy_config.py CHANGED
@@ -1,6 +1,6 @@
1
  import logging
2
- import pathlib
3
  import os
 
4
  from typing import Any
5
 
6
  import jax.numpy as jnp
@@ -35,9 +35,9 @@ def create_trained_policy(
35
  data if it doesn't already exist.
36
  norm_stats: The norm stats to use for the policy. If not provided, the norm stats will be loaded
37
  from the checkpoint directory.
38
- pytorch_device: Device to use for PyTorch models (e.g., "cpu", "cuda", "cuda:0").
39
  If None and is_pytorch=True, will use "cuda" if available, otherwise "cpu".
40
-
41
  Note:
42
  The function automatically detects whether the model is PyTorch-based by checking for the
43
  presence of "model.safensors" in the checkpoint directory.
@@ -52,7 +52,7 @@ def create_trained_policy(
52
  logging.info("Loading model...")
53
  if is_pytorch:
54
  model = train_config.model.load_pytorch(train_config, weight_path)
55
- model.paligemma_with_expert.to_bfloat16_for_selected_params('bfloat16')
56
  else:
57
  model = train_config.model.load(_model.restore_params(checkpoint_dir / "params", dtype=jnp.bfloat16))
58
  data_config = train_config.data.create(train_config.assets_dirs, train_config.model)
@@ -67,13 +67,11 @@ def create_trained_policy(
67
  if is_pytorch and pytorch_device is None:
68
  try:
69
  import torch
70
- if torch.cuda.is_available():
71
- pytorch_device = "cuda"
72
- else:
73
- pytorch_device = "cpu"
74
  except ImportError:
75
  pytorch_device = "cpu"
76
-
77
  return _policy.Policy(
78
  model,
79
  transforms=[
 
1
  import logging
 
2
  import os
3
+ import pathlib
4
  from typing import Any
5
 
6
  import jax.numpy as jnp
 
35
  data if it doesn't already exist.
36
  norm_stats: The norm stats to use for the policy. If not provided, the norm stats will be loaded
37
  from the checkpoint directory.
38
+ pytorch_device: Device to use for PyTorch models (e.g., "cpu", "cuda", "cuda:0").
39
  If None and is_pytorch=True, will use "cuda" if available, otherwise "cpu".
40
+
41
  Note:
42
  The function automatically detects whether the model is PyTorch-based by checking for the
43
  presence of "model.safensors" in the checkpoint directory.
 
52
  logging.info("Loading model...")
53
  if is_pytorch:
54
  model = train_config.model.load_pytorch(train_config, weight_path)
55
+ model.paligemma_with_expert.to_bfloat16_for_selected_params("bfloat16")
56
  else:
57
  model = train_config.model.load(_model.restore_params(checkpoint_dir / "params", dtype=jnp.bfloat16))
58
  data_config = train_config.data.create(train_config.assets_dirs, train_config.model)
 
67
  if is_pytorch and pytorch_device is None:
68
  try:
69
  import torch
70
+
71
+ pytorch_device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
72
  except ImportError:
73
  pytorch_device = "cpu"
74
+
75
  return _policy.Policy(
76
  model,
77
  transforms=[
src/openpi/shared/array_typing.py CHANGED
@@ -7,7 +7,6 @@ import beartype
7
  import jax
8
  import jax._src.tree_util as private_tree_util
9
  import jax.core
10
- from jaxtyping import Array # noqa: F401
11
  from jaxtyping import ArrayLike
12
  from jaxtyping import Bool # noqa: F401
13
  from jaxtyping import DTypeLike # noqa: F401
@@ -31,6 +30,7 @@ _original_check_dataclass_annotations = jaxtyping._decorator._check_dataclass_an
31
  # Redefine Array to include both JAX arrays and PyTorch tensors
32
  Array = jax.Array | torch.Tensor
33
 
 
34
  def _check_dataclass_annotations(self, typechecker):
35
  if not any(
36
  frame.frame.f_globals.get("__name__") in {"jax._src.tree_util", "flax.nnx.transforms.compilation"}
 
7
  import jax
8
  import jax._src.tree_util as private_tree_util
9
  import jax.core
 
10
  from jaxtyping import ArrayLike
11
  from jaxtyping import Bool # noqa: F401
12
  from jaxtyping import DTypeLike # noqa: F401
 
30
  # Redefine Array to include both JAX arrays and PyTorch tensors
31
  Array = jax.Array | torch.Tensor
32
 
33
+
34
  def _check_dataclass_annotations(self, typechecker):
35
  if not any(
36
  frame.frame.f_globals.get("__name__") in {"jax._src.tree_util", "flax.nnx.transforms.compilation"}
src/openpi/shared/image_tools.py CHANGED
@@ -3,7 +3,7 @@ import functools
3
  import jax
4
  import jax.numpy as jnp
5
  import torch
6
- import torch.nn.functional as F
7
 
8
  import openpi.shared.array_typing as at
9
 
@@ -60,13 +60,13 @@ def resize_with_pad_torch(
60
  ) -> torch.Tensor:
61
  """PyTorch version of resize_with_pad. Resizes an image to a target height and width without distortion
62
  by padding with black. If the image is float32, it must be in the range [-1, 1].
63
-
64
  Args:
65
  images: Tensor of shape [*b, h, w, c] or [*b, c, h, w]
66
  height: Target height
67
  width: Target width
68
  mode: Interpolation mode ('bilinear', 'nearest', etc.)
69
-
70
  Returns:
71
  Resized and padded tensor with same shape format as input
72
  """
@@ -91,10 +91,7 @@ def resize_with_pad_torch(
91
 
92
  # Resize
93
  resized_images = F.interpolate(
94
- images,
95
- size=(resized_height, resized_width),
96
- mode=mode,
97
- align_corners=False if mode == "bilinear" else None
98
  )
99
 
100
  # Handle dtype-specific clipping
@@ -116,8 +113,8 @@ def resize_with_pad_torch(
116
  padded_images = F.pad(
117
  resized_images,
118
  (pad_w0, pad_w1, pad_h0, pad_h1), # left, right, top, bottom
119
- mode='constant',
120
- value=constant_value
121
  )
122
 
123
  # Convert back to original format if needed
@@ -126,4 +123,4 @@ def resize_with_pad_torch(
126
  if batch_size == 1 and images.shape[0] == 1:
127
  padded_images = padded_images.squeeze(0) # Remove batch dimension if it was added
128
 
129
- return padded_images
 
3
  import jax
4
  import jax.numpy as jnp
5
  import torch
6
+ import torch.nn.functional as F # noqa: N812
7
 
8
  import openpi.shared.array_typing as at
9
 
 
60
  ) -> torch.Tensor:
61
  """PyTorch version of resize_with_pad. Resizes an image to a target height and width without distortion
62
  by padding with black. If the image is float32, it must be in the range [-1, 1].
63
+
64
  Args:
65
  images: Tensor of shape [*b, h, w, c] or [*b, c, h, w]
66
  height: Target height
67
  width: Target width
68
  mode: Interpolation mode ('bilinear', 'nearest', etc.)
69
+
70
  Returns:
71
  Resized and padded tensor with same shape format as input
72
  """
 
91
 
92
  # Resize
93
  resized_images = F.interpolate(
94
+ images, size=(resized_height, resized_width), mode=mode, align_corners=False if mode == "bilinear" else None
 
 
 
95
  )
96
 
97
  # Handle dtype-specific clipping
 
113
  padded_images = F.pad(
114
  resized_images,
115
  (pad_w0, pad_w1, pad_h0, pad_h1), # left, right, top, bottom
116
+ mode="constant",
117
+ value=constant_value,
118
  )
119
 
120
  # Convert back to original format if needed
 
123
  if batch_size == 1 and images.shape[0] == 1:
124
  padded_images = padded_images.squeeze(0) # Remove batch dimension if it was added
125
 
126
+ return padded_images
src/openpi/training/config.py CHANGED
@@ -6,7 +6,7 @@ import dataclasses
6
  import difflib
7
  import logging
8
  import pathlib
9
- from typing import Any, Protocol, TypeAlias, Literal
10
 
11
  import etils.epath as epath
12
  import flax.nnx as nnx
@@ -623,7 +623,7 @@ _CONFIGS = [
623
  data=SimpleDataConfig(
624
  assets=AssetsConfig(asset_id="droid"),
625
  data_transforms=lambda model: _transforms.Group(
626
- inputs=[droid_policy.DroidInputs( model_type=ModelType.PI05)],
627
  outputs=[droid_policy.DroidOutputs()],
628
  ),
629
  base_config=DataConfig(
 
6
  import difflib
7
  import logging
8
  import pathlib
9
+ from typing import Any, Literal, Protocol, TypeAlias
10
 
11
  import etils.epath as epath
12
  import flax.nnx as nnx
 
623
  data=SimpleDataConfig(
624
  assets=AssetsConfig(asset_id="droid"),
625
  data_transforms=lambda model: _transforms.Group(
626
+ inputs=[droid_policy.DroidInputs(model_type=ModelType.PI05)],
627
  outputs=[droid_policy.DroidOutputs()],
628
  ),
629
  base_config=DataConfig(
src/openpi/training/data_loader.py CHANGED
@@ -1,14 +1,13 @@
1
  from collections.abc import Iterator, Sequence
2
- from typing import Literal
3
  import multiprocessing
4
  import os
5
  import typing
6
- from typing import Protocol, SupportsIndex, TypeVar
7
 
8
  import jax
9
  import jax.numpy as jnp
10
  import lerobot.common.datasets.lerobot_dataset as lerobot_dataset
11
- import logging
12
  import numpy as np
13
  import torch
14
 
@@ -231,7 +230,7 @@ def create_data_loader(
231
  framework: Literal["jax", "pytorch"],
232
  ) -> DataLoader[tuple[_model.Observation, _model.Actions]]:
233
  """Create a data loader for training.
234
-
235
  Args:
236
  config: The training configuration.
237
  sharding: The sharding to use for the data loader (JAX only).
@@ -367,22 +366,21 @@ def create_rlds_data_loader(
367
  """
368
  if framework == "pytorch":
369
  raise NotImplementedError("PyTorch RLDS data loader is not supported yet")
370
- else:
371
- dataset = create_rlds_dataset(data_config, action_horizon, batch_size, shuffle=shuffle)
372
- dataset = transform_iterable_dataset(dataset, data_config, skip_norm_stats=skip_norm_stats, is_batched=True)
373
 
374
- data_loader = RLDSDataLoader(
375
- dataset,
376
- sharding=sharding,
377
- num_batches=num_batches,
378
- )
379
 
380
  return DataLoaderImpl(data_config, data_loader)
381
 
382
 
383
  class TorchDataLoader:
384
  """Torch data loader implementation."""
385
-
386
  def __init__(
387
  self,
388
  dataset,
 
1
  from collections.abc import Iterator, Sequence
2
+ import logging
3
  import multiprocessing
4
  import os
5
  import typing
6
+ from typing import Literal, Protocol, SupportsIndex, TypeVar
7
 
8
  import jax
9
  import jax.numpy as jnp
10
  import lerobot.common.datasets.lerobot_dataset as lerobot_dataset
 
11
  import numpy as np
12
  import torch
13
 
 
230
  framework: Literal["jax", "pytorch"],
231
  ) -> DataLoader[tuple[_model.Observation, _model.Actions]]:
232
  """Create a data loader for training.
233
+
234
  Args:
235
  config: The training configuration.
236
  sharding: The sharding to use for the data loader (JAX only).
 
366
  """
367
  if framework == "pytorch":
368
  raise NotImplementedError("PyTorch RLDS data loader is not supported yet")
369
+ dataset = create_rlds_dataset(data_config, action_horizon, batch_size, shuffle=shuffle)
370
+ dataset = transform_iterable_dataset(dataset, data_config, skip_norm_stats=skip_norm_stats, is_batched=True)
 
371
 
372
+ data_loader = RLDSDataLoader(
373
+ dataset,
374
+ sharding=sharding,
375
+ num_batches=num_batches,
376
+ )
377
 
378
  return DataLoaderImpl(data_config, data_loader)
379
 
380
 
381
  class TorchDataLoader:
382
  """Torch data loader implementation."""
383
+
384
  def __init__(
385
  self,
386
  dataset,