# G0-VLA / G0Plus_PP_CKPT / config.yaml
# Uploaded by whitbrunn — commit 5f3ded9 (verified):
# "1231: upload g0plus CKPT in TRT format for pp demo in the wild"
# --- Run-level settings ---
# RNG seed for reproducibility.
seed: 7
# Training checkpoint to resume from (absolute cluster path).
resume_ckpt: /vla_fulltime/jianning.cui/code/GalaxeaFM/runs/merge_pipeline/real/r1lite_g0_pp_bbox_400_tasks/2025-12-22_05-53-31/checkpoints/step_124838.pt
# Hydra-managed output directory, resolved at runtime.
output_dir: ${hydra:runtime.output_dir}
# Interval between checkpoint saves — presumably in optimizer steps; confirm against trainer.
checkpointing_steps: 17834
# Experiment logging (Weights & Biases).
logger:
  type: wandb
  # Emit metrics every N steps.
  log_steps: 10
  # Hydra task choice; project / experiment names are derived from it below
  # via the custom `split` resolver (first and last segment respectively).
  task: ${hydra:runtime.choices.task}
  project: ${split:${logger.task},0}
  experiment_name: ${split:${logger.task},-1}
  mode: online
  workspace: cuijianning1996-galaxea-ai
  dir: ${output_dir}/wandb
# --- Evaluation / deployment settings ---
# Batch size used for validation passes.
batch_size_val: 16
eval_episodes_num: 1
# TensorRT-exported state dict used for the deployment demo.
ckpt_path: /data/trt_ckpts/model_state_dict.pt
# Evaluation environment name — presumably a simulated benchmark task; verify against eval harness.
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
# Free-form run tags (none set; also re-exported under edp.tags).
tags: null
# Experiment provenance / bookkeeping block; null fields are
# presumably populated at runtime (git info, data roots) — confirm in trainer.
edp:
  card: null
  # Timestamp captured at config resolution time.
  training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  save_dir: ${output_dir}
  tags: ${tags}
  # Mirrors of the model-level training schedule for bookkeeping.
  max_steps: ${model.max_steps}
  batch_size: ${model.batch_size}
# Dataset configuration (Hydra `_target_` instantiation).
data:
  _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
  dataset_dirs: null
  # Per-modality tensor layout. `raw_shape` is the on-disk size,
  # `shape` the size after preprocessing.
  shape_meta:
    # 6-DoF joints + 1-DoF gripper per arm → 14 action dims total.
    action:
      - key: left_arm
        raw_shape: 6
        shape: 6
      - key: left_gripper
        raw_shape: 1
        shape: 1
      - key: right_arm
        raw_shape: 6
        shape: 6
      - key: right_gripper
        raw_shape: 1
        shape: 1
    # Proprioceptive state mirrors the action layout (14 dims).
    state:
      - key: left_arm
        raw_shape: 6
        shape: 6
      - key: left_gripper
        raw_shape: 1
        shape: 1
      - key: right_arm
        raw_shape: 6
        shape: 6
      - key: right_gripper
        raw_shape: 1
        shape: 1
    # Camera streams: raw 720p frames are resized to 224x224 (CHW);
    # head_condition is already stored at 224x224.
    images:
      - key: head_condition
        raw_shape:
          - 3
          - 224
          - 224
        shape:
          - 3
          - 224
          - 224
      - key: head_rgb
        raw_shape:
          - 3
          - 720
          - 1280
        shape:
          - 3
          - 224
          - 224
      - key: left_wrist_rgb
        raw_shape:
          - 3
          - 720
          - 1280
        shape:
          - 3
          - 224
          - 224
      - key: right_wrist_rgb
        raw_shape:
          - 3
          - 720
          - 1280
        shape:
          - 3
          - 224
          - 224
  # Action horizon (chunk length) per sample; referenced as ${data.action_size}.
  action_size: 32
  past_action_size: 0
  # Observation history length; referenced as ${data.obs_size}.
  obs_size: 1
  ee_start_moving_thresh: 0.0
  # Fraction of episodes held out for validation.
  val_set_proportion: 0.05
  use_bbox_condition: true
  dataset_root: /galaxea_dataset/galaxea/pp_project/lerobot_with_bbox
  # Only dataset folders starting with these prefixes are loaded.
  dataset_prefixes:
    - BENCH
    - Bench
# Model + training configuration.
model:
  pretrained_ckpt: /galaxea_dataset/mnt/tmp/pp_wt_img_cond/checkpoints/org2fm_v2.pt
  use_pretrained_norm_stats: true
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: false
  # Per-device batch size; effective batch = batch_size * grad_accumulation_steps * world size.
  batch_size: 2
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  # max_epochs governs the run length since max_steps is unset.
  max_epochs: 4
  max_steps: null
  grad_accumulation_steps: 2
  use_8bit_optimizer: false
  learning_rate: 2.5e-05
  weight_decay: 1.0e-06
  # Adam betas.
  betas:
    - 0.9
    - 0.999
  lr_scheduler_type: cosine
  warmup_steps: 500
  max_grad_norm: 1.0
  # EMA is disabled; the ema block below is inert but kept for completeness.
  use_ema: false
  ema:
    update_after_step: 0
    power: 0.67
  use_sync_bn: false
  # Input/output processing pipeline (tokenization, normalization, image transforms).
  processor:
    _target_: galaxea_fm.processors.galaxea_zero_processor.GalaxeaZeroProcessor
    shape_meta: ${data.shape_meta}
    num_obs_steps: ${data.obs_size}
    # Convert absolute arm joint targets to deltas relative to the current state.
    action_state_transforms:
      - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
        keys:
          - left_arm
          - right_arm
    use_stepwise_action_norm: true
    norm_default_mode: z-score
    # Grippers use the "0/100" mode instead of z-score — presumably a fixed
    # min/max range normalization; confirm against the processor implementation.
    norm_exception_mode:
      action:
        left_gripper: 0/100
        right_gripper: 0/100
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
    # Identical resize → tensor → [-1, 1] normalize pipeline for every camera.
    train_transforms:
      head_condition:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      left_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      right_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
    # Validation uses the same deterministic transforms as training.
    val_transforms:
      head_condition:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      left_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      right_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size:
            - 224
            - 224
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
    # Four camera streams feed the VLM; referenced by model_arch.num_input_images.
    num_output_cameras: 4
    use_zh_instruction: false
    drop_high_level_prob: 1.0
    pad_token_id: ${model.model_arch.pad_token_id}
    image_token_index: ${model.model_arch.image_token_index}
    # PaliGemma tokenizer; loaded from a local snapshot.
    tokenizer_params:
      pretrained_model_name_or_path: /data/google/paligemma-3b-pt-224
      local_files_only: false
      token: null
    max_text_tokens: ${model.model_arch.max_text_tokens}
    max_image_text_tokens: ${model.model_arch.max_image_text_tokens}
    num_input_cameras: ${model.model_arch.num_input_images}
    num_image_tokens_per_camera: ${model.model_arch.vision.num_image_tokens}
  # Network architecture (PaliGemma VLM backbone + flow-matching action expert).
  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    pretrained_model_path: /data/google/paligemma-3b-pt-224
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 1.0
    image_token_index: 257152
    pad_token_id: 0
    vocab_size: 257216
    cond_steps: ${data.obs_size}
    horizon_steps: ${data.action_size}
    max_text_tokens: 55
    # Total prompt length = image tokens for all cameras + text budget.
    max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * ${model.model_arch.vision.num_image_tokens} + ${model.model_arch.max_text_tokens}'}
    num_input_images: ${eval:'${model.model_arch.cond_steps} * ${model.processor.num_output_cameras}'}
    num_extra_image_tokens_per_camera: 0
    final_action_clip_value: null
    # 14 = (6 arm + 1 gripper) x 2 arms, matching data.shape_meta.
    action_dim: 14
    proprio_dim: 14
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10
    # SigLIP vision tower (224x224 input, 14x14 patches → 256 image tokens).
    vision:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.SiglipVisionModel
      hidden_size: 1152
      intermediate_size: 4304
      num_hidden_layers: 27
      num_attention_heads: 16
      num_channels: 3
      image_size: 224
      patch_size: 14
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      num_image_tokens: 256
    # Projects vision features (1152) into the VLM embedding space (2048).
    vision_projector:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.PaliGemmaMultiModalProjector
      vision_config:
        hidden_size: 1152
        projection_dim: 2048
    # Joint transformer mixing VLM, proprio, and action expert streams.
    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      action_expert_adaptive_mode: null
      mixture:
        vlm:
          hidden_size: 2048
          intermediate_size: 16384
          use_final_norm: false
          cache: true
        proprio:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: true
          adaptive_mode: null
        action:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: false
          adaptive_mode: null
      time_hidden_size: 256
      num_hidden_layers: 18
      num_attention_heads: 8
      # Multi-query attention: a single shared key/value head.
      num_key_value_heads: 1
      head_dim: 256
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-06
      rope_theta: 10000.0
      attention_bias: false
      attention_dropout: 0.0