# File size: 7,916 Bytes
# 90c97df
---
run_name: glue_20251002_155411
seed: 6198
epoch: null
dry_run: false
model:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  vision_backbone:
    image_model_type: openai
    image_default_input_size:
    - 336
    - 336
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1024
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 23
    image_head_dim: 64
    image_mlp_dim: 4096
    image_mlp_activations: quick_gelu
    image_dropout_rate: 0.0
    image_num_pos: 577
    image_norm_eps: 1.0e-05
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    fsdp_wrap: false
    resize_mode: default
  vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
  llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
  low_cpu_fsdp: true
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  embedding_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  scale_logits: false
  vocab_size: 152064
  embedding_size: 152064
  ff_out_size: null
  additional_vocab_size: 128
  new_embedding_init_range: 0.02
  weight_tying: false
  init_device: null
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: null
  norm_after: false
  precision: amp_bf16
  max_crops: 12
  crop_mode: overlap-and-resize-c2
  use_col_tokens: true
  prompt_type: uber_model
  system_prompt_kind: demo_or_style
  message_formatting: role
  always_start_with_space: true
  multi_annotation_weighting: root_subsegments
  default_inference_len: 65
  overlap_margins:
  - 4
  - 4
  pad_value: 0.0
  image_padding_embed: pad_and_partial_pad
  fix_image_padding: true
  vit_layers:
  - -2
  - -9
  image_pooling_h: 2
  image_pooling_w: 2
  image_pooling_2d: attention_meanq
  image_projector: mlp
  image_feature_dropout: 0.0
  initializer_range: 0.02
  normalize_input_embeds: false
  use_position_ids: true
  head_dim: null
  action_tokenizer:
    identifier: physical-intelligence/fast
    tokenizer_dir: null
  action_dim: 7
  horizon: 8
  tokenizer:
    identifier: Qwen/Qwen2-7B
    tokenizer_dir: null
  pad_tokenizer: true
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  action_head: l1_regression
  num_diffusion_steps: 1000
  num_diffusion_inference_steps: 30
  use_proprio: true
  action_head_dit_hidden_size: 1152
  action_head_dit_depth: 28
  action_head_dit_num_heads: 16
  llm_causal_attention: false
  action_use_left_eef: true
  action_use_mobile_base: false
allow_resume: false
ft_llm: true
ft_vit: false
ft_connector: false
ft_embedding: lm_head
lora: false
use_lora: true
lora_rank: 8
lora_llm: false
lora_vit: false
lora_connector: false
early_exit: false
train_exit_random_layer: false
optimizer:
  name: adamw
  learning_rate: 0.0001
  weight_decay: 0.01
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-05
  connector_learning_rate: 0.0002
  vit_learning_rate: 6.0e-06
  llm_learning_rate: 5.0e-05
  connector_weight_decay: 0.0
  vit_weight_decay: 0.0
  llm_weight_decay: 0.0
  connector_betas:
  - 0.9
  - 0.95
  vit_betas:
  - 0.9
  - 0.95
  llm_betas:
  - 0.9
  - 0.95
  connector_eps: 1.0e-06
  vit_eps: 1.0e-06
  llm_eps: 1.0e-06
  metrics_log_interval: 20
scheduler:
  name: multimodal
  units: steps
  t_warmup: 100
  t_max: null
  alpha_f: 0.1
  connector_t_warmup: 200
  vit_t_warmup: 2000
  llm_t_warmup: 2000
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: 0.0
data:
  dataset: vla_dataset_realworld
  mixture: null
  root_size_mixture: null
  split: train
  seed: 95818
  shuffle_messages: false
  pad: to_max
  sequence_length: 1600
  shuffle: true
  for_inference: false
  multi_modal: torch
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: null
  persistent_workers: false
  timeout: 0
  rlds_dataset_name: libero_4_task_suites_no_noops
  rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
  use_wrist_image: true
  use_proprio: true
  rlds_shuffle_buffer_size: 100000
  rlds_traj_threads: 8
  rlds_read_threads: 8
  lerobot_episode_index_start: null
  lerobot_episode_index_end: null
restore_dataloader: true
fast_forward_batches: null
evaluators:
- label: val
  data:
    dataset: vla_dataset_realworld
    mixture: null
    root_size_mixture: null
    split: validation
    seed: null
    shuffle_messages: false
    pad: to_max
    sequence_length: 1600
    shuffle: false
    for_inference: false
    multi_modal: torch
    num_workers: 0
    drop_last: true
    pin_memory: true
    prefetch_factor: null
    persistent_workers: true
    timeout: 0
    rlds_dataset_name: libero_4_task_suites_no_noops
    rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
    use_wrist_image: true
    use_proprio: true
    rlds_shuffle_buffer_size: 256000
    rlds_traj_threads: 8
    rlds_read_threads: 8
    lerobot_episode_index_start: 353
    lerobot_episode_index_end: 765
  device_eval_batch_size: null
  subset_num_batches: 64
  max_examples: null
  max_new_tokens: 448
  mm_evaluator: null
  save_dir: null
  save_to_checkpoint_dir: false
  eval_name: null
  skip_if_metrics_cached: true
eval_interval: 0
inf_eval_interval: -1
inf_evaluators: []
save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/
remote_save_folder: null
canceled_check_interval: 50
save_interval: 500
save_interval_unsharded: 500
save_interval_ephemeral: null
save_interval_action_head: 500
save_num_checkpoints_to_keep: 1
save_num_unsharded_checkpoints_to_keep: 1
save_num_action_head_checkpoints_to_keep: 2
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: true
initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
load_model_config: null
checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
save_dataloader_state: false
reset_dataloader_state: false
keep_lr_on_load: true
sharded_checkpointer: torch_legacy
max_duration: 500000
global_train_batch_size: 126
device_train_batch_size: 15
device_train_microbatch_size: 16
device_eval_batch_size: 4
eval_subset_num_batches: -1
eval_on_load: false
device_inf_eval_batch_size: 16
inf_eval_subset_num_batches: -1
device_train_grad_accum: 0
max_grad_norm: 1.0
multi_component_grad_norm: true
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
wandb:
  project: a1-realworld
  entity: henryeap
  group: null
  name: glue_20251002_155411
  tags:
  - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 1
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 1
gen1_gc_interval: 1
compile: null
fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: by_block_and_size
  precision: float
  hybrid_sharding_num_model_replicas: null
softmax_auxiliary_loss: true
softmax_auxiliary_loss_scale: 0.0001
time_limit: null
extra_steps_after_cancel: 10
python_profiling: false
torch_profiling: false
stop_at: 500000
stop_after: null
activation_checkpointing: whole_layer
fused_loss: null